Diffstat (limited to 'net')
 net/802/fc.c | 2
 net/802/fddi.c | 8
 net/802/hippi.c | 6
 net/802/p8022.c | 3
 net/802/p8023.c | 1
 net/802/psnap.c | 7
 net/802/sysctl_net_802.c | 3
 net/802/tr.c | 22
 net/8021q/Kconfig | 19
 net/8021q/vlan.c | 8
 net/8021q/vlan.h | 2
 net/8021q/vlan_dev.c | 2
 net/Kconfig | 476
 net/Makefile | 3
 net/appletalk/aarp.c | 2
 net/appletalk/ddp.c | 8
 net/atm/Kconfig | 74
 net/atm/br2684.c | 3
 net/atm/ioctl.c | 1
 net/atm/ipcommon.c | 3
 net/atm/svc.c | 4
 net/ax25/af_ax25.c | 29
 net/ax25/ax25_ds_in.c | 3
 net/ax25/ax25_ds_timer.c | 2
 net/ax25/ax25_in.c | 17
 net/ax25/ax25_route.c | 12
 net/ax25/ax25_std_in.c | 3
 net/ax25/ax25_std_timer.c | 2
 net/ax25/ax25_subr.c | 4
 net/ax25/ax25_uid.c | 83
 net/bluetooth/hci_core.c | 20
 net/bluetooth/hci_event.c | 83
 net/bluetooth/hci_sock.c | 26
 net/bluetooth/l2cap.c | 2
 net/bluetooth/lib.c | 25
 net/bluetooth/rfcomm/core.c | 77
 net/bluetooth/rfcomm/sock.c | 2
 net/bluetooth/rfcomm/tty.c | 206
 net/bluetooth/sco.c | 2
 net/bridge/Kconfig | 31
 net/bridge/br_fdb.c | 2
 net/bridge/netfilter/Kconfig | 2
 net/bridge/netfilter/ebt_mark.c | 5
 net/bridge/netfilter/ebt_ulog.c | 9
 net/compat.c | 9
 net/core/Makefile | 4
 net/core/datagram.c | 6
 net/core/dev.c | 91
 net/core/dst.c | 15
 net/core/ethtool.c | 49
 net/core/filter.c | 6
 net/core/flow.c | 2
 net/core/neighbour.c | 15
 net/core/netfilter.c | 648
 net/core/netpoll.c | 63
 net/core/pktgen.c | 2
 net/core/request_sock.c | 28
 net/core/rtnetlink.c | 9
 net/core/skbuff.c | 162
 net/core/sock.c | 148
 net/core/sysctl_net_core.c | 9
 net/core/utils.c | 39
 net/core/wireless.c | 8
 net/dccp/Kconfig | 50
 net/dccp/Makefile | 10
 net/dccp/ccid.c | 139
 net/dccp/ccid.h | 180
 net/dccp/ccids/Kconfig | 29
 net/dccp/ccids/Makefile | 5
 net/dccp/ccids/ccid3.c | 1221
 net/dccp/ccids/ccid3.h | 137
 net/dccp/ccids/lib/Makefile | 3
 net/dccp/ccids/lib/loss_interval.c | 144
 net/dccp/ccids/lib/loss_interval.h | 61
 net/dccp/ccids/lib/packet_history.c | 398
 net/dccp/ccids/lib/packet_history.h | 199
 net/dccp/ccids/lib/tfrc.h | 22
 net/dccp/ccids/lib/tfrc_equation.c | 644
 net/dccp/dccp.h | 493
 net/dccp/diag.c | 71
 net/dccp/input.c | 600
 net/dccp/ipv4.c | 1356
 net/dccp/minisocks.c | 264
 net/dccp/options.c | 855
 net/dccp/output.c | 528
 net/dccp/proto.c | 826
 net/dccp/timer.c | 255
 net/decnet/Kconfig | 23
 net/decnet/af_decnet.c | 51
 net/decnet/dn_dev.c | 8
 net/decnet/dn_neigh.c | 2
 net/decnet/dn_nsp_in.c | 2
 net/decnet/dn_nsp_out.c | 65
 net/decnet/dn_route.c | 2
 net/decnet/dn_table.c | 6
 net/decnet/netfilter/dn_rtmsg.c | 11
 net/econet/Kconfig | 36
 net/econet/af_econet.c | 8
 net/ethernet/eth.c | 5
 net/ethernet/sysctl_net_ether.c | 1
 net/ieee80211/Kconfig | 69
 net/ieee80211/Makefile | 11
 net/ieee80211/ieee80211_crypt.c | 259
 net/ieee80211/ieee80211_crypt_ccmp.c | 470
 net/ieee80211/ieee80211_crypt_tkip.c | 708
 net/ieee80211/ieee80211_crypt_wep.c | 272
 net/ieee80211/ieee80211_module.c | 299
 net/ieee80211/ieee80211_rx.c | 1189
 net/ieee80211/ieee80211_tx.c | 438
 net/ieee80211/ieee80211_wx.c | 471
 net/ipv4/Kconfig | 56
 net/ipv4/Makefile | 10
 net/ipv4/af_inet.c | 181
 net/ipv4/ah4.c | 18
 net/ipv4/arp.c | 8
 net/ipv4/datagram.c | 3
 net/ipv4/devinet.c | 7
 net/ipv4/esp4.c | 36
 net/ipv4/fib_frontend.c | 6
 net/ipv4/fib_hash.c | 4
 net/ipv4/fib_lookup.h | 1
 net/ipv4/fib_semantics.c | 16
 net/ipv4/fib_trie.c | 1968
 net/ipv4/icmp.c | 27
 net/ipv4/igmp.c | 2
 net/ipv4/inet_connection_sock.c | 641
 net/ipv4/inet_diag.c | 868
 net/ipv4/inet_hashtables.c | 165
 net/ipv4/inet_timewait_sock.c | 384
 net/ipv4/inetpeer.c | 16
 net/ipv4/ip_forward.c | 6
 net/ipv4/ip_fragment.c | 12
 net/ipv4/ip_gre.c | 21
 net/ipv4/ip_input.c | 141
 net/ipv4/ip_options.c | 52
 net/ipv4/ip_output.c | 33
 net/ipv4/ip_sockglue.c | 11
 net/ipv4/ipcomp.c | 9
 net/ipv4/ipconfig.c | 9
 net/ipv4/ipip.c | 56
 net/ipv4/ipmr.c | 8
 net/ipv4/ipvs/Kconfig | 4
 net/ipv4/ipvs/ip_vs_app.c | 1
 net/ipv4/ipvs/ip_vs_conn.c | 8
 net/ipv4/ipvs/ip_vs_core.c | 9
 net/ipv4/ipvs/ip_vs_ctl.c | 13
 net/ipv4/ipvs/ip_vs_lblc.c | 4
 net/ipv4/ipvs/ip_vs_lblcr.c | 4
 net/ipv4/ipvs/ip_vs_proto_tcp.c | 8
 net/ipv4/ipvs/ip_vs_xmit.c | 2
 net/ipv4/multipath_drr.c | 2
 net/ipv4/netfilter.c | 139
 net/ipv4/netfilter/Kconfig | 70
 net/ipv4/netfilter/Makefile | 9
 net/ipv4/netfilter/ip_conntrack_amanda.c | 26
 net/ipv4/netfilter/ip_conntrack_core.c | 427
 net/ipv4/netfilter/ip_conntrack_ftp.c | 35
 net/ipv4/netfilter/ip_conntrack_irc.c | 15
 net/ipv4/netfilter/ip_conntrack_netlink.c | 1579
 net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 73
 net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 9
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 48
 net/ipv4/netfilter/ip_conntrack_proto_udp.c | 14
 net/ipv4/netfilter/ip_conntrack_standalone.c | 58
 net/ipv4/netfilter/ip_conntrack_tftp.c | 8
 net/ipv4/netfilter/ip_nat_amanda.c | 4
 net/ipv4/netfilter/ip_nat_core.c | 104
 net/ipv4/netfilter/ip_nat_ftp.c | 4
 net/ipv4/netfilter/ip_nat_helper.c | 8
 net/ipv4/netfilter/ip_nat_irc.c | 4
 net/ipv4/netfilter/ip_nat_proto_icmp.c | 30
 net/ipv4/netfilter/ip_nat_proto_tcp.c | 27
 net/ipv4/netfilter/ip_nat_proto_udp.c | 26
 net/ipv4/netfilter/ip_nat_proto_unknown.c | 13
 net/ipv4/netfilter/ip_nat_snmp_basic.c | 2
 net/ipv4/netfilter/ip_nat_standalone.c | 8
 net/ipv4/netfilter/ip_nat_tftp.c | 4
 net/ipv4/netfilter/ip_queue.c | 58
 net/ipv4/netfilter/ip_tables.c | 5
 net/ipv4/netfilter/ipt_CLASSIFY.c | 4
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 4
 net/ipv4/netfilter/ipt_CONNMARK.c | 15
 net/ipv4/netfilter/ipt_DSCP.c | 3
 net/ipv4/netfilter/ipt_ECN.c | 23
 net/ipv4/netfilter/ipt_LOG.c | 86
 net/ipv4/netfilter/ipt_MARK.c | 22
 net/ipv4/netfilter/ipt_MASQUERADE.c | 5
 net/ipv4/netfilter/ipt_NETMAP.c | 8
 net/ipv4/netfilter/ipt_NFQUEUE.c | 70
 net/ipv4/netfilter/ipt_REJECT.c | 1
 net/ipv4/netfilter/ipt_TCPMSS.c | 10
 net/ipv4/netfilter/ipt_TOS.c | 3
 net/ipv4/netfilter/ipt_TTL.c | 119
 net/ipv4/netfilter/ipt_ULOG.c | 53
 net/ipv4/netfilter/ipt_connbytes.c | 162
 net/ipv4/netfilter/ipt_connmark.c | 7
 net/ipv4/netfilter/ipt_dccp.c | 176
 net/ipv4/netfilter/ipt_hashlimit.c | 2
 net/ipv4/netfilter/ipt_mark.c | 7
 net/ipv4/netfilter/ipt_owner.c | 132
 net/ipv4/netfilter/ipt_string.c | 91
 net/ipv4/proc.c | 5
 net/ipv4/protocol.c | 1
 net/ipv4/raw.c | 7
 net/ipv4/route.c | 10
 net/ipv4/syncookies.c | 4
 net/ipv4/sysctl_net_ipv4.c | 47
 net/ipv4/tcp.c | 421
 net/ipv4/tcp_bic.c | 46
 net/ipv4/tcp_cong.c | 44
 net/ipv4/tcp_diag.c | 784
 net/ipv4/tcp_highspeed.c | 17
 net/ipv4/tcp_htcp.c | 53
 net/ipv4/tcp_hybla.c | 31
 net/ipv4/tcp_input.c | 549
 net/ipv4/tcp_ipv4.c | 941
 net/ipv4/tcp_minisocks.c | 605
 net/ipv4/tcp_output.c | 313
 net/ipv4/tcp_scalable.c | 6
 net/ipv4/tcp_timer.c | 253
 net/ipv4/tcp_vegas.c | 50
 net/ipv4/tcp_westwood.c | 64
 net/ipv4/udp.c | 39
 net/ipv4/utils.c | 59
 net/ipv4/xfrm4_state.c | 2
 net/ipv4/xfrm4_tunnel.c | 3
 net/ipv6/Kconfig | 23
 net/ipv6/Makefile | 4
 net/ipv6/addrconf.c | 38
 net/ipv6/af_inet6.c | 62
 net/ipv6/ah6.c | 31
 net/ipv6/datagram.c | 5
 net/ipv6/esp6.c | 27
 net/ipv6/exthdrs.c | 8
 net/ipv6/icmp.c | 25
 net/ipv6/inet6_hashtables.c | 81
 net/ipv6/ip6_fib.c | 2
 net/ipv6/ip6_input.c | 15
 net/ipv6/ip6_output.c | 61
 net/ipv6/ip6_tunnel.c | 38
 net/ipv6/ipcomp6.c | 5
 net/ipv6/ipv6_sockglue.c | 28
 net/ipv6/ipv6_syms.c | 3
 net/ipv6/ndisc.c | 4
 net/ipv6/netfilter.c | 104
 net/ipv6/netfilter/Kconfig | 37
 net/ipv6/netfilter/Makefile | 3
 net/ipv6/netfilter/ip6_queue.c | 57
 net/ipv6/netfilter/ip6_tables.c | 5
 net/ipv6/netfilter/ip6t_HL.c | 118
 net/ipv6/netfilter/ip6t_LOG.c | 104
 net/ipv6/netfilter/ip6t_MARK.c | 5
 net/ipv6/netfilter/ip6t_NFQUEUE.c | 70
 net/ipv6/netfilter/ip6t_REJECT.c | 284
 net/ipv6/netfilter/ip6t_owner.c | 90
 net/ipv6/raw.c | 22
 net/ipv6/reassembly.c | 4
 net/ipv6/route.c | 14
 net/ipv6/sit.c | 23
 net/ipv6/sysctl_net_ipv6.c | 3
 net/ipv6/tcp_ipv6.c | 448
 net/ipv6/udp.c | 12
 net/ipv6/xfrm6_tunnel.c | 2
 net/ipx/Kconfig | 33
 net/ipx/af_ipx.c | 10
 net/ipx/ipx_proc.c | 2
 net/irda/af_irda.c | 2
 net/irda/irlan/irlan_filter.c | 1
 net/irda/irlap_frame.c | 8
 net/irda/irlmp.c | 3
 net/irda/irmod.c | 2
 net/irda/irnet/irnet.h | 3
 net/irda/irnet/irnet_ppp.c | 2
 net/irda/irqueue.c | 1
 net/irda/qos.c | 1
 net/lapb/Kconfig | 22
 net/lapb/lapb_subr.c | 2
 net/llc/af_llc.c | 4
 net/llc/llc_conn.c | 8
 net/llc/llc_core.c | 3
 net/llc/llc_if.c | 2
 net/llc/llc_input.c | 4
 net/llc/llc_sap.c | 2
 net/netfilter/Kconfig | 24
 net/netfilter/Makefile | 7
 net/netfilter/core.c | 216
 net/netfilter/nf_internals.h | 39
 net/netfilter/nf_log.c | 178
 net/netfilter/nf_queue.c | 343
 net/netfilter/nf_sockopt.c | 132
 net/netfilter/nfnetlink.c | 376
 net/netfilter/nfnetlink_log.c | 1055
 net/netfilter/nfnetlink_queue.c | 1121
 net/netlink/af_netlink.c | 317
 net/netrom/af_netrom.c | 38
 net/netrom/nr_dev.c | 5
 net/netrom/nr_in.c | 3
 net/netrom/nr_subr.c | 4
 net/netrom/nr_timer.c | 2
 net/packet/Kconfig | 26
 net/packet/af_packet.c | 20
 net/rose/af_rose.c | 29
 net/rose/rose_in.c | 3
 net/rose/rose_route.c | 8
 net/rose/rose_subr.c | 4
 net/rose/rose_timer.c | 2
 net/rxrpc/transport.c | 2
 net/sched/Kconfig | 38
 net/sched/act_api.c | 15
 net/sched/cls_api.c | 2
 net/sched/em_meta.c | 62
 net/sched/em_text.c | 3
 net/sched/gact.c | 2
 net/sched/ipt.c | 2
 net/sched/mirred.c | 2
 net/sched/pedit.c | 2
 net/sched/police.c | 3
 net/sched/sch_api.c | 4
 net/sched/sch_generic.c | 24
 net/sched/simple.c | 2
 net/sctp/associola.c | 13
 net/sctp/bind_addr.c | 16
 net/sctp/chunk.c | 2
 net/sctp/endpointola.c | 9
 net/sctp/input.c | 49
 net/sctp/ipv6.c | 14
 net/sctp/objcnt.c | 6
 net/sctp/proc.c | 1
 net/sctp/protocol.c | 16
 net/sctp/sm_make_chunk.c | 24
 net/sctp/sm_sideeffect.c | 13
 net/sctp/socket.c | 7
 net/sctp/ssnmap.c | 3
 net/sctp/sysctl.c | 1
 net/sctp/transport.c | 5
 net/sctp/ulpevent.c | 19
 net/sctp/ulpqueue.c | 72
 net/socket.c | 19
 net/sunrpc/auth_gss/gss_krb5_crypto.c | 7
 net/sunrpc/auth_gss/gss_krb5_mech.c | 9
 net/sunrpc/auth_gss/gss_spkm3_mech.c | 12
 net/sunrpc/rpc_pipe.c | 6
 net/sunrpc/sched.c | 8
 net/sunrpc/svcsock.c | 13
 net/sunrpc/xdr.c | 1
 net/sysctl_net.c | 8
 net/unix/Kconfig | 21
 net/unix/af_unix.c | 10
 net/unix/garbage.c | 14
 net/unix/sysctl_net_unix.c | 2
 net/wanrouter/Kconfig | 29
 net/wanrouter/af_wanpipe.c | 2
 net/wanrouter/wanmain.c | 6
 net/x25/Kconfig | 36
 net/x25/af_x25.c | 2
 net/x25/x25_dev.c | 2
 net/x25/x25_in.c | 2
 net/x25/x25_subr.c | 4
 net/x25/x25_timer.c | 2
 net/xfrm/Kconfig | 15
 net/xfrm/xfrm_input.c | 2
 net/xfrm/xfrm_policy.c | 2
 net/xfrm/xfrm_user.c | 34
 363 files changed, 27251 insertions(+), 7889 deletions(-)
diff --git a/net/802/fc.c b/net/802/fc.c
index 640d34e026c2..282c4ab1abe6 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -87,7 +87,7 @@ static int fc_rebuild_header(struct sk_buff *skb)
 	struct fch_hdr *fch=(struct fch_hdr *)skb->data;
 	struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr));
 	if(fcllc->ethertype != htons(ETH_P_IP)) {
-		printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(fcllc->ethertype));
+		printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype));
 		return 0;
 	}
 #ifdef CONFIG_INET
diff --git a/net/802/fddi.c b/net/802/fddi.c
index ebcf4830d6f1..ac242a4bc346 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -108,8 +108,8 @@ static int fddi_rebuild_header(struct sk_buff *skb)
 	else
 #endif
 	{
-		printk("%s: Don't know how to resolve type %02X addresses.\n",
-		       skb->dev->name, htons(fddi->hdr.llc_snap.ethertype));
+		printk("%s: Don't know how to resolve type %04X addresses.\n",
+		       skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
 		return(0);
 	}
 }
@@ -122,10 +122,10 @@ static int fddi_rebuild_header(struct sk_buff *skb)
  *	the proper pointer to the start of packet data (skb->data).
  */
 
-unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct fddihdr *fddi = (struct fddihdr *)skb->data;
-	unsigned short type;
+	__be16 type;
 
 	/*
 	 * Set mac.raw field to point to FC byte, set data field to point
diff --git a/net/802/hippi.c b/net/802/hippi.c
index 051e8af56a77..6d7fed3dd99a 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -51,6 +51,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
 			unsigned len)
 {
 	struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN);
+	struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
 
 	if (!len){
 		len = skb->len - HIPPI_HLEN;
@@ -84,9 +85,10 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
 	if (daddr)
 	{
 		memcpy(hip->le.dest_switch_addr, daddr + 3, 3);
-		memcpy(&skb->private.ifield, daddr + 2, 4);
+		memcpy(&hcb->ifield, daddr + 2, 4);
 		return HIPPI_HLEN;
 	}
+	hcb->ifield = 0;
 	return -((int)HIPPI_HLEN);
 }
 
@@ -122,7 +124,7 @@ static int hippi_rebuild_header(struct sk_buff *skb)
  *	Determine the packet's protocol ID.
  */
 
-unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct hippi_hdr *hip;
 
diff --git a/net/802/p8022.c b/net/802/p8022.c
index 5ae63416df6d..b24817c63ca8 100644
--- a/net/802/p8022.c
+++ b/net/802/p8022.c
@@ -35,7 +35,8 @@ static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
 struct datalink_proto *register_8022_client(unsigned char type,
 					    int (*func)(struct sk_buff *skb,
 							struct net_device *dev,
-							struct packet_type *pt))
+							struct packet_type *pt,
+							struct net_device *orig_dev))
 {
 	struct datalink_proto *proto;
 
diff --git a/net/802/p8023.c b/net/802/p8023.c
index a0b61b40225f..6368d3dce444 100644
--- a/net/802/p8023.c
+++ b/net/802/p8023.c
@@ -20,6 +20,7 @@
 #include <linux/skbuff.h>
 
 #include <net/datalink.h>
+#include <net/p8022.h>
 
 /*
  * Place an 802.3 header on a packet. The driver will do the mac
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 1053821ddf93..ab80b1fab53c 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -47,7 +47,7 @@ static struct datalink_proto *find_snap_client(unsigned char *desc)
  *	A SNAP packet has arrived
  */
 static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
-		    struct packet_type *pt)
+		    struct packet_type *pt, struct net_device *orig_dev)
 {
 	int rc = 1;
 	struct datalink_proto *proto;
@@ -61,7 +61,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
 		/* Pass the frame on. */
 		skb->h.raw += 5;
 		skb_pull(skb, 5);
-		rc = proto->rcvfunc(skb, dev, &snap_packet_type);
+		rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
 	} else {
 		skb->sk = NULL;
 		kfree_skb(skb);
@@ -118,7 +118,8 @@ module_exit(snap_exit);
 struct datalink_proto *register_snap_client(unsigned char *desc,
 					    int (*rcvfunc)(struct sk_buff *,
 							   struct net_device *,
-							   struct packet_type *))
+							   struct packet_type *,
+							   struct net_device *))
 {
 	struct datalink_proto *proto = NULL;
 
diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c
index 36079630c49f..700129556c13 100644
--- a/net/802/sysctl_net_802.c
+++ b/net/802/sysctl_net_802.c
@@ -10,9 +10,10 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/if_tr.h>
 #include <linux/sysctl.h>
-#include <linux/config.h>
 
 #ifdef CONFIG_TR
 extern int sysctl_tr_rif_timeout;
diff --git a/net/802/tr.c b/net/802/tr.c
index a755e880f4ba..1bb7dc1b85cd 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -251,10 +251,11 @@ void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device *
 	unsigned int hash;
 	struct rif_cache *entry;
 	unsigned char *olddata;
+	unsigned long flags;
 	static const unsigned char mcast_func_addr[]
 		= {0xC0,0x00,0x00,0x04,0x00,0x00};
 
-	spin_lock_bh(&rif_lock);
+	spin_lock_irqsave(&rif_lock, flags);
 
 	/*
 	 *	Broadcasts are single route as stated in RFC 1042
@@ -323,7 +324,7 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
 	else
 		slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8);
 	olddata = skb->data;
-	spin_unlock_bh(&rif_lock);
+	spin_unlock_irqrestore(&rif_lock, flags);
 
 	skb_pull(skb, slack);
 	memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack);
@@ -337,10 +338,11 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
 static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev)
 {
 	unsigned int hash, rii_p = 0;
+	unsigned long flags;
 	struct rif_cache *entry;
 
 
-	spin_lock_bh(&rif_lock);
+	spin_lock_irqsave(&rif_lock, flags);
 
 	/*
 	 *	Firstly see if the entry exists
@@ -378,7 +380,7 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
 		if(!entry)
 		{
 			printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n");
-			spin_unlock_bh(&rif_lock);
+			spin_unlock_irqrestore(&rif_lock, flags);
 			return;
 		}
 
@@ -420,7 +422,7 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
 	   }
 	   entry->last_used=jiffies;
 	}
-	spin_unlock_bh(&rif_lock);
+	spin_unlock_irqrestore(&rif_lock, flags);
 }
 
 /*
@@ -430,9 +432,9 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
 static void rif_check_expire(unsigned long dummy)
 {
 	int i;
-	unsigned long next_interval = jiffies + sysctl_tr_rif_timeout/2;
+	unsigned long flags, next_interval = jiffies + sysctl_tr_rif_timeout/2;
 
-	spin_lock_bh(&rif_lock);
+	spin_lock_irqsave(&rif_lock, flags);
 
 	for(i =0; i < RIF_TABLE_SIZE; i++) {
 		struct rif_cache *entry, **pentry;
@@ -454,7 +456,7 @@ static void rif_check_expire(unsigned long dummy)
 		}
 	}
 
-	spin_unlock_bh(&rif_lock);
+	spin_unlock_irqrestore(&rif_lock, flags);
 
 	mod_timer(&rif_timer, next_interval);
 
@@ -485,7 +487,7 @@ static struct rif_cache *rif_get_idx(loff_t pos)
 
 static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	spin_lock_bh(&rif_lock);
+	spin_lock_irq(&rif_lock);
 
 	return *pos ? rif_get_idx(*pos - 1) : SEQ_START_TOKEN;
 }
@@ -516,7 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 static void rif_seq_stop(struct seq_file *seq, void *v)
 {
-	spin_unlock_bh(&rif_lock);
+	spin_unlock_irq(&rif_lock);
 }
 
 static int rif_seq_show(struct seq_file *seq, void *v)
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
new file mode 100644
index 000000000000..c4a382e450e2
--- /dev/null
+++ b/net/8021q/Kconfig
@@ -0,0 +1,19 @@
+#
+# Configuration for 802.1Q VLAN support
+#
+
+config VLAN_8021Q
+	tristate "802.1Q VLAN Support"
+	---help---
+	  Select this and you will be able to create 802.1Q VLAN interfaces
+	  on your ethernet interfaces. 802.1Q VLAN supports almost
+	  everything a regular ethernet interface does, including
+	  firewalling, bridging, and of course IP traffic. You will need
+	  the 'vconfig' tool from the VLAN project in order to effectively
+	  use VLANs. See the VLAN web page for more information:
+	  <http://www.candelatech.com/~greear/vlan.html>
+
+	  To compile this code as a module, choose M here: the module
+	  will be called 8021q.
+
+	  If unsure, say N.
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 1f6d31670bc7..91e412b0ab00 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -578,6 +578,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		if (!vlandev)
 			continue;
 
+		if (netif_carrier_ok(dev)) {
+			if (!netif_carrier_ok(vlandev))
+				netif_carrier_on(vlandev);
+		} else {
+			if (netif_carrier_ok(vlandev))
+				netif_carrier_off(vlandev);
+		}
+
 		if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) {
 			vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK)
 				| flgs;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 508b1fa14546..9ae3a14dd016 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -51,7 +51,7 @@ struct net_device *__find_vlan_dev(struct net_device* real_dev,
 /* found in vlan_dev.c */
 int vlan_dev_rebuild_header(struct sk_buff *skb);
 int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
-                  struct packet_type* ptype);
+                  struct packet_type *ptype, struct net_device *orig_dev);
 int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                          unsigned short type, void *daddr, void *saddr,
                          unsigned len);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 49c487413518..145f5cde96cf 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -113,7 +113,7 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb)
  *
  */
 int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
-		  struct packet_type* ptype)
+		  struct packet_type* ptype, struct net_device *orig_dev)
 {
 	unsigned char *rawp = NULL;
 	struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data);
diff --git a/net/Kconfig b/net/Kconfig
index 9251b28e8d5d..2bdd5623fdd5 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -2,7 +2,7 @@
 # Network configuration
 #
 
-menu "Networking support"
+menu "Networking"
 
 config NET
 	bool "Networking support"
@@ -10,7 +10,9 @@ config NET
 	  Unless you really know what you are doing, you should say Y here.
 	  The reason is that some programs need kernel networking support even
 	  when running on a stand-alone machine that isn't connected to any
-	  other computer. If you are upgrading from an older kernel, you
+	  other computer.
+
+	  If you are upgrading from an older kernel, you
 	  should consider updating your networking tools too because changes
 	  in the kernel and the tools often go hand in hand. The tools are
 	  contained in the package net-tools, the location and version number
@@ -20,57 +22,14 @@ config NET
 	  recommended to read the NET-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
-menu "Networking options"
-	depends on NET
-
-config PACKET
-	tristate "Packet socket"
-	---help---
-	  The Packet protocol is used by applications which communicate
-	  directly with network devices without an intermediate network
-	  protocol implemented in the kernel, e.g. tcpdump. If you want them
-	  to work, choose Y.
-
-	  To compile this driver as a module, choose M here: the module will
-	  be called af_packet.
-
-	  If unsure, say Y.
-
-config PACKET_MMAP
-	bool "Packet socket: mmapped IO"
-	depends on PACKET
-	help
-	  If you say Y here, the Packet protocol driver will use an IO
-	  mechanism that results in faster communication.
-
-	  If unsure, say N.
-
-config UNIX
-	tristate "Unix domain sockets"
-	---help---
-	  If you say Y here, you will include support for Unix domain sockets;
-	  sockets are the standard Unix mechanism for establishing and
-	  accessing network connections. Many commonly used programs such as
-	  the X Window system and syslog use these sockets even if your
-	  machine is not connected to any network. Unless you are working on
-	  an embedded system or something similar, you therefore definitely
-	  want to say Y here.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called unix. Note that several important services won't work
-	  correctly if you say M here and then neglect to load the module.
-
-	  Say Y unless you know what you are doing.
-
-config NET_KEY
-	tristate "PF_KEY sockets"
-	select XFRM
-	---help---
-	  PF_KEYv2 socket family, compatible to KAME ones.
-	  They are required if you are going to use IPsec tools ported
-	  from KAME.
-
-	  Say Y unless you know what you are doing.
+# Make sure that all config symbols are dependent on NET
+if NET
+
+menu "Networking options"
+
+source "net/packet/Kconfig"
+source "net/unix/Kconfig"
+source "net/xfrm/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
@@ -94,30 +53,12 @@ config INET
 
 	  Short answer: say Y.
 
+if INET
 source "net/ipv4/Kconfig"
-
-# IPv6 as module will cause a CRASH if you try to unload it
-config IPV6
-	tristate "The IPv6 protocol"
-	depends on INET
-	default m
-	select CRYPTO if IPV6_PRIVACY
-	select CRYPTO_MD5 if IPV6_PRIVACY
-	---help---
-	  This is complemental support for the IP version 6.
-	  You will still be able to do traditional IPv4 networking as well.
-
-	  For general information about IPv6, see
-	  <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
-	  For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
-	  For specific information about IPv6 under Linux, read the HOWTO at
-	  <http://www.bieringer.de/linux/IPv6/>.
-
-	  To compile this protocol support as a module, choose M here: the
-	  module will be called ipv6.
-
 source "net/ipv6/Kconfig"
 
+endif # if INET
+
 menuconfig NETFILTER
 	bool "Network packet filtering (replaces ipchains)"
 	---help---
@@ -206,269 +147,17 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
-config XFRM
-	bool
-	depends on NET
-
-source "net/xfrm/Kconfig"
-
+source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
-
-config ATM
-	tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  ATM is a high-speed networking technology for Local Area Networks
-	  and Wide Area Networks.  It uses a fixed packet size and is
-	  connection oriented, allowing for the negotiation of minimum
-	  bandwidth requirements.
-
-	  In order to participate in an ATM network, your Linux box needs an
-	  ATM networking card. If you have that, say Y here and to the driver
-	  of your ATM card below.
-
-	  Note that you need a set of user-space programs to actually make use
-	  of ATM. See the file <file:Documentation/networking/atm.txt> for
-	  further details.
-
-config ATM_CLIP
-	tristate "Classical IP over ATM (EXPERIMENTAL)"
-	depends on ATM && INET
-	help
-	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
-	  ATMARP. If you want to communication with other IP hosts on your ATM
-	  network, you will typically either say Y here or to "LAN Emulation
-	  (LANE)" below.
-
-config ATM_CLIP_NO_ICMP
-	bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
-	depends on ATM_CLIP
-	help
-	  Normally, an "ICMP host unreachable" message is sent if a neighbour
-	  cannot be reached because there is no VC to it in the kernel's
-	  ATMARP table. This may cause problems when ATMARP table entries are
-	  briefly removed during revalidation. If you say Y here, packets to
-	  such neighbours are silently discarded instead.
-
-config ATM_LANE
-	tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
-	depends on ATM
-	help
-	  LAN Emulation emulates services of existing LANs across an ATM
-	  network. Besides operating as a normal ATM end station client, Linux
-	  LANE client can also act as an proxy client bridging packets between
-	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
-
-config ATM_MPOA
-	tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
-	depends on ATM && INET && ATM_LANE!=n
-	help
-	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
-	  bridges and ATM attached hosts establish direct ATM VCs across
-	  subnetwork boundaries. These shortcut connections bypass routers
-	  enhancing overall network performance.
-
-config ATM_BR2684
-	tristate "RFC1483/2684 Bridged protocols"
-	depends on ATM && INET
-	help
-	  ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483)
-	  This device will act like an ethernet from the kernels point of view,
-	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
-	  This is sometimes used over DSL lines. If in doubt, say N.
-
-config ATM_BR2684_IPFILTER
-	bool "Per-VC IP filter kludge"
-	depends on ATM_BR2684
-	help
-	  This is an experimental mechanism for users who need to terminating a
-	  large number of IP-only vcc's. Do not enable this unless you are sure
-	  you know what you are doing.
-
-config BRIDGE
-	tristate "802.1d Ethernet Bridging"
-	---help---
-	  If you say Y here, then your Linux box will be able to act as an
-	  Ethernet bridge, which means that the different Ethernet segments it
-	  is connected to will appear as one Ethernet to the participants.
-	  Several such bridges can work together to create even larger
-	  networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
-	  As this is a standard, Linux bridges will cooperate properly with
-	  other third party bridge products.
-
-	  In order to use the Ethernet bridge, you'll need the bridge
-	  configuration tools; see <file:Documentation/networking/bridge.txt>
-	  for location. Please read the Bridge mini-HOWTO for more
-	  information.
-
-	  If you enable iptables support along with the bridge support then you
-	  turn your bridge into a bridging IP firewall.
-	  iptables will then see the IP packets being bridged, so you need to
-	  take this into account when setting up your firewall rules.
-	  Enabling arptables support when bridging will let arptables see
-	  bridged ARP traffic in the arptables FORWARD chain.
-
-	  To compile this code as a module, choose M here: the module
-	  will be called bridge.
-
-	  If unsure, say N.
-
-config VLAN_8021Q
-	tristate "802.1Q VLAN Support"
-	---help---
-	  Select this and you will be able to create 802.1Q VLAN interfaces
-	  on your ethernet interfaces. 802.1Q VLAN supports almost
-	  everything a regular ethernet interface does, including
-	  firewalling, bridging, and of course IP traffic. You will need
-	  the 'vconfig' tool from the VLAN project in order to effectively
-	  use VLANs. See the VLAN web page for more information:
-	  <http://www.candelatech.com/~greear/vlan.html>
-
-	  To compile this code as a module, choose M here: the module
-	  will be called 8021q.
-
-	  If unsure, say N.
-
-config DECNET
-	tristate "DECnet Support"
-	---help---
-	  The DECnet networking protocol was used in many products made by
-	  Digital (now Compaq).  It provides reliable stream and sequenced
-	  packet communications over which run a variety of services similar
-	  to those which run over TCP/IP.
-
-	  To find some tools to use with the kernel layer support, please
-	  look at Patrick Caulfield's web site:
-	  <http://linux-decnet.sourceforge.net/>.
-
-	  More detailed documentation is available in
-	  <file:Documentation/networking/decnet.txt>.
-
-	  Be sure to say Y to "/proc file system support" and "Sysctl support"
-	  below when using DECnet, since you will need sysctl support to aid
-	  in configuration at run time.
-
-	  The DECnet code is also available as a module ( = code which can be
-	  inserted in and removed from the running kernel whenever you want).
-	  The module is called decnet.
-
+source "net/atm/Kconfig"
+source "net/bridge/Kconfig"
+source "net/8021q/Kconfig"
 source "net/decnet/Kconfig"
-
 source "net/llc/Kconfig"
-
-config IPX
-	tristate "The IPX protocol"
-	select LLC
-	---help---
-	  This is support for the Novell networking protocol, IPX, commonly
-	  used for local networks of Windows machines.  You need it if you
-	  want to access Novell NetWare file or print servers using the Linux
-	  Novell client ncpfs (available from
-	  <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
-	  within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>).  In order
-	  to do the former, you'll also have to say Y to "NCP file system
-	  support", below.
-
-	  IPX is similar in scope to IP, while SPX, which runs on top of IPX,
-	  is similar to TCP.  There is also experimental support for SPX in
-	  Linux (see "SPX networking", below).
-
-	  To turn your Linux box into a fully featured NetWare file server and
-	  IPX router, say Y here and fetch either lwared from
-	  <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
-	  mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
-	  information, read the IPX-HOWTO available from
-	  <http://www.tldp.org/docs.html#howto>.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  The IPX driver would enlarge your kernel by about 16 KB. To compile
-	  this driver as a module, choose M here: the module will be called ipx.
-	  Unless you want to integrate your Linux box with a local Novell
-	  network, say N.
-
 source "net/ipx/Kconfig"
-
-config ATALK
-	tristate "Appletalk protocol support"
-	select LLC
-	---help---
-	  AppleTalk is the protocol that Apple computers can use to communicate
-	  on a network.  If your Linux box is connected to such a network and you
-	  wish to connect to it, say Y.  You will need to use the netatalk package
-	  so that your Linux box can act as a print and file server for Macs as
-	  well as access AppleTalk printers.  Check out
-	  <http://www.zettabyte.net/netatalk/> on the WWW for details.
-	  EtherTalk is the name used for AppleTalk over Ethernet and the
-	  cheaper and slower LocalTalk is AppleTalk over a proprietary Apple
-	  network using serial links.  EtherTalk and LocalTalk are fully
-	  supported by Linux.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.  The
-	  NET-3-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, contains valuable
-	  information as well.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called appletalk. You almost certainly want to compile it as a
-	  module so you can restart your AppleTalk stack without rebooting
-	  your machine. I hear that the GNU boycott of Apple is over, so
-	  even politically correct people are allowed to say Y here.
-
 source "drivers/net/appletalk/Kconfig"
-
-config X25
-	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  X.25 is a set of standardized network protocols, similar in scope to
-	  frame relay; the one physical line from your box to the X.25 network
-	  entry point can carry several logical point-to-point connections
-	  (called "virtual circuits") to other computers connected to the X.25
-	  network. Governments, banks, and other organizations tend to use it
-	  to connect to each other or to form Wide Area Networks (WANs). Many
-	  countries have public X.25 networks. X.25 consists of two
-	  protocols: the higher level Packet Layer Protocol (PLP) (say Y here
-	  if you want that) and the lower level data link layer protocol LAPB
-	  (say Y to "LAPB Data Link Driver" below if you want that).
-
-	  You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
-	  <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
-	  Information about X.25 for Linux is contained in the files
-	  <file:Documentation/networking/x25.txt> and
-	  <file:Documentation/networking/x25-iface.txt>.
-
-	  One connects to an X.25 network either with a dedicated network card
-	  using the X.21 protocol (not yet supported by Linux) or one can do
-	  X.25 over a standard telephone line using an ordinary modem (say Y
-	  to "X.25 async driver" below) or over Ethernet using an ordinary
-	  Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
-	  Driver" and "LAPB over Ethernet driver" below).
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called x25. If unsure, say N.
-
-config LAPB
-	tristate "LAPB Data Link Driver (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
-	  the lower) part of the X.25 protocol. It offers a reliable
-	  connection service to exchange data frames with one other host, and
-	  it is used to transport higher level protocols (mostly X.25 Packet
-	  Layer, the higher part of X.25, but others are possible as well).
-	  Usually, LAPB is used with specialized X.21 network cards, but Linux
-	  currently supports LAPB only over Ethernet connections. If you want
-	  to use LAPB connections over Ethernet, say Y here and to "LAPB over
-	  Ethernet driver" below. Read
-	  <file:Documentation/networking/lapb-module.txt> for technical
-	  details.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called lapb. If unsure, say N.
+source "net/x25/Kconfig"
+source "net/lapb/Kconfig"
 
 config NET_DIVERT
 	bool "Frame Diverter (EXPERIMENTAL)"
@@ -496,107 +185,10 @@ config NET_DIVERT
 
 	  If unsure, say N.
 
-config ECONET
-	tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && INET
-	---help---
-	  Econet is a fairly old and slow networking protocol mainly used by
-	  Acorn computers to access file and print servers. It uses native
-	  Econet network cards. AUN is an implementation of the higher level
-	  parts of Econet that runs over ordinary Ethernet connections, on
-	  top of the UDP packet protocol, which in turn runs on top of the
-	  Internet protocol IP.
-
-	  If you say Y here, you can choose with the next two options whether
-	  to send Econet/AUN traffic over a UDP Ethernet connection or over
-	  a native Econet network card.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called econet.
-
-config ECONET_AUNUDP
-	bool "AUN over UDP"
-	depends on ECONET
-	help
-	  Say Y here if you want to send Econet/AUN traffic over a UDP
-	  connection (UDP is a packet based protocol that runs on top of the
-	  Internet protocol IP) using an ordinary Ethernet network card.
-
-config ECONET_NATIVE
-	bool "Native Econet"
-	depends on ECONET
-	help
-	  Say Y here if you have a native Econet network card installed in
-	  your computer.
-
-config WAN_ROUTER
-	tristate "WAN router"
-	depends on EXPERIMENTAL
-	---help---
-	  Wide Area Networks (WANs), such as X.25, frame relay and leased
-	  lines, are used to interconnect Local Area Networks (LANs) over vast
-	  distances with data transfer rates significantly higher than those
-	  achievable with commonly used asynchronous modem connections.
-	  Usually, a quite expensive external device called a `WAN router' is
-	  needed to connect to a WAN.
-
-	  As an alternative, WAN routing can be built into the Linux kernel.
-	  With relatively inexpensive WAN interface cards available on the
-	  market, a perfectly usable router can be built for less than half
-	  the price of an external router. If you have one of those cards and
-	  wish to use your Linux box as a WAN router, say Y here and also to
-	  the WAN driver for your card, below. You will then need the
-	  wan-tools package which is available from <ftp://ftp.sangoma.com/>.
-	  Read <file:Documentation/networking/wan-router.txt> for more
-	  information.
-
-	  To compile WAN routing support as a module, choose M here: the
-	  module will be called wanrouter.
-
-	  If unsure, say N.
-
-menu "QoS and/or fair queueing"
-
-config NET_SCHED
-	bool "QoS and/or fair queueing"
-	---help---
-	  When the kernel has several packets to send out over a network
-	  device, it has to decide which ones to send first, which ones to
-	  delay, and which ones to drop. This is the job of the packet
-	  scheduler, and several different algorithms for how to do this
-	  "fairly" have been proposed.
-
-	  If you say N here, you will get the standard packet scheduler, which
-	  is a FIFO (first come, first served). If you say Y here, you will be
-	  able to choose from among several alternative algorithms which can
-	  then be attached to different network devices. This is useful for
-	  example if some of your network devices are real time devices that
-	  need a certain minimum data flow rate, or if you need to limit the
-	  maximum data flow rate for traffic which matches specified criteria.
-	  This code is considered to be experimental.
-
-	  To administer these schedulers, you'll need the user-level utilities
-	  from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
-	  That package also contains some documentation; for more, check out
-	  <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
-
-	  This Quality of Service (QoS) support will enable you to use
-	  Differentiated Services (diffserv) and Resource Reservation Protocol
-	  (RSVP) on your Linux router if you also say Y to "QoS support",
-	  "Packet classifier API" and to some classifiers below.  Documentation
-	  and software is at <http://diffserv.sourceforge.net/>.
-
-	  If you say Y here and to "/proc file system" below, you will be able
-	  to read status information about packet schedulers from the file
-	  /proc/net/psched.
-
-	  The available schedulers are listed in the following questions; you
-	  can say Y to as many as you like. If unsure, say N now.
-
+source "net/econet/Kconfig"
+source "net/wanrouter/Kconfig"
 source "net/sched/Kconfig"
 
-endmenu
-
 menu "Network testing"
 
 config NET_PKTGEN
@@ -614,33 +206,17 @@ config NET_PKTGEN
 	  To compile this code as a module, choose M here: the
 	  module will be called pktgen.
 
-endmenu
+source "net/netfilter/Kconfig"
 
 endmenu
 
-config NETPOLL
-	def_bool NETCONSOLE
-
-config NETPOLL_RX
-	bool "Netpoll support for trapping incoming packets"
-	default n
-	depends on NETPOLL
-
-config NETPOLL_TRAP
-	bool "Netpoll traffic trapping"
-	default n
-	depends on NETPOLL
-
-config NET_POLL_CONTROLLER
-	def_bool NETPOLL
+endmenu
 
 source "net/ax25/Kconfig"
-
 source "net/irda/Kconfig"
-
 source "net/bluetooth/Kconfig"
+source "net/ieee80211/Kconfig"
 
-source "drivers/net/Kconfig"
-
-endmenu
+endif # if NET
+endmenu # Networking
 
diff --git a/net/Makefile b/net/Makefile
index 8e2bdc025ab8..4aa2f46d2a56 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_NET) += $(tmp-y)
 obj-$(CONFIG_LLC)		+= llc/
 obj-$(CONFIG_NET)		+= ethernet/ 802/ sched/ netlink/
 obj-$(CONFIG_INET)		+= ipv4/
+obj-$(CONFIG_NETFILTER)		+= netfilter/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
 ifneq ($(CONFIG_IPV6),)
@@ -41,7 +42,9 @@ obj-$(CONFIG_ATM) += atm/
 obj-$(CONFIG_DECNET)		+= decnet/
 obj-$(CONFIG_ECONET)		+= econet/
 obj-$(CONFIG_VLAN_8021Q)	+= 8021q/
+obj-$(CONFIG_IP_DCCP)		+= dccp/
 obj-$(CONFIG_IP_SCTP)		+= sctp/
+obj-$(CONFIG_IEEE80211)		+= ieee80211/
 
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index c34614ea5fce..7076097debc2 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -698,7 +698,7 @@ static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a,
  *	frame. We currently only support Ethernet.
  */
 static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
-		    struct packet_type *pt)
+		    struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct elapaarp *ea = aarp_hdr(skb);
 	int hash, ret = 0;
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 192b529f86a4..1d31b3a3f1e5 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -53,12 +53,12 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
-#include <linux/tcp.h>
 #include <linux/if_arp.h>
 #include <linux/termios.h>	/* For TIOCOUTQ/INQ */
 #include <net/datalink.h>
 #include <net/psnap.h>
 #include <net/sock.h>
+#include <net/tcp_states.h>
 #include <net/route.h>
 #include <linux/atalk.h>
 
@@ -1390,7 +1390,7 @@ free_it:
  *	[ie ARPHRD_ETHERTALK]
  */
 static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
-		     struct packet_type *pt)
+		     struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct ddpehdr *ddp;
 	struct sock *sock;
@@ -1482,7 +1482,7 @@ freeit:
  *	header and append a long one.
  */
 static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
-		     struct packet_type *pt)
+		     struct packet_type *pt, struct net_device *orig_dev)
 {
 	/* Expand any short form frames */
 	if (skb->mac.raw[2] == 1) {
@@ -1528,7 +1528,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
 	}
 	skb->h.raw = skb->data;
 
-	return atalk_rcv(skb, dev, pt);
+	return atalk_rcv(skb, dev, pt, orig_dev);
 freeit:
 	kfree_skb(skb);
 	return 0;
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
new file mode 100644
index 000000000000..21ff276b2d80
--- /dev/null
+++ b/net/atm/Kconfig
@@ -0,0 +1,74 @@
+#
+# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)
+#
+
+config ATM
+	tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  ATM is a high-speed networking technology for Local Area Networks
+	  and Wide Area Networks.  It uses a fixed packet size and is
+	  connection oriented, allowing for the negotiation of minimum
+	  bandwidth requirements.
+
+	  In order to participate in an ATM network, your Linux box needs an
+	  ATM networking card. If you have that, say Y here and to the driver
+	  of your ATM card below.
+
+	  Note that you need a set of user-space programs to actually make use
+	  of ATM. See the file <file:Documentation/networking/atm.txt> for
+	  further details.
+
+config ATM_CLIP
+	tristate "Classical IP over ATM (EXPERIMENTAL)"
+	depends on ATM && INET
+	help
+	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
+	  ATMARP. If you want to communication with other IP hosts on your ATM
+	  network, you will typically either say Y here or to "LAN Emulation
+	  (LANE)" below.
+
+config ATM_CLIP_NO_ICMP
+	bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
+	depends on ATM_CLIP
+	help
+	  Normally, an "ICMP host unreachable" message is sent if a neighbour
+	  cannot be reached because there is no VC to it in the kernel's
+	  ATMARP table. This may cause problems when ATMARP table entries are
+	  briefly removed during revalidation. If you say Y here, packets to
+	  such neighbours are silently discarded instead.
+
+config ATM_LANE
+	tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
+	depends on ATM
+	help
+	  LAN Emulation emulates services of existing LANs across an ATM
+	  network. Besides operating as a normal ATM end station client, Linux
+	  LANE client can also act as an proxy client bridging packets between
+	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
+
+config ATM_MPOA
+	tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
+	depends on ATM && INET && ATM_LANE!=n
+	help
+	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
+	  bridges and ATM attached hosts establish direct ATM VCs across
+	  subnetwork boundaries. These shortcut connections bypass routers
+	  enhancing overall network performance.
+
+config ATM_BR2684
+	tristate "RFC1483/2684 Bridged protocols"
+	depends on ATM && INET
+	help
+	  ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483)
+	  This device will act like an ethernet from the kernels point of view,
+	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
+	  This is sometimes used over DSL lines. If in doubt, say N.
+
+config ATM_BR2684_IPFILTER
+	bool "Per-VC IP filter kludge"
+	depends on ATM_BR2684
+	help
+	  This is an experimental mechanism for users who need to terminate a
+	  large number of IP-only vcc's. Do not enable this unless you are sure
+	  you know what you are doing.
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index e6954cf1459d..289956c4dd3e 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -289,8 +289,7 @@ xmit will add the additional header part in that case */
289 * This is similar to eth_type_trans, which cannot be used because of 289 * This is similar to eth_type_trans, which cannot be used because of
290 * our dev->hard_header_len 290 * our dev->hard_header_len
291 */ 291 */
292static inline unsigned short br_type_trans(struct sk_buff *skb, 292static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev)
293 struct net_device *dev)
294{ 293{
295 struct ethhdr *eth; 294 struct ethhdr *eth;
296 unsigned char *rawp; 295 unsigned char *rawp;
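[Editorial note] The substantive change in this hunk is the return type: EtherType values travel in network byte order, and the __be16 annotation lets sparse flag accidental host-order arithmetic on them. Below is a minimal userspace sketch of that discipline, with a plain typedef standing in for the kernel's __be16 and a hypothetical classify() helper.

/* Sketch under stated assumptions: be16 stands in for __be16, and
 * classify() is an illustrative analog of br_type_trans(). */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

typedef uint16_t be16;   /* stand-in for the kernel's sparse-checked __be16 */

/* Hypothetical classifier: return the on-wire EtherType unmodified. */
static be16 classify(const uint8_t hdr[2])
{
    be16 proto;
    memcpy(&proto, hdr, sizeof(proto));  /* value stays big-endian */
    return proto;
}

int main(void)
{
    const uint8_t ip_hdr[2] = { 0x08, 0x00 };  /* ETH_P_IP on the wire */
    be16 proto = classify(ip_hdr);
    /* Convert exactly once, at the comparison boundary. */
    printf("IPv4? %s\n", ntohs(proto) == 0x0800 ? "yes" : "no");
    return 0;
}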
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
index 4dbb5af34a5e..d89056ec44d4 100644
--- a/net/atm/ioctl.c
+++ b/net/atm/ioctl.c
@@ -21,6 +21,7 @@
21 21
22#include "resources.h" 22#include "resources.h"
23#include "signaling.h" /* for WAITING and sigd_attach */ 23#include "signaling.h" /* for WAITING and sigd_attach */
24#include "common.h"
24 25
25 26
26static DECLARE_MUTEX(ioctl_mutex); 27static DECLARE_MUTEX(ioctl_mutex);
diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c
index 181a3002d8ad..4b1faca5013f 100644
--- a/net/atm/ipcommon.c
+++ b/net/atm/ipcommon.c
@@ -34,7 +34,6 @@
34 34
35void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) 35void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
36{ 36{
37 struct sk_buff *skb;
38 unsigned long flags; 37 unsigned long flags;
39 struct sk_buff *skb_from = (struct sk_buff *) from; 38 struct sk_buff *skb_from = (struct sk_buff *) from;
40 struct sk_buff *skb_to = (struct sk_buff *) to; 39 struct sk_buff *skb_to = (struct sk_buff *) to;
@@ -47,8 +46,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
47 prev->next = skb_to; 46 prev->next = skb_to;
48 to->prev->next = from->next; 47 to->prev->next = from->next;
49 to->prev = from->prev; 48 to->prev = from->prev;
50 for (skb = from->next; skb != skb_to; skb = skb->next)
51 skb->list = to;
52 to->qlen += from->qlen; 49 to->qlen += from->qlen;
53 spin_unlock(&to->lock); 50 spin_unlock(&to->lock);
54 from->prev = skb_from; 51 from->prev = skb_from;
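[Editorial note] The deleted loop walked every queued skb just to repoint its list back-reference; with that field gone from sk_buff, the splice becomes constant-time pointer surgery (and it is also why helpers like skb_append() take the queue as an explicit argument in the ax25_subr.c hunk further down). A hedged userspace sketch of the same O(1) move, with illustrative node/list types rather than the kernel's:

/* Toy model of the splice skb_migrate() now performs: four pointer
 * writes plus a length update, no per-node loop. The head doubles as a
 * sentinel node, as sk_buff_head does. */
#include <stdio.h>

struct node { struct node *next, *prev; int val; };
struct list { struct node *next, *prev; int qlen; };

static void list_init(struct list *l)
{
    l->next = l->prev = (struct node *)l;   /* sentinel trick */
    l->qlen = 0;
}

static void push(struct list *l, struct node *n)
{
    n->next = (struct node *)l;
    n->prev = l->prev;
    l->prev->next = n;
    l->prev = n;
    l->qlen++;
}

/* Move everything from 'from' to the tail of 'to' in O(1). */
static void migrate(struct list *from, struct list *to)
{
    if (from->qlen == 0)
        return;
    to->prev->next = from->next;
    from->next->prev = to->prev;
    to->prev = from->prev;
    from->prev->next = (struct node *)to;
    to->qlen += from->qlen;
    list_init(from);                 /* leave the source empty but valid */
}

int main(void)
{
    struct list a, b;
    struct node n1 = { .val = 1 }, n2 = { .val = 2 };
    list_init(&a); list_init(&b);
    push(&a, &n1); push(&a, &n2);
    migrate(&a, &b);
    printf("b.qlen=%d first=%d\n", b.qlen, b.next->val);  /* 2 and 1 */
    return 0;
}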
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 02f5374a51f2..08e46052a3e4 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -118,10 +118,6 @@ static int svc_bind(struct socket *sock,struct sockaddr *sockaddr,
118 goto out; 118 goto out;
119 } 119 }
120 vcc = ATM_SD(sock); 120 vcc = ATM_SD(sock);
121 if (test_bit(ATM_VF_SESSION, &vcc->flags)) {
122 error = -EINVAL;
123 goto out;
124 }
125 addr = (struct sockaddr_atmsvc *) sockaddr; 121 addr = (struct sockaddr_atmsvc *) sockaddr;
126 if (addr->sas_family != AF_ATMSVC) { 122 if (addr->sas_family != AF_ATMSVC) {
127 error = -EAFNOSUPPORT; 123 error = -EAFNOSUPPORT;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 707097deac3d..ea43dfb774e2 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -45,7 +45,7 @@
45#include <linux/sysctl.h> 45#include <linux/sysctl.h>
46#include <linux/init.h> 46#include <linux/init.h>
47#include <linux/spinlock.h> 47#include <linux/spinlock.h>
48#include <net/tcp.h> 48#include <net/tcp_states.h>
49#include <net/ip.h> 49#include <net/ip.h>
50#include <net/arp.h> 50#include <net/arp.h>
51 51
@@ -875,12 +875,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
875 sk->sk_sndbuf = osk->sk_sndbuf; 875 sk->sk_sndbuf = osk->sk_sndbuf;
876 sk->sk_state = TCP_ESTABLISHED; 876 sk->sk_state = TCP_ESTABLISHED;
877 sk->sk_sleep = osk->sk_sleep; 877 sk->sk_sleep = osk->sk_sleep;
878 878 sock_copy_flags(sk, osk);
879 if (sock_flag(osk, SOCK_DBG))
880 sock_set_flag(sk, SOCK_DBG);
881
882 if (sock_flag(osk, SOCK_ZAPPED))
883 sock_set_flag(sk, SOCK_ZAPPED);
884 879
885 oax25 = ax25_sk(osk); 880 oax25 = ax25_sk(osk);
886 881
@@ -1007,7 +1002,8 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1007 struct sock *sk = sock->sk; 1002 struct sock *sk = sock->sk;
1008 struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; 1003 struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
1009 ax25_dev *ax25_dev = NULL; 1004 ax25_dev *ax25_dev = NULL;
1010 ax25_address *call; 1005 ax25_uid_assoc *user;
1006 ax25_address call;
1011 ax25_cb *ax25; 1007 ax25_cb *ax25;
1012 int err = 0; 1008 int err = 0;
1013 1009
@@ -1026,9 +1022,15 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1026 if (addr->fsa_ax25.sax25_family != AF_AX25) 1022 if (addr->fsa_ax25.sax25_family != AF_AX25)
1027 return -EINVAL; 1023 return -EINVAL;
1028 1024
1029 call = ax25_findbyuid(current->euid); 1025 user = ax25_findbyuid(current->euid);
1030 if (call == NULL && ax25_uid_policy && !capable(CAP_NET_ADMIN)) { 1026 if (user) {
1031 return -EACCES; 1027 call = user->call;
1028 ax25_uid_put(user);
1029 } else {
1030 if (ax25_uid_policy && !capable(CAP_NET_ADMIN))
1031 return -EACCES;
1032
1033 call = addr->fsa_ax25.sax25_call;
1032 } 1034 }
1033 1035
1034 lock_sock(sk); 1036 lock_sock(sk);
@@ -1039,10 +1041,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1039 goto out; 1041 goto out;
1040 } 1042 }
1041 1043
1042 if (call == NULL) 1044 ax25->source_addr = call;
1043 ax25->source_addr = addr->fsa_ax25.sax25_call;
1044 else
1045 ax25->source_addr = *call;
1046 1045
1047 /* 1046 /*
1048 * User already set interface with SO_BINDTODEVICE 1047 * User already set interface with SO_BINDTODEVICE
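[Editorial note] Both this hunk and the ax25_route.c change below follow the same new contract: ax25_findbyuid() now returns a held (refcounted) ax25_uid_assoc, so the caller copies the callsign out and drops the reference, instead of keeping a bare pointer into the table. A toy userspace rendition of that lookup-hold-copy-put pattern; names and types are illustrative, not the kernel's:

/* Hedged sketch of the get/put discipline around the new lookup. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct assoc {
    int refcount;
    unsigned uid;
    char call[8];
};

static struct assoc *table[4];          /* toy table, NULL-terminated */

static struct assoc *find_by_uid_hold(unsigned uid)
{
    for (int i = 0; table[i]; i++)
        if (table[i]->uid == uid) {
            table[i]->refcount++;       /* kernel: ax25_uid_hold() */
            return table[i];
        }
    return NULL;
}

static void assoc_put(struct assoc *a)
{
    if (--a->refcount == 0)             /* kernel: ax25_uid_put() */
        free(a);
}

int main(void)
{
    struct assoc *a = malloc(sizeof(*a));
    *a = (struct assoc){ .refcount = 1, .uid = 1000 };
    strcpy(a->call, "N0CALL");
    table[0] = a;

    struct assoc *user = find_by_uid_hold(1000);
    char call[8] = "DEFAULT";
    if (user) {
        strcpy(call, user->call);       /* copy while the ref is held */
        assoc_put(user);                /* then drop it */
    }
    printf("bound call: %s\n", call);
    return 0;
}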
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 8adc0022cf58..edcaa897027c 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -22,8 +22,7 @@
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/skbuff.h> 23#include <linux/skbuff.h>
24#include <net/sock.h> 24#include <net/sock.h>
25#include <net/ip.h> /* For ip_rcv */ 25#include <net/tcp_states.h>
26#include <net/tcp.h>
27#include <asm/uaccess.h> 26#include <asm/uaccess.h>
28#include <asm/system.h> 27#include <asm/system.h>
29#include <linux/fcntl.h> 28#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 3a8b67316fc3..061083efc1dc 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -18,7 +18,7 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/sockios.h> 19#include <linux/sockios.h>
20#include <linux/net.h> 20#include <linux/net.h>
21#include <net/tcp.h> 21#include <net/tcp_states.h>
22#include <net/ax25.h> 22#include <net/ax25.h>
23#include <linux/inet.h> 23#include <linux/inet.h>
24#include <linux/netdevice.h> 24#include <linux/netdevice.h>
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index 3dc808fde33f..810c9c76c2e0 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -9,7 +9,6 @@
9 * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) 9 * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
10 * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) 10 * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
11 */ 11 */
12#include <linux/config.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/types.h> 13#include <linux/types.h>
15#include <linux/socket.h> 14#include <linux/socket.h>
@@ -26,9 +25,7 @@
26#include <linux/skbuff.h> 25#include <linux/skbuff.h>
27#include <linux/netfilter.h> 26#include <linux/netfilter.h>
28#include <net/sock.h> 27#include <net/sock.h>
29#include <net/ip.h> /* For ip_rcv */ 28#include <net/tcp_states.h>
30#include <net/tcp.h>
31#include <net/arp.h> /* For arp_rcv */
32#include <asm/uaccess.h> 29#include <asm/uaccess.h>
33#include <asm/system.h> 30#include <asm/system.h>
34#include <linux/fcntl.h> 31#include <linux/fcntl.h>
@@ -114,7 +111,6 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
114 111
115 pid = *skb->data; 112 pid = *skb->data;
116 113
117#ifdef CONFIG_INET
118 if (pid == AX25_P_IP) { 114 if (pid == AX25_P_IP) {
119 /* working around a TCP bug to keep additional listeners 115 /* working around a TCP bug to keep additional listeners
120 * happy. TCP re-uses the buffer and destroys the original 116 * happy. TCP re-uses the buffer and destroys the original
@@ -132,10 +128,9 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
132 skb->dev = ax25->ax25_dev->dev; 128 skb->dev = ax25->ax25_dev->dev;
133 skb->pkt_type = PACKET_HOST; 129 skb->pkt_type = PACKET_HOST;
134 skb->protocol = htons(ETH_P_IP); 130 skb->protocol = htons(ETH_P_IP);
135 ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */ 131 netif_rx(skb);
136 return 1; 132 return 1;
137 } 133 }
138#endif
139 if (pid == AX25_P_SEGMENT) { 134 if (pid == AX25_P_SEGMENT) {
140 skb_pull(skb, 1); /* Remove PID */ 135 skb_pull(skb, 1); /* Remove PID */
141 return ax25_rx_fragment(ax25, skb); 136 return ax25_rx_fragment(ax25, skb);
@@ -250,7 +245,6 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
250 245
251 /* Now we are pointing at the pid byte */ 246 /* Now we are pointing at the pid byte */
252 switch (skb->data[1]) { 247 switch (skb->data[1]) {
253#ifdef CONFIG_INET
254 case AX25_P_IP: 248 case AX25_P_IP:
255 skb_pull(skb,2); /* drop PID/CTRL */ 249 skb_pull(skb,2); /* drop PID/CTRL */
256 skb->h.raw = skb->data; 250 skb->h.raw = skb->data;
@@ -258,7 +252,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
258 skb->dev = dev; 252 skb->dev = dev;
259 skb->pkt_type = PACKET_HOST; 253 skb->pkt_type = PACKET_HOST;
260 skb->protocol = htons(ETH_P_IP); 254 skb->protocol = htons(ETH_P_IP);
261 ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */ 255 netif_rx(skb);
262 break; 256 break;
263 257
264 case AX25_P_ARP: 258 case AX25_P_ARP:
@@ -268,9 +262,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
268 skb->dev = dev; 262 skb->dev = dev;
269 skb->pkt_type = PACKET_HOST; 263 skb->pkt_type = PACKET_HOST;
270 skb->protocol = htons(ETH_P_ARP); 264 skb->protocol = htons(ETH_P_ARP);
271 arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */ 265 netif_rx(skb);
272 break; 266 break;
273#endif
274 case AX25_P_TEXT: 267 case AX25_P_TEXT:
275 /* Now find a suitable dgram socket */ 268 /* Now find a suitable dgram socket */
276 sk = ax25_get_socket(&dest, &src, SOCK_DGRAM); 269 sk = ax25_get_socket(&dest, &src, SOCK_DGRAM);
@@ -454,7 +447,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
454 * Receive an AX.25 frame via a SLIP interface. 447 * Receive an AX.25 frame via a SLIP interface.
455 */ 448 */
456int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, 449int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
457 struct packet_type *ptype) 450 struct packet_type *ptype, struct net_device *orig_dev)
458{ 451{
459 skb->sk = NULL; /* Initially we don't know who it's for */ 452 skb->sk = NULL; /* Initially we don't know who it's for */
460 skb->destructor = NULL; /* Who initializes this, dammit?! */ 453 skb->destructor = NULL; /* Who initializes this, dammit?! */
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 44b99b1ff9f8..c288526da4ce 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -422,8 +422,8 @@ static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
422 */ 422 */
423int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) 423int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
424{ 424{
425 ax25_uid_assoc *user;
425 ax25_route *ax25_rt; 426 ax25_route *ax25_rt;
426 ax25_address *call;
427 int err; 427 int err;
428 428
429 if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) 429 if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
@@ -434,16 +434,18 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
434 goto put; 434 goto put;
435 } 435 }
436 436
437 if ((call = ax25_findbyuid(current->euid)) == NULL) { 437 user = ax25_findbyuid(current->euid);
438 if (user) {
439 ax25->source_addr = user->call;
440 ax25_uid_put(user);
441 } else {
438 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { 442 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
439 err = -EPERM; 443 err = -EPERM;
440 goto put; 444 goto put;
441 } 445 }
442 call = (ax25_address *)ax25->ax25_dev->dev->dev_addr; 446 ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
443 } 447 }
444 448
445 ax25->source_addr = *call;
446
447 if (ax25_rt->digipeat != NULL) { 449 if (ax25_rt->digipeat != NULL) {
448 if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { 450 if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
449 err = -ENOMEM; 451 err = -ENOMEM;
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index 7131873322c4..f6ed283e9de8 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -29,8 +29,7 @@
29#include <linux/netdevice.h> 29#include <linux/netdevice.h>
30#include <linux/skbuff.h> 30#include <linux/skbuff.h>
31#include <net/sock.h> 31#include <net/sock.h>
32#include <net/ip.h> /* For ip_rcv */ 32#include <net/tcp_states.h>
33#include <net/tcp.h>
34#include <asm/uaccess.h> 33#include <asm/uaccess.h>
35#include <asm/system.h> 34#include <asm/system.h>
36#include <linux/fcntl.h> 35#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 066897bc0749..a29c480a4dc1 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -24,7 +24,7 @@
24#include <linux/netdevice.h> 24#include <linux/netdevice.h>
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <net/tcp.h> 27#include <net/tcp_states.h>
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <asm/system.h> 29#include <asm/system.h>
30#include <linux/fcntl.h> 30#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 99694b57f6f5..c41dbe5fadee 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -24,7 +24,7 @@
24#include <linux/netdevice.h> 24#include <linux/netdevice.h>
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <net/tcp.h> 27#include <net/tcp_states.h>
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <asm/system.h> 29#include <asm/system.h>
30#include <linux/fcntl.h> 30#include <linux/fcntl.h>
@@ -76,7 +76,7 @@ void ax25_requeue_frames(ax25_cb *ax25)
76 if (skb_prev == NULL) 76 if (skb_prev == NULL)
77 skb_queue_head(&ax25->write_queue, skb); 77 skb_queue_head(&ax25->write_queue, skb);
78 else 78 else
79 skb_append(skb_prev, skb); 79 skb_append(skb_prev, skb, &ax25->write_queue);
80 skb_prev = skb; 80 skb_prev = skb;
81 } 81 }
82} 82}
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index cea6b7d19729..a8b3822f3ee4 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -28,6 +28,7 @@
28#include <linux/fcntl.h> 28#include <linux/fcntl.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/interrupt.h> 30#include <linux/interrupt.h>
31#include <linux/list.h>
31#include <linux/notifier.h> 32#include <linux/notifier.h>
32#include <linux/proc_fs.h> 33#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 34#include <linux/seq_file.h>
@@ -41,38 +42,41 @@
41 * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines. 42 * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines.
42 */ 43 */
43 44
44static ax25_uid_assoc *ax25_uid_list; 45HLIST_HEAD(ax25_uid_list);
45static DEFINE_RWLOCK(ax25_uid_lock); 46static DEFINE_RWLOCK(ax25_uid_lock);
46 47
47int ax25_uid_policy = 0; 48int ax25_uid_policy = 0;
48 49
49ax25_address *ax25_findbyuid(uid_t uid) 50ax25_uid_assoc *ax25_findbyuid(uid_t uid)
50{ 51{
51 ax25_uid_assoc *ax25_uid; 52 ax25_uid_assoc *ax25_uid, *res = NULL;
52 ax25_address *res = NULL; 53 struct hlist_node *node;
53 54
54 read_lock(&ax25_uid_lock); 55 read_lock(&ax25_uid_lock);
55 for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { 56 ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
56 if (ax25_uid->uid == uid) { 57 if (ax25_uid->uid == uid) {
57 res = &ax25_uid->call; 58 ax25_uid_hold(ax25_uid);
59 res = ax25_uid;
58 break; 60 break;
59 } 61 }
60 } 62 }
61 read_unlock(&ax25_uid_lock); 63 read_unlock(&ax25_uid_lock);
62 64
63 return NULL; 65 return res;
64} 66}
65 67
66int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) 68int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
67{ 69{
68 ax25_uid_assoc *s, *ax25_uid; 70 ax25_uid_assoc *ax25_uid;
71 struct hlist_node *node;
72 ax25_uid_assoc *user;
69 unsigned long res; 73 unsigned long res;
70 74
71 switch (cmd) { 75 switch (cmd) {
72 case SIOCAX25GETUID: 76 case SIOCAX25GETUID:
73 res = -ENOENT; 77 res = -ENOENT;
74 read_lock(&ax25_uid_lock); 78 read_lock(&ax25_uid_lock);
75 for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { 79 ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
76 if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { 80 if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
77 res = ax25_uid->uid; 81 res = ax25_uid->uid;
78 break; 82 break;
@@ -85,19 +89,22 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
85 case SIOCAX25ADDUID: 89 case SIOCAX25ADDUID:
86 if (!capable(CAP_NET_ADMIN)) 90 if (!capable(CAP_NET_ADMIN))
87 return -EPERM; 91 return -EPERM;
88 if (ax25_findbyuid(sax->sax25_uid)) 92 user = ax25_findbyuid(sax->sax25_uid);
93 if (user) {
94 ax25_uid_put(user);
89 return -EEXIST; 95 return -EEXIST;
96 }
90 if (sax->sax25_uid == 0) 97 if (sax->sax25_uid == 0)
91 return -EINVAL; 98 return -EINVAL;
92 if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL) 99 if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL)
93 return -ENOMEM; 100 return -ENOMEM;
94 101
102 atomic_set(&ax25_uid->refcount, 1);
95 ax25_uid->uid = sax->sax25_uid; 103 ax25_uid->uid = sax->sax25_uid;
96 ax25_uid->call = sax->sax25_call; 104 ax25_uid->call = sax->sax25_call;
97 105
98 write_lock(&ax25_uid_lock); 106 write_lock(&ax25_uid_lock);
99 ax25_uid->next = ax25_uid_list; 107 hlist_add_head(&ax25_uid->uid_node, &ax25_uid_list);
100 ax25_uid_list = ax25_uid;
101 write_unlock(&ax25_uid_lock); 108 write_unlock(&ax25_uid_lock);
102 109
103 return 0; 110 return 0;
@@ -106,34 +113,21 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
106 if (!capable(CAP_NET_ADMIN)) 113 if (!capable(CAP_NET_ADMIN))
107 return -EPERM; 114 return -EPERM;
108 115
116 ax25_uid = NULL;
109 write_lock(&ax25_uid_lock); 117 write_lock(&ax25_uid_lock);
110 for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { 118 ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
111 if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { 119 if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0)
112 break; 120 break;
113 }
114 } 121 }
115 if (ax25_uid == NULL) { 122 if (ax25_uid == NULL) {
116 write_unlock(&ax25_uid_lock); 123 write_unlock(&ax25_uid_lock);
117 return -ENOENT; 124 return -ENOENT;
118 } 125 }
119 if ((s = ax25_uid_list) == ax25_uid) { 126 hlist_del_init(&ax25_uid->uid_node);
120 ax25_uid_list = s->next; 127 ax25_uid_put(ax25_uid);
121 write_unlock(&ax25_uid_lock);
122 kfree(ax25_uid);
123 return 0;
124 }
125 while (s != NULL && s->next != NULL) {
126 if (s->next == ax25_uid) {
127 s->next = ax25_uid->next;
128 write_unlock(&ax25_uid_lock);
129 kfree(ax25_uid);
130 return 0;
131 }
132 s = s->next;
133 }
134 write_unlock(&ax25_uid_lock); 128 write_unlock(&ax25_uid_lock);
135 129
136 return -ENOENT; 130 return 0;
137 131
138 default: 132 default:
139 return -EINVAL; 133 return -EINVAL;
@@ -147,13 +141,11 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
147static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos) 141static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
148{ 142{
149 struct ax25_uid_assoc *pt; 143 struct ax25_uid_assoc *pt;
150 int i = 1; 144 struct hlist_node *node;
145 int i = 0;
151 146
152 read_lock(&ax25_uid_lock); 147 read_lock(&ax25_uid_lock);
153 if (*pos == 0) 148 ax25_uid_for_each(pt, node, &ax25_uid_list) {
154 return SEQ_START_TOKEN;
155
156 for (pt = ax25_uid_list; pt != NULL; pt = pt->next) {
157 if (i == *pos) 149 if (i == *pos)
158 return pt; 150 return pt;
159 ++i; 151 ++i;
@@ -164,8 +156,9 @@ static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
164static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos) 156static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos)
165{ 157{
166 ++*pos; 158 ++*pos;
167 return (v == SEQ_START_TOKEN) ? ax25_uid_list : 159
168 ((struct ax25_uid_assoc *) v)->next; 160 return hlist_entry(((ax25_uid_assoc *)v)->uid_node.next,
161 ax25_uid_assoc, uid_node);
169} 162}
170 163
171static void ax25_uid_seq_stop(struct seq_file *seq, void *v) 164static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
@@ -179,7 +172,6 @@ static int ax25_uid_seq_show(struct seq_file *seq, void *v)
179 seq_printf(seq, "Policy: %d\n", ax25_uid_policy); 172 seq_printf(seq, "Policy: %d\n", ax25_uid_policy);
180 else { 173 else {
181 struct ax25_uid_assoc *pt = v; 174 struct ax25_uid_assoc *pt = v;
182
183 175
184 seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(&pt->call)); 176 seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(&pt->call));
185 } 177 }
@@ -213,16 +205,13 @@ struct file_operations ax25_uid_fops = {
213 */ 205 */
214void __exit ax25_uid_free(void) 206void __exit ax25_uid_free(void)
215{ 207{
216 ax25_uid_assoc *s, *ax25_uid; 208 ax25_uid_assoc *ax25_uid;
209 struct hlist_node *node;
217 210
218 write_lock(&ax25_uid_lock); 211 write_lock(&ax25_uid_lock);
219 ax25_uid = ax25_uid_list; 212 ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
220 while (ax25_uid != NULL) { 213 hlist_del_init(&ax25_uid->uid_node);
221 s = ax25_uid; 214 ax25_uid_put(ax25_uid);
222 ax25_uid = ax25_uid->next;
223
224 kfree(s);
225 } 215 }
226 ax25_uid_list = NULL;
227 write_unlock(&ax25_uid_lock); 216 write_unlock(&ax25_uid_lock);
228} 217}
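[Editorial note] The conversion above swaps the hand-rolled singly linked list for a kernel hlist: the head is a single pointer, and each node's pprev back-link makes unlinking O(1), which is what lets this patch delete the list-walking removal loop. A miniature userspace model of those semantics, simplified from <linux/list.h>:

/* Hedged sketch of hlist mechanics; real kernel nodes embed the link
 * inside a containing struct and use hlist_entry() to recover it. */
#include <stdio.h>

struct hnode { struct hnode *next, **pprev; int uid; };
struct hhead { struct hnode *first; };

static void hlist_add_head(struct hnode *n, struct hhead *h)
{
    n->next = h->first;
    if (h->first)
        h->first->pprev = &n->next;
    h->first = n;
    n->pprev = &h->first;
}

static void hlist_del(struct hnode *n)
{
    *n->pprev = n->next;            /* unlink in O(1), no head needed */
    if (n->next)
        n->next->pprev = n->pprev;
}

int main(void)
{
    struct hhead head = { 0 };
    struct hnode a = { .uid = 1 }, b = { .uid = 2 };
    hlist_add_head(&a, &head);
    hlist_add_head(&b, &head);      /* list: b -> a */
    hlist_del(&b);                  /* list: a */
    for (struct hnode *p = head.first; p; p = p->next)
        printf("uid %d\n", p->uid);
    return 0;
}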
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index fb5524365bc2..55dc42eac92c 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -191,7 +191,7 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt)
191 191
192 /* Special commands */ 192 /* Special commands */
193 while ((skb = skb_dequeue(&hdev->driver_init))) { 193 while ((skb = skb_dequeue(&hdev->driver_init))) {
194 skb->pkt_type = HCI_COMMAND_PKT; 194 bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
195 skb->dev = (void *) hdev; 195 skb->dev = (void *) hdev;
196 skb_queue_tail(&hdev->cmd_q, skb); 196 skb_queue_tail(&hdev->cmd_q, skb);
197 hci_sched_cmd(hdev); 197 hci_sched_cmd(hdev);
@@ -299,7 +299,6 @@ struct hci_dev *hci_dev_get(int index)
299 read_unlock(&hci_dev_list_lock); 299 read_unlock(&hci_dev_list_lock);
300 return hdev; 300 return hdev;
301} 301}
302EXPORT_SYMBOL(hci_dev_get);
303 302
304/* ---- Inquiry support ---- */ 303/* ---- Inquiry support ---- */
305static void inquiry_cache_flush(struct hci_dev *hdev) 304static void inquiry_cache_flush(struct hci_dev *hdev)
@@ -996,11 +995,11 @@ static int hci_send_frame(struct sk_buff *skb)
996 return -ENODEV; 995 return -ENODEV;
997 } 996 }
998 997
999 BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len); 998 BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
1000 999
1001 if (atomic_read(&hdev->promisc)) { 1000 if (atomic_read(&hdev->promisc)) {
1002 /* Time stamp */ 1001 /* Time stamp */
1003 do_gettimeofday(&skb->stamp); 1002 __net_timestamp(skb);
1004 1003
1005 hci_send_to_sock(hdev, skb); 1004 hci_send_to_sock(hdev, skb);
1006 } 1005 }
@@ -1035,14 +1034,13 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p
1035 1034
1036 BT_DBG("skb len %d", skb->len); 1035 BT_DBG("skb len %d", skb->len);
1037 1036
1038 skb->pkt_type = HCI_COMMAND_PKT; 1037 bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
1039 skb->dev = (void *) hdev; 1038 skb->dev = (void *) hdev;
1040 skb_queue_tail(&hdev->cmd_q, skb); 1039 skb_queue_tail(&hdev->cmd_q, skb);
1041 hci_sched_cmd(hdev); 1040 hci_sched_cmd(hdev);
1042 1041
1043 return 0; 1042 return 0;
1044} 1043}
1045EXPORT_SYMBOL(hci_send_cmd);
1046 1044
1047/* Get data from the previously sent command */ 1045/* Get data from the previously sent command */
1048void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf) 1046void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf)
@@ -1083,7 +1081,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
1083 BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags); 1081 BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags);
1084 1082
1085 skb->dev = (void *) hdev; 1083 skb->dev = (void *) hdev;
1086 skb->pkt_type = HCI_ACLDATA_PKT; 1084 bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
1087 hci_add_acl_hdr(skb, conn->handle, flags | ACL_START); 1085 hci_add_acl_hdr(skb, conn->handle, flags | ACL_START);
1088 1086
1089 if (!(list = skb_shinfo(skb)->frag_list)) { 1087 if (!(list = skb_shinfo(skb)->frag_list)) {
@@ -1105,7 +1103,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
1105 skb = list; list = list->next; 1103 skb = list; list = list->next;
1106 1104
1107 skb->dev = (void *) hdev; 1105 skb->dev = (void *) hdev;
1108 skb->pkt_type = HCI_ACLDATA_PKT; 1106 bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
1109 hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT); 1107 hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT);
1110 1108
1111 BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); 1109 BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
@@ -1141,7 +1139,7 @@ int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
1141 memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE); 1139 memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE);
1142 1140
1143 skb->dev = (void *) hdev; 1141 skb->dev = (void *) hdev;
1144 skb->pkt_type = HCI_SCODATA_PKT; 1142 bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
1145 skb_queue_tail(&conn->data_q, skb); 1143 skb_queue_tail(&conn->data_q, skb);
1146 hci_sched_tx(hdev); 1144 hci_sched_tx(hdev);
1147 return 0; 1145 return 0;
@@ -1371,7 +1369,7 @@ void hci_rx_task(unsigned long arg)
1371 1369
1372 if (test_bit(HCI_INIT, &hdev->flags)) { 1370 if (test_bit(HCI_INIT, &hdev->flags)) {
1373 /* Don't process data packets in this state. */ 1371
1374 switch (skb->pkt_type) { 1372 switch (bt_cb(skb)->pkt_type) {
1375 case HCI_ACLDATA_PKT: 1373 case HCI_ACLDATA_PKT:
1376 case HCI_SCODATA_PKT: 1374 case HCI_SCODATA_PKT:
1377 kfree_skb(skb); 1375 kfree_skb(skb);
@@ -1380,7 +1378,7 @@ void hci_rx_task(unsigned long arg)
1380 } 1378 }
1381 1379
1382 /* Process frame */ 1380 /* Process frame */
1383 switch (skb->pkt_type) { 1381 switch (bt_cb(skb)->pkt_type) {
1384 case HCI_EVENT_PKT: 1382 case HCI_EVENT_PKT:
1385 hci_event_packet(hdev, skb); 1383 hci_event_packet(hdev, skb);
1386 break; 1384 break;
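[Editorial note] Every hunk in this file moves pkt_type off the shared sk_buff and into the Bluetooth-private control buffer behind the bt_cb() macro. A hedged sketch of that mechanism, with a stripped-down stand-in for sk_buff:

/* Per-skb protocol metadata lives in the skb's cb[] scratch area,
 * reached through a casting macro. Structures here are simplified. */
#include <stdio.h>

struct sk_buff_lite { char cb[48]; int len; };  /* stand-in for sk_buff */

struct bt_skb_cb { unsigned char pkt_type; unsigned char incoming; };
#define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb))

int main(void)
{
    struct sk_buff_lite skb = { .len = 3 };
    bt_cb(&skb)->pkt_type = 0x01;       /* e.g. HCI_COMMAND_PKT */
    printf("type %u len %d\n", bt_cb(&skb)->pkt_type, skb.len);
    return 0;
}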
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index c4b592b4ef10..d6da0939216d 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -484,14 +484,18 @@ static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff
484/* Inquiry Result */ 484/* Inquiry Result */
485static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) 485static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
486{ 486{
487 struct inquiry_data data;
487 struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1); 488 struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1);
488 int num_rsp = *((__u8 *) skb->data); 489 int num_rsp = *((__u8 *) skb->data);
489 490
490 BT_DBG("%s num_rsp %d", hdev->name, num_rsp); 491 BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
491 492
493 if (!num_rsp)
494 return;
495
492 hci_dev_lock(hdev); 496 hci_dev_lock(hdev);
497
493 for (; num_rsp; num_rsp--) { 498 for (; num_rsp; num_rsp--) {
494 struct inquiry_data data;
495 bacpy(&data.bdaddr, &info->bdaddr); 499 bacpy(&data.bdaddr, &info->bdaddr);
496 data.pscan_rep_mode = info->pscan_rep_mode; 500 data.pscan_rep_mode = info->pscan_rep_mode;
497 data.pscan_period_mode = info->pscan_period_mode; 501 data.pscan_period_mode = info->pscan_period_mode;
@@ -502,30 +506,55 @@ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *
502 info++; 506 info++;
503 hci_inquiry_cache_update(hdev, &data); 507 hci_inquiry_cache_update(hdev, &data);
504 } 508 }
509
505 hci_dev_unlock(hdev); 510 hci_dev_unlock(hdev);
506} 511}
507 512
508/* Inquiry Result With RSSI */ 513/* Inquiry Result With RSSI */
509static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb) 514static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb)
510{ 515{
511 struct inquiry_info_with_rssi *info = (struct inquiry_info_with_rssi *) (skb->data + 1); 516 struct inquiry_data data;
512 int num_rsp = *((__u8 *) skb->data); 517 int num_rsp = *((__u8 *) skb->data);
513 518
514 BT_DBG("%s num_rsp %d", hdev->name, num_rsp); 519 BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
515 520
521 if (!num_rsp)
522 return;
523
516 hci_dev_lock(hdev); 524 hci_dev_lock(hdev);
517 for (; num_rsp; num_rsp--) { 525
518 struct inquiry_data data; 526 if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) {
519 bacpy(&data.bdaddr, &info->bdaddr); 527 struct inquiry_info_with_rssi_and_pscan_mode *info =
520 data.pscan_rep_mode = info->pscan_rep_mode; 528 (struct inquiry_info_with_rssi_and_pscan_mode *) (skb->data + 1);
521 data.pscan_period_mode = info->pscan_period_mode; 529
522 data.pscan_mode = 0x00; 530 for (; num_rsp; num_rsp--) {
523 memcpy(data.dev_class, info->dev_class, 3); 531 bacpy(&data.bdaddr, &info->bdaddr);
524 data.clock_offset = info->clock_offset; 532 data.pscan_rep_mode = info->pscan_rep_mode;
525 data.rssi = info->rssi; 533 data.pscan_period_mode = info->pscan_period_mode;
526 info++; 534 data.pscan_mode = info->pscan_mode;
527 hci_inquiry_cache_update(hdev, &data); 535 memcpy(data.dev_class, info->dev_class, 3);
536 data.clock_offset = info->clock_offset;
537 data.rssi = info->rssi;
538 info++;
539 hci_inquiry_cache_update(hdev, &data);
540 }
541 } else {
542 struct inquiry_info_with_rssi *info =
543 (struct inquiry_info_with_rssi *) (skb->data + 1);
544
545 for (; num_rsp; num_rsp--) {
546 bacpy(&data.bdaddr, &info->bdaddr);
547 data.pscan_rep_mode = info->pscan_rep_mode;
548 data.pscan_period_mode = info->pscan_period_mode;
549 data.pscan_mode = 0x00;
550 memcpy(data.dev_class, info->dev_class, 3);
551 data.clock_offset = info->clock_offset;
552 data.rssi = info->rssi;
553 info++;
554 hci_inquiry_cache_update(hdev, &data);
555 }
528 } 556 }
557
529 hci_dev_unlock(hdev); 558 hci_dev_unlock(hdev);
530} 559}
531 560
@@ -865,6 +894,24 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk
865 hci_dev_unlock(hdev); 894 hci_dev_unlock(hdev);
866} 895}
867 896
897/* Page Scan Repetition Mode */
898static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
899{
900 struct hci_ev_pscan_rep_mode *ev = (struct hci_ev_pscan_rep_mode *) skb->data;
901 struct inquiry_entry *ie;
902
903 BT_DBG("%s", hdev->name);
904
905 hci_dev_lock(hdev);
906
907 if ((ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr))) {
908 ie->data.pscan_rep_mode = ev->pscan_rep_mode;
909 ie->timestamp = jiffies;
910 }
911
912 hci_dev_unlock(hdev);
913}
914
868void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) 915void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
869{ 916{
870 struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data; 917 struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data;
@@ -937,6 +984,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
937 hci_clock_offset_evt(hdev, skb); 984 hci_clock_offset_evt(hdev, skb);
938 break; 985 break;
939 986
987 case HCI_EV_PSCAN_REP_MODE:
988 hci_pscan_rep_mode_evt(hdev, skb);
989 break;
990
940 case HCI_EV_CMD_STATUS: 991 case HCI_EV_CMD_STATUS:
941 cs = (struct hci_ev_cmd_status *) skb->data; 992 cs = (struct hci_ev_cmd_status *) skb->data;
942 skb_pull(skb, sizeof(cs)); 993 skb_pull(skb, sizeof(cs));
@@ -1035,9 +1086,11 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
1035 ev->type = type; 1086 ev->type = type;
1036 memcpy(ev->data, data, dlen); 1087 memcpy(ev->data, data, dlen);
1037 1088
1038 skb->pkt_type = HCI_EVENT_PKT; 1089 bt_cb(skb)->incoming = 1;
1090 __net_timestamp(skb);
1091
1092 bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
1039 skb->dev = (void *) hdev; 1093 skb->dev = (void *) hdev;
1040 hci_send_to_sock(hdev, skb); 1094 hci_send_to_sock(hdev, skb);
1041 kfree_skb(skb); 1095 kfree_skb(skb);
1042} 1096}
1043EXPORT_SYMBOL(hci_si_event);
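[Editorial note] The interesting part of the rewritten RSSI handler is how it tells the two event layouts apart: the payload is one count byte followed by num_rsp fixed-size records, so the per-record size falls out of a division. A compilable sketch of that check, with illustrative packed stand-ins for the two inquiry_info structs:

/* Hedged sketch: struct layouts approximate the two HCI event record
 * variants only in size, which is all the check depends on. */
#include <stdio.h>
#include <stdint.h>

#pragma pack(push, 1)
struct info_rssi {                 /* stand-in for inquiry_info_with_rssi */
    uint8_t  bdaddr[6];
    uint8_t  pscan_rep_mode, pscan_period_mode;
    uint8_t  dev_class[3];
    uint16_t clock_offset;
    int8_t   rssi;
};
struct info_rssi_pscan {           /* ...with the extra pscan_mode byte */
    uint8_t  bdaddr[6];
    uint8_t  pscan_rep_mode, pscan_period_mode, pscan_mode;
    uint8_t  dev_class[3];
    uint16_t clock_offset;
    int8_t   rssi;
};
#pragma pack(pop)

int main(void)
{
    /* Fake event payload: one count byte plus two larger-variant records. */
    size_t skb_len = 1 + 2 * sizeof(struct info_rssi_pscan);
    int num_rsp = 2;

    if ((skb_len - 1) / num_rsp != sizeof(struct info_rssi))
        printf("parse pscan_mode variant (%zu bytes/record)\n",
               (skb_len - 1) / (size_t)num_rsp);
    else
        printf("parse plain RSSI variant\n");
    return 0;
}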
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index ebdcce5e7ca0..32ef7975a139 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -110,11 +110,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
110 /* Apply filter */ 110 /* Apply filter */
111 flt = &hci_pi(sk)->filter; 111 flt = &hci_pi(sk)->filter;
112 112
113 if (!test_bit((skb->pkt_type == HCI_VENDOR_PKT) ? 113 if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ?
114 0 : (skb->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) 114 0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
115 continue; 115 continue;
116 116
117 if (skb->pkt_type == HCI_EVENT_PKT) { 117 if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) {
118 register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); 118 register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
119 119
120 if (!hci_test_bit(evt, &flt->event_mask)) 120 if (!hci_test_bit(evt, &flt->event_mask))
@@ -131,7 +131,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
131 continue; 131 continue;
132 132
133 /* Put type byte before the data */ 133 /* Put type byte before the data */
134 memcpy(skb_push(nskb, 1), &nskb->pkt_type, 1); 134 memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1);
135 135
136 if (sock_queue_rcv_skb(sk, nskb)) 136 if (sock_queue_rcv_skb(sk, nskb))
137 kfree_skb(nskb); 137 kfree_skb(nskb);
@@ -327,11 +327,17 @@ static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_
327{ 327{
328 __u32 mask = hci_pi(sk)->cmsg_mask; 328 __u32 mask = hci_pi(sk)->cmsg_mask;
329 329
330 if (mask & HCI_CMSG_DIR) 330 if (mask & HCI_CMSG_DIR) {
331 put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(int), &bt_cb(skb)->incoming); 331 int incoming = bt_cb(skb)->incoming;
332 put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming);
333 }
334
335 if (mask & HCI_CMSG_TSTAMP) {
336 struct timeval tv;
332 337
333 if (mask & HCI_CMSG_TSTAMP) 338 skb_get_timestamp(skb, &tv);
334 put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp); 339 put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(tv), &tv);
340 }
335} 341}
336 342
337static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, 343static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
@@ -405,11 +411,11 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
405 goto drop; 411 goto drop;
406 } 412 }
407 413
408 skb->pkt_type = *((unsigned char *) skb->data); 414 bt_cb(skb)->pkt_type = *((unsigned char *) skb->data);
409 skb_pull(skb, 1); 415 skb_pull(skb, 1);
410 skb->dev = (void *) hdev; 416 skb->dev = (void *) hdev;
411 417
412 if (skb->pkt_type == HCI_COMMAND_PKT) { 418 if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
413 u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); 419 u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data));
414 u16 ogf = hci_opcode_ogf(opcode); 420 u16 ogf = hci_opcode_ogf(opcode);
415 u16 ocf = hci_opcode_ocf(opcode); 421 u16 ocf = hci_opcode_ocf(opcode);
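[Editorial note] Copying the timestamp into a local struct timeval before put_cmsg() gives the ancillary payload a stable userspace-visible layout instead of exposing skb internals. The userspace sketch below builds and decodes an equivalent cmsg by hand; the level/type constants are stand-ins, not the real SOL_HCI values.

/* Hedged analog of put_cmsg()/recvmsg ancillary-data handling. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>

int main(void)
{
    union {                               /* keep the buffer cmsghdr-aligned */
        char buf[CMSG_SPACE(sizeof(struct timeval))];
        struct cmsghdr align;
    } ctl;
    struct msghdr msg = { .msg_control = ctl.buf,
                          .msg_controllen = sizeof(ctl.buf) };
    struct timeval tv = { .tv_sec = 12, .tv_usec = 34 };

    /* "Kernel" side (put_cmsg analog): append one level/type/payload item. */
    struct cmsghdr *c = CMSG_FIRSTHDR(&msg);
    c->cmsg_level = SOL_SOCKET;           /* the real code uses SOL_HCI */
    c->cmsg_type  = 2;                    /* stand-in for HCI_CMSG_TSTAMP */
    c->cmsg_len   = CMSG_LEN(sizeof(tv));
    memcpy(CMSG_DATA(c), &tv, sizeof(tv));

    /* User side: walk the ancillary data back out of the msghdr. */
    for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
        struct timeval out;
        memcpy(&out, CMSG_DATA(c), sizeof(out));
        printf("stamp %ld.%06ld\n", (long)out.tv_sec, (long)out.tv_usec);
    }
    return 0;
}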
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 32fccfb5bfa5..d3d6bc547212 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -372,7 +372,7 @@ static struct proto l2cap_proto = {
372 .obj_size = sizeof(struct l2cap_pinfo) 372 .obj_size = sizeof(struct l2cap_pinfo)
373}; 373};
374 374
375static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, int prio) 375static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
376{ 376{
377 struct sock *sk; 377 struct sock *sk;
378 378
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index 9efb0a093612..ee6a66979913 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -34,31 +34,6 @@
34 34
35#include <net/bluetooth/bluetooth.h> 35#include <net/bluetooth/bluetooth.h>
36 36
37void bt_dump(char *pref, __u8 *buf, int count)
38{
39 char *ptr;
40 char line[100];
41 unsigned int i;
42
43 printk(KERN_INFO "%s: dump, len %d\n", pref, count);
44
45 ptr = line;
46 *ptr = 0;
47 for (i = 0; i < count; i++) {
48 ptr += sprintf(ptr, " %2.2X", buf[i]);
49
50 if (i && !((i + 1) % 20)) {
51 printk(KERN_INFO "%s:%s\n", pref, line);
52 ptr = line;
53 *ptr = 0;
54 }
55 }
56
57 if (line[0])
58 printk(KERN_INFO "%s:%s\n", pref, line);
59}
60EXPORT_SYMBOL(bt_dump);
61
62void baswap(bdaddr_t *dst, bdaddr_t *src) 37void baswap(bdaddr_t *dst, bdaddr_t *src)
63{ 38{
64 unsigned char *d = (unsigned char *) dst; 39 unsigned char *d = (unsigned char *) dst;
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index e9e6fda66f1a..173f46e8cdae 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -21,10 +21,6 @@
21 SOFTWARE IS DISCLAIMED. 21 SOFTWARE IS DISCLAIMED.
22*/ 22*/
23 23
24/*
25 RPN support - Dirk Husemann <hud@zurich.ibm.com>
26*/
27
28/* 24/*
29 * Bluetooth RFCOMM core. 25 * Bluetooth RFCOMM core.
30 * 26 *
@@ -115,10 +111,10 @@ static void rfcomm_session_del(struct rfcomm_session *s);
115#define __get_mcc_len(b) ((b & 0xfe) >> 1) 111#define __get_mcc_len(b) ((b & 0xfe) >> 1)
116 112
117/* RPN macros */ 113/* RPN macros */
118#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x3) << 3)) 114#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3))
119#define __get_rpn_data_bits(line) ((line) & 0x3) 115#define __get_rpn_data_bits(line) ((line) & 0x3)
120#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) 116#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
121#define __get_rpn_parity(line) (((line) >> 3) & 0x3) 117#define __get_rpn_parity(line) (((line) >> 3) & 0x7)
122 118
123static inline void rfcomm_schedule(uint event) 119static inline void rfcomm_schedule(uint event)
124{ 120{
@@ -233,7 +229,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d)
233 d->rx_credits = RFCOMM_DEFAULT_CREDITS; 229 d->rx_credits = RFCOMM_DEFAULT_CREDITS;
234} 230}
235 231
236struct rfcomm_dlc *rfcomm_dlc_alloc(int prio) 232struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio)
237{ 233{
238 struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio); 234 struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio);
239 if (!d) 235 if (!d)
@@ -389,8 +385,6 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
389 rfcomm_dlc_unlock(d); 385 rfcomm_dlc_unlock(d);
390 386
391 skb_queue_purge(&d->tx_queue); 387 skb_queue_purge(&d->tx_queue);
392 rfcomm_session_put(s);
393
394 rfcomm_dlc_unlink(d); 388 rfcomm_dlc_unlink(d);
395 } 389 }
396 390
@@ -600,8 +594,6 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
600 goto failed; 594 goto failed;
601 } 595 }
602 596
603 rfcomm_session_hold(s);
604
605 s->initiator = 1; 597 s->initiator = 1;
606 598
607 bacpy(&addr.l2_bdaddr, dst); 599 bacpy(&addr.l2_bdaddr, dst);
@@ -784,10 +776,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d
784 return rfcomm_send_frame(s, buf, ptr - buf); 776 return rfcomm_send_frame(s, buf, ptr - buf);
785} 777}
786 778
787static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, 779int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
788 u8 bit_rate, u8 data_bits, u8 stop_bits, 780 u8 bit_rate, u8 data_bits, u8 stop_bits,
789 u8 parity, u8 flow_ctrl_settings, 781 u8 parity, u8 flow_ctrl_settings,
790 u8 xon_char, u8 xoff_char, u16 param_mask) 782 u8 xon_char, u8 xoff_char, u16 param_mask)
791{ 783{
792 struct rfcomm_hdr *hdr; 784 struct rfcomm_hdr *hdr;
793 struct rfcomm_mcc *mcc; 785 struct rfcomm_mcc *mcc;
@@ -795,9 +787,9 @@ static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
795 u8 buf[16], *ptr = buf; 787 u8 buf[16], *ptr = buf;
796 788
797 BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x" 789 BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x"
798 "flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", 790 " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
799 s, cr, dlci, bit_rate, data_bits, stop_bits, parity, 791 s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
800 flow_ctrl_settings, xon_char, xoff_char, param_mask); 792 flow_ctrl_settings, xon_char, xoff_char, param_mask);
801 793
802 hdr = (void *) ptr; ptr += sizeof(*hdr); 794 hdr = (void *) ptr; ptr += sizeof(*hdr);
803 hdr->addr = __addr(s->initiator, 0); 795 hdr->addr = __addr(s->initiator, 0);
@@ -1269,16 +1261,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1269 u8 xon_char = 0; 1261 u8 xon_char = 0;
1270 u8 xoff_char = 0; 1262 u8 xoff_char = 0;
1271 u16 rpn_mask = RFCOMM_RPN_PM_ALL; 1263 u16 rpn_mask = RFCOMM_RPN_PM_ALL;
1272 1264
1273 BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", 1265 BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
1274 dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, 1266 dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
1275 rpn->xon_char, rpn->xoff_char, rpn->param_mask); 1267 rpn->xon_char, rpn->xoff_char, rpn->param_mask);
1276 1268
1277 if (!cr) 1269 if (!cr)
1278 return 0; 1270 return 0;
1279 1271
1280 if (len == 1) { 1272 if (len == 1) {
1281 /* request: return default setting */ 1273 /* This is a request, return default settings */
1282 bit_rate = RFCOMM_RPN_BR_115200; 1274 bit_rate = RFCOMM_RPN_BR_115200;
1283 data_bits = RFCOMM_RPN_DATA_8; 1275 data_bits = RFCOMM_RPN_DATA_8;
1284 stop_bits = RFCOMM_RPN_STOP_1; 1276 stop_bits = RFCOMM_RPN_STOP_1;
@@ -1286,11 +1278,12 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1286 flow_ctrl = RFCOMM_RPN_FLOW_NONE; 1278 flow_ctrl = RFCOMM_RPN_FLOW_NONE;
1287 xon_char = RFCOMM_RPN_XON_CHAR; 1279 xon_char = RFCOMM_RPN_XON_CHAR;
1288 xoff_char = RFCOMM_RPN_XOFF_CHAR; 1280 xoff_char = RFCOMM_RPN_XOFF_CHAR;
1289
1290 goto rpn_out; 1281 goto rpn_out;
1291 } 1282 }
1292 /* check for sane values: ignore/accept bit_rate, 8 bits, 1 stop bit, no parity, 1283
1293 no flow control lines, normal XON/XOFF chars */ 1284 /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit,
1285 * no parity, no flow control lines, normal XON/XOFF chars */
1286
1294 if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) { 1287 if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) {
1295 bit_rate = rpn->bit_rate; 1288 bit_rate = rpn->bit_rate;
1296 if (bit_rate != RFCOMM_RPN_BR_115200) { 1289 if (bit_rate != RFCOMM_RPN_BR_115200) {
@@ -1299,6 +1292,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1299 rpn_mask ^= RFCOMM_RPN_PM_BITRATE; 1292 rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
1300 } 1293 }
1301 } 1294 }
1295
1302 if (rpn->param_mask & RFCOMM_RPN_PM_DATA) { 1296 if (rpn->param_mask & RFCOMM_RPN_PM_DATA) {
1303 data_bits = __get_rpn_data_bits(rpn->line_settings); 1297 data_bits = __get_rpn_data_bits(rpn->line_settings);
1304 if (data_bits != RFCOMM_RPN_DATA_8) { 1298 if (data_bits != RFCOMM_RPN_DATA_8) {
@@ -1307,6 +1301,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1307 rpn_mask ^= RFCOMM_RPN_PM_DATA; 1301 rpn_mask ^= RFCOMM_RPN_PM_DATA;
1308 } 1302 }
1309 } 1303 }
1304
1310 if (rpn->param_mask & RFCOMM_RPN_PM_STOP) { 1305 if (rpn->param_mask & RFCOMM_RPN_PM_STOP) {
1311 stop_bits = __get_rpn_stop_bits(rpn->line_settings); 1306 stop_bits = __get_rpn_stop_bits(rpn->line_settings);
1312 if (stop_bits != RFCOMM_RPN_STOP_1) { 1307 if (stop_bits != RFCOMM_RPN_STOP_1) {
@@ -1315,6 +1310,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1315 rpn_mask ^= RFCOMM_RPN_PM_STOP; 1310 rpn_mask ^= RFCOMM_RPN_PM_STOP;
1316 } 1311 }
1317 } 1312 }
1313
1318 if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) { 1314 if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) {
1319 parity = __get_rpn_parity(rpn->line_settings); 1315 parity = __get_rpn_parity(rpn->line_settings);
1320 if (parity != RFCOMM_RPN_PARITY_NONE) { 1316 if (parity != RFCOMM_RPN_PARITY_NONE) {
@@ -1323,6 +1319,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1323 rpn_mask ^= RFCOMM_RPN_PM_PARITY; 1319 rpn_mask ^= RFCOMM_RPN_PM_PARITY;
1324 } 1320 }
1325 } 1321 }
1322
1326 if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) { 1323 if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) {
1327 flow_ctrl = rpn->flow_ctrl; 1324 flow_ctrl = rpn->flow_ctrl;
1328 if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) { 1325 if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) {
@@ -1331,6 +1328,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1331 rpn_mask ^= RFCOMM_RPN_PM_FLOW; 1328 rpn_mask ^= RFCOMM_RPN_PM_FLOW;
1332 } 1329 }
1333 } 1330 }
1331
1334 if (rpn->param_mask & RFCOMM_RPN_PM_XON) { 1332 if (rpn->param_mask & RFCOMM_RPN_PM_XON) {
1335 xon_char = rpn->xon_char; 1333 xon_char = rpn->xon_char;
1336 if (xon_char != RFCOMM_RPN_XON_CHAR) { 1334 if (xon_char != RFCOMM_RPN_XON_CHAR) {
@@ -1339,6 +1337,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1339 rpn_mask ^= RFCOMM_RPN_PM_XON; 1337 rpn_mask ^= RFCOMM_RPN_PM_XON;
1340 } 1338 }
1341 } 1339 }
1340
1342 if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) { 1341 if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) {
1343 xoff_char = rpn->xoff_char; 1342 xoff_char = rpn->xoff_char;
1344 if (xoff_char != RFCOMM_RPN_XOFF_CHAR) { 1343 if (xoff_char != RFCOMM_RPN_XOFF_CHAR) {
@@ -1349,9 +1348,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
1349 } 1348 }
1350 1349
1351rpn_out: 1350rpn_out:
1352 rfcomm_send_rpn(s, 0, dlci, 1351 rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits,
1353 bit_rate, data_bits, stop_bits, parity, flow_ctrl, 1352 parity, flow_ctrl, xon_char, xoff_char, rpn_mask);
1354 xon_char, xoff_char, rpn_mask);
1355 1353
1356 return 0; 1354 return 0;
1357} 1355}
@@ -1362,14 +1360,13 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb
1362 u8 dlci = __get_dlci(rls->dlci); 1360 u8 dlci = __get_dlci(rls->dlci);
1363 1361
1364 BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); 1362 BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status);
1365 1363
1366 if (!cr) 1364 if (!cr)
1367 return 0; 1365 return 0;
1368 1366
1369 /* FIXME: We should probably do something with this 1367 /* We should probably do something with this information here. But
1370 information here. But for now it's sufficient just 1368 * for now it's sufficient just to reply -- Bluetooth 1.1 says it's
1371 to reply -- Bluetooth 1.1 says it's mandatory to 1369 * mandatory to recognise and respond to RLS */
1372 recognise and respond to RLS */
1373 1370
1374 rfcomm_send_rls(s, 0, dlci, rls->status); 1371 rfcomm_send_rls(s, 0, dlci, rls->status);
1375 1372
@@ -1385,7 +1382,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
1385 BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); 1382 BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig);
1386 1383
1387 d = rfcomm_dlc_get(s, dlci); 1384 d = rfcomm_dlc_get(s, dlci);
1388 if (!d) 1385 if (!d)
1389 return 0; 1386 return 0;
1390 1387
1391 if (cr) { 1388 if (cr) {
@@ -1393,7 +1390,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
1393 set_bit(RFCOMM_TX_THROTTLED, &d->flags); 1390 set_bit(RFCOMM_TX_THROTTLED, &d->flags);
1394 else 1391 else
1395 clear_bit(RFCOMM_TX_THROTTLED, &d->flags); 1392 clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
1396 1393
1397 rfcomm_dlc_lock(d); 1394 rfcomm_dlc_lock(d);
1398 if (d->modem_status) 1395 if (d->modem_status)
1399 d->modem_status(d, msc->v24_sig); 1396 d->modem_status(d, msc->v24_sig);
@@ -1402,7 +1399,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
1402 rfcomm_send_msc(s, 0, dlci, msc->v24_sig); 1399 rfcomm_send_msc(s, 0, dlci, msc->v24_sig);
1403 1400
1404 d->mscex |= RFCOMM_MSCEX_RX; 1401 d->mscex |= RFCOMM_MSCEX_RX;
1405 } else 1402 } else
1406 d->mscex |= RFCOMM_MSCEX_TX; 1403 d->mscex |= RFCOMM_MSCEX_TX;
1407 1404
1408 return 0; 1405 return 0;
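[Editorial note] The macro fix at the top of this file widens the RPN parity mask from two bits to three: parity codes above 3 (the mark/space parity types) set the third bit, which the old 0x3 mask silently truncated. A small demonstration of the corrected packing; macro names and the example code value are illustrative:

/* Hedged sketch of the fixed RPN line-settings layout: data bits in
 * bits 0-1, stop bits in bit 2, parity in bits 3-5 (values 0..7). */
#include <stdio.h>
#include <stdint.h>

#define RPN_LINE(data, stop, parity) \
    (((data) & 0x3) | (((stop) & 0x1) << 2) | (((parity) & 0x7) << 3))
#define RPN_PARITY(line) (((line) >> 3) & 0x7)

int main(void)
{
    /* An illustrative parity code of 5 (0b101): the old 2-bit mask would
     * have read it back as 1, silently changing the setting. */
    uint8_t line = RPN_LINE(/*data*/ 3, /*stop*/ 0, /*parity*/ 5);
    printf("parity field: %u (expect 5)\n", RPN_PARITY(line));
    return 0;
}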
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 63a123c5c41b..90e19eb6d3cc 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -284,7 +284,7 @@ static struct proto rfcomm_proto = {
284 .obj_size = sizeof(struct rfcomm_pinfo) 284 .obj_size = sizeof(struct rfcomm_pinfo)
285}; 285};
286 286
287static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, int prio) 287static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
288{ 288{
289 struct rfcomm_dlc *d; 289 struct rfcomm_dlc *d;
290 struct sock *sk; 290 struct sock *sk;
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6304590fd36a..1bca860a6109 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -286,7 +286,7 @@ static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *de
286 skb->destructor = rfcomm_wfree; 286 skb->destructor = rfcomm_wfree;
287} 287}
288 288
289static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, int priority) 289static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, unsigned int __nocast priority)
290{ 290{
291 if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) { 291 if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) {
292 struct sk_buff *skb = alloc_skb(size, priority); 292 struct sk_buff *skb = alloc_skb(size, priority);
@@ -528,9 +528,14 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
528 struct rfcomm_dev *dev = dlc->owner; 528 struct rfcomm_dev *dev = dlc->owner;
529 if (!dev) 529 if (!dev)
530 return; 530 return;
531 531
532 BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); 532 BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig);
533 533
534 if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) {
535 if (dev->tty && !C_CLOCAL(dev->tty))
536 tty_hangup(dev->tty);
537 }
538
534 dev->modem_status = 539 dev->modem_status =
535 ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | 540 ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) |
536 ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | 541 ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) |
@@ -740,20 +745,143 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
740 return -ENOIOCTLCMD; 745 return -ENOIOCTLCMD;
741} 746}
742 747
743#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
744
745static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old) 748static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
746{ 749{
747 BT_DBG("tty %p", tty); 750 struct termios *new = (struct termios *) tty->termios;
751 int old_baud_rate = tty_termios_baud_rate(old);
752 int new_baud_rate = tty_termios_baud_rate(new);
748 753
749 if ((tty->termios->c_cflag == old->c_cflag) && 754 u8 baud, data_bits, stop_bits, parity, x_on, x_off;
750 (RELEVANT_IFLAG(tty->termios->c_iflag) == RELEVANT_IFLAG(old->c_iflag))) 755 u16 changes = 0;
751 return; 756
757 struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
758
759 BT_DBG("tty %p termios %p", tty, old);
760
761 /* Handle turning off CRTSCTS */
762 if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS))
763 BT_DBG("Turning off CRTSCTS unsupported");
764
765 /* Parity on/off and when on, odd/even */
766 if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) ||
767 ((old->c_cflag & PARODD) != (new->c_cflag & PARODD)) ) {
768 changes |= RFCOMM_RPN_PM_PARITY;
769 BT_DBG("Parity change detected.");
770 }
771
772 /* Mark and space parity are not supported! */
773 if (new->c_cflag & PARENB) {
774 if (new->c_cflag & PARODD) {
775 BT_DBG("Parity is ODD");
776 parity = RFCOMM_RPN_PARITY_ODD;
777 } else {
778 BT_DBG("Parity is EVEN");
779 parity = RFCOMM_RPN_PARITY_EVEN;
780 }
781 } else {
782 BT_DBG("Parity is OFF");
783 parity = RFCOMM_RPN_PARITY_NONE;
784 }
785
786 /* Setting the x_on / x_off characters */
787 if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) {
788 BT_DBG("XOFF custom");
789 x_on = new->c_cc[VSTOP];
790 changes |= RFCOMM_RPN_PM_XON;
791 } else {
792 BT_DBG("XOFF default");
793 x_on = RFCOMM_RPN_XON_CHAR;
794 }
795
796 if (old->c_cc[VSTART] != new->c_cc[VSTART]) {
797 BT_DBG("XON custom");
798 x_off = new->c_cc[VSTART];
799 changes |= RFCOMM_RPN_PM_XOFF;
800 } else {
801 BT_DBG("XON default");
802 x_off = RFCOMM_RPN_XOFF_CHAR;
803 }
804
805 /* Handle setting of stop bits */
806 if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB))
807 changes |= RFCOMM_RPN_PM_STOP;
808
809 /* POSIX does not support 1.5 stop bits and RFCOMM does not
810 * support 2 stop bits. So a request for 2 stop bits gets
811 * translated to 1.5 stop bits */
812 if (new->c_cflag & CSTOPB) {
813 stop_bits = RFCOMM_RPN_STOP_15;
814 } else {
815 stop_bits = RFCOMM_RPN_STOP_1;
816 }
817
818 /* Handle number of data bits [5-8] */
819 if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE))
820 changes |= RFCOMM_RPN_PM_DATA;
821
822 switch (new->c_cflag & CSIZE) {
823 case CS5:
824 data_bits = RFCOMM_RPN_DATA_5;
825 break;
826 case CS6:
827 data_bits = RFCOMM_RPN_DATA_6;
828 break;
829 case CS7:
830 data_bits = RFCOMM_RPN_DATA_7;
831 break;
832 case CS8:
833 data_bits = RFCOMM_RPN_DATA_8;
834 break;
835 default:
836 data_bits = RFCOMM_RPN_DATA_8;
837 break;
838 }
839
840 /* Handle baudrate settings */
841 if (old_baud_rate != new_baud_rate)
842 changes |= RFCOMM_RPN_PM_BITRATE;
752 843
753 /* handle turning off CRTSCTS */ 844 switch (new_baud_rate) {
754 if ((old->c_cflag & CRTSCTS) && !(tty->termios->c_cflag & CRTSCTS)) { 845 case 2400:
755 BT_DBG("turning off CRTSCTS"); 846 baud = RFCOMM_RPN_BR_2400;
847 break;
848 case 4800:
849 baud = RFCOMM_RPN_BR_4800;
850 break;
851 case 7200:
852 baud = RFCOMM_RPN_BR_7200;
853 break;
854 case 9600:
855 baud = RFCOMM_RPN_BR_9600;
856 break;
857 case 19200:
858 baud = RFCOMM_RPN_BR_19200;
859 break;
860 case 38400:
861 baud = RFCOMM_RPN_BR_38400;
862 break;
863 case 57600:
864 baud = RFCOMM_RPN_BR_57600;
865 break;
866 case 115200:
867 baud = RFCOMM_RPN_BR_115200;
868 break;
869 case 230400:
870 baud = RFCOMM_RPN_BR_230400;
871 break;
872 default:
 873 /* 9600 is standard according to the RFCOMM specification */
874 baud = RFCOMM_RPN_BR_9600;
875 break;
876
756 } 877 }
878
879 if (changes)
880 rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud,
881 data_bits, stop_bits, parity,
882 RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes);
883
884 return;
757} 885}
758 886
759static void rfcomm_tty_throttle(struct tty_struct *tty) 887static void rfcomm_tty_throttle(struct tty_struct *tty)
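[Editorial note] The rewritten set_termios handler above leans on tty_termios_baud_rate() plus a switch to turn POSIX speeds into RFCOMM RPN codes, defaulting to 9600 per the specification. A hedged userspace analog using cfgetospeed(); the RPN_* values are made-up stand-ins:

/* Sketch of the baud-rate mapping step only, under the assumptions
 * stated above. */
#include <stdio.h>
#include <termios.h>

enum { RPN_BR_9600 = 0x03, RPN_BR_115200 = 0x07 };  /* illustrative codes */

static int baud_to_rpn(speed_t s)
{
    switch (s) {
    case B115200: return RPN_BR_115200;
    case B9600:                       /* fall through to the default */
    default:      return RPN_BR_9600; /* 9600 is the RFCOMM standard rate */
    }
}

int main(void)
{
    struct termios t = { 0 };
    cfsetospeed(&t, B115200);         /* userspace analog of the tty layer */
    printf("rpn code 0x%02x\n", baud_to_rpn(cfgetospeed(&t)));
    return 0;
}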
@@ -761,7 +889,7 @@ static void rfcomm_tty_throttle(struct tty_struct *tty)
761 struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; 889 struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
762 890
763 BT_DBG("tty %p dev %p", tty, dev); 891 BT_DBG("tty %p dev %p", tty, dev);
764 892
765 rfcomm_dlc_throttle(dev->dlc); 893 rfcomm_dlc_throttle(dev->dlc);
766} 894}
767 895
@@ -770,7 +898,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty)
770 struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; 898 struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
771 899
772 BT_DBG("tty %p dev %p", tty, dev); 900 BT_DBG("tty %p dev %p", tty, dev);
773 901
774 rfcomm_dlc_unthrottle(dev->dlc); 902 rfcomm_dlc_unthrottle(dev->dlc);
775} 903}
776 904
@@ -841,35 +969,35 @@ static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp)
841 969
842static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear) 970static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear)
843{ 971{
844 struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; 972 struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
845 struct rfcomm_dlc *dlc = dev->dlc; 973 struct rfcomm_dlc *dlc = dev->dlc;
846 u8 v24_sig; 974 u8 v24_sig;
847 975
848 BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); 976 BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear);
849 977
850 rfcomm_dlc_get_modem_status(dlc, &v24_sig); 978 rfcomm_dlc_get_modem_status(dlc, &v24_sig);
851 979
852 if (set & TIOCM_DSR || set & TIOCM_DTR) 980 if (set & TIOCM_DSR || set & TIOCM_DTR)
853 v24_sig |= RFCOMM_V24_RTC; 981 v24_sig |= RFCOMM_V24_RTC;
854 if (set & TIOCM_RTS || set & TIOCM_CTS) 982 if (set & TIOCM_RTS || set & TIOCM_CTS)
855 v24_sig |= RFCOMM_V24_RTR; 983 v24_sig |= RFCOMM_V24_RTR;
856 if (set & TIOCM_RI) 984 if (set & TIOCM_RI)
857 v24_sig |= RFCOMM_V24_IC; 985 v24_sig |= RFCOMM_V24_IC;
858 if (set & TIOCM_CD) 986 if (set & TIOCM_CD)
859 v24_sig |= RFCOMM_V24_DV; 987 v24_sig |= RFCOMM_V24_DV;
860 988
861 if (clear & TIOCM_DSR || clear & TIOCM_DTR) 989 if (clear & TIOCM_DSR || clear & TIOCM_DTR)
862 v24_sig &= ~RFCOMM_V24_RTC; 990 v24_sig &= ~RFCOMM_V24_RTC;
863 if (clear & TIOCM_RTS || clear & TIOCM_CTS) 991 if (clear & TIOCM_RTS || clear & TIOCM_CTS)
864 v24_sig &= ~RFCOMM_V24_RTR; 992 v24_sig &= ~RFCOMM_V24_RTR;
865 if (clear & TIOCM_RI) 993 if (clear & TIOCM_RI)
866 v24_sig &= ~RFCOMM_V24_IC; 994 v24_sig &= ~RFCOMM_V24_IC;
867 if (clear & TIOCM_CD) 995 if (clear & TIOCM_CD)
868 v24_sig &= ~RFCOMM_V24_DV; 996 v24_sig &= ~RFCOMM_V24_DV;
869 997
870 rfcomm_dlc_set_modem_status(dlc, v24_sig); 998 rfcomm_dlc_set_modem_status(dlc, v24_sig);
871 999
872 return 0; 1000 return 0;
873} 1001}
874 1002
875/* ---- TTY structure ---- */ 1003/* ---- TTY structure ---- */
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 746c11fc017e..ce7ab7dfa0b2 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -418,7 +418,7 @@ static struct proto sco_proto = {
418 .obj_size = sizeof(struct sco_pinfo) 418 .obj_size = sizeof(struct sco_pinfo)
419}; 419};
420 420
421static struct sock *sco_sock_alloc(struct socket *sock, int proto, int prio) 421static struct sock *sco_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
422{ 422{
423 struct sock *sk; 423 struct sock *sk;
424 424
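The __nocast annotation added above recurs throughout this series on allocation masks. As a sketch of what it buys, here is the declaration the skbuff.c hunk later in this diff adds; gfp_mask carries __nocast so sparse can flag suspicious implicit conversions of the allocation-mask argument:

	/* Declaration per the skbuff.c hunk below; __nocast lets sparse
	 * warn when a caller silently converts a value into gfp_mask
	 * instead of passing a proper GFP_* constant. */
	struct sk_buff *__alloc_skb(unsigned int size,
				    unsigned int __nocast gfp_mask,
				    int fclone);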
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
new file mode 100644
index 000000000000..db23d59746cf
--- /dev/null
+++ b/net/bridge/Kconfig
@@ -0,0 +1,31 @@
1#
2# 802.1d Ethernet Bridging
3#
4
5config BRIDGE
6 tristate "802.1d Ethernet Bridging"
7 ---help---
8 If you say Y here, then your Linux box will be able to act as an
9 Ethernet bridge, which means that the different Ethernet segments it
10 is connected to will appear as one Ethernet to the participants.
11 Several such bridges can work together to create even larger
12 networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
13 As this is a standard, Linux bridges will cooperate properly with
14 other third party bridge products.
15
16 In order to use the Ethernet bridge, you'll need the bridge
17 configuration tools; see <file:Documentation/networking/bridge.txt>
18 for location. Please read the Bridge mini-HOWTO for more
19 information.
20
21 If you enable iptables support along with the bridge support then you
22 turn your bridge into a bridging IP firewall.
23 iptables will then see the IP packets being bridged, so you need to
24 take this into account when setting up your firewall rules.
25 Enabling arptables support when bridging will let arptables see
26 bridged ARP traffic in the arptables FORWARD chain.
27
28 To compile this code as a module, choose M here: the module
29 will be called bridge.
30
31 If unsure, say N.
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e6c2200b7ca3..24396b914d11 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -23,7 +23,7 @@
23#include <asm/atomic.h> 23#include <asm/atomic.h>
24#include "br_private.h" 24#include "br_private.h"
25 25
26static kmem_cache_t *br_fdb_cache; 26static kmem_cache_t *br_fdb_cache __read_mostly;
27static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, 27static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
28 const unsigned char *addr); 28 const unsigned char *addr);
29 29
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 68ccef507b49..c70b3be23026 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -138,7 +138,7 @@ config BRIDGE_EBT_VLAN
138# 138#
139config BRIDGE_EBT_ARPREPLY 139config BRIDGE_EBT_ARPREPLY
140 tristate "ebt: arp reply target support" 140 tristate "ebt: arp reply target support"
141 depends on BRIDGE_NF_EBTABLES 141 depends on BRIDGE_NF_EBTABLES && INET
142 help 142 help
143 This option adds the arp reply target, which allows 143 This option adds the arp reply target, which allows
144 automatically sending arp replies to arp requests. 144 automatically sending arp replies to arp requests.
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 02c632b4d325..c93d35ab95c0 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -23,10 +23,9 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr,
23{ 23{
24 struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; 24 struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data;
25 25
26 if ((*pskb)->nfmark != info->mark) { 26 if ((*pskb)->nfmark != info->mark)
27 (*pskb)->nfmark = info->mark; 27 (*pskb)->nfmark = info->mark;
28 (*pskb)->nfcache |= NFC_ALTERED; 28
29 }
30 return info->target; 29 return info->target;
31} 30}
32 31
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 01af4fcef26d..aae26ae2e61f 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -78,8 +78,8 @@ static void ulog_send(unsigned int nlgroup)
78 if (ub->qlen > 1) 78 if (ub->qlen > 1)
79 ub->lastnlh->nlmsg_type = NLMSG_DONE; 79 ub->lastnlh->nlmsg_type = NLMSG_DONE;
80 80
81 NETLINK_CB(ub->skb).dst_groups = 1 << nlgroup; 81 NETLINK_CB(ub->skb).dst_group = nlgroup + 1;
82 netlink_broadcast(ebtulognl, ub->skb, 0, 1 << nlgroup, GFP_ATOMIC); 82 netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
83 83
84 ub->qlen = 0; 84 ub->qlen = 0;
85 ub->skb = NULL; 85 ub->skb = NULL;
@@ -162,7 +162,7 @@ static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
162 pm->version = EBT_ULOG_VERSION; 162 pm->version = EBT_ULOG_VERSION;
163 do_gettimeofday(&pm->stamp); 163 do_gettimeofday(&pm->stamp);
164 if (ub->qlen == 1) 164 if (ub->qlen == 1)
165 ub->skb->stamp = pm->stamp; 165 skb_set_timestamp(ub->skb, &pm->stamp);
166 pm->data_len = copy_len; 166 pm->data_len = copy_len;
167 pm->mark = skb->nfmark; 167 pm->mark = skb->nfmark;
168 pm->hook = hooknr; 168 pm->hook = hooknr;
@@ -258,7 +258,8 @@ static int __init init(void)
258 spin_lock_init(&ulog_buffers[i].lock); 258 spin_lock_init(&ulog_buffers[i].lock);
259 } 259 }
260 260
261 ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL); 261 ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS,
262 NULL, THIS_MODULE);
262 if (!ebtulognl) 263 if (!ebtulognl)
263 ret = -ENOMEM; 264 ret = -ENOMEM;
264 else if ((ret = ebt_register_watcher(&ulog))) 265 else if ((ret = ebt_register_watcher(&ulog)))
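The calls above reflect a signature change to netlink_kernel_create(): callers now pass the number of multicast groups the socket serves and the owning module (the rtnetlink.c hunk below makes the same adjustment). A hypothetical minimal caller under the new signature, with the group count following the ebt_ulog usage above:

	/* Hypothetical module init under the widened API. */
	static struct sock *nlsk;

	static int __init example_init(void)
	{
		nlsk = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS,
					     NULL /* no input callback */,
					     THIS_MODULE);
		return nlsk ? 0 : -ENOMEM;
	}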
diff --git a/net/compat.c b/net/compat.c
index be5d936dc423..d99ab9695893 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -91,20 +91,11 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
91 } else 91 } else
92 kern_msg->msg_name = NULL; 92 kern_msg->msg_name = NULL;
93 93
94 if(kern_msg->msg_iovlen > UIO_FASTIOV) {
95 kern_iov = kmalloc(kern_msg->msg_iovlen * sizeof(struct iovec),
96 GFP_KERNEL);
97 if(!kern_iov)
98 return -ENOMEM;
99 }
100
101 tot_len = iov_from_user_compat_to_kern(kern_iov, 94 tot_len = iov_from_user_compat_to_kern(kern_iov,
102 (struct compat_iovec __user *)kern_msg->msg_iov, 95 (struct compat_iovec __user *)kern_msg->msg_iov,
103 kern_msg->msg_iovlen); 96 kern_msg->msg_iovlen);
104 if(tot_len >= 0) 97 if(tot_len >= 0)
105 kern_msg->msg_iov = kern_iov; 98 kern_msg->msg_iov = kern_iov;
106 else if(kern_msg->msg_iovlen > UIO_FASTIOV)
107 kfree(kern_iov);
108 99
109 return tot_len; 100 return tot_len;
110} 101}
diff --git a/net/core/Makefile b/net/core/Makefile
index 5e0c56b7f607..630da0f0579e 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -7,11 +7,11 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
7 7
8obj-$(CONFIG_SYSCTL) += sysctl_net_core.o 8obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
9 9
10obj-y += flow.o dev.o ethtool.o dev_mcast.o dst.o \ 10obj-y += dev.o ethtool.o dev_mcast.o dst.o \
11 neighbour.o rtnetlink.o utils.o link_watch.o filter.o 11 neighbour.o rtnetlink.o utils.o link_watch.o filter.o
12 12
13obj-$(CONFIG_XFRM) += flow.o
13obj-$(CONFIG_SYSFS) += net-sysfs.o 14obj-$(CONFIG_SYSFS) += net-sysfs.o
14obj-$(CONFIG_NETFILTER) += netfilter.o
15obj-$(CONFIG_NET_DIVERT) += dv.o 15obj-$(CONFIG_NET_DIVERT) += dv.o
16obj-$(CONFIG_NET_PKTGEN) += pktgen.o 16obj-$(CONFIG_NET_PKTGEN) += pktgen.o
17obj-$(CONFIG_NET_RADIO) += wireless.o 17obj-$(CONFIG_NET_RADIO) += wireless.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fcee054b6f75..da9bf71421a7 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -43,7 +43,6 @@
43#include <linux/errno.h> 43#include <linux/errno.h>
44#include <linux/sched.h> 44#include <linux/sched.h>
45#include <linux/inet.h> 45#include <linux/inet.h>
46#include <linux/tcp.h>
47#include <linux/netdevice.h> 46#include <linux/netdevice.h>
48#include <linux/rtnetlink.h> 47#include <linux/rtnetlink.h>
49#include <linux/poll.h> 48#include <linux/poll.h>
@@ -51,9 +50,10 @@
51 50
52#include <net/protocol.h> 51#include <net/protocol.h>
53#include <linux/skbuff.h> 52#include <linux/skbuff.h>
54#include <net/sock.h>
55#include <net/checksum.h>
56 53
54#include <net/checksum.h>
55#include <net/sock.h>
56#include <net/tcp_states.h>
57 57
58/* 58/*
59 * Is a socket 'connection oriented' ? 59 * Is a socket 'connection oriented' ?
diff --git a/net/core/dev.c b/net/core/dev.c
index ff9dc029233a..c01511e3d0c1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -267,10 +267,6 @@ void dev_add_pack(struct packet_type *pt)
267 spin_unlock_bh(&ptype_lock); 267 spin_unlock_bh(&ptype_lock);
268} 268}
269 269
270extern void linkwatch_run_queue(void);
271
272
273
274/** 270/**
275 * __dev_remove_pack - remove packet handler 271 * __dev_remove_pack - remove packet handler
276 * @pt: packet type declaration 272 * @pt: packet type declaration
@@ -901,8 +897,7 @@ int dev_close(struct net_device *dev)
901 smp_mb__after_clear_bit(); /* Commit netif_running(). */ 897 smp_mb__after_clear_bit(); /* Commit netif_running(). */
902 while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { 898 while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
903 /* No hurry. */ 899 /* No hurry. */
904 current->state = TASK_INTERRUPTIBLE; 900 msleep(1);
905 schedule_timeout(1);
906 } 901 }
907 902
908 /* 903 /*
@@ -1010,13 +1005,22 @@ void net_disable_timestamp(void)
1010 atomic_dec(&netstamp_needed); 1005 atomic_dec(&netstamp_needed);
1011} 1006}
1012 1007
1013static inline void net_timestamp(struct timeval *stamp) 1008void __net_timestamp(struct sk_buff *skb)
1009{
1010 struct timeval tv;
1011
1012 do_gettimeofday(&tv);
1013 skb_set_timestamp(skb, &tv);
1014}
1015EXPORT_SYMBOL(__net_timestamp);
1016
1017static inline void net_timestamp(struct sk_buff *skb)
1014{ 1018{
1015 if (atomic_read(&netstamp_needed)) 1019 if (atomic_read(&netstamp_needed))
1016 do_gettimeofday(stamp); 1020 __net_timestamp(skb);
1017 else { 1021 else {
1018 stamp->tv_sec = 0; 1022 skb->tstamp.off_sec = 0;
1019 stamp->tv_usec = 0; 1023 skb->tstamp.off_usec = 0;
1020 } 1024 }
1021} 1025}
1022 1026
@@ -1028,7 +1032,8 @@ static inline void net_timestamp(struct timeval *stamp)
1028void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1032void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1029{ 1033{
1030 struct packet_type *ptype; 1034 struct packet_type *ptype;
1031 net_timestamp(&skb->stamp); 1035
1036 net_timestamp(skb);
1032 1037
1033 rcu_read_lock(); 1038 rcu_read_lock();
1034 list_for_each_entry_rcu(ptype, &ptype_all, list) { 1039 list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1059,7 +1064,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1059 1064
1060 skb2->h.raw = skb2->nh.raw; 1065 skb2->h.raw = skb2->nh.raw;
1061 skb2->pkt_type = PACKET_OUTGOING; 1066 skb2->pkt_type = PACKET_OUTGOING;
1062 ptype->func(skb2, skb->dev, ptype); 1067 ptype->func(skb2, skb->dev, ptype, skb->dev);
1063 } 1068 }
1064 } 1069 }
1065 rcu_read_unlock(); 1070 rcu_read_unlock();
@@ -1124,8 +1129,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1124#define illegal_highdma(dev, skb) (0) 1129#define illegal_highdma(dev, skb) (0)
1125#endif 1130#endif
1126 1131
1127extern void skb_release_data(struct sk_buff *);
1128
1129/* Keep head the same: replace data */ 1132/* Keep head the same: replace data */
1130int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask) 1133int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
1131{ 1134{
@@ -1380,8 +1383,8 @@ int netif_rx(struct sk_buff *skb)
1380 if (netpoll_rx(skb)) 1383 if (netpoll_rx(skb))
1381 return NET_RX_DROP; 1384 return NET_RX_DROP;
1382 1385
1383 if (!skb->stamp.tv_sec) 1386 if (!skb->tstamp.off_sec)
1384 net_timestamp(&skb->stamp); 1387 net_timestamp(skb);
1385 1388
1386 /* 1389 /*
1387 * The code is rearranged so that the path is the most 1390 * The code is rearranged so that the path is the most
@@ -1426,14 +1429,14 @@ int netif_rx_ni(struct sk_buff *skb)
1426 1429
1427EXPORT_SYMBOL(netif_rx_ni); 1430EXPORT_SYMBOL(netif_rx_ni);
1428 1431
1429static __inline__ void skb_bond(struct sk_buff *skb) 1432static inline struct net_device *skb_bond(struct sk_buff *skb)
1430{ 1433{
1431 struct net_device *dev = skb->dev; 1434 struct net_device *dev = skb->dev;
1432 1435
1433 if (dev->master) { 1436 if (dev->master)
1434 skb->real_dev = skb->dev;
1435 skb->dev = dev->master; 1437 skb->dev = dev->master;
1436 } 1438
1439 return dev;
1437} 1440}
1438 1441
1439static void net_tx_action(struct softirq_action *h) 1442static void net_tx_action(struct softirq_action *h)
@@ -1483,10 +1486,11 @@ static void net_tx_action(struct softirq_action *h)
1483} 1486}
1484 1487
1485static __inline__ int deliver_skb(struct sk_buff *skb, 1488static __inline__ int deliver_skb(struct sk_buff *skb,
1486 struct packet_type *pt_prev) 1489 struct packet_type *pt_prev,
1490 struct net_device *orig_dev)
1487{ 1491{
1488 atomic_inc(&skb->users); 1492 atomic_inc(&skb->users);
1489 return pt_prev->func(skb, skb->dev, pt_prev); 1493 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1490} 1494}
1491 1495
1492#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) 1496#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
@@ -1497,7 +1501,8 @@ struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
1497void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); 1501void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
1498 1502
1499static __inline__ int handle_bridge(struct sk_buff **pskb, 1503static __inline__ int handle_bridge(struct sk_buff **pskb,
1500 struct packet_type **pt_prev, int *ret) 1504 struct packet_type **pt_prev, int *ret,
1505 struct net_device *orig_dev)
1501{ 1506{
1502 struct net_bridge_port *port; 1507 struct net_bridge_port *port;
1503 1508
@@ -1506,14 +1511,14 @@ static __inline__ int handle_bridge(struct sk_buff **pskb,
1506 return 0; 1511 return 0;
1507 1512
1508 if (*pt_prev) { 1513 if (*pt_prev) {
1509 *ret = deliver_skb(*pskb, *pt_prev); 1514 *ret = deliver_skb(*pskb, *pt_prev, orig_dev);
1510 *pt_prev = NULL; 1515 *pt_prev = NULL;
1511 } 1516 }
1512 1517
1513 return br_handle_frame_hook(port, pskb); 1518 return br_handle_frame_hook(port, pskb);
1514} 1519}
1515#else 1520#else
1516#define handle_bridge(skb, pt_prev, ret) (0) 1521#define handle_bridge(skb, pt_prev, ret, orig_dev) (0)
1517#endif 1522#endif
1518 1523
1519#ifdef CONFIG_NET_CLS_ACT 1524#ifdef CONFIG_NET_CLS_ACT
@@ -1535,17 +1540,14 @@ static int ing_filter(struct sk_buff *skb)
1535 __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); 1540 __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
1536 if (MAX_RED_LOOP < ttl++) { 1541 if (MAX_RED_LOOP < ttl++) {
1537 printk("Redir loop detected Dropping packet (%s->%s)\n", 1542 printk("Redir loop detected Dropping packet (%s->%s)\n",
1538 skb->input_dev?skb->input_dev->name:"??",skb->dev->name); 1543 skb->input_dev->name, skb->dev->name);
1539 return TC_ACT_SHOT; 1544 return TC_ACT_SHOT;
1540 } 1545 }
1541 1546
1542 skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); 1547 skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
1543 1548
1544 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); 1549 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
1545 if (NULL == skb->input_dev) { 1550
1546 skb->input_dev = skb->dev;
1547 printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name);
1548 }
1549 spin_lock(&dev->ingress_lock); 1551 spin_lock(&dev->ingress_lock);
1550 if ((q = dev->qdisc_ingress) != NULL) 1552 if ((q = dev->qdisc_ingress) != NULL)
1551 result = q->enqueue(skb, q); 1553 result = q->enqueue(skb, q);
@@ -1560,6 +1562,7 @@ static int ing_filter(struct sk_buff *skb)
1560int netif_receive_skb(struct sk_buff *skb) 1562int netif_receive_skb(struct sk_buff *skb)
1561{ 1563{
1562 struct packet_type *ptype, *pt_prev; 1564 struct packet_type *ptype, *pt_prev;
1565 struct net_device *orig_dev;
1563 int ret = NET_RX_DROP; 1566 int ret = NET_RX_DROP;
1564 unsigned short type; 1567 unsigned short type;
1565 1568
@@ -1567,10 +1570,13 @@ int netif_receive_skb(struct sk_buff *skb)
1567 if (skb->dev->poll && netpoll_rx(skb)) 1570 if (skb->dev->poll && netpoll_rx(skb))
1568 return NET_RX_DROP; 1571 return NET_RX_DROP;
1569 1572
1570 if (!skb->stamp.tv_sec) 1573 if (!skb->tstamp.off_sec)
1571 net_timestamp(&skb->stamp); 1574 net_timestamp(skb);
1575
1576 if (!skb->input_dev)
1577 skb->input_dev = skb->dev;
1572 1578
1573 skb_bond(skb); 1579 orig_dev = skb_bond(skb);
1574 1580
1575 __get_cpu_var(netdev_rx_stat).total++; 1581 __get_cpu_var(netdev_rx_stat).total++;
1576 1582
@@ -1591,14 +1597,14 @@ int netif_receive_skb(struct sk_buff *skb)
1591 list_for_each_entry_rcu(ptype, &ptype_all, list) { 1597 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1592 if (!ptype->dev || ptype->dev == skb->dev) { 1598 if (!ptype->dev || ptype->dev == skb->dev) {
1593 if (pt_prev) 1599 if (pt_prev)
1594 ret = deliver_skb(skb, pt_prev); 1600 ret = deliver_skb(skb, pt_prev, orig_dev);
1595 pt_prev = ptype; 1601 pt_prev = ptype;
1596 } 1602 }
1597 } 1603 }
1598 1604
1599#ifdef CONFIG_NET_CLS_ACT 1605#ifdef CONFIG_NET_CLS_ACT
1600 if (pt_prev) { 1606 if (pt_prev) {
1601 ret = deliver_skb(skb, pt_prev); 1607 ret = deliver_skb(skb, pt_prev, orig_dev);
1602 			pt_prev = NULL; /* no one else should process this after */ 1608
1603 } else { 1609 } else {
1604 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); 1610 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
@@ -1617,7 +1623,7 @@ ncls:
1617 1623
1618 handle_diverter(skb); 1624 handle_diverter(skb);
1619 1625
1620 if (handle_bridge(&skb, &pt_prev, &ret)) 1626 if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
1621 goto out; 1627 goto out;
1622 1628
1623 type = skb->protocol; 1629 type = skb->protocol;
@@ -1625,13 +1631,13 @@ ncls:
1625 if (ptype->type == type && 1631 if (ptype->type == type &&
1626 (!ptype->dev || ptype->dev == skb->dev)) { 1632 (!ptype->dev || ptype->dev == skb->dev)) {
1627 if (pt_prev) 1633 if (pt_prev)
1628 ret = deliver_skb(skb, pt_prev); 1634 ret = deliver_skb(skb, pt_prev, orig_dev);
1629 pt_prev = ptype; 1635 pt_prev = ptype;
1630 } 1636 }
1631 } 1637 }
1632 1638
1633 if (pt_prev) { 1639 if (pt_prev) {
1634 ret = pt_prev->func(skb, skb->dev, pt_prev); 1640 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1635 } else { 1641 } else {
1636 kfree_skb(skb); 1642 kfree_skb(skb);
1637 		/* Jamal, now you will not be able to escape explaining 1643
@@ -1697,7 +1703,8 @@ static void net_rx_action(struct softirq_action *h)
1697 struct softnet_data *queue = &__get_cpu_var(softnet_data); 1703 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1698 unsigned long start_time = jiffies; 1704 unsigned long start_time = jiffies;
1699 int budget = netdev_budget; 1705 int budget = netdev_budget;
1700 1706 void *have;
1707
1701 local_irq_disable(); 1708 local_irq_disable();
1702 1709
1703 while (!list_empty(&queue->poll_list)) { 1710 while (!list_empty(&queue->poll_list)) {
@@ -1710,10 +1717,10 @@ static void net_rx_action(struct softirq_action *h)
1710 1717
1711 dev = list_entry(queue->poll_list.next, 1718 dev = list_entry(queue->poll_list.next,
1712 struct net_device, poll_list); 1719 struct net_device, poll_list);
1713 netpoll_poll_lock(dev); 1720 have = netpoll_poll_lock(dev);
1714 1721
1715 if (dev->quota <= 0 || dev->poll(dev, &budget)) { 1722 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1716 netpoll_poll_unlock(dev); 1723 netpoll_poll_unlock(have);
1717 local_irq_disable(); 1724 local_irq_disable();
1718 list_del(&dev->poll_list); 1725 list_del(&dev->poll_list);
1719 list_add_tail(&dev->poll_list, &queue->poll_list); 1726 list_add_tail(&dev->poll_list, &queue->poll_list);
@@ -1722,7 +1729,7 @@ static void net_rx_action(struct softirq_action *h)
1722 else 1729 else
1723 dev->quota = dev->weight; 1730 dev->quota = dev->weight;
1724 } else { 1731 } else {
1725 netpoll_poll_unlock(dev); 1732 netpoll_poll_unlock(have);
1726 dev_put(dev); 1733 dev_put(dev);
1727 local_irq_disable(); 1734 local_irq_disable();
1728 } 1735 }
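The deliver_skb()/ptype->func() changes in this file all thread a fourth argument through to protocol handlers: the device the frame actually arrived on, captured by skb_bond() before skb->dev is rewritten to the bonding master. The handler prototype this implies, as a sketch inferred from the call sites above rather than a verbatim header quote:

	int (*func)(struct sk_buff *skb,
		    struct net_device *dev,       /* may be the bond master */
		    struct packet_type *pt,
		    struct net_device *orig_dev); /* pre-bonding ingress device */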
diff --git a/net/core/dst.c b/net/core/dst.c
index fc434ade5270..334790da9f16 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -45,6 +45,7 @@ static struct timer_list dst_gc_timer =
45static void dst_run_gc(unsigned long dummy) 45static void dst_run_gc(unsigned long dummy)
46{ 46{
47 int delayed = 0; 47 int delayed = 0;
48 int work_performed;
48 struct dst_entry * dst, **dstp; 49 struct dst_entry * dst, **dstp;
49 50
50 if (!spin_trylock(&dst_lock)) { 51 if (!spin_trylock(&dst_lock)) {
@@ -52,9 +53,9 @@ static void dst_run_gc(unsigned long dummy)
52 return; 53 return;
53 } 54 }
54 55
55
56 del_timer(&dst_gc_timer); 56 del_timer(&dst_gc_timer);
57 dstp = &dst_garbage_list; 57 dstp = &dst_garbage_list;
58 work_performed = 0;
58 while ((dst = *dstp) != NULL) { 59 while ((dst = *dstp) != NULL) {
59 if (atomic_read(&dst->__refcnt)) { 60 if (atomic_read(&dst->__refcnt)) {
60 dstp = &dst->next; 61 dstp = &dst->next;
@@ -62,6 +63,7 @@ static void dst_run_gc(unsigned long dummy)
62 continue; 63 continue;
63 } 64 }
64 *dstp = dst->next; 65 *dstp = dst->next;
66 work_performed = 1;
65 67
66 dst = dst_destroy(dst); 68 dst = dst_destroy(dst);
67 if (dst) { 69 if (dst) {
@@ -86,9 +88,14 @@ static void dst_run_gc(unsigned long dummy)
86 dst_gc_timer_inc = DST_GC_MAX; 88 dst_gc_timer_inc = DST_GC_MAX;
87 goto out; 89 goto out;
88 } 90 }
89 if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) 91 if (!work_performed) {
90 dst_gc_timer_expires = DST_GC_MAX; 92 if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
91 dst_gc_timer_inc += DST_GC_INC; 93 dst_gc_timer_expires = DST_GC_MAX;
94 dst_gc_timer_inc += DST_GC_INC;
95 } else {
96 dst_gc_timer_inc = DST_GC_INC;
97 dst_gc_timer_expires = DST_GC_MIN;
98 }
92 dst_gc_timer.expires = jiffies + dst_gc_timer_expires; 99 dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
93#if RT_CACHE_DEBUG >= 2 100#if RT_CACHE_DEBUG >= 2
94 printk("dst_total: %d/%d %ld\n", 101 printk("dst_total: %d/%d %ld\n",
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index a3eeb88e1c81..289c1b5a8e4a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -81,6 +81,18 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data)
81 return 0; 81 return 0;
82} 82}
83 83
84int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data)
85{
86 unsigned char len = dev->addr_len;
 87	if (addr->size < len)
88 return -ETOOSMALL;
89
90 addr->size = len;
91 memcpy(data, dev->perm_addr, len);
92 return 0;
93}
94
95
84/* Handlers for each ethtool command */ 96/* Handlers for each ethtool command */
85 97
86static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) 98static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
@@ -683,6 +695,39 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
683 return ret; 695 return ret;
684} 696}
685 697
 698static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
699{
700 struct ethtool_perm_addr epaddr;
701 u8 *data;
702 int ret;
703
704 if (!dev->ethtool_ops->get_perm_addr)
705 return -EOPNOTSUPP;
706
 707	if (copy_from_user(&epaddr, useraddr, sizeof(epaddr)))
708 return -EFAULT;
709
710 data = kmalloc(epaddr.size, GFP_USER);
711 if (!data)
712 return -ENOMEM;
713
 714	ret = dev->ethtool_ops->get_perm_addr(dev, &epaddr, data);
 715	if (ret)
 716		goto out;
717
718 ret = -EFAULT;
719 if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
720 goto out;
721 useraddr += sizeof(epaddr);
722 if (copy_to_user(useraddr, data, epaddr.size))
723 goto out;
724 ret = 0;
725
726 out:
727 kfree(data);
728 return ret;
729}
730
686/* The main entry point in this file. Called from net/core/dev.c */ 731/* The main entry point in this file. Called from net/core/dev.c */
687 732
688int dev_ethtool(struct ifreq *ifr) 733int dev_ethtool(struct ifreq *ifr)
@@ -806,6 +851,9 @@ int dev_ethtool(struct ifreq *ifr)
806 case ETHTOOL_GSTATS: 851 case ETHTOOL_GSTATS:
807 rc = ethtool_get_stats(dev, useraddr); 852 rc = ethtool_get_stats(dev, useraddr);
808 break; 853 break;
854 case ETHTOOL_GPERMADDR:
855 rc = ethtool_get_perm_addr(dev, useraddr);
856 break;
809 default: 857 default:
810 rc = -EOPNOTSUPP; 858 rc = -EOPNOTSUPP;
811 } 859 }
@@ -826,6 +874,7 @@ int dev_ethtool(struct ifreq *ifr)
826 874
827EXPORT_SYMBOL(dev_ethtool); 875EXPORT_SYMBOL(dev_ethtool);
828EXPORT_SYMBOL(ethtool_op_get_link); 876EXPORT_SYMBOL(ethtool_op_get_link);
877EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr);
829EXPORT_SYMBOL(ethtool_op_get_sg); 878EXPORT_SYMBOL(ethtool_op_get_sg);
830EXPORT_SYMBOL(ethtool_op_get_tso); 879EXPORT_SYMBOL(ethtool_op_get_tso);
831EXPORT_SYMBOL(ethtool_op_get_tx_csum); 880EXPORT_SYMBOL(ethtool_op_get_tx_csum);
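A sketch of a userspace consumer of the new ETHTOOL_GPERMADDR command, assuming a kernel with this patch and a <linux/ethtool.h> that exports struct ethtool_perm_addr; the function name and buffer size below are illustrative:

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	static int print_perm_addr(int fd, const char *ifname)
	{
		struct {
			struct ethtool_perm_addr epa;
			unsigned char buf[32];	/* backing for epa.data[] */
		} req;
		struct ifreq ifr;
		unsigned int i;

		memset(&req, 0, sizeof(req));
		req.epa.cmd  = ETHTOOL_GPERMADDR;
		req.epa.size = sizeof(req.buf);

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		ifr.ifr_data = (char *)&req;

		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
			return -1;
		for (i = 0; i < req.epa.size; i++)	/* kernel clamps size to addr_len */
			printf("%02x%c", req.epa.data[i],
			       i + 1 == req.epa.size ? '\n' : ':');
		return 0;
	}

fd is any socket, e.g. socket(AF_INET, SOCK_DGRAM, 0). Per the handler above, the kernel rewrites epa.size down to dev->addr_len on success, or fails with -ETOOSMALL if the supplied buffer is smaller than the address.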
diff --git a/net/core/filter.c b/net/core/filter.c
index cd91a24f9720..079c2edff789 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -182,7 +182,7 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
182 A = ntohl(*(u32 *)ptr); 182 A = ntohl(*(u32 *)ptr);
183 continue; 183 continue;
184 } 184 }
185 return 0; 185 break;
186 case BPF_LD|BPF_H|BPF_ABS: 186 case BPF_LD|BPF_H|BPF_ABS:
187 k = fentry->k; 187 k = fentry->k;
188 load_h: 188 load_h:
@@ -191,7 +191,7 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
191 A = ntohs(*(u16 *)ptr); 191 A = ntohs(*(u16 *)ptr);
192 continue; 192 continue;
193 } 193 }
194 return 0; 194 break;
195 case BPF_LD|BPF_B|BPF_ABS: 195 case BPF_LD|BPF_B|BPF_ABS:
196 k = fentry->k; 196 k = fentry->k;
197load_b: 197load_b:
@@ -200,7 +200,7 @@ load_b:
200 A = *(u8 *)ptr; 200 A = *(u8 *)ptr;
201 continue; 201 continue;
202 } 202 }
203 return 0; 203 break;
204 case BPF_LD|BPF_W|BPF_LEN: 204 case BPF_LD|BPF_W|BPF_LEN:
205 A = skb->len; 205 A = skb->len;
206 continue; 206 continue;
diff --git a/net/core/flow.c b/net/core/flow.c
index f289570b15a3..7e95b39de9fd 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
42 42
43#define flow_table(cpu) (per_cpu(flow_tables, cpu)) 43#define flow_table(cpu) (per_cpu(flow_tables, cpu))
44 44
45static kmem_cache_t *flow_cachep; 45static kmem_cache_t *flow_cachep __read_mostly;
46 46
47static int flow_lwm, flow_hwm; 47static int flow_lwm, flow_hwm;
48 48
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1beb782ac41b..39fc55edf691 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1217,7 +1217,7 @@ static void neigh_proxy_process(unsigned long arg)
1217 1217
1218 while (skb != (struct sk_buff *)&tbl->proxy_queue) { 1218 while (skb != (struct sk_buff *)&tbl->proxy_queue) {
1219 struct sk_buff *back = skb; 1219 struct sk_buff *back = skb;
1220 long tdif = back->stamp.tv_usec - now; 1220 long tdif = NEIGH_CB(back)->sched_next - now;
1221 1221
1222 skb = skb->next; 1222 skb = skb->next;
1223 if (tdif <= 0) { 1223 if (tdif <= 0) {
@@ -1248,8 +1248,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
1248 kfree_skb(skb); 1248 kfree_skb(skb);
1249 return; 1249 return;
1250 } 1250 }
1251 skb->stamp.tv_sec = LOCALLY_ENQUEUED; 1251
1252 skb->stamp.tv_usec = sched_next; 1252 NEIGH_CB(skb)->sched_next = sched_next;
1253 NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
1253 1254
1254 spin_lock(&tbl->proxy_queue.lock); 1255 spin_lock(&tbl->proxy_queue.lock);
1255 if (del_timer(&tbl->proxy_timer)) { 1256 if (del_timer(&tbl->proxy_timer)) {
@@ -2342,8 +2343,8 @@ void neigh_app_ns(struct neighbour *n)
2342 } 2343 }
2343 nlh = (struct nlmsghdr *)skb->data; 2344 nlh = (struct nlmsghdr *)skb->data;
2344 nlh->nlmsg_flags = NLM_F_REQUEST; 2345 nlh->nlmsg_flags = NLM_F_REQUEST;
2345 NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; 2346 NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH;
2346 netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); 2347 netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
2347} 2348}
2348 2349
2349static void neigh_app_notify(struct neighbour *n) 2350static void neigh_app_notify(struct neighbour *n)
@@ -2360,8 +2361,8 @@ static void neigh_app_notify(struct neighbour *n)
2360 return; 2361 return;
2361 } 2362 }
2362 nlh = (struct nlmsghdr *)skb->data; 2363 nlh = (struct nlmsghdr *)skb->data;
2363 NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; 2364 NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH;
2364 netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); 2365 netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
2365} 2366}
2366 2367
2367#endif /* CONFIG_ARPD */ 2368#endif /* CONFIG_ARPD */
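The NEIGH_CB() accessor used in the hunks above parks proxy-queue scheduling state in the skb control block instead of overloading skb->stamp. Its shape in this series is approximately the following; see net/core/neighbour.c after the patch for the authoritative definition:

	/* Per-skb state while a proxied packet waits on tbl->proxy_queue. */
	struct neighbour_cb {
		unsigned long sched_next;	/* jiffies to process at */
		unsigned int flags;		/* e.g. LOCALLY_ENQUEUED */
	};

	#define NEIGH_CB(skb)	((struct neighbour_cb *)(skb)->cb)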
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
deleted file mode 100644
index 076c156d5eda..000000000000
--- a/net/core/netfilter.c
+++ /dev/null
@@ -1,648 +0,0 @@
1/* netfilter.c: look after the filters for various protocols.
2 * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
3 *
4 * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
5 * way.
6 *
7 * Rusty Russell (C)2000 -- This code is GPL.
8 *
9 * February 2000: Modified by James Morris to have 1 queue per protocol.
10 * 15-Mar-2000: Added NF_REPEAT --RR.
11 * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik.
12 */
13#include <linux/config.h>
14#include <linux/kernel.h>
15#include <linux/netfilter.h>
16#include <net/protocol.h>
17#include <linux/init.h>
18#include <linux/skbuff.h>
19#include <linux/wait.h>
20#include <linux/module.h>
21#include <linux/interrupt.h>
22#include <linux/if.h>
23#include <linux/netdevice.h>
24#include <linux/inetdevice.h>
25#include <linux/tcp.h>
26#include <linux/udp.h>
27#include <linux/icmp.h>
28#include <net/sock.h>
29#include <net/route.h>
30#include <linux/ip.h>
31
32/* In this code, we can be waiting indefinitely for userspace to
33 * service a packet if a hook returns NF_QUEUE. We could keep a count
34 * of skbuffs queued for userspace, and not deregister a hook unless
35 * this is zero, but that sucks. Now, we simply check when the
36 * packets come back: if the hook is gone, the packet is discarded. */
37#ifdef CONFIG_NETFILTER_DEBUG
38#define NFDEBUG(format, args...) printk(format , ## args)
39#else
40#define NFDEBUG(format, args...)
41#endif
42
43/* Sockopts only registered and called from user context, so
44 net locking would be overkill. Also, [gs]etsockopt calls may
45 sleep. */
46static DECLARE_MUTEX(nf_sockopt_mutex);
47
48struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
49static LIST_HEAD(nf_sockopts);
50static DEFINE_SPINLOCK(nf_hook_lock);
51
52/*
53 * A queue handler may be registered for each protocol. Each is protected by
 54 * long term mutex. The handler must provide an outfn() to accept packets
55 * for queueing and must reinject all packets it receives, no matter what.
56 */
57static struct nf_queue_handler_t {
58 nf_queue_outfn_t outfn;
59 void *data;
60} queue_handler[NPROTO];
61static DEFINE_RWLOCK(queue_handler_lock);
62
63int nf_register_hook(struct nf_hook_ops *reg)
64{
65 struct list_head *i;
66
67 spin_lock_bh(&nf_hook_lock);
68 list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
69 if (reg->priority < ((struct nf_hook_ops *)i)->priority)
70 break;
71 }
72 list_add_rcu(&reg->list, i->prev);
73 spin_unlock_bh(&nf_hook_lock);
74
75 synchronize_net();
76 return 0;
77}
78
79void nf_unregister_hook(struct nf_hook_ops *reg)
80{
81 spin_lock_bh(&nf_hook_lock);
82 list_del_rcu(&reg->list);
83 spin_unlock_bh(&nf_hook_lock);
84
85 synchronize_net();
86}
87
88/* Do exclusive ranges overlap? */
89static inline int overlap(int min1, int max1, int min2, int max2)
90{
91 return max1 > min2 && min1 < max2;
92}
93
94/* Functions to register sockopt ranges (exclusive). */
95int nf_register_sockopt(struct nf_sockopt_ops *reg)
96{
97 struct list_head *i;
98 int ret = 0;
99
100 if (down_interruptible(&nf_sockopt_mutex) != 0)
101 return -EINTR;
102
103 list_for_each(i, &nf_sockopts) {
104 struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
105 if (ops->pf == reg->pf
106 && (overlap(ops->set_optmin, ops->set_optmax,
107 reg->set_optmin, reg->set_optmax)
108 || overlap(ops->get_optmin, ops->get_optmax,
109 reg->get_optmin, reg->get_optmax))) {
110 NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
111 ops->set_optmin, ops->set_optmax,
112 ops->get_optmin, ops->get_optmax,
113 reg->set_optmin, reg->set_optmax,
114 reg->get_optmin, reg->get_optmax);
115 ret = -EBUSY;
116 goto out;
117 }
118 }
119
120 list_add(&reg->list, &nf_sockopts);
121out:
122 up(&nf_sockopt_mutex);
123 return ret;
124}
125
126void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
127{
128 /* No point being interruptible: we're probably in cleanup_module() */
129 restart:
130 down(&nf_sockopt_mutex);
131 if (reg->use != 0) {
132 /* To be woken by nf_sockopt call... */
133 /* FIXME: Stuart Young's name appears gratuitously. */
134 set_current_state(TASK_UNINTERRUPTIBLE);
135 reg->cleanup_task = current;
136 up(&nf_sockopt_mutex);
137 schedule();
138 goto restart;
139 }
140 list_del(&reg->list);
141 up(&nf_sockopt_mutex);
142}
143
144/* Call get/setsockopt() */
145static int nf_sockopt(struct sock *sk, int pf, int val,
146 char __user *opt, int *len, int get)
147{
148 struct list_head *i;
149 struct nf_sockopt_ops *ops;
150 int ret;
151
152 if (down_interruptible(&nf_sockopt_mutex) != 0)
153 return -EINTR;
154
155 list_for_each(i, &nf_sockopts) {
156 ops = (struct nf_sockopt_ops *)i;
157 if (ops->pf == pf) {
158 if (get) {
159 if (val >= ops->get_optmin
160 && val < ops->get_optmax) {
161 ops->use++;
162 up(&nf_sockopt_mutex);
163 ret = ops->get(sk, val, opt, len);
164 goto out;
165 }
166 } else {
167 if (val >= ops->set_optmin
168 && val < ops->set_optmax) {
169 ops->use++;
170 up(&nf_sockopt_mutex);
171 ret = ops->set(sk, val, opt, *len);
172 goto out;
173 }
174 }
175 }
176 }
177 up(&nf_sockopt_mutex);
178 return -ENOPROTOOPT;
179
180 out:
181 down(&nf_sockopt_mutex);
182 ops->use--;
183 if (ops->cleanup_task)
184 wake_up_process(ops->cleanup_task);
185 up(&nf_sockopt_mutex);
186 return ret;
187}
188
189int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
190 int len)
191{
192 return nf_sockopt(sk, pf, val, opt, &len, 0);
193}
194
195int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
196{
197 return nf_sockopt(sk, pf, val, opt, len, 1);
198}
199
200static unsigned int nf_iterate(struct list_head *head,
201 struct sk_buff **skb,
202 int hook,
203 const struct net_device *indev,
204 const struct net_device *outdev,
205 struct list_head **i,
206 int (*okfn)(struct sk_buff *),
207 int hook_thresh)
208{
209 unsigned int verdict;
210
211 /*
212 * The caller must not block between calls to this
213 * function because of risk of continuing from deleted element.
214 */
215 list_for_each_continue_rcu(*i, head) {
216 struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
217
218 if (hook_thresh > elem->priority)
219 continue;
220
221 /* Optimization: we don't need to hold module
222 reference here, since function can't sleep. --RR */
223 verdict = elem->hook(hook, skb, indev, outdev, okfn);
224 if (verdict != NF_ACCEPT) {
225#ifdef CONFIG_NETFILTER_DEBUG
226 if (unlikely(verdict > NF_MAX_VERDICT)) {
227 NFDEBUG("Evil return from %p(%u).\n",
228 elem->hook, hook);
229 continue;
230 }
231#endif
232 if (verdict != NF_REPEAT)
233 return verdict;
234 *i = (*i)->prev;
235 }
236 }
237 return NF_ACCEPT;
238}
239
240int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
241{
242 int ret;
243
244 write_lock_bh(&queue_handler_lock);
245 if (queue_handler[pf].outfn)
246 ret = -EBUSY;
247 else {
248 queue_handler[pf].outfn = outfn;
249 queue_handler[pf].data = data;
250 ret = 0;
251 }
252 write_unlock_bh(&queue_handler_lock);
253
254 return ret;
255}
256
257/* The caller must flush their queue before this */
258int nf_unregister_queue_handler(int pf)
259{
260 write_lock_bh(&queue_handler_lock);
261 queue_handler[pf].outfn = NULL;
262 queue_handler[pf].data = NULL;
263 write_unlock_bh(&queue_handler_lock);
264
265 return 0;
266}
267
268/*
269 * Any packet that leaves via this function must come back
270 * through nf_reinject().
271 */
272static int nf_queue(struct sk_buff *skb,
273 struct list_head *elem,
274 int pf, unsigned int hook,
275 struct net_device *indev,
276 struct net_device *outdev,
277 int (*okfn)(struct sk_buff *))
278{
279 int status;
280 struct nf_info *info;
281#ifdef CONFIG_BRIDGE_NETFILTER
282 struct net_device *physindev = NULL;
283 struct net_device *physoutdev = NULL;
284#endif
285
 286	/* QUEUE == DROP if no one is waiting, to be safe. */
287 read_lock(&queue_handler_lock);
288 if (!queue_handler[pf].outfn) {
289 read_unlock(&queue_handler_lock);
290 kfree_skb(skb);
291 return 1;
292 }
293
294 info = kmalloc(sizeof(*info), GFP_ATOMIC);
295 if (!info) {
296 if (net_ratelimit())
297 printk(KERN_ERR "OOM queueing packet %p\n",
298 skb);
299 read_unlock(&queue_handler_lock);
300 kfree_skb(skb);
301 return 1;
302 }
303
304 *info = (struct nf_info) {
305 (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
306
307 /* If it's going away, ignore hook. */
308 if (!try_module_get(info->elem->owner)) {
309 read_unlock(&queue_handler_lock);
310 kfree(info);
311 return 0;
312 }
313
314 /* Bump dev refs so they don't vanish while packet is out */
315 if (indev) dev_hold(indev);
316 if (outdev) dev_hold(outdev);
317
318#ifdef CONFIG_BRIDGE_NETFILTER
319 if (skb->nf_bridge) {
320 physindev = skb->nf_bridge->physindev;
321 if (physindev) dev_hold(physindev);
322 physoutdev = skb->nf_bridge->physoutdev;
323 if (physoutdev) dev_hold(physoutdev);
324 }
325#endif
326
327 status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data);
328 read_unlock(&queue_handler_lock);
329
330 if (status < 0) {
331 /* James M doesn't say fuck enough. */
332 if (indev) dev_put(indev);
333 if (outdev) dev_put(outdev);
334#ifdef CONFIG_BRIDGE_NETFILTER
335 if (physindev) dev_put(physindev);
336 if (physoutdev) dev_put(physoutdev);
337#endif
338 module_put(info->elem->owner);
339 kfree(info);
340 kfree_skb(skb);
341 return 1;
342 }
343 return 1;
344}
345
346/* Returns 1 if okfn() needs to be executed by the caller,
347 * -EPERM for NF_DROP, 0 otherwise. */
348int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
349 struct net_device *indev,
350 struct net_device *outdev,
351 int (*okfn)(struct sk_buff *),
352 int hook_thresh)
353{
354 struct list_head *elem;
355 unsigned int verdict;
356 int ret = 0;
357
358 /* We may already have this, but read-locks nest anyway */
359 rcu_read_lock();
360
361 elem = &nf_hooks[pf][hook];
362next_hook:
363 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
364 outdev, &elem, okfn, hook_thresh);
365 if (verdict == NF_ACCEPT || verdict == NF_STOP) {
366 ret = 1;
367 goto unlock;
368 } else if (verdict == NF_DROP) {
369 kfree_skb(*pskb);
370 ret = -EPERM;
371 } else if (verdict == NF_QUEUE) {
372 NFDEBUG("nf_hook: Verdict = QUEUE.\n");
373 if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn))
374 goto next_hook;
375 }
376unlock:
377 rcu_read_unlock();
378 return ret;
379}
380
381void nf_reinject(struct sk_buff *skb, struct nf_info *info,
382 unsigned int verdict)
383{
384 struct list_head *elem = &info->elem->list;
385 struct list_head *i;
386
387 rcu_read_lock();
388
389 /* Release those devices we held, or Alexey will kill me. */
390 if (info->indev) dev_put(info->indev);
391 if (info->outdev) dev_put(info->outdev);
392#ifdef CONFIG_BRIDGE_NETFILTER
393 if (skb->nf_bridge) {
394 if (skb->nf_bridge->physindev)
395 dev_put(skb->nf_bridge->physindev);
396 if (skb->nf_bridge->physoutdev)
397 dev_put(skb->nf_bridge->physoutdev);
398 }
399#endif
400
401 /* Drop reference to owner of hook which queued us. */
402 module_put(info->elem->owner);
403
404 list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
405 if (i == elem)
406 break;
407 }
408
409 if (elem == &nf_hooks[info->pf][info->hook]) {
410 /* The module which sent it to userspace is gone. */
411 NFDEBUG("%s: module disappeared, dropping packet.\n",
412 __FUNCTION__);
413 verdict = NF_DROP;
414 }
415
416 /* Continue traversal iff userspace said ok... */
417 if (verdict == NF_REPEAT) {
418 elem = elem->prev;
419 verdict = NF_ACCEPT;
420 }
421
422 if (verdict == NF_ACCEPT) {
423 next_hook:
424 verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
425 &skb, info->hook,
426 info->indev, info->outdev, &elem,
427 info->okfn, INT_MIN);
428 }
429
430 switch (verdict) {
431 case NF_ACCEPT:
432 info->okfn(skb);
433 break;
434
435 case NF_QUEUE:
436 if (!nf_queue(skb, elem, info->pf, info->hook,
437 info->indev, info->outdev, info->okfn))
438 goto next_hook;
439 break;
440 }
441 rcu_read_unlock();
442
443 if (verdict == NF_DROP)
444 kfree_skb(skb);
445
446 kfree(info);
447 return;
448}
449
450#ifdef CONFIG_INET
451/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
452int ip_route_me_harder(struct sk_buff **pskb)
453{
454 struct iphdr *iph = (*pskb)->nh.iph;
455 struct rtable *rt;
456 struct flowi fl = {};
457 struct dst_entry *odst;
458 unsigned int hh_len;
459
460 /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
461 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
462 */
463 if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
464 fl.nl_u.ip4_u.daddr = iph->daddr;
465 fl.nl_u.ip4_u.saddr = iph->saddr;
466 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
467 fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
468#ifdef CONFIG_IP_ROUTE_FWMARK
469 fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
470#endif
471 fl.proto = iph->protocol;
472 if (ip_route_output_key(&rt, &fl) != 0)
473 return -1;
474
475 /* Drop old route. */
476 dst_release((*pskb)->dst);
477 (*pskb)->dst = &rt->u.dst;
478 } else {
479 /* non-local src, find valid iif to satisfy
480 * rp-filter when calling ip_route_input. */
481 fl.nl_u.ip4_u.daddr = iph->saddr;
482 if (ip_route_output_key(&rt, &fl) != 0)
483 return -1;
484
485 odst = (*pskb)->dst;
486 if (ip_route_input(*pskb, iph->daddr, iph->saddr,
487 RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
488 dst_release(&rt->u.dst);
489 return -1;
490 }
491 dst_release(&rt->u.dst);
492 dst_release(odst);
493 }
494
495 if ((*pskb)->dst->error)
496 return -1;
497
498 /* Change in oif may mean change in hh_len. */
499 hh_len = (*pskb)->dst->dev->hard_header_len;
500 if (skb_headroom(*pskb) < hh_len) {
501 struct sk_buff *nskb;
502
503 nskb = skb_realloc_headroom(*pskb, hh_len);
504 if (!nskb)
505 return -1;
506 if ((*pskb)->sk)
507 skb_set_owner_w(nskb, (*pskb)->sk);
508 kfree_skb(*pskb);
509 *pskb = nskb;
510 }
511
512 return 0;
513}
514EXPORT_SYMBOL(ip_route_me_harder);
515
516int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len)
517{
518 struct sk_buff *nskb;
519
520 if (writable_len > (*pskb)->len)
521 return 0;
522
523 /* Not exclusive use of packet? Must copy. */
524 if (skb_shared(*pskb) || skb_cloned(*pskb))
525 goto copy_skb;
526
527 return pskb_may_pull(*pskb, writable_len);
528
529copy_skb:
530 nskb = skb_copy(*pskb, GFP_ATOMIC);
531 if (!nskb)
532 return 0;
533 BUG_ON(skb_is_nonlinear(nskb));
534
535 /* Rest of kernel will get very unhappy if we pass it a
536 suddenly-orphaned skbuff */
537 if ((*pskb)->sk)
538 skb_set_owner_w(nskb, (*pskb)->sk);
539 kfree_skb(*pskb);
540 *pskb = nskb;
541 return 1;
542}
543EXPORT_SYMBOL(skb_ip_make_writable);
544#endif /*CONFIG_INET*/
545
546/* Internal logging interface, which relies on the real
547 LOG target modules */
548
549#define NF_LOG_PREFIXLEN 128
550
551static nf_logfn *nf_logging[NPROTO]; /* = NULL */
552static int reported = 0;
553static DEFINE_SPINLOCK(nf_log_lock);
554
555int nf_log_register(int pf, nf_logfn *logfn)
556{
557 int ret = -EBUSY;
558
559 /* Any setup of logging members must be done before
560 * substituting pointer. */
561 spin_lock(&nf_log_lock);
562 if (!nf_logging[pf]) {
563 rcu_assign_pointer(nf_logging[pf], logfn);
564 ret = 0;
565 }
566 spin_unlock(&nf_log_lock);
567 return ret;
568}
569
570void nf_log_unregister(int pf, nf_logfn *logfn)
571{
572 spin_lock(&nf_log_lock);
573 if (nf_logging[pf] == logfn)
574 nf_logging[pf] = NULL;
575 spin_unlock(&nf_log_lock);
576
577 /* Give time to concurrent readers. */
578 synchronize_net();
579}
580
581void nf_log_packet(int pf,
582 unsigned int hooknum,
583 const struct sk_buff *skb,
584 const struct net_device *in,
585 const struct net_device *out,
586 const char *fmt, ...)
587{
588 va_list args;
589 char prefix[NF_LOG_PREFIXLEN];
590 nf_logfn *logfn;
591
592 rcu_read_lock();
593 logfn = rcu_dereference(nf_logging[pf]);
594 if (logfn) {
595 va_start(args, fmt);
596 vsnprintf(prefix, sizeof(prefix), fmt, args);
597 va_end(args);
598 /* We must read logging before nf_logfn[pf] */
599 logfn(hooknum, skb, in, out, prefix);
600 } else if (!reported) {
601 printk(KERN_WARNING "nf_log_packet: can\'t log yet, "
602 "no backend logging module loaded in!\n");
603 reported++;
604 }
605 rcu_read_unlock();
606}
607EXPORT_SYMBOL(nf_log_register);
608EXPORT_SYMBOL(nf_log_unregister);
609EXPORT_SYMBOL(nf_log_packet);
610
611/* This does not belong here, but locally generated errors need it if connection
 612 tracking is in use: without this, the connection may not be in the hash table,
 613 and hence manufactured ICMP or RST packets will not be associated with it. */
614void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
615
616void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
617{
618 void (*attach)(struct sk_buff *, struct sk_buff *);
619
620 if (skb->nfct && (attach = ip_ct_attach) != NULL) {
621 mb(); /* Just to be sure: must be read before executing this */
622 attach(new, skb);
623 }
624}
625
626void __init netfilter_init(void)
627{
628 int i, h;
629
630 for (i = 0; i < NPROTO; i++) {
631 for (h = 0; h < NF_MAX_HOOKS; h++)
632 INIT_LIST_HEAD(&nf_hooks[i][h]);
633 }
634}
635
636EXPORT_SYMBOL(ip_ct_attach);
637EXPORT_SYMBOL(nf_ct_attach);
638EXPORT_SYMBOL(nf_getsockopt);
639EXPORT_SYMBOL(nf_hook_slow);
640EXPORT_SYMBOL(nf_hooks);
641EXPORT_SYMBOL(nf_register_hook);
642EXPORT_SYMBOL(nf_register_queue_handler);
643EXPORT_SYMBOL(nf_register_sockopt);
644EXPORT_SYMBOL(nf_reinject);
645EXPORT_SYMBOL(nf_setsockopt);
646EXPORT_SYMBOL(nf_unregister_hook);
647EXPORT_SYMBOL(nf_unregister_queue_handler);
648EXPORT_SYMBOL(nf_unregister_sockopt);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index c327c9edadc5..a1a9a7abff50 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -33,6 +33,7 @@
33#define MAX_UDP_CHUNK 1460 33#define MAX_UDP_CHUNK 1460
34#define MAX_SKBS 32 34#define MAX_SKBS 32
35#define MAX_QUEUE_DEPTH (MAX_SKBS / 2) 35#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
36#define MAX_RETRIES 20000
36 37
37static DEFINE_SPINLOCK(skb_list_lock); 38static DEFINE_SPINLOCK(skb_list_lock);
38static int nr_skbs; 39static int nr_skbs;
@@ -248,14 +249,14 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
248 int status; 249 int status;
249 struct netpoll_info *npinfo; 250 struct netpoll_info *npinfo;
250 251
251repeat: 252 if (!np || !np->dev || !netif_running(np->dev)) {
252 if(!np || !np->dev || !netif_running(np->dev)) {
253 __kfree_skb(skb); 253 __kfree_skb(skb);
254 return; 254 return;
255 } 255 }
256 256
257 /* avoid recursion */
258 npinfo = np->dev->npinfo; 257 npinfo = np->dev->npinfo;
258
259 /* avoid recursion */
259 if (npinfo->poll_owner == smp_processor_id() || 260 if (npinfo->poll_owner == smp_processor_id() ||
260 np->dev->xmit_lock_owner == smp_processor_id()) { 261 np->dev->xmit_lock_owner == smp_processor_id()) {
261 if (np->drop) 262 if (np->drop)
@@ -265,30 +266,37 @@ repeat:
265 return; 266 return;
266 } 267 }
267 268
268 spin_lock(&np->dev->xmit_lock); 269 do {
269 np->dev->xmit_lock_owner = smp_processor_id(); 270 npinfo->tries--;
271 spin_lock(&np->dev->xmit_lock);
272 np->dev->xmit_lock_owner = smp_processor_id();
270 273
271 /* 274 /*
272 * network drivers do not expect to be called if the queue is 275 * network drivers do not expect to be called if the queue is
273 * stopped. 276 * stopped.
274 */ 277 */
275 if (netif_queue_stopped(np->dev)) { 278 if (netif_queue_stopped(np->dev)) {
279 np->dev->xmit_lock_owner = -1;
280 spin_unlock(&np->dev->xmit_lock);
281 netpoll_poll(np);
282 udelay(50);
283 continue;
284 }
285
286 status = np->dev->hard_start_xmit(skb, np->dev);
276 np->dev->xmit_lock_owner = -1; 287 np->dev->xmit_lock_owner = -1;
277 spin_unlock(&np->dev->xmit_lock); 288 spin_unlock(&np->dev->xmit_lock);
278 289
279 netpoll_poll(np); 290 /* success */
280 goto repeat; 291 if(!status) {
281 } 292 npinfo->tries = MAX_RETRIES; /* reset */
282 293 return;
283 status = np->dev->hard_start_xmit(skb, np->dev); 294 }
284 np->dev->xmit_lock_owner = -1;
285 spin_unlock(&np->dev->xmit_lock);
286 295
287 /* transmit busy */ 296 /* transmit busy */
288 if(status) {
289 netpoll_poll(np); 297 netpoll_poll(np);
290 goto repeat; 298 udelay(50);
291 } 299 } while (npinfo->tries > 0);
292} 300}
293 301
294void netpoll_send_udp(struct netpoll *np, const char *msg, int len) 302void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -349,15 +357,11 @@ static void arp_reply(struct sk_buff *skb)
349 unsigned char *arp_ptr; 357 unsigned char *arp_ptr;
350 int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; 358 int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
351 u32 sip, tip; 359 u32 sip, tip;
352 unsigned long flags;
353 struct sk_buff *send_skb; 360 struct sk_buff *send_skb;
354 struct netpoll *np = NULL; 361 struct netpoll *np = NULL;
355 362
356 spin_lock_irqsave(&npinfo->rx_lock, flags);
357 if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev) 363 if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
358 np = npinfo->rx_np; 364 np = npinfo->rx_np;
359 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
360
361 if (!np) 365 if (!np)
362 return; 366 return;
363 367
@@ -639,9 +643,11 @@ int netpoll_setup(struct netpoll *np)
639 if (!npinfo) 643 if (!npinfo)
640 goto release; 644 goto release;
641 645
646 npinfo->rx_flags = 0;
642 npinfo->rx_np = NULL; 647 npinfo->rx_np = NULL;
643 npinfo->poll_lock = SPIN_LOCK_UNLOCKED; 648 npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
644 npinfo->poll_owner = -1; 649 npinfo->poll_owner = -1;
650 npinfo->tries = MAX_RETRIES;
645 npinfo->rx_lock = SPIN_LOCK_UNLOCKED; 651 npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
646 } else 652 } else
647 npinfo = ndev->npinfo; 653 npinfo = ndev->npinfo;
@@ -718,9 +724,16 @@ int netpoll_setup(struct netpoll *np)
718 npinfo->rx_np = np; 724 npinfo->rx_np = np;
719 spin_unlock_irqrestore(&npinfo->rx_lock, flags); 725 spin_unlock_irqrestore(&npinfo->rx_lock, flags);
720 } 726 }
727
728 /* fill up the skb queue */
729 refill_skbs();
730
721 /* last thing to do is link it to the net device structure */ 731 /* last thing to do is link it to the net device structure */
722 ndev->npinfo = npinfo; 732 ndev->npinfo = npinfo;
723 733
734 /* avoid racing with NAPI reading npinfo */
735 synchronize_rcu();
736
724 return 0; 737 return 0;
725 738
726 release: 739 release:
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 975d651312dc..8eb083b6041a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -363,7 +363,7 @@ struct pktgen_thread {
363 * All Rights Reserved. 363 * All Rights Reserved.
364 * 364 *
365 */ 365 */
366inline static s64 divremdi3(s64 x, s64 y, int type) 366static inline s64 divremdi3(s64 x, s64 y, int type)
367{ 367{
368 u64 a = (x < 0) ? -x : x; 368 u64 a = (x < 0) ? -x : x;
369 u64 b = (y < 0) ? -y : y; 369 u64 b = (y < 0) ? -y : y;
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index bb55675f0685..b8203de5ff07 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -32,7 +32,6 @@
32 * Further increasing requires to change hash table size. 32 * Further increasing requires to change hash table size.
33 */ 33 */
34int sysctl_max_syn_backlog = 256; 34int sysctl_max_syn_backlog = 256;
35EXPORT_SYMBOL(sysctl_max_syn_backlog);
36 35
37int reqsk_queue_alloc(struct request_sock_queue *queue, 36int reqsk_queue_alloc(struct request_sock_queue *queue,
38 const int nr_table_entries) 37 const int nr_table_entries)
@@ -53,6 +52,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
53 get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); 52 get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
54 rwlock_init(&queue->syn_wait_lock); 53 rwlock_init(&queue->syn_wait_lock);
 55	queue->rskq_accept_head = NULL; 54
55 queue->rskq_defer_accept = 0;
56 lopt->nr_table_entries = nr_table_entries;
56 57
57 write_lock_bh(&queue->syn_wait_lock); 58 write_lock_bh(&queue->syn_wait_lock);
58 queue->listen_opt = lopt; 59 queue->listen_opt = lopt;
@@ -62,3 +63,28 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
62} 63}
63 64
64EXPORT_SYMBOL(reqsk_queue_alloc); 65EXPORT_SYMBOL(reqsk_queue_alloc);
66
67void reqsk_queue_destroy(struct request_sock_queue *queue)
68{
69 /* make all the listen_opt local to us */
70 struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
71
72 if (lopt->qlen != 0) {
73 int i;
74
75 for (i = 0; i < lopt->nr_table_entries; i++) {
76 struct request_sock *req;
77
78 while ((req = lopt->syn_table[i]) != NULL) {
79 lopt->syn_table[i] = req->dl_next;
80 lopt->qlen--;
81 reqsk_free(req);
82 }
83 }
84 }
85
86 BUG_TRAP(lopt->qlen == 0);
87 kfree(lopt);
88}
89
90EXPORT_SYMBOL(reqsk_queue_destroy);
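A hypothetical teardown path using the new destructor; the surrounding function is illustrative, not from this patch:

	static void example_listen_close(struct request_sock_queue *queue)
	{
		/* Drops every request still parked in the SYN table,
		 * then frees the listen_sock itself. */
		reqsk_queue_destroy(queue);
	}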
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4b1bb30e6381..9bed7569ce3f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -148,7 +148,7 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
148{ 148{
149 int err = 0; 149 int err = 0;
150 150
151 NETLINK_CB(skb).dst_groups = group; 151 NETLINK_CB(skb).dst_group = group;
152 if (echo) 152 if (echo)
153 atomic_inc(&skb->users); 153 atomic_inc(&skb->users);
154 netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); 154 netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
@@ -458,8 +458,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
458 kfree_skb(skb); 458 kfree_skb(skb);
459 return; 459 return;
460 } 460 }
461 NETLINK_CB(skb).dst_groups = RTMGRP_LINK; 461 NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
462 netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); 462 netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL);
463} 463}
464 464
465static int rtnetlink_done(struct netlink_callback *cb) 465static int rtnetlink_done(struct netlink_callback *cb)
@@ -708,7 +708,8 @@ void __init rtnetlink_init(void)
708 if (!rta_buf) 708 if (!rta_buf)
709 panic("rtnetlink_init: cannot allocate rta_buf\n"); 709 panic("rtnetlink_init: cannot allocate rta_buf\n");
710 710
711 rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); 711 rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv,
712 THIS_MODULE);
712 if (rtnl == NULL) 713 if (rtnl == NULL)
713 panic("rtnetlink_init: cannot initialize rtnetlink\n"); 714 panic("rtnetlink_init: cannot initialize rtnetlink\n");
714 netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); 715 netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d9f7b06fe886..f80a28785610 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -68,7 +68,10 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
-static kmem_cache_t *skbuff_head_cache;
+static kmem_cache_t *skbuff_head_cache __read_mostly;
+static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+
+struct timeval __read_mostly skb_tv_base;
 
 /*
  *	Keep out-of-line to prevent kernel bloat.
@@ -118,7 +121,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  */
 
 /**
- *	alloc_skb	-	allocate a network buffer
+ *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
  *	@gfp_mask: allocation mask
  *
@@ -129,14 +132,20 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	Buffers may only be allocated from interrupts using a @gfp_mask of
  *	%GFP_ATOMIC.
  */
-struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
+struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask,
+			    int fclone)
 {
 	struct sk_buff *skb;
 	u8 *data;
 
 	/* Get the HEAD */
-	skb = kmem_cache_alloc(skbuff_head_cache,
-			       gfp_mask & ~__GFP_DMA);
+	if (fclone)
+		skb = kmem_cache_alloc(skbuff_fclone_cache,
+				       gfp_mask & ~__GFP_DMA);
+	else
+		skb = kmem_cache_alloc(skbuff_head_cache,
+				       gfp_mask & ~__GFP_DMA);
+
 	if (!skb)
 		goto out;
 
@@ -153,7 +162,15 @@ struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
 	skb->data = data;
 	skb->tail = data;
 	skb->end  = data + size;
+	if (fclone) {
+		struct sk_buff *child = skb + 1;
+		atomic_t *fclone_ref = (atomic_t *) (child + 1);
 
+		skb->fclone = SKB_FCLONE_ORIG;
+		atomic_set(fclone_ref, 1);
+
+		child->fclone = SKB_FCLONE_UNAVAILABLE;
+	}
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags = 0;
 	skb_shinfo(skb)->tso_size = 0;
@@ -266,8 +283,34 @@ void skb_release_data(struct sk_buff *skb)
  */
 void kfree_skbmem(struct sk_buff *skb)
 {
+	struct sk_buff *other;
+	atomic_t *fclone_ref;
+
 	skb_release_data(skb);
-	kmem_cache_free(skbuff_head_cache, skb);
+	switch (skb->fclone) {
+	case SKB_FCLONE_UNAVAILABLE:
+		kmem_cache_free(skbuff_head_cache, skb);
+		break;
+
+	case SKB_FCLONE_ORIG:
+		fclone_ref = (atomic_t *) (skb + 2);
+		if (atomic_dec_and_test(fclone_ref))
+			kmem_cache_free(skbuff_fclone_cache, skb);
+		break;
+
+	case SKB_FCLONE_CLONE:
+		fclone_ref = (atomic_t *) (skb + 1);
+		other = skb - 1;
+
+		/* The clone portion is available for
+		 * fast-cloning again.
+		 */
+		skb->fclone = SKB_FCLONE_UNAVAILABLE;
+
+		if (atomic_dec_and_test(fclone_ref))
+			kmem_cache_free(skbuff_fclone_cache, other);
+		break;
+	};
 }
 
 /**
@@ -281,8 +324,6 @@ void kfree_skbmem(struct sk_buff *skb)
 
 void __kfree_skb(struct sk_buff *skb)
 {
-	BUG_ON(skb->list != NULL);
-
 	dst_release(skb->dst);
 #ifdef CONFIG_XFRM
 	secpath_put(skb->sp);
@@ -302,7 +343,6 @@ void __kfree_skb(struct sk_buff *skb)
 	skb->tc_index = 0;
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = 0;
-	skb->tc_classid = 0;
 #endif
 #endif
 
@@ -325,19 +365,27 @@ void __kfree_skb(struct sk_buff *skb)
 
 struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
-	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+	struct sk_buff *n;
 
-	if (!n)
-		return NULL;
+	n = skb + 1;
+	if (skb->fclone == SKB_FCLONE_ORIG &&
+	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
+		atomic_t *fclone_ref = (atomic_t *) (n + 1);
+		n->fclone = SKB_FCLONE_CLONE;
+		atomic_inc(fclone_ref);
+	} else {
+		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+		if (!n)
+			return NULL;
+		n->fclone = SKB_FCLONE_UNAVAILABLE;
+	}
 
 #define C(x) n->x = skb->x
 
 	n->next = n->prev = NULL;
-	n->list = NULL;
 	n->sk = NULL;
-	C(stamp);
+	C(tstamp);
 	C(dev);
-	C(real_dev);
 	C(h);
 	C(nh);
 	C(mac);
@@ -361,7 +409,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 	n->destructor = NULL;
 #ifdef CONFIG_NETFILTER
 	C(nfmark);
-	C(nfcache);
 	C(nfct);
 	nf_conntrack_get(skb->nfct);
 	C(nfctinfo);
@@ -370,17 +417,13 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 	nf_bridge_get(skb->nf_bridge);
 #endif
 #endif /*CONFIG_NETFILTER*/
-#if defined(CONFIG_HIPPI)
-	C(private);
-#endif
 #ifdef CONFIG_NET_SCHED
 	C(tc_index);
 #ifdef CONFIG_NET_CLS_ACT
 	n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
-	n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd);
-	n->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
+	n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
+	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
 	C(input_dev);
-	C(tc_classid);
 #endif
 
 #endif
@@ -404,10 +447,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	 */
 	unsigned long offset = new->data - old->data;
 
-	new->list	= NULL;
 	new->sk		= NULL;
 	new->dev	= old->dev;
-	new->real_dev	= old->real_dev;
 	new->priority	= old->priority;
 	new->protocol	= old->protocol;
 	new->dst	= dst_clone(old->dst);
@@ -419,12 +460,12 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->mac.raw	= old->mac.raw + offset;
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	new->local_df	= old->local_df;
+	new->fclone	= SKB_FCLONE_UNAVAILABLE;
 	new->pkt_type	= old->pkt_type;
-	new->stamp	= old->stamp;
+	new->tstamp	= old->tstamp;
 	new->destructor = NULL;
 #ifdef CONFIG_NETFILTER
 	new->nfmark	= old->nfmark;
-	new->nfcache	= old->nfcache;
 	new->nfct	= old->nfct;
 	nf_conntrack_get(old->nfct);
 	new->nfctinfo	= old->nfctinfo;
@@ -1344,50 +1385,43 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
 	__skb_queue_tail(list, newsk);
 	spin_unlock_irqrestore(&list->lock, flags);
 }
+
 /**
  *	skb_unlink	-	remove a buffer from a list
  *	@skb: buffer to remove
+ *	@list: list to use
  *
- *	Place a packet after a given packet in a list. The list locks are taken
- *	and this function is atomic with respect to other list locked calls
+ *	Remove a packet from a list. The list locks are taken and this
+ *	function is atomic with respect to other list locked calls
  *
- *	Works even without knowing the list it is sitting on, which can be
- *	handy at times. It also means that THE LIST MUST EXIST when you
- *	unlink. Thus a list must have its contents unlinked before it is
- *	destroyed.
+ *	You must know what list the SKB is on.
  */
-void skb_unlink(struct sk_buff *skb)
+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
 {
-	struct sk_buff_head *list = skb->list;
-
-	if (list) {
-		unsigned long flags;
+	unsigned long flags;
 
-		spin_lock_irqsave(&list->lock, flags);
-		if (skb->list == list)
-			__skb_unlink(skb, skb->list);
-		spin_unlock_irqrestore(&list->lock, flags);
-	}
+	spin_lock_irqsave(&list->lock, flags);
+	__skb_unlink(skb, list);
+	spin_unlock_irqrestore(&list->lock, flags);
 }
 
-
 /**
  *	skb_append	-	append a buffer
  *	@old: buffer to insert after
  *	@newsk: buffer to insert
+ *	@list: list to use
  *
 *	Place a packet after a given packet in a list. The list locks are taken
 *	and this function is atomic with respect to other list locked calls.
 *	A buffer cannot be placed on two lists at the same time.
 */
-
-void skb_append(struct sk_buff *old, struct sk_buff *newsk)
+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&old->list->lock, flags);
-	__skb_append(old, newsk);
-	spin_unlock_irqrestore(&old->list->lock, flags);
+	spin_lock_irqsave(&list->lock, flags);
+	__skb_append(old, newsk, list);
+	spin_unlock_irqrestore(&list->lock, flags);
 }
 
 
@@ -1395,19 +1429,21 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk)
  *	skb_insert	-	insert a buffer
  *	@old: buffer to insert before
  *	@newsk: buffer to insert
+ *	@list: list to use
+ *
+ *	Place a packet before a given packet in a list. The list locks are
+ *	taken and this function is atomic with respect to other list locked
+ *	calls.
  *
- *	Place a packet before a given packet in a list. The list locks are taken
- *	and this function is atomic with respect to other list locked calls
 *	A buffer cannot be placed on two lists at the same time.
 */
-
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk)
+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&old->list->lock, flags);
-	__skb_insert(newsk, old->prev, old, old->list);
-	spin_unlock_irqrestore(&old->list->lock, flags);
+	spin_lock_irqsave(&list->lock, flags);
+	__skb_insert(newsk, old->prev, old, list);
+	spin_unlock_irqrestore(&list->lock, flags);
 }
 
 #if 0
@@ -1663,12 +1699,23 @@ void __init skb_init(void)
 					      NULL, NULL);
 	if (!skbuff_head_cache)
 		panic("cannot create skbuff cache");
+
+	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+						(2*sizeof(struct sk_buff)) +
+						sizeof(atomic_t),
+						0,
+						SLAB_HWCACHE_ALIGN,
+						NULL, NULL);
+	if (!skbuff_fclone_cache)
+		panic("cannot create skbuff cache");
+
+	do_gettimeofday(&skb_tv_base);
 }
 
 EXPORT_SYMBOL(___pskb_trim);
 EXPORT_SYMBOL(__kfree_skb);
 EXPORT_SYMBOL(__pskb_pull_tail);
-EXPORT_SYMBOL(alloc_skb);
+EXPORT_SYMBOL(__alloc_skb);
 EXPORT_SYMBOL(pskb_copy);
 EXPORT_SYMBOL(pskb_expand_head);
 EXPORT_SYMBOL(skb_checksum);
@@ -1696,3 +1743,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read);
 EXPORT_SYMBOL(skb_seq_read);
 EXPORT_SYMBOL(skb_abort_seq_read);
 EXPORT_SYMBOL(skb_find_text);
+EXPORT_SYMBOL(skb_tv_base);
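
The fclone ("fast clone") changes above all hang off one memory layout: a skbuff_fclone_cache object is 2 * sizeof(struct sk_buff) + sizeof(atomic_t), i.e. a parent sk_buff, a companion clone sk_buff, and a shared reference count packed back to back. That is what makes the pointer arithmetic line up in __alloc_skb() (skb + 1, child + 1), skb_clone() (skb + 1, n + 1) and kfree_skbmem() (skb + 2, skb - 1). A standalone sketch of the arithmetic with stand-in types (the real struct sk_buff is of course different):

    #include <stdio.h>

    struct sk_buff { int fclone; char rest[200]; };  /* stand-in, not the real struct */
    typedef int atomic_t;

    int main(void)
    {
        /* one skbuff_fclone_cache object: [ orig | clone | refcount ] */
        static char obj[2 * sizeof(struct sk_buff) + sizeof(atomic_t)];
        struct sk_buff *orig  = (struct sk_buff *)obj;
        struct sk_buff *child = orig + 1;                /* __alloc_skb()       */
        atomic_t *ref         = (atomic_t *)(child + 1); /* same as (orig + 2)  */

        printf("orig=%p child=%p ref=%p\n", (void *)orig, (void *)child, (void *)ref);
        printf("ref reached from the orig:  %p\n", (void *)(atomic_t *)(orig + 2));
        printf("orig reached from the clone: %p\n", (void *)(child - 1)); /* kfree_skbmem(CLONE) */
        return 0;
    }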
diff --git a/net/core/sock.c b/net/core/sock.c
index 8b35ccdc2b3b..c13594579bfb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -206,13 +206,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	 */
 
 #ifdef SO_DONTLINGER		/* Compatibility item... */
-	switch (optname) {
-	case SO_DONTLINGER:
-		sock_reset_flag(sk, SOCK_LINGER);
-		return 0;
+	if (optname == SO_DONTLINGER) {
+		lock_sock(sk);
+		sock_reset_flag(sk, SOCK_LINGER);
+		release_sock(sk);
+		return 0;
 	}
 #endif
 
 	if(optlen<sizeof(int))
 		return(-EINVAL);
 
@@ -259,7 +260,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 
 		if (val > sysctl_wmem_max)
 			val = sysctl_wmem_max;
-
+set_sndbuf:
 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 		if ((val * 2) < SOCK_MIN_SNDBUF)
 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
@@ -273,6 +274,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			sk->sk_write_space(sk);
 		break;
 
+	case SO_SNDBUFFORCE:
+		if (!capable(CAP_NET_ADMIN)) {
+			ret = -EPERM;
+			break;
+		}
+		goto set_sndbuf;
+
 	case SO_RCVBUF:
 		/* Don't error on this BSD doesn't and if you think
 		   about it this is right. Otherwise apps have to
@@ -281,7 +289,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 
 		if (val > sysctl_rmem_max)
 			val = sysctl_rmem_max;
-
+set_rcvbuf:
 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 		/* FIXME: is this lower bound the right one? */
 		if ((val * 2) < SOCK_MIN_RCVBUF)
@@ -290,6 +298,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			sk->sk_rcvbuf = val * 2;
 		break;
 
+	case SO_RCVBUFFORCE:
+		if (!capable(CAP_NET_ADMIN)) {
+			ret = -EPERM;
+			break;
+		}
+		goto set_rcvbuf;
+
 	case SO_KEEPALIVE:
 #ifdef CONFIG_INET
 		if (sk->sk_protocol == IPPROTO_TCP)
@@ -685,6 +700,80 @@ void sk_free(struct sock *sk)
 	module_put(owner);
 }
 
+struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority)
+{
+	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
+
+	if (newsk != NULL) {
+		struct sk_filter *filter;
+
+		memcpy(newsk, sk, sk->sk_prot->obj_size);
+
+		/* SANITY */
+		sk_node_init(&newsk->sk_node);
+		sock_lock_init(newsk);
+		bh_lock_sock(newsk);
+
+		atomic_set(&newsk->sk_rmem_alloc, 0);
+		atomic_set(&newsk->sk_wmem_alloc, 0);
+		atomic_set(&newsk->sk_omem_alloc, 0);
+		skb_queue_head_init(&newsk->sk_receive_queue);
+		skb_queue_head_init(&newsk->sk_write_queue);
+
+		rwlock_init(&newsk->sk_dst_lock);
+		rwlock_init(&newsk->sk_callback_lock);
+
+		newsk->sk_dst_cache	= NULL;
+		newsk->sk_wmem_queued	= 0;
+		newsk->sk_forward_alloc = 0;
+		newsk->sk_send_head	= NULL;
+		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
+		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+		sock_reset_flag(newsk, SOCK_DONE);
+		skb_queue_head_init(&newsk->sk_error_queue);
+
+		filter = newsk->sk_filter;
+		if (filter != NULL)
+			sk_filter_charge(newsk, filter);
+
+		if (unlikely(xfrm_sk_clone_policy(newsk))) {
+			/* It is still raw copy of parent, so invalidate
+			 * destructor and make plain sk_free() */
+			newsk->sk_destruct = NULL;
+			sk_free(newsk);
+			newsk = NULL;
+			goto out;
+		}
+
+		newsk->sk_err	   = 0;
+		newsk->sk_priority = 0;
+		atomic_set(&newsk->sk_refcnt, 2);
+
+		/*
+		 * Increment the counter in the same struct proto as the master
+		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+		 * is the same as sk->sk_prot->socks, as this field was copied
+		 * with memcpy).
+		 *
+		 * This _changes_ the previous behaviour, where
+		 * tcp_create_openreq_child always was incrementing the
+		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
+		 * to be taken into account in all callers. -acme
+		 */
+		sk_refcnt_debug_inc(newsk);
+		newsk->sk_socket = NULL;
+		newsk->sk_sleep	 = NULL;
+
+		if (newsk->sk_prot->sockets_allocated)
+			atomic_inc(newsk->sk_prot->sockets_allocated);
+	}
+out:
+	return newsk;
+}
+
+EXPORT_SYMBOL_GPL(sk_clone);
+
 void __init sk_init(void)
 {
 	if (num_physpages <= 4096) {
@@ -1352,11 +1441,7 @@ void sk_common_release(struct sock *sk)
 
 	xfrm_sk_free_policy(sk);
 
-#ifdef INET_REFCNT_DEBUG
-	if (atomic_read(&sk->sk_refcnt) != 1)
-		printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n",
-		       sk, atomic_read(&sk->sk_refcnt));
-#endif
+	sk_refcnt_debug_release(sk);
 	sock_put(sk);
 }
 
@@ -1367,7 +1452,8 @@ static LIST_HEAD(proto_list);
 
 int proto_register(struct proto *prot, int alloc_slab)
 {
-	char *request_sock_slab_name;
+	char *request_sock_slab_name = NULL;
+	char *timewait_sock_slab_name;
 	int rc = -ENOBUFS;
 
 	if (alloc_slab) {
@@ -1398,6 +1484,23 @@ int proto_register(struct proto *prot, int alloc_slab)
 				goto out_free_request_sock_slab_name;
 			}
 		}
+
+		if (prot->twsk_obj_size) {
+			static const char mask[] = "tw_sock_%s";
+
+			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+
+			if (timewait_sock_slab_name == NULL)
+				goto out_free_request_sock_slab;
+
+			sprintf(timewait_sock_slab_name, mask, prot->name);
+			prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
+							    prot->twsk_obj_size,
+							    0, SLAB_HWCACHE_ALIGN,
+							    NULL, NULL);
+			if (prot->twsk_slab == NULL)
+				goto out_free_timewait_sock_slab_name;
+		}
 	}
 
 	write_lock(&proto_list_lock);
@@ -1406,6 +1509,13 @@ int proto_register(struct proto *prot, int alloc_slab)
 	rc = 0;
 out:
 	return rc;
+out_free_timewait_sock_slab_name:
+	kfree(timewait_sock_slab_name);
+out_free_request_sock_slab:
+	if (prot->rsk_prot && prot->rsk_prot->slab) {
+		kmem_cache_destroy(prot->rsk_prot->slab);
+		prot->rsk_prot->slab = NULL;
+	}
 out_free_request_sock_slab_name:
 	kfree(request_sock_slab_name);
 out_free_sock_slab:
@@ -1433,6 +1543,14 @@ void proto_unregister(struct proto *prot)
 		prot->rsk_prot->slab = NULL;
 	}
 
+	if (prot->twsk_slab != NULL) {
+		const char *name = kmem_cache_name(prot->twsk_slab);
+
+		kmem_cache_destroy(prot->twsk_slab);
+		kfree(name);
+		prot->twsk_slab = NULL;
+	}
+
 	list_del(&prot->node);
 	write_unlock(&proto_list_lock);
 }
@@ -1601,8 +1719,8 @@ EXPORT_SYMBOL(sock_wfree);
 EXPORT_SYMBOL(sock_wmalloc);
 EXPORT_SYMBOL(sock_i_uid);
 EXPORT_SYMBOL(sock_i_ino);
-#ifdef CONFIG_SYSCTL
 EXPORT_SYMBOL(sysctl_optmem_max);
+#ifdef CONFIG_SYSCTL
 EXPORT_SYMBOL(sysctl_rmem_max);
 EXPORT_SYMBOL(sysctl_wmem_max);
 #endif
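
The new SO_SNDBUFFORCE/SO_RCVBUFFORCE options give a CAP_NET_ADMIN process a way past the wmem_max/rmem_max sysctl clamps: the capability check runs first, then control jumps to the same set_sndbuf/set_rcvbuf path the unprivileged options use. From userspace this looks like the following sketch (the fallback #define matches the value used on most architectures, but that is an assumption; check your asm/socket.h):

    #include <stdio.h>
    #include <sys/socket.h>

    #ifndef SO_SNDBUFFORCE
    #define SO_SNDBUFFORCE 32   /* assumption: value on most architectures */
    #endif

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int val = 8 * 1024 * 1024;  /* may exceed sysctl_wmem_max */

        if (setsockopt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &val, sizeof(val)) < 0)
            perror("SO_SNDBUFFORCE (requires CAP_NET_ADMIN)");
        return 0;
    }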
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8f817ad9f546..2f278c8e4743 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -9,23 +9,18 @@
 #include <linux/sysctl.h>
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/socket.h>
+#include <net/sock.h>
 
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
-extern int netdev_budget;
 extern int weight_p;
-extern int net_msg_cost;
-extern int net_msg_burst;
 
 extern __u32 sysctl_wmem_max;
 extern __u32 sysctl_rmem_max;
-extern __u32 sysctl_wmem_default;
-extern __u32 sysctl_rmem_default;
 
 extern int sysctl_core_destroy_delay;
-extern int sysctl_optmem_max;
-extern int sysctl_somaxconn;
 
 #ifdef CONFIG_NET_DIVERT
 extern char sysctl_divert_version[];
diff --git a/net/core/utils.c b/net/core/utils.c
index e11a8654f363..7b5970fc9e40 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -16,17 +16,19 @@
 #include <linux/module.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/inet.h>
 #include <linux/mm.h>
+#include <linux/net.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/random.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
 
+#include <asm/byteorder.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
-
 /*
   This is a maximally equidistributed combined Tausworthe generator
   based on code from GNU Scientific Library 1.5 (30 Jun 2004)
@@ -153,3 +155,38 @@ int net_ratelimit(void)
 EXPORT_SYMBOL(net_random);
 EXPORT_SYMBOL(net_ratelimit);
 EXPORT_SYMBOL(net_srandom);
+
+/*
+ * Convert an ASCII string to binary IP.
+ * This is outside of net/ipv4/ because various code that uses IP addresses
+ * is otherwise not dependent on the TCP/IP stack.
+ */
+
+__u32 in_aton(const char *str)
+{
+	unsigned long l;
+	unsigned int val;
+	int i;
+
+	l = 0;
+	for (i = 0; i < 4; i++)
+	{
+		l <<= 8;
+		if (*str != '\0')
+		{
+			val = 0;
+			while (*str != '\0' && *str != '.')
+			{
+				val *= 10;
+				val += *str - '0';
+				str++;
+			}
+			l |= val;
+			if (*str != '\0')
+				str++;
+		}
+	}
+	return(htonl(l));
+}
+
+EXPORT_SYMBOL(in_aton);
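
in_aton() keeps the parser deliberately dumb: there is no validation, each dotted field is accumulated as val = val * 10 + digit and OR-ed into the shifted result (so an out-of-range field bleeds into the neighbouring byte), and the value comes back in network byte order via htonl(). Compiling the same algorithm in userspace shows the behaviour:

    #include <stdio.h>
    #include <arpa/inet.h>

    /* same algorithm as in_aton() above, userspace types */
    static unsigned int my_aton(const char *str)
    {
        unsigned long l = 0;
        unsigned int val;
        int i;

        for (i = 0; i < 4; i++) {
            l <<= 8;
            if (*str != '\0') {
                val = 0;
                while (*str != '\0' && *str != '.')
                    val = val * 10 + *str++ - '0';
                l |= val;
                if (*str != '\0')
                    str++;
            }
        }
        return htonl(l);
    }

    int main(void)
    {
        printf("%08x\n", ntohl(my_aton("192.168.0.1"))); /* c0a80001 */
        printf("%08x\n", ntohl(my_aton("1.2.0.300")));   /* 0102012c: 300 bled upward */
        return 0;
    }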
diff --git a/net/core/wireless.c b/net/core/wireless.c
index 3ff5639c0b78..5caae2399f3a 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -571,10 +571,6 @@ static int wireless_seq_show(struct seq_file *seq, void *v)
 		return 0;
 }
 
-extern void *dev_seq_start(struct seq_file *seq, loff_t *pos);
-extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos);
-extern void dev_seq_stop(struct seq_file *seq, void *v);
-
 static struct seq_operations wireless_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
@@ -1144,8 +1140,8 @@ static inline void rtmsg_iwinfo(struct net_device * dev,
 		kfree_skb(skb);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC);
 }
 #endif	/* WE_EVENT_NETLINK */
 
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
new file mode 100644
index 000000000000..187ac182e24b
--- /dev/null
+++ b/net/dccp/Kconfig
@@ -0,0 +1,50 @@
+menu "DCCP Configuration (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+
+config IP_DCCP
+	tristate "The DCCP Protocol (EXPERIMENTAL)"
+	---help---
+	  Datagram Congestion Control Protocol
+
+	  From draft-ietf-dccp-spec-11 <http://www.icir.org/kohler/dcp/draft-ietf-dccp-spec-11.txt>.
+
+	  The Datagram Congestion Control Protocol (DCCP) is a transport
+	  protocol that implements bidirectional, unicast connections of
+	  congestion-controlled, unreliable datagrams. It should be suitable
+	  for use by applications such as streaming media, Internet telephony,
+	  and on-line games.
+
+	  To compile this protocol support as a module, choose M here: the
+	  module will be called dccp.
+
+	  If in doubt, say N.
+
+config INET_DCCP_DIAG
+	depends on IP_DCCP && INET_DIAG
+	def_tristate y if (IP_DCCP = y && INET_DIAG = y)
+	def_tristate m
+
+source "net/dccp/ccids/Kconfig"
+
+menu "DCCP Kernel Hacking"
+	depends on IP_DCCP && DEBUG_KERNEL=y
+
+config IP_DCCP_DEBUG
+	bool "DCCP debug messages"
+	---help---
+	  Only use this if you're hacking DCCP.
+
+	  Just say N.
+
+config IP_DCCP_UNLOAD_HACK
+	depends on IP_DCCP=m && IP_DCCP_CCID3=m
+	bool "DCCP control sock unload hack"
+	---help---
+	  Enable this to be able to unload the dccp module when it
+	  has only one refcount held, the control sock one. Just execute
+	  "rmmod dccp_ccid3 dccp".
+
+	  Just say N.
+endmenu
+
+endmenu
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
new file mode 100644
index 000000000000..fb97bb042455
--- /dev/null
+++ b/net/dccp/Makefile
@@ -0,0 +1,10 @@
+obj-$(CONFIG_IP_DCCP) += dccp.o
+
+dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
+	  timer.o
+
+obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
+
+dccp_diag-y := diag.o
+
+obj-y += ccids/
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
new file mode 100644
index 000000000000..9d8fc0e289ea
--- /dev/null
+++ b/net/dccp/ccid.c
@@ -0,0 +1,139 @@
+/*
+ *  net/dccp/ccid.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  CCID infrastructure
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include "ccid.h"
+
+static struct ccid *ccids[CCID_MAX];
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+static atomic_t ccids_lockct = ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(ccids_lock);
+
+/*
+ * The strategy is: modifications to the ccids vector are short, do not
+ * sleep and are very rare, but read access should be free of any exclusive
+ * locks.
+ */
+static void ccids_write_lock(void)
+{
+	spin_lock(&ccids_lock);
+	while (atomic_read(&ccids_lockct) != 0) {
+		spin_unlock(&ccids_lock);
+		yield();
+		spin_lock(&ccids_lock);
+	}
+}
+
+static inline void ccids_write_unlock(void)
+{
+	spin_unlock(&ccids_lock);
+}
+
+static inline void ccids_read_lock(void)
+{
+	atomic_inc(&ccids_lockct);
+	spin_unlock_wait(&ccids_lock);
+}
+
+static inline void ccids_read_unlock(void)
+{
+	atomic_dec(&ccids_lockct);
+}
+
+#else
+#define ccids_write_lock() do { } while(0)
+#define ccids_write_unlock() do { } while(0)
+#define ccids_read_lock() do { } while(0)
+#define ccids_read_unlock() do { } while(0)
+#endif
+
+int ccid_register(struct ccid *ccid)
+{
+	int err;
+
+	if (ccid->ccid_init == NULL)
+		return -1;
+
+	ccids_write_lock();
+	err = -EEXIST;
+	if (ccids[ccid->ccid_id] == NULL) {
+		ccids[ccid->ccid_id] = ccid;
+		err = 0;
+	}
+	ccids_write_unlock();
+	if (err == 0)
+		pr_info("CCID: Registered CCID %d (%s)\n",
+			ccid->ccid_id, ccid->ccid_name);
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(ccid_register);
+
+int ccid_unregister(struct ccid *ccid)
+{
+	ccids_write_lock();
+	ccids[ccid->ccid_id] = NULL;
+	ccids_write_unlock();
+	pr_info("CCID: Unregistered CCID %d (%s)\n",
+		ccid->ccid_id, ccid->ccid_name);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(ccid_unregister);
+
+struct ccid *ccid_init(unsigned char id, struct sock *sk)
+{
+	struct ccid *ccid;
+
+#ifdef CONFIG_KMOD
+	if (ccids[id] == NULL)
+		request_module("net-dccp-ccid-%d", id);
+#endif
+	ccids_read_lock();
+
+	ccid = ccids[id];
+	if (ccid == NULL)
+		goto out;
+
+	if (!try_module_get(ccid->ccid_owner))
+		goto out_err;
+
+	if (ccid->ccid_init(sk) != 0)
+		goto out_module_put;
+out:
+	ccids_read_unlock();
+	return ccid;
+out_module_put:
+	module_put(ccid->ccid_owner);
+out_err:
+	ccid = NULL;
+	goto out;
+}
+
+EXPORT_SYMBOL_GPL(ccid_init);
+
+void ccid_exit(struct ccid *ccid, struct sock *sk)
+{
+	if (ccid == NULL)
+		return;
+
+	ccids_read_lock();
+
+	if (ccids[ccid->ccid_id] != NULL) {
+		if (ccid->ccid_exit != NULL)
+			ccid->ccid_exit(sk);
+		module_put(ccid->ccid_owner);
+	}
+
+	ccids_read_unlock();
+}
+
+EXPORT_SYMBOL_GPL(ccid_exit);
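
The locking here is a hand-rolled "rare writer" scheme: readers only bump ccids_lockct and wait for any in-flight writer (spin_unlock_wait), while the writer takes ccids_lock and spins until the reader count drains to zero. A pthread/C11-atomics model of the same idea, an illustration of the pattern rather than kernel code (spin_unlock_wait() is approximated by a lock/unlock pair):

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>

    static atomic_int lockct = 0;                               /* ccids_lockct */
    static pthread_mutex_t biglock = PTHREAD_MUTEX_INITIALIZER; /* ccids_lock   */

    static void vec_write_lock(void)
    {
        pthread_mutex_lock(&biglock);
        while (atomic_load(&lockct) != 0) {   /* wait out active readers */
            pthread_mutex_unlock(&biglock);
            sched_yield();                    /* the kernel code calls yield() */
            pthread_mutex_lock(&biglock);
        }
    }

    static void vec_write_unlock(void)
    {
        pthread_mutex_unlock(&biglock);
    }

    static void vec_read_lock(void)
    {
        atomic_fetch_add(&lockct, 1);
        /* approximation of spin_unlock_wait(): let a lock holder finish */
        pthread_mutex_lock(&biglock);
        pthread_mutex_unlock(&biglock);
    }

    static void vec_read_unlock(void)
    {
        atomic_fetch_sub(&lockct, 1);
    }

    int main(void)
    {
        vec_read_lock();  vec_read_unlock();
        vec_write_lock(); vec_write_unlock();
        return 0;
    }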
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
new file mode 100644
index 000000000000..962f1e9e2f7e
--- /dev/null
+++ b/net/dccp/ccid.h
@@ -0,0 +1,180 @@
+#ifndef _CCID_H
+#define _CCID_H
+/*
+ *  net/dccp/ccid.h
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  CCID infrastructure
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <net/sock.h>
+#include <linux/dccp.h>
+#include <linux/list.h>
+#include <linux/module.h>
+
+#define CCID_MAX 255
+
+struct ccid {
+	unsigned char	ccid_id;
+	const char	*ccid_name;
+	struct module	*ccid_owner;
+	int		(*ccid_init)(struct sock *sk);
+	void		(*ccid_exit)(struct sock *sk);
+	int		(*ccid_hc_rx_init)(struct sock *sk);
+	int		(*ccid_hc_tx_init)(struct sock *sk);
+	void		(*ccid_hc_rx_exit)(struct sock *sk);
+	void		(*ccid_hc_tx_exit)(struct sock *sk);
+	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
+						  struct sk_buff *skb);
+	int		(*ccid_hc_rx_parse_options)(struct sock *sk,
+						    unsigned char option,
+						    unsigned char len, u16 idx,
+						    unsigned char* value);
+	void		(*ccid_hc_rx_insert_options)(struct sock *sk,
+						     struct sk_buff *skb);
+	void		(*ccid_hc_tx_insert_options)(struct sock *sk,
+						     struct sk_buff *skb);
+	void		(*ccid_hc_tx_packet_recv)(struct sock *sk,
+						  struct sk_buff *skb);
+	int		(*ccid_hc_tx_parse_options)(struct sock *sk,
+						    unsigned char option,
+						    unsigned char len, u16 idx,
+						    unsigned char* value);
+	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
+						  struct sk_buff *skb, int len);
+	void		(*ccid_hc_tx_packet_sent)(struct sock *sk, int more,
+						  int len);
+	void		(*ccid_hc_rx_get_info)(struct sock *sk,
+					       struct tcp_info *info);
+	void		(*ccid_hc_tx_get_info)(struct sock *sk,
+					       struct tcp_info *info);
+};
+
+extern int ccid_register(struct ccid *ccid);
+extern int ccid_unregister(struct ccid *ccid);
+
+extern struct ccid *ccid_init(unsigned char id, struct sock *sk);
+extern void ccid_exit(struct ccid *ccid, struct sock *sk);
+
+static inline void __ccid_get(struct ccid *ccid)
+{
+	__module_get(ccid->ccid_owner);
+}
+
+static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
+					 struct sk_buff *skb, int len)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_tx_send_packet != NULL)
+		rc = ccid->ccid_hc_tx_send_packet(sk, skb, len);
+	return rc;
+}
+
+static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
+					  int more, int len)
+{
+	if (ccid->ccid_hc_tx_packet_sent != NULL)
+		ccid->ccid_hc_tx_packet_sent(sk, more, len);
+}
+
+static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_rx_init != NULL)
+		rc = ccid->ccid_hc_rx_init(sk);
+	return rc;
+}
+
+static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_tx_init != NULL)
+		rc = ccid->ccid_hc_tx_init(sk);
+	return rc;
+}
+
+static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk)
+{
+	if (ccid->ccid_hc_rx_exit != NULL &&
+	    dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL)
+		ccid->ccid_hc_rx_exit(sk);
+}
+
+static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk)
+{
+	if (ccid->ccid_hc_tx_exit != NULL &&
+	    dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL)
+		ccid->ccid_hc_tx_exit(sk);
+}
+
+static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
+					  struct sk_buff *skb)
+{
+	if (ccid->ccid_hc_rx_packet_recv != NULL)
+		ccid->ccid_hc_rx_packet_recv(sk, skb);
+}
+
+static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
+					  struct sk_buff *skb)
+{
+	if (ccid->ccid_hc_tx_packet_recv != NULL)
+		ccid->ccid_hc_tx_packet_recv(sk, skb);
+}
+
+static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
+					   unsigned char option,
+					   unsigned char len, u16 idx,
+					   unsigned char* value)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_tx_parse_options != NULL)
+		rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx,
+						    value);
+	return rc;
+}
+
+static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
+					   unsigned char option,
+					   unsigned char len, u16 idx,
+					   unsigned char* value)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_rx_parse_options != NULL)
+		rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value);
+	return rc;
+}
+
+static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk,
+					     struct sk_buff *skb)
+{
+	if (ccid->ccid_hc_tx_insert_options != NULL)
+		ccid->ccid_hc_tx_insert_options(sk, skb);
+}
+
+static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+					     struct sk_buff *skb)
+{
+	if (ccid->ccid_hc_rx_insert_options != NULL)
+		ccid->ccid_hc_rx_insert_options(sk, skb);
+}
+
+static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
+				       struct tcp_info *info)
+{
+	if (ccid->ccid_hc_rx_get_info != NULL)
+		ccid->ccid_hc_rx_get_info(sk, info);
+}
+
+static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
+				       struct tcp_info *info)
+{
+	if (ccid->ccid_hc_tx_get_info != NULL)
+		ccid->ccid_hc_tx_get_info(sk, info);
+}
+#endif /* _CCID_H */
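
Everything above dispatches through the struct ccid ops table, and every hook is optional except ccid_init (ccid_register() rejects a NULL ccid_init). What a congestion-control module plugs in looks roughly like the following sketch; the "example" CCID, its ID and its callbacks are hypothetical, only the field names and the register/unregister entry points come from this header:

    #include "ccid.h"   /* this header; it pulls in linux/module.h */

    static int example_ccid_init(struct sock *sk)
    {
        return 0;   /* allocate per-half-connection state here */
    }

    static void example_ccid_exit(struct sock *sk)
    {
    }

    static struct ccid example_ccid = {
        .ccid_id    = 254,               /* hypothetical, not an assigned ID */
        .ccid_name  = "example",
        .ccid_owner = THIS_MODULE,
        .ccid_init  = example_ccid_init, /* mandatory */
        .ccid_exit  = example_ccid_exit, /* every other hook is optional */
    };

    static int __init example_module_init(void)
    {
        return ccid_register(&example_ccid);
    }

    static void __exit example_module_exit(void)
    {
        ccid_unregister(&example_ccid);
    }

    module_init(example_module_init);
    module_exit(example_module_exit);
    MODULE_LICENSE("GPL");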
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
new file mode 100644
index 000000000000..7684d83946a4
--- /dev/null
+++ b/net/dccp/ccids/Kconfig
@@ -0,0 +1,29 @@
+menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
+	depends on IP_DCCP && EXPERIMENTAL
+
+config IP_DCCP_CCID3
+	tristate "CCID3 (TFRC) (EXPERIMENTAL)"
+	depends on IP_DCCP
+	---help---
+	  CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
+	  rate-controlled congestion control mechanism.  TFRC is designed to
+	  be reasonably fair when competing for bandwidth with TCP-like flows,
+	  where a flow is "reasonably fair" if its sending rate is generally
+	  within a factor of two of the sending rate of a TCP flow under the
+	  same conditions.  However, TFRC has a much lower variation of
+	  throughput over time compared with TCP, which makes CCID 3 more
+	  suitable than CCID 2 for applications such as streaming media where
+	  a relatively smooth sending rate is of importance.
+
+	  CCID 3 is further described in [CCID 3 PROFILE].  The TFRC
+	  congestion control algorithms were initially described in RFC 3448.
+
+	  This text was extracted from draft-ietf-dccp-spec-11.txt.
+
+	  If in doubt, say M.
+
+config IP_DCCP_TFRC_LIB
+	depends on IP_DCCP_CCID3
+	def_tristate IP_DCCP_CCID3
+
+endmenu
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile
new file mode 100644
index 000000000000..956f79f50743
--- /dev/null
+++ b/net/dccp/ccids/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
+
+dccp_ccid3-y := ccid3.o
+
+obj-y += lib/
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
new file mode 100644
index 000000000000..7bf3b3a91e97
--- /dev/null
+++ b/net/dccp/ccids/ccid3.c
@@ -0,0 +1,1221 @@
+/*
+ *  net/dccp/ccids/ccid3.c
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ *  An implementation of the DCCP protocol
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include "../ccid.h"
+#include "../dccp.h"
+#include "lib/packet_history.h"
+#include "lib/loss_interval.h"
+#include "lib/tfrc.h"
+#include "ccid3.h"
+
+/*
+ * The reason for the maths with 10 here is to avoid 32 bit overflow when a
+ * is big.
+ */
+static inline u32 usecs_div(const u32 a, const u32 b)
+{
+	const u32 tmp = a * (USEC_PER_SEC / 10);
+	return b > 20 ? tmp / (b / 10) : tmp;
+}
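
The scaling trick in usecs_div() above is worth spelling out. The exact quantity is a * 1,000,000 / b, but a * 1,000,000 wraps a u32 once a exceeds roughly 4294; multiplying by 100,000 instead and dividing b by 10 computes the same ratio while tolerating a up to about 42,949 (and the b > 20 guard keeps b / 10 away from zero). A quick userspace check:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t a = 40000, b = 250000; /* 40 kB every 250 ms -> 160000 B/s */

        uint32_t naive  = (uint32_t)(a * 1000000u) / b; /* wraps: a*1e6 = 4e10 */
        uint32_t scaled = (a * 100000u) / (b / 10);     /* the usecs_div() trick */

        printf("naive=%u scaled=%u exact=%llu\n",
               naive, scaled, (unsigned long long)a * 1000000ull / b);
        return 0;
    }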
+
+static int ccid3_debug;
+
+#ifdef CCID3_DEBUG
+#define ccid3_pr_debug(format, a...) \
+	do { if (ccid3_debug) \
+		printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \
+	} while (0)
+#else
+#define ccid3_pr_debug(format, a...)
+#endif
+
+static struct dccp_tx_hist *ccid3_tx_hist;
+static struct dccp_rx_hist *ccid3_rx_hist;
+static struct dccp_li_hist *ccid3_li_hist;
+
+static int ccid3_init(struct sock *sk)
+{
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+	return 0;
+}
+
+static void ccid3_exit(struct sock *sk)
+{
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+}
+
+/* TFRC sender states */
+enum ccid3_hc_tx_states {
+	TFRC_SSTATE_NO_SENT = 1,
+	TFRC_SSTATE_NO_FBACK,
+	TFRC_SSTATE_FBACK,
+	TFRC_SSTATE_TERM,
+};
+
+#ifdef CCID3_DEBUG
+static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
+{
+	static char *ccid3_state_names[] = {
+	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
+	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
+	[TFRC_SSTATE_FBACK]    = "FBACK",
+	[TFRC_SSTATE_TERM]     = "TERM",
+	};
+
+	return ccid3_state_names[state];
+}
+#endif
+
+static inline void ccid3_hc_tx_set_state(struct sock *sk,
+					 enum ccid3_hc_tx_states state)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
+		       ccid3_tx_state_name(state));
+	WARN_ON(state == oldstate);
+	hctx->ccid3hctx_state = state;
+}
+
+/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */
+static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx)
+{
+	/*
+	 * If no feedback has been received yet, the spec says t_ipi is
+	 * 1 second (set elsewhere); it then doubles after every expiry of
+	 * the no feedback timer (handled in a separate function).
+	 */
+	if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
+		hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s,
+						  hctx->ccid3hctx_x);
+}
+
+/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
+static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx)
+{
+	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
+					   TFRC_OPSYS_HALF_TIME_GRAN);
+}
+
+/*
+ * Update X by
+ *    If (p > 0)
+ *       x_calc = calcX(s, R, p);
+ *       X = max(min(X_calc, 2 * X_recv), s / t_mbi);
+ *    Else
+ *       If (now - tld >= R)
+ *          X = max(min(2 * X, 2 * X_recv), s / R);
+ *          tld = now;
+ */
+static void ccid3_hc_tx_update_x(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+	/* To avoid large error in calcX */
+	if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
+		hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
+						     hctx->ccid3hctx_rtt,
+						     hctx->ccid3hctx_p);
+		hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc,
+							  2 * hctx->ccid3hctx_x_recv),
+					       (hctx->ccid3hctx_s /
+						TFRC_MAX_BACK_OFF_TIME));
+	} else {
+		struct timeval now;
+
+		do_gettimeofday(&now);
+		if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >=
+		    hctx->ccid3hctx_rtt) {
+			hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv,
+								  hctx->ccid3hctx_x) * 2,
+						       usecs_div(hctx->ccid3hctx_s,
+								 hctx->ccid3hctx_rtt));
+			hctx->ccid3hctx_t_ld = now;
+		}
+	}
+}
+
+static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct dccp_sock *dp = dccp_sk(sk);
+	unsigned long next_tmout = 0;
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */
+		/* XXX: set some sensible MIB */
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+			       jiffies + HZ / 5);
+		goto out;
+	}
+
+	ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk,
+		       ccid3_tx_state_name(hctx->ccid3hctx_state));
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_TERM:
+		goto out;
+	case TFRC_SSTATE_NO_FBACK:
+		/* Halve send rate */
+		hctx->ccid3hctx_x /= 2;
+		if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s /
+					 TFRC_MAX_BACK_OFF_TIME))
+			hctx->ccid3hctx_x = (hctx->ccid3hctx_s /
+					     TFRC_MAX_BACK_OFF_TIME);
+
+		ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d "
+			       "bytes/s\n",
+			       dccp_role(sk), sk,
+			       ccid3_tx_state_name(hctx->ccid3hctx_state),
+			       hctx->ccid3hctx_x);
+		next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s,
+						      hctx->ccid3hctx_x),
+					TFRC_INITIAL_TIMEOUT);
+		/*
+		 * FIXME - not sure the above calculation is correct. See
+		 * section 5 of the CCID 3 draft (revision 11): we should
+		 * adjust tx_t_ipi and double that to really achieve it.
+		 */
+		break;
+	case TFRC_SSTATE_FBACK:
+		/*
+		 * Check if IDLE since last timeout and recv rate is less than
+		 * 4 packets per RTT
+		 */
+		if (!hctx->ccid3hctx_idle ||
+		    (hctx->ccid3hctx_x_recv >=
+		     4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) {
+			ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n",
+				       dccp_role(sk), sk,
+				       ccid3_tx_state_name(hctx->ccid3hctx_state));
+			/* Halve sending rate */
+
+			/*  If (X_calc > 2 * X_recv)
+			 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
+			 *  Else
+			 *    X_recv = X_calc / 4;
+			 */
+			BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P &&
+			       hctx->ccid3hctx_x_calc == 0);
+
+			/* check also if p is zero -> x_calc is infinity? */
+			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
+			    hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
+				hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
+								    hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME));
+			else
+				hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4;
+
+			/* Update sending rate */
+			ccid3_hc_tx_update_x(sk);
+		}
+		/*
+		 * Schedule no feedback timer to expire in
+		 * max(4 * R, 2 * s / X)
+		 */
+		next_tmout = max_t(u32, hctx->ccid3hctx_t_rto,
+					2 * usecs_div(hctx->ccid3hctx_s,
+						      hctx->ccid3hctx_x));
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		goto out;
+	}
+
+	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+		       jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
+	hctx->ccid3hctx_idle = 1;
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+static int ccid3_hc_tx_send_packet(struct sock *sk,
+				   struct sk_buff *skb, int len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	struct dccp_tx_hist_entry *new_packet;
+	struct timeval now;
+	long delay;
+	int rc = -ENOTCONN;
+
+	/* Check if pure ACK or Terminating */
+
+	/*
+	 * XXX: We only call this function for DATA and DATAACK; these
+	 * packets can have zero length, but why the comment about
+	 * "pure ACK"?
+	 */
+	if (hctx == NULL || len == 0 ||
+	    hctx->ccid3hctx_state == TFRC_SSTATE_TERM)
+		goto out;
+
+	/* See if last packet allocated was not sent */
+	new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
+	if (new_packet == NULL || new_packet->dccphtx_sent) {
+		new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist,
+						    SLAB_ATOMIC);
+
+		rc = -ENOBUFS;
+		if (new_packet == NULL) {
+			ccid3_pr_debug("%s, sk=%p, not enough mem to add "
+				       "to history, send refused\n",
+				       dccp_role(sk), sk);
+			goto out;
+		}
+
+		dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet);
+	}
+
+	do_gettimeofday(&now);
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n",
+			       dccp_role(sk), sk, dp->dccps_gss);
+
+		hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer;
+		hctx->ccid3hctx_no_feedback_timer.data     = (unsigned long)sk;
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+			       jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT));
+		hctx->ccid3hctx_last_win_count	 = 0;
+		hctx->ccid3hctx_t_last_win_count = now;
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+		hctx->ccid3hctx_t_ipi = TFRC_INITIAL_TIMEOUT;
+
+		/* Set nominal send time for initial packet */
+		hctx->ccid3hctx_t_nom = now;
+		timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
+		ccid3_calc_new_delta(hctx);
+		rc = 0;
+		break;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) -
+			 hctx->ccid3hctx_delta);
+		ccid3_pr_debug("send_packet delay=%ld\n", delay);
+		delay /= -1000;
+		/* divide by -1000 is to convert to ms and get sign right */
+		rc = delay > 0 ? delay : 0;
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		rc = -EINVAL;
+		break;
+	}
+
+	/* Can we send? if so add options and add to packet history */
+	if (rc == 0)
+		new_packet->dccphtx_ccval =
+			DCCP_SKB_CB(skb)->dccpd_ccval =
+				hctx->ccid3hctx_last_win_count;
+out:
+	return rc;
+}
+
+static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	struct timeval now;
+
+	BUG_ON(hctx == NULL);
+
+	if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) {
+		ccid3_pr_debug("%s, sk=%p, while state is TFRC_SSTATE_TERM!\n",
+			       dccp_role(sk), sk);
+		return;
+	}
+
+	do_gettimeofday(&now);
+
+	/* check if we have sent a data packet */
+	if (len > 0) {
+		unsigned long quarter_rtt;
+		struct dccp_tx_hist_entry *packet;
+
+		packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
+		if (packet == NULL) {
+			printk(KERN_CRIT "%s: packet doesn't exist in "
+					 "history!\n", __FUNCTION__);
+			return;
+		}
+		if (packet->dccphtx_sent) {
+			printk(KERN_CRIT "%s: no unsent packet in history!\n",
+			       __FUNCTION__);
+			return;
+		}
+		packet->dccphtx_tstamp = now;
+		packet->dccphtx_seqno  = dp->dccps_gss;
+		/*
+		 * Check if win_count has changed.
+		 * Algorithm in "8.1. Window Counter Value" in
+		 * draft-ietf-dccp-ccid3-11.txt
+		 */
+		quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count);
+		if (likely(hctx->ccid3hctx_rtt > 8))
+			quarter_rtt /= hctx->ccid3hctx_rtt / 4;
+
+		if (quarter_rtt > 0) {
+			hctx->ccid3hctx_t_last_win_count = now;
+			hctx->ccid3hctx_last_win_count	 = (hctx->ccid3hctx_last_win_count +
+							    min_t(unsigned long, quarter_rtt, 5)) % 16;
+			ccid3_pr_debug("%s, sk=%p, window changed from "
+				       "%u to %u!\n",
+				       dccp_role(sk), sk,
+				       packet->dccphtx_ccval,
+				       hctx->ccid3hctx_last_win_count);
+		}
+
+		hctx->ccid3hctx_idle = 0;
+		packet->dccphtx_rtt  = hctx->ccid3hctx_rtt;
+		packet->dccphtx_sent = 1;
+	} else
+		ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n",
+			       dccp_role(sk), sk, dp->dccps_gss);
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		/* if first wasn't pure ack */
+		if (len != 0)
+			printk(KERN_CRIT "%s: %s, First packet sent is noted "
+					 "as a data packet\n",
+			       __FUNCTION__, dccp_role(sk));
+		return;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		if (len > 0) {
+			hctx->ccid3hctx_t_nom = now;
+			ccid3_calc_new_t_ipi(hctx);
+			ccid3_calc_new_delta(hctx);
+			timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+					  hctx->ccid3hctx_t_ipi);
+		}
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		break;
+	}
+}
+
+static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	struct ccid3_options_received *opt_recv;
+	struct dccp_tx_hist_entry *packet;
+	unsigned long next_tmout;
+	u32 t_elapsed;
+	u32 pinv;
+	u32 x_recv;
+	u32 r_sample;
+
+	if (hctx == NULL)
+		return;
+
+	if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) {
+		ccid3_pr_debug("%s, sk=%p, received a packet when "
+			       "terminating!\n", dccp_role(sk), sk);
+		return;
+	}
+
+	/* we are only interested in ACKs */
+	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
+	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
+		return;
+
+	opt_recv = &hctx->ccid3hctx_options_received;
+
+	t_elapsed = dp->dccps_options_received.dccpor_elapsed_time;
+	x_recv = opt_recv->ccid3or_receive_rate;
+	pinv = opt_recv->ccid3or_loss_event_rate;
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		/* FIXME: what to do here? */
+		return;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		/* Calculate new round trip sample by
+		 * R_sample = (now - t_recvdata) - t_delay */
+		/* get t_recvdata from history */
+		packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
+						 DCCP_SKB_CB(skb)->dccpd_ack_seq);
+		if (packet == NULL) {
+			ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) doesn't "
+				       "exist in history!\n",
+				       dccp_role(sk), sk,
+				       DCCP_SKB_CB(skb)->dccpd_ack_seq,
+				       dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
+			return;
+		}
+
+		/* Update RTT */
+		r_sample = timeval_now_delta(&packet->dccphtx_tstamp);
+		/* FIXME: */
+		// r_sample -= usecs_to_jiffies(t_elapsed * 10);
+
+		/* Update RTT estimate by
+		 * If (No feedback recv)
+		 *    R = R_sample;
+		 * Else
+		 *    R = q * R + (1 - q) * R_sample;
+		 *
+		 * q is a constant, RFC 3448 recommends 0.9
+		 */
+		if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
+			ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
+			hctx->ccid3hctx_rtt = r_sample;
+		} else
+			hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
+					      r_sample / 10;
+
+		ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, "
+			       "r_sample=%us\n", dccp_role(sk), sk,
+			       hctx->ccid3hctx_rtt, r_sample);
+
+		/* Update timeout interval */
+		hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
+					      USEC_PER_SEC);
+
+		/* Update receive rate */
+		hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */
+
+		/* Update loss event rate */
+		if (pinv == ~0 || pinv == 0)
+			hctx->ccid3hctx_p = 0;
+		else {
+			hctx->ccid3hctx_p = 1000000 / pinv;
+
+			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
+				hctx->ccid3hctx_p = TFRC_SMALLEST_P;
+				ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
+					       dccp_role(sk), sk);
+			}
+		}
+
+		/* unschedule no feedback timer */
+		sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+
+		/* Update sending rate */
+		ccid3_hc_tx_update_x(sk);
+
+		/* Update next send time */
+		timeval_sub_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
+		ccid3_calc_new_t_ipi(hctx);
+		timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
+		ccid3_calc_new_delta(hctx);
+
+		/* remove all packets older than the one acked from history */
+		dccp_tx_hist_purge_older(ccid3_tx_hist,
+					 &hctx->ccid3hctx_hist, packet);
+		/*
+		 * As we have calculated new ipi, delta and t_nom, it is
+		 * possible that we now can send a packet, so wake up
+		 * dccp_wait_for_ccids.
+		 */
+		sk->sk_write_space(sk);
+
+		/*
+		 * Schedule no feedback timer to expire in
+		 * max(4 * R, 2 * s / X)
+		 */
+		next_tmout = max(hctx->ccid3hctx_t_rto,
+				 2 * usecs_div(hctx->ccid3hctx_s,
+					       hctx->ccid3hctx_x));
+
+		ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
+			       "expire in %lu jiffies (%luus)\n",
+			       dccp_role(sk), sk,
+			       usecs_to_jiffies(next_tmout), next_tmout);
+
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+			       jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
+
+		/* set idle flag */
+		hctx->ccid3hctx_idle = 1;
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		break;
+	}
+}
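
The RTT update in the feedback path above is the RFC 3448 exponentially weighted moving average with q = 0.9, realized in pure integer arithmetic as rtt = (rtt * 9) / 10 + r_sample / 10. A standalone check of how the estimate tracks incoming samples:

    #include <stdio.h>

    int main(void)
    {
        unsigned int rtt = 100000;  /* current estimate, microseconds */
        unsigned int samples[] = { 120000, 80000, 100000 };
        int i;

        for (i = 0; i < 3; i++) {
            /* R = q * R + (1 - q) * R_sample, q = 0.9 */
            rtt = (rtt * 9) / 10 + samples[i] / 10;
            printf("R_sample=%u -> R=%u\n", samples[i], rtt);
        }
        return 0;
    }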
591
592static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb)
593{
594 const struct dccp_sock *dp = dccp_sk(sk);
595 struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
596
597 if (hctx == NULL || !(sk->sk_state == DCCP_OPEN ||
598 sk->sk_state == DCCP_PARTOPEN))
599 return;
600
601 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
602}
603
604static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
605 unsigned char len, u16 idx,
606 unsigned char *value)
607{
608 int rc = 0;
609 struct dccp_sock *dp = dccp_sk(sk);
610 struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
611 struct ccid3_options_received *opt_recv;
612
613 if (hctx == NULL)
614 return 0;
615
616 opt_recv = &hctx->ccid3hctx_options_received;
617
618 if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
619 opt_recv->ccid3or_seqno = dp->dccps_gsr;
620 opt_recv->ccid3or_loss_event_rate = ~0;
621 opt_recv->ccid3or_loss_intervals_idx = 0;
622 opt_recv->ccid3or_loss_intervals_len = 0;
623 opt_recv->ccid3or_receive_rate = 0;
624 }
625
626 switch (option) {
627 case TFRC_OPT_LOSS_EVENT_RATE:
628 if (len != 4) {
629 ccid3_pr_debug("%s, sk=%p, invalid len for "
630 "TFRC_OPT_LOSS_EVENT_RATE\n",
631 dccp_role(sk), sk);
632 rc = -EINVAL;
633 } else {
634 opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value);
635 ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n",
636 dccp_role(sk), sk,
637 opt_recv->ccid3or_loss_event_rate);
638 }
639 break;
640 case TFRC_OPT_LOSS_INTERVALS:
641 opt_recv->ccid3or_loss_intervals_idx = idx;
642 opt_recv->ccid3or_loss_intervals_len = len;
643 ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n",
644 dccp_role(sk), sk,
645 opt_recv->ccid3or_loss_intervals_idx,
646 opt_recv->ccid3or_loss_intervals_len);
647 break;
648 case TFRC_OPT_RECEIVE_RATE:
649 if (len != 4) {
650 ccid3_pr_debug("%s, sk=%p, invalid len for "
651 "TFRC_OPT_RECEIVE_RATE\n",
652 dccp_role(sk), sk);
653 rc = -EINVAL;
654 } else {
655 opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value);
656 ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n",
657 dccp_role(sk), sk,
658 opt_recv->ccid3or_receive_rate);
659 }
660 break;
661 }
662
663 return rc;
664}
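
For reference, the two fixed-length options above reduce to reading a 4-byte big-endian value. A user-space sketch with a hypothetical payload; memcpy() is used instead of the kernel's pointer cast to avoid an unaligned dereference on strict architectures.

        #include <arpa/inet.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        int main(void)
        {
                /* hypothetical 4-byte option payload: 0x000186a0 == 100000 */
                unsigned char value[4] = { 0x00, 0x01, 0x86, 0xa0 };
                uint32_t rate;

                memcpy(&rate, value, sizeof(rate)); /* no unaligned deref */
                rate = ntohl(rate);
                printf("receive rate = %u bytes/s\n", rate);
                return 0;
        }
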
665
666static int ccid3_hc_tx_init(struct sock *sk)
667{
668 struct dccp_sock *dp = dccp_sk(sk);
669 struct ccid3_hc_tx_sock *hctx;
670
671 ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
672
673 hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx),
674 gfp_any());
675 if (hctx == NULL)
676 return -ENOMEM;
677
678 memset(hctx, 0, sizeof(*hctx));
679
680 if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
681 dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
682 hctx->ccid3hctx_s = dp->dccps_packet_size;
683 else
684 hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE;
685
686 /* Set transmission rate to 1 packet per second */
687 hctx->ccid3hctx_x = hctx->ccid3hctx_s;
688 hctx->ccid3hctx_t_rto = USEC_PER_SEC;
689 hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
690 INIT_LIST_HEAD(&hctx->ccid3hctx_hist);
691 init_timer(&hctx->ccid3hctx_no_feedback_timer);
692
693 return 0;
694}
695
696static void ccid3_hc_tx_exit(struct sock *sk)
697{
698 struct dccp_sock *dp = dccp_sk(sk);
699 struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
700
701 ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
702 BUG_ON(hctx == NULL);
703
704 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
705 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
706
707 /* Empty packet history */
708 dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);
709
710 kfree(dp->dccps_hc_tx_ccid_private);
711 dp->dccps_hc_tx_ccid_private = NULL;
712}
713
714/*
715 * RX Half Connection methods
716 */
717
718/* TFRC receiver states */
719enum ccid3_hc_rx_states {
720 TFRC_RSTATE_NO_DATA = 1,
721 TFRC_RSTATE_DATA,
722 TFRC_RSTATE_TERM = 127,
723};
724
725#ifdef CCID3_DEBUG
726static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
727{
728 static char *ccid3_rx_state_names[] = {
729 [TFRC_RSTATE_NO_DATA] = "NO_DATA",
730 [TFRC_RSTATE_DATA] = "DATA",
731 [TFRC_RSTATE_TERM] = "TERM",
732 };
733
734 return ccid3_rx_state_names[state];
735}
736#endif
737
738static inline void ccid3_hc_rx_set_state(struct sock *sk,
739 enum ccid3_hc_rx_states state)
740{
741 struct dccp_sock *dp = dccp_sk(sk);
742 struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
743 enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
744
745 ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
746 dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
747 ccid3_rx_state_name(state));
748 WARN_ON(state == oldstate);
749 hcrx->ccid3hcrx_state = state;
750}
751
752static void ccid3_hc_rx_send_feedback(struct sock *sk)
753{
754 struct dccp_sock *dp = dccp_sk(sk);
755 struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
756 struct dccp_rx_hist_entry *packet;
757 struct timeval now;
758
759 ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
760
761 do_gettimeofday(&now);
762
763 switch (hcrx->ccid3hcrx_state) {
764 case TFRC_RSTATE_NO_DATA:
765 hcrx->ccid3hcrx_x_recv = 0;
766 break;
767 case TFRC_RSTATE_DATA: {
768 const u32 delta = timeval_delta(&now,
769 &hcrx->ccid3hcrx_tstamp_last_feedback);
770
771 hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv *
772 USEC_PER_SEC);
773 if (likely(delta > 1))
774 hcrx->ccid3hcrx_x_recv /= delta;
775 }
776 break;
777 default:
778 printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
779 __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
780 dump_stack();
781 return;
782 }
783
784 packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist);
785 if (packet == NULL) {
786 printk(KERN_CRIT "%s: %s, sk=%p, no data packet in history!\n",
787 __FUNCTION__, dccp_role(sk), sk);
788 dump_stack();
789 return;
790 }
791
792 hcrx->ccid3hcrx_tstamp_last_feedback = now;
793 hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval;
794 hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno;
795 hcrx->ccid3hcrx_bytes_recv = 0;
796
797 /* Convert to multiples of 10us */
798 hcrx->ccid3hcrx_elapsed_time =
799 timeval_delta(&now, &packet->dccphrx_tstamp) / 10;
800 if (hcrx->ccid3hcrx_p == 0)
801 hcrx->ccid3hcrx_pinv = ~0;
802 else
803 hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
804 dccp_send_ack(sk);
805}
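
A stand-alone sketch of the TFRC_RSTATE_DATA branch above, with illustrative numbers: X_recv is the bytes received since the last feedback divided by the elapsed time. The multiply is 32-bit here, as in the kernel code, so byte counts above roughly 4294 would overflow this sketch.

        #include <stdio.h>

        #define USEC_PER_SEC 1000000u

        int main(void)
        {
                unsigned int bytes_recv = 3000;  /* since last feedback */
                unsigned int delta = 250000;     /* usecs since last feedback */
                unsigned int x_recv = bytes_recv * USEC_PER_SEC;

                if (delta > 1)                   /* as the DATA branch does */
                        x_recv /= delta;
                printf("X_recv = %u bytes/s\n", x_recv);  /* 12000 */
                return 0;
        }
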
806
807static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
808{
809 const struct dccp_sock *dp = dccp_sk(sk);
810 u32 x_recv, pinv;
811 struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
812
813 if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN ||
814 sk->sk_state == DCCP_PARTOPEN))
815 return;
816
817 DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter;
818
819 if (dccp_packet_without_ack(skb))
820 return;
821
822 if (hcrx->ccid3hcrx_elapsed_time != 0)
823 dccp_insert_option_elapsed_time(sk, skb,
824 hcrx->ccid3hcrx_elapsed_time);
825 dccp_insert_option_timestamp(sk, skb);
826 x_recv = htonl(hcrx->ccid3hcrx_x_recv);
827 pinv = htonl(hcrx->ccid3hcrx_pinv);
828 dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
829 &pinv, sizeof(pinv));
830 dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
831 &x_recv, sizeof(x_recv));
832}
833
834/* calculate first loss interval
835 *
836 * returns estimated loss interval in packets (i.e. 1000000 / p) */
837
838static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
839{
840 struct dccp_sock *dp = dccp_sk(sk);
841 struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
842 struct dccp_rx_hist_entry *entry, *next, *tail = NULL;
843 u32 rtt, delta, x_recv, fval, p, tmp2;
844 struct timeval tstamp = { 0, };
845 int interval = 0;
846 int win_count = 0;
847 int step = 0;
848 u64 tmp1;
849
850 list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist,
851 dccphrx_node) {
852 if (dccp_rx_hist_entry_data_packet(entry)) {
853 tail = entry;
854
855 switch (step) {
856 case 0:
857 tstamp = entry->dccphrx_tstamp;
858 win_count = entry->dccphrx_ccval;
859 step = 1;
860 break;
861 case 1:
862 interval = win_count - entry->dccphrx_ccval;
863 if (interval < 0)
864 interval += TFRC_WIN_COUNT_LIMIT;
865 if (interval > 4)
866 goto found;
867 break;
868 }
869 }
870 }
871
872 if (step == 0) {
873 printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no "
874 "data packets!\n",
875 __FUNCTION__, dccp_role(sk), sk);
876 return ~0;
877 }
878
879 if (interval == 0) {
880 ccid3_pr_debug("%s, sk=%p, Could not find a win_count "
881 "interval > 0. Defaulting to 1\n",
882 dccp_role(sk), sk);
883 interval = 1;
884 }
885found:
886 rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval;
887 ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n",
888 dccp_role(sk), sk, rtt);
889 if (rtt == 0)
890 rtt = 1;
891
892 delta = timeval_now_delta(&hcrx->ccid3hcrx_tstamp_last_feedback);
893 x_recv = hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC;
894 if (likely(delta > 1))
895 x_recv /= delta;
896
897 tmp1 = (u64)x_recv * (u64)rtt;
898 do_div(tmp1, 10000000);
899 tmp2 = (u32)tmp1;
900 fval = (hcrx->ccid3hcrx_s * 100000) / tmp2;
901 /* do not alter order above or you will get overflow on 32 bit */
902 p = tfrc_calc_x_reverse_lookup(fval);
903 ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied "
904 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
905
906 if (p == 0)
907 return ~0;
908 else
909 return 1000000 / p;
910}
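
The fixed-point sequence above is easier to check with concrete numbers. A sketch assuming s = 256 bytes, RTT = 100 ms and X_recv = 25600 bytes/s; it shows that fval is f(p) = s / (X * R) scaled by 10^6, consistent with TFRC_CALC_X_SPLIT (50000, "equivalent to 0.05") in tfrc_equation.c.

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                uint32_t s = 256;        /* packet size, bytes */
                uint32_t rtt = 100000;   /* approximated RTT, usecs */
                uint32_t x_recv = 25600; /* receive rate, bytes/s */
                uint64_t tmp1 = (uint64_t)x_recv * rtt;
                uint32_t tmp2 = (uint32_t)(tmp1 / 10000000);  /* = 256 */
                uint32_t fval = (s * 100000) / tmp2;          /* = 100000 */

                /* f(p) = s / (X * R) = 256 / (25600 * 0.1s) = 0.1, and
                 * fval == 100000 == 0.1 * 10^6, matching the table scaling */
                printf("tmp2=%u fval=%u\n", tmp2, fval);
                return 0;
        }
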
911
912static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
913{
914 struct dccp_sock *dp = dccp_sk(sk);
915 struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
916
917 if (seq_loss != DCCP_MAX_SEQNO + 1 &&
918 list_empty(&hcrx->ccid3hcrx_li_hist)) {
919 struct dccp_li_hist_entry *li_tail;
920
921 li_tail = dccp_li_hist_interval_new(ccid3_li_hist,
922 &hcrx->ccid3hcrx_li_hist,
923 seq_loss, win_loss);
924 if (li_tail == NULL)
925 return;
926 li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk);
927 }
928 /* FIXME: find end of interval */
929}
930
931static void ccid3_hc_rx_detect_loss(struct sock *sk)
932{
933 struct dccp_sock *dp = dccp_sk(sk);
934 struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
935 u8 win_loss;
936 const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist,
937 &hcrx->ccid3hcrx_li_hist,
938 &win_loss);
939
940 ccid3_hc_rx_update_li(sk, seq_loss, win_loss);
941}
942
943static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
944{
945 struct dccp_sock *dp = dccp_sk(sk);
946 struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
947 const struct dccp_options_received *opt_recv;
948 struct dccp_rx_hist_entry *packet;
949 struct timeval now;
950 u8 win_count;
951 u32 p_prev;
952 int ins;
953
954 if (hcrx == NULL)
955 return;
956
957 BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA ||
958 hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA));
959
960 opt_recv = &dp->dccps_options_received;
961
962 switch (DCCP_SKB_CB(skb)->dccpd_type) {
963 case DCCP_PKT_ACK:
964 if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
965 return;
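        /* fall through: an ACK also carries the timestamp echo used below */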
966 case DCCP_PKT_DATAACK:
967 if (opt_recv->dccpor_timestamp_echo == 0)
968 break;
969 p_prev = hcrx->ccid3hcrx_rtt;
970 do_gettimeofday(&now);
971 hcrx->ccid3hcrx_rtt = timeval_usecs(&now) -
972 (opt_recv->dccpor_timestamp_echo -
973 opt_recv->dccpor_elapsed_time) * 10;
974 if (p_prev != hcrx->ccid3hcrx_rtt)
975 ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n",
976 dccp_role(sk), hcrx->ccid3hcrx_rtt,
977 opt_recv->dccpor_elapsed_time);
978 break;
979 case DCCP_PKT_DATA:
980 break;
981 default:
982 ccid3_pr_debug("%s, sk=%p, not DATA/DATAACK/ACK packet(%s)\n",
983 dccp_role(sk), sk,
984 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
985 return;
986 }
987
988 packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp,
989 skb, SLAB_ATOMIC);
990 if (packet == NULL) {
991 ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet "
992 "to history (consider it lost)!",
993 dccp_role(sk), sk);
994 return;
995 }
996
997 win_count = packet->dccphrx_ccval;
998
999 ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist,
1000 &hcrx->ccid3hcrx_li_hist, packet);
1001
1002 if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK)
1003 return;
1004
1005 switch (hcrx->ccid3hcrx_state) {
1006 case TFRC_RSTATE_NO_DATA:
1007 ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial "
1008 "feedback\n",
1009 dccp_role(sk), sk,
1010 dccp_state_name(sk->sk_state), skb);
1011 ccid3_hc_rx_send_feedback(sk);
1012 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
1013 return;
1014 case TFRC_RSTATE_DATA:
1015 hcrx->ccid3hcrx_bytes_recv += skb->len -
1016 dccp_hdr(skb)->dccph_doff * 4;
1017 if (ins != 0)
1018 break;
1019
1020 do_gettimeofday(&now);
1021 if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >=
1022 hcrx->ccid3hcrx_rtt) {
1023 hcrx->ccid3hcrx_tstamp_last_ack = now;
1024 ccid3_hc_rx_send_feedback(sk);
1025 }
1026 return;
1027 default:
1028 printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
1029 __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
1030 dump_stack();
1031 return;
1032 }
1033
1034 /* Dealing with packet loss */
1035 ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n",
1036 dccp_role(sk), sk, dccp_state_name(sk->sk_state));
1037
1038 ccid3_hc_rx_detect_loss(sk);
1039 p_prev = hcrx->ccid3hcrx_p;
1040
1041 /* Calculate loss event rate */
1042 if (!list_empty(&hcrx->ccid3hcrx_li_hist))
1043 /* Scaling up by 1000000 as fixed decimal */
1044 hcrx->ccid3hcrx_p = 1000000 / dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist);
1045
1046 if (hcrx->ccid3hcrx_p > p_prev) {
1047 ccid3_hc_rx_send_feedback(sk);
1048 return;
1049 }
1050}
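
Summarizing the tail of this function with hypothetical values: p is the inverse of the mean loss interval (both scaled by 10^6), and feedback is forced only when the loss event rate has become worse.

        #include <stdio.h>

        int main(void)
        {
                unsigned int p_prev = 5000;   /* 0.5%, scaled by 10^6 */
                unsigned int i_mean = 150;    /* mean loss interval, packets */
                unsigned int p = 1000000 / i_mean;  /* 6666, ~0.67% */

                if (p > p_prev)
                        printf("loss rate rose (%u > %u): send feedback\n",
                               p, p_prev);
                return 0;
        }
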
1051
1052static int ccid3_hc_rx_init(struct sock *sk)
1053{
1054 struct dccp_sock *dp = dccp_sk(sk);
1055 struct ccid3_hc_rx_sock *hcrx;
1056
1057 ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
1058
1059 hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx),
1060 gfp_any());
1061 if (hcrx == NULL)
1062 return -ENOMEM;
1063
1064 memset(hcrx, 0, sizeof(*hcrx));
1065
1066 if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
1067 dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
1068 hcrx->ccid3hcrx_s = dp->dccps_packet_size;
1069 else
1070 hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE;
1071
1072 hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
1073 INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist);
1074 INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist);
1075 /*
1076 * XXX this seems to be paranoid, need to think more about this, for
1077 * now start with something different than zero. -acme
1078 */
1079 hcrx->ccid3hcrx_rtt = USEC_PER_SEC / 5;
1080 return 0;
1081}
1082
1083static void ccid3_hc_rx_exit(struct sock *sk)
1084{
1085 struct dccp_sock *dp = dccp_sk(sk);
1086 struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
1087
1088 ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
1089
1090 if (hcrx == NULL)
1091 return;
1092
1093 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
1094
1095 /* Empty packet history */
1096 dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist);
1097
1098 /* Empty loss interval history */
1099 dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist);
1100
1101 kfree(dp->dccps_hc_rx_ccid_private);
1102 dp->dccps_hc_rx_ccid_private = NULL;
1103}
1104
1105static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
1106{
1107 const struct dccp_sock *dp = dccp_sk(sk);
1108 const struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
1109
1110 if (hcrx == NULL)
1111 return;
1112
1113 info->tcpi_ca_state = hcrx->ccid3hcrx_state;
1114 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1115 info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt;
1116}
1117
1118static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
1119{
1120 const struct dccp_sock *dp = dccp_sk(sk);
1121 const struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
1122
1123 if (hctx == NULL)
1124 return;
1125
1126 info->tcpi_rto = hctx->ccid3hctx_t_rto;
1127 info->tcpi_rtt = hctx->ccid3hctx_rtt;
1128}
1129
1130static struct ccid ccid3 = {
1131 .ccid_id = 3,
1132 .ccid_name = "ccid3",
1133 .ccid_owner = THIS_MODULE,
1134 .ccid_init = ccid3_init,
1135 .ccid_exit = ccid3_exit,
1136 .ccid_hc_tx_init = ccid3_hc_tx_init,
1137 .ccid_hc_tx_exit = ccid3_hc_tx_exit,
1138 .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet,
1139 .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent,
1140 .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv,
1141 .ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options,
1142 .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options,
1143 .ccid_hc_rx_init = ccid3_hc_rx_init,
1144 .ccid_hc_rx_exit = ccid3_hc_rx_exit,
1145 .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
1146 .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv,
1147 .ccid_hc_rx_get_info = ccid3_hc_rx_get_info,
1148 .ccid_hc_tx_get_info = ccid3_hc_tx_get_info,
1149};
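
This ops table is what the DCCP core dispatches through. A hedged sketch of the dispatch shape only; the real wrappers live in the dccp core's ccid.h and example_ccid_hc_tx_packet_recv() is a made-up name.

        static void example_ccid_hc_tx_packet_recv(struct ccid *ccid,
                                                   struct sock *sk,
                                                   struct sk_buff *skb)
        {
                /* NULL check, then indirect call through the ops table */
                if (ccid->ccid_hc_tx_packet_recv != NULL)
                        ccid->ccid_hc_tx_packet_recv(sk, skb);
        }
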
1150
1151module_param(ccid3_debug, int, 0444);
1152MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
1153
1154static __init int ccid3_module_init(void)
1155{
1156 int rc = -ENOBUFS;
1157
1158 ccid3_rx_hist = dccp_rx_hist_new("ccid3");
1159 if (ccid3_rx_hist == NULL)
1160 goto out;
1161
1162 ccid3_tx_hist = dccp_tx_hist_new("ccid3");
1163 if (ccid3_tx_hist == NULL)
1164 goto out_free_rx;
1165
1166 ccid3_li_hist = dccp_li_hist_new("ccid3");
1167 if (ccid3_li_hist == NULL)
1168 goto out_free_tx;
1169
1170 rc = ccid_register(&ccid3);
1171 if (rc != 0)
1172 goto out_free_loss_interval_history;
1173out:
1174 return rc;
1175
1176out_free_loss_interval_history:
1177 dccp_li_hist_delete(ccid3_li_hist);
1178 ccid3_li_hist = NULL;
1179out_free_tx:
1180 dccp_tx_hist_delete(ccid3_tx_hist);
1181 ccid3_tx_hist = NULL;
1182out_free_rx:
1183 dccp_rx_hist_delete(ccid3_rx_hist);
1184 ccid3_rx_hist = NULL;
1185 goto out;
1186}
1187module_init(ccid3_module_init);
1188
1189static __exit void ccid3_module_exit(void)
1190{
1191#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
1192 /*
1193 * Hack to use while developing, so that we get rid of the control
1194 * sock, that is what keeps a refcount on dccp.ko -acme
1195 */
1196 extern void dccp_ctl_sock_exit(void);
1197
1198 dccp_ctl_sock_exit();
1199#endif
1200 ccid_unregister(&ccid3);
1201
1202 if (ccid3_tx_hist != NULL) {
1203 dccp_tx_hist_delete(ccid3_tx_hist);
1204 ccid3_tx_hist = NULL;
1205 }
1206 if (ccid3_rx_hist != NULL) {
1207 dccp_rx_hist_delete(ccid3_rx_hist);
1208 ccid3_rx_hist = NULL;
1209 }
1210 if (ccid3_li_hist != NULL) {
1211 dccp_li_hist_delete(ccid3_li_hist);
1212 ccid3_li_hist = NULL;
1213 }
1214}
1215module_exit(ccid3_module_exit);
1216
1217MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
1218 "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
1219MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
1220MODULE_LICENSE("GPL");
1221MODULE_ALIAS("net-dccp-ccid-3");
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
new file mode 100644
index 000000000000..ee8cbace6630
--- /dev/null
+++ b/net/dccp/ccids/ccid3.h
@@ -0,0 +1,137 @@
1/*
2 * net/dccp/ccids/ccid3.h
3 *
4 * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
5 *
6 * An implementation of the DCCP protocol
7 *
8 * This code has been developed by the University of Waikato WAND
9 * research group. For further information please see http://www.wand.net.nz/
10 * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
11 *
12 * This code also uses code from Lulea University, re-released as GPL by its
13 * authors:
14 * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
15 *
16 * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
17 * and to make it work as a loadable module in the DCCP stack written by
18 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
19 *
20 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
21 *
22 * This program is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation; either version 2 of the License, or
25 * (at your option) any later version.
26 *
27 * This program is distributed in the hope that it will be useful,
28 * but WITHOUT ANY WARRANTY; without even the implied warranty of
29 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 * GNU General Public License for more details.
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 */
36#ifndef _DCCP_CCID3_H_
37#define _DCCP_CCID3_H_
38
39#include <linux/config.h>
40#include <linux/list.h>
41#include <linux/time.h>
42#include <linux/types.h>
43
44#define TFRC_MIN_PACKET_SIZE 16
45#define TFRC_STD_PACKET_SIZE 256
46#define TFRC_MAX_PACKET_SIZE 65535
47
48/* Two seconds as per CCID3 spec */
49#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
50
51/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
52#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
53
54/* In seconds */
55#define TFRC_MAX_BACK_OFF_TIME 64
56
57#define TFRC_SMALLEST_P 40
58
59enum ccid3_options {
60 TFRC_OPT_LOSS_EVENT_RATE = 192,
61 TFRC_OPT_LOSS_INTERVALS = 193,
62 TFRC_OPT_RECEIVE_RATE = 194,
63};
64
65struct ccid3_options_received {
66 u64 ccid3or_seqno:48,
67 ccid3or_loss_intervals_idx:16;
68 u16 ccid3or_loss_intervals_len;
69 u32 ccid3or_loss_event_rate;
70 u32 ccid3or_receive_rate;
71};
72
73/** struct ccid3_hc_tx_sock - CCID3 sender half connection sock
74 *
75 * @ccid3hctx_state - Sender state
76 * @ccid3hctx_x - Current sending rate
77 * @ccid3hctx_x_recv - Receive rate
78 * @ccid3hctx_x_calc - Calculated send (?) rate
79 * @ccid3hctx_s - Packet size
80 * @ccid3hctx_rtt - Estimate of current round trip time in usecs
81 * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
82 * @ccid3hctx_last_win_count - Last window counter sent
83 * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
84 * with last_win_count value sent
85 * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
86 * @ccid3hctx_idle - FIXME
87 * @ccid3hctx_t_ld - Time last doubled during slow start
88 * @ccid3hctx_t_nom - Nominal send time of next packet
89 * @ccid3hctx_t_ipi - Interpacket (send) interval
90 * @ccid3hctx_delta - Send timer delta
91 * @ccid3hctx_hist - Packet history
92 */
93struct ccid3_hc_tx_sock {
94 u32 ccid3hctx_x;
95 u32 ccid3hctx_x_recv;
96 u32 ccid3hctx_x_calc;
97 u16 ccid3hctx_s;
98 u32 ccid3hctx_rtt;
99 u32 ccid3hctx_p;
100 u8 ccid3hctx_state;
101 u8 ccid3hctx_last_win_count;
102 u8 ccid3hctx_idle;
103 struct timeval ccid3hctx_t_last_win_count;
104 struct timer_list ccid3hctx_no_feedback_timer;
105 struct timeval ccid3hctx_t_ld;
106 struct timeval ccid3hctx_t_nom;
107 u32 ccid3hctx_t_rto;
108 u32 ccid3hctx_t_ipi;
109 u32 ccid3hctx_delta;
110 struct list_head ccid3hctx_hist;
111 struct ccid3_options_received ccid3hctx_options_received;
112};
113
114struct ccid3_hc_rx_sock {
115 u64 ccid3hcrx_seqno_last_counter:48,
116 ccid3hcrx_state:8,
117 ccid3hcrx_last_counter:4;
118 unsigned long ccid3hcrx_rtt;
119 u32 ccid3hcrx_p;
120 u32 ccid3hcrx_bytes_recv;
121 struct timeval ccid3hcrx_tstamp_last_feedback;
122 struct timeval ccid3hcrx_tstamp_last_ack;
123 struct list_head ccid3hcrx_hist;
124 struct list_head ccid3hcrx_li_hist;
125 u16 ccid3hcrx_s;
126 u32 ccid3hcrx_pinv;
127 u32 ccid3hcrx_elapsed_time;
128 u32 ccid3hcrx_x_recv;
129};
130
131#define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \
132 ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field)
133
134#define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \
135 ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field)
136
137#endif /* _DCCP_CCID3_H_ */
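
The 48-bit fields above rely on gcc packing several bitfields into a single u64, as the kernel does throughout. A user-space sketch with hypothetical field names, showing that the packing stays within 8 bytes and round-trips values:

        #include <stdint.h>
        #include <stdio.h>

        struct seq_demo {
                uint64_t seqno:48,   /* like ccid3or_seqno above */
                         ccval:4;
        };

        int main(void)
        {
                struct seq_demo d = { .seqno = (1ULL << 47) + 5, .ccval = 9 };

                printf("sizeof=%zu seqno=%llu ccval=%u\n",
                       sizeof(d), (unsigned long long)d.seqno,
                       (unsigned)d.ccval);
                return 0;
        }
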
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile
new file mode 100644
index 000000000000..5f940a6cbaca
--- /dev/null
+++ b/net/dccp/ccids/lib/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o
2
3dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
new file mode 100644
index 000000000000..4c01a54143ad
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -0,0 +1,144 @@
1/*
2 * net/dccp/ccids/lib/loss_interval.c
3 *
4 * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
5 * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
6 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16
17#include "loss_interval.h"
18
19struct dccp_li_hist *dccp_li_hist_new(const char *name)
20{
21 struct dccp_li_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
22 static const char dccp_li_hist_mask[] = "li_hist_%s";
23 char *slab_name;
24
25 if (hist == NULL)
26 goto out;
27
28 slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1,
29 GFP_ATOMIC);
30 if (slab_name == NULL)
31 goto out_free_hist;
32
33 sprintf(slab_name, dccp_li_hist_mask, name);
34 hist->dccplih_slab = kmem_cache_create(slab_name,
35 sizeof(struct dccp_li_hist_entry),
36 0, SLAB_HWCACHE_ALIGN,
37 NULL, NULL);
38 if (hist->dccplih_slab == NULL)
39 goto out_free_slab_name;
40out:
41 return hist;
42out_free_slab_name:
43 kfree(slab_name);
44out_free_hist:
45 kfree(hist);
46 hist = NULL;
47 goto out;
48}
49
50EXPORT_SYMBOL_GPL(dccp_li_hist_new);
51
52void dccp_li_hist_delete(struct dccp_li_hist *hist)
53{
54 const char *name = kmem_cache_name(hist->dccplih_slab);
55
56 kmem_cache_destroy(hist->dccplih_slab);
57 kfree(name);
58 kfree(hist);
59}
60
61EXPORT_SYMBOL_GPL(dccp_li_hist_delete);
62
63void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list)
64{
65 struct dccp_li_hist_entry *entry, *next;
66
67 list_for_each_entry_safe(entry, next, list, dccplih_node) {
68 list_del_init(&entry->dccplih_node);
69 kmem_cache_free(hist->dccplih_slab, entry);
70 }
71}
72
73EXPORT_SYMBOL_GPL(dccp_li_hist_purge);
74
75/* Weights used to calculate loss event rate */
76/*
77 * These are integers as per section 8 of RFC 3448. We can then divide
78 * by 4 when we use them.
79 */
80static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = {
81 4, 4, 4, 4, 3, 2, 1, 1,
82};
83
84u32 dccp_li_hist_calc_i_mean(struct list_head *list)
85{
86 struct dccp_li_hist_entry *li_entry, *li_next;
87 int i = 0;
88 u32 i_tot;
89 u32 i_tot0 = 0;
90 u32 i_tot1 = 0;
91 u32 w_tot = 0;
92
93 list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
94 if (i < DCCP_LI_HIST_IVAL_F_LENGTH) {
95 i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
96 w_tot += dccp_li_hist_w[i];
97 }
98
99 if (i != 0)
100 i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
101
102 if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
103 break;
104 }
105
106 if (i != DCCP_LI_HIST_IVAL_F_LENGTH)
107 return 0;
108
109 i_tot = max(i_tot0, i_tot1);
110
111 /* FIXME: Why do we do this? -Ian McDonald */
112 if (i_tot * 4 < w_tot)
113 i_tot = w_tot * 4;
114
115 return i_tot * 4 / w_tot;
116}
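
A stand-alone sketch of the arithmetic above, assuming a full history of nine intervals (newest first; the values are hypothetical). It mirrors the two overlapping weighted sums and the final i_tot * 4 / w_tot step, but omits the FIXME clamp:

        #include <stdio.h>

        static const int w[8] = { 4, 4, 4, 4, 3, 2, 1, 1 };

        int main(void)
        {
                /* hypothetical loss intervals, newest first, in packets */
                unsigned int ival[9] = { 10, 20, 20, 30, 40, 40, 50, 50, 60 };
                unsigned int i, i_tot0 = 0, i_tot1 = 0, w_tot = 0, i_tot;

                for (i = 0; i < 8; i++) {
                        i_tot0 += ival[i] * w[i];      /* intervals 0..7 */
                        i_tot1 += ival[i + 1] * w[i];  /* intervals 1..8 */
                        w_tot += w[i];
                }
                i_tot = i_tot0 > i_tot1 ? i_tot0 : i_tot1;  /* max(), as above */
                printf("i_mean = %u\n", i_tot * 4 / w_tot); /* 770*4/19 = 162 */
                return 0;
        }
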
117
118EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean);
119
120struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist,
121 struct list_head *list,
122 const u64 seq_loss,
123 const u8 win_loss)
124{
125 struct dccp_li_hist_entry *tail = NULL, *entry;
126 int i;
127
128 for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) {
129 entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
130 if (entry == NULL) {
131 dccp_li_hist_purge(hist, list);
132 return NULL;
133 }
134 if (tail == NULL)
135 tail = entry;
136 list_add(&entry->dccplih_node, list);
137 }
138
139 entry->dccplih_seqno = seq_loss;
140 entry->dccplih_win_count = win_loss;
141 return tail;
142}
143
144EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new);
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
new file mode 100644
index 000000000000..13ad47ba1420
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -0,0 +1,61 @@
1#ifndef _DCCP_LI_HIST_
2#define _DCCP_LI_HIST_
3/*
4 * net/dccp/ccids/lib/loss_interval.h
5 *
6 * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
7 * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
8 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 2 of the License, or (at your option)
13 * any later version.
14 */
15
16#include <linux/config.h>
17#include <linux/list.h>
18#include <linux/slab.h>
19#include <linux/time.h>
20
21#define DCCP_LI_HIST_IVAL_F_LENGTH 8
22
23struct dccp_li_hist {
24 kmem_cache_t *dccplih_slab;
25};
26
27extern struct dccp_li_hist *dccp_li_hist_new(const char *name);
28extern void dccp_li_hist_delete(struct dccp_li_hist *hist);
29
30struct dccp_li_hist_entry {
31 struct list_head dccplih_node;
32 u64 dccplih_seqno:48,
33 dccplih_win_count:4;
34 u32 dccplih_interval;
35};
36
37static inline struct dccp_li_hist_entry *
38 dccp_li_hist_entry_new(struct dccp_li_hist *hist,
39 const unsigned int __nocast prio)
40{
41 return kmem_cache_alloc(hist->dccplih_slab, prio);
42}
43
44static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist,
45 struct dccp_li_hist_entry *entry)
46{
47 if (entry != NULL)
48 kmem_cache_free(hist->dccplih_slab, entry);
49}
50
51extern void dccp_li_hist_purge(struct dccp_li_hist *hist,
52 struct list_head *list);
53
54extern u32 dccp_li_hist_calc_i_mean(struct list_head *list);
55
56extern struct dccp_li_hist_entry *
57 dccp_li_hist_interval_new(struct dccp_li_hist *hist,
58 struct list_head *list,
59 const u64 seq_loss,
60 const u8 win_loss);
61#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
new file mode 100644
index 000000000000..d3f9d2053830
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -0,0 +1,398 @@
1/*
2 * net/dccp/ccids/lib/packet_history.c
3 *
4 * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
5 *
6 * An implementation of the DCCP protocol
7 *
8 * This code has been developed by the University of Waikato WAND
9 * research group. For further information please see http://www.wand.net.nz/
10 * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
11 *
12 * This code also uses code from Lulea University, re-released as GPL by its
13 * authors:
14 * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
15 *
16 * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
17 * and to make it work as a loadable module in the DCCP stack written by
18 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
19 *
20 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
21 *
22 * This program is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation; either version 2 of the License, or
25 * (at your option) any later version.
26 *
27 * This program is distributed in the hope that it will be useful,
28 * but WITHOUT ANY WARRANTY; without even the implied warranty of
29 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 * GNU General Public License for more details.
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 */
36
37#include <linux/config.h>
38#include <linux/module.h>
39#include <linux/string.h>
40
41#include "packet_history.h"
42
43struct dccp_rx_hist *dccp_rx_hist_new(const char *name)
44{
45 struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
46 static const char dccp_rx_hist_mask[] = "rx_hist_%s";
47 char *slab_name;
48
49 if (hist == NULL)
50 goto out;
51
52 slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1,
53 GFP_ATOMIC);
54 if (slab_name == NULL)
55 goto out_free_hist;
56
57 sprintf(slab_name, dccp_rx_hist_mask, name);
58 hist->dccprxh_slab = kmem_cache_create(slab_name,
59 sizeof(struct dccp_rx_hist_entry),
60 0, SLAB_HWCACHE_ALIGN,
61 NULL, NULL);
62 if (hist->dccprxh_slab == NULL)
63 goto out_free_slab_name;
64out:
65 return hist;
66out_free_slab_name:
67 kfree(slab_name);
68out_free_hist:
69 kfree(hist);
70 hist = NULL;
71 goto out;
72}
73
74EXPORT_SYMBOL_GPL(dccp_rx_hist_new);
75
76void dccp_rx_hist_delete(struct dccp_rx_hist *hist)
77{
78 const char *name = kmem_cache_name(hist->dccprxh_slab);
79
80 kmem_cache_destroy(hist->dccprxh_slab);
81 kfree(name);
82 kfree(hist);
83}
84
85EXPORT_SYMBOL_GPL(dccp_rx_hist_delete);
86
87void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list)
88{
89 struct dccp_rx_hist_entry *entry, *next;
90
91 list_for_each_entry_safe(entry, next, list, dccphrx_node) {
92 list_del_init(&entry->dccphrx_node);
93 kmem_cache_free(hist->dccprxh_slab, entry);
94 }
95}
96
97EXPORT_SYMBOL_GPL(dccp_rx_hist_purge);
98
99struct dccp_rx_hist_entry *
100 dccp_rx_hist_find_data_packet(const struct list_head *list)
101{
102 struct dccp_rx_hist_entry *entry, *packet = NULL;
103
104 list_for_each_entry(entry, list, dccphrx_node)
105 if (entry->dccphrx_type == DCCP_PKT_DATA ||
106 entry->dccphrx_type == DCCP_PKT_DATAACK) {
107 packet = entry;
108 break;
109 }
110
111 return packet;
112}
113
114EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet);
115
116int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
117 struct list_head *rx_list,
118 struct list_head *li_list,
119 struct dccp_rx_hist_entry *packet)
120{
121 struct dccp_rx_hist_entry *entry, *next, *iter;
122 u8 num_later = 0;
123
124 iter = dccp_rx_hist_head(rx_list);
125 if (iter == NULL)
126 dccp_rx_hist_add_entry(rx_list, packet);
127 else {
128 const u64 seqno = packet->dccphrx_seqno;
129
130 if (after48(seqno, iter->dccphrx_seqno))
131 dccp_rx_hist_add_entry(rx_list, packet);
132 else {
133 if (dccp_rx_hist_entry_data_packet(iter))
134 num_later = 1;
135
136 list_for_each_entry_continue(iter, rx_list,
137 dccphrx_node) {
138 if (after48(seqno, iter->dccphrx_seqno)) {
139 dccp_rx_hist_add_entry(&iter->dccphrx_node,
140 packet);
141 goto trim_history;
142 }
143
144 if (dccp_rx_hist_entry_data_packet(iter))
145 num_later++;
146
147 if (num_later == TFRC_RECV_NUM_LATE_LOSS) {
148 dccp_rx_hist_entry_delete(hist, packet);
149 return 1;
150 }
151 }
152
153 if (num_later < TFRC_RECV_NUM_LATE_LOSS)
154 dccp_rx_hist_add_entry(rx_list, packet);
155 /*
156 * FIXME: else what? should we destroy the packet
157 * like above?
158 */
159 }
160 }
161
162trim_history:
163 /*
164 * Trim history (remove all packets after the NUM_LATE_LOSS + 1
165 * data packets)
166 */
167 num_later = TFRC_RECV_NUM_LATE_LOSS + 1;
168
169 if (!list_empty(li_list)) {
170 list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
171 if (num_later == 0) {
172 list_del_init(&entry->dccphrx_node);
173 dccp_rx_hist_entry_delete(hist, entry);
174 } else if (dccp_rx_hist_entry_data_packet(entry))
175 --num_later;
176 }
177 } else {
178 int step = 0;
179 u8 win_count = 0; /* Not needed, but silences a gcc warning */
180 int tmp;
181 /*
182 * We have no loss interval history, so we need at least one
183 * RTT's worth of data packets to approximate the RTT.
184 */
185 list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
186 if (num_later == 0) {
187 switch (step) {
188 case 0:
189 step = 1;
190 /* OK, find next data packet */
191 num_later = 1;
192 break;
193 case 1:
194 step = 2;
195 /* OK, find next data packet */
196 num_later = 1;
197 win_count = entry->dccphrx_ccval;
198 break;
199 case 2:
200 tmp = win_count - entry->dccphrx_ccval;
201 if (tmp < 0)
202 tmp += TFRC_WIN_COUNT_LIMIT;
203 if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) {
204 /*
205 * We have found a packet older
206 * than one RTT; remove the rest
207 */
208 step = 3;
209 } else /* OK, find next data packet */
210 num_later = 1;
211 break;
212 case 3:
213 list_del_init(&entry->dccphrx_node);
214 dccp_rx_hist_entry_delete(hist, entry);
215 break;
216 }
217 } else if (dccp_rx_hist_entry_data_packet(entry))
218 --num_later;
219 }
220 }
221
222 return 0;
223}
224
225EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet);
226
227u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
228 struct list_head *li_list, u8 *win_loss)
229{
230 struct dccp_rx_hist_entry *entry, *next, *packet;
231 struct dccp_rx_hist_entry *a_loss = NULL;
232 struct dccp_rx_hist_entry *b_loss = NULL;
233 u64 seq_loss = DCCP_MAX_SEQNO + 1;
234 u8 num_later = TFRC_RECV_NUM_LATE_LOSS;
235
236 list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
237 if (num_later == 0) {
238 b_loss = entry;
239 break;
240 } else if (dccp_rx_hist_entry_data_packet(entry))
241 --num_later;
242 }
243
244 if (b_loss == NULL)
245 goto out;
246
247 num_later = 1;
248 list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
249 if (num_later == 0) {
250 a_loss = entry;
251 break;
252 } else if (dccp_rx_hist_entry_data_packet(entry))
253 --num_later;
254 }
255
256 if (a_loss == NULL) {
257 if (list_empty(li_list)) {
258 /* no loss event has occurred yet */
259 LIMIT_NETDEBUG("%s: TODO: find a lost data packet by "
260 "comparing to initial seqno\n",
261 __FUNCTION__);
262 goto out;
263 } else {
264 LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!\n",
265 __FUNCTION__);
266 goto out;
267 }
268 }
269
270 /* Locate a lost data packet */
271 entry = packet = b_loss;
272 list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
273 u64 delta = dccp_delta_seqno(entry->dccphrx_seqno,
274 packet->dccphrx_seqno);
275
276 if (delta != 0) {
277 if (dccp_rx_hist_entry_data_packet(packet))
278 --delta;
279 /*
280 * FIXME: check this, probably this % usage is because
281 * in earlier drafts the ndp count was just 8 bits
282 * long, but now it can be up to 24 bits long.
283 */
284#if 0
285 if (delta % DCCP_NDP_LIMIT !=
286 (packet->dccphrx_ndp -
287 entry->dccphrx_ndp) % DCCP_NDP_LIMIT)
288#endif
289 if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) {
290 seq_loss = entry->dccphrx_seqno;
291 dccp_inc_seqno(&seq_loss);
292 }
293 }
294 packet = entry;
295 if (packet == a_loss)
296 break;
297 }
298out:
299 if (seq_loss != DCCP_MAX_SEQNO + 1)
300 *win_loss = a_loss->dccphrx_ccval;
301 else
302 *win_loss = 0; /* Paranoia */
303
304 return seq_loss;
305}
306
307EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss);
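
The core test in the detector above: a sequence-number gap between two received packets is a loss event only when it is not fully explained by the sender's NDP count. A simplified user-space analogue, ignoring 48-bit wraparound and the data-packet adjustment:

        #include <stdio.h>

        int main(void)
        {
                unsigned long long seq_prev = 100, seq_next = 104; /* received */
                unsigned int ndp_prev = 2, ndp_next = 4; /* sender NDP counts */
                unsigned long long gap = seq_next - seq_prev - 1; /* 3 missing */

                if (gap != ndp_next - ndp_prev) /* 3 != 2: one was data: loss */
                        printf("loss event starts at seqno %llu\n",
                               seq_prev + 1);
                return 0;
        }
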
308
309struct dccp_tx_hist *dccp_tx_hist_new(const char *name)
310{
311 struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
312 static const char dccp_tx_hist_mask[] = "tx_hist_%s";
313 char *slab_name;
314
315 if (hist == NULL)
316 goto out;
317
318 slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1,
319 GFP_ATOMIC);
320 if (slab_name == NULL)
321 goto out_free_hist;
322
323 sprintf(slab_name, dccp_tx_hist_mask, name);
324 hist->dccptxh_slab = kmem_cache_create(slab_name,
325 sizeof(struct dccp_tx_hist_entry),
326 0, SLAB_HWCACHE_ALIGN,
327 NULL, NULL);
328 if (hist->dccptxh_slab == NULL)
329 goto out_free_slab_name;
330out:
331 return hist;
332out_free_slab_name:
333 kfree(slab_name);
334out_free_hist:
335 kfree(hist);
336 hist = NULL;
337 goto out;
338}
339
340EXPORT_SYMBOL_GPL(dccp_tx_hist_new);
341
342void dccp_tx_hist_delete(struct dccp_tx_hist *hist)
343{
344 const char *name = kmem_cache_name(hist->dccptxh_slab);
345
346 kmem_cache_destroy(hist->dccptxh_slab);
347 kfree(name);
348 kfree(hist);
349}
350
351EXPORT_SYMBOL_GPL(dccp_tx_hist_delete);
352
353struct dccp_tx_hist_entry *
354 dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq)
355{
356 struct dccp_tx_hist_entry *packet = NULL, *entry;
357
358 list_for_each_entry(entry, list, dccphtx_node)
359 if (entry->dccphtx_seqno == seq) {
360 packet = entry;
361 break;
362 }
363
364 return packet;
365}
366
367EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry);
368
369void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
370 struct list_head *list,
371 struct dccp_tx_hist_entry *packet)
372{
373 struct dccp_tx_hist_entry *next;
374
375 list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) {
376 list_del_init(&packet->dccphtx_node);
377 dccp_tx_hist_entry_delete(hist, packet);
378 }
379}
380
381EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older);
382
383void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list)
384{
385 struct dccp_tx_hist_entry *entry, *next;
386
387 list_for_each_entry_safe(entry, next, list, dccphtx_node) {
388 list_del_init(&entry->dccphtx_node);
389 dccp_tx_hist_entry_delete(hist, entry);
390 }
391}
392
393EXPORT_SYMBOL_GPL(dccp_tx_hist_purge);
394
395MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
396 "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
397MODULE_DESCRIPTION("DCCP TFRC library");
398MODULE_LICENSE("GPL");
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
new file mode 100644
index 000000000000..fb90a91aa93d
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -0,0 +1,199 @@
1/*
2 * net/dccp/ccids/lib/packet_history.h
3 *
4 * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
5 *
6 * An implementation of the DCCP protocol
7 *
8 * This code has been developed by the University of Waikato WAND
9 * research group. For further information please see http://www.wand.net.nz/
10 * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
11 *
12 * This code also uses code from Lulea University, re-released as GPL by its
13 * authors:
14 * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
15 *
16 * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
17 * and to make it work as a loadable module in the DCCP stack written by
18 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
19 *
20 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
21 *
22 * This program is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation; either version 2 of the License, or
25 * (at your option) any later version.
26 *
27 * This program is distributed in the hope that it will be useful,
28 * but WITHOUT ANY WARRANTY; without even the implied warranty of
29 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 * GNU General Public License for more details.
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 */
36
37#ifndef _DCCP_PKT_HIST_
38#define _DCCP_PKT_HIST_
39
40#include <linux/config.h>
41#include <linux/list.h>
42#include <linux/slab.h>
43#include <linux/time.h>
44
45#include "../../dccp.h"
46
47/* Number of later packets received before one is considered lost */
48#define TFRC_RECV_NUM_LATE_LOSS 3
49
50#define TFRC_WIN_COUNT_PER_RTT 4
51#define TFRC_WIN_COUNT_LIMIT 16
52
53struct dccp_tx_hist_entry {
54 struct list_head dccphtx_node;
55 u64 dccphtx_seqno:48,
56 dccphtx_ccval:4,
57 dccphtx_sent:1;
58 u32 dccphtx_rtt;
59 struct timeval dccphtx_tstamp;
60};
61
62struct dccp_rx_hist_entry {
63 struct list_head dccphrx_node;
64 u64 dccphrx_seqno:48,
65 dccphrx_ccval:4,
66 dccphrx_type:4;
67 u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */
68 struct timeval dccphrx_tstamp;
69};
70
71struct dccp_tx_hist {
72 kmem_cache_t *dccptxh_slab;
73};
74
75extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name);
76extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist);
77
78struct dccp_rx_hist {
79 kmem_cache_t *dccprxh_slab;
80};
81
82extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name);
83extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist);
84extern struct dccp_rx_hist_entry *
85 dccp_rx_hist_find_data_packet(const struct list_head *list);
86
87static inline struct dccp_tx_hist_entry *
88 dccp_tx_hist_entry_new(struct dccp_tx_hist *hist,
89 const unsigned int __nocast prio)
90{
91 struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab,
92 prio);
93
94 if (entry != NULL)
95 entry->dccphtx_sent = 0;
96
97 return entry;
98}
99
100static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist,
101 struct dccp_tx_hist_entry *entry)
102{
103 if (entry != NULL)
104 kmem_cache_free(hist->dccptxh_slab, entry);
105}
106
107extern struct dccp_tx_hist_entry *
108 dccp_tx_hist_find_entry(const struct list_head *list,
109 const u64 seq);
110
111static inline void dccp_tx_hist_add_entry(struct list_head *list,
112 struct dccp_tx_hist_entry *entry)
113{
114 list_add(&entry->dccphtx_node, list);
115}
116
117extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
118 struct list_head *list,
119 struct dccp_tx_hist_entry *next);
120
121extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist,
122 struct list_head *list);
123
124static inline struct dccp_tx_hist_entry *
125 dccp_tx_hist_head(struct list_head *list)
126{
127 struct dccp_tx_hist_entry *head = NULL;
128
129 if (!list_empty(list))
130 head = list_entry(list->next, struct dccp_tx_hist_entry,
131 dccphtx_node);
132 return head;
133}
134
135static inline struct dccp_rx_hist_entry *
136 dccp_rx_hist_entry_new(struct dccp_rx_hist *hist,
137 const u32 ndp,
138 const struct sk_buff *skb,
139 const unsigned int __nocast prio)
140{
141 struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab,
142 prio);
143
144 if (entry != NULL) {
145 const struct dccp_hdr *dh = dccp_hdr(skb);
146
147 entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
148 entry->dccphrx_ccval = dh->dccph_ccval;
149 entry->dccphrx_type = dh->dccph_type;
150 entry->dccphrx_ndp = ndp;
151 do_gettimeofday(&(entry->dccphrx_tstamp));
152 }
153
154 return entry;
155}
156
157static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist,
158 struct dccp_rx_hist_entry *entry)
159{
160 if (entry != NULL)
161 kmem_cache_free(hist->dccprxh_slab, entry);
162}
163
164extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist,
165 struct list_head *list);
166
167static inline void dccp_rx_hist_add_entry(struct list_head *list,
168 struct dccp_rx_hist_entry *entry)
169{
170 list_add(&entry->dccphrx_node, list);
171}
172
173static inline struct dccp_rx_hist_entry *
174 dccp_rx_hist_head(struct list_head *list)
175{
176 struct dccp_rx_hist_entry *head = NULL;
177
178 if (!list_empty(list))
179 head = list_entry(list->next, struct dccp_rx_hist_entry,
180 dccphrx_node);
181 return head;
182}
183
184static inline int
185 dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry)
186{
187 return entry->dccphrx_type == DCCP_PKT_DATA ||
188 entry->dccphrx_type == DCCP_PKT_DATAACK;
189}
190
191extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
192 struct list_head *rx_list,
193 struct list_head *li_list,
194 struct dccp_rx_hist_entry *packet);
195
196extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
197 struct list_head *li_list, u8 *win_loss);
198
199#endif /* _DCCP_PKT_HIST_ */
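
A hedged usage sketch of the TX-history API, mirroring how ccid3_hc_tx_packet_recv() drives it. Kernel context is assumed and example_tx_history_use() is a made-up name.

        static int example_tx_history_use(struct dccp_tx_hist *hist,
                                          struct list_head *list, u64 ack_seq)
        {
                struct dccp_tx_hist_entry *entry;

                entry = dccp_tx_hist_find_entry(list, ack_seq);
                if (entry == NULL)
                        return -1;  /* ack for a packet we never recorded */

                /* everything older than the acked packet can be dropped */
                dccp_tx_hist_purge_older(hist, list, entry);
                return 0;
        }
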
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
new file mode 100644
index 000000000000..130c4c40cfe3
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -0,0 +1,22 @@
1#ifndef _TFRC_H_
2#define _TFRC_H_
3/*
4 * net/dccp/ccids/lib/tfrc.h
5 *
6 * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
7 * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
8 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
9 * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 */
16
17#include <linux/types.h>
18
19extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
20extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
21
22#endif /* _TFRC_H_ */
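
A hedged sketch of the scaling conventions at the ccid3.c call sites of these helpers: s in bytes, R in microseconds, p and fval scaled by 10^6. example_tfrc() is illustrative only.

        static u32 example_tfrc(void)
        {
                /* s = 256 bytes, R = 100000 us (100 ms), p = 10000 (1%) */
                u32 x = tfrc_calc_x(256, 100000, 10000);

                /* inverse direction: fval (f(p) * 10^6) back to p * 10^6 */
                u32 p = tfrc_calc_x_reverse_lookup(100000);

                (void)p;    /* unused in this illustrative stub */
                return x;   /* allowed transmit rate, bytes per second */
        }
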
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
new file mode 100644
index 000000000000..d2b5933b4510
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -0,0 +1,644 @@
1/*
2 * net/dccp/ccids/lib/tfrc_equation.c
3 *
4 * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
5 * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
6 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
7 * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17
18#include <asm/bug.h>
19#include <asm/div64.h>
20
21#include "tfrc.h"
22
23#define TFRC_CALC_X_ARRSIZE 500
24
25#define TFRC_CALC_X_SPLIT 50000
26/* equivalent to 0.05 */
27
28static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
29 { 37172, 8172 },
30 { 53499, 11567 },
31 { 66664, 14180 },
32 { 78298, 16388 },
33 { 89021, 18339 },
34 { 99147, 20108 },
35 { 108858, 21738 },
36 { 118273, 23260 },
37 { 127474, 24693 },
38 { 136520, 26052 },
39 { 145456, 27348 },
40 { 154316, 28589 },
41 { 163130, 29783 },
42 { 171919, 30935 },
43 { 180704, 32049 },
44 { 189502, 33130 },
45 { 198328, 34180 },
46 { 207194, 35202 },
47 { 216114, 36198 },
48 { 225097, 37172 },
49 { 234153, 38123 },
50 { 243294, 39055 },
51 { 252527, 39968 },
52 { 261861, 40864 },
53 { 271305, 41743 },
54 { 280866, 42607 },
55 { 290553, 43457 },
56 { 300372, 44293 },
57 { 310333, 45117 },
58 { 320441, 45929 },
59 { 330705, 46729 },
60 { 341131, 47518 },
61 { 351728, 48297 },
62 { 362501, 49066 },
63 { 373460, 49826 },
64 { 384609, 50577 },
65 { 395958, 51320 },
66 { 407513, 52054 },
67 { 419281, 52780 },
68 { 431270, 53499 },
69 { 443487, 54211 },
70 { 455940, 54916 },
71 { 468635, 55614 },
72 { 481581, 56306 },
73 { 494785, 56991 },
74 { 508254, 57671 },
75 { 521996, 58345 },
76 { 536019, 59014 },
77 { 550331, 59677 },
78 { 564939, 60335 },
79 { 579851, 60988 },
80 { 595075, 61636 },
81 { 610619, 62279 },
82 { 626491, 62918 },
83 { 642700, 63553 },
84 { 659253, 64183 },
85 { 676158, 64809 },
86 { 693424, 65431 },
87 { 711060, 66050 },
88 { 729073, 66664 },
89 { 747472, 67275 },
90 { 766266, 67882 },
91 { 785464, 68486 },
92 { 805073, 69087 },
93 { 825103, 69684 },
94 { 845562, 70278 },
95 { 866460, 70868 },
96 { 887805, 71456 },
97 { 909606, 72041 },
98 { 931873, 72623 },
99 { 954614, 73202 },
100 { 977839, 73778 },
101 { 1001557, 74352 },
102 { 1025777, 74923 },
103 { 1050508, 75492 },
104 { 1075761, 76058 },
105 { 1101544, 76621 },
106 { 1127867, 77183 },
107 { 1154739, 77741 },
108 { 1182172, 78298 },
109 { 1210173, 78852 },
110 { 1238753, 79405 },
111 { 1267922, 79955 },
112 { 1297689, 80503 },
113 { 1328066, 81049 },
114 { 1359060, 81593 },
115 { 1390684, 82135 },
116 { 1422947, 82675 },
117 { 1455859, 83213 },
118 { 1489430, 83750 },
119 { 1523671, 84284 },
120 { 1558593, 84817 },
121 { 1594205, 85348 },
122 { 1630518, 85878 },
123 { 1667543, 86406 },
124 { 1705290, 86932 },
125 { 1743770, 87457 },
126 { 1782994, 87980 },
127 { 1822973, 88501 },
128 { 1863717, 89021 },
129 { 1905237, 89540 },
130 { 1947545, 90057 },
131 { 1990650, 90573 },
132 { 2034566, 91087 },
133 { 2079301, 91600 },
134 { 2124869, 92111 },
135 { 2171279, 92622 },
136 { 2218543, 93131 },
137 { 2266673, 93639 },
138 { 2315680, 94145 },
139 { 2365575, 94650 },
140 { 2416371, 95154 },
141 { 2468077, 95657 },
142 { 2520707, 96159 },
143 { 2574271, 96660 },
144 { 2628782, 97159 },
145 { 2684250, 97658 },
146 { 2740689, 98155 },
147 { 2798110, 98651 },
148 { 2856524, 99147 },
149 { 2915944, 99641 },
150 { 2976382, 100134 },
151 { 3037850, 100626 },
152 { 3100360, 101117 },
153 { 3163924, 101608 },
154 { 3228554, 102097 },
155 { 3294263, 102586 },
156 { 3361063, 103073 },
157 { 3428966, 103560 },
158 { 3497984, 104045 },
159 { 3568131, 104530 },
160 { 3639419, 105014 },
161 { 3711860, 105498 },
162 { 3785467, 105980 },
163 { 3860253, 106462 },
164 { 3936229, 106942 },
165 { 4013410, 107422 },
166 { 4091808, 107902 },
167 { 4171435, 108380 },
168 { 4252306, 108858 },
169 { 4334431, 109335 },
170 { 4417825, 109811 },
171 { 4502501, 110287 },
172 { 4588472, 110762 },
173 { 4675750, 111236 },
174 { 4764349, 111709 },
175 { 4854283, 112182 },
176 { 4945564, 112654 },
177 { 5038206, 113126 },
178 { 5132223, 113597 },
179 { 5227627, 114067 },
180 { 5324432, 114537 },
181 { 5422652, 115006 },
182 { 5522299, 115474 },
183 { 5623389, 115942 },
184 { 5725934, 116409 },
185 { 5829948, 116876 },
186 { 5935446, 117342 },
187 { 6042439, 117808 },
188 { 6150943, 118273 },
189 { 6260972, 118738 },
190 { 6372538, 119202 },
191 { 6485657, 119665 },
192 { 6600342, 120128 },
193 { 6716607, 120591 },
194 { 6834467, 121053 },
195 { 6953935, 121514 },
196 { 7075025, 121976 },
197 { 7197752, 122436 },
198 { 7322131, 122896 },
199 { 7448175, 123356 },
200 { 7575898, 123815 },
201 { 7705316, 124274 },
202 { 7836442, 124733 },
203 { 7969291, 125191 },
204 { 8103877, 125648 },
205 { 8240216, 126105 },
206 { 8378321, 126562 },
207 { 8518208, 127018 },
208 { 8659890, 127474 },
209 { 8803384, 127930 },
210 { 8948702, 128385 },
211 { 9095861, 128840 },
212 { 9244875, 129294 },
213 { 9395760, 129748 },
214 { 9548529, 130202 },
215 { 9703198, 130655 },
216 { 9859782, 131108 },
217 { 10018296, 131561 },
218 { 10178755, 132014 },
219 { 10341174, 132466 },
220 { 10505569, 132917 },
221 { 10671954, 133369 },
222 { 10840345, 133820 },
223 { 11010757, 134271 },
224 { 11183206, 134721 },
225 { 11357706, 135171 },
226 { 11534274, 135621 },
227 { 11712924, 136071 },
228 { 11893673, 136520 },
229 { 12076536, 136969 },
230 { 12261527, 137418 },
231 { 12448664, 137867 },
232 { 12637961, 138315 },
233 { 12829435, 138763 },
234 { 13023101, 139211 },
235 { 13218974, 139658 },
236 { 13417071, 140106 },
237 { 13617407, 140553 },
238 { 13819999, 140999 },
239 { 14024862, 141446 },
240 { 14232012, 141892 },
241 { 14441465, 142339 },
242 { 14653238, 142785 },
243 { 14867346, 143230 },
244 { 15083805, 143676 },
245 { 15302632, 144121 },
246 { 15523842, 144566 },
247 { 15747453, 145011 },
248 { 15973479, 145456 },
249 { 16201939, 145900 },
250 { 16432847, 146345 },
251 { 16666221, 146789 },
252 { 16902076, 147233 },
253 { 17140429, 147677 },
254 { 17381297, 148121 },
255 { 17624696, 148564 },
256 { 17870643, 149007 },
257 { 18119154, 149451 },
258 { 18370247, 149894 },
259 { 18623936, 150336 },
260 { 18880241, 150779 },
261 { 19139176, 151222 },
262 { 19400759, 151664 },
263 { 19665007, 152107 },
264 { 19931936, 152549 },
265 { 20201564, 152991 },
266 { 20473907, 153433 },
267 { 20748982, 153875 },
268 { 21026807, 154316 },
269 { 21307399, 154758 },
270 { 21590773, 155199 },
271 { 21876949, 155641 },
272 { 22165941, 156082 },
273 { 22457769, 156523 },
274 { 22752449, 156964 },
275 { 23049999, 157405 },
276 { 23350435, 157846 },
277 { 23653774, 158287 },
278 { 23960036, 158727 },
279 { 24269236, 159168 },
280 { 24581392, 159608 },
281 { 24896521, 160049 },
282 { 25214642, 160489 },
283 { 25535772, 160929 },
284 { 25859927, 161370 },
285 { 26187127, 161810 },
286 { 26517388, 162250 },
287 { 26850728, 162690 },
288 { 27187165, 163130 },
289 { 27526716, 163569 },
290 { 27869400, 164009 },
291 { 28215234, 164449 },
292 { 28564236, 164889 },
293 { 28916423, 165328 },
294 { 29271815, 165768 },
295 { 29630428, 166208 },
296 { 29992281, 166647 },
297 { 30357392, 167087 },
298 { 30725779, 167526 },
299 { 31097459, 167965 },
300 { 31472452, 168405 },
301 { 31850774, 168844 },
302 { 32232445, 169283 },
303 { 32617482, 169723 },
304 { 33005904, 170162 },
305 { 33397730, 170601 },
306 { 33792976, 171041 },
307 { 34191663, 171480 },
308 { 34593807, 171919 },
309 { 34999428, 172358 },
310 { 35408544, 172797 },
311 { 35821174, 173237 },
312 { 36237335, 173676 },
313 { 36657047, 174115 },
314 { 37080329, 174554 },
315 { 37507197, 174993 },
316 { 37937673, 175433 },
317 { 38371773, 175872 },
318 { 38809517, 176311 },
319 { 39250924, 176750 },
320 { 39696012, 177190 },
321 { 40144800, 177629 },
322 { 40597308, 178068 },
323 { 41053553, 178507 },
324 { 41513554, 178947 },
325 { 41977332, 179386 },
326 { 42444904, 179825 },
327 { 42916290, 180265 },
328 { 43391509, 180704 },
329 { 43870579, 181144 },
330 { 44353520, 181583 },
331 { 44840352, 182023 },
332 { 45331092, 182462 },
333 { 45825761, 182902 },
334 { 46324378, 183342 },
335 { 46826961, 183781 },
336 { 47333531, 184221 },
337 { 47844106, 184661 },
338 { 48358706, 185101 },
339 { 48877350, 185541 },
340 { 49400058, 185981 },
341 { 49926849, 186421 },
342 { 50457743, 186861 },
343 { 50992759, 187301 },
344 { 51531916, 187741 },
345 { 52075235, 188181 },
346 { 52622735, 188622 },
347 { 53174435, 189062 },
348 { 53730355, 189502 },
349 { 54290515, 189943 },
350 { 54854935, 190383 },
351 { 55423634, 190824 },
352 { 55996633, 191265 },
353 { 56573950, 191706 },
354 { 57155606, 192146 },
355 { 57741621, 192587 },
356 { 58332014, 193028 },
357 { 58926806, 193470 },
358 { 59526017, 193911 },
359 { 60129666, 194352 },
360 { 60737774, 194793 },
361 { 61350361, 195235 },
362 { 61967446, 195677 },
363 { 62589050, 196118 },
364 { 63215194, 196560 },
365 { 63845897, 197002 },
366 { 64481179, 197444 },
367 { 65121061, 197886 },
368 { 65765563, 198328 },
369 { 66414705, 198770 },
370 { 67068508, 199213 },
371 { 67726992, 199655 },
372 { 68390177, 200098 },
373 { 69058085, 200540 },
374 { 69730735, 200983 },
375 { 70408147, 201426 },
376 { 71090343, 201869 },
377 { 71777343, 202312 },
378 { 72469168, 202755 },
379 { 73165837, 203199 },
380 { 73867373, 203642 },
381 { 74573795, 204086 },
382 { 75285124, 204529 },
383 { 76001380, 204973 },
384 { 76722586, 205417 },
385 { 77448761, 205861 },
386 { 78179926, 206306 },
387 { 78916102, 206750 },
388 { 79657310, 207194 },
389 { 80403571, 207639 },
390 { 81154906, 208084 },
391 { 81911335, 208529 },
392 { 82672880, 208974 },
393 { 83439562, 209419 },
394 { 84211402, 209864 },
395 { 84988421, 210309 },
396 { 85770640, 210755 },
397 { 86558080, 211201 },
398 { 87350762, 211647 },
399 { 88148708, 212093 },
400 { 88951938, 212539 },
401 { 89760475, 212985 },
402 { 90574339, 213432 },
403 { 91393551, 213878 },
404 { 92218133, 214325 },
405 { 93048107, 214772 },
406 { 93883493, 215219 },
407 { 94724314, 215666 },
408 { 95570590, 216114 },
409 { 96422343, 216561 },
410 { 97279594, 217009 },
411 { 98142366, 217457 },
412 { 99010679, 217905 },
413 { 99884556, 218353 },
414 { 100764018, 218801 },
415 { 101649086, 219250 },
416 { 102539782, 219698 },
417 { 103436128, 220147 },
418 { 104338146, 220596 },
419 { 105245857, 221046 },
420 { 106159284, 221495 },
421 { 107078448, 221945 },
422 { 108003370, 222394 },
423 { 108934074, 222844 },
424 { 109870580, 223294 },
425 { 110812910, 223745 },
426 { 111761087, 224195 },
427 { 112715133, 224646 },
428 { 113675069, 225097 },
429 { 114640918, 225548 },
430 { 115612702, 225999 },
431 { 116590442, 226450 },
432 { 117574162, 226902 },
433 { 118563882, 227353 },
434 { 119559626, 227805 },
435 { 120561415, 228258 },
436 { 121569272, 228710 },
437 { 122583219, 229162 },
438 { 123603278, 229615 },
439 { 124629471, 230068 },
440 { 125661822, 230521 },
441 { 126700352, 230974 },
442 { 127745083, 231428 },
443 { 128796039, 231882 },
444 { 129853241, 232336 },
445 { 130916713, 232790 },
446 { 131986475, 233244 },
447 { 133062553, 233699 },
448 { 134144966, 234153 },
449 { 135233739, 234608 },
450 { 136328894, 235064 },
451 { 137430453, 235519 },
452 { 138538440, 235975 },
453 { 139652876, 236430 },
454 { 140773786, 236886 },
455 { 141901190, 237343 },
456 { 143035113, 237799 },
457 { 144175576, 238256 },
458 { 145322604, 238713 },
459 { 146476218, 239170 },
460 { 147636442, 239627 },
461 { 148803298, 240085 },
462 { 149976809, 240542 },
463 { 151156999, 241000 },
464 { 152343890, 241459 },
465 { 153537506, 241917 },
466 { 154737869, 242376 },
467 { 155945002, 242835 },
468 { 157158929, 243294 },
469 { 158379673, 243753 },
470 { 159607257, 244213 },
471 { 160841704, 244673 },
472 { 162083037, 245133 },
473 { 163331279, 245593 },
474 { 164586455, 246054 },
475 { 165848586, 246514 },
476 { 167117696, 246975 },
477 { 168393810, 247437 },
478 { 169676949, 247898 },
479 { 170967138, 248360 },
480 { 172264399, 248822 },
481 { 173568757, 249284 },
482 { 174880235, 249747 },
483 { 176198856, 250209 },
484 { 177524643, 250672 },
485 { 178857621, 251136 },
486 { 180197813, 251599 },
487 { 181545242, 252063 },
488 { 182899933, 252527 },
489 { 184261908, 252991 },
490 { 185631191, 253456 },
491 { 187007807, 253920 },
492 { 188391778, 254385 },
493 { 189783129, 254851 },
494 { 191181884, 255316 },
495 { 192588065, 255782 },
496 { 194001698, 256248 },
497 { 195422805, 256714 },
498 { 196851411, 257181 },
499 { 198287540, 257648 },
500 { 199731215, 258115 },
501 { 201182461, 258582 },
502 { 202641302, 259050 },
503 { 204107760, 259518 },
504 { 205581862, 259986 },
505 { 207063630, 260454 },
506 { 208553088, 260923 },
507 { 210050262, 261392 },
508 { 211555174, 261861 },
509 { 213067849, 262331 },
510 { 214588312, 262800 },
511 { 216116586, 263270 },
512 { 217652696, 263741 },
513 { 219196666, 264211 },
514 { 220748520, 264682 },
515 { 222308282, 265153 },
516 { 223875978, 265625 },
517 { 225451630, 266097 },
518 { 227035265, 266569 },
519 { 228626905, 267041 },
520 { 230226576, 267514 },
521 { 231834302, 267986 },
522 { 233450107, 268460 },
523 { 235074016, 268933 },
524 { 236706054, 269407 },
525 { 238346244, 269881 },
526 { 239994613, 270355 },
527 { 241651183, 270830 },
528 { 243315981, 271305 }
529};
530
531/* Calculate the send rate as per section 3.1 of RFC3448
532
533Returns send rate in bytes per second
534
535Integer maths and table lookups are used because floating point is not allowed in the kernel
536
537The function for Xcalc as per section 3.1 of RFC3448 is:
538
539X = s
540 -------------------------------------------------------------
541 R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2)))
542
543where
544X is the transmit rate in bytes/second
545s is the packet size in bytes
546R is the round trip time in seconds
547p is the loss event rate, between 0 and 1.0, expressed as the number of loss
548 events as a fraction of the number of packets transmitted
549t_RTO is the TCP retransmission timeout value in seconds
550b is the number of packets acknowledged by a single TCP acknowledgement
551
552We can assume that b = 1 and t_RTO = 4 * R. With this the equation becomes:
553
554X = s
555 -----------------------------------------------------------------------
556 R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
557
558
559which we can break down into:
560
561X = s
562 --------
563 R * f(p)
564
565where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
566
567Function parameters:
568s - packet size in bytes
569R - RTT in usecs
570p - loss rate (decimal fraction multiplied by 1,000,000)
571
572Returns Xcalc in bytes per second
573
574DON'T alter this code unless you run test cases against it, as the maths
575has been arranged to prevent underflow/overflow.
576
577*/
578u32 tfrc_calc_x(u16 s, u32 R, u32 p)
579{
580 int index;
581 u32 f;
582 u64 tmp1, tmp2;
583
584 if (p < TFRC_CALC_X_SPLIT)
585 index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
586 else
587 index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
588
589 if (index < 0)
590 /* p should be 0 unless there is a bug in my code */
591 index = 0;
592
593 if (R == 0)
594 R = 1; /* RTT can't be zero or else divide by zero */
595
596 BUG_ON(index >= TFRC_CALC_X_ARRSIZE);
597
598 if (p >= TFRC_CALC_X_SPLIT)
599 f = tfrc_calc_x_lookup[index][0];
600 else
601 f = tfrc_calc_x_lookup[index][1];
602
603 tmp1 = ((u64)s * 100000000);
604 tmp2 = ((u64)R * (u64)f);
605 do_div(tmp2, 10000);
606 do_div(tmp1, tmp2);
607 /* Don't alter above math unless you test due to overflow on 32 bit */
608
609 return (u32)tmp1;
610}
611
612EXPORT_SYMBOL_GPL(tfrc_calc_x);
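A quick user-space cross-check of the fixed-point maths may help when touching this code: the table stores f(p) scaled by 1,000,000, so the scaled integer arithmetic in tfrc_calc_x() should track a floating-point evaluation of the RFC 3448 formula. A minimal sketch (the values for s, R and p are arbitrary examples; this is not part of the patch):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* f(p) per RFC 3448 section 3.1, with b = 1 and t_RTO = 4 * R */
static double tfrc_f(double p)
{
	return sqrt(2.0 * p / 3.0) +
	       12.0 * sqrt(3.0 * p / 8.0) * p * (1.0 + 32.0 * p * p);
}

int main(void)
{
	const uint16_t s = 1460;	/* packet size in bytes */
	const uint32_t R = 100000;	/* RTT in usecs (100ms) */
	const double p = 0.01;		/* 1% loss event rate */

	/* the kernel looks f up in the table; compute it directly here */
	const uint32_t f = (uint32_t)(tfrc_f(p) * 1000000.0);

	/* the same scaled 64-bit arithmetic as tfrc_calc_x() */
	const uint64_t tmp1 = (uint64_t)s * 100000000ULL;
	const uint64_t tmp2 = (uint64_t)R * f / 10000;

	/* both should print roughly 164000 bytes/s for these inputs */
	printf("fixed point: %llu, float: %.0f\n",
	       (unsigned long long)(tmp1 / tmp2),
	       s / ((R / 1e6) * tfrc_f(p)));
	return 0;
}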
613
614/*
615 * args: fvalue - function value to match
616 * returns: p closest to that value
617 *
618 * both fvalue and p are scaled by 1,000,000 to allow integer arithmetic
619 */
620u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
621{
622 int ctr = 0;
623 int small;
624
625 if (fvalue < tfrc_calc_x_lookup[0][1])
626 return 0;
627
628 if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
629 small = 1;
630 else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0])
631 return 1000000;
632 else
633 small = 0;
634
635 while (fvalue > tfrc_calc_x_lookup[ctr][small])
636 ctr++;
637
638 if (small)
639 return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE;
640 else
641 return 1000000 * ctr / TFRC_CALC_X_ARRSIZE;
642}
643
644EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
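Note the asymmetry between the two lookups: tfrc_calc_x() maps p to entry p / step - 1, while the scan above stops at the first entry not less than fvalue and returns ctr * step. A sketch of the resulting round-trip property, assuming the TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE constants defined with the table earlier in this file:

/* for the "small p" column (0-based index i):
 *
 *	p_in  = TFRC_CALC_X_SPLIT * (i + 1) / TFRC_CALC_X_ARRSIZE;
 *	p_out = tfrc_calc_x_reverse_lookup(tfrc_calc_x_lookup[i][1]);
 *
 * yields p_out == TFRC_CALC_X_SPLIT * i / TFRC_CALC_X_ARRSIZE, i.e.
 * the round trip recovers p_in to within exactly one table step.
 */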
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
new file mode 100644
index 000000000000..33456c0d5937
--- /dev/null
+++ b/net/dccp/dccp.h
@@ -0,0 +1,493 @@
1#ifndef _DCCP_H
2#define _DCCP_H
3/*
4 * net/dccp/dccp.h
5 *
6 * An implementation of the DCCP protocol
7 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
8 * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15#include <linux/config.h>
16#include <linux/dccp.h>
17#include <net/snmp.h>
18#include <net/sock.h>
19#include <net/tcp.h>
20
21#ifdef CONFIG_IP_DCCP_DEBUG
22extern int dccp_debug;
23
24#define dccp_pr_debug(format, a...) \
25 do { if (dccp_debug) \
26 printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \
27 } while (0)
28#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) \
29 printk(format, ##a); } while (0)
30#else
31#define dccp_pr_debug(format, a...)
32#define dccp_pr_debug_cat(format, a...)
33#endif
34
35extern struct inet_hashinfo dccp_hashinfo;
36
37extern atomic_t dccp_orphan_count;
38extern int dccp_tw_count;
39extern void dccp_tw_deschedule(struct inet_timewait_sock *tw);
40
41extern void dccp_time_wait(struct sock *sk, int state, int timeo);
42
43/* FIXME: Right size this */
44#define DCCP_MAX_OPT_LEN 128
45
46#define DCCP_MAX_PACKET_HDR 32
47
48#define MAX_DCCP_HEADER (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER)
49
50#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
51 * state, about 60 seconds */
52
53/* draft-ietf-dccp-spec-11.txt initial RTO value */
54#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
55
56/* Maximal interval between probes for local resources. */
57#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
58
59#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
60
61extern struct proto dccp_v4_prot;
62
63/* is seq1 < seq2 ? */
64static inline int before48(const u64 seq1, const u64 seq2)
65{
66 return (s64)((seq1 << 16) - (seq2 << 16)) < 0;
67}
68
69/* is seq1 > seq2 ? */
70static inline int after48(const u64 seq1, const u64 seq2)
71{
72 return (s64)((seq2 << 16) - (seq1 << 16)) < 0;
73}
74
75/* is seq2 <= seq1 <= seq3 ? */
76static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
77{
78 return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
79}
80
81static inline u64 max48(const u64 seq1, const u64 seq2)
82{
83 return after48(seq1, seq2) ? seq1 : seq2;
84}
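The shift trick used by these helpers merits a worked example: shifting a 48-bit sequence number left by 16 bits puts it in the top of a u64, so the subtraction wraps modulo 2^48 and the sign of the (signed) difference gives the circular ordering. A small user-space sketch:

#include <assert.h>
#include <stdint.h>

static int before48_demo(uint64_t seq1, uint64_t seq2)
{
	return (int64_t)((seq1 << 16) - (seq2 << 16)) < 0;
}

int main(void)
{
	assert(before48_demo(1, 2));
	/* wraparound: the highest 48-bit seqno is "before" 0... */
	assert(before48_demo((1ULL << 48) - 1, 0));
	/* ...but 0 is not before it */
	assert(!before48_demo(0, (1ULL << 48) - 1));
	return 0;
}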
85
86enum {
87 DCCP_MIB_NUM = 0,
88 DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
89 DCCP_MIB_ESTABRESETS, /* EstabResets */
90 DCCP_MIB_CURRESTAB, /* CurrEstab */
91 DCCP_MIB_OUTSEGS, /* OutSegs */
92 DCCP_MIB_OUTRSTS,
93 DCCP_MIB_ABORTONTIMEOUT,
94 DCCP_MIB_TIMEOUTS,
95 DCCP_MIB_ABORTFAILED,
96 DCCP_MIB_PASSIVEOPENS,
97 DCCP_MIB_ATTEMPTFAILS,
98 DCCP_MIB_OUTDATAGRAMS,
99 DCCP_MIB_INERRS,
100 DCCP_MIB_OPTMANDATORYERROR,
101 DCCP_MIB_INVALIDOPT,
102 __DCCP_MIB_MAX
103};
104
105#define DCCP_MIB_MAX __DCCP_MIB_MAX
106struct dccp_mib {
107 unsigned long mibs[DCCP_MIB_MAX];
108} __SNMP_MIB_ALIGN__;
109
110DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
111#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
112#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
113#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
114#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
115#define DCCP_ADD_STATS_BH(field, val) \
116 SNMP_ADD_STATS_BH(dccp_statistics, field, val)
117#define DCCP_ADD_STATS_USER(field, val) \
118 SNMP_ADD_STATS_USER(dccp_statistics, field, val)
119
120extern int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb);
121extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb);
122
123extern int dccp_send_response(struct sock *sk);
124extern void dccp_send_ack(struct sock *sk);
125extern void dccp_send_delayed_ack(struct sock *sk);
126extern void dccp_send_sync(struct sock *sk, const u64 seq,
127 const enum dccp_pkt_type pkt_type);
128
129extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo);
130extern void dccp_write_space(struct sock *sk);
131
132extern void dccp_init_xmit_timers(struct sock *sk);
133static inline void dccp_clear_xmit_timers(struct sock *sk)
134{
135 inet_csk_clear_xmit_timers(sk);
136}
137
138extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
139
140extern const char *dccp_packet_name(const int type);
141extern const char *dccp_state_name(const int state);
142
143static inline void dccp_set_state(struct sock *sk, const int state)
144{
145 const int oldstate = sk->sk_state;
146
147 dccp_pr_debug("%s(%p) %-10.10s -> %s\n",
148 dccp_role(sk), sk,
149 dccp_state_name(oldstate), dccp_state_name(state));
150 WARN_ON(state == oldstate);
151
152 switch (state) {
153 case DCCP_OPEN:
154 if (oldstate != DCCP_OPEN)
155 DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
156 break;
157
158 case DCCP_CLOSED:
159 if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN)
160 DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
161
162 sk->sk_prot->unhash(sk);
163 if (inet_csk(sk)->icsk_bind_hash != NULL &&
164 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
165 inet_put_port(&dccp_hashinfo, sk);
166 /* fall through */
167 default:
168 if (oldstate == DCCP_OPEN)
169 DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
170 }
171
172 /* Change state AFTER socket is unhashed to avoid closed
173 * socket sitting in hash tables.
174 */
175 sk->sk_state = state;
176}
177
178static inline void dccp_done(struct sock *sk)
179{
180 dccp_set_state(sk, DCCP_CLOSED);
181 dccp_clear_xmit_timers(sk);
182
183 sk->sk_shutdown = SHUTDOWN_MASK;
184
185 if (!sock_flag(sk, SOCK_DEAD))
186 sk->sk_state_change(sk);
187 else
188 inet_csk_destroy_sock(sk);
189}
190
191static inline void dccp_openreq_init(struct request_sock *req,
192 struct dccp_sock *dp,
193 struct sk_buff *skb)
194{
195 /*
196 * FIXME: fill in the other req fields from the DCCP options
197 * received
198 */
199 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
200 inet_rsk(req)->acked = 0;
201 req->rcv_wnd = 0;
202}
203
204extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
205
206extern struct sock *dccp_create_openreq_child(struct sock *sk,
207 const struct request_sock *req,
208 const struct sk_buff *skb);
209
210extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
211
212extern void dccp_v4_err(struct sk_buff *skb, u32);
213
214extern int dccp_v4_rcv(struct sk_buff *skb);
215
216extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
217 struct sk_buff *skb,
218 struct request_sock *req,
219 struct dst_entry *dst);
220extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
221 struct request_sock *req,
222 struct request_sock **prev);
223
224extern int dccp_child_process(struct sock *parent, struct sock *child,
225 struct sk_buff *skb);
226extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
227 struct dccp_hdr *dh, unsigned len);
228extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
229 const struct dccp_hdr *dh, const unsigned len);
230
231extern void dccp_close(struct sock *sk, long timeout);
232extern struct sk_buff *dccp_make_response(struct sock *sk,
233 struct dst_entry *dst,
234 struct request_sock *req);
235extern struct sk_buff *dccp_make_reset(struct sock *sk,
236 struct dst_entry *dst,
237 enum dccp_reset_codes code);
238
239extern int dccp_connect(struct sock *sk);
240extern int dccp_disconnect(struct sock *sk, int flags);
241extern int dccp_getsockopt(struct sock *sk, int level, int optname,
242 char __user *optval, int __user *optlen);
243extern int dccp_setsockopt(struct sock *sk, int level, int optname,
244 char __user *optval, int optlen);
245extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
246extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
247 struct msghdr *msg, size_t size);
248extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
249 struct msghdr *msg, size_t len, int nonblock,
250 int flags, int *addr_len);
251extern void dccp_shutdown(struct sock *sk, int how);
252
253extern int dccp_v4_checksum(const struct sk_buff *skb,
254 const u32 saddr, const u32 daddr);
255
256extern int dccp_v4_send_reset(struct sock *sk,
257 enum dccp_reset_codes code);
258extern void dccp_send_close(struct sock *sk, const int active);
259
260struct dccp_skb_cb {
261 __u8 dccpd_type;
262 __u8 dccpd_reset_code;
263 __u8 dccpd_service;
264 __u8 dccpd_ccval;
265 __u64 dccpd_seq;
266 __u64 dccpd_ack_seq;
267 int dccpd_opt_len;
268};
269
270#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
271
272static inline int dccp_non_data_packet(const struct sk_buff *skb)
273{
274 const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
275
276 return type == DCCP_PKT_ACK ||
277 type == DCCP_PKT_CLOSE ||
278 type == DCCP_PKT_CLOSEREQ ||
279 type == DCCP_PKT_RESET ||
280 type == DCCP_PKT_SYNC ||
281 type == DCCP_PKT_SYNCACK;
282}
283
284static inline int dccp_packet_without_ack(const struct sk_buff *skb)
285{
286 const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
287
288 return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
289}
290
291#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1)
292#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2)
293
294static inline void dccp_set_seqno(u64 *seqno, u64 value)
295{
296 if (value > DCCP_MAX_SEQNO)
297 value -= DCCP_MAX_SEQNO + 1;
298 *seqno = value;
299}
300
301static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2)
302{
303 return ((seqno2 << 16) - (seqno1 << 16)) >> 16;
304}
305
306static inline void dccp_inc_seqno(u64 *seqno)
307{
308 if (++*seqno > DCCP_MAX_SEQNO)
309 *seqno = 0;
310}
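A worked example of the wrap behaviour: with *seqno == DCCP_MAX_SEQNO, dccp_inc_seqno() wraps it to 0, and dccp_delta_seqno(DCCP_MAX_SEQNO, 1) == 2, since the subtraction is performed in the top 48 bits of a u64 (so it wraps modulo 2^48) before being shifted back down.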
311
312static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
313{
314 struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
315 sizeof(*dh));
316
317#if defined(__LITTLE_ENDIAN_BITFIELD)
318 dh->dccph_seq = htonl((gss >> 32)) >> 8;
319#elif defined(__BIG_ENDIAN_BITFIELD)
320 dh->dccph_seq = htonl((gss >> 32));
321#else
322#error "Adjust your <asm/byteorder.h> defines"
323#endif
324 dhx->dccph_seq_low = htonl(gss & 0xffffffff);
325}
326
327static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
328 const u64 gsr)
329{
330#if defined(__LITTLE_ENDIAN_BITFIELD)
331 dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8;
332#elif defined(__BIG_ENDIAN_BITFIELD)
333 dhack->dccph_ack_nr_high = htonl((gsr >> 32));
334#else
335#error "Adjust your <asm/byteorder.h> defines"
336#endif
337 dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
338}
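A worked example of the 48-bit split, taking the big-endian branch for simplicity (the little-endian branch additionally shifts right by 8 to place the value correctly within the header's bitfield layout): for gsr == 0x0000123456789ABC, gsr >> 32 == 0x1234 ends up in dccph_ack_nr_high and gsr & 0xffffffff == 0x56789ABC in dccph_ack_nr_low.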
339
340static inline void dccp_update_gsr(struct sock *sk, u64 seq)
341{
342 struct dccp_sock *dp = dccp_sk(sk);
343
344 dp->dccps_gsr = seq;
345 dccp_set_seqno(&dp->dccps_swl,
346 (dp->dccps_gsr + 1 -
347 (dp->dccps_options.dccpo_sequence_window / 4)));
348 dccp_set_seqno(&dp->dccps_swh,
349 (dp->dccps_gsr +
350 (3 * dp->dccps_options.dccpo_sequence_window) / 4));
351}
352
353static inline void dccp_update_gss(struct sock *sk, u64 seq)
354{
355 struct dccp_sock *dp = dccp_sk(sk);
356
357 dp->dccps_awh = dp->dccps_gss = seq;
358 dccp_set_seqno(&dp->dccps_awl,
359 (dp->dccps_gss -
360 dp->dccps_options.dccpo_sequence_window + 1));
361}
362
363extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb);
364extern void dccp_insert_option_elapsed_time(struct sock *sk,
365 struct sk_buff *skb,
366 u32 elapsed_time);
367extern void dccp_insert_option_timestamp(struct sock *sk,
368 struct sk_buff *skb);
369extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
370 unsigned char option,
371 const void *value, unsigned char len);
372
373extern struct socket *dccp_ctl_socket;
374
375#define DCCP_ACKPKTS_STATE_RECEIVED 0
376#define DCCP_ACKPKTS_STATE_ECN_MARKED (1 << 6)
377#define DCCP_ACKPKTS_STATE_NOT_RECEIVED (3 << 6)
378
379#define DCCP_ACKPKTS_STATE_MASK 0xC0 /* 11000000 */
380#define DCCP_ACKPKTS_LEN_MASK 0x3F /* 00111111 */
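Each byte of the ack vector thus packs a 2-bit state in its top bits and a 6-bit run length below it, following the encoding in the draft's Appendix A. Decoding one byte with the masks above, for illustration:

	u8 b = 0xc5;
	u8 state = b & DCCP_ACKPKTS_STATE_MASK;	/* 0xc0: not received */
	u8 len = b & DCCP_ACKPKTS_LEN_MASK;	/* 5: run length */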
381
382/** struct dccp_ackpkts - acknowledgeable packets
383 *
384 * This data structure is the one defined in the DCCP draft
385 * Appendix A.
386 *
387 * @dccpap_buf_head - circular buffer head
388 * @dccpap_buf_tail - circular buffer tail
389 * @dccpap_buf_ackno - ack # of the most recent packet acknowledgeable in the
390 * buffer (i.e. %dccpap_buf_head)
391 * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
392 * by the buffer with State 0
393 *
394 * Additionally, the HC-Receiver must keep some information about the
395 * Ack Vectors it has recently sent. For each packet sent carrying an
396 * Ack Vector, it remembers four variables:
397 *
398 * @dccpap_ack_seqno - the Sequence Number used for the packet
399 * (HC-Receiver seqno)
400 * @dccpap_ack_ptr - the value of buf_head at the time of acknowledgement.
401 * @dccpap_ack_ackno - the Acknowledgement Number used for the packet
402 * (HC-Sender seqno)
403 * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
404 *
405 * @dccpap_buf_len - circular buffer length
406 * @dccpap_time - the time in usecs
407 * @dccpap_buf - circular buffer of acknowledgeable packets
408 */
409struct dccp_ackpkts {
410 unsigned int dccpap_buf_head;
411 unsigned int dccpap_buf_tail;
412 u64 dccpap_buf_ackno;
413 u64 dccpap_ack_seqno;
414 u64 dccpap_ack_ackno;
415 unsigned int dccpap_ack_ptr;
416 unsigned int dccpap_buf_vector_len;
417 unsigned int dccpap_ack_vector_len;
418 unsigned int dccpap_buf_len;
419 struct timeval dccpap_time;
420 u8 dccpap_buf_nonce;
421 u8 dccpap_ack_nonce;
422 u8 dccpap_buf[0];
423};
424
425extern struct dccp_ackpkts *
426 dccp_ackpkts_alloc(unsigned int len,
427 const unsigned int __nocast priority);
428extern void dccp_ackpkts_free(struct dccp_ackpkts *ap);
429extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state);
430extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap,
431 struct sock *sk, u64 ackno);
432
433static inline suseconds_t timeval_usecs(const struct timeval *tv)
434{
435 return tv->tv_sec * USEC_PER_SEC + tv->tv_usec;
436}
437
438static inline suseconds_t timeval_delta(const struct timeval *large,
439 const struct timeval *small)
440{
441 time_t secs = large->tv_sec - small->tv_sec;
442 suseconds_t usecs = large->tv_usec - small->tv_usec;
443
444 if (usecs < 0) {
445 secs--;
446 usecs += USEC_PER_SEC;
447 }
448 return secs * USEC_PER_SEC + usecs;
449}
450
451static inline void timeval_add_usecs(struct timeval *tv,
452 const suseconds_t usecs)
453{
454 tv->tv_usec += usecs;
455 while (tv->tv_usec >= USEC_PER_SEC) {
456 tv->tv_sec++;
457 tv->tv_usec -= USEC_PER_SEC;
458 }
459}
460
461static inline void timeval_sub_usecs(struct timeval *tv,
462 const suseconds_t usecs)
463{
464 tv->tv_usec -= usecs;
465 while (tv->tv_usec < 0) {
466 tv->tv_sec--;
467 tv->tv_usec += USEC_PER_SEC;
468 }
469}
470
471/*
472 * Returns the difference in usecs between timeval
473 * passed in and current time
474 */
475static inline suseconds_t timeval_now_delta(const struct timeval *tv)
476{
477 struct timeval now;
478 do_gettimeofday(&now);
479 return timeval_delta(&now, tv);
480}
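A worked example for these helpers: with large = { .tv_sec = 5, .tv_usec = 200 } and small = { .tv_sec = 3, .tv_usec = 900000 }, timeval_delta() first computes secs = 2 and usecs = -899800, then borrows one second from secs, returning 1 * USEC_PER_SEC + 100200 = 1100200 usecs.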
481
482#ifdef CONFIG_IP_DCCP_DEBUG
483extern void dccp_ackvector_print(const u64 ackno,
484 const unsigned char *vector, int len);
485extern void dccp_ackpkts_print(const struct dccp_ackpkts *ap);
486#else
487static inline void dccp_ackvector_print(const u64 ackno,
488 const unsigned char *vector,
489 int len) { }
490static inline void dccp_ackpkts_print(const struct dccp_ackpkts *ap) { }
491#endif
492
493#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
new file mode 100644
index 000000000000..f675d8e642d3
--- /dev/null
+++ b/net/dccp/diag.c
@@ -0,0 +1,71 @@
1/*
2 * net/dccp/diag.c
3 *
4 * An implementation of the DCCP protocol
5 * Arnaldo Carvalho de Melo <acme@mandriva.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/config.h>
13
14#include <linux/module.h>
15#include <linux/inet_diag.h>
16
17#include "ccid.h"
18#include "dccp.h"
19
20static void dccp_get_info(struct sock *sk, struct tcp_info *info)
21{
22 struct dccp_sock *dp = dccp_sk(sk);
23 const struct inet_connection_sock *icsk = inet_csk(sk);
24
25 memset(info, 0, sizeof(*info));
26
27 info->tcpi_state = sk->sk_state;
28 info->tcpi_retransmits = icsk->icsk_retransmits;
29 info->tcpi_probes = icsk->icsk_probes_out;
30 info->tcpi_backoff = icsk->icsk_backoff;
31 info->tcpi_pmtu = dp->dccps_pmtu_cookie;
32
33 if (dp->dccps_options.dccpo_send_ack_vector)
34 info->tcpi_options |= TCPI_OPT_SACK;
35
36 ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
37 ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
38}
39
40static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
41 void *_info)
42{
43 r->idiag_rqueue = r->idiag_wqueue = 0;
44
45 if (_info != NULL)
46 dccp_get_info(sk, _info);
47}
48
49static struct inet_diag_handler dccp_diag_handler = {
50 .idiag_hashinfo = &dccp_hashinfo,
51 .idiag_get_info = dccp_diag_get_info,
52 .idiag_type = DCCPDIAG_GETSOCK,
53 .idiag_info_size = sizeof(struct tcp_info),
54};
55
56static int __init dccp_diag_init(void)
57{
58 return inet_diag_register(&dccp_diag_handler);
59}
60
61static void __exit dccp_diag_fini(void)
62{
63 inet_diag_unregister(&dccp_diag_handler);
64}
65
66module_init(dccp_diag_init);
67module_exit(dccp_diag_fini);
68
69MODULE_LICENSE("GPL");
70MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
71MODULE_DESCRIPTION("DCCP inet_diag handler");
diff --git a/net/dccp/input.c b/net/dccp/input.c
new file mode 100644
index 000000000000..ef29cef1dafe
--- /dev/null
+++ b/net/dccp/input.c
@@ -0,0 +1,600 @@
1/*
2 * net/dccp/input.c
3 *
4 * An implementation of the DCCP protocol
5 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/config.h>
14#include <linux/dccp.h>
15#include <linux/skbuff.h>
16
17#include <net/sock.h>
18
19#include "ccid.h"
20#include "dccp.h"
21
22static void dccp_fin(struct sock *sk, struct sk_buff *skb)
23{
24 sk->sk_shutdown |= RCV_SHUTDOWN;
25 sock_set_flag(sk, SOCK_DONE);
26 __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
27 __skb_queue_tail(&sk->sk_receive_queue, skb);
28 skb_set_owner_r(skb, sk);
29 sk->sk_data_ready(sk, 0);
30}
31
32static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
33{
34 dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED);
35 dccp_fin(sk, skb);
36 dccp_set_state(sk, DCCP_CLOSED);
37 sk_wake_async(sk, 1, POLL_HUP);
38}
39
40static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
41{
42 /*
43 * Step 7: Check for unexpected packet types
44 * If (S.is_server and P.type == CloseReq)
45 * Send Sync packet acknowledging P.seqno
46 * Drop packet and return
47 */
48 if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
49 dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
50 return;
51 }
52
53 dccp_set_state(sk, DCCP_CLOSING);
54 dccp_send_close(sk, 0);
55}
56
57static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
58{
59 struct dccp_sock *dp = dccp_sk(sk);
60
61 if (dp->dccps_options.dccpo_send_ack_vector)
62 dccp_ackpkts_check_rcv_ackno(dp->dccps_hc_rx_ackpkts, sk,
63 DCCP_SKB_CB(skb)->dccpd_ack_seq);
64}
65
66static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
67{
68 const struct dccp_hdr *dh = dccp_hdr(skb);
69 struct dccp_sock *dp = dccp_sk(sk);
70 u64 lswl, lawl;
71
72 /*
73 * Step 5: Prepare sequence numbers for Sync
74 * If P.type == Sync or P.type == SyncAck,
75 * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
76 * / * P is valid, so update sequence number variables
77 * accordingly. After this update, P will pass the tests
78 * in Step 6. A SyncAck is generated if necessary in
79 * Step 15 * /
80 * Update S.GSR, S.SWL, S.SWH
81 * Otherwise,
82 * Drop packet and return
83 */
84 if (dh->dccph_type == DCCP_PKT_SYNC ||
85 dh->dccph_type == DCCP_PKT_SYNCACK) {
86 if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
87 dp->dccps_awl, dp->dccps_awh) &&
88 !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl))
89 dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
90 else
91 return -1;
92 }
93
94 /*
95 * Step 6: Check sequence numbers
96 * Let LSWL = S.SWL and LAWL = S.AWL
97 * If P.type == CloseReq or P.type == Close or P.type == Reset,
98 * LSWL := S.GSR + 1, LAWL := S.GAR
99 * If LSWL <= P.seqno <= S.SWH
100 * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
101 * Update S.GSR, S.SWL, S.SWH
102 * If P.type != Sync,
103 * Update S.GAR
104 * Otherwise,
105 * Send Sync packet acknowledging P.seqno
106 * Drop packet and return
107 */
108 lswl = dp->dccps_swl;
109 lawl = dp->dccps_awl;
110
111 if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
112 dh->dccph_type == DCCP_PKT_CLOSE ||
113 dh->dccph_type == DCCP_PKT_RESET) {
114 lswl = dp->dccps_gsr;
115 dccp_inc_seqno(&lswl);
116 lawl = dp->dccps_gar;
117 }
118
119 if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) &&
120 (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ ||
121 between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
122 lawl, dp->dccps_awh))) {
123 dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
124
125 if (dh->dccph_type != DCCP_PKT_SYNC &&
126 (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
127 DCCP_PKT_WITHOUT_ACK_SEQ))
128 dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq;
129 } else {
130 LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed for %s packet, "
131 "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
132 "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
133 "sending SYNC...\n",
134 dccp_packet_name(dh->dccph_type),
135 (unsigned long long) lswl,
136 (unsigned long long)
137 DCCP_SKB_CB(skb)->dccpd_seq,
138 (unsigned long long) dp->dccps_swh,
139 (DCCP_SKB_CB(skb)->dccpd_ack_seq ==
140 DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists",
141 (unsigned long long) lawl,
142 (unsigned long long)
143 DCCP_SKB_CB(skb)->dccpd_ack_seq,
144 (unsigned long long) dp->dccps_awh);
145 dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
146 return -1;
147 }
148
149 return 0;
150}
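A concrete instance of the Step 6 check, using the window placement from dccp_update_gsr() in dccp.h: with S.GSR == 1000 and a sequence window W == 100, S.SWL == 1000 + 1 - W/4 == 976 and S.SWH == 1000 + 3*W/4 == 1075; a Data packet with P.seqno == 980 then passes, while one with P.seqno == 1100 falls outside the window and takes the Sync-and-drop branch above.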
151
152int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
153 const struct dccp_hdr *dh, const unsigned len)
154{
155 struct dccp_sock *dp = dccp_sk(sk);
156
157 if (dccp_check_seqno(sk, skb))
158 goto discard;
159
160 if (dccp_parse_options(sk, skb))
161 goto discard;
162
163 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
164 dccp_event_ack_recv(sk, skb);
165
166 /*
167 * FIXME: check ECN to see if we should use
168 * DCCP_ACKPKTS_STATE_ECN_MARKED
169 */
170 if (dp->dccps_options.dccpo_send_ack_vector) {
171 struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
172
173 if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
174 DCCP_SKB_CB(skb)->dccpd_seq,
175 DCCP_ACKPKTS_STATE_RECEIVED)) {
176 LIMIT_NETDEBUG(KERN_WARNING "DCCP: acknowledgeable "
177 "packets buffer full!\n");
178 ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
179 inet_csk_schedule_ack(sk);
180 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
181 TCP_DELACK_MIN,
182 DCCP_RTO_MAX);
183 goto discard;
184 }
185
186 /*
187 * FIXME: this activation is probably wrong; we have to study the
188 * TCP delack machinery more and how it fits into the DCCP draft,
189 * but for now it kinda "works" 8)
190 */
191 if (!inet_csk_ack_scheduled(sk)) {
192 inet_csk_schedule_ack(sk);
193 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ,
194 DCCP_RTO_MAX);
195 }
196 }
197
198 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
199 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
200
201 switch (dccp_hdr(skb)->dccph_type) {
202 case DCCP_PKT_DATAACK:
203 case DCCP_PKT_DATA:
204 /*
205 * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED
206 * option if it is.
207 */
208 __skb_pull(skb, dh->dccph_doff * 4);
209 __skb_queue_tail(&sk->sk_receive_queue, skb);
210 skb_set_owner_r(skb, sk);
211 sk->sk_data_ready(sk, 0);
212 return 0;
213 case DCCP_PKT_ACK:
214 goto discard;
215 case DCCP_PKT_RESET:
216 /*
217 * Step 9: Process Reset
218 * If P.type == Reset,
219 * Tear down connection
220 * S.state := TIMEWAIT
221 * Set TIMEWAIT timer
222 * Drop packet and return
223 */
224 dccp_fin(sk, skb);
225 dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
226 return 0;
227 case DCCP_PKT_CLOSEREQ:
228 dccp_rcv_closereq(sk, skb);
229 goto discard;
230 case DCCP_PKT_CLOSE:
231 dccp_rcv_close(sk, skb);
232 return 0;
233 case DCCP_PKT_REQUEST:
234 /* Step 7
235 * or (S.is_server and P.type == Response)
236 * or (S.is_client and P.type == Request)
237 * or (S.state >= OPEN and P.type == Request
238 * and P.seqno >= S.OSR)
239 * or (S.state >= OPEN and P.type == Response
240 * and P.seqno >= S.OSR)
241 * or (S.state == RESPOND and P.type == Data),
242 * Send Sync packet acknowledging P.seqno
243 * Drop packet and return
244 */
245 if (dp->dccps_role != DCCP_ROLE_LISTEN)
246 goto send_sync;
247 goto check_seq;
248 case DCCP_PKT_RESPONSE:
249 if (dp->dccps_role != DCCP_ROLE_CLIENT)
250 goto send_sync;
251check_seq:
252 if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) {
253send_sync:
254 dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
255 DCCP_PKT_SYNC);
256 }
257 break;
258 case DCCP_PKT_SYNC:
259 dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
260 DCCP_PKT_SYNCACK);
261 /*
262 * From the draft:
263 *
264 * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
265 * MAY have non-zero-length application data areas, whose
266 * contents * receivers MUST ignore.
267 */
268 goto discard;
269 }
270
271 DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
272discard:
273 __kfree_skb(skb);
274 return 0;
275}
276
277static int dccp_rcv_request_sent_state_process(struct sock *sk,
278 struct sk_buff *skb,
279 const struct dccp_hdr *dh,
280 const unsigned len)
281{
282 /*
283 * Step 4: Prepare sequence numbers in REQUEST
284 * If S.state == REQUEST,
285 * If (P.type == Response or P.type == Reset)
286 * and S.AWL <= P.ackno <= S.AWH,
287 * / * Set sequence number variables corresponding to the
288 * other endpoint, so P will pass the tests in Step 6 * /
289 * Set S.GSR, S.ISR, S.SWL, S.SWH
290 * / * Response processing continues in Step 10; Reset
291 * processing continues in Step 9 * /
292 */
293 if (dh->dccph_type == DCCP_PKT_RESPONSE) {
294 const struct inet_connection_sock *icsk = inet_csk(sk);
295 struct dccp_sock *dp = dccp_sk(sk);
296
297 /* Stop the REQUEST timer */
298 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
299 BUG_TRAP(sk->sk_send_head != NULL);
300 __kfree_skb(sk->sk_send_head);
301 sk->sk_send_head = NULL;
302
303 if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
304 dp->dccps_awl, dp->dccps_awh)) {
305 dccp_pr_debug("invalid ackno: S.AWL=%llu, "
306 "P.ackno=%llu, S.AWH=%llu \n",
307 (unsigned long long)dp->dccps_awl,
308 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
309 (unsigned long long)dp->dccps_awh);
310 goto out_invalid_packet;
311 }
312
313 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
314 dccp_update_gsr(sk, dp->dccps_isr);
315 /*
316 * SWL and AWL are initially adjusted so that they are not less than
317 * the initial Sequence Numbers received and sent, respectively:
318 * SWL := max(GSR + 1 - floor(W/4), ISR),
319 * AWL := max(GSS - W' + 1, ISS).
320 * These adjustments MUST be applied only at the beginning of the
321 * connection.
322 *
323 * AWL was adjusted in dccp_v4_connect -acme
324 */
325 dccp_set_seqno(&dp->dccps_swl,
326 max48(dp->dccps_swl, dp->dccps_isr));
327
328 if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 ||
329 ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) {
330 ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
331 ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
332 /* FIXME: send appropriate RESET code */
333 goto out_invalid_packet;
334 }
335
336 dccp_sync_mss(sk, dp->dccps_pmtu_cookie);
337
338 /*
339 * Step 10: Process REQUEST state (second part)
340 * If S.state == REQUEST,
341 * / * If we get here, P is a valid Response from the
342 * server (see Step 4), and we should move to
343 * PARTOPEN state. PARTOPEN means send an Ack,
344 * don't send Data packets, retransmit Acks
345 * periodically, and always include any Init Cookie
346 * from the Response * /
347 * S.state := PARTOPEN
348 * Set PARTOPEN timer
349 * Continue with S.state == PARTOPEN
350 * / * Step 12 will send the Ack completing the
351 * three-way handshake * /
352 */
353 dccp_set_state(sk, DCCP_PARTOPEN);
354
355 /* Make sure socket is routed, for correct metrics. */
356 inet_sk_rebuild_header(sk);
357
358 if (!sock_flag(sk, SOCK_DEAD)) {
359 sk->sk_state_change(sk);
360 sk_wake_async(sk, 0, POLL_OUT);
361 }
362
363 if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
364 icsk->icsk_accept_queue.rskq_defer_accept) {
365 /* Save one ACK. Data will be ready after
366 * several ticks, if write_pending is set.
367 *
368 * It may be deleted, but with this feature tcpdumps
369 * look so _wonderfully_ clever, that I was not able
370 * to stand against the temptation 8) --ANK
371 */
372 /*
373 * OK, in DCCP we can as well do a similar trick, its
374 * even in the draft, but there is no need for us to
375 * schedule an ack here, as dccp_sendmsg does this for
376 * us, also stated in the draft. -acme
377 */
378 __kfree_skb(skb);
379 return 0;
380 }
381 dccp_send_ack(sk);
382 return -1;
383 }
384
385out_invalid_packet:
386 return 1; /* dccp_v4_do_rcv will send a reset, but...
387 FIXME: the reset code should be
388 DCCP_RESET_CODE_PACKET_ERROR */
389}
390
391static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
392 struct sk_buff *skb,
393 const struct dccp_hdr *dh,
394 const unsigned len)
395{
396 int queued = 0;
397
398 switch (dh->dccph_type) {
399 case DCCP_PKT_RESET:
400 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
401 break;
402 case DCCP_PKT_DATAACK:
403 case DCCP_PKT_ACK:
404 /*
405 * FIXME: we should be resetting the PARTOPEN (DELACK) timer
406 * here, but only if we haven't used the DELACK timer for
407 * something else, like sending a delayed ack for a TIMESTAMP
408 * echo, etc. For now we're not clearing it; sending an extra
409 * ACK when there is nothing else to do in DELACK is not a big
410 * deal after all.
411 */
412
413 /* Stop the PARTOPEN timer */
414 if (sk->sk_state == DCCP_PARTOPEN)
415 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
416
417 dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
418 dccp_set_state(sk, DCCP_OPEN);
419
420 if (dh->dccph_type == DCCP_PKT_DATAACK) {
421 dccp_rcv_established(sk, skb, dh, len);
422 queued = 1; /* packet was queued
423 (by dccp_rcv_established) */
424 }
425 break;
426 }
427
428 return queued;
429}
430
431int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
432 struct dccp_hdr *dh, unsigned len)
433{
434 struct dccp_sock *dp = dccp_sk(sk);
435 const int old_state = sk->sk_state;
436 int queued = 0;
437
438 /*
439 * Step 3: Process LISTEN state
440 * (Continuing from dccp_v4_do_rcv and dccp_v6_do_rcv)
441 *
442 * If S.state == LISTEN,
443 * If P.type == Request or P contains a valid Init Cookie
444 * option,
445 * * Must scan the packet's options to check for an Init
446 * Cookie. Only the Init Cookie is processed here,
447 * however; other options are processed in Step 8. This
448 * scan need only be performed if the endpoint uses Init
449 * Cookies *
450 * * Generate a new socket and switch to that socket *
451 * Set S := new socket for this port pair
452 * S.state = RESPOND
453 * Choose S.ISS (initial seqno) or set from Init Cookie
454 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
455 * Continue with S.state == RESPOND
456 * * A Response packet will be generated in Step 11 *
457 * Otherwise,
458 * Generate Reset(No Connection) unless P.type == Reset
459 * Drop packet and return
460 *
461 * NOTE: the check for the packet types is done in
462 * dccp_rcv_state_process
463 */
464 if (sk->sk_state == DCCP_LISTEN) {
465 if (dh->dccph_type == DCCP_PKT_REQUEST) {
466 if (dccp_v4_conn_request(sk, skb) < 0)
467 return 1;
468
469 /* FIXME: do congestion control initialization */
470 goto discard;
471 }
472 if (dh->dccph_type == DCCP_PKT_RESET)
473 goto discard;
474
475 /* Caller (dccp_v4_do_rcv) will send Reset(No Connection)*/
476 return 1;
477 }
478
479 if (sk->sk_state != DCCP_REQUESTING) {
480 if (dccp_check_seqno(sk, skb))
481 goto discard;
482
483 /*
484 * Step 8: Process options and mark acknowledgeable
485 */
486 if (dccp_parse_options(sk, skb))
487 goto discard;
488
489 if (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
490 DCCP_PKT_WITHOUT_ACK_SEQ)
491 dccp_event_ack_recv(sk, skb);
492
493 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
494 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
495
496 /*
497 * FIXME: check ECN to see if we should use
498 * DCCP_ACKPKTS_STATE_ECN_MARKED
499 */
500 if (dp->dccps_options.dccpo_send_ack_vector) {
501 if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
502 DCCP_SKB_CB(skb)->dccpd_seq,
503 DCCP_ACKPKTS_STATE_RECEIVED))
504 goto discard;
505 /*
506 * FIXME: this activation is probably wrong; we have to
507 * study the TCP delack machinery more and how it fits into
508 * the DCCP draft, but for now it kinda "works" 8)
509 */
510 if ((dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno ==
511 DCCP_MAX_SEQNO + 1) &&
512 !inet_csk_ack_scheduled(sk)) {
513 inet_csk_schedule_ack(sk);
514 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
515 TCP_DELACK_MIN,
516 DCCP_RTO_MAX);
517 }
518 }
519 }
520
521 /*
522 * Step 9: Process Reset
523 * If P.type == Reset,
524 * Tear down connection
525 * S.state := TIMEWAIT
526 * Set TIMEWAIT timer
527 * Drop packet and return
528 */
529 if (dh->dccph_type == DCCP_PKT_RESET) {
530 /*
531 * Queue the equivalent of TCP fin so that dccp_recvmsg
532 * exits the loop
533 */
534 dccp_fin(sk, skb);
535 dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
536 return 0;
537 /*
538 * Step 7: Check for unexpected packet types
539 * If (S.is_server and P.type == CloseReq)
540 * or (S.is_server and P.type == Response)
541 * or (S.is_client and P.type == Request)
542 * or (S.state == RESPOND and P.type == Data),
543 * Send Sync packet acknowledging P.seqno
544 * Drop packet and return
545 */
546 } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
547 (dh->dccph_type == DCCP_PKT_RESPONSE ||
548 dh->dccph_type == DCCP_PKT_CLOSEREQ)) ||
549 (dp->dccps_role == DCCP_ROLE_CLIENT &&
550 dh->dccph_type == DCCP_PKT_REQUEST) ||
551 (sk->sk_state == DCCP_RESPOND &&
552 dh->dccph_type == DCCP_PKT_DATA)) {
553 dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
554 DCCP_PKT_SYNC);
555 goto discard;
556 } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
557 dccp_rcv_closereq(sk, skb);
558 goto discard;
559 } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
560 dccp_rcv_close(sk, skb);
561 return 0;
562 }
563
564 switch (sk->sk_state) {
565 case DCCP_CLOSED:
566 return 1;
567
568 case DCCP_REQUESTING:
569 /* FIXME: do congestion control initialization */
570
571 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
572 if (queued >= 0)
573 return queued;
574
575 __kfree_skb(skb);
576 return 0;
577
578 case DCCP_RESPOND:
579 case DCCP_PARTOPEN:
580 queued = dccp_rcv_respond_partopen_state_process(sk, skb,
581 dh, len);
582 break;
583 }
584
585 if (dh->dccph_type == DCCP_PKT_ACK ||
586 dh->dccph_type == DCCP_PKT_DATAACK) {
587 switch (old_state) {
588 case DCCP_PARTOPEN:
589 sk->sk_state_change(sk);
590 sk_wake_async(sk, 0, POLL_OUT);
591 break;
592 }
593 }
594
595 if (!queued) {
596discard:
597 __kfree_skb(skb);
598 }
599 return 0;
600}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
new file mode 100644
index 000000000000..3fc75dbee4b8
--- /dev/null
+++ b/net/dccp/ipv4.c
@@ -0,0 +1,1356 @@
1/*
2 * net/dccp/ipv4.c
3 *
4 * An implementation of the DCCP protocol
5 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/config.h>
14#include <linux/dccp.h>
15#include <linux/icmp.h>
16#include <linux/module.h>
17#include <linux/skbuff.h>
18#include <linux/random.h>
19
20#include <net/icmp.h>
21#include <net/inet_hashtables.h>
22#include <net/sock.h>
23#include <net/tcp_states.h>
24#include <net/xfrm.h>
25
26#include "ccid.h"
27#include "dccp.h"
28
29struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
30 .lhash_lock = RW_LOCK_UNLOCKED,
31 .lhash_users = ATOMIC_INIT(0),
32 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
33 .portalloc_lock = SPIN_LOCK_UNLOCKED,
34 .port_rover = 1024 - 1,
35};
36
37EXPORT_SYMBOL_GPL(dccp_hashinfo);
38
39static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
40{
41 return inet_csk_get_port(&dccp_hashinfo, sk, snum);
42}
43
44static void dccp_v4_hash(struct sock *sk)
45{
46 inet_hash(&dccp_hashinfo, sk);
47}
48
49static void dccp_v4_unhash(struct sock *sk)
50{
51 inet_unhash(&dccp_hashinfo, sk);
52}
53
54/* called with local bh disabled */
55static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
56 struct inet_timewait_sock **twp)
57{
58 struct inet_sock *inet = inet_sk(sk);
59 const u32 daddr = inet->rcv_saddr;
60 const u32 saddr = inet->daddr;
61 const int dif = sk->sk_bound_dev_if;
62 INET_ADDR_COOKIE(acookie, saddr, daddr)
63 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
64 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport,
65 dccp_hashinfo.ehash_size);
66 struct inet_ehash_bucket *head = &dccp_hashinfo.ehash[hash];
67 const struct sock *sk2;
68 const struct hlist_node *node;
69 struct inet_timewait_sock *tw;
70
71 write_lock(&head->lock);
72
73 /* Check TIME-WAIT sockets first. */
74 sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
75 tw = inet_twsk(sk2);
76
77 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif))
78 goto not_unique;
79 }
80 tw = NULL;
81
82 /* And established part... */
83 sk_for_each(sk2, node, &head->chain) {
84 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
85 goto not_unique;
86 }
87
88	/* Must record num and sport now. Otherwise we will see
89	 * a socket with a funny identity in the hash table. */
90 inet->num = lport;
91 inet->sport = htons(lport);
92 sk->sk_hashent = hash;
93 BUG_TRAP(sk_unhashed(sk));
94 __sk_add_node(sk, &head->chain);
95 sock_prot_inc_use(sk->sk_prot);
96 write_unlock(&head->lock);
97
98 if (twp != NULL) {
99 *twp = tw;
100 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
101 } else if (tw != NULL) {
102 /* Silly. Should hash-dance instead... */
103 inet_twsk_deschedule(tw, &dccp_death_row);
104 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
105
106 inet_twsk_put(tw);
107 }
108
109 return 0;
110
111not_unique:
112 write_unlock(&head->lock);
113 return -EADDRNOTAVAIL;
114}
115
116/*
117 * Bind a port for a connect operation and hash it.
118 */
119static int dccp_v4_hash_connect(struct sock *sk)
120{
121 const unsigned short snum = inet_sk(sk)->num;
122 struct inet_bind_hashbucket *head;
123 struct inet_bind_bucket *tb;
124 int ret;
125
126 if (snum == 0) {
127 int rover;
128 int low = sysctl_local_port_range[0];
129 int high = sysctl_local_port_range[1];
130 int remaining = (high - low) + 1;
131 struct hlist_node *node;
132 struct inet_timewait_sock *tw = NULL;
133
134 local_bh_disable();
135
136		/* TODO. Actually it is not such a bad idea to remove
137		 * dccp_hashinfo.portalloc_lock before the next submission to
138		 * Linus.
139		 * As soon as we touch this place at all, it is time to think.
140		 *
141		 * Right now it protects a single _advisory_ variable,
142		 * dccp_hashinfo.port_rover, hence it is mostly useless.
143		 * The code will work nicely if we just delete it, but
144		 * I am afraid that in the contended case it will work no better,
145		 * or even worse: another cpu will just hit the same bucket
146		 * and spin there.
147		 * So some per-cpu salt could remove both the contention and the
148		 * memory ping-pong. Any ideas how to do this in a nice way?
149		 */
150 spin_lock(&dccp_hashinfo.portalloc_lock);
151 rover = dccp_hashinfo.port_rover;
152
153 do {
154 rover++;
155 if ((rover < low) || (rover > high))
156 rover = low;
157 head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
158 dccp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160
161 /* Does not bother with rcv_saddr checks,
162 * because the established check is already
163 * unique enough.
164 */
165 inet_bind_bucket_for_each(tb, node, &head->chain) {
166 if (tb->port == rover) {
167 BUG_TRAP(!hlist_empty(&tb->owners));
168 if (tb->fastreuse >= 0)
169 goto next_port;
170 if (!__dccp_v4_check_established(sk,
171 rover,
172 &tw))
173 goto ok;
174 goto next_port;
175 }
176 }
177
178 tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
179 head, rover);
180 if (tb == NULL) {
181 spin_unlock(&head->lock);
182 break;
183 }
184 tb->fastreuse = -1;
185 goto ok;
186
187 next_port:
188 spin_unlock(&head->lock);
189 } while (--remaining > 0);
190 dccp_hashinfo.port_rover = rover;
191 spin_unlock(&dccp_hashinfo.portalloc_lock);
192
193 local_bh_enable();
194
195 return -EADDRNOTAVAIL;
196
197ok:
198 /* All locks still held and bhs disabled */
199 dccp_hashinfo.port_rover = rover;
200 spin_unlock(&dccp_hashinfo.portalloc_lock);
201
202 inet_bind_hash(sk, tb, rover);
203 if (sk_unhashed(sk)) {
204 inet_sk(sk)->sport = htons(rover);
205 __inet_hash(&dccp_hashinfo, sk, 0);
206 }
207 spin_unlock(&head->lock);
208
209 if (tw != NULL) {
210 inet_twsk_deschedule(tw, &dccp_death_row);
211 inet_twsk_put(tw);
212 }
213
214 ret = 0;
215 goto out;
216 }
217
218 head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
219 dccp_hashinfo.bhash_size)];
220 tb = inet_csk(sk)->icsk_bind_hash;
221 spin_lock_bh(&head->lock);
222 if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
223 __inet_hash(&dccp_hashinfo, sk, 0);
224 spin_unlock_bh(&head->lock);
225 return 0;
226 } else {
227 spin_unlock(&head->lock);
228 /* No definite answer... Walk to established hash table */
229 ret = __dccp_v4_check_established(sk, snum, NULL);
230out:
231 local_bh_enable();
232 return ret;
233 }
234}
235
236static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
237 int addr_len)
238{
239 struct inet_sock *inet = inet_sk(sk);
240 struct dccp_sock *dp = dccp_sk(sk);
241 const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
242 struct rtable *rt;
243 u32 daddr, nexthop;
244 int tmp;
245 int err;
246
247 dp->dccps_role = DCCP_ROLE_CLIENT;
248
249 if (addr_len < sizeof(struct sockaddr_in))
250 return -EINVAL;
251
252 if (usin->sin_family != AF_INET)
253 return -EAFNOSUPPORT;
254
255 nexthop = daddr = usin->sin_addr.s_addr;
256 if (inet->opt != NULL && inet->opt->srr) {
257 if (daddr == 0)
258 return -EINVAL;
259 nexthop = inet->opt->faddr;
260 }
261
262 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
263 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
264 IPPROTO_DCCP,
265 inet->sport, usin->sin_port, sk);
266 if (tmp < 0)
267 return tmp;
268
269 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
270 ip_rt_put(rt);
271 return -ENETUNREACH;
272 }
273
274 if (inet->opt == NULL || !inet->opt->srr)
275 daddr = rt->rt_dst;
276
277 if (inet->saddr == 0)
278 inet->saddr = rt->rt_src;
279 inet->rcv_saddr = inet->saddr;
280
281 inet->dport = usin->sin_port;
282 inet->daddr = daddr;
283
284 dp->dccps_ext_header_len = 0;
285 if (inet->opt != NULL)
286 dp->dccps_ext_header_len = inet->opt->optlen;
287 /*
288 * Socket identity is still unknown (sport may be zero).
289	 * However, while holding the socket lock, we set the state to
290	 * DCCP_REQUESTING, select a source port, enter ourselves into the
291	 * hash tables and complete initialization after this.
292 */
293 dccp_set_state(sk, DCCP_REQUESTING);
294 err = dccp_v4_hash_connect(sk);
295 if (err != 0)
296 goto failure;
297
298 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
299 if (err != 0)
300 goto failure;
301
302 /* OK, now commit destination to socket. */
303 sk_setup_caps(sk, &rt->u.dst);
304
305 dp->dccps_gar =
306 dp->dccps_iss = secure_dccp_sequence_number(inet->saddr,
307 inet->daddr,
308 inet->sport,
309 usin->sin_port);
310 dccp_update_gss(sk, dp->dccps_iss);
311
312 /*
313 * SWL and AWL are initially adjusted so that they are not less than
314 * the initial Sequence Numbers received and sent, respectively:
315 * SWL := max(GSR + 1 - floor(W/4), ISR),
316 * AWL := max(GSS - W' + 1, ISS).
317 * These adjustments MUST be applied only at the beginning of the
318 * connection.
319 */
320 dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
321
322 inet->id = dp->dccps_iss ^ jiffies;
323
324 err = dccp_connect(sk);
325 rt = NULL;
326 if (err != 0)
327 goto failure;
328out:
329 return err;
330failure:
331 /*
332 * This unhashes the socket and releases the local port, if necessary.
333 */
334 dccp_set_state(sk, DCCP_CLOSED);
335 ip_rt_put(rt);
336 sk->sk_route_caps = 0;
337 inet->dport = 0;
338 goto out;
339}
340
341/*
342 * This routine does path mtu discovery as defined in RFC1191.
343 */
344static inline void dccp_do_pmtu_discovery(struct sock *sk,
345 const struct iphdr *iph,
346 u32 mtu)
347{
348 struct dst_entry *dst;
349 const struct inet_sock *inet = inet_sk(sk);
350 const struct dccp_sock *dp = dccp_sk(sk);
351
352	/* We are not interested in DCCP_LISTEN and request socks (RESPONSEs
353	 * sent out by Linux are always < 576 bytes, so they should go through
354	 * unfragmented).
355 */
356 if (sk->sk_state == DCCP_LISTEN)
357 return;
358
359	/* We don't check in the dst entry if pmtu discovery is forbidden
360	 * on this route. We just assume that no packet-too-big packets
361	 * are sent back when pmtu discovery is not active.
362	 * There is a small race when the user changes this flag in the
363	 * route, but I think that's acceptable.
364 */
365 if ((dst = __sk_dst_check(sk, 0)) == NULL)
366 return;
367
368 dst->ops->update_pmtu(dst, mtu);
369
370	/* Something is about to go wrong... Remember the soft error
371	 * in case this connection is not able to recover.
372 */
373 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
374 sk->sk_err_soft = EMSGSIZE;
375
376 mtu = dst_mtu(dst);
377
378 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
379 dp->dccps_pmtu_cookie > mtu) {
380 dccp_sync_mss(sk, mtu);
381
382 /*
383 * From: draft-ietf-dccp-spec-11.txt
384 *
385 * DCCP-Sync packets are the best choice for upward
386 * probing, since DCCP-Sync probes do not risk application
387 * data loss.
388 */
389 dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
390 } /* else let the usual retransmit timer handle it */
391}
392
393static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb)
394{
395 int err;
396 struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
397 const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
398 sizeof(struct dccp_hdr_ext) +
399 sizeof(struct dccp_hdr_ack_bits);
400 struct sk_buff *skb;
401
402 if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
403 return;
404
405 skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
406 if (skb == NULL)
407 return;
408
409 /* Reserve space for headers. */
410 skb_reserve(skb, MAX_DCCP_HEADER);
411
412 skb->dst = dst_clone(rxskb->dst);
413
414 skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
415 dh = dccp_hdr(skb);
416 memset(dh, 0, dccp_hdr_ack_len);
417
418 /* Build DCCP header and checksum it. */
419 dh->dccph_type = DCCP_PKT_ACK;
420 dh->dccph_sport = rxdh->dccph_dport;
421 dh->dccph_dport = rxdh->dccph_sport;
422 dh->dccph_doff = dccp_hdr_ack_len / 4;
423 dh->dccph_x = 1;
424
425 dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
426 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
427 DCCP_SKB_CB(rxskb)->dccpd_seq);
428
429 bh_lock_sock(dccp_ctl_socket->sk);
430 err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
431 rxskb->nh.iph->daddr,
432 rxskb->nh.iph->saddr, NULL);
433 bh_unlock_sock(dccp_ctl_socket->sk);
434
435 if (err == NET_XMIT_CN || err == 0) {
436 DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
437 DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
438 }
439}
440
441static void dccp_v4_reqsk_send_ack(struct sk_buff *skb,
442 struct request_sock *req)
443{
444 dccp_v4_ctl_send_ack(skb);
445}
446
447static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
448 struct dst_entry *dst)
449{
450 int err = -1;
451 struct sk_buff *skb;
452
453 /* First, grab a route. */
454
455 if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
456 goto out;
457
458 skb = dccp_make_response(sk, dst, req);
459 if (skb != NULL) {
460 const struct inet_request_sock *ireq = inet_rsk(req);
461
462 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
463 ireq->rmt_addr,
464 ireq->opt);
465 if (err == NET_XMIT_CN)
466 err = 0;
467 }
468
469out:
470 dst_release(dst);
471 return err;
472}
473
474/*
475 * This routine is called by the ICMP module when it gets some sort of error
476 * condition. If err < 0 then the socket should be closed and the error
477 * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
478 * After adjustment, header points to the first 8 bytes of the dccp header. We
479 * need to find the appropriate port.
480 *
481 * The locking strategy used here is very "optimistic". When someone else
482 * accesses the socket the ICMP is just dropped and for some paths there is no
483 * check at all. A more general error queue to queue errors for later handling
484 * is probably better.
485 */
486void dccp_v4_err(struct sk_buff *skb, u32 info)
487{
488 const struct iphdr *iph = (struct iphdr *)skb->data;
489 const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data +
490 (iph->ihl << 2));
491 struct dccp_sock *dp;
492 struct inet_sock *inet;
493 const int type = skb->h.icmph->type;
494 const int code = skb->h.icmph->code;
495 struct sock *sk;
496 __u64 seq;
497 int err;
498
499 if (skb->len < (iph->ihl << 2) + 8) {
500 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
501 return;
502 }
503
504 sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport,
505 iph->saddr, dh->dccph_sport, inet_iif(skb));
506 if (sk == NULL) {
507 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
508 return;
509 }
510
511 if (sk->sk_state == DCCP_TIME_WAIT) {
512 inet_twsk_put((struct inet_timewait_sock *)sk);
513 return;
514 }
515
516 bh_lock_sock(sk);
517 /* If too many ICMPs get dropped on busy
518 * servers this needs to be solved differently.
519 */
520 if (sock_owned_by_user(sk))
521 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
522
523 if (sk->sk_state == DCCP_CLOSED)
524 goto out;
525
526 dp = dccp_sk(sk);
527 seq = dccp_hdr_seq(skb);
528 if (sk->sk_state != DCCP_LISTEN &&
529 !between48(seq, dp->dccps_swl, dp->dccps_swh)) {
530 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
531 goto out;
532 }
533
534 switch (type) {
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
537 goto out;
538 case ICMP_PARAMETERPROB:
539 err = EPROTO;
540 break;
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
543 goto out;
544
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 if (!sock_owned_by_user(sk))
547 dccp_do_pmtu_discovery(sk, iph, info);
548 goto out;
549 }
550
551 err = icmp_err_convert[code].errno;
552 break;
553 case ICMP_TIME_EXCEEDED:
554 err = EHOSTUNREACH;
555 break;
556 default:
557 goto out;
558 }
559
560 switch (sk->sk_state) {
561 struct request_sock *req, **prev;
562 case DCCP_LISTEN:
563 if (sock_owned_by_user(sk))
564 goto out;
565 req = inet_csk_search_req(sk, &prev, dh->dccph_dport,
566 iph->daddr, iph->saddr);
567 if (!req)
568 goto out;
569
570 /*
571 * ICMPs are not backlogged, hence we cannot get an established
572 * socket here.
573 */
574 BUG_TRAP(!req->sk);
575
576 if (seq != dccp_rsk(req)->dreq_iss) {
577 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
578 goto out;
579 }
580 /*
581 * Still in RESPOND, just remove it silently.
582 * There is no good way to pass the error to the newly
583 * created socket, and POSIX does not want network
584 * errors returned from accept().
585 */
586 inet_csk_reqsk_queue_drop(sk, req, prev);
587 goto out;
588
589 case DCCP_REQUESTING:
590 case DCCP_RESPOND:
591 if (!sock_owned_by_user(sk)) {
592 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
593 sk->sk_err = err;
594
595 sk->sk_error_report(sk);
596
597 dccp_done(sk);
598 } else
599 sk->sk_err_soft = err;
600 goto out;
601 }
602
603 /* If we've already connected we will keep trying
604 * until we time out, or the user gives up.
605 *
606 * rfc1122 4.2.3.9 allows us to consider as hard errors
607 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
608 * but it is obsoleted by pmtu discovery).
609 *
610 * Note that in the modern internet, where routing is unreliable
611 * and broken firewalls sit in every dark corner sending random
612 * errors ordered by their masters, even these two messages finally lose
613 * their original sense (even Linux sends invalid PORT_UNREACHs).
614 *
615 * Now we are in compliance with RFCs.
616 * --ANK (980905)
617 */
618
619 inet = inet_sk(sk);
620 if (!sock_owned_by_user(sk) && inet->recverr) {
621 sk->sk_err = err;
622 sk->sk_error_report(sk);
623 } else /* Only an error on timeout */
624 sk->sk_err_soft = err;
625out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628}
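/*
 * A minimal sketch of the 48-bit circular comparisons used above
 * (before48()/between48() live in dccp.h; this rendition is
 * illustrative, not the patch's exact code): shifting the 48-bit
 * values into the top of a 64-bit word lets ordinary signed
 * subtraction handle wraparound.
 */
static inline int example_before48(const u64 seq1, const u64 seq2)
{
	return (s64)((seq1 << 16) - (seq2 << 16)) < 0;
}

static inline int example_between48(const u64 seq1, const u64 seq2,
				    const u64 seq3)
{
	/* seq2 <= seq1 <= seq3, modulo 2^48 */
	return !example_before48(seq1, seq2) &&
	       !example_before48(seq3, seq1);
}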
629
630int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
631{
632 struct sk_buff *skb;
633 /*
634 * FIXME: what if rebuild_header fails?
635 * Should we be doing a rebuild_header here?
636 */
637 int err = inet_sk_rebuild_header(sk);
638
639 if (err != 0)
640 return err;
641
642 skb = dccp_make_reset(sk, sk->sk_dst_cache, code);
643 if (skb != NULL) {
644 const struct dccp_sock *dp = dccp_sk(sk);
645 const struct inet_sock *inet = inet_sk(sk);
646
647 err = ip_build_and_send_pkt(skb, sk,
648 inet->saddr, inet->daddr, NULL);
649 if (err == NET_XMIT_CN)
650 err = 0;
651
652 ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
653 ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
654 }
655
656 return err;
657}
658
659static inline u64 dccp_v4_init_sequence(const struct sock *sk,
660 const struct sk_buff *skb)
661{
662 return secure_dccp_sequence_number(skb->nh.iph->daddr,
663 skb->nh.iph->saddr,
664 dccp_hdr(skb)->dccph_dport,
665 dccp_hdr(skb)->dccph_sport);
666}
667
668int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
669{
670 struct inet_request_sock *ireq;
671 struct dccp_sock dp;
672 struct request_sock *req;
673 struct dccp_request_sock *dreq;
674 const __u32 saddr = skb->nh.iph->saddr;
675 const __u32 daddr = skb->nh.iph->daddr;
676 struct dst_entry *dst = NULL;
677
678 /* Never answer DCCP_PKT_REQUESTs sent to broadcast or multicast */
679 if (((struct rtable *)skb->dst)->rt_flags &
680 (RTCF_BROADCAST | RTCF_MULTICAST))
681 goto drop;
682
683 /*
684 * TW buckets are converted to open requests without
685 * limitation; they conserve resources and the peer is
686 * evidently a real one.
687 */
688 if (inet_csk_reqsk_queue_is_full(sk))
689 goto drop;
690
691 /*
692 * Accept backlog is full. If we have already queued enough
693 * warm entries in the syn queue, drop the request. It is better than
694 * clogging the syn queue with openreqs with exponentially increasing
695 * timeouts.
696 */
697 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
698 goto drop;
699
700 req = reqsk_alloc(sk->sk_prot->rsk_prot);
701 if (req == NULL)
702 goto drop;
703
704 /* FIXME: process options */
705
706 dccp_openreq_init(req, &dp, skb);
707
708 ireq = inet_rsk(req);
709 ireq->loc_addr = daddr;
710 ireq->rmt_addr = saddr;
711 /* FIXME: Merge Aristeu's option parsing code when ready */
712 req->rcv_wnd = 100; /* Fake, option parsing will get the
713 right value */
714 ireq->opt = NULL;
715
716 /*
717 * Step 3: Process LISTEN state
718 *
719 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
720 *
721 * In fact we defer setting S.GSR, S.SWL, S.SWH to
722 * dccp_create_openreq_child.
723 */
724 dreq = dccp_rsk(req);
725 dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq;
726 dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
727 dreq->dreq_service = dccp_hdr_request(skb)->dccph_req_service;
728
729 if (dccp_v4_send_response(sk, req, dst))
730 goto drop_and_free;
731
732 inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
733 return 0;
734
735drop_and_free:
736 /*
737 * FIXME: should be reqsk_free after implementing req->rsk_ops
738 */
739 __reqsk_free(req);
740drop:
741 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
742 return -1;
743}
744
745/*
746 * The three way handshake has completed - we got a valid ACK or DATAACK -
747 * now create the new socket.
748 *
749 * This is the equivalent of TCP's tcp_v4_syn_recv_sock
750 */
751struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
752 struct request_sock *req,
753 struct dst_entry *dst)
754{
755 struct inet_request_sock *ireq;
756 struct inet_sock *newinet;
757 struct dccp_sock *newdp;
758 struct sock *newsk;
759
760 if (sk_acceptq_is_full(sk))
761 goto exit_overflow;
762
763 if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
764 goto exit;
765
766 newsk = dccp_create_openreq_child(sk, req, skb);
767 if (newsk == NULL)
768 goto exit;
769
770 sk_setup_caps(newsk, dst);
771
772 newdp = dccp_sk(newsk);
773 newinet = inet_sk(newsk);
774 ireq = inet_rsk(req);
775 newinet->daddr = ireq->rmt_addr;
776 newinet->rcv_saddr = ireq->loc_addr;
777 newinet->saddr = ireq->loc_addr;
778 newinet->opt = ireq->opt;
779 ireq->opt = NULL;
780 newinet->mc_index = inet_iif(skb);
781 newinet->mc_ttl = skb->nh.iph->ttl;
782 newinet->id = jiffies;
783
784 dccp_sync_mss(newsk, dst_mtu(dst));
785
786 __inet_hash(&dccp_hashinfo, newsk, 0);
787 __inet_inherit_port(&dccp_hashinfo, sk, newsk);
788
789 return newsk;
790
791exit_overflow:
792 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
793exit:
794 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
795 dst_release(dst);
796 return NULL;
797}
798
799static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
800{
801 const struct dccp_hdr *dh = dccp_hdr(skb);
802 const struct iphdr *iph = skb->nh.iph;
803 struct sock *nsk;
804 struct request_sock **prev;
805 /* Find possible connection requests. */
806 struct request_sock *req = inet_csk_search_req(sk, &prev,
807 dh->dccph_sport,
808 iph->saddr, iph->daddr);
809 if (req != NULL)
810 return dccp_check_req(sk, skb, req, prev);
811
812 nsk = __inet_lookup_established(&dccp_hashinfo,
813 iph->saddr, dh->dccph_sport,
814 iph->daddr, ntohs(dh->dccph_dport),
815 inet_iif(skb));
816 if (nsk != NULL) {
817 if (nsk->sk_state != DCCP_TIME_WAIT) {
818 bh_lock_sock(nsk);
819 return nsk;
820 }
821 inet_twsk_put((struct inet_timewait_sock *)nsk);
822 return NULL;
823 }
824
825 return sk;
826}
827
828int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr,
829 const u32 daddr)
830{
831 const struct dccp_hdr* dh = dccp_hdr(skb);
832 int checksum_len;
833 u32 tmp;
834
835 if (dh->dccph_cscov == 0)
836 checksum_len = skb->len;
837 else {
838 checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32);
839 checksum_len = checksum_len < skb->len ? checksum_len :
840 skb->len;
841 }
842
843 tmp = csum_partial((unsigned char *)dh, checksum_len, 0);
844 return csum_tcpudp_magic(saddr, daddr, checksum_len,
845 IPPROTO_DCCP, tmp);
846}
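/*
 * Worked example for the coverage computation above: dccph_cscov == 0
 * means the checksum covers the whole packet; with, say, dccph_cscov
 * == 5 and dccph_x == 1 it covers (5 + 1) * 4 = 24 bytes, capped at
 * skb->len. (This follows the draft's CsCov encoding as this code
 * interprets it.)
 */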
847
848static int dccp_v4_verify_checksum(struct sk_buff *skb,
849 const u32 saddr, const u32 daddr)
850{
851 struct dccp_hdr *dh = dccp_hdr(skb);
852 int checksum_len;
853 u32 tmp;
854
855 if (dh->dccph_cscov == 0)
856 checksum_len = skb->len;
857 else {
858 checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32);
859 checksum_len = checksum_len < skb->len ? checksum_len :
860 skb->len;
861 }
862 tmp = csum_partial((unsigned char *)dh, checksum_len, 0);
863 return csum_tcpudp_magic(saddr, daddr, checksum_len,
864 IPPROTO_DCCP, tmp) == 0 ? 0 : -1;
865}
866
867static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
868 struct sk_buff *skb)
869{
870 struct rtable *rt;
871 struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif,
872 .nl_u = { .ip4_u =
873 { .daddr = skb->nh.iph->saddr,
874 .saddr = skb->nh.iph->daddr,
875 .tos = RT_CONN_FLAGS(sk) } },
876 .proto = sk->sk_protocol,
877 .uli_u = { .ports =
878 { .sport = dccp_hdr(skb)->dccph_dport,
879 .dport = dccp_hdr(skb)->dccph_sport }
880 }
881 };
882
883 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
884 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
885 return NULL;
886 }
887
888 return &rt->u.dst;
889}
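/*
 * Note on the flow constructed above: addresses and ports are
 * deliberately swapped relative to rxskb (.daddr takes iph->saddr,
 * .sport takes the received dport, and so on) because we are routing a
 * reply back towards the sender. For example, a packet received from
 * 10.0.0.1:1234 addressed to 10.0.0.2:5001 yields a reply flow from
 * 10.0.0.2:5001 to 10.0.0.1:1234.
 */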
890
891static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb)
892{
893 int err;
894 struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
895 const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
896 sizeof(struct dccp_hdr_ext) +
897 sizeof(struct dccp_hdr_reset);
898 struct sk_buff *skb;
899 struct dst_entry *dst;
900 u64 seqno;
901
902 /* Never send a reset in response to a reset. */
903 if (rxdh->dccph_type == DCCP_PKT_RESET)
904 return;
905
906 if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
907 return;
908
909 dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb);
910 if (dst == NULL)
911 return;
912
913 skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
914 if (skb == NULL)
915 goto out;
916
917 /* Reserve space for headers. */
918 skb_reserve(skb, MAX_DCCP_HEADER);
919 skb->dst = dst_clone(dst);
920
921 skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
922 dh = dccp_hdr(skb);
923 memset(dh, 0, dccp_hdr_reset_len);
924
925 /* Build DCCP header and checksum it. */
926 dh->dccph_type = DCCP_PKT_RESET;
927 dh->dccph_sport = rxdh->dccph_dport;
928 dh->dccph_dport = rxdh->dccph_sport;
929 dh->dccph_doff = dccp_hdr_reset_len / 4;
930 dh->dccph_x = 1;
931 dccp_hdr_reset(skb)->dccph_reset_code =
932 DCCP_SKB_CB(rxskb)->dccpd_reset_code;
933
934 /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
935 seqno = 0;
936 if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
937 dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
938
939 dccp_hdr_set_seq(dh, seqno);
940 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
941 DCCP_SKB_CB(rxskb)->dccpd_seq);
942
943 dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr,
944 rxskb->nh.iph->daddr);
945
946 bh_lock_sock(dccp_ctl_socket->sk);
947 err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
948 rxskb->nh.iph->daddr,
949 rxskb->nh.iph->saddr, NULL);
950 bh_unlock_sock(dccp_ctl_socket->sk);
951
952 if (err == NET_XMIT_CN || err == 0) {
953 DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
954 DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
955 }
956out:
957 dst_release(dst);
958}
959
960int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
961{
962 struct dccp_hdr *dh = dccp_hdr(skb);
963
964 if (sk->sk_state == DCCP_OPEN) { /* Fast path */
965 if (dccp_rcv_established(sk, skb, dh, skb->len))
966 goto reset;
967 return 0;
968 }
969
970 /*
971 * Step 3: Process LISTEN state
972 * If S.state == LISTEN,
973 * If P.type == Request or P contains a valid Init Cookie
974 * option,
975 * * Must scan the packet's options to check for an Init
976 * Cookie. Only the Init Cookie is processed here,
977 * however; other options are processed in Step 8. This
978 * scan need only be performed if the endpoint uses Init
979 * Cookies *
980 * * Generate a new socket and switch to that socket *
981 * Set S := new socket for this port pair
982 * S.state = RESPOND
983 * Choose S.ISS (initial seqno) or set from Init Cookie
984 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
985 * Continue with S.state == RESPOND
986 * * A Response packet will be generated in Step 11 *
987 * Otherwise,
988 * Generate Reset(No Connection) unless P.type == Reset
989 * Drop packet and return
990 *
991 * NOTE: the check for the packet types is done in
992 * dccp_rcv_state_process
993 */
994 if (sk->sk_state == DCCP_LISTEN) {
995 struct sock *nsk = dccp_v4_hnd_req(sk, skb);
996
997 if (nsk == NULL)
998 goto discard;
999
1000 if (nsk != sk) {
1001 if (dccp_child_process(sk, nsk, skb))
1002 goto reset;
1003 return 0;
1004 }
1005 }
1006
1007 if (dccp_rcv_state_process(sk, skb, dh, skb->len))
1008 goto reset;
1009 return 0;
1010
1011reset:
1012 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
1013 dccp_v4_ctl_send_reset(skb);
1014discard:
1015 kfree_skb(skb);
1016 return 0;
1017}
1018
1019static inline int dccp_invalid_packet(struct sk_buff *skb)
1020{
1021 const struct dccp_hdr *dh;
1022
1023 if (skb->pkt_type != PACKET_HOST)
1024 return 1;
1025
1026 if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
1027 LIMIT_NETDEBUG(KERN_WARNING "DCCP: pskb_may_pull failed\n");
1028 return 1;
1029 }
1030
1031 dh = dccp_hdr(skb);
1032
1033 /* If the packet type is not understood, drop packet and return */
1034 if (dh->dccph_type >= DCCP_PKT_INVALID) {
1035 LIMIT_NETDEBUG(KERN_WARNING "DCCP: invalid packet type\n");
1036 return 1;
1037 }
1038
1039 /*
1040 * If P.Data Offset is too small for packet type, or too large for
1041 * packet, drop packet and return
1042 */
1043 if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
1044 LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
1045 "too small 1\n",
1046 dh->dccph_doff);
1047 return 1;
1048 }
1049
1050 if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
1051 LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
1052 "too small 2\n",
1053 dh->dccph_doff);
1054 return 1;
1055 }
1056
1057 dh = dccp_hdr(skb);
1058
1059 /*
1060 * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
1061 * has short sequence numbers), drop packet and return
1062 */
1063 if (dh->dccph_x == 0 &&
1064 dh->dccph_type != DCCP_PKT_DATA &&
1065 dh->dccph_type != DCCP_PKT_ACK &&
1066 dh->dccph_type != DCCP_PKT_DATAACK) {
1067 LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.type (%s) not Data, Ack "
1068 "nor DataAck and P.X == 0\n",
1069 dccp_packet_name(dh->dccph_type));
1070 return 1;
1071 }
1072
1073 /* If the header checksum is incorrect, drop packet and return */
1074 if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
1075 skb->nh.iph->daddr) < 0) {
1076 LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
1077 "incorrect\n");
1078 return 1;
1079 }
1080
1081 return 0;
1082}
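/*
 * Example for the Data Offset checks above, assuming the usual header
 * sizes: a DCCP-Request with 48-bit sequence numbers carries a 16-byte
 * generic header plus a 4-byte service code, so dccp_hdr_len() returns
 * 20 and any dccph_doff below 20 / 4 == 5 words is rejected as too
 * small; the second pskb_may_pull() then makes sure the doff words of
 * header plus options are really present in the skb.
 */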
1083
1084/* this is called when real data arrives */
1085int dccp_v4_rcv(struct sk_buff *skb)
1086{
1087 const struct dccp_hdr *dh;
1088 struct sock *sk;
1089 int rc;
1090
1091 /* Step 1: Check header basics: */
1092
1093 if (dccp_invalid_packet(skb))
1094 goto discard_it;
1095
1096 dh = dccp_hdr(skb);
1097#if 0
1098 /*
1099 * Use something like this to simulate some DATA/DATAACK loss to test
1100 * dccp_ackpkts_add, you'll get something like this on a session that
1101 * sends 10 DATA/DATAACK packets:
1102 *
1103 * ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1|
1104 *
1105 * 0, 0 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == just this packet
1106 * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets
1107 * with the same state
1108 * 3, 0 means: DCCP_ACKPKTS_STATE_NOT_RECEIVED, RLE == just this packet
1109 *
1110 * So...
1111 *
1112 * 281473596467422 was received
1113 * 281473596467421 was not received
1114 * 281473596467420 was received
1115 * 281473596467419 was not received
1116 * 281473596467418 was received
1117 * 281473596467417 was not received
1118 * 281473596467416 was received
1119 * 281473596467415 was not received
1120 * 281473596467414 was received
1121 * 281473596467413 was received (this one was the 3way handshake
1122 * RESPONSE)
1123 *
1124 */
1125 if (dh->dccph_type == DCCP_PKT_DATA ||
1126 dh->dccph_type == DCCP_PKT_DATAACK) {
1127 static int discard = 0;
1128
1129 if (discard) {
1130 discard = 0;
1131 goto discard_it;
1132 }
1133 discard = 1;
1134 }
1135#endif
1136 DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
1137 DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
1138
1139 dccp_pr_debug("%8.8s "
1140 "src=%u.%u.%u.%u@%-5d "
1141 "dst=%u.%u.%u.%u@%-5d seq=%llu",
1142 dccp_packet_name(dh->dccph_type),
1143 NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport),
1144 NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport),
1145 (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
1146
1147 if (dccp_packet_without_ack(skb)) {
1148 DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
1149 dccp_pr_debug_cat("\n");
1150 } else {
1151 DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
1152 dccp_pr_debug_cat(", ack=%llu\n",
1153 (unsigned long long)
1154 DCCP_SKB_CB(skb)->dccpd_ack_seq);
1155 }
1156
1157 /* Step 2:
1158 * Look up flow ID in table and get corresponding socket */
1159 sk = __inet_lookup(&dccp_hashinfo,
1160 skb->nh.iph->saddr, dh->dccph_sport,
1161 skb->nh.iph->daddr, ntohs(dh->dccph_dport),
1162 inet_iif(skb));
1163
1164 /*
1165 * Step 2:
1166 * If no socket ...
1167 * Generate Reset(No Connection) unless P.type == Reset
1168 * Drop packet and return
1169 */
1170 if (sk == NULL) {
1171 dccp_pr_debug("failed to look up flow ID in table and "
1172 "get corresponding socket\n");
1173 goto no_dccp_socket;
1174 }
1175
1176 /*
1177 * Step 2:
1178 * ... or S.state == TIMEWAIT,
1179 * Generate Reset(No Connection) unless P.type == Reset
1180 * Drop packet and return
1181 */
1182
1183 if (sk->sk_state == DCCP_TIME_WAIT) {
1184 dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: "
1185 "do_time_wait\n");
1186 goto do_time_wait;
1187 }
1188
1189 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
1190 dccp_pr_debug("xfrm4_policy_check failed\n");
1191 goto discard_and_relse;
1192 }
1193
1194 if (sk_filter(sk, skb, 0)) {
1195 dccp_pr_debug("sk_filter failed\n");
1196 goto discard_and_relse;
1197 }
1198
1199 skb->dev = NULL;
1200
1201 bh_lock_sock(sk);
1202 rc = 0;
1203 if (!sock_owned_by_user(sk))
1204 rc = dccp_v4_do_rcv(sk, skb);
1205 else
1206 sk_add_backlog(sk, skb);
1207 bh_unlock_sock(sk);
1208
1209 sock_put(sk);
1210 return rc;
1211
1212no_dccp_socket:
1213 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1214 goto discard_it;
1215 /*
1216 * Step 2:
1217 * Generate Reset(No Connection) unless P.type == Reset
1218 * Drop packet and return
1219 */
1220 if (dh->dccph_type != DCCP_PKT_RESET) {
1221 DCCP_SKB_CB(skb)->dccpd_reset_code =
1222 DCCP_RESET_CODE_NO_CONNECTION;
1223 dccp_v4_ctl_send_reset(skb);
1224 }
1225
1226discard_it:
1227 /* Discard frame. */
1228 kfree_skb(skb);
1229 return 0;
1230
1231discard_and_relse:
1232 sock_put(sk);
1233 goto discard_it;
1234
1235do_time_wait:
1236 inet_twsk_put((struct inet_timewait_sock *)sk);
1237 goto no_dccp_socket;
1238}
1239
1240static int dccp_v4_init_sock(struct sock *sk)
1241{
1242 struct dccp_sock *dp = dccp_sk(sk);
1243 static int dccp_ctl_socket_init = 1;
1244
1245 dccp_options_init(&dp->dccps_options);
1246
1247 if (dp->dccps_options.dccpo_send_ack_vector) {
1248 dp->dccps_hc_rx_ackpkts =
1249 dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN,
1250 GFP_KERNEL);
1251
1252 if (dp->dccps_hc_rx_ackpkts == NULL)
1253 return -ENOMEM;
1254 }
1255
1256 /*
1257 * FIXME: We're hardcoding the CCID, and doing this at this point makes
1258 * the listening (master) sock get CCID control blocks, which is not
1259 * necessary, but for now, to not mess with the test userspace apps,
1260 * let's leave it here; later, the real solution is to do this in a
1261 * setsockopt(CCIDs-I-want/accept). -acme
1262 */
1263 if (likely(!dccp_ctl_socket_init)) {
1264 dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid,
1265 sk);
1266 dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid,
1267 sk);
1268 if (dp->dccps_hc_rx_ccid == NULL ||
1269 dp->dccps_hc_tx_ccid == NULL) {
1270 ccid_exit(dp->dccps_hc_rx_ccid, sk);
1271 ccid_exit(dp->dccps_hc_tx_ccid, sk);
1272 dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts);
1273 dp->dccps_hc_rx_ackpkts = NULL;
1274 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
1275 return -ENOMEM;
1276 }
1277 } else
1278 dccp_ctl_socket_init = 0;
1279
1280 dccp_init_xmit_timers(sk);
1281 inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
1282 sk->sk_state = DCCP_CLOSED;
1283 sk->sk_write_space = dccp_write_space;
1284 dp->dccps_mss_cache = 536;
1285 dp->dccps_role = DCCP_ROLE_UNDEFINED;
1286
1287 return 0;
1288}
1289
1290static int dccp_v4_destroy_sock(struct sock *sk)
1291{
1292 struct dccp_sock *dp = dccp_sk(sk);
1293
1294 /*
1295 * DCCP doesn't use sk_write_queue, just sk_send_head
1296 * for retransmissions
1297 */
1298 if (sk->sk_send_head != NULL) {
1299 kfree_skb(sk->sk_send_head);
1300 sk->sk_send_head = NULL;
1301 }
1302
1303 /* Clean up a referenced DCCP bind bucket. */
1304 if (inet_csk(sk)->icsk_bind_hash != NULL)
1305 inet_put_port(&dccp_hashinfo, sk);
1306
1307 ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
1308 ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
1309 dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts);
1310 dp->dccps_hc_rx_ackpkts = NULL;
1311 ccid_exit(dp->dccps_hc_rx_ccid, sk);
1312 ccid_exit(dp->dccps_hc_tx_ccid, sk);
1313 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
1314
1315 return 0;
1316}
1317
1318static void dccp_v4_reqsk_destructor(struct request_sock *req)
1319{
1320 kfree(inet_rsk(req)->opt);
1321}
1322
1323static struct request_sock_ops dccp_request_sock_ops = {
1324 .family = PF_INET,
1325 .obj_size = sizeof(struct dccp_request_sock),
1326 .rtx_syn_ack = dccp_v4_send_response,
1327 .send_ack = dccp_v4_reqsk_send_ack,
1328 .destructor = dccp_v4_reqsk_destructor,
1329 .send_reset = dccp_v4_ctl_send_reset,
1330};
1331
1332struct proto dccp_v4_prot = {
1333 .name = "DCCP",
1334 .owner = THIS_MODULE,
1335 .close = dccp_close,
1336 .connect = dccp_v4_connect,
1337 .disconnect = dccp_disconnect,
1338 .ioctl = dccp_ioctl,
1339 .init = dccp_v4_init_sock,
1340 .setsockopt = dccp_setsockopt,
1341 .getsockopt = dccp_getsockopt,
1342 .sendmsg = dccp_sendmsg,
1343 .recvmsg = dccp_recvmsg,
1344 .backlog_rcv = dccp_v4_do_rcv,
1345 .hash = dccp_v4_hash,
1346 .unhash = dccp_v4_unhash,
1347 .accept = inet_csk_accept,
1348 .get_port = dccp_v4_get_port,
1349 .shutdown = dccp_shutdown,
1350 .destroy = dccp_v4_destroy_sock,
1351 .orphan_count = &dccp_orphan_count,
1352 .max_header = MAX_DCCP_HEADER,
1353 .obj_size = sizeof(struct dccp_sock),
1354 .rsk_prot = &dccp_request_sock_ops,
1355 .twsk_obj_size = sizeof(struct inet_timewait_sock),
1356};
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
new file mode 100644
index 000000000000..ce5dff4ac22e
--- /dev/null
+++ b/net/dccp/minisocks.c
@@ -0,0 +1,264 @@
1/*
2 * net/dccp/minisocks.c
3 *
4 * An implementation of the DCCP protocol
5 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/config.h>
14#include <linux/dccp.h>
15#include <linux/skbuff.h>
16#include <linux/timer.h>
17
18#include <net/sock.h>
19#include <net/xfrm.h>
20#include <net/inet_timewait_sock.h>
21
22#include "ccid.h"
23#include "dccp.h"
24
25struct inet_timewait_death_row dccp_death_row = {
26 .sysctl_max_tw_buckets = NR_FILE * 2,
27 .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
28 .death_lock = SPIN_LOCK_UNLOCKED,
29 .hashinfo = &dccp_hashinfo,
30 .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
31 (unsigned long)&dccp_death_row),
32 .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
33 inet_twdr_twkill_work,
34 &dccp_death_row),
35/* Short-time timewait calendar */
36
37 .twcal_hand = -1,
38 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
39 (unsigned long)&dccp_death_row),
40};
41
42void dccp_time_wait(struct sock *sk, int state, int timeo)
43{
44 struct inet_timewait_sock *tw = NULL;
45
46 if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
47 tw = inet_twsk_alloc(sk, state);
48
49 if (tw != NULL) {
50 const struct inet_connection_sock *icsk = inet_csk(sk);
51 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
52
53 /* Linkage updates. */
54 __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
55
56 /* Get the TIME_WAIT timeout firing. */
57 if (timeo < rto)
58 timeo = rto;
59
60 tw->tw_timeout = DCCP_TIMEWAIT_LEN;
61 if (state == DCCP_TIME_WAIT)
62 timeo = DCCP_TIMEWAIT_LEN;
63
64 inet_twsk_schedule(tw, &dccp_death_row, timeo,
65 DCCP_TIMEWAIT_LEN);
66 inet_twsk_put(tw);
67 } else {
68 /* Sorry, if we're out of memory, just CLOSE this
69 * socket up. We've got bigger problems than
70 * non-graceful socket closings.
71 */
72 LIMIT_NETDEBUG(KERN_INFO "DCCP: time wait bucket "
73 "table overflow\n");
74 }
75
76 dccp_done(sk);
77}
78
79struct sock *dccp_create_openreq_child(struct sock *sk,
80 const struct request_sock *req,
81 const struct sk_buff *skb)
82{
83 /*
84 * Step 3: Process LISTEN state
85 *
86 * // Generate a new socket and switch to that socket
87 * Set S := new socket for this port pair
88 */
89 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
90
91 if (newsk != NULL) {
92 const struct dccp_request_sock *dreq = dccp_rsk(req);
93 struct inet_connection_sock *newicsk = inet_csk(newsk);
94 struct dccp_sock *newdp = dccp_sk(newsk);
95
96 newdp->dccps_hc_rx_ackpkts = NULL;
97 newdp->dccps_role = DCCP_ROLE_SERVER;
98 newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
99
100 if (newdp->dccps_options.dccpo_send_ack_vector) {
101 newdp->dccps_hc_rx_ackpkts =
102 dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN,
103 GFP_ATOMIC);
104 /*
105 * XXX: We're using the same CCIDs set on the parent,
106 * i.e. sk_clone copied the master sock and left the
107 * CCID pointers for this child, that is why we do the
108 * __ccid_get calls.
109 */
110 if (unlikely(newdp->dccps_hc_rx_ackpkts == NULL))
111 goto out_free;
112 }
113
114 if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid,
115 newsk) != 0 ||
116 ccid_hc_tx_init(newdp->dccps_hc_tx_ccid,
117 newsk) != 0)) {
118 dccp_ackpkts_free(newdp->dccps_hc_rx_ackpkts);
119 ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk);
120 ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk);
121out_free:
122 /* It is still a raw copy of the parent, so invalidate
123 * the destructor and do a plain sk_free() */
124 newsk->sk_destruct = NULL;
125 sk_free(newsk);
126 return NULL;
127 }
128
129 __ccid_get(newdp->dccps_hc_rx_ccid);
130 __ccid_get(newdp->dccps_hc_tx_ccid);
131
132 /*
133 * Step 3: Process LISTEN state
134 *
135 * Choose S.ISS (initial seqno) or set from Init Cookie
136 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
137 * Cookie
138 */
139
140 /* See dccp_v4_conn_request */
141 newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd;
142
143 newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
144 dccp_update_gsr(newsk, dreq->dreq_isr);
145
146 newdp->dccps_iss = dreq->dreq_iss;
147 dccp_update_gss(newsk, dreq->dreq_iss);
148
149 /*
150 * SWL and AWL are initially adjusted so that they are not less than
151 * the initial Sequence Numbers received and sent, respectively:
152 * SWL := max(GSR + 1 - floor(W/4), ISR),
153 * AWL := max(GSS - W' + 1, ISS).
154 * These adjustments MUST be applied only at the beginning of the
155 * connection.
156 */
157 dccp_set_seqno(&newdp->dccps_swl,
158 max48(newdp->dccps_swl, newdp->dccps_isr));
159 dccp_set_seqno(&newdp->dccps_awl,
160 max48(newdp->dccps_awl, newdp->dccps_iss));
161
162 dccp_init_xmit_timers(newsk);
163
164 DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
165 }
166 return newsk;
167}
168
169/*
170 * Process an incoming packet for RESPOND sockets represented
171 * as an request_sock.
172 */
173struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
174 struct request_sock *req,
175 struct request_sock **prev)
176{
177 struct sock *child = NULL;
178
179 /* Check for retransmitted REQUEST */
180 if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
181 if (after48(DCCP_SKB_CB(skb)->dccpd_seq,
182 dccp_rsk(req)->dreq_isr)) {
183 struct dccp_request_sock *dreq = dccp_rsk(req);
184
185 dccp_pr_debug("Retransmitted REQUEST\n");
186 /* Send another RESPONSE packet */
187 dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1);
188 dccp_set_seqno(&dreq->dreq_isr,
189 DCCP_SKB_CB(skb)->dccpd_seq);
190 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
191 }
192 /* Network Duplicate, discard packet */
193 return NULL;
194 }
195
196 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
197
198 if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
199 dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
200 goto drop;
201
202 /* Invalid ACK */
203 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) {
204 dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
205 "dreq_iss=%llu\n",
206 (unsigned long long)
207 DCCP_SKB_CB(skb)->dccpd_ack_seq,
208 (unsigned long long)
209 dccp_rsk(req)->dreq_iss);
210 goto drop;
211 }
212
213 child = dccp_v4_request_recv_sock(sk, skb, req, NULL);
214 if (child == NULL)
215 goto listen_overflow;
216
217 /* FIXME: deal with options */
218
219 inet_csk_reqsk_queue_unlink(sk, req, prev);
220 inet_csk_reqsk_queue_removed(sk, req);
221 inet_csk_reqsk_queue_add(sk, req, child);
222out:
223 return child;
224listen_overflow:
225 dccp_pr_debug("listen_overflow!\n");
226 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
227drop:
228 if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
229 req->rsk_ops->send_reset(skb);
230
231 inet_csk_reqsk_queue_drop(sk, req, prev);
232 goto out;
233}
234
235/*
236 * Queue segment on the new socket if the new socket is active,
237 * otherwise we just short-circuit this and continue with
238 * the new socket.
239 */
240int dccp_child_process(struct sock *parent, struct sock *child,
241 struct sk_buff *skb)
242{
243 int ret = 0;
244 const int state = child->sk_state;
245
246 if (!sock_owned_by_user(child)) {
247 ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
248 skb->len);
249
250 /* Wakeup parent, send SIGIO */
251 if (state == DCCP_RESPOND && child->sk_state != state)
252 parent->sk_data_ready(parent, 0);
253 } else {
254 /* Alas, it is possible again, because we do the lookup
255 * in the main socket hash table and the lock on the listening
256 * socket does not protect us anymore.
257 */
258 sk_add_backlog(child, skb);
259 }
260
261 bh_unlock_sock(child);
262 sock_put(child);
263 return ret;
264}
diff --git a/net/dccp/options.c b/net/dccp/options.c
new file mode 100644
index 000000000000..382c5894acb2
--- /dev/null
+++ b/net/dccp/options.c
@@ -0,0 +1,855 @@
1/*
2 * net/dccp/options.c
3 *
4 * An implementation of the DCCP protocol
5 * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
6 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
7 * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14#include <linux/config.h>
15#include <linux/dccp.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/skbuff.h>
20
21#include "ccid.h"
22#include "dccp.h"
23
24static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap,
25 struct sock *sk,
26 const u64 ackno,
27 const unsigned char len,
28 const unsigned char *vector);
29
30/* Stores the default values for a new connection; may be changed with sysctl */
31static const struct dccp_options dccpo_default_values = {
32 .dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW,
33 .dccpo_ccid = DCCPF_INITIAL_CCID,
34 .dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR,
35 .dccpo_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT,
36};
37
38void dccp_options_init(struct dccp_options *dccpo)
39{
40 memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo));
41}
42
43static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
44{
45 u32 value = 0;
46
47 if (len > 3)
48 value += *bf++ << 24;
49 if (len > 2)
50 value += *bf++ << 16;
51 if (len > 1)
52 value += *bf++ << 8;
53 if (len > 0)
54 value += *bf;
55
56 return value;
57}
58
59int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
60{
61 struct dccp_sock *dp = dccp_sk(sk);
62#ifdef CONFIG_IP_DCCP_DEBUG
63 const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
64 "CLIENT rx opt: " : "server rx opt: ";
65#endif
66 const struct dccp_hdr *dh = dccp_hdr(skb);
67 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
68 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
69 unsigned char *opt_ptr = options;
70 const unsigned char *opt_end = (unsigned char *)dh +
71 (dh->dccph_doff * 4);
72 struct dccp_options_received *opt_recv = &dp->dccps_options_received;
73 unsigned char opt, len;
74 unsigned char *value;
75
76 memset(opt_recv, 0, sizeof(*opt_recv));
77
78 while (opt_ptr != opt_end) {
79 opt = *opt_ptr++;
80 len = 0;
81 value = NULL;
82
83 /* Check if this isn't a single byte option */
84 if (opt > DCCPO_MAX_RESERVED) {
85 if (opt_ptr == opt_end)
86 goto out_invalid_option;
87
88 len = *opt_ptr++;
89 if (len < 3)
90 goto out_invalid_option;
91 /*
92 * Remove the type and len fields, leaving
93 * just the value size
94 */
95 len -= 2;
96 value = opt_ptr;
97 opt_ptr += len;
98
99 if (opt_ptr > opt_end)
100 goto out_invalid_option;
101 }
102
103 switch (opt) {
104 case DCCPO_PADDING:
105 break;
106 case DCCPO_NDP_COUNT:
107 if (len > 3)
108 goto out_invalid_option;
109
110 opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
111 dccp_pr_debug("%sNDP count=%d\n", debug_prefix,
112 opt_recv->dccpor_ndp);
113 break;
114 case DCCPO_ACK_VECTOR_0:
115 if (len > DCCP_MAX_ACK_VECTOR_LEN)
116 goto out_invalid_option;
117
118 if (pkt_type == DCCP_PKT_DATA)
119 continue;
120
121 opt_recv->dccpor_ack_vector_len = len;
122 opt_recv->dccpor_ack_vector_idx = value - options;
123
124 dccp_pr_debug("%sACK vector 0, len=%d, ack_ackno=%llu\n",
125 debug_prefix, len,
126 (unsigned long long)
127 DCCP_SKB_CB(skb)->dccpd_ack_seq);
128 dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq,
129 value, len);
130 dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts,
131 sk,
132 DCCP_SKB_CB(skb)->dccpd_ack_seq,
133 len, value);
134 break;
135 case DCCPO_TIMESTAMP:
136 if (len != 4)
137 goto out_invalid_option;
138
139 opt_recv->dccpor_timestamp = ntohl(*(u32 *)value);
140
141 dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
142 do_gettimeofday(&dp->dccps_timestamp_time);
143
144 dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n",
145 debug_prefix, opt_recv->dccpor_timestamp,
146 (unsigned long long)
147 DCCP_SKB_CB(skb)->dccpd_ack_seq);
148 break;
149 case DCCPO_TIMESTAMP_ECHO:
150 if (len != 4 && len != 6 && len != 8)
151 goto out_invalid_option;
152
153 opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value);
154
155 dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ",
156 debug_prefix,
157 opt_recv->dccpor_timestamp_echo,
158 len + 2,
159 (unsigned long long)
160 DCCP_SKB_CB(skb)->dccpd_ack_seq);
161
162 if (len > 4) {
163 if (len == 6)
164 opt_recv->dccpor_elapsed_time =
165 ntohs(*(u16 *)(value + 4));
166 else
167 opt_recv->dccpor_elapsed_time =
168 ntohl(*(u32 *)(value + 4));
169
170 dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n",
171 debug_prefix,
172 opt_recv->dccpor_elapsed_time);
173 }
174 break;
175 case DCCPO_ELAPSED_TIME:
176 if (len != 2 && len != 4)
177 goto out_invalid_option;
178
179 if (pkt_type == DCCP_PKT_DATA)
180 continue;
181
182 if (len == 2)
183 opt_recv->dccpor_elapsed_time =
184 ntohs(*(u16 *)value);
185 else
186 opt_recv->dccpor_elapsed_time =
187 ntohl(*(u32 *)value);
188
189 dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix,
190 opt_recv->dccpor_elapsed_time);
191 break;
192 /*
193 * From draft-ietf-dccp-spec-11.txt:
194 *
195 * Option numbers 128 through 191 are for
196 * options sent from the HC-Sender to the
197 * HC-Receiver; option numbers 192 through 255
198 * are for options sent from the HC-Receiver to
199 * the HC-Sender.
200 */
201 case 128 ... 191: {
202 const u16 idx = value - options;
203
204 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
205 opt, len, idx,
206 value) != 0)
207 goto out_invalid_option;
208 }
209 break;
210 case 192 ... 255: {
211 const u16 idx = value - options;
212
213 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
214 opt, len, idx,
215 value) != 0)
216 goto out_invalid_option;
217 }
218 break;
219 default:
220 pr_info("DCCP(%p): option %d(len=%d) not "
221 "implemented, ignoring\n",
222 sk, opt, len);
223 break;
224 }
225 }
226
227 return 0;
228
229out_invalid_option:
230 DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
231 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
232 pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len);
233 return -1;
234}
235
236static void dccp_encode_value_var(const u32 value, unsigned char *to,
237 const unsigned int len)
238{
239 if (len > 3)
240 *to++ = (value & 0xFF000000) >> 24;
241 if (len > 2)
242 *to++ = (value & 0xFF0000) >> 16;
243 if (len > 1)
244 *to++ = (value & 0xFF00) >> 8;
245 if (len > 0)
246 *to++ = (value & 0xFF);
247}
248
249static inline int dccp_ndp_len(const int ndp)
250{
251 return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3;
252}
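/*
 * Round-trip sketch for the variable-length encoding above, taking an
 * NDP count of 300 as an example: dccp_ndp_len(300) == 2,
 * dccp_encode_value_var(300, buf, 2) stores {0x01, 0x2C}, and
 * dccp_decode_value_var(buf, 2) rebuilds (0x01 << 8) | 0x2C == 300.
 */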
253
254void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
255 const unsigned char option,
256 const void *value, const unsigned char len)
257{
258 unsigned char *to;
259
260 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) {
261 LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
262 "%d option!\n", option);
263 return;
264 }
265
266 DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
267
268 to = skb_push(skb, len + 2);
269 *to++ = option;
270 *to++ = len + 2;
271
272 memcpy(to, value, len);
273}
274
275EXPORT_SYMBOL_GPL(dccp_insert_option);
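/*
 * Example of the resulting on-wire TLV: inserting a 4-byte value with
 * dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, 4) pushes 6 bytes,
 * { DCCPO_TIMESTAMP, 6, now[0], now[1], now[2], now[3] }, i.e. the
 * length octet counts the type and length octets as well as the value.
 */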
276
277static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
278{
279 struct dccp_sock *dp = dccp_sk(sk);
280 int ndp = dp->dccps_ndp_count;
281
282 if (dccp_non_data_packet(skb))
283 ++dp->dccps_ndp_count;
284 else
285 dp->dccps_ndp_count = 0;
286
287 if (ndp > 0) {
288 unsigned char *ptr;
289 const int ndp_len = dccp_ndp_len(ndp);
290 const int len = ndp_len + 2;
291
292 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
293 return;
294
295 DCCP_SKB_CB(skb)->dccpd_opt_len += len;
296
297 ptr = skb_push(skb, len);
298 *ptr++ = DCCPO_NDP_COUNT;
299 *ptr++ = len;
300 dccp_encode_value_var(ndp, ptr, ndp_len);
301 }
302}
303
304static inline int dccp_elapsed_time_len(const u32 elapsed_time)
305{
306 return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
307}
308
309void dccp_insert_option_elapsed_time(struct sock *sk,
310 struct sk_buff *skb,
311 u32 elapsed_time)
312{
313#ifdef CONFIG_IP_DCCP_DEBUG
314 struct dccp_sock *dp = dccp_sk(sk);
315 const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
316 "CLIENT TX opt: " : "server TX opt: ";
317#endif
318 const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
319 const int len = 2 + elapsed_time_len;
320 unsigned char *to;
321
322 if (elapsed_time_len == 0)
323 return;
324
325 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
326 LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
327 "insert elapsed time!\n");
328 return;
329 }
330
331 DCCP_SKB_CB(skb)->dccpd_opt_len += len;
332
333 to = skb_push(skb, len);
334 *to++ = DCCPO_ELAPSED_TIME;
335 *to++ = len;
336
337 if (elapsed_time_len == 2) {
338 const u16 var16 = htons((u16)elapsed_time);
339 memcpy(to, &var16, 2);
340 } else {
341 const u32 var32 = htonl(elapsed_time);
342 memcpy(to, &var32, 4);
343 }
344
345 dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n",
346 debug_prefix, elapsed_time,
347 len,
348 (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
349}
350
351EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
352
353static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb)
354{
355 struct dccp_sock *dp = dccp_sk(sk);
356#ifdef CONFIG_IP_DCCP_DEBUG
357 const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
358 "CLIENT TX opt: " : "server TX opt: ";
359#endif
360 struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
361 int len = ap->dccpap_buf_vector_len + 2;
362 const u32 elapsed_time = timeval_now_delta(&ap->dccpap_time) / 10;
363 unsigned char *to, *from;
364
365 if (elapsed_time != 0)
366 dccp_insert_option_elapsed_time(sk, skb, elapsed_time);
367
368 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
369 LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
370 "insert ACK Vector!\n");
371 return;
372 }
373
374 /*
375 * XXX: now we have just one ack vector sent record, so
376 * we have to wait for it to be cleared.
377 *
378 * Of course this is not acceptable, but this is just for
379 * basic testing now.
380 */
381 if (ap->dccpap_ack_seqno != DCCP_MAX_SEQNO + 1)
382 return;
383
384 DCCP_SKB_CB(skb)->dccpd_opt_len += len;
385
386 to = skb_push(skb, len);
387 *to++ = DCCPO_ACK_VECTOR_0;
388 *to++ = len;
389
390 len = ap->dccpap_buf_vector_len;
391 from = ap->dccpap_buf + ap->dccpap_buf_head;
392
393 /* Check if buf_head wraps */
394 if (ap->dccpap_buf_head + len > ap->dccpap_buf_len) {
395 const unsigned int tailsize = (ap->dccpap_buf_len -
396 ap->dccpap_buf_head);
397
398 memcpy(to, from, tailsize);
399 to += tailsize;
400 len -= tailsize;
401 from = ap->dccpap_buf;
402 }
403
404 memcpy(to, from, len);
405 /*
406 * From draft-ietf-dccp-spec-11.txt:
407 *
408 * For each acknowledgement it sends, the HC-Receiver will add an
409 * acknowledgement record. ack_seqno will equal the HC-Receiver
410 * sequence number it used for the ack packet; ack_ptr will equal
411 * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
412 * equal buf_nonce.
413 *
414 * This implementation uses just one ack record for now.
415 */
416 ap->dccpap_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
417 ap->dccpap_ack_ptr = ap->dccpap_buf_head;
418 ap->dccpap_ack_ackno = ap->dccpap_buf_ackno;
419 ap->dccpap_ack_nonce = ap->dccpap_buf_nonce;
420 ap->dccpap_ack_vector_len = ap->dccpap_buf_vector_len;
421
422 dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, "
423 "ack_ackno=%llu\n",
424 debug_prefix, ap->dccpap_ack_vector_len,
425 (unsigned long long) ap->dccpap_ack_seqno,
426 (unsigned long long) ap->dccpap_ack_ackno);
427}
428
429void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
430{
431 struct timeval tv;
432 u32 now;
433
434 do_gettimeofday(&tv);
435 now = (tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10;
436 /* yes this will overflow but that is the point as we want a
437 * 10 usec 32 bit timer, which means it wraps every 11.9 hours */
438
439 now = htonl(now);
440 dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
441}
442
443EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
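/*
 * Sanity check for the wrap claim above: 2^32 ticks * 10 usec/tick =
 * 42949.67 seconds, i.e. roughly 11.9 hours between wraps of this
 * 32-bit, 10-microsecond-resolution timestamp.
 */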
444
445static void dccp_insert_option_timestamp_echo(struct sock *sk,
446 struct sk_buff *skb)
447{
448 struct dccp_sock *dp = dccp_sk(sk);
449#ifdef CONFIG_IP_DCCP_DEBUG
450 const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
451 "CLIENT TX opt: " : "server TX opt: ";
452#endif
453 u32 tstamp_echo;
454 const u32 elapsed_time =
455 timeval_now_delta(&dp->dccps_timestamp_time) / 10;
456 const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
457 const int len = 6 + elapsed_time_len;
458 unsigned char *to;
459
460 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
461 LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
462 "timestamp echo!\n");
463 return;
464 }
465
466 DCCP_SKB_CB(skb)->dccpd_opt_len += len;
467
468 to = skb_push(skb, len);
469 *to++ = DCCPO_TIMESTAMP_ECHO;
470 *to++ = len;
471
472 tstamp_echo = htonl(dp->dccps_timestamp_echo);
473 memcpy(to, &tstamp_echo, 4);
474 to += 4;
475
476 if (elapsed_time_len == 2) {
477 const u16 var16 = htons((u16)elapsed_time);
478 memcpy(to, &var16, 2);
479 } else if (elapsed_time_len == 4) {
480 const u32 var32 = htonl(elapsed_time);
481 memcpy(to, &var32, 4);
482 }
483
484 dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n",
485 debug_prefix, dp->dccps_timestamp_echo,
486 len,
487 (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
488
489 dp->dccps_timestamp_echo = 0;
490 dp->dccps_timestamp_time.tv_sec = 0;
491 dp->dccps_timestamp_time.tv_usec = 0;
492}
493
494void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
495{
496 struct dccp_sock *dp = dccp_sk(sk);
497
498 DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
499
500 if (dp->dccps_options.dccpo_send_ndp_count)
501 dccp_insert_option_ndp(sk, skb);
502
503 if (!dccp_packet_without_ack(skb)) {
504 if (dp->dccps_options.dccpo_send_ack_vector &&
505 (dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno !=
506 DCCP_MAX_SEQNO + 1))
507 dccp_insert_option_ack_vector(sk, skb);
508
509 if (dp->dccps_timestamp_echo != 0)
510 dccp_insert_option_timestamp_echo(sk, skb);
511 }
512
513 ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb);
514 ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb);
515
516 /* XXX: insert other options when appropriate */
517
518 if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) {
519 /* The length of all options has to be a multiple of 4 */
520 int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
521
522 if (padding != 0) {
523 padding = 4 - padding;
524 memset(skb_push(skb, padding), 0, padding);
525 DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
526 }
527 }
528}
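/*
 * Padding example for the block above: with dccpd_opt_len == 9 the
 * remainder is 1, so 4 - 1 == 3 zeroed (DCCPO_PADDING) bytes are
 * pushed and the option area grows to 12 bytes, a multiple of 4 as
 * the spec requires.
 */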
529
530struct dccp_ackpkts *dccp_ackpkts_alloc(const unsigned int len,
531 const unsigned int __nocast priority)
532{
533 struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority);
534
535 if (ap != NULL) {
536#ifdef CONFIG_IP_DCCP_DEBUG
537 memset(ap->dccpap_buf, 0xFF, len);
538#endif
539 ap->dccpap_buf_len = len;
540 ap->dccpap_buf_head =
541 ap->dccpap_buf_tail =
542 ap->dccpap_buf_len - 1;
543 ap->dccpap_buf_ackno =
544 ap->dccpap_ack_ackno =
545 ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
546 ap->dccpap_buf_nonce = ap->dccpap_ack_nonce = 0;
547 ap->dccpap_ack_ptr = 0;
548 ap->dccpap_time.tv_sec = 0;
549 ap->dccpap_time.tv_usec = 0;
550 ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0;
551 }
552
553 return ap;
554}
555
556void dccp_ackpkts_free(struct dccp_ackpkts *ap)
557{
558 if (ap != NULL) {
559#ifdef CONFIG_IP_DCCP_DEBUG
560 memset(ap, 0xFF, sizeof(*ap) + ap->dccpap_buf_len);
561#endif
562 kfree(ap);
563 }
564}
565
566static inline u8 dccp_ackpkts_state(const struct dccp_ackpkts *ap,
567 const unsigned int index)
568{
569 return ap->dccpap_buf[index] & DCCP_ACKPKTS_STATE_MASK;
570}
571
572static inline u8 dccp_ackpkts_len(const struct dccp_ackpkts *ap,
573 const unsigned int index)
574{
575 return ap->dccpap_buf[index] & DCCP_ACKPKTS_LEN_MASK;
576}
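/*
 * Illustration of the ack vector byte format handled by the two
 * helpers above: the top two bits carry the state and the low six bits
 * the run length, so 0x00 means "received, just this packet", 0x01
 * means "received, this packet and the next one", and 0xC0 (state 3)
 * means "not received, just this packet".
 */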
577
578/*
579 * If several packets are missing, the HC-Receiver may prefer to enter multiple
580 * bytes with run length 0, rather than a single byte with a larger run length;
581 * this simplifies table updates if one of the missing packets arrives.
582 */
583static inline int dccp_ackpkts_set_buf_head_state(struct dccp_ackpkts *ap,
584 const unsigned int packets,
585 const unsigned char state)
586{
587 unsigned int gap;
588 signed long new_head;
589
590 if (ap->dccpap_buf_vector_len + packets > ap->dccpap_buf_len)
591 return -ENOBUFS;
592
593 gap = packets - 1;
594 new_head = ap->dccpap_buf_head - packets;
595
596 if (new_head < 0) {
597 if (gap > 0) {
598 memset(ap->dccpap_buf, DCCP_ACKPKTS_STATE_NOT_RECEIVED,
599 gap + new_head + 1);
600 gap = -new_head;
601 }
602 new_head += ap->dccpap_buf_len;
603 }
604
605 ap->dccpap_buf_head = new_head;
606
607 if (gap > 0)
608 memset(ap->dccpap_buf + ap->dccpap_buf_head + 1,
609 DCCP_ACKPKTS_STATE_NOT_RECEIVED, gap);
610
611 ap->dccpap_buf[ap->dccpap_buf_head] = state;
612 ap->dccpap_buf_vector_len += packets;
613 return 0;
614}
615
616/*
617 * Implements the draft-ietf-dccp-spec-11.txt Appendix A
618 */
619int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state)
620{
621 /*
622 * Check at the right places if the buffer is full; if it is, tell the
623 * caller to start dropping packets until the HC-Sender acks our ACK
624 * vectors, at which point we will free up space in dccpap_buf.
625 *
626 * We may well decide to do buffer compression, etc, but for now let's
627 * just drop.
628 *
629 * From Appendix A:
630 *
631 * Of course, the circular buffer may overflow, either when the
632 * HC-Sender is sending data at a very high rate, when the
633 * HC-Receiver's acknowledgements are not reaching the HC-Sender,
634 * or when the HC-Sender is forgetting to acknowledge those acks
635 * (so the HC-Receiver is unable to clean up old state). In this
636 * case, the HC-Receiver should either compress the buffer (by
637 * increasing run lengths when possible), transfer its state to
638 * a larger buffer, or, as a last resort, drop all received
639 * packets, without processing them whatsoever, until its buffer
640 * shrinks again.
641 */
642
643 /* See if this is the first ackno being inserted */
644 if (ap->dccpap_buf_vector_len == 0) {
645 ap->dccpap_buf[ap->dccpap_buf_head] = state;
646 ap->dccpap_buf_vector_len = 1;
647 } else if (after48(ackno, ap->dccpap_buf_ackno)) {
648 const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno,
649 ackno);
650
651 /*
652 * Check if the state of this packet is the same as that of the
653 * previous ackno and, if so, whether we can bump the head len.
654 */
655 if (delta == 1 &&
656 dccp_ackpkts_state(ap, ap->dccpap_buf_head) == state &&
657 (dccp_ackpkts_len(ap, ap->dccpap_buf_head) <
658 DCCP_ACKPKTS_LEN_MASK))
659 ap->dccpap_buf[ap->dccpap_buf_head]++;
660 else if (dccp_ackpkts_set_buf_head_state(ap, delta, state))
661 return -ENOBUFS;
662 } else {
663 /*
664 * A.1.2. Old Packets
665 *
666 * When a packet with Sequence Number S arrives, and
667 * S <= buf_ackno, the HC-Receiver will scan the table
668 * for the byte corresponding to S. (Indexing structures
669 * could reduce the complexity of this scan.)
670 */
671 u64 delta = dccp_delta_seqno(ackno, ap->dccpap_buf_ackno);
672 unsigned int index = ap->dccpap_buf_head;
673
674 while (1) {
675 const u8 len = dccp_ackpkts_len(ap, index);
676 const u8 state = dccp_ackpkts_state(ap, index);
677 /*
678 * valid packets not yet in dccpap_buf have a reserved
679 * entry, with a len equal to 0.
680 */
681 if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED &&
682 len == 0 && delta == 0) { /* Found our
683 reserved seat! */
684 dccp_pr_debug("Found %llu reserved seat!\n",
685 (unsigned long long) ackno);
686 ap->dccpap_buf[index] = state;
687 goto out;
688 }
689 /* len == 0 means one packet */
690 if (delta < len + 1)
691 goto out_duplicate;
692
693 delta -= len + 1;
694 if (++index == ap->dccpap_buf_len)
695 index = 0;
696 }
697 }
698
699 ap->dccpap_buf_ackno = ackno;
700 do_gettimeofday(&ap->dccpap_time);
701out:
702 dccp_pr_debug("");
703 dccp_ackpkts_print(ap);
704 return 0;
705
706out_duplicate:
707 /* Duplicate packet */
708 dccp_pr_debug("Received a dup or already considered lost "
709 "packet: %llu\n", (unsigned long long) ackno);
710 return -EILSEQ;
711}
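/*
 * Worked example for the three branches above, starting from an empty
 * vector and state == DCCP_ACKPKTS_STATE_RECEIVED: adding ackno 10
 * takes the first branch (vector {0x00}); adding 11 gives delta == 1
 * with the same state, so the head byte's run length is bumped to
 * {0x01}; adding 14 (delta == 3) makes dccp_ackpkts_set_buf_head_state()
 * write two NOT_RECEIVED bytes for 13 and 12 before the new head byte,
 * leaving {0x00, 0xC0, 0xC0, 0x01} when read from buf_head onwards.
 */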
712
713#ifdef CONFIG_IP_DCCP_DEBUG
714void dccp_ackvector_print(const u64 ackno, const unsigned char *vector,
715 int len)
716{
717 if (!dccp_debug)
718 return;
719
720 printk("ACK vector len=%d, ackno=%llu |", len,
721 (unsigned long long) ackno);
722
723 while (len--) {
724 const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6;
725 const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK);
726
727 printk("%d,%d|", state, rl);
728 ++vector;
729 }
730
731 printk("\n");
732}
733
734void dccp_ackpkts_print(const struct dccp_ackpkts *ap)
735{
736 dccp_ackvector_print(ap->dccpap_buf_ackno,
737 ap->dccpap_buf + ap->dccpap_buf_head,
738 ap->dccpap_buf_vector_len);
739}
740#endif
741
742static void dccp_ackpkts_throw_away_ack_record(struct dccp_ackpkts *ap)
743{
744 /*
745 * As we're keeping track of the ack vector size
746 * (dccpap_buf_vector_len) and the sent ack vector size
747 * (dccpap_ack_vector_len) we don't need dccpap_buf_tail at all, but
748 * keep this code here as in the future we'll implement a vector of
749 * ack records, as suggested in draft-ietf-dccp-spec-11.txt
750 * Appendix A. -acme
751 */
752#if 0
753 ap->dccpap_buf_tail = ap->dccpap_ack_ptr + 1;
754 if (ap->dccpap_buf_tail >= ap->dccpap_buf_len)
755 ap->dccpap_buf_tail -= ap->dccpap_buf_len;
756#endif
757 ap->dccpap_buf_vector_len -= ap->dccpap_ack_vector_len;
758}
759
760void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk,
761 u64 ackno)
762{
763 /* Check if we actually sent an ACK vector */
764 if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)
765 return;
766
767 if (ackno == ap->dccpap_ack_seqno) {
768#ifdef CONFIG_IP_DCCP_DEBUG
769 struct dccp_sock *dp = dccp_sk(sk);
770 const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
771 "CLIENT rx ack: " : "server rx ack: ";
772#endif
773 dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, "
774 "ack_ackno=%llu, ACKED!\n",
775 debug_prefix, 1,
776 (unsigned long long) ap->dccpap_ack_seqno,
777 (unsigned long long) ap->dccpap_ack_ackno);
778 dccp_ackpkts_throw_away_ack_record(ap);
779 ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
780 }
781}
782
783static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap,
784 struct sock *sk, u64 ackno,
785 const unsigned char len,
786 const unsigned char *vector)
787{
788 unsigned char i;
789
790 /* Check if we actually sent an ACK vector */
791 if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)
792 return;
793 /*
794 * We're in the receiver half connection, so if the received an ACK
795 * vector ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're
796 * not interested.
797 *
798 * Extra explanation with example:
799 *
800 * if we received an ACK vector with ackno 50, it can only be acking
801 * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent).
802 */
803 /* dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); */
804 if (before48(ackno, ap->dccpap_ack_seqno)) {
805 /* dccp_pr_debug_cat("yes\n"); */
806 return;
807 }
808 /* dccp_pr_debug_cat("no\n"); */
809
810 i = len;
811 while (i--) {
812 const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK);
813 u64 ackno_end_rl;
814
815 dccp_set_seqno(&ackno_end_rl, ackno - rl);
816
817 /*
818 * dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl,
819 * ap->dccpap_ack_seqno, ackno);
820 */
821 if (between48(ap->dccpap_ack_seqno, ackno_end_rl, ackno)) {
822 const u8 state = (*vector &
823 DCCP_ACKPKTS_STATE_MASK) >> 6;
824 /* dccp_pr_debug_cat("yes\n"); */
825
826 if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) {
827#ifdef CONFIG_IP_DCCP_DEBUG
828 struct dccp_sock *dp = dccp_sk(sk);
829 const char *debug_prefix =
830 dp->dccps_role == DCCP_ROLE_CLIENT ?
831 "CLIENT rx ack: " : "server rx ack: ";
832#endif
833 dccp_pr_debug("%sACK vector 0, len=%d, "
834 "ack_seqno=%llu, ack_ackno=%llu, "
835 "ACKED!\n",
836 debug_prefix, len,
837 (unsigned long long)
838 ap->dccpap_ack_seqno,
839 (unsigned long long)
840 ap->dccpap_ack_ackno);
841				dccp_ackpkts_throw_away_ack_record(ap);
842 }
843 /*
844 * If dccpap_ack_seqno was not received, no problem
845 * we'll send another ACK vector.
846 */
847 ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
848 break;
849 }
850 /* dccp_pr_debug_cat("no\n"); */
851
852 dccp_set_seqno(&ackno, ackno_end_rl - 1);
853 ++vector;
854 }
855}
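
Each ack vector byte walked above packs a 2-bit packet state into its top bits and a 6-bit run length into the rest, so one byte covers rl + 1 consecutive sequence numbers counting down from ackno. A minimal standalone sketch of that decoding, assuming the same mask values as DCCP_ACKPKTS_STATE_MASK and DCCP_ACKPKTS_LEN_MASK in the kernel sources:

#include <stdio.h>

#define ACKVEC_STATE_MASK 0xC0 /* top two bits: packet state */
#define ACKVEC_LEN_MASK   0x3F /* low six bits: run length */

int main(void)
{
	const unsigned char vector[] = { 0x00, 0xc1, 0x02 };
	unsigned long long ackno = 100; /* highest seqno the vector covers */
	size_t i;

	for (i = 0; i < sizeof(vector); i++) {
		unsigned int state = (vector[i] & ACKVEC_STATE_MASK) >> 6;
		unsigned int rl = vector[i] & ACKVEC_LEN_MASK;

		/* each byte covers rl + 1 consecutive sequence numbers */
		printf("seqnos %llu..%llu: state %u\n",
		       ackno - rl, ackno, state);
		ackno -= rl + 1;
	}
	return 0;
}
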
diff --git a/net/dccp/output.c b/net/dccp/output.c
new file mode 100644
index 000000000000..28de157a4326
--- /dev/null
+++ b/net/dccp/output.c
@@ -0,0 +1,528 @@
1/*
2 * net/dccp/output.c
3 *
4 * An implementation of the DCCP protocol
5 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/config.h>
14#include <linux/dccp.h>
15#include <linux/skbuff.h>
16
17#include <net/sock.h>
18
19#include "ccid.h"
20#include "dccp.h"
21
22static inline void dccp_event_ack_sent(struct sock *sk)
23{
24 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
25}
26
27/*
28 * All SKBs seen here are completely headerless. It is our
29 * job to build the DCCP header, and pass the packet down to
30 * IP so it can do the same plus pass the packet off to the
31 * device.
32 */
33int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
34{
35 if (likely(skb != NULL)) {
36 const struct inet_sock *inet = inet_sk(sk);
37 struct dccp_sock *dp = dccp_sk(sk);
38 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
39 struct dccp_hdr *dh;
40 /* XXX For now we're using only 48 bits sequence numbers */
41 const int dccp_header_size = sizeof(*dh) +
42 sizeof(struct dccp_hdr_ext) +
43 dccp_packet_hdr_len(dcb->dccpd_type);
44 int err, set_ack = 1;
45 u64 ackno = dp->dccps_gsr;
46
47 dccp_inc_seqno(&dp->dccps_gss);
48
49 switch (dcb->dccpd_type) {
50 case DCCP_PKT_DATA:
51 set_ack = 0;
52 break;
53 case DCCP_PKT_SYNC:
54 case DCCP_PKT_SYNCACK:
55 ackno = dcb->dccpd_seq;
56 break;
57 }
58
59 dcb->dccpd_seq = dp->dccps_gss;
60 dccp_insert_options(sk, skb);
61
62 skb->h.raw = skb_push(skb, dccp_header_size);
63 dh = dccp_hdr(skb);
64 /*
65 * Data packets are not cloned as they are never retransmitted
66 */
67 if (skb_cloned(skb))
68 skb_set_owner_w(skb, sk);
69
70 /* Build DCCP header and checksum it. */
71 memset(dh, 0, dccp_header_size);
72 dh->dccph_type = dcb->dccpd_type;
73 dh->dccph_sport = inet->sport;
74 dh->dccph_dport = inet->dport;
75 dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
76 dh->dccph_ccval = dcb->dccpd_ccval;
77 /* XXX For now we're using only 48 bits sequence numbers */
78 dh->dccph_x = 1;
79
80 dp->dccps_awh = dp->dccps_gss;
81 dccp_hdr_set_seq(dh, dp->dccps_gss);
82 if (set_ack)
83 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
84
85 switch (dcb->dccpd_type) {
86 case DCCP_PKT_REQUEST:
87 dccp_hdr_request(skb)->dccph_req_service =
88 dcb->dccpd_service;
89 break;
90 case DCCP_PKT_RESET:
91 dccp_hdr_reset(skb)->dccph_reset_code =
92 dcb->dccpd_reset_code;
93 break;
94 }
95
96 dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr,
97 inet->daddr);
98
99 if (set_ack)
100 dccp_event_ack_sent(sk);
101
102 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
103
104 err = ip_queue_xmit(skb, 0);
105 if (err <= 0)
106 return err;
107
108		/* NET_XMIT_CN is special. It does not guarantee
109		 * that this packet is lost. It tells us that the device
110		 * is about to start dropping packets, or already
111		 * drops some packets of the same priority, and
112		 * invites us to send less aggressively.
113 */
114 return err == NET_XMIT_CN ? 0 : err;
115 }
116 return -ENOBUFS;
117}
118
119unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
120{
121 struct dccp_sock *dp = dccp_sk(sk);
122 int mss_now;
123
124 /*
125 * FIXME: we really should be using the af_specific thing to support
126 * IPv6.
127 * mss_now = pmtu - tp->af_specific->net_header_len -
128 * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
129 */
130 mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
131 sizeof(struct dccp_hdr_ext);
132
133 /* Now subtract optional transport overhead */
134 mss_now -= dp->dccps_ext_header_len;
135
136 /*
137 * FIXME: this should come from the CCID infrastructure, where, say,
138 * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
139 * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
140 * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
141 * make it a multiple of 4
142 */
143
144 mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
145
146 /* And store cached results */
147 dp->dccps_pmtu_cookie = pmtu;
148 dp->dccps_mss_cache = mss_now;
149
150 return mss_now;
151}
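
The rough option estimate above evaluates to 40: the byte counts sum to 42, and the integer divide-then-multiply rounds that down to a multiple of 4. A hedged sketch of the whole dccp_sync_mss() arithmetic for a 1500-byte PMTU; the 20-byte IPv4 and 12+4-byte DCCP header sizes and the zero extension-header length are illustrative assumptions, not values taken from this tree:

#include <stdio.h>

int main(void)
{
	int pmtu = 1500;                     /* typical Ethernet path */
	int ip_hdr = 20;                     /* IPv4, no options (assumption) */
	int dccp_hdr = 12, dccp_hdr_ext = 4; /* generic + 48-bit seq ext (assumption) */
	int ext_hdr_len = 0;                 /* dccps_ext_header_len stand-in */
	int opt_estimate = ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* = 40 */
	int mss;

	mss = pmtu - ip_hdr - dccp_hdr - dccp_hdr_ext - ext_hdr_len
	      - opt_estimate;
	printf("mss_cache = %d\n", mss); /* 1424 with these numbers */
	return 0;
}
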
152
153void dccp_write_space(struct sock *sk)
154{
155 read_lock(&sk->sk_callback_lock);
156
157 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
158 wake_up_interruptible(sk->sk_sleep);
159 /* Should agree with poll, otherwise some programs break */
160 if (sock_writeable(sk))
161 sk_wake_async(sk, 2, POLL_OUT);
162
163 read_unlock(&sk->sk_callback_lock);
164}
165
166/**
167 * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
168 * @sk: socket to wait for
169 * @timeo: for how long
170 */
171static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb,
172 long *timeo)
173{
174 struct dccp_sock *dp = dccp_sk(sk);
175 DEFINE_WAIT(wait);
176 long delay;
177 int rc;
178
179 while (1) {
180 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
181
182 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
183 goto do_error;
184 if (!*timeo)
185 goto do_nonblock;
186 if (signal_pending(current))
187 goto do_interrupted;
188
189 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
190 skb->len);
191 if (rc <= 0)
192 break;
193 delay = msecs_to_jiffies(rc);
194 if (delay > *timeo || delay < 0)
195 goto do_nonblock;
196
197 sk->sk_write_pending++;
198 release_sock(sk);
199 *timeo -= schedule_timeout(delay);
200 lock_sock(sk);
201 sk->sk_write_pending--;
202 }
203out:
204 finish_wait(sk->sk_sleep, &wait);
205 return rc;
206
207do_error:
208 rc = -EPIPE;
209 goto out;
210do_nonblock:
211 rc = -EAGAIN;
212 goto out;
213do_interrupted:
214 rc = sock_intr_errno(*timeo);
215 goto out;
216}
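
The loop above leans on the ccid_hc_tx_send_packet() return convention visible in the code: 0 means the CCID clears the packet for immediate transmission, a positive value is a delay in milliseconds before asking again, and a negative value is an error. A hedged userspace-style sketch of a caller honoring that contract (ccid_tx_send_packet() is a stand-in here, not a kernel symbol):

#include <stdio.h>

/* 0: send now; >0: retry after that many msecs; <0: hard error */
static int ccid_tx_send_packet(void)
{
	static int calls;

	return calls++ < 2 ? 10 : 0; /* rate-limit the first two attempts */
}

int main(void)
{
	int rc;

	while ((rc = ccid_tx_send_packet()) != 0) {
		if (rc < 0) {
			printf("give up, error %d\n", rc);
			return 1;
		}
		printf("CCID asks for a %d msec delay\n", rc);
		/* the kernel sleeps via schedule_timeout() at this point */
	}
	printf("CCID cleared the packet for transmission\n");
	return 0;
}
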
217
218int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
219{
220 const struct dccp_sock *dp = dccp_sk(sk);
221 int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
222 skb->len);
223
224 if (err > 0)
225 err = dccp_wait_for_ccid(sk, skb, timeo);
226
227 if (err == 0) {
228 const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
229 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
230 const int len = skb->len;
231
232 if (sk->sk_state == DCCP_PARTOPEN) {
233 /* See 8.1.5. Handshake Completion */
234 inet_csk_schedule_ack(sk);
235 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
236 inet_csk(sk)->icsk_rto,
237 DCCP_RTO_MAX);
238 dcb->dccpd_type = DCCP_PKT_DATAACK;
239 /*
240 * FIXME: we really should have a
241 * dccps_ack_pending or use icsk.
242 */
243 } else if (inet_csk_ack_scheduled(sk) ||
244 dp->dccps_timestamp_echo != 0 ||
245 (dp->dccps_options.dccpo_send_ack_vector &&
246 ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 &&
247 ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1))
248 dcb->dccpd_type = DCCP_PKT_DATAACK;
249 else
250 dcb->dccpd_type = DCCP_PKT_DATA;
251
252 err = dccp_transmit_skb(sk, skb);
253 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
254 }
255
256 return err;
257}
258
259int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
260{
261 if (inet_sk_rebuild_header(sk) != 0)
262 return -EHOSTUNREACH; /* Routing failure or similar. */
263
264 return dccp_transmit_skb(sk, (skb_cloned(skb) ?
265 pskb_copy(skb, GFP_ATOMIC):
266 skb_clone(skb, GFP_ATOMIC)));
267}
268
269struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
270 struct request_sock *req)
271{
272 struct dccp_hdr *dh;
273 const int dccp_header_size = sizeof(struct dccp_hdr) +
274 sizeof(struct dccp_hdr_ext) +
275 sizeof(struct dccp_hdr_response);
276 struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
277 dccp_header_size, 1,
278 GFP_ATOMIC);
279 if (skb == NULL)
280 return NULL;
281
282 /* Reserve space for headers. */
283 skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
284
285 skb->dst = dst_clone(dst);
286 skb->csum = 0;
287
288 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
289 DCCP_SKB_CB(skb)->dccpd_seq = dccp_rsk(req)->dreq_iss;
290 dccp_insert_options(sk, skb);
291
292 skb->h.raw = skb_push(skb, dccp_header_size);
293
294 dh = dccp_hdr(skb);
295 memset(dh, 0, dccp_header_size);
296
297 dh->dccph_sport = inet_sk(sk)->sport;
298 dh->dccph_dport = inet_rsk(req)->rmt_port;
299 dh->dccph_doff = (dccp_header_size +
300 DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
301 dh->dccph_type = DCCP_PKT_RESPONSE;
302 dh->dccph_x = 1;
303 dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss);
304 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dccp_rsk(req)->dreq_isr);
305
306 dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr,
307 inet_rsk(req)->rmt_addr);
308
309 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
310 return skb;
311}
312
313struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
314 const enum dccp_reset_codes code)
315
316{
317 struct dccp_hdr *dh;
318 struct dccp_sock *dp = dccp_sk(sk);
319 const int dccp_header_size = sizeof(struct dccp_hdr) +
320 sizeof(struct dccp_hdr_ext) +
321 sizeof(struct dccp_hdr_reset);
322 struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
323 dccp_header_size, 1,
324 GFP_ATOMIC);
325 if (skb == NULL)
326 return NULL;
327
328 /* Reserve space for headers. */
329 skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
330
331 skb->dst = dst_clone(dst);
332 skb->csum = 0;
333
334 dccp_inc_seqno(&dp->dccps_gss);
335
336 DCCP_SKB_CB(skb)->dccpd_reset_code = code;
337 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
338 DCCP_SKB_CB(skb)->dccpd_seq = dp->dccps_gss;
339 dccp_insert_options(sk, skb);
340
341 skb->h.raw = skb_push(skb, dccp_header_size);
342
343 dh = dccp_hdr(skb);
344 memset(dh, 0, dccp_header_size);
345
346 dh->dccph_sport = inet_sk(sk)->sport;
347 dh->dccph_dport = inet_sk(sk)->dport;
348 dh->dccph_doff = (dccp_header_size +
349 DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
350 dh->dccph_type = DCCP_PKT_RESET;
351 dh->dccph_x = 1;
352 dccp_hdr_set_seq(dh, dp->dccps_gss);
353 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr);
354
355 dccp_hdr_reset(skb)->dccph_reset_code = code;
356
357 dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr,
358 inet_sk(sk)->daddr);
359
360 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
361 return skb;
362}
363
364/*
365 * Do all connect socket setups that can be done AF independent.
366 */
367static inline void dccp_connect_init(struct sock *sk)
368{
369 struct dst_entry *dst = __sk_dst_get(sk);
370 struct inet_connection_sock *icsk = inet_csk(sk);
371
372 sk->sk_err = 0;
373 sock_reset_flag(sk, SOCK_DONE);
374
375 dccp_sync_mss(sk, dst_mtu(dst));
376
377 /*
378 * FIXME: set dp->{dccps_swh,dccps_swl}, with
379 * something like dccp_inc_seq
380 */
381
382 icsk->icsk_retransmits = 0;
383}
384
385int dccp_connect(struct sock *sk)
386{
387 struct sk_buff *skb;
388 struct inet_connection_sock *icsk = inet_csk(sk);
389
390 dccp_connect_init(sk);
391
392 skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation);
393 if (unlikely(skb == NULL))
394 return -ENOBUFS;
395
396 /* Reserve space for headers. */
397 skb_reserve(skb, MAX_DCCP_HEADER);
398
399 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
400 /* FIXME: set service to something meaningful, coming
401	 * from userspace */
402 DCCP_SKB_CB(skb)->dccpd_service = 0;
403 skb->csum = 0;
404 skb_set_owner_w(skb, sk);
405
406 BUG_TRAP(sk->sk_send_head == NULL);
407 sk->sk_send_head = skb;
408 dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
409 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
410
411 /* Timer for repeating the REQUEST until an answer. */
412 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
413 icsk->icsk_rto, DCCP_RTO_MAX);
414 return 0;
415}
416
417void dccp_send_ack(struct sock *sk)
418{
419 /* If we have been reset, we may not send again. */
420 if (sk->sk_state != DCCP_CLOSED) {
421 struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
422
423 if (skb == NULL) {
424 inet_csk_schedule_ack(sk);
425 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
426 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
427 TCP_DELACK_MAX,
428 DCCP_RTO_MAX);
429 return;
430 }
431
432 /* Reserve space for headers */
433 skb_reserve(skb, MAX_DCCP_HEADER);
434 skb->csum = 0;
435 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
436 skb_set_owner_w(skb, sk);
437 dccp_transmit_skb(sk, skb);
438 }
439}
440
441EXPORT_SYMBOL_GPL(dccp_send_ack);
442
443void dccp_send_delayed_ack(struct sock *sk)
444{
445 struct inet_connection_sock *icsk = inet_csk(sk);
446 /*
447 * FIXME: tune this timer. elapsed time fixes the skew, so no problem
448 * with using 2s, and active senders also piggyback the ACK into a
449 * DATAACK packet, so this is really for quiescent senders.
450 */
451 unsigned long timeout = jiffies + 2 * HZ;
452
453	/* Use the new timeout only if there wasn't an older one already. */
454 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
455 /* If delack timer was blocked or is about to expire,
456 * send ACK now.
457 *
458 * FIXME: check the "about to expire" part
459 */
460 if (icsk->icsk_ack.blocked) {
461 dccp_send_ack(sk);
462 return;
463 }
464
465 if (!time_before(timeout, icsk->icsk_ack.timeout))
466 timeout = icsk->icsk_ack.timeout;
467 }
468 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
469 icsk->icsk_ack.timeout = timeout;
470 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
471}
472
473void dccp_send_sync(struct sock *sk, const u64 seq,
474 const enum dccp_pkt_type pkt_type)
475{
476 /*
477 * We are not putting this on the write queue, so
478 * dccp_transmit_skb() will set the ownership to this
479 * sock.
480 */
481 struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
482
483 if (skb == NULL)
484 /* FIXME: how to make sure the sync is sent? */
485 return;
486
487 /* Reserve space for headers and prepare control bits. */
488 skb_reserve(skb, MAX_DCCP_HEADER);
489 skb->csum = 0;
490 DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
491 DCCP_SKB_CB(skb)->dccpd_seq = seq;
492
493 skb_set_owner_w(skb, sk);
494 dccp_transmit_skb(sk, skb);
495}
496
497/*
498 * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
499 * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
500 * any circumstances.
501 */
502void dccp_send_close(struct sock *sk, const int active)
503{
504 struct dccp_sock *dp = dccp_sk(sk);
505 struct sk_buff *skb;
506 const unsigned int prio = active ? GFP_KERNEL : GFP_ATOMIC;
507
508 skb = alloc_skb(sk->sk_prot->max_header, prio);
509 if (skb == NULL)
510 return;
511
512 /* Reserve space for headers and prepare control bits. */
513 skb_reserve(skb, sk->sk_prot->max_header);
514 skb->csum = 0;
515 DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ?
516 DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ;
517
518 skb_set_owner_w(skb, sk);
519 if (active) {
520 BUG_TRAP(sk->sk_send_head == NULL);
521 sk->sk_send_head = skb;
522 dccp_transmit_skb(sk, skb_clone(skb, prio));
523 } else
524 dccp_transmit_skb(sk, skb);
525
526 ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
527 ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
528}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
new file mode 100644
index 000000000000..18a0e69c9dc7
--- /dev/null
+++ b/net/dccp/proto.c
@@ -0,0 +1,826 @@
1/*
2 * net/dccp/proto.c
3 *
4 * An implementation of the DCCP protocol
5 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/config.h>
13#include <linux/dccp.h>
14#include <linux/module.h>
15#include <linux/types.h>
16#include <linux/sched.h>
17#include <linux/kernel.h>
18#include <linux/skbuff.h>
19#include <linux/netdevice.h>
20#include <linux/in.h>
21#include <linux/if_arp.h>
22#include <linux/init.h>
23#include <linux/random.h>
24#include <net/checksum.h>
25
26#include <net/inet_common.h>
27#include <net/ip.h>
28#include <net/protocol.h>
29#include <net/sock.h>
30#include <net/xfrm.h>
31
32#include <asm/semaphore.h>
33#include <linux/spinlock.h>
34#include <linux/timer.h>
35#include <linux/delay.h>
36#include <linux/poll.h>
37#include <linux/dccp.h>
38
39#include "ccid.h"
40#include "dccp.h"
41
42DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
43
44atomic_t dccp_orphan_count = ATOMIC_INIT(0);
45
46static struct net_protocol dccp_protocol = {
47 .handler = dccp_v4_rcv,
48 .err_handler = dccp_v4_err,
49};
50
51const char *dccp_packet_name(const int type)
52{
53 static const char *dccp_packet_names[] = {
54 [DCCP_PKT_REQUEST] = "REQUEST",
55 [DCCP_PKT_RESPONSE] = "RESPONSE",
56 [DCCP_PKT_DATA] = "DATA",
57 [DCCP_PKT_ACK] = "ACK",
58 [DCCP_PKT_DATAACK] = "DATAACK",
59 [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
60 [DCCP_PKT_CLOSE] = "CLOSE",
61 [DCCP_PKT_RESET] = "RESET",
62 [DCCP_PKT_SYNC] = "SYNC",
63 [DCCP_PKT_SYNCACK] = "SYNCACK",
64 };
65
66 if (type >= DCCP_NR_PKT_TYPES)
67 return "INVALID";
68 else
69 return dccp_packet_names[type];
70}
71
72EXPORT_SYMBOL_GPL(dccp_packet_name);
73
74const char *dccp_state_name(const int state)
75{
76 static char *dccp_state_names[] = {
77 [DCCP_OPEN] = "OPEN",
78 [DCCP_REQUESTING] = "REQUESTING",
79 [DCCP_PARTOPEN] = "PARTOPEN",
80 [DCCP_LISTEN] = "LISTEN",
81 [DCCP_RESPOND] = "RESPOND",
82 [DCCP_CLOSING] = "CLOSING",
83 [DCCP_TIME_WAIT] = "TIME_WAIT",
84 [DCCP_CLOSED] = "CLOSED",
85 };
86
87 if (state >= DCCP_MAX_STATES)
88 return "INVALID STATE!";
89 else
90 return dccp_state_names[state];
91}
92
93EXPORT_SYMBOL_GPL(dccp_state_name);
94
95static inline int dccp_listen_start(struct sock *sk)
96{
97 dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN;
98 return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
99}
100
101int dccp_disconnect(struct sock *sk, int flags)
102{
103 struct inet_connection_sock *icsk = inet_csk(sk);
104 struct inet_sock *inet = inet_sk(sk);
105 int err = 0;
106 const int old_state = sk->sk_state;
107
108 if (old_state != DCCP_CLOSED)
109 dccp_set_state(sk, DCCP_CLOSED);
110
111 /* ABORT function of RFC793 */
112 if (old_state == DCCP_LISTEN) {
113 inet_csk_listen_stop(sk);
114 /* FIXME: do the active reset thing */
115 } else if (old_state == DCCP_REQUESTING)
116 sk->sk_err = ECONNRESET;
117
118 dccp_clear_xmit_timers(sk);
119 __skb_queue_purge(&sk->sk_receive_queue);
120 if (sk->sk_send_head != NULL) {
121 __kfree_skb(sk->sk_send_head);
122 sk->sk_send_head = NULL;
123 }
124
125 inet->dport = 0;
126
127 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
128 inet_reset_saddr(sk);
129
130 sk->sk_shutdown = 0;
131 sock_reset_flag(sk, SOCK_DONE);
132
133 icsk->icsk_backoff = 0;
134 inet_csk_delack_init(sk);
135 __sk_dst_reset(sk);
136
137 BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
138
139 sk->sk_error_report(sk);
140 return err;
141}
142
143/*
144 * Wait for a DCCP event.
145 *
146 * Note that we don't need to lock the socket, as the upper poll layers
147 * take care of normal races (between the test and the event) and we don't
148 * go look at any of the socket buffers directly.
149 */
150static unsigned int dccp_poll(struct file *file, struct socket *sock,
151 poll_table *wait)
152{
153 unsigned int mask;
154 struct sock *sk = sock->sk;
155
156 poll_wait(file, sk->sk_sleep, wait);
157 if (sk->sk_state == DCCP_LISTEN)
158 return inet_csk_listen_poll(sk);
159
160 /* Socket is not locked. We are protected from async events
161 by poll logic and correct handling of state changes
162	   made by other threads is impossible in any case.
163 */
164
165 mask = 0;
166 if (sk->sk_err)
167 mask = POLLERR;
168
169 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
170 mask |= POLLHUP;
171 if (sk->sk_shutdown & RCV_SHUTDOWN)
172 mask |= POLLIN | POLLRDNORM;
173
174 /* Connected? */
175 if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
176 if (atomic_read(&sk->sk_rmem_alloc) > 0)
177 mask |= POLLIN | POLLRDNORM;
178
179 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
180 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
181 mask |= POLLOUT | POLLWRNORM;
182 } else { /* send SIGIO later */
183 set_bit(SOCK_ASYNC_NOSPACE,
184 &sk->sk_socket->flags);
185 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
186
187 /* Race breaker. If space is freed after
188 * wspace test but before the flags are set,
189 * IO signal will be lost.
190 */
191 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
192 mask |= POLLOUT | POLLWRNORM;
193 }
194 }
195 }
196 return mask;
197}
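
Because dccp_poll() reports the familiar POLLIN/POLLOUT/POLLHUP masks, a userspace program can wait on a DCCP socket like any other descriptor. A hedged sketch; SOCK_DCCP and IPPROTO_DCCP are spelled as the literal values the module aliases at the bottom of this file use, since headers of this era may lack the named macros:

#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, 6 /* SOCK_DCCP */, 33 /* IPPROTO_DCCP */);
	struct pollfd pfd;

	if (fd < 0) {
		perror("socket"); /* kernel built without DCCP, perhaps */
		return 1;
	}
	pfd.fd = fd;
	pfd.events = POLLIN;
	/* ... connect the socket here, then wait for data or hangup ... */
	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & (POLLIN | POLLHUP)))
		printf("readable or hung up: revents=0x%x\n",
		       (unsigned int)pfd.revents);
	return 0;
}
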
198
199int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
200{
201 dccp_pr_debug("entry\n");
202 return -ENOIOCTLCMD;
203}
204
205int dccp_setsockopt(struct sock *sk, int level, int optname,
206 char __user *optval, int optlen)
207{
208 struct dccp_sock *dp;
209 int err;
210 int val;
211
212 if (level != SOL_DCCP)
213 return ip_setsockopt(sk, level, optname, optval, optlen);
214
215 if (optlen < sizeof(int))
216 return -EINVAL;
217
218 if (get_user(val, (int __user *)optval))
219 return -EFAULT;
220
221 lock_sock(sk);
222
223 dp = dccp_sk(sk);
224 err = 0;
225
226 switch (optname) {
227 case DCCP_SOCKOPT_PACKET_SIZE:
228 dp->dccps_packet_size = val;
229 break;
230 default:
231 err = -ENOPROTOOPT;
232 break;
233 }
234
235 release_sock(sk);
236 return err;
237}
238
239int dccp_getsockopt(struct sock *sk, int level, int optname,
240 char __user *optval, int __user *optlen)
241{
242 struct dccp_sock *dp;
243 int val, len;
244
245 if (level != SOL_DCCP)
246 return ip_getsockopt(sk, level, optname, optval, optlen);
247
248 if (get_user(len, optlen))
249 return -EFAULT;
250
251 len = min_t(unsigned int, len, sizeof(int));
252 if (len < 0)
253 return -EINVAL;
254
255 dp = dccp_sk(sk);
256
257 switch (optname) {
258 case DCCP_SOCKOPT_PACKET_SIZE:
259 val = dp->dccps_packet_size;
260 break;
261 default:
262 return -ENOPROTOOPT;
263 }
264
265 if (put_user(len, optlen) || copy_to_user(optval, &val, len))
266 return -EFAULT;
267
268 return 0;
269}
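
A hedged userspace sketch of the single option pair handled above, DCCP_SOCKOPT_PACKET_SIZE. The SOL_DCCP (269) and option (1) literals are assumptions taken from headers of this era; verify them against linux/socket.h and linux/dccp.h before relying on them:

#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, 6 /* SOCK_DCCP */, 33 /* IPPROTO_DCCP */);
	int val = 1400, out = 0;
	socklen_t len = sizeof(out);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, 269 /* SOL_DCCP, assumed */,
		       1 /* DCCP_SOCKOPT_PACKET_SIZE, assumed */,
		       &val, sizeof(val)) < 0)
		perror("setsockopt");
	if (getsockopt(fd, 269, 1, &out, &len) == 0)
		printf("packet size now %d\n", out);
	return 0;
}
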
270
271int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
272 size_t len)
273{
274 const struct dccp_sock *dp = dccp_sk(sk);
275 const int flags = msg->msg_flags;
276 const int noblock = flags & MSG_DONTWAIT;
277 struct sk_buff *skb;
278 int rc, size;
279 long timeo;
280
281 if (len > dp->dccps_mss_cache)
282 return -EMSGSIZE;
283
284 lock_sock(sk);
285 timeo = sock_sndtimeo(sk, noblock);
286
287 /*
288 * We have to use sk_stream_wait_connect here to set sk_write_pending,
289	 * so that the trick in dccp_rcv_request_sent_state_process works.
290 */
291 /* Wait for a connection to finish. */
292 if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
293 if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
294 goto out_release;
295
296 size = sk->sk_prot->max_header + len;
297 release_sock(sk);
298 skb = sock_alloc_send_skb(sk, size, noblock, &rc);
299 lock_sock(sk);
300 if (skb == NULL)
301 goto out_release;
302
303 skb_reserve(skb, sk->sk_prot->max_header);
304 rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
305 if (rc != 0)
306 goto out_discard;
307
308 rc = dccp_write_xmit(sk, skb, &timeo);
309 /*
310 * XXX we don't use sk_write_queue, so just discard the packet.
311 * Current plan however is to _use_ sk_write_queue with
312	 * an algorithm similar to tcp_sendmsg, where the main difference
313 * is that in DCCP we have to respect packet boundaries, so
314 * no coalescing of skbs.
315 *
316 * This bug was _quickly_ found & fixed by just looking at an OSTRA
317 * generated callgraph 8) -acme
318 */
319 if (rc != 0)
320 goto out_discard;
321out_release:
322 release_sock(sk);
323 return rc ? : len;
324out_discard:
325 kfree_skb(skb);
326 goto out_release;
327}
328
329int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
330 size_t len, int nonblock, int flags, int *addr_len)
331{
332 const struct dccp_hdr *dh;
333 long timeo;
334
335 lock_sock(sk);
336
337 if (sk->sk_state == DCCP_LISTEN) {
338 len = -ENOTCONN;
339 goto out;
340 }
341
342 timeo = sock_rcvtimeo(sk, nonblock);
343
344 do {
345 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
346
347 if (skb == NULL)
348 goto verify_sock_status;
349
350 dh = dccp_hdr(skb);
351
352 if (dh->dccph_type == DCCP_PKT_DATA ||
353 dh->dccph_type == DCCP_PKT_DATAACK)
354 goto found_ok_skb;
355
356 if (dh->dccph_type == DCCP_PKT_RESET ||
357 dh->dccph_type == DCCP_PKT_CLOSE) {
358 dccp_pr_debug("found fin ok!\n");
359 len = 0;
360 goto found_fin_ok;
361 }
362 dccp_pr_debug("packet_type=%s\n",
363 dccp_packet_name(dh->dccph_type));
364 sk_eat_skb(sk, skb);
365verify_sock_status:
366 if (sock_flag(sk, SOCK_DONE)) {
367 len = 0;
368 break;
369 }
370
371 if (sk->sk_err) {
372 len = sock_error(sk);
373 break;
374 }
375
376 if (sk->sk_shutdown & RCV_SHUTDOWN) {
377 len = 0;
378 break;
379 }
380
381 if (sk->sk_state == DCCP_CLOSED) {
382 if (!sock_flag(sk, SOCK_DONE)) {
383 /* This occurs when user tries to read
384 * from never connected socket.
385 */
386 len = -ENOTCONN;
387 break;
388 }
389 len = 0;
390 break;
391 }
392
393 if (!timeo) {
394 len = -EAGAIN;
395 break;
396 }
397
398 if (signal_pending(current)) {
399 len = sock_intr_errno(timeo);
400 break;
401 }
402
403 sk_wait_data(sk, &timeo);
404 continue;
405 found_ok_skb:
406 if (len > skb->len)
407 len = skb->len;
408 else if (len < skb->len)
409 msg->msg_flags |= MSG_TRUNC;
410
411 if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
412 /* Exception. Bailout! */
413 len = -EFAULT;
414 break;
415 }
416 found_fin_ok:
417 if (!(flags & MSG_PEEK))
418 sk_eat_skb(sk, skb);
419 break;
420 } while (1);
421out:
422 release_sock(sk);
423 return len;
424}
425
426static int inet_dccp_listen(struct socket *sock, int backlog)
427{
428 struct sock *sk = sock->sk;
429 unsigned char old_state;
430 int err;
431
432 lock_sock(sk);
433
434 err = -EINVAL;
435 if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
436 goto out;
437
438 old_state = sk->sk_state;
439 if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
440 goto out;
441
442 /* Really, if the socket is already in listen state
443 * we can only allow the backlog to be adjusted.
444 */
445 if (old_state != DCCP_LISTEN) {
446 /*
447 * FIXME: here it probably should be sk->sk_prot->listen_start
448 * see tcp_listen_start
449 */
450 err = dccp_listen_start(sk);
451 if (err)
452 goto out;
453 }
454 sk->sk_max_ack_backlog = backlog;
455 err = 0;
456
457out:
458 release_sock(sk);
459 return err;
460}
461
462static const unsigned char dccp_new_state[] = {
463 /* current state: new state: action: */
464 [0] = DCCP_CLOSED,
465 [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
466 [DCCP_REQUESTING] = DCCP_CLOSED,
467 [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
468 [DCCP_LISTEN] = DCCP_CLOSED,
469 [DCCP_RESPOND] = DCCP_CLOSED,
470 [DCCP_CLOSING] = DCCP_CLOSED,
471 [DCCP_TIME_WAIT] = DCCP_CLOSED,
472 [DCCP_CLOSED] = DCCP_CLOSED,
473};
474
475static int dccp_close_state(struct sock *sk)
476{
477 const int next = dccp_new_state[sk->sk_state];
478 const int ns = next & DCCP_STATE_MASK;
479
480 if (ns != sk->sk_state)
481 dccp_set_state(sk, ns);
482
483 return next & DCCP_ACTION_FIN;
484}
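
dccp_new_state[] packs the successor state and a send-CLOSE/CLOSEREQ flag into one byte, and dccp_close_state() splits them back apart with DCCP_STATE_MASK and DCCP_ACTION_FIN. A hedged sketch of that decode; the 0xf mask and 1<<7 flag mirror dccp.h of this era but should be treated as assumptions:

#include <stdio.h>

#define STATE_MASK 0x0f      /* low bits: successor state (assumption) */
#define ACTION_FIN (1 << 7)  /* high bit: emit CLOSE/CLOSEREQ (assumption) */

int main(void)
{
	/* shape of the DCCP_OPEN entry: a CLOSING state plus the flag */
	unsigned char entry = 6 /* hypothetical CLOSING value */ | ACTION_FIN;

	printf("next state %d, send a close packet: %s\n",
	       entry & STATE_MASK, (entry & ACTION_FIN) ? "yes" : "no");
	return 0;
}
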
485
486void dccp_close(struct sock *sk, long timeout)
487{
488 struct sk_buff *skb;
489
490 lock_sock(sk);
491
492 sk->sk_shutdown = SHUTDOWN_MASK;
493
494 if (sk->sk_state == DCCP_LISTEN) {
495 dccp_set_state(sk, DCCP_CLOSED);
496
497 /* Special case. */
498 inet_csk_listen_stop(sk);
499
500 goto adjudge_to_death;
501 }
502
503 /*
504 * We need to flush the recv. buffs. We do this only on the
505 * descriptor close, not protocol-sourced closes, because the
506	 * reader process may not have drained the data yet!
507 */
508 /* FIXME: check for unread data */
509 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
510 __kfree_skb(skb);
511 }
512
513 if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
514 /* Check zero linger _after_ checking for unread data. */
515 sk->sk_prot->disconnect(sk, 0);
516 } else if (dccp_close_state(sk)) {
517 dccp_send_close(sk, 1);
518 }
519
520 sk_stream_wait_close(sk, timeout);
521
522adjudge_to_death:
523 /*
524 * It is the last release_sock in its life. It will remove backlog.
525 */
526 release_sock(sk);
527 /*
528 * Now socket is owned by kernel and we acquire BH lock
529 * to finish close. No need to check for user refs.
530 */
531 local_bh_disable();
532 bh_lock_sock(sk);
533 BUG_TRAP(!sock_owned_by_user(sk));
534
535 sock_hold(sk);
536 sock_orphan(sk);
537
538 /*
539 * The last release_sock may have processed the CLOSE or RESET
540 * packet moving sock to CLOSED state, if not we have to fire
541 * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
542 * in draft-ietf-dccp-spec-11. -acme
543 */
544 if (sk->sk_state == DCCP_CLOSING) {
545 /* FIXME: should start at 2 * RTT */
546 /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
547 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
548 inet_csk(sk)->icsk_rto,
549 DCCP_RTO_MAX);
550#if 0
551 /* Yeah, we should use sk->sk_prot->orphan_count, etc */
552 dccp_set_state(sk, DCCP_CLOSED);
553#endif
554 }
555
556 atomic_inc(sk->sk_prot->orphan_count);
557 if (sk->sk_state == DCCP_CLOSED)
558 inet_csk_destroy_sock(sk);
559
560 /* Otherwise, socket is reprieved until protocol close. */
561
562 bh_unlock_sock(sk);
563 local_bh_enable();
564 sock_put(sk);
565}
566
567void dccp_shutdown(struct sock *sk, int how)
568{
569 dccp_pr_debug("entry\n");
570}
571
572static struct proto_ops inet_dccp_ops = {
573 .family = PF_INET,
574 .owner = THIS_MODULE,
575 .release = inet_release,
576 .bind = inet_bind,
577 .connect = inet_stream_connect,
578 .socketpair = sock_no_socketpair,
579 .accept = inet_accept,
580 .getname = inet_getname,
581 /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
582 .poll = dccp_poll,
583 .ioctl = inet_ioctl,
584 /* FIXME: work on inet_listen to rename it to sock_common_listen */
585 .listen = inet_dccp_listen,
586 .shutdown = inet_shutdown,
587 .setsockopt = sock_common_setsockopt,
588 .getsockopt = sock_common_getsockopt,
589 .sendmsg = inet_sendmsg,
590 .recvmsg = sock_common_recvmsg,
591 .mmap = sock_no_mmap,
592 .sendpage = sock_no_sendpage,
593};
594
595extern struct net_proto_family inet_family_ops;
596
597static struct inet_protosw dccp_v4_protosw = {
598 .type = SOCK_DCCP,
599 .protocol = IPPROTO_DCCP,
600 .prot = &dccp_v4_prot,
601 .ops = &inet_dccp_ops,
602 .capability = -1,
603 .no_check = 0,
604 .flags = 0,
605};
606
607/*
608 * This is the global socket data structure used for responding to
609 * Out-of-the-blue (OOTB) packets. A control sock will be created
610 * for this socket at initialization time.
611 */
612struct socket *dccp_ctl_socket;
613
614static char dccp_ctl_socket_err_msg[] __initdata =
615 KERN_ERR "DCCP: Failed to create the control socket.\n";
616
617static int __init dccp_ctl_sock_init(void)
618{
619 int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
620 &dccp_ctl_socket);
621 if (rc < 0)
622 printk(dccp_ctl_socket_err_msg);
623 else {
624 dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
625 inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;
626
627 /* Unhash it so that IP input processing does not even
628 * see it, we do not wish this socket to see incoming
629 * packets.
630 */
631 dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
632 }
633
634 return rc;
635}
636
637#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
638void dccp_ctl_sock_exit(void)
639{
640 if (dccp_ctl_socket != NULL) {
641 sock_release(dccp_ctl_socket);
642 dccp_ctl_socket = NULL;
643 }
644}
645
646EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
647#endif
648
649static int __init init_dccp_v4_mibs(void)
650{
651 int rc = -ENOMEM;
652
653 dccp_statistics[0] = alloc_percpu(struct dccp_mib);
654 if (dccp_statistics[0] == NULL)
655 goto out;
656
657 dccp_statistics[1] = alloc_percpu(struct dccp_mib);
658 if (dccp_statistics[1] == NULL)
659 goto out_free_one;
660
661 rc = 0;
662out:
663 return rc;
664out_free_one:
665 free_percpu(dccp_statistics[0]);
666 dccp_statistics[0] = NULL;
667 goto out;
668
669}
670
671static int thash_entries;
672module_param(thash_entries, int, 0444);
673MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
674
675#ifdef CONFIG_IP_DCCP_DEBUG
676int dccp_debug;
677module_param(dccp_debug, int, 0444);
678MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
679#endif
680
681static int __init dccp_init(void)
682{
683 unsigned long goal;
684 int ehash_order, bhash_order, i;
685 int rc = proto_register(&dccp_v4_prot, 1);
686
687 if (rc)
688 goto out;
689
690 dccp_hashinfo.bind_bucket_cachep =
691 kmem_cache_create("dccp_bind_bucket",
692 sizeof(struct inet_bind_bucket), 0,
693 SLAB_HWCACHE_ALIGN, NULL, NULL);
694 if (!dccp_hashinfo.bind_bucket_cachep)
695 goto out_proto_unregister;
696
697 /*
698 * Size and allocate the main established and bind bucket
699 * hash tables.
700 *
701 * The methodology is similar to that of the buffer cache.
702 */
703 if (num_physpages >= (128 * 1024))
704 goal = num_physpages >> (21 - PAGE_SHIFT);
705 else
706 goal = num_physpages >> (23 - PAGE_SHIFT);
707
708 if (thash_entries)
709 goal = (thash_entries *
710 sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
711 for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
712 ;
713 do {
714 dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
715 sizeof(struct inet_ehash_bucket);
716 dccp_hashinfo.ehash_size >>= 1;
717 while (dccp_hashinfo.ehash_size &
718 (dccp_hashinfo.ehash_size - 1))
719 dccp_hashinfo.ehash_size--;
720 dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
721 __get_free_pages(GFP_ATOMIC, ehash_order);
722 } while (!dccp_hashinfo.ehash && --ehash_order > 0);
723
724 if (!dccp_hashinfo.ehash) {
725 printk(KERN_CRIT "Failed to allocate DCCP "
726 "established hash table\n");
727 goto out_free_bind_bucket_cachep;
728 }
729
730 for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
731 rwlock_init(&dccp_hashinfo.ehash[i].lock);
732 INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
733 }
734
735 bhash_order = ehash_order;
736
737 do {
738 dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
739 sizeof(struct inet_bind_hashbucket);
740 if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
741 bhash_order > 0)
742 continue;
743 dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
744 __get_free_pages(GFP_ATOMIC, bhash_order);
745 } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
746
747 if (!dccp_hashinfo.bhash) {
748 printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
749 goto out_free_dccp_ehash;
750 }
751
752 for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
753 spin_lock_init(&dccp_hashinfo.bhash[i].lock);
754 INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
755 }
756
757 if (init_dccp_v4_mibs())
758 goto out_free_dccp_bhash;
759
760 rc = -EAGAIN;
761 if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
762 goto out_free_dccp_v4_mibs;
763
764 inet_register_protosw(&dccp_v4_protosw);
765
766 rc = dccp_ctl_sock_init();
767 if (rc)
768 goto out_unregister_protosw;
769out:
770 return rc;
771out_unregister_protosw:
772 inet_unregister_protosw(&dccp_v4_protosw);
773 inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
774out_free_dccp_v4_mibs:
775 free_percpu(dccp_statistics[0]);
776 free_percpu(dccp_statistics[1]);
777 dccp_statistics[0] = dccp_statistics[1] = NULL;
778out_free_dccp_bhash:
779 free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
780 dccp_hashinfo.bhash = NULL;
781out_free_dccp_ehash:
782 free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
783 dccp_hashinfo.ehash = NULL;
784out_free_bind_bucket_cachep:
785 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
786 dccp_hashinfo.bind_bucket_cachep = NULL;
787out_proto_unregister:
788 proto_unregister(&dccp_v4_prot);
789 goto out;
790}
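
The sizing loop above picks the smallest page order whose allocation covers the goal, then shrinks the bucket count to a power of two so index masking works. A hedged standalone sketch of that logic; the page and bucket sizes are illustrative, and it mirrors the ehash branch only loosely (the halving and allocation-retry steps are omitted):

#include <stdio.h>

int main(void)
{
	unsigned long goal = 300;        /* pages wanted (illustrative) */
	unsigned long page_size = 4096;  /* assumption */
	unsigned long bucket_size = 32;  /* assumption */
	unsigned long size;
	int order;

	/* smallest order with 2^order pages >= goal */
	for (order = 0; (1UL << order) < goal; order++)
		;
	size = (1UL << order) * page_size / bucket_size;
	/* round the bucket count down to a power of two, as the kernel loop does */
	while (size & (size - 1))
		size--;
	printf("order %d -> %lu buckets\n", order, size);
	return 0;
}
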
791
792static const char dccp_del_proto_err_msg[] __exitdata =
793 KERN_ERR "can't remove dccp net_protocol\n";
794
795static void __exit dccp_fini(void)
796{
797 inet_unregister_protosw(&dccp_v4_protosw);
798
799 if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
800 printk(dccp_del_proto_err_msg);
801
802 free_percpu(dccp_statistics[0]);
803 free_percpu(dccp_statistics[1]);
804 free_pages((unsigned long)dccp_hashinfo.bhash,
805 get_order(dccp_hashinfo.bhash_size *
806 sizeof(struct inet_bind_hashbucket)));
807 free_pages((unsigned long)dccp_hashinfo.ehash,
808 get_order(dccp_hashinfo.ehash_size *
809 sizeof(struct inet_ehash_bucket)));
810 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
811 proto_unregister(&dccp_v4_prot);
812}
813
814module_init(dccp_init);
815module_exit(dccp_fini);
816
817/*
818 * __stringify doesn't like enums, so use the SOCK_DCCP (6) and IPPROTO_DCCP (33)
819 * values directly. Also cover the case where the protocol is not specified,
820 * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
821 */
822MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
823MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
824MODULE_LICENSE("GPL");
825MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
826MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
new file mode 100644
index 000000000000..aa34b576e228
--- /dev/null
+++ b/net/dccp/timer.c
@@ -0,0 +1,255 @@
1/*
2 * net/dccp/timer.c
3 *
4 * An implementation of the DCCP protocol
5 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/config.h>
14#include <linux/dccp.h>
15#include <linux/skbuff.h>
16
17#include "dccp.h"
18
19static void dccp_write_timer(unsigned long data);
20static void dccp_keepalive_timer(unsigned long data);
21static void dccp_delack_timer(unsigned long data);
22
23void dccp_init_xmit_timers(struct sock *sk)
24{
25 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
26 &dccp_keepalive_timer);
27}
28
29static void dccp_write_err(struct sock *sk)
30{
31 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
32 sk->sk_error_report(sk);
33
34 dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED);
35 dccp_done(sk);
36 DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
37}
38
39/* A write timeout has occurred. Process the after effects. */
40static int dccp_write_timeout(struct sock *sk)
41{
42 const struct inet_connection_sock *icsk = inet_csk(sk);
43 int retry_until;
44
45 if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
46 if (icsk->icsk_retransmits != 0)
47 dst_negative_advice(&sk->sk_dst_cache);
48 retry_until = icsk->icsk_syn_retries ? :
49 /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */;
50 } else {
51 if (icsk->icsk_retransmits >=
52 /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) {
53 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
54 black hole detection. :-(
55
56	   This is the place to make it. It is not made. I do not want
57	   to make it. It is disgusting. It does not work in any
58	   case. Let me cite the same draft, which requires
59 us to implement this:
60
61 "The one security concern raised by this memo is that ICMP black holes
62 are often caused by over-zealous security administrators who block
63 all ICMP messages. It is vitally important that those who design and
64 deploy security systems understand the impact of strict filtering on
65 upper-layer protocols. The safest web site in the world is worthless
66 if most TCP implementations cannot transfer data from it. It would
67 be far nicer to have all of the black holes fixed rather than fixing
68 all of the TCP implementations."
69
70 Golden words :-).
71 */
72
73 dst_negative_advice(&sk->sk_dst_cache);
74 }
75
76 retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */;
77 /*
78		 * FIXME: see tcp_write_timeout and tcp_out_of_resources
79 */
80 }
81
82 if (icsk->icsk_retransmits >= retry_until) {
83 /* Has it gone just too far? */
84 dccp_write_err(sk);
85 return 1;
86 }
87 return 0;
88}
89
90/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
91static void dccp_delack_timer(unsigned long data)
92{
93 struct sock *sk = (struct sock *)data;
94 struct inet_connection_sock *icsk = inet_csk(sk);
95
96 bh_lock_sock(sk);
97 if (sock_owned_by_user(sk)) {
98 /* Try again later. */
99 icsk->icsk_ack.blocked = 1;
100 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
101 sk_reset_timer(sk, &icsk->icsk_delack_timer,
102 jiffies + TCP_DELACK_MIN);
103 goto out;
104 }
105
106 if (sk->sk_state == DCCP_CLOSED ||
107 !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
108 goto out;
109 if (time_after(icsk->icsk_ack.timeout, jiffies)) {
110 sk_reset_timer(sk, &icsk->icsk_delack_timer,
111 icsk->icsk_ack.timeout);
112 goto out;
113 }
114
115 icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
116
117 if (inet_csk_ack_scheduled(sk)) {
118 if (!icsk->icsk_ack.pingpong) {
119 /* Delayed ACK missed: inflate ATO. */
120 icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
121 icsk->icsk_rto);
122 } else {
123 /* Delayed ACK missed: leave pingpong mode and
124 * deflate ATO.
125 */
126 icsk->icsk_ack.pingpong = 0;
127 icsk->icsk_ack.ato = TCP_ATO_MIN;
128 }
129 dccp_send_ack(sk);
130 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
131 }
132out:
133 bh_unlock_sock(sk);
134 sock_put(sk);
135}
136
137/*
138 * The DCCP retransmit timer.
139 */
140static void dccp_retransmit_timer(struct sock *sk)
141{
142 struct inet_connection_sock *icsk = inet_csk(sk);
143
144 /*
145 * sk->sk_send_head has to have one skb with
146 * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
147 * packet types (REQUEST, RESPONSE, the ACK in the 3way handshake
148 * (PARTOPEN timer), etc).
149 */
150 BUG_TRAP(sk->sk_send_head != NULL);
151
152 /*
153	 * More than 4MSL (8 minutes) has passed; a RESET(aborted) was
154 * sent, no need to retransmit, this sock is dead.
155 */
156 if (dccp_write_timeout(sk))
157 goto out;
158
159 /*
160 * We want to know the number of packets retransmitted, not the
161 * total number of retransmissions of clones of original packets.
162 */
163 if (icsk->icsk_retransmits == 0)
164 DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
165
166 if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) {
167 /*
168 * Retransmission failed because of local congestion,
169 * do not backoff.
170 */
171 if (icsk->icsk_retransmits == 0)
172 icsk->icsk_retransmits = 1;
173 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
174 min(icsk->icsk_rto,
175 TCP_RESOURCE_PROBE_INTERVAL),
176 DCCP_RTO_MAX);
177 goto out;
178 }
179
180 icsk->icsk_backoff++;
181 icsk->icsk_retransmits++;
182
183 icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
184 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
185 DCCP_RTO_MAX);
186 if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */)
187 __sk_dst_reset(sk);
188out:;
189}
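
Each pass through dccp_retransmit_timer() doubles icsk_rto and caps it at DCCP_RTO_MAX, the standard exponential backoff. A hedged sketch of the resulting retry schedule, with an illustrative 1-second initial RTO and a 64-second cap standing in for the kernel's actual constants:

#include <stdio.h>

int main(void)
{
	unsigned int rto = 1;       /* illustrative initial RTO, seconds */
	unsigned int rto_max = 64;  /* stand-in for DCCP_RTO_MAX */
	unsigned int retries;

	for (retries = 1; retries <= 8; retries++) {
		/* rto = min(rto << 1, rto_max), as in the timer above */
		rto = rto * 2 > rto_max ? rto_max : rto * 2;
		printf("retry %u: next timeout in %us\n", retries, rto);
	}
	return 0;
}
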
190
191static void dccp_write_timer(unsigned long data)
192{
193 struct sock *sk = (struct sock *)data;
194 struct inet_connection_sock *icsk = inet_csk(sk);
195 int event = 0;
196
197 bh_lock_sock(sk);
198 if (sock_owned_by_user(sk)) {
199 /* Try again later */
200 sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
201 jiffies + (HZ / 20));
202 goto out;
203 }
204
205 if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
206 goto out;
207
208 if (time_after(icsk->icsk_timeout, jiffies)) {
209 sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
210 icsk->icsk_timeout);
211 goto out;
212 }
213
214 event = icsk->icsk_pending;
215 icsk->icsk_pending = 0;
216
217 switch (event) {
218 case ICSK_TIME_RETRANS:
219 dccp_retransmit_timer(sk);
220 break;
221 }
222out:
223 bh_unlock_sock(sk);
224 sock_put(sk);
225}
226
227/*
228 * Timer for listening sockets
229 */
230static void dccp_response_timer(struct sock *sk)
231{
232 inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
233 DCCP_RTO_MAX);
234}
235
236static void dccp_keepalive_timer(unsigned long data)
237{
238 struct sock *sk = (struct sock *)data;
239
240 /* Only process if socket is not in use. */
241 bh_lock_sock(sk);
242 if (sock_owned_by_user(sk)) {
243 /* Try again later. */
244 inet_csk_reset_keepalive_timer(sk, HZ / 20);
245 goto out;
246 }
247
248 if (sk->sk_state == DCCP_LISTEN) {
249 dccp_response_timer(sk);
250 goto out;
251 }
252out:
253 bh_unlock_sock(sk);
254 sock_put(sk);
255}
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index 2101da542ba8..92f2ec46fd22 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -1,6 +1,29 @@
1# 1#
2# DECnet configuration 2# DECnet configuration
3# 3#
4config DECNET
5 tristate "DECnet Support"
6 ---help---
7 The DECnet networking protocol was used in many products made by
8 Digital (now Compaq). It provides reliable stream and sequenced
9 packet communications over which run a variety of services similar
10 to those which run over TCP/IP.
11
12 To find some tools to use with the kernel layer support, please
13 look at Patrick Caulfield's web site:
14 <http://linux-decnet.sourceforge.net/>.
15
16 More detailed documentation is available in
17 <file:Documentation/networking/decnet.txt>.
18
19 Be sure to say Y to "/proc file system support" and "Sysctl support"
20 below when using DECnet, since you will need sysctl support to aid
21 in configuration at run time.
22
23 The DECnet code is also available as a module ( = code which can be
24 inserted in and removed from the running kernel whenever you want).
25 The module is called decnet.
26
4config DECNET_ROUTER 27config DECNET_ROUTER
5 bool "DECnet: router support (EXPERIMENTAL)" 28 bool "DECnet: router support (EXPERIMENTAL)"
6 depends on DECNET && EXPERIMENTAL 29 depends on DECNET && EXPERIMENTAL
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 96a02800cd28..348f36b529f7 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -118,7 +118,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
118#include <linux/netfilter.h> 118#include <linux/netfilter.h>
119#include <linux/seq_file.h> 119#include <linux/seq_file.h>
120#include <net/sock.h> 120#include <net/sock.h>
121#include <net/tcp.h> 121#include <net/tcp_states.h>
122#include <net/flow.h> 122#include <net/flow.h>
123#include <asm/system.h> 123#include <asm/system.h>
124#include <asm/ioctls.h> 124#include <asm/ioctls.h>
@@ -1763,7 +1763,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
1763 nskb = skb->next; 1763 nskb = skb->next;
1764 1764
1765 if (skb->len == 0) { 1765 if (skb->len == 0) {
1766 skb_unlink(skb); 1766 skb_unlink(skb, queue);
1767 kfree_skb(skb); 1767 kfree_skb(skb);
1768 /* 1768 /*
1769 * N.B. Don't refer to skb or cb after this point 1769 * N.B. Don't refer to skb or cb after this point
@@ -1876,17 +1876,27 @@ static inline unsigned int dn_current_mss(struct sock *sk, int flags)
1876 return mss_now; 1876 return mss_now;
1877} 1877}
1878 1878
1879static int dn_error(struct sock *sk, int flags, int err) 1879/*
1880 * N.B. We get the timeout wrong here, but then we always did get it
1881 * wrong before and this is another step along the road to correcting
1882 * it. It ought to get updated each time we pass through the routine,
1883 * but in practise it probably doesn't matter too much for now.
1884 */
1885static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk,
1886 unsigned long datalen, int noblock,
1887 int *errcode)
1880{ 1888{
1881 if (err == -EPIPE) 1889 struct sk_buff *skb = sock_alloc_send_skb(sk, datalen,
1882 err = sock_error(sk) ? : -EPIPE; 1890 noblock, errcode);
1883 if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) 1891 if (skb) {
1884 send_sig(SIGPIPE, current, 0); 1892 skb->protocol = __constant_htons(ETH_P_DNA_RT);
1885 return err; 1893 skb->pkt_type = PACKET_OUTGOING;
1894 }
1895 return skb;
1886} 1896}
1887 1897
1888static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, 1898static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
1889 struct msghdr *msg, size_t size) 1899 struct msghdr *msg, size_t size)
1890{ 1900{
1891 struct sock *sk = sock->sk; 1901 struct sock *sk = sock->sk;
1892 struct dn_scp *scp = DN_SK(sk); 1902 struct dn_scp *scp = DN_SK(sk);
@@ -1901,7 +1911,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
1901 struct dn_skb_cb *cb; 1911 struct dn_skb_cb *cb;
1902 size_t len; 1912 size_t len;
1903 unsigned char fctype; 1913 unsigned char fctype;
1904 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1914 long timeo;
1905 1915
1906 if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT)) 1916 if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT))
1907 return -EOPNOTSUPP; 1917 return -EOPNOTSUPP;
@@ -1909,18 +1919,21 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
1909 if (addr_len && (addr_len != sizeof(struct sockaddr_dn))) 1919 if (addr_len && (addr_len != sizeof(struct sockaddr_dn)))
1910 return -EINVAL; 1920 return -EINVAL;
1911 1921
1922 lock_sock(sk);
1923 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1912 /* 1924 /*
1913 * The only difference between stream sockets and sequenced packet 1925 * The only difference between stream sockets and sequenced packet
1914 * sockets is that the stream sockets always behave as if MSG_EOR 1926 * sockets is that the stream sockets always behave as if MSG_EOR
1915 * has been set. 1927 * has been set.
1916 */ 1928 */
1917 if (sock->type == SOCK_STREAM) { 1929 if (sock->type == SOCK_STREAM) {
1918 if (flags & MSG_EOR) 1930 if (flags & MSG_EOR) {
1919 return -EINVAL; 1931 err = -EINVAL;
1932 goto out;
1933 }
1920 flags |= MSG_EOR; 1934 flags |= MSG_EOR;
1921 } 1935 }
1922 1936
1923 lock_sock(sk);
1924 1937
1925 err = dn_check_state(sk, addr, addr_len, &timeo, flags); 1938 err = dn_check_state(sk, addr, addr_len, &timeo, flags);
1926 if (err) 1939 if (err)
@@ -1989,8 +2002,12 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
1989 2002
1990 /* 2003 /*
1991 * Get a suitably sized skb. 2004 * Get a suitably sized skb.
2005 * 64 is a bit of a hack really, but its larger than any
2006 * link-layer headers and has served us well as a good
2007 * guess as to their real length.
1992 */ 2008 */
1993 skb = dn_alloc_send_skb(sk, &len, flags & MSG_DONTWAIT, timeo, &err); 2009 skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER,
2010 flags & MSG_DONTWAIT, &err);
1994 2011
1995 if (err) 2012 if (err)
1996 break; 2013 break;
@@ -2000,7 +2017,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
2000 2017
2001 cb = DN_SKB_CB(skb); 2018 cb = DN_SKB_CB(skb);
2002 2019
2003 skb_reserve(skb, DN_MAX_NSP_DATA_HEADER); 2020 skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER);
2004 2021
2005 if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { 2022 if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
2006 err = -EFAULT; 2023 err = -EFAULT;
@@ -2045,7 +2062,7 @@ out:
2045 return sent ? sent : err; 2062 return sent ? sent : err;
2046 2063
2047out_err: 2064out_err:
2048 err = dn_error(sk, flags, err); 2065 err = sk_stream_error(sk, flags, err);
2049 release_sock(sk); 2066 release_sock(sk);
2050 return err; 2067 return err;
2051} 2068}
@@ -2073,7 +2090,7 @@ static struct notifier_block dn_dev_notifier = {
2073 .notifier_call = dn_device_event, 2090 .notifier_call = dn_device_event,
2074}; 2091};
2075 2092
2076extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *); 2093extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
2077 2094
2078static struct packet_type dn_dix_packet_type = { 2095static struct packet_type dn_dix_packet_type = {
2079 .type = __constant_htons(ETH_P_DNA_RT), 2096 .type = __constant_htons(ETH_P_DNA_RT),
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 00233ecbc9cb..5610bb16dbf9 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -752,16 +752,16 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa)
752 752
753 skb = alloc_skb(size, GFP_KERNEL); 753 skb = alloc_skb(size, GFP_KERNEL);
754 if (!skb) { 754 if (!skb) {
755 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS); 755 netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, ENOBUFS);
756 return; 756 return;
757 } 757 }
758 if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) { 758 if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
759 kfree_skb(skb); 759 kfree_skb(skb);
760 netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL); 760 netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, EINVAL);
761 return; 761 return;
762 } 762 }
763 NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_IFADDR; 763 NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_IFADDR;
764 netlink_broadcast(rtnl, skb, 0, RTMGRP_DECnet_IFADDR, GFP_KERNEL); 764 netlink_broadcast(rtnl, skb, 0, RTNLGRP_DECnet_IFADDR, GFP_KERNEL);
765} 765}
766 766
767static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) 767static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index f32dba9e26fe..8d0cc3cf3e49 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -148,12 +148,12 @@ static int dn_neigh_construct(struct neighbour *neigh)
148 148
149 __neigh_parms_put(neigh->parms); 149 __neigh_parms_put(neigh->parms);
150 neigh->parms = neigh_parms_clone(parms); 150 neigh->parms = neigh_parms_clone(parms);
151 rcu_read_unlock();
152 151
153 if (dn_db->use_long) 152 if (dn_db->use_long)
154 neigh->ops = &dn_long_ops; 153 neigh->ops = &dn_long_ops;
155 else 154 else
156 neigh->ops = &dn_short_ops; 155 neigh->ops = &dn_short_ops;
156 rcu_read_unlock();
157 157
158 if (dn->flags & DN_NDFLAG_P3) 158 if (dn->flags & DN_NDFLAG_P3)
159 neigh->ops = &dn_phase3_ops; 159 neigh->ops = &dn_phase3_ops;
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 202dbde9850d..369f25b60f3f 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -60,7 +60,7 @@
60#include <linux/inet.h> 60#include <linux/inet.h>
61#include <linux/route.h> 61#include <linux/route.h>
62#include <net/sock.h> 62#include <net/sock.h>
63#include <net/tcp.h> 63#include <net/tcp_states.h>
64#include <asm/system.h> 64#include <asm/system.h>
65#include <linux/fcntl.h> 65#include <linux/fcntl.h>
66#include <linux/mm.h> 66#include <linux/mm.h>
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 8cce1fdbda90..53633d352868 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -137,69 +137,6 @@ struct sk_buff *dn_alloc_skb(struct sock *sk, int size, int pri)
 }
 
 /*
- * Wrapper for the above, for allocs of data skbs. We try and get the
- * whole size thats been asked for (plus 11 bytes of header). If this
- * fails, then we try for any size over 16 bytes for SOCK_STREAMS.
- */
-struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int noblock, long timeo, int *err)
-{
-	int space;
-	int len;
-	struct sk_buff *skb = NULL;
-
-	*err = 0;
-
-	while(skb == NULL) {
-		if (signal_pending(current)) {
-			*err = sock_intr_errno(timeo);
-			break;
-		}
-
-		if (sk->sk_shutdown & SEND_SHUTDOWN) {
-			*err = EINVAL;
-			break;
-		}
-
-		if (sk->sk_err)
-			break;
-
-		len = *size + 11;
-		space = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
-
-		if (space < len) {
-			if ((sk->sk_socket->type == SOCK_STREAM) &&
-			    (space >= (16 + 11)))
-				len = space;
-		}
-
-		if (space < len) {
-			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
-			if (noblock) {
-				*err = EWOULDBLOCK;
-				break;
-			}
-
-			clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-			SOCK_SLEEP_PRE(sk)
-
-			if ((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) <
-			    len)
-				schedule();
-
-			SOCK_SLEEP_POST(sk)
-			continue;
-		}
-
-		if ((skb = dn_alloc_skb(sk, len, sk->sk_allocation)) == NULL)
-			continue;
-
-		*size = len - 11;
-	}
-
-	return skb;
-}
-
-/*
  * Calculate persist timer based upon the smoothed round
  * trip time and the variance. Backoff according to the
  * nsp_backoff[] array.
@@ -479,7 +416,7 @@ int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff
 		xmit_count = cb2->xmit_count;
 		segnum = cb2->segnum;
 		/* Remove and drop ack'ed packet */
-		skb_unlink(ack);
+		skb_unlink(ack, q);
 		kfree_skb(ack);
 		ack = NULL;
 
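
Note: skb_unlink() now takes the queue to unlink from as an explicit second argument (here q, the queue being walked), rather than inferring it from the skb itself, e.g.:

	/* Remove and drop the acked packet from the queue it sits on. */
	skb_unlink(ack, q);
	kfree_skb(ack);
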
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 2399fa8a3f86..2c915f305be3 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -572,7 +572,7 @@ static int dn_route_ptp_hello(struct sk_buff *skb)
 		return NET_RX_SUCCESS;
 }
 
-int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct dn_skb_cb *cb;
 	unsigned char flags = 0;
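
Note: receive handlers registered through struct packet_type gain a fourth parameter, the original ingress device. A hypothetical handler under the new signature (my_proto_rcv and my_packet_type are illustrative names):

	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
	                        struct packet_type *pt,
	                        struct net_device *orig_dev)
	{
		/* dev can differ from orig_dev when the frame came in
		 * through a software device such as a bond or VLAN;
		 * orig_dev is the device the frame actually arrived on. */
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

	static struct packet_type my_packet_type = {
		.type = __constant_htons(ETH_P_DNA_RT),
		.func = my_proto_rcv,
	};

It is registered as before with dev_add_pack(&my_packet_type).
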
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 28ba5777a25a..eeba56f99323 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -79,7 +79,7 @@ for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_n
 static DEFINE_RWLOCK(dn_fib_tables_lock);
 struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1];
 
-static kmem_cache_t *dn_hash_kmem;
+static kmem_cache_t *dn_hash_kmem __read_mostly;
 static int dn_fib_hash_zombies;
 
 static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
@@ -349,10 +349,10 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id,
 		kfree_skb(skb);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_ROUTE;
+	NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_ROUTE;
 	if (nlh->nlmsg_flags & NLM_F_ECHO)
 		atomic_inc(&skb->users);
-	netlink_broadcast(rtnl, skb, pid, RTMGRP_DECnet_ROUTE, GFP_KERNEL);
+	netlink_broadcast(rtnl, skb, pid, RTNLGRP_DECnet_ROUTE, GFP_KERNEL);
 	if (nlh->nlmsg_flags & NLM_F_ECHO)
 		netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
 }
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 284a9998e53d..1ab94c6e22ed 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -19,6 +19,7 @@
 #include <linux/netfilter.h>
 #include <linux/spinlock.h>
 #include <linux/netlink.h>
+#include <linux/netfilter_decnet.h>
 
 #include <net/sock.h>
 #include <net/flow.h>
@@ -71,10 +72,10 @@ static void dnrmg_send_peer(struct sk_buff *skb)
 
 	switch(flags & DN_RT_CNTL_MSK) {
 	case DN_RT_PKT_L1RT:
-		group = DNRMG_L1_GROUP;
+		group = DNRNG_NLGRP_L1;
 		break;
 	case DN_RT_PKT_L2RT:
-		group = DNRMG_L2_GROUP;
+		group = DNRNG_NLGRP_L2;
 		break;
 	default:
 		return;
@@ -83,7 +84,7 @@ static void dnrmg_send_peer(struct sk_buff *skb)
 	skb2 = dnrmg_build_message(skb, &status);
 	if (skb2 == NULL)
 		return;
-	NETLINK_CB(skb2).dst_groups = group;
+	NETLINK_CB(skb2).dst_group = group;
 	netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC);
 }
 
@@ -138,7 +139,8 @@ static int __init init(void)
 {
 	int rv = 0;
 
-	dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk);
+	dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
+				      dnrmg_receive_user_sk, THIS_MODULE);
 	if (dnrmg == NULL) {
 		printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
 		return -ENOMEM;
@@ -162,6 +164,7 @@ static void __exit fini(void)
 MODULE_DESCRIPTION("DECnet Routing Message Grabulator");
 MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
 
 module_init(init);
 module_exit(fini);
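
Note: as the hunk above shows, netlink_kernel_create() now takes the number of multicast groups the socket uses and the owning module, and the new MODULE_ALIAS_NET_PF_PROTO() line lets this module be auto-loaded when userspace opens a PF_NETLINK/NETLINK_DNRTMSG socket. A sketch of the updated call, names as in the hunk:

	dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
				      dnrmg_receive_user_sk, THIS_MODULE);
	if (dnrmg == NULL)
		return -ENOMEM;
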
diff --git a/net/econet/Kconfig b/net/econet/Kconfig
new file mode 100644
index 000000000000..39a2d2975e0e
--- /dev/null
+++ b/net/econet/Kconfig
@@ -0,0 +1,36 @@
1#
2# Acorn Econet/AUN protocols
3#
4
5config ECONET
6 tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
7 depends on EXPERIMENTAL && INET
8 ---help---
9 Econet is a fairly old and slow networking protocol mainly used by
10 Acorn computers to access file and print servers. It uses native
11 Econet network cards. AUN is an implementation of the higher level
12 parts of Econet that runs over ordinary Ethernet connections, on
13 top of the UDP packet protocol, which in turn runs on top of the
14 Internet protocol IP.
15
16 If you say Y here, you can choose with the next two options whether
17 to send Econet/AUN traffic over a UDP Ethernet connection or over
18 a native Econet network card.
19
20 To compile this driver as a module, choose M here: the module
21 will be called econet.
22
23config ECONET_AUNUDP
24 bool "AUN over UDP"
25 depends on ECONET
26 help
27 Say Y here if you want to send Econet/AUN traffic over a UDP
28 connection (UDP is a packet based protocol that runs on top of the
29 Internet protocol IP) using an ordinary Ethernet network card.
30
31config ECONET_NATIVE
32 bool "Native Econet"
33 depends on ECONET
34 help
35 Say Y here if you have a native Econet network card installed in
36 your computer.
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index de691e119e17..4a62093eb343 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -159,7 +159,7 @@ static int econet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
 	if (err)
 		goto out_free;
-	sk->sk_stamp = skb->stamp;
+	skb_get_timestamp(skb, &sk->sk_stamp);
 
 	if (msg->msg_name)
 		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
@@ -869,7 +869,7 @@ static void aun_tx_ack(unsigned long seq, int result)
 
 foundit:
 	tx_result(skb->sk, eb->cookie, result);
-	skb_unlink(skb);
+	skb_unlink(skb, &aun_queue);
 	spin_unlock_irqrestore(&aun_queue_lock, flags);
 	kfree_skb(skb);
 }
@@ -947,7 +947,7 @@ static void ab_cleanup(unsigned long h)
 		{
 			tx_result(skb->sk, eb->cookie,
 				  ECTYPE_TRANSMIT_NOT_PRESENT);
-			skb_unlink(skb);
+			skb_unlink(skb, &aun_queue);
 			kfree_skb(skb);
 		}
 		skb = newskb;
@@ -1009,7 +1009,7 @@ release:
  * Receive an Econet frame from a device.
  */
 
-static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct ec_framehdr *hdr;
 	struct sock *sk;
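
Note: two API migrations show up in this file: skb->stamp is no longer read directly, with skb_get_timestamp() copying the receive time out of the skb instead, and skb_unlink() names its queue explicitly (&aun_queue above). A sketch (the local stamp variable is illustrative):

	struct timeval stamp;

	skb_get_timestamp(skb, &stamp);	/* RX timestamp of this skb */
	skb_unlink(skb, &aun_queue);	/* detach from that one queue */
	kfree_skb(skb);
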
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index ab60ea63688e..87a052a9a84f 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,8 +62,6 @@
 #include <asm/system.h>
 #include <asm/checksum.h>
 
-extern int __init netdev_boot_setup(char *str);
-
 __setup("ether=", netdev_boot_setup);
 
 /*
@@ -155,7 +153,7 @@ int eth_rebuild_header(struct sk_buff *skb)
  * This is normal practice and works for any 'now in use' protocol.
  */
 
-unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
 	unsigned char *rawp;
@@ -163,7 +161,6 @@ unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	skb->mac.raw=skb->data;
 	skb_pull(skb,ETH_HLEN);
 	eth = eth_hdr(skb);
-	skb->input_dev = dev;
 
 	if(*eth->h_dest&1)
 	{
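
Note: eth_type_trans() now returns __be16, so the usual assignment of its network-byte-order result type-checks under sparse's endianness annotations, and the removed skb->input_dev store matches the new explicit orig_dev handler parameter seen earlier. A typical driver receive path reads:

	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
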
diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c
index b81a6d532342..66b39fc342d2 100644
--- a/net/ethernet/sysctl_net_ether.c
+++ b/net/ethernet/sysctl_net_ether.c
@@ -7,6 +7,7 @@
 
 #include <linux/mm.h>
 #include <linux/sysctl.h>
+#include <linux/if_ether.h>
 
 ctl_table ether_table[] = {
 	{0}
diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig
new file mode 100644
index 000000000000..58ed4319e693
--- /dev/null
+++ b/net/ieee80211/Kconfig
@@ -0,0 +1,69 @@
1config IEEE80211
2 tristate "Generic IEEE 802.11 Networking Stack"
3 select NET_RADIO
4 ---help---
5 This option enables the hardware independent IEEE 802.11
6 networking stack.
7
8config IEEE80211_DEBUG
9 bool "Enable full debugging output"
10 depends on IEEE80211
11 ---help---
12 This option will enable debug tracing output for the
13 ieee80211 network stack.
14
15 This will result in the kernel module being ~70k larger. You
16 can control which debug output is sent to the kernel log by
17 setting the value in
18
19 /proc/net/ieee80211/debug_level
20
21 For example:
22
 23 % echo 0x00000FF0 > /proc/net/ieee80211/debug_level
24
25 For a list of values you can assign to debug_level, you
26 can look at the bit mask values in <net/ieee80211.h>
27
28 If you are not trying to debug or develop the ieee80211
29 subsystem, you most likely want to say N here.
30
31config IEEE80211_CRYPT_WEP
32 tristate "IEEE 802.11 WEP encryption (802.1x)"
33 depends on IEEE80211
34 select CRYPTO
35 select CRYPTO_ARC4
36 select CRC32
37 ---help---
38 Include software based cipher suites in support of IEEE
39 802.11's WEP. This is needed for WEP as well as 802.1x.
40
 41 This can be compiled as a module and it will be called
42 "ieee80211_crypt_wep".
43
44config IEEE80211_CRYPT_CCMP
45 tristate "IEEE 802.11i CCMP support"
46 depends on IEEE80211
47 select CRYPTO
48 select CRYPTO_AES
49 ---help---
50 Include software based cipher suites in support of IEEE 802.11i
51 (aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with CCMP enabled
52 networks.
53
 54 This can be compiled as a module and it will be called
55 "ieee80211_crypt_ccmp".
56
57config IEEE80211_CRYPT_TKIP
58 tristate "IEEE 802.11i TKIP encryption"
59 depends on IEEE80211
60 select CRYPTO
61 select CRYPTO_MICHAEL_MIC
62 ---help---
63 Include software based cipher suites in support of IEEE 802.11i
64 (aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with TKIP enabled
65 networks.
66
 67 This can be compiled as a module and it will be called
68 "ieee80211_crypt_tkip".
69
diff --git a/net/ieee80211/Makefile b/net/ieee80211/Makefile
new file mode 100644
index 000000000000..a6ccac5baea8
--- /dev/null
+++ b/net/ieee80211/Makefile
@@ -0,0 +1,11 @@
1obj-$(CONFIG_IEEE80211) += ieee80211.o
2obj-$(CONFIG_IEEE80211) += ieee80211_crypt.o
3obj-$(CONFIG_IEEE80211_CRYPT_WEP) += ieee80211_crypt_wep.o
4obj-$(CONFIG_IEEE80211_CRYPT_CCMP) += ieee80211_crypt_ccmp.o
5obj-$(CONFIG_IEEE80211_CRYPT_TKIP) += ieee80211_crypt_tkip.o
6ieee80211-objs := \
7 ieee80211_module.o \
8 ieee80211_tx.o \
9 ieee80211_rx.o \
10 ieee80211_wx.o
11
diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c
new file mode 100644
index 000000000000..05a6f2f298db
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt.c
@@ -0,0 +1,259 @@
1/*
2 * Host AP crypto routines
3 *
4 * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
5 * Portions Copyright (C) 2004, Intel Corporation <jketreno@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. See README and COPYING for
10 * more details.
11 *
12 */
13
14#include <linux/config.h>
15#include <linux/version.h>
16#include <linux/module.h>
17#include <linux/init.h>
18#include <linux/slab.h>
19#include <asm/string.h>
20#include <asm/errno.h>
21
22#include <net/ieee80211.h>
23
24MODULE_AUTHOR("Jouni Malinen");
25MODULE_DESCRIPTION("HostAP crypto");
26MODULE_LICENSE("GPL");
27
28struct ieee80211_crypto_alg {
29 struct list_head list;
30 struct ieee80211_crypto_ops *ops;
31};
32
33
34struct ieee80211_crypto {
35 struct list_head algs;
36 spinlock_t lock;
37};
38
39static struct ieee80211_crypto *hcrypt;
40
41void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee,
42 int force)
43{
44 struct list_head *ptr, *n;
45 struct ieee80211_crypt_data *entry;
46
47 for (ptr = ieee->crypt_deinit_list.next, n = ptr->next;
48 ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) {
49 entry = list_entry(ptr, struct ieee80211_crypt_data, list);
50
51 if (atomic_read(&entry->refcnt) != 0 && !force)
52 continue;
53
54 list_del(ptr);
55
56 if (entry->ops) {
57 entry->ops->deinit(entry->priv);
58 module_put(entry->ops->owner);
59 }
60 kfree(entry);
61 }
62}
63
64void ieee80211_crypt_deinit_handler(unsigned long data)
65{
66 struct ieee80211_device *ieee = (struct ieee80211_device *)data;
67 unsigned long flags;
68
69 spin_lock_irqsave(&ieee->lock, flags);
70 ieee80211_crypt_deinit_entries(ieee, 0);
71 if (!list_empty(&ieee->crypt_deinit_list)) {
72 printk(KERN_DEBUG "%s: entries remaining in delayed crypt "
73 "deletion list\n", ieee->dev->name);
74 ieee->crypt_deinit_timer.expires = jiffies + HZ;
75 add_timer(&ieee->crypt_deinit_timer);
76 }
77 spin_unlock_irqrestore(&ieee->lock, flags);
78
79}
80
81void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee,
82 struct ieee80211_crypt_data **crypt)
83{
84 struct ieee80211_crypt_data *tmp;
85 unsigned long flags;
86
87 if (*crypt == NULL)
88 return;
89
90 tmp = *crypt;
91 *crypt = NULL;
92
93 /* must not run ops->deinit() while there may be pending encrypt or
94 * decrypt operations. Use a list of delayed deinits to avoid needing
95 * locking. */
96
97 spin_lock_irqsave(&ieee->lock, flags);
98 list_add(&tmp->list, &ieee->crypt_deinit_list);
99 if (!timer_pending(&ieee->crypt_deinit_timer)) {
100 ieee->crypt_deinit_timer.expires = jiffies + HZ;
101 add_timer(&ieee->crypt_deinit_timer);
102 }
103 spin_unlock_irqrestore(&ieee->lock, flags);
104}
105
106int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
107{
108 unsigned long flags;
109 struct ieee80211_crypto_alg *alg;
110
111 if (hcrypt == NULL)
112 return -1;
113
114 alg = kmalloc(sizeof(*alg), GFP_KERNEL);
115 if (alg == NULL)
116 return -ENOMEM;
117
118 memset(alg, 0, sizeof(*alg));
119 alg->ops = ops;
120
121 spin_lock_irqsave(&hcrypt->lock, flags);
122 list_add(&alg->list, &hcrypt->algs);
123 spin_unlock_irqrestore(&hcrypt->lock, flags);
124
125 printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n",
126 ops->name);
127
128 return 0;
129}
130
131int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops)
132{
133 unsigned long flags;
134 struct list_head *ptr;
135 struct ieee80211_crypto_alg *del_alg = NULL;
136
137 if (hcrypt == NULL)
138 return -1;
139
140 spin_lock_irqsave(&hcrypt->lock, flags);
141 for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
142 struct ieee80211_crypto_alg *alg =
143 (struct ieee80211_crypto_alg *) ptr;
144 if (alg->ops == ops) {
145 list_del(&alg->list);
146 del_alg = alg;
147 break;
148 }
149 }
150 spin_unlock_irqrestore(&hcrypt->lock, flags);
151
152 if (del_alg) {
153 printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
154 "'%s'\n", ops->name);
155 kfree(del_alg);
156 }
157
158 return del_alg ? 0 : -1;
159}
160
161
162struct ieee80211_crypto_ops * ieee80211_get_crypto_ops(const char *name)
163{
164 unsigned long flags;
165 struct list_head *ptr;
166 struct ieee80211_crypto_alg *found_alg = NULL;
167
168 if (hcrypt == NULL)
169 return NULL;
170
171 spin_lock_irqsave(&hcrypt->lock, flags);
172 for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
173 struct ieee80211_crypto_alg *alg =
174 (struct ieee80211_crypto_alg *) ptr;
175 if (strcmp(alg->ops->name, name) == 0) {
176 found_alg = alg;
177 break;
178 }
179 }
180 spin_unlock_irqrestore(&hcrypt->lock, flags);
181
182 if (found_alg)
183 return found_alg->ops;
184 else
185 return NULL;
186}
187
188
189static void * ieee80211_crypt_null_init(int keyidx) { return (void *) 1; }
190static void ieee80211_crypt_null_deinit(void *priv) {}
191
192static struct ieee80211_crypto_ops ieee80211_crypt_null = {
193 .name = "NULL",
194 .init = ieee80211_crypt_null_init,
195 .deinit = ieee80211_crypt_null_deinit,
196 .encrypt_mpdu = NULL,
197 .decrypt_mpdu = NULL,
198 .encrypt_msdu = NULL,
199 .decrypt_msdu = NULL,
200 .set_key = NULL,
201 .get_key = NULL,
202 .extra_prefix_len = 0,
203 .extra_postfix_len = 0,
204 .owner = THIS_MODULE,
205};
206
207
208static int __init ieee80211_crypto_init(void)
209{
210 int ret = -ENOMEM;
211
212 hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL);
213 if (!hcrypt)
214 goto out;
215
216 memset(hcrypt, 0, sizeof(*hcrypt));
217 INIT_LIST_HEAD(&hcrypt->algs);
218 spin_lock_init(&hcrypt->lock);
219
220 ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null);
221 if (ret < 0) {
222 kfree(hcrypt);
223 hcrypt = NULL;
224 }
225out:
226 return ret;
227}
228
229
230static void __exit ieee80211_crypto_deinit(void)
231{
232 struct list_head *ptr, *n;
233
234 if (hcrypt == NULL)
235 return;
236
237 for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs;
238 ptr = n, n = ptr->next) {
239 struct ieee80211_crypto_alg *alg =
240 (struct ieee80211_crypto_alg *) ptr;
241 list_del(ptr);
242 printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
243 "'%s' (deinit)\n", alg->ops->name);
244 kfree(alg);
245 }
246
247 kfree(hcrypt);
248}
249
250EXPORT_SYMBOL(ieee80211_crypt_deinit_entries);
251EXPORT_SYMBOL(ieee80211_crypt_deinit_handler);
252EXPORT_SYMBOL(ieee80211_crypt_delayed_deinit);
253
254EXPORT_SYMBOL(ieee80211_register_crypto_ops);
255EXPORT_SYMBOL(ieee80211_unregister_crypto_ops);
256EXPORT_SYMBOL(ieee80211_get_crypto_ops);
257
258module_init(ieee80211_crypto_init);
259module_exit(ieee80211_crypto_deinit);
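
Note: ieee80211_crypt.c is purely a registry: cipher modules register an ieee80211_crypto_ops table and consumers look it up by name. A hedged sketch of the consumer side (the request_module() fallback and the keyidx variable are illustrative, not taken from this file):

	struct ieee80211_crypto_ops *ops;
	void *priv;

	ops = ieee80211_get_crypto_ops("WEP");
	if (ops == NULL) {
		request_module("ieee80211_crypt_wep");	/* try autoload */
		ops = ieee80211_get_crypto_ops("WEP");
	}
	if (ops == NULL || !try_module_get(ops->owner))
		return -ENOENT;

	priv = ops->init(keyidx);	/* per-key private state */
	if (priv == NULL) {
		module_put(ops->owner);
		return -ENOMEM;
	}
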
diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c
new file mode 100644
index 000000000000..11d15573b26a
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_ccmp.c
@@ -0,0 +1,470 @@
1/*
2 * Host AP crypt: host-based CCMP encryption implementation for Host AP driver
3 *
4 * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. See README and COPYING for
9 * more details.
10 */
11
12#include <linux/config.h>
13#include <linux/version.h>
14#include <linux/module.h>
15#include <linux/init.h>
16#include <linux/slab.h>
17#include <linux/random.h>
18#include <linux/skbuff.h>
19#include <linux/netdevice.h>
20#include <linux/if_ether.h>
21#include <linux/if_arp.h>
22#include <asm/string.h>
23#include <linux/wireless.h>
24
25#include <net/ieee80211.h>
26
27
28#include <linux/crypto.h>
29#include <asm/scatterlist.h>
30
31MODULE_AUTHOR("Jouni Malinen");
32MODULE_DESCRIPTION("Host AP crypt: CCMP");
33MODULE_LICENSE("GPL");
34
35#define AES_BLOCK_LEN 16
36#define CCMP_HDR_LEN 8
37#define CCMP_MIC_LEN 8
38#define CCMP_TK_LEN 16
39#define CCMP_PN_LEN 6
40
41struct ieee80211_ccmp_data {
42 u8 key[CCMP_TK_LEN];
43 int key_set;
44
45 u8 tx_pn[CCMP_PN_LEN];
46 u8 rx_pn[CCMP_PN_LEN];
47
48 u32 dot11RSNAStatsCCMPFormatErrors;
49 u32 dot11RSNAStatsCCMPReplays;
50 u32 dot11RSNAStatsCCMPDecryptErrors;
51
52 int key_idx;
53
54 struct crypto_tfm *tfm;
55
56 /* scratch buffers for virt_to_page() (crypto API) */
57 u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN],
58 tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN];
59 u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN];
60};
61
62static void ieee80211_ccmp_aes_encrypt(struct crypto_tfm *tfm,
63 const u8 pt[16], u8 ct[16])
64{
65 struct scatterlist src, dst;
66
67 src.page = virt_to_page(pt);
68 src.offset = offset_in_page(pt);
69 src.length = AES_BLOCK_LEN;
70
71 dst.page = virt_to_page(ct);
72 dst.offset = offset_in_page(ct);
73 dst.length = AES_BLOCK_LEN;
74
75 crypto_cipher_encrypt(tfm, &dst, &src, AES_BLOCK_LEN);
76}
77
78static void * ieee80211_ccmp_init(int key_idx)
79{
80 struct ieee80211_ccmp_data *priv;
81
82 priv = kmalloc(sizeof(*priv), GFP_ATOMIC);
83 if (priv == NULL)
84 goto fail;
85 memset(priv, 0, sizeof(*priv));
86 priv->key_idx = key_idx;
87
88 priv->tfm = crypto_alloc_tfm("aes", 0);
89 if (priv->tfm == NULL) {
90 printk(KERN_DEBUG "ieee80211_crypt_ccmp: could not allocate "
91 "crypto API aes\n");
92 goto fail;
93 }
94
95 return priv;
96
97fail:
98 if (priv) {
99 if (priv->tfm)
100 crypto_free_tfm(priv->tfm);
101 kfree(priv);
102 }
103
104 return NULL;
105}
106
107
108static void ieee80211_ccmp_deinit(void *priv)
109{
110 struct ieee80211_ccmp_data *_priv = priv;
111 if (_priv && _priv->tfm)
112 crypto_free_tfm(_priv->tfm);
113 kfree(priv);
114}
115
116
117static inline void xor_block(u8 *b, u8 *a, size_t len)
118{
119 int i;
120 for (i = 0; i < len; i++)
121 b[i] ^= a[i];
122}
123
124
125static void ccmp_init_blocks(struct crypto_tfm *tfm,
126 struct ieee80211_hdr *hdr,
127 u8 *pn, size_t dlen, u8 *b0, u8 *auth,
128 u8 *s0)
129{
130 u8 *pos, qc = 0;
131 size_t aad_len;
132 u16 fc;
133 int a4_included, qc_included;
134 u8 aad[2 * AES_BLOCK_LEN];
135
136 fc = le16_to_cpu(hdr->frame_ctl);
137 a4_included = ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
138 (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS));
139 qc_included = ((WLAN_FC_GET_TYPE(fc) == IEEE80211_FTYPE_DATA) &&
140 (WLAN_FC_GET_STYPE(fc) & 0x08));
141 aad_len = 22;
142 if (a4_included)
143 aad_len += 6;
144 if (qc_included) {
145 pos = (u8 *) &hdr->addr4;
146 if (a4_included)
147 pos += 6;
148 qc = *pos & 0x0f;
149 aad_len += 2;
150 }
151
152 /* CCM Initial Block:
153 * Flag (Include authentication header, M=3 (8-octet MIC),
154 * L=1 (2-octet Dlen))
155 * Nonce: 0x00 | A2 | PN
156 * Dlen */
157 b0[0] = 0x59;
158 b0[1] = qc;
159 memcpy(b0 + 2, hdr->addr2, ETH_ALEN);
160 memcpy(b0 + 8, pn, CCMP_PN_LEN);
161 b0[14] = (dlen >> 8) & 0xff;
162 b0[15] = dlen & 0xff;
163
164 /* AAD:
165 * FC with bits 4..6 and 11..13 masked to zero; 14 is always one
166 * A1 | A2 | A3
167 * SC with bits 4..15 (seq#) masked to zero
168 * A4 (if present)
169 * QC (if present)
170 */
171 pos = (u8 *) hdr;
172 aad[0] = 0; /* aad_len >> 8 */
173 aad[1] = aad_len & 0xff;
174 aad[2] = pos[0] & 0x8f;
175 aad[3] = pos[1] & 0xc7;
176 memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN);
177 pos = (u8 *) &hdr->seq_ctl;
178 aad[22] = pos[0] & 0x0f;
179 aad[23] = 0; /* all bits masked */
180 memset(aad + 24, 0, 8);
181 if (a4_included)
182 memcpy(aad + 24, hdr->addr4, ETH_ALEN);
183 if (qc_included) {
184 aad[a4_included ? 30 : 24] = qc;
185 /* rest of QC masked */
186 }
187
188 /* Start with the first block and AAD */
189 ieee80211_ccmp_aes_encrypt(tfm, b0, auth);
190 xor_block(auth, aad, AES_BLOCK_LEN);
191 ieee80211_ccmp_aes_encrypt(tfm, auth, auth);
192 xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN);
193 ieee80211_ccmp_aes_encrypt(tfm, auth, auth);
194 b0[0] &= 0x07;
195 b0[14] = b0[15] = 0;
196 ieee80211_ccmp_aes_encrypt(tfm, b0, s0);
197}
198
199
200static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
201{
202 struct ieee80211_ccmp_data *key = priv;
203 int data_len, i, blocks, last, len;
204 u8 *pos, *mic;
205 struct ieee80211_hdr *hdr;
206 u8 *b0 = key->tx_b0;
207 u8 *b = key->tx_b;
208 u8 *e = key->tx_e;
209 u8 *s0 = key->tx_s0;
210
211 if (skb_headroom(skb) < CCMP_HDR_LEN ||
212 skb_tailroom(skb) < CCMP_MIC_LEN ||
213 skb->len < hdr_len)
214 return -1;
215
216 data_len = skb->len - hdr_len;
217 pos = skb_push(skb, CCMP_HDR_LEN);
218 memmove(pos, pos + CCMP_HDR_LEN, hdr_len);
219 pos += hdr_len;
220 mic = skb_put(skb, CCMP_MIC_LEN);
221
222 i = CCMP_PN_LEN - 1;
223 while (i >= 0) {
224 key->tx_pn[i]++;
225 if (key->tx_pn[i] != 0)
226 break;
227 i--;
228 }
229
230 *pos++ = key->tx_pn[5];
231 *pos++ = key->tx_pn[4];
232 *pos++ = 0;
233 *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */;
234 *pos++ = key->tx_pn[3];
235 *pos++ = key->tx_pn[2];
236 *pos++ = key->tx_pn[1];
237 *pos++ = key->tx_pn[0];
238
239 hdr = (struct ieee80211_hdr *) skb->data;
240 ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0);
241
242 blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN;
243 last = data_len % AES_BLOCK_LEN;
244
245 for (i = 1; i <= blocks; i++) {
246 len = (i == blocks && last) ? last : AES_BLOCK_LEN;
247 /* Authentication */
248 xor_block(b, pos, len);
249 ieee80211_ccmp_aes_encrypt(key->tfm, b, b);
250 /* Encryption, with counter */
251 b0[14] = (i >> 8) & 0xff;
252 b0[15] = i & 0xff;
253 ieee80211_ccmp_aes_encrypt(key->tfm, b0, e);
254 xor_block(pos, e, len);
255 pos += len;
256 }
257
258 for (i = 0; i < CCMP_MIC_LEN; i++)
259 mic[i] = b[i] ^ s0[i];
260
261 return 0;
262}
263
264
265static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
266{
267 struct ieee80211_ccmp_data *key = priv;
268 u8 keyidx, *pos;
269 struct ieee80211_hdr *hdr;
270 u8 *b0 = key->rx_b0;
271 u8 *b = key->rx_b;
272 u8 *a = key->rx_a;
273 u8 pn[6];
274 int i, blocks, last, len;
275 size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN;
276 u8 *mic = skb->data + skb->len - CCMP_MIC_LEN;
277
278 if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) {
279 key->dot11RSNAStatsCCMPFormatErrors++;
280 return -1;
281 }
282
283 hdr = (struct ieee80211_hdr *) skb->data;
284 pos = skb->data + hdr_len;
285 keyidx = pos[3];
286 if (!(keyidx & (1 << 5))) {
287 if (net_ratelimit()) {
288 printk(KERN_DEBUG "CCMP: received packet without ExtIV"
289 " flag from " MAC_FMT "\n", MAC_ARG(hdr->addr2));
290 }
291 key->dot11RSNAStatsCCMPFormatErrors++;
292 return -2;
293 }
294 keyidx >>= 6;
295 if (key->key_idx != keyidx) {
296 printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame "
297 "keyidx=%d priv=%p\n", key->key_idx, keyidx, priv);
298 return -6;
299 }
300 if (!key->key_set) {
301 if (net_ratelimit()) {
302 printk(KERN_DEBUG "CCMP: received packet from " MAC_FMT
303 " with keyid=%d that does not have a configured"
304 " key\n", MAC_ARG(hdr->addr2), keyidx);
305 }
306 return -3;
307 }
308
309 pn[0] = pos[7];
310 pn[1] = pos[6];
311 pn[2] = pos[5];
312 pn[3] = pos[4];
313 pn[4] = pos[1];
314 pn[5] = pos[0];
315 pos += 8;
316
317 if (memcmp(pn, key->rx_pn, CCMP_PN_LEN) <= 0) {
318 if (net_ratelimit()) {
319 printk(KERN_DEBUG "CCMP: replay detected: STA=" MAC_FMT
320 " previous PN %02x%02x%02x%02x%02x%02x "
321 "received PN %02x%02x%02x%02x%02x%02x\n",
322 MAC_ARG(hdr->addr2), MAC_ARG(key->rx_pn),
323 MAC_ARG(pn));
324 }
325 key->dot11RSNAStatsCCMPReplays++;
326 return -4;
327 }
328
329 ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b);
330 xor_block(mic, b, CCMP_MIC_LEN);
331
332 blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN;
333 last = data_len % AES_BLOCK_LEN;
334
335 for (i = 1; i <= blocks; i++) {
336 len = (i == blocks && last) ? last : AES_BLOCK_LEN;
337 /* Decrypt, with counter */
338 b0[14] = (i >> 8) & 0xff;
339 b0[15] = i & 0xff;
340 ieee80211_ccmp_aes_encrypt(key->tfm, b0, b);
341 xor_block(pos, b, len);
342 /* Authentication */
343 xor_block(a, pos, len);
344 ieee80211_ccmp_aes_encrypt(key->tfm, a, a);
345 pos += len;
346 }
347
348 if (memcmp(mic, a, CCMP_MIC_LEN) != 0) {
349 if (net_ratelimit()) {
350 printk(KERN_DEBUG "CCMP: decrypt failed: STA="
351 MAC_FMT "\n", MAC_ARG(hdr->addr2));
352 }
353 key->dot11RSNAStatsCCMPDecryptErrors++;
354 return -5;
355 }
356
357 memcpy(key->rx_pn, pn, CCMP_PN_LEN);
358
359 /* Remove hdr and MIC */
360 memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len);
361 skb_pull(skb, CCMP_HDR_LEN);
362 skb_trim(skb, skb->len - CCMP_MIC_LEN);
363
364 return keyidx;
365}
366
367
368static int ieee80211_ccmp_set_key(void *key, int len, u8 *seq, void *priv)
369{
370 struct ieee80211_ccmp_data *data = priv;
371 int keyidx;
372 struct crypto_tfm *tfm = data->tfm;
373
374 keyidx = data->key_idx;
375 memset(data, 0, sizeof(*data));
376 data->key_idx = keyidx;
377 data->tfm = tfm;
378 if (len == CCMP_TK_LEN) {
379 memcpy(data->key, key, CCMP_TK_LEN);
380 data->key_set = 1;
381 if (seq) {
382 data->rx_pn[0] = seq[5];
383 data->rx_pn[1] = seq[4];
384 data->rx_pn[2] = seq[3];
385 data->rx_pn[3] = seq[2];
386 data->rx_pn[4] = seq[1];
387 data->rx_pn[5] = seq[0];
388 }
389 crypto_cipher_setkey(data->tfm, data->key, CCMP_TK_LEN);
390 } else if (len == 0)
391 data->key_set = 0;
392 else
393 return -1;
394
395 return 0;
396}
397
398
399static int ieee80211_ccmp_get_key(void *key, int len, u8 *seq, void *priv)
400{
401 struct ieee80211_ccmp_data *data = priv;
402
403 if (len < CCMP_TK_LEN)
404 return -1;
405
406 if (!data->key_set)
407 return 0;
408 memcpy(key, data->key, CCMP_TK_LEN);
409
410 if (seq) {
411 seq[0] = data->tx_pn[5];
412 seq[1] = data->tx_pn[4];
413 seq[2] = data->tx_pn[3];
414 seq[3] = data->tx_pn[2];
415 seq[4] = data->tx_pn[1];
416 seq[5] = data->tx_pn[0];
417 }
418
419 return CCMP_TK_LEN;
420}
421
422
423static char * ieee80211_ccmp_print_stats(char *p, void *priv)
424{
425 struct ieee80211_ccmp_data *ccmp = priv;
426 p += sprintf(p, "key[%d] alg=CCMP key_set=%d "
427 "tx_pn=%02x%02x%02x%02x%02x%02x "
428 "rx_pn=%02x%02x%02x%02x%02x%02x "
429 "format_errors=%d replays=%d decrypt_errors=%d\n",
430 ccmp->key_idx, ccmp->key_set,
431 MAC_ARG(ccmp->tx_pn), MAC_ARG(ccmp->rx_pn),
432 ccmp->dot11RSNAStatsCCMPFormatErrors,
433 ccmp->dot11RSNAStatsCCMPReplays,
434 ccmp->dot11RSNAStatsCCMPDecryptErrors);
435
436 return p;
437}
438
439
440static struct ieee80211_crypto_ops ieee80211_crypt_ccmp = {
441 .name = "CCMP",
442 .init = ieee80211_ccmp_init,
443 .deinit = ieee80211_ccmp_deinit,
444 .encrypt_mpdu = ieee80211_ccmp_encrypt,
445 .decrypt_mpdu = ieee80211_ccmp_decrypt,
446 .encrypt_msdu = NULL,
447 .decrypt_msdu = NULL,
448 .set_key = ieee80211_ccmp_set_key,
449 .get_key = ieee80211_ccmp_get_key,
450 .print_stats = ieee80211_ccmp_print_stats,
451 .extra_prefix_len = CCMP_HDR_LEN,
452 .extra_postfix_len = CCMP_MIC_LEN,
453 .owner = THIS_MODULE,
454};
455
456
457static int __init ieee80211_crypto_ccmp_init(void)
458{
459 return ieee80211_register_crypto_ops(&ieee80211_crypt_ccmp);
460}
461
462
463static void __exit ieee80211_crypto_ccmp_exit(void)
464{
465 ieee80211_unregister_crypto_ops(&ieee80211_crypt_ccmp);
466}
467
468
469module_init(ieee80211_crypto_ccmp_init);
470module_exit(ieee80211_crypto_ccmp_exit);
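
Note: the 8-byte CCMP header built in ieee80211_ccmp_encrypt() above carries the 48-bit packet number (PN) split around the key-id octet, least-significant byte first, while tx_pn[]/rx_pn[] keep the PN most-significant byte first, which is why a plain memcmp() suffices for the replay check. The layout, as the decrypt path in this file reassembles it:

	/* hdr[0] = PN0 (lsb)   hdr[1] = PN1   hdr[2] = reserved (0)
	 * hdr[3] = (key_idx << 6) | 0x20    (0x20 = ExtIV bit)
	 * hdr[4] = PN2   hdr[5] = PN3   hdr[6] = PN4   hdr[7] = PN5 (msb)
	 * pn[] ends up msb-first, matching the stored rx_pn[]: */
	pn[0] = pos[7]; pn[1] = pos[6]; pn[2] = pos[5];
	pn[3] = pos[4]; pn[4] = pos[1]; pn[5] = pos[0];
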
diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c
new file mode 100644
index 000000000000..f91d92c6df25
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_tkip.c
@@ -0,0 +1,708 @@
1/*
2 * Host AP crypt: host-based TKIP encryption implementation for Host AP driver
3 *
4 * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. See README and COPYING for
9 * more details.
10 */
11
12#include <linux/config.h>
13#include <linux/version.h>
14#include <linux/module.h>
15#include <linux/init.h>
16#include <linux/slab.h>
17#include <linux/random.h>
18#include <linux/skbuff.h>
19#include <linux/netdevice.h>
20#include <linux/if_ether.h>
21#include <linux/if_arp.h>
22#include <asm/string.h>
23
24#include <net/ieee80211.h>
25
26
27#include <linux/crypto.h>
28#include <asm/scatterlist.h>
29#include <linux/crc32.h>
30
31MODULE_AUTHOR("Jouni Malinen");
32MODULE_DESCRIPTION("Host AP crypt: TKIP");
33MODULE_LICENSE("GPL");
34
35struct ieee80211_tkip_data {
36#define TKIP_KEY_LEN 32
37 u8 key[TKIP_KEY_LEN];
38 int key_set;
39
40 u32 tx_iv32;
41 u16 tx_iv16;
42 u16 tx_ttak[5];
43 int tx_phase1_done;
44
45 u32 rx_iv32;
46 u16 rx_iv16;
47 u16 rx_ttak[5];
48 int rx_phase1_done;
49 u32 rx_iv32_new;
50 u16 rx_iv16_new;
51
52 u32 dot11RSNAStatsTKIPReplays;
53 u32 dot11RSNAStatsTKIPICVErrors;
54 u32 dot11RSNAStatsTKIPLocalMICFailures;
55
56 int key_idx;
57
58 struct crypto_tfm *tfm_arc4;
59 struct crypto_tfm *tfm_michael;
60
61 /* scratch buffers for virt_to_page() (crypto API) */
62 u8 rx_hdr[16], tx_hdr[16];
63};
64
65static void * ieee80211_tkip_init(int key_idx)
66{
67 struct ieee80211_tkip_data *priv;
68
69 priv = kmalloc(sizeof(*priv), GFP_ATOMIC);
70 if (priv == NULL)
71 goto fail;
72 memset(priv, 0, sizeof(*priv));
73 priv->key_idx = key_idx;
74
75 priv->tfm_arc4 = crypto_alloc_tfm("arc4", 0);
76 if (priv->tfm_arc4 == NULL) {
77 printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
78 "crypto API arc4\n");
79 goto fail;
80 }
81
82 priv->tfm_michael = crypto_alloc_tfm("michael_mic", 0);
83 if (priv->tfm_michael == NULL) {
84 printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
85 "crypto API michael_mic\n");
86 goto fail;
87 }
88
89 return priv;
90
91fail:
92 if (priv) {
93 if (priv->tfm_michael)
94 crypto_free_tfm(priv->tfm_michael);
95 if (priv->tfm_arc4)
96 crypto_free_tfm(priv->tfm_arc4);
97 kfree(priv);
98 }
99
100 return NULL;
101}
102
103
104static void ieee80211_tkip_deinit(void *priv)
105{
106 struct ieee80211_tkip_data *_priv = priv;
107 if (_priv && _priv->tfm_michael)
108 crypto_free_tfm(_priv->tfm_michael);
109 if (_priv && _priv->tfm_arc4)
110 crypto_free_tfm(_priv->tfm_arc4);
111 kfree(priv);
112}
113
114
115static inline u16 RotR1(u16 val)
116{
117 return (val >> 1) | (val << 15);
118}
119
120
121static inline u8 Lo8(u16 val)
122{
123 return val & 0xff;
124}
125
126
127static inline u8 Hi8(u16 val)
128{
129 return val >> 8;
130}
131
132
133static inline u16 Lo16(u32 val)
134{
135 return val & 0xffff;
136}
137
138
139static inline u16 Hi16(u32 val)
140{
141 return val >> 16;
142}
143
144
145static inline u16 Mk16(u8 hi, u8 lo)
146{
147 return lo | (((u16) hi) << 8);
148}
149
150
151static inline u16 Mk16_le(u16 *v)
152{
153 return le16_to_cpu(*v);
154}
155
156
157static const u16 Sbox[256] =
158{
159 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154,
160 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A,
161 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B,
162 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B,
163 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F,
164 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F,
165 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5,
166 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F,
167 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB,
168 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397,
169 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED,
170 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A,
171 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194,
172 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3,
173 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104,
174 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D,
175 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39,
176 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695,
177 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83,
178 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76,
179 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4,
180 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B,
181 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0,
182 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018,
183 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751,
184 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85,
185 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12,
186 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9,
187 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7,
188 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A,
189 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8,
190 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A,
191};
192
193
194static inline u16 _S_(u16 v)
195{
196 u16 t = Sbox[Hi8(v)];
197 return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8));
198}
199
200
201#define PHASE1_LOOP_COUNT 8
202
203static void tkip_mixing_phase1(u16 *TTAK, const u8 *TK, const u8 *TA, u32 IV32)
204{
205 int i, j;
206
207 /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */
208 TTAK[0] = Lo16(IV32);
209 TTAK[1] = Hi16(IV32);
210 TTAK[2] = Mk16(TA[1], TA[0]);
211 TTAK[3] = Mk16(TA[3], TA[2]);
212 TTAK[4] = Mk16(TA[5], TA[4]);
213
214 for (i = 0; i < PHASE1_LOOP_COUNT; i++) {
215 j = 2 * (i & 1);
216 TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j]));
217 TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j]));
218 TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j]));
219 TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j]));
220 TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i;
221 }
222}
223
224
225static void tkip_mixing_phase2(u8 *WEPSeed, const u8 *TK, const u16 *TTAK,
226 u16 IV16)
227{
228 /* Make temporary area overlap WEP seed so that the final copy can be
229 * avoided on little endian hosts. */
230 u16 *PPK = (u16 *) &WEPSeed[4];
231
232 /* Step 1 - make copy of TTAK and bring in TSC */
233 PPK[0] = TTAK[0];
234 PPK[1] = TTAK[1];
235 PPK[2] = TTAK[2];
236 PPK[3] = TTAK[3];
237 PPK[4] = TTAK[4];
238 PPK[5] = TTAK[4] + IV16;
239
240 /* Step 2 - 96-bit bijective mixing using S-box */
241 PPK[0] += _S_(PPK[5] ^ Mk16_le((u16 *) &TK[0]));
242 PPK[1] += _S_(PPK[0] ^ Mk16_le((u16 *) &TK[2]));
243 PPK[2] += _S_(PPK[1] ^ Mk16_le((u16 *) &TK[4]));
244 PPK[3] += _S_(PPK[2] ^ Mk16_le((u16 *) &TK[6]));
245 PPK[4] += _S_(PPK[3] ^ Mk16_le((u16 *) &TK[8]));
246 PPK[5] += _S_(PPK[4] ^ Mk16_le((u16 *) &TK[10]));
247
248 PPK[0] += RotR1(PPK[5] ^ Mk16_le((u16 *) &TK[12]));
249 PPK[1] += RotR1(PPK[0] ^ Mk16_le((u16 *) &TK[14]));
250 PPK[2] += RotR1(PPK[1]);
251 PPK[3] += RotR1(PPK[2]);
252 PPK[4] += RotR1(PPK[3]);
253 PPK[5] += RotR1(PPK[4]);
254
255 /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value
256 * WEPSeed[0..2] is transmitted as WEP IV */
257 WEPSeed[0] = Hi8(IV16);
258 WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F;
259 WEPSeed[2] = Lo8(IV16);
260 WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((u16 *) &TK[0])) >> 1);
261
262#ifdef __BIG_ENDIAN
263 {
264 int i;
265 for (i = 0; i < 6; i++)
266 PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8);
267 }
268#endif
269}
270
271static int ieee80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
272{
273 struct ieee80211_tkip_data *tkey = priv;
274 int len;
275 u8 rc4key[16], *pos, *icv;
276 struct ieee80211_hdr *hdr;
277 u32 crc;
278 struct scatterlist sg;
279
280 if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 4 ||
281 skb->len < hdr_len)
282 return -1;
283
284 hdr = (struct ieee80211_hdr *) skb->data;
285 if (!tkey->tx_phase1_done) {
286 tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2,
287 tkey->tx_iv32);
288 tkey->tx_phase1_done = 1;
289 }
290 tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16);
291
292 len = skb->len - hdr_len;
293 pos = skb_push(skb, 8);
294 memmove(pos, pos + 8, hdr_len);
295 pos += hdr_len;
296 icv = skb_put(skb, 4);
297
298 *pos++ = rc4key[0];
299 *pos++ = rc4key[1];
300 *pos++ = rc4key[2];
301 *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */;
302 *pos++ = tkey->tx_iv32 & 0xff;
303 *pos++ = (tkey->tx_iv32 >> 8) & 0xff;
304 *pos++ = (tkey->tx_iv32 >> 16) & 0xff;
305 *pos++ = (tkey->tx_iv32 >> 24) & 0xff;
306
307 crc = ~crc32_le(~0, pos, len);
308 icv[0] = crc;
309 icv[1] = crc >> 8;
310 icv[2] = crc >> 16;
311 icv[3] = crc >> 24;
312
313 crypto_cipher_setkey(tkey->tfm_arc4, rc4key, 16);
314 sg.page = virt_to_page(pos);
315 sg.offset = offset_in_page(pos);
316 sg.length = len + 4;
317 crypto_cipher_encrypt(tkey->tfm_arc4, &sg, &sg, len + 4);
318
319 tkey->tx_iv16++;
320 if (tkey->tx_iv16 == 0) {
321 tkey->tx_phase1_done = 0;
322 tkey->tx_iv32++;
323 }
324
325 return 0;
326}
327
328static int ieee80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
329{
330 struct ieee80211_tkip_data *tkey = priv;
331 u8 rc4key[16];
332 u8 keyidx, *pos;
333 u32 iv32;
334 u16 iv16;
335 struct ieee80211_hdr *hdr;
336 u8 icv[4];
337 u32 crc;
338 struct scatterlist sg;
339 int plen;
340
341 if (skb->len < hdr_len + 8 + 4)
342 return -1;
343
344 hdr = (struct ieee80211_hdr *) skb->data;
345 pos = skb->data + hdr_len;
346 keyidx = pos[3];
347 if (!(keyidx & (1 << 5))) {
348 if (net_ratelimit()) {
349 printk(KERN_DEBUG "TKIP: received packet without ExtIV"
350 " flag from " MAC_FMT "\n", MAC_ARG(hdr->addr2));
351 }
352 return -2;
353 }
354 keyidx >>= 6;
355 if (tkey->key_idx != keyidx) {
356 printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame "
357 "keyidx=%d priv=%p\n", tkey->key_idx, keyidx, priv);
358 return -6;
359 }
360 if (!tkey->key_set) {
361 if (net_ratelimit()) {
362 printk(KERN_DEBUG "TKIP: received packet from " MAC_FMT
363 " with keyid=%d that does not have a configured"
364 " key\n", MAC_ARG(hdr->addr2), keyidx);
365 }
366 return -3;
367 }
368 iv16 = (pos[0] << 8) | pos[2];
369 iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24);
370 pos += 8;
371
372 if (iv32 < tkey->rx_iv32 ||
373 (iv32 == tkey->rx_iv32 && iv16 <= tkey->rx_iv16)) {
374 if (net_ratelimit()) {
375 printk(KERN_DEBUG "TKIP: replay detected: STA=" MAC_FMT
376 " previous TSC %08x%04x received TSC "
377 "%08x%04x\n", MAC_ARG(hdr->addr2),
378 tkey->rx_iv32, tkey->rx_iv16, iv32, iv16);
379 }
380 tkey->dot11RSNAStatsTKIPReplays++;
381 return -4;
382 }
383
384 if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) {
385 tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32);
386 tkey->rx_phase1_done = 1;
387 }
388 tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16);
389
390 plen = skb->len - hdr_len - 12;
391
392 crypto_cipher_setkey(tkey->tfm_arc4, rc4key, 16);
393 sg.page = virt_to_page(pos);
394 sg.offset = offset_in_page(pos);
395 sg.length = plen + 4;
396 crypto_cipher_decrypt(tkey->tfm_arc4, &sg, &sg, plen + 4);
397
398 crc = ~crc32_le(~0, pos, plen);
399 icv[0] = crc;
400 icv[1] = crc >> 8;
401 icv[2] = crc >> 16;
402 icv[3] = crc >> 24;
403 if (memcmp(icv, pos + plen, 4) != 0) {
404 if (iv32 != tkey->rx_iv32) {
405 /* Previously cached Phase1 result was already lost, so
406 * it needs to be recalculated for the next packet. */
407 tkey->rx_phase1_done = 0;
408 }
409 if (net_ratelimit()) {
410 printk(KERN_DEBUG "TKIP: ICV error detected: STA="
411 MAC_FMT "\n", MAC_ARG(hdr->addr2));
412 }
413 tkey->dot11RSNAStatsTKIPICVErrors++;
414 return -5;
415 }
416
417 /* Update real counters only after Michael MIC verification has
418 * completed */
419 tkey->rx_iv32_new = iv32;
420 tkey->rx_iv16_new = iv16;
421
422 /* Remove IV and ICV */
423 memmove(skb->data + 8, skb->data, hdr_len);
424 skb_pull(skb, 8);
425 skb_trim(skb, skb->len - 4);
426
427 return keyidx;
428}
429
430
431static int michael_mic(struct ieee80211_tkip_data *tkey, u8 *key, u8 *hdr,
432 u8 *data, size_t data_len, u8 *mic)
433{
434 struct scatterlist sg[2];
435
436 if (tkey->tfm_michael == NULL) {
437 printk(KERN_WARNING "michael_mic: tfm_michael == NULL\n");
438 return -1;
439 }
440 sg[0].page = virt_to_page(hdr);
441 sg[0].offset = offset_in_page(hdr);
442 sg[0].length = 16;
443
444 sg[1].page = virt_to_page(data);
445 sg[1].offset = offset_in_page(data);
446 sg[1].length = data_len;
447
448 crypto_digest_init(tkey->tfm_michael);
449 crypto_digest_setkey(tkey->tfm_michael, key, 8);
450 crypto_digest_update(tkey->tfm_michael, sg, 2);
451 crypto_digest_final(tkey->tfm_michael, mic);
452
453 return 0;
454}
455
456static void michael_mic_hdr(struct sk_buff *skb, u8 *hdr)
457{
458 struct ieee80211_hdr *hdr11;
459
460 hdr11 = (struct ieee80211_hdr *) skb->data;
461 switch (le16_to_cpu(hdr11->frame_ctl) &
462 (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
463 case IEEE80211_FCTL_TODS:
464 memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
465 memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
466 break;
467 case IEEE80211_FCTL_FROMDS:
468 memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
469 memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */
470 break;
471 case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
472 memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
473 memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
474 break;
475 case 0:
476 memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
477 memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
478 break;
479 }
480
481 hdr[12] = 0; /* priority */
482 hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */
483}
484
485
486static int ieee80211_michael_mic_add(struct sk_buff *skb, int hdr_len, void *priv)
487{
488 struct ieee80211_tkip_data *tkey = priv;
489 u8 *pos;
490
491 if (skb_tailroom(skb) < 8 || skb->len < hdr_len) {
492 printk(KERN_DEBUG "Invalid packet for Michael MIC add "
493 "(tailroom=%d hdr_len=%d skb->len=%d)\n",
494 skb_tailroom(skb), hdr_len, skb->len);
495 return -1;
496 }
497
498 michael_mic_hdr(skb, tkey->tx_hdr);
499 pos = skb_put(skb, 8);
500 if (michael_mic(tkey, &tkey->key[16], tkey->tx_hdr,
501 skb->data + hdr_len, skb->len - 8 - hdr_len, pos))
502 return -1;
503
504 return 0;
505}
506
507
508#if WIRELESS_EXT >= 18
509static void ieee80211_michael_mic_failure(struct net_device *dev,
510 struct ieee80211_hdr *hdr,
511 int keyidx)
512{
513 union iwreq_data wrqu;
514 struct iw_michaelmicfailure ev;
515
516 /* TODO: needed parameters: count, keyid, key type, TSC */
517 memset(&ev, 0, sizeof(ev));
518 ev.flags = keyidx & IW_MICFAILURE_KEY_ID;
519 if (hdr->addr1[0] & 0x01)
520 ev.flags |= IW_MICFAILURE_GROUP;
521 else
522 ev.flags |= IW_MICFAILURE_PAIRWISE;
523 ev.src_addr.sa_family = ARPHRD_ETHER;
524 memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN);
525 memset(&wrqu, 0, sizeof(wrqu));
526 wrqu.data.length = sizeof(ev);
527 wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *) &ev);
528}
529#elif WIRELESS_EXT >= 15
530static void ieee80211_michael_mic_failure(struct net_device *dev,
531 struct ieee80211_hdr *hdr,
532 int keyidx)
533{
534 union iwreq_data wrqu;
535 char buf[128];
536
537 /* TODO: needed parameters: count, keyid, key type, TSC */
538 sprintf(buf, "MLME-MICHAELMICFAILURE.indication(keyid=%d %scast addr="
539 MAC_FMT ")", keyidx, hdr->addr1[0] & 0x01 ? "broad" : "uni",
540 MAC_ARG(hdr->addr2));
541 memset(&wrqu, 0, sizeof(wrqu));
542 wrqu.data.length = strlen(buf);
543 wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
544}
545#else /* WIRELESS_EXT >= 15 */
546static inline void ieee80211_michael_mic_failure(struct net_device *dev,
547 struct ieee80211_hdr *hdr,
548 int keyidx)
549{
550}
551#endif /* WIRELESS_EXT >= 15 */
552
553
554static int ieee80211_michael_mic_verify(struct sk_buff *skb, int keyidx,
555 int hdr_len, void *priv)
556{
557 struct ieee80211_tkip_data *tkey = priv;
558 u8 mic[8];
559
560 if (!tkey->key_set)
561 return -1;
562
563 michael_mic_hdr(skb, tkey->rx_hdr);
564 if (michael_mic(tkey, &tkey->key[24], tkey->rx_hdr,
565 skb->data + hdr_len, skb->len - 8 - hdr_len, mic))
566 return -1;
567 if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) {
568 struct ieee80211_hdr *hdr;
569 hdr = (struct ieee80211_hdr *) skb->data;
570 printk(KERN_DEBUG "%s: Michael MIC verification failed for "
571 "MSDU from " MAC_FMT " keyidx=%d\n",
572 skb->dev ? skb->dev->name : "N/A", MAC_ARG(hdr->addr2),
573 keyidx);
574 if (skb->dev)
575 ieee80211_michael_mic_failure(skb->dev, hdr, keyidx);
576 tkey->dot11RSNAStatsTKIPLocalMICFailures++;
577 return -1;
578 }
579
580 /* Update TSC counters for RX now that the packet verification has
581 * completed. */
582 tkey->rx_iv32 = tkey->rx_iv32_new;
583 tkey->rx_iv16 = tkey->rx_iv16_new;
584
585 skb_trim(skb, skb->len - 8);
586
587 return 0;
588}
589
590
591static int ieee80211_tkip_set_key(void *key, int len, u8 *seq, void *priv)
592{
593 struct ieee80211_tkip_data *tkey = priv;
594 int keyidx;
595 struct crypto_tfm *tfm = tkey->tfm_michael;
596 struct crypto_tfm *tfm2 = tkey->tfm_arc4;
597
598 keyidx = tkey->key_idx;
599 memset(tkey, 0, sizeof(*tkey));
600 tkey->key_idx = keyidx;
601 tkey->tfm_michael = tfm;
602 tkey->tfm_arc4 = tfm2;
603 if (len == TKIP_KEY_LEN) {
604 memcpy(tkey->key, key, TKIP_KEY_LEN);
605 tkey->key_set = 1;
606 tkey->tx_iv16 = 1; /* TSC is initialized to 1 */
607 if (seq) {
608 tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) |
609 (seq[3] << 8) | seq[2];
610 tkey->rx_iv16 = (seq[1] << 8) | seq[0];
611 }
612 } else if (len == 0)
613 tkey->key_set = 0;
614 else
615 return -1;
616
617 return 0;
618}
619
620
621static int ieee80211_tkip_get_key(void *key, int len, u8 *seq, void *priv)
622{
623 struct ieee80211_tkip_data *tkey = priv;
624
625 if (len < TKIP_KEY_LEN)
626 return -1;
627
628 if (!tkey->key_set)
629 return 0;
630 memcpy(key, tkey->key, TKIP_KEY_LEN);
631
632 if (seq) {
633 /* Return the sequence number of the last transmitted frame. */
634 u16 iv16 = tkey->tx_iv16;
635 u32 iv32 = tkey->tx_iv32;
636 if (iv16 == 0)
637 iv32--;
638 iv16--;
639 seq[0] = tkey->tx_iv16;
640 seq[1] = tkey->tx_iv16 >> 8;
641 seq[2] = tkey->tx_iv32;
642 seq[3] = tkey->tx_iv32 >> 8;
643 seq[4] = tkey->tx_iv32 >> 16;
644 seq[5] = tkey->tx_iv32 >> 24;
645 }
646
647 return TKIP_KEY_LEN;
648}
649
650
651static char * ieee80211_tkip_print_stats(char *p, void *priv)
652{
653 struct ieee80211_tkip_data *tkip = priv;
654 p += sprintf(p, "key[%d] alg=TKIP key_set=%d "
655 "tx_pn=%02x%02x%02x%02x%02x%02x "
656 "rx_pn=%02x%02x%02x%02x%02x%02x "
657 "replays=%d icv_errors=%d local_mic_failures=%d\n",
658 tkip->key_idx, tkip->key_set,
659 (tkip->tx_iv32 >> 24) & 0xff,
660 (tkip->tx_iv32 >> 16) & 0xff,
661 (tkip->tx_iv32 >> 8) & 0xff,
662 tkip->tx_iv32 & 0xff,
663 (tkip->tx_iv16 >> 8) & 0xff,
664 tkip->tx_iv16 & 0xff,
665 (tkip->rx_iv32 >> 24) & 0xff,
666 (tkip->rx_iv32 >> 16) & 0xff,
667 (tkip->rx_iv32 >> 8) & 0xff,
668 tkip->rx_iv32 & 0xff,
669 (tkip->rx_iv16 >> 8) & 0xff,
670 tkip->rx_iv16 & 0xff,
671 tkip->dot11RSNAStatsTKIPReplays,
672 tkip->dot11RSNAStatsTKIPICVErrors,
673 tkip->dot11RSNAStatsTKIPLocalMICFailures);
674 return p;
675}
676
677
678static struct ieee80211_crypto_ops ieee80211_crypt_tkip = {
679 .name = "TKIP",
680 .init = ieee80211_tkip_init,
681 .deinit = ieee80211_tkip_deinit,
682 .encrypt_mpdu = ieee80211_tkip_encrypt,
683 .decrypt_mpdu = ieee80211_tkip_decrypt,
684 .encrypt_msdu = ieee80211_michael_mic_add,
685 .decrypt_msdu = ieee80211_michael_mic_verify,
686 .set_key = ieee80211_tkip_set_key,
687 .get_key = ieee80211_tkip_get_key,
688 .print_stats = ieee80211_tkip_print_stats,
689 .extra_prefix_len = 4 + 4, /* IV + ExtIV */
690 .extra_postfix_len = 8 + 4, /* MIC + ICV */
691 .owner = THIS_MODULE,
692};
693
694
695static int __init ieee80211_crypto_tkip_init(void)
696{
697 return ieee80211_register_crypto_ops(&ieee80211_crypt_tkip);
698}
699
700
701static void __exit ieee80211_crypto_tkip_exit(void)
702{
703 ieee80211_unregister_crypto_ops(&ieee80211_crypt_tkip);
704}
705
706
707module_init(ieee80211_crypto_tkip_init);
708module_exit(ieee80211_crypto_tkip_exit);
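
Note: the encrypt/decrypt paths above split TKIP key mixing exactly as the specification does: phase 1 depends only on the temporal key, the transmitter address and the high 32 bits of the TSC (IV32), so its result is cached in tx_ttak/rx_ttak and reused until IV32 changes, while phase 2 folds in the low 16 bits (IV16) for every frame. Condensed from the transmit path in this file:

	if (!tkey->tx_phase1_done) {
		tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2,
				   tkey->tx_iv32);
		tkey->tx_phase1_done = 1;	/* valid for 2^16 frames */
	}
	tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16);

	if (++tkey->tx_iv16 == 0) {		/* IV16 wrapped */
		tkey->tx_phase1_done = 0;	/* redo phase 1 ...    */
		tkey->tx_iv32++;		/* ... for the next IV32 */
	}
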
diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c
new file mode 100644
index 000000000000..bec1d3470d39
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_wep.c
@@ -0,0 +1,272 @@
1/*
2 * Host AP crypt: host-based WEP encryption implementation for Host AP driver
3 *
4 * Copyright (c) 2002-2004, Jouni Malinen <jkmaline@cc.hut.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. See README and COPYING for
9 * more details.
10 */
11
12#include <linux/config.h>
13#include <linux/version.h>
14#include <linux/module.h>
15#include <linux/init.h>
16#include <linux/slab.h>
17#include <linux/random.h>
18#include <linux/skbuff.h>
19#include <asm/string.h>
20
21#include <net/ieee80211.h>
22
23
24#include <linux/crypto.h>
25#include <asm/scatterlist.h>
26#include <linux/crc32.h>
27
28MODULE_AUTHOR("Jouni Malinen");
29MODULE_DESCRIPTION("Host AP crypt: WEP");
30MODULE_LICENSE("GPL");
31
32
33struct prism2_wep_data {
34 u32 iv;
35#define WEP_KEY_LEN 13
36 u8 key[WEP_KEY_LEN + 1];
37 u8 key_len;
38 u8 key_idx;
39 struct crypto_tfm *tfm;
40};
41
42
43static void * prism2_wep_init(int keyidx)
44{
45 struct prism2_wep_data *priv;
46
47 priv = kmalloc(sizeof(*priv), GFP_ATOMIC);
48 if (priv == NULL)
49 goto fail;
50 memset(priv, 0, sizeof(*priv));
51 priv->key_idx = keyidx;
52
53 priv->tfm = crypto_alloc_tfm("arc4", 0);
54 if (priv->tfm == NULL) {
55 printk(KERN_DEBUG "ieee80211_crypt_wep: could not allocate "
56 "crypto API arc4\n");
57 goto fail;
58 }
59
60 /* start WEP IV from a random value */
61 get_random_bytes(&priv->iv, 4);
62
63 return priv;
64
65fail:
66 if (priv) {
67 if (priv->tfm)
68 crypto_free_tfm(priv->tfm);
69 kfree(priv);
70 }
71 return NULL;
72}
73
74
75static void prism2_wep_deinit(void *priv)
76{
77 struct prism2_wep_data *_priv = priv;
78 if (_priv && _priv->tfm)
79 crypto_free_tfm(_priv->tfm);
80 kfree(priv);
81}
82
83
84/* Perform WEP encryption on given skb that has at least 4 bytes of headroom
85 * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted,
86 * so the payload length increases with 8 bytes.
87 *
88 * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
89 */
90static int prism2_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
91{
92 struct prism2_wep_data *wep = priv;
93 u32 crc, klen, len;
94 u8 key[WEP_KEY_LEN + 3];
95 u8 *pos, *icv;
96 struct scatterlist sg;
97
98 if (skb_headroom(skb) < 4 || skb_tailroom(skb) < 4 ||
99 skb->len < hdr_len)
100 return -1;
101
102 len = skb->len - hdr_len;
103 pos = skb_push(skb, 4);
104 memmove(pos, pos + 4, hdr_len);
105 pos += hdr_len;
106
107 klen = 3 + wep->key_len;
108
109 wep->iv++;
110
111 /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key
112 * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N)
113 * can be used to speedup attacks, so avoid using them. */
114 if ((wep->iv & 0xff00) == 0xff00) {
115 u8 B = (wep->iv >> 16) & 0xff;
116 if (B >= 3 && B < klen)
117 wep->iv += 0x0100;
118 }
119
120 /* Prepend 24-bit IV to RC4 key and TX frame */
121 *pos++ = key[0] = (wep->iv >> 16) & 0xff;
122 *pos++ = key[1] = (wep->iv >> 8) & 0xff;
123 *pos++ = key[2] = wep->iv & 0xff;
124 *pos++ = wep->key_idx << 6;
125
126 /* Copy rest of the WEP key (the secret part) */
127 memcpy(key + 3, wep->key, wep->key_len);
128
129 /* Append little-endian CRC32 and encrypt it to produce ICV */
130 crc = ~crc32_le(~0, pos, len);
131 icv = skb_put(skb, 4);
132 icv[0] = crc;
133 icv[1] = crc >> 8;
134 icv[2] = crc >> 16;
135 icv[3] = crc >> 24;
136
137 crypto_cipher_setkey(wep->tfm, key, klen);
138 sg.page = virt_to_page(pos);
139 sg.offset = offset_in_page(pos);
140 sg.length = len + 4;
141 crypto_cipher_encrypt(wep->tfm, &sg, &sg, len + 4);
142
143 return 0;
144}
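For reference, the ICV built above is the standard CRC-32 serialized least-significant byte first; a minimal userspace sketch (not part of the patch) can reproduce the four pre-encryption ICV bytes, assuming zlib's crc32(), which computes the same value as the kernel's ~crc32_le(~0, buf, len) convention:

/* Illustrative userspace sketch: reproduce the WEP ICV bytes for a
 * plaintext payload, before RC4 encryption is applied. */
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>

static void wep_icv(const uint8_t *data, size_t len, uint8_t icv[4])
{
	uint32_t crc = crc32(0L, data, len);	/* standard CRC-32 */

	icv[0] = crc & 0xff;		/* little-endian serialization, */
	icv[1] = (crc >> 8) & 0xff;	/* matching icv[0..3] above */
	icv[2] = (crc >> 16) & 0xff;
	icv[3] = (crc >> 24) & 0xff;
}

int main(void)
{
	const uint8_t payload[] = "example MPDU payload";
	uint8_t icv[4];

	wep_icv(payload, sizeof(payload) - 1, icv);
	printf("ICV: %02x %02x %02x %02x\n", icv[0], icv[1], icv[2], icv[3]);
	return 0;
}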
145
146
147/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of
148 * the frame: IV (4 bytes), encrypted payload (including SNAP header),
149 * ICV (4 bytes). len includes both IV and ICV.
150 *
151 * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on
152 * failure. If frame is OK, IV and ICV will be removed.
153 */
154static int prism2_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
155{
156 struct prism2_wep_data *wep = priv;
157 u32 crc, klen, plen;
158 u8 key[WEP_KEY_LEN + 3];
159 u8 keyidx, *pos, icv[4];
160 struct scatterlist sg;
161
162 if (skb->len < hdr_len + 8)
163 return -1;
164
165 pos = skb->data + hdr_len;
166 key[0] = *pos++;
167 key[1] = *pos++;
168 key[2] = *pos++;
169 keyidx = *pos++ >> 6;
170 if (keyidx != wep->key_idx)
171 return -1;
172
173 klen = 3 + wep->key_len;
174
175 /* Copy rest of the WEP key (the secret part) */
176 memcpy(key + 3, wep->key, wep->key_len);
177
178 /* Apply RC4 to data and compute CRC32 over decrypted data */
179 plen = skb->len - hdr_len - 8;
180
181 crypto_cipher_setkey(wep->tfm, key, klen);
182 sg.page = virt_to_page(pos);
183 sg.offset = offset_in_page(pos);
184 sg.length = plen + 4;
185 crypto_cipher_decrypt(wep->tfm, &sg, &sg, plen + 4);
186
187 crc = ~crc32_le(~0, pos, plen);
188 icv[0] = crc;
189 icv[1] = crc >> 8;
190 icv[2] = crc >> 16;
191 icv[3] = crc >> 24;
192 if (memcmp(icv, pos + plen, 4) != 0) {
193 /* ICV mismatch - drop frame */
194 return -2;
195 }
196
197 /* Remove IV and ICV */
198 memmove(skb->data + 4, skb->data, hdr_len);
199 skb_pull(skb, 4);
200 skb_trim(skb, skb->len - 4);
201
202 return 0;
203}
204
205
206static int prism2_wep_set_key(void *key, int len, u8 *seq, void *priv)
207{
208 struct prism2_wep_data *wep = priv;
209
210 if (len < 0 || len > WEP_KEY_LEN)
211 return -1;
212
213 memcpy(wep->key, key, len);
214 wep->key_len = len;
215
216 return 0;
217}
218
219
220static int prism2_wep_get_key(void *key, int len, u8 *seq, void *priv)
221{
222 struct prism2_wep_data *wep = priv;
223
224 if (len < wep->key_len)
225 return -1;
226
227 memcpy(key, wep->key, wep->key_len);
228
229 return wep->key_len;
230}
231
232
233static char * prism2_wep_print_stats(char *p, void *priv)
234{
235 struct prism2_wep_data *wep = priv;
236 p += sprintf(p, "key[%d] alg=WEP len=%d\n",
237 wep->key_idx, wep->key_len);
238 return p;
239}
240
241
242static struct ieee80211_crypto_ops ieee80211_crypt_wep = {
243 .name = "WEP",
244 .init = prism2_wep_init,
245 .deinit = prism2_wep_deinit,
246 .encrypt_mpdu = prism2_wep_encrypt,
247 .decrypt_mpdu = prism2_wep_decrypt,
248 .encrypt_msdu = NULL,
249 .decrypt_msdu = NULL,
250 .set_key = prism2_wep_set_key,
251 .get_key = prism2_wep_get_key,
252 .print_stats = prism2_wep_print_stats,
253 .extra_prefix_len = 4, /* IV */
254 .extra_postfix_len = 4, /* ICV */
255 .owner = THIS_MODULE,
256};
257
258
259static int __init ieee80211_crypto_wep_init(void)
260{
261 return ieee80211_register_crypto_ops(&ieee80211_crypt_wep);
262}
263
264
265static void __exit ieee80211_crypto_wep_exit(void)
266{
267 ieee80211_unregister_crypto_ops(&ieee80211_crypt_wep);
268}
269
270
271module_init(ieee80211_crypto_wep_init);
272module_exit(ieee80211_crypto_wep_exit);
diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c
new file mode 100644
index 000000000000..553acb2e93d5
--- /dev/null
+++ b/net/ieee80211/ieee80211_module.c
@@ -0,0 +1,299 @@
1/*******************************************************************************
2
3 Copyright(c) 2004 Intel Corporation. All rights reserved.
4
5 Portions of this file are based on the WEP enablement code provided by the
6 Host AP project hostap-drivers v0.1.3
7 Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
8 <jkmaline@cc.hut.fi>
9 Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
10
11 This program is free software; you can redistribute it and/or modify it
12 under the terms of version 2 of the GNU General Public License as
13 published by the Free Software Foundation.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 59
22 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23
24 The full GNU General Public License is included in this distribution in the
25 file called LICENSE.
26
27 Contact Information:
28 James P. Ketrenos <ipw2100-admin@linux.intel.com>
29 Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
30
31*******************************************************************************/
32
33#include <linux/compiler.h>
34#include <linux/config.h>
35#include <linux/errno.h>
36#include <linux/if_arp.h>
37#include <linux/in6.h>
38#include <linux/in.h>
39#include <linux/ip.h>
40#include <linux/kernel.h>
41#include <linux/module.h>
42#include <linux/netdevice.h>
43#include <linux/proc_fs.h>
44#include <linux/skbuff.h>
45#include <linux/slab.h>
46#include <linux/tcp.h>
47#include <linux/types.h>
48#include <linux/version.h>
49#include <linux/wireless.h>
50#include <linux/etherdevice.h>
51#include <asm/uaccess.h>
52#include <net/arp.h>
53
54#include <net/ieee80211.h>
55
56MODULE_DESCRIPTION("802.11 data/management/control stack");
57MODULE_AUTHOR("Copyright (C) 2004 Intel Corporation <jketreno@linux.intel.com>");
58MODULE_LICENSE("GPL");
59
60#define DRV_NAME "ieee80211"
61
62static inline int ieee80211_networks_allocate(struct ieee80211_device *ieee)
63{
64 if (ieee->networks)
65 return 0;
66
67 ieee->networks = kmalloc(
68 MAX_NETWORK_COUNT * sizeof(struct ieee80211_network),
69 GFP_KERNEL);
70 if (!ieee->networks) {
71 printk(KERN_WARNING "%s: Out of memory allocating beacons\n",
72 ieee->dev->name);
73 return -ENOMEM;
74 }
75
76 memset(ieee->networks, 0,
77 MAX_NETWORK_COUNT * sizeof(struct ieee80211_network));
78
79 return 0;
80}
81
82static inline void ieee80211_networks_free(struct ieee80211_device *ieee)
83{
84 if (!ieee->networks)
85 return;
86 kfree(ieee->networks);
87 ieee->networks = NULL;
88}
89
90static inline void ieee80211_networks_initialize(struct ieee80211_device *ieee)
91{
92 int i;
93
94 INIT_LIST_HEAD(&ieee->network_free_list);
95 INIT_LIST_HEAD(&ieee->network_list);
96 for (i = 0; i < MAX_NETWORK_COUNT; i++)
97 list_add_tail(&ieee->networks[i].list, &ieee->network_free_list);
98}
99
100
101struct net_device *alloc_ieee80211(int sizeof_priv)
102{
103 struct ieee80211_device *ieee;
104 struct net_device *dev;
105 int err;
106
107 IEEE80211_DEBUG_INFO("Initializing...\n");
108
109 dev = alloc_etherdev(sizeof(struct ieee80211_device) + sizeof_priv);
110 if (!dev) {
111		IEEE80211_ERROR("Unable to allocate network device.\n");
112 goto failed;
113 }
114 ieee = netdev_priv(dev);
115 dev->hard_start_xmit = ieee80211_xmit;
116
117 ieee->dev = dev;
118
119 err = ieee80211_networks_allocate(ieee);
120 if (err) {
121 IEEE80211_ERROR("Unable to allocate beacon storage: %d\n",
122 err);
123 goto failed;
124 }
125 ieee80211_networks_initialize(ieee);
126
127 /* Default fragmentation threshold is maximum payload size */
128 ieee->fts = DEFAULT_FTS;
129 ieee->scan_age = DEFAULT_MAX_SCAN_AGE;
130 ieee->open_wep = 1;
131
132 /* Default to enabling full open WEP with host based encrypt/decrypt */
133 ieee->host_encrypt = 1;
134 ieee->host_decrypt = 1;
135 ieee->ieee802_1x = 1; /* Default to supporting 802.1x */
136
137 INIT_LIST_HEAD(&ieee->crypt_deinit_list);
138 init_timer(&ieee->crypt_deinit_timer);
139 ieee->crypt_deinit_timer.data = (unsigned long)ieee;
140 ieee->crypt_deinit_timer.function = ieee80211_crypt_deinit_handler;
141
142 spin_lock_init(&ieee->lock);
143
144 ieee->wpa_enabled = 0;
145 ieee->tkip_countermeasures = 0;
146 ieee->drop_unencrypted = 0;
147 ieee->privacy_invoked = 0;
148 ieee->ieee802_1x = 1;
149
150 return dev;
151
152 failed:
153 if (dev)
154 free_netdev(dev);
155 return NULL;
156}
157
158
159void free_ieee80211(struct net_device *dev)
160{
161 struct ieee80211_device *ieee = netdev_priv(dev);
162
163 int i;
164
165 del_timer_sync(&ieee->crypt_deinit_timer);
166 ieee80211_crypt_deinit_entries(ieee, 1);
167
168 for (i = 0; i < WEP_KEYS; i++) {
169 struct ieee80211_crypt_data *crypt = ieee->crypt[i];
170 if (crypt) {
171 if (crypt->ops) {
172 crypt->ops->deinit(crypt->priv);
173 module_put(crypt->ops->owner);
174 }
175 kfree(crypt);
176 ieee->crypt[i] = NULL;
177 }
178 }
179
180 ieee80211_networks_free(ieee);
181 free_netdev(dev);
182}
183
184#ifdef CONFIG_IEEE80211_DEBUG
185
186static int debug = 0;
187u32 ieee80211_debug_level = 0;
188struct proc_dir_entry *ieee80211_proc = NULL;
189
190static int show_debug_level(char *page, char **start, off_t offset,
191 int count, int *eof, void *data)
192{
193 return snprintf(page, count, "0x%08X\n", ieee80211_debug_level);
194}
195
196static int store_debug_level(struct file *file, const char __user *buffer,
197 unsigned long count, void *data)
198{
199 char buf[] = "0x00000000";
200 char *p = (char *)buf;
201 unsigned long val;
202
203 if (count > sizeof(buf) - 1)
204 count = sizeof(buf) - 1;
205
206	if (copy_from_user(buf, buffer, count))
207		return -EFAULT;
208 buf[count] = 0;
209 /*
210 * what a FPOS... What, sscanf(buf, "%i", &val) would be too
211 * scary?
212 */
213 if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
214 p++;
215 if (p[0] == 'x' || p[0] == 'X')
216 p++;
217 val = simple_strtoul(p, &p, 16);
218 } else
219 val = simple_strtoul(p, &p, 10);
220 if (p == buf)
221 printk(KERN_INFO DRV_NAME
222 ": %s is not in hex or decimal form.\n", buf);
223 else
224 ieee80211_debug_level = val;
225
226 return strlen(buf);
227}
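As the comment above hints, the manual "0x" sniffing could be collapsed into a single base-0 conversion, since simple_strtoul (like userspace strtoul) auto-detects a hex prefix when given base 0. A hedged userspace sketch of the equivalent parse, not part of the patch:

/* Illustrative sketch: base 0 lets strtoul auto-detect "0x..." hex
 * versus plain decimal, with the same end-pointer failure check. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *inputs[] = { "0x00000043", "67", "bogus" };
	char *end;
	unsigned long val;
	int i;

	for (i = 0; i < 3; i++) {
		val = strtoul(inputs[i], &end, 0);
		if (end == inputs[i])
			printf("'%s' is not in hex or decimal form.\n",
			       inputs[i]);
		else
			printf("'%s' -> 0x%08lX\n", inputs[i], val);
	}
	return 0;
}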
228
229static int __init ieee80211_init(void)
230{
231 struct proc_dir_entry *e;
232
233 ieee80211_debug_level = debug;
234 ieee80211_proc = create_proc_entry(DRV_NAME, S_IFDIR, proc_net);
235 if (ieee80211_proc == NULL) {
236 IEEE80211_ERROR("Unable to create " DRV_NAME
237 " proc directory\n");
238 return -EIO;
239 }
240 e = create_proc_entry("debug_level", S_IFREG | S_IRUGO | S_IWUSR,
241 ieee80211_proc);
242 if (!e) {
243 remove_proc_entry(DRV_NAME, proc_net);
244 ieee80211_proc = NULL;
245 return -EIO;
246 }
247 e->read_proc = show_debug_level;
248 e->write_proc = store_debug_level;
249 e->data = NULL;
250
251 return 0;
252}
253
254static void __exit ieee80211_exit(void)
255{
256 if (ieee80211_proc) {
257 remove_proc_entry("debug_level", ieee80211_proc);
258 remove_proc_entry(DRV_NAME, proc_net);
259 ieee80211_proc = NULL;
260 }
261}
262
263#include <linux/moduleparam.h>
264module_param(debug, int, 0444);
265MODULE_PARM_DESC(debug, "debug output mask");
266
267
268module_exit(ieee80211_exit);
269module_init(ieee80211_init);
270#endif
271
272
273const char *escape_essid(const char *essid, u8 essid_len) {
274 static char escaped[IW_ESSID_MAX_SIZE * 2 + 1];
275 const char *s = essid;
276 char *d = escaped;
277
278 if (ieee80211_is_empty_essid(essid, essid_len)) {
279 memcpy(escaped, "<hidden>", sizeof("<hidden>"));
280 return escaped;
281 }
282
283 essid_len = min(essid_len, (u8)IW_ESSID_MAX_SIZE);
284 while (essid_len--) {
285 if (*s == '\0') {
286 *d++ = '\\';
287 *d++ = '0';
288 s++;
289 } else {
290 *d++ = *s++;
291 }
292 }
293 *d = '\0';
294 return escaped;
295}
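To see why the escaping matters: an ESSID is a raw octet string, so an embedded NUL would silently truncate a plain "%s" printk. A self-contained userspace sketch of the same loop (not part of the patch), using a hypothetical ESSID value:

/* Illustrative sketch: the escaping loop above, showing that an
 * embedded NUL becomes the two visible characters "\0". */
#include <stdio.h>

int main(void)
{
	const char essid[] = { 'l', 'a', 'b', '\0', 'n', 'e', 't' };
	char escaped[sizeof(essid) * 2 + 1], *d = escaped;
	size_t i;

	for (i = 0; i < sizeof(essid); i++) {
		if (essid[i] == '\0') {
			*d++ = '\\';
			*d++ = '0';
		} else {
			*d++ = essid[i];
		}
	}
	*d = '\0';
	printf("'%s'\n", escaped);	/* prints 'lab\0net' */
	return 0;
}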
296
297EXPORT_SYMBOL(alloc_ieee80211);
298EXPORT_SYMBOL(free_ieee80211);
299EXPORT_SYMBOL(escape_essid);
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
new file mode 100644
index 000000000000..a5905f53aed7
--- /dev/null
+++ b/net/ieee80211/ieee80211_rx.c
@@ -0,0 +1,1189 @@
1/*
2 * Original code based on Host AP (software wireless LAN access point) driver
3 * for Intersil Prism2/2.5/3 - hostap.o module, common routines
4 *
5 * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
6 * <jkmaline@cc.hut.fi>
7 * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
8 * Copyright (c) 2004, Intel Corporation
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation. See README and COPYING for
13 * more details.
14 */
15
16#include <linux/compiler.h>
17#include <linux/config.h>
18#include <linux/errno.h>
19#include <linux/if_arp.h>
20#include <linux/in6.h>
21#include <linux/in.h>
22#include <linux/ip.h>
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/netdevice.h>
26#include <linux/proc_fs.h>
27#include <linux/skbuff.h>
28#include <linux/slab.h>
29#include <linux/tcp.h>
30#include <linux/types.h>
31#include <linux/version.h>
32#include <linux/wireless.h>
33#include <linux/etherdevice.h>
34#include <asm/uaccess.h>
35#include <linux/ctype.h>
36
37#include <net/ieee80211.h>
38
39static inline void ieee80211_monitor_rx(struct ieee80211_device *ieee,
40 struct sk_buff *skb,
41 struct ieee80211_rx_stats *rx_stats)
42{
43 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
44 u16 fc = le16_to_cpu(hdr->frame_ctl);
45
46 skb->dev = ieee->dev;
47 skb->mac.raw = skb->data;
48 skb_pull(skb, ieee80211_get_hdrlen(fc));
49 skb->pkt_type = PACKET_OTHERHOST;
50 skb->protocol = __constant_htons(ETH_P_80211_RAW);
51 memset(skb->cb, 0, sizeof(skb->cb));
52 netif_rx(skb);
53}
54
55
56/* Called only as a tasklet (software IRQ) */
57static struct ieee80211_frag_entry *
58ieee80211_frag_cache_find(struct ieee80211_device *ieee, unsigned int seq,
59 unsigned int frag, u8 *src, u8 *dst)
60{
61 struct ieee80211_frag_entry *entry;
62 int i;
63
64 for (i = 0; i < IEEE80211_FRAG_CACHE_LEN; i++) {
65 entry = &ieee->frag_cache[i];
66 if (entry->skb != NULL &&
67 time_after(jiffies, entry->first_frag_time + 2 * HZ)) {
68 IEEE80211_DEBUG_FRAG(
69 "expiring fragment cache entry "
70 "seq=%u last_frag=%u\n",
71 entry->seq, entry->last_frag);
72 dev_kfree_skb_any(entry->skb);
73 entry->skb = NULL;
74 }
75
76 if (entry->skb != NULL && entry->seq == seq &&
77 (entry->last_frag + 1 == frag || frag == -1) &&
78 memcmp(entry->src_addr, src, ETH_ALEN) == 0 &&
79 memcmp(entry->dst_addr, dst, ETH_ALEN) == 0)
80 return entry;
81 }
82
83 return NULL;
84}
85
86/* Called only as a tasklet (software IRQ) */
87static struct sk_buff *
88ieee80211_frag_cache_get(struct ieee80211_device *ieee,
89 struct ieee80211_hdr *hdr)
90{
91 struct sk_buff *skb = NULL;
92 u16 sc;
93 unsigned int frag, seq;
94 struct ieee80211_frag_entry *entry;
95
96 sc = le16_to_cpu(hdr->seq_ctl);
97 frag = WLAN_GET_SEQ_FRAG(sc);
98 seq = WLAN_GET_SEQ_SEQ(sc);
99
100 if (frag == 0) {
101 /* Reserve enough space to fit maximum frame length */
102 skb = dev_alloc_skb(ieee->dev->mtu +
103 sizeof(struct ieee80211_hdr) +
104 8 /* LLC */ +
105 2 /* alignment */ +
106 8 /* WEP */ + ETH_ALEN /* WDS */);
107 if (skb == NULL)
108 return NULL;
109
110 entry = &ieee->frag_cache[ieee->frag_next_idx];
111 ieee->frag_next_idx++;
112 if (ieee->frag_next_idx >= IEEE80211_FRAG_CACHE_LEN)
113 ieee->frag_next_idx = 0;
114
115 if (entry->skb != NULL)
116 dev_kfree_skb_any(entry->skb);
117
118 entry->first_frag_time = jiffies;
119 entry->seq = seq;
120 entry->last_frag = frag;
121 entry->skb = skb;
122 memcpy(entry->src_addr, hdr->addr2, ETH_ALEN);
123 memcpy(entry->dst_addr, hdr->addr1, ETH_ALEN);
124 } else {
125 /* received a fragment of a frame for which the head fragment
126 * should have already been received */
127 entry = ieee80211_frag_cache_find(ieee, seq, frag, hdr->addr2,
128 hdr->addr1);
129 if (entry != NULL) {
130 entry->last_frag = frag;
131 skb = entry->skb;
132 }
133 }
134
135 return skb;
136}
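The seq/frag split used above follows the 802.11 sequence-control layout: the low 4 bits carry the fragment number and the high 12 bits the sequence number. A minimal sketch (not part of the patch) of what the WLAN_GET_SEQ_FRAG/WLAN_GET_SEQ_SEQ macros are assumed to compute:

/* Illustrative sketch: assumed bit layout of the 802.11 sequence-control
 * field, after le16_to_cpu() conversion of hdr->seq_ctl. */
#include <stdint.h>
#include <stdio.h>

#define SEQ_FRAG(sc)	((sc) & 0x000f)		/* bits 0..3: fragment no. */
#define SEQ_SEQ(sc)	(((sc) & 0xfff0) >> 4)	/* bits 4..15: sequence no. */

int main(void)
{
	uint16_t sc = 0x1a42;	/* hypothetical seq_ctl value */

	printf("seq=%u frag=%u\n", SEQ_SEQ(sc), SEQ_FRAG(sc));	/* 420, 2 */
	return 0;
}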
137
138
139/* Called only as a tasklet (software IRQ) */
140static int ieee80211_frag_cache_invalidate(struct ieee80211_device *ieee,
141 struct ieee80211_hdr *hdr)
142{
143 u16 sc;
144 unsigned int seq;
145 struct ieee80211_frag_entry *entry;
146
147 sc = le16_to_cpu(hdr->seq_ctl);
148 seq = WLAN_GET_SEQ_SEQ(sc);
149
150 entry = ieee80211_frag_cache_find(ieee, seq, -1, hdr->addr2,
151 hdr->addr1);
152
153 if (entry == NULL) {
154 IEEE80211_DEBUG_FRAG(
155 "could not invalidate fragment cache "
156 "entry (seq=%u)\n", seq);
157 return -1;
158 }
159
160 entry->skb = NULL;
161 return 0;
162}
163
164
165#ifdef NOT_YET
166/* ieee80211_rx_frame_mgmt
167 *
168 * Responsible for handling management control frames
169 *
170 * Called by ieee80211_rx */
171static inline int
172ieee80211_rx_frame_mgmt(struct ieee80211_device *ieee, struct sk_buff *skb,
173 struct ieee80211_rx_stats *rx_stats, u16 type,
174 u16 stype)
175{
176 if (ieee->iw_mode == IW_MODE_MASTER) {
177		printk(KERN_DEBUG "%s: Master mode not yet supported.\n",
178 ieee->dev->name);
179 return 0;
180/*
181 hostap_update_sta_ps(ieee, (struct hostap_ieee80211_hdr *)
182 skb->data);*/
183 }
184
185 if (ieee->hostapd && type == WLAN_FC_TYPE_MGMT) {
186 if (stype == WLAN_FC_STYPE_BEACON &&
187 ieee->iw_mode == IW_MODE_MASTER) {
188 struct sk_buff *skb2;
189 /* Process beacon frames also in kernel driver to
190 * update STA(AP) table statistics */
191 skb2 = skb_clone(skb, GFP_ATOMIC);
192 if (skb2)
193 hostap_rx(skb2->dev, skb2, rx_stats);
194 }
195
196 /* send management frames to the user space daemon for
197 * processing */
198 ieee->apdevstats.rx_packets++;
199 ieee->apdevstats.rx_bytes += skb->len;
200 prism2_rx_80211(ieee->apdev, skb, rx_stats, PRISM2_RX_MGMT);
201 return 0;
202 }
203
204 if (ieee->iw_mode == IW_MODE_MASTER) {
205 if (type != WLAN_FC_TYPE_MGMT && type != WLAN_FC_TYPE_CTRL) {
206 printk(KERN_DEBUG "%s: unknown management frame "
207 "(type=0x%02x, stype=0x%02x) dropped\n",
208 skb->dev->name, type, stype);
209 return -1;
210 }
211
212 hostap_rx(skb->dev, skb, rx_stats);
213 return 0;
214 }
215
216 printk(KERN_DEBUG "%s: hostap_rx_frame_mgmt: management frame "
217 "received in non-Host AP mode\n", skb->dev->name);
218 return -1;
219}
220#endif
221
222
223/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
224/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
225static unsigned char rfc1042_header[] =
226{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
227/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */
228static unsigned char bridge_tunnel_header[] =
229{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
230/* No encapsulation header if EtherType < 0x600 (=length) */
231
232/* Called by ieee80211_rx_frame_decrypt */
233static int ieee80211_is_eapol_frame(struct ieee80211_device *ieee,
234 struct sk_buff *skb)
235{
236 struct net_device *dev = ieee->dev;
237 u16 fc, ethertype;
238 struct ieee80211_hdr *hdr;
239 u8 *pos;
240
241 if (skb->len < 24)
242 return 0;
243
244 hdr = (struct ieee80211_hdr *) skb->data;
245 fc = le16_to_cpu(hdr->frame_ctl);
246
247 /* check that the frame is unicast frame to us */
248 if ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
249 IEEE80211_FCTL_TODS &&
250 memcmp(hdr->addr1, dev->dev_addr, ETH_ALEN) == 0 &&
251 memcmp(hdr->addr3, dev->dev_addr, ETH_ALEN) == 0) {
252 /* ToDS frame with own addr BSSID and DA */
253 } else if ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
254 IEEE80211_FCTL_FROMDS &&
255 memcmp(hdr->addr1, dev->dev_addr, ETH_ALEN) == 0) {
256 /* FromDS frame with own addr as DA */
257 } else
258 return 0;
259
260 if (skb->len < 24 + 8)
261 return 0;
262
263 /* check for port access entity Ethernet type */
264 pos = skb->data + 24;
265 ethertype = (pos[6] << 8) | pos[7];
266 if (ethertype == ETH_P_PAE)
267 return 1;
268
269 return 0;
270}
271
272/* Called only as a tasklet (software IRQ), by ieee80211_rx */
273static inline int
274ieee80211_rx_frame_decrypt(struct ieee80211_device* ieee, struct sk_buff *skb,
275 struct ieee80211_crypt_data *crypt)
276{
277 struct ieee80211_hdr *hdr;
278 int res, hdrlen;
279
280 if (crypt == NULL || crypt->ops->decrypt_mpdu == NULL)
281 return 0;
282
283 hdr = (struct ieee80211_hdr *) skb->data;
284 hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
285
286#ifdef CONFIG_IEEE80211_CRYPT_TKIP
287 if (ieee->tkip_countermeasures &&
288 strcmp(crypt->ops->name, "TKIP") == 0) {
289 if (net_ratelimit()) {
290 printk(KERN_DEBUG "%s: TKIP countermeasures: dropped "
291 "received packet from " MAC_FMT "\n",
292 ieee->dev->name, MAC_ARG(hdr->addr2));
293 }
294 return -1;
295 }
296#endif
297
298 atomic_inc(&crypt->refcnt);
299 res = crypt->ops->decrypt_mpdu(skb, hdrlen, crypt->priv);
300 atomic_dec(&crypt->refcnt);
301 if (res < 0) {
302 IEEE80211_DEBUG_DROP(
303 "decryption failed (SA=" MAC_FMT
304 ") res=%d\n", MAC_ARG(hdr->addr2), res);
305 if (res == -2)
306 IEEE80211_DEBUG_DROP("Decryption failed ICV "
307 "mismatch (key %d)\n",
308 skb->data[hdrlen + 3] >> 6);
309 ieee->ieee_stats.rx_discards_undecryptable++;
310 return -1;
311 }
312
313 return res;
314}
315
316
317/* Called only as a tasklet (software IRQ), by ieee80211_rx */
318static inline int
319ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device* ieee, struct sk_buff *skb,
320 int keyidx, struct ieee80211_crypt_data *crypt)
321{
322 struct ieee80211_hdr *hdr;
323 int res, hdrlen;
324
325 if (crypt == NULL || crypt->ops->decrypt_msdu == NULL)
326 return 0;
327
328 hdr = (struct ieee80211_hdr *) skb->data;
329 hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
330
331 atomic_inc(&crypt->refcnt);
332 res = crypt->ops->decrypt_msdu(skb, keyidx, hdrlen, crypt->priv);
333 atomic_dec(&crypt->refcnt);
334 if (res < 0) {
335 printk(KERN_DEBUG "%s: MSDU decryption/MIC verification failed"
336 " (SA=" MAC_FMT " keyidx=%d)\n",
337 ieee->dev->name, MAC_ARG(hdr->addr2), keyidx);
338 return -1;
339 }
340
341 return 0;
342}
343
344
345/* All received frames are sent to this function. @skb contains the frame in
346 * IEEE 802.11 format, i.e., in the format it was sent over air.
347 * This function is called only as a tasklet (software IRQ). */
348int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
349 struct ieee80211_rx_stats *rx_stats)
350{
351 struct net_device *dev = ieee->dev;
352 struct ieee80211_hdr *hdr;
353 size_t hdrlen;
354 u16 fc, type, stype, sc;
355 struct net_device_stats *stats;
356 unsigned int frag;
357 u8 *payload;
358 u16 ethertype;
359#ifdef NOT_YET
360	struct net_device *wds = NULL;
361	struct sk_buff *skb2 = NULL;
363 int frame_authorized = 0;
364 int from_assoc_ap = 0;
365 void *sta = NULL;
366#endif
367 u8 dst[ETH_ALEN];
368 u8 src[ETH_ALEN];
369 struct ieee80211_crypt_data *crypt = NULL;
370 int keyidx = 0;
371
372 hdr = (struct ieee80211_hdr *)skb->data;
373 stats = &ieee->stats;
374
375 if (skb->len < 10) {
376 printk(KERN_INFO "%s: SKB length < 10\n",
377 dev->name);
378 goto rx_dropped;
379 }
380
381 fc = le16_to_cpu(hdr->frame_ctl);
382 type = WLAN_FC_GET_TYPE(fc);
383 stype = WLAN_FC_GET_STYPE(fc);
384 sc = le16_to_cpu(hdr->seq_ctl);
385 frag = WLAN_GET_SEQ_FRAG(sc);
386 hdrlen = ieee80211_get_hdrlen(fc);
387
388#ifdef NOT_YET
389#if WIRELESS_EXT > 15
390 /* Put this code here so that we avoid duplicating it in all
391 * Rx paths. - Jean II */
392#ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */
393 /* If spy monitoring on */
394 if (iface->spy_data.spy_number > 0) {
395 struct iw_quality wstats;
396 wstats.level = rx_stats->signal;
397 wstats.noise = rx_stats->noise;
398 wstats.updated = 6; /* No qual value */
399 /* Update spy records */
400 wireless_spy_update(dev, hdr->addr2, &wstats);
401 }
402#endif /* IW_WIRELESS_SPY */
403#endif /* WIRELESS_EXT > 15 */
404 hostap_update_rx_stats(local->ap, hdr, rx_stats);
405#endif
406
407#if WIRELESS_EXT > 15
408 if (ieee->iw_mode == IW_MODE_MONITOR) {
409 ieee80211_monitor_rx(ieee, skb, rx_stats);
410 stats->rx_packets++;
411 stats->rx_bytes += skb->len;
412 return 1;
413 }
414#endif
415
416 if (ieee->host_decrypt) {
417 int idx = 0;
418 if (skb->len >= hdrlen + 3)
419 idx = skb->data[hdrlen + 3] >> 6;
420 crypt = ieee->crypt[idx];
421#ifdef NOT_YET
422 sta = NULL;
423
424 /* Use station specific key to override default keys if the
425 * receiver address is a unicast address ("individual RA"). If
426 * bcrx_sta_key parameter is set, station specific key is used
427 * even with broad/multicast targets (this is against IEEE
428 * 802.11, but makes it easier to use different keys with
429 * stations that do not support WEP key mapping). */
430
431 if (!(hdr->addr1[0] & 0x01) || local->bcrx_sta_key)
432 (void) hostap_handle_sta_crypto(local, hdr, &crypt,
433 &sta);
434#endif
435
436		/* allow NULL decrypt to indicate a station-specific override
437 * for default encryption */
438 if (crypt && (crypt->ops == NULL ||
439 crypt->ops->decrypt_mpdu == NULL))
440 crypt = NULL;
441
442 if (!crypt && (fc & IEEE80211_FCTL_PROTECTED)) {
443 /* This seems to be triggered by some (multicast?)
444 * frames from other than current BSS, so just drop the
445 * frames silently instead of filling system log with
446 * these reports. */
447 IEEE80211_DEBUG_DROP("Decryption failed (not set)"
448 " (SA=" MAC_FMT ")\n",
449 MAC_ARG(hdr->addr2));
450 ieee->ieee_stats.rx_discards_undecryptable++;
451 goto rx_dropped;
452 }
453 }
454
455#ifdef NOT_YET
456 if (type != WLAN_FC_TYPE_DATA) {
457 if (type == WLAN_FC_TYPE_MGMT && stype == WLAN_FC_STYPE_AUTH &&
458 fc & IEEE80211_FCTL_PROTECTED && ieee->host_decrypt &&
459 (keyidx = hostap_rx_frame_decrypt(ieee, skb, crypt)) < 0)
460 {
461 printk(KERN_DEBUG "%s: failed to decrypt mgmt::auth "
462 "from " MAC_FMT "\n", dev->name,
463 MAC_ARG(hdr->addr2));
464 /* TODO: could inform hostapd about this so that it
465 * could send auth failure report */
466 goto rx_dropped;
467 }
468
469 if (ieee80211_rx_frame_mgmt(ieee, skb, rx_stats, type, stype))
470 goto rx_dropped;
471 else
472 goto rx_exit;
473 }
474#endif
475
476 /* Data frame - extract src/dst addresses */
477 if (skb->len < IEEE80211_3ADDR_LEN)
478 goto rx_dropped;
479
480 switch (fc & (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
481 case IEEE80211_FCTL_FROMDS:
482 memcpy(dst, hdr->addr1, ETH_ALEN);
483 memcpy(src, hdr->addr3, ETH_ALEN);
484 break;
485 case IEEE80211_FCTL_TODS:
486 memcpy(dst, hdr->addr3, ETH_ALEN);
487 memcpy(src, hdr->addr2, ETH_ALEN);
488 break;
489 case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
490 if (skb->len < IEEE80211_4ADDR_LEN)
491 goto rx_dropped;
492 memcpy(dst, hdr->addr3, ETH_ALEN);
493 memcpy(src, hdr->addr4, ETH_ALEN);
494 break;
495 case 0:
496 memcpy(dst, hdr->addr1, ETH_ALEN);
497 memcpy(src, hdr->addr2, ETH_ALEN);
498 break;
499 }
500
501#ifdef NOT_YET
502 if (hostap_rx_frame_wds(ieee, hdr, fc, &wds))
503 goto rx_dropped;
504 if (wds) {
505 skb->dev = dev = wds;
506 stats = hostap_get_stats(dev);
507 }
508
509 if (ieee->iw_mode == IW_MODE_MASTER && !wds &&
510 (fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) == IEEE80211_FCTL_FROMDS &&
511 ieee->stadev &&
512 memcmp(hdr->addr2, ieee->assoc_ap_addr, ETH_ALEN) == 0) {
513 /* Frame from BSSID of the AP for which we are a client */
514 skb->dev = dev = ieee->stadev;
515 stats = hostap_get_stats(dev);
516 from_assoc_ap = 1;
517 }
518#endif
519
520 dev->last_rx = jiffies;
521
522#ifdef NOT_YET
523 if ((ieee->iw_mode == IW_MODE_MASTER ||
524 ieee->iw_mode == IW_MODE_REPEAT) &&
525 !from_assoc_ap) {
526 switch (hostap_handle_sta_rx(ieee, dev, skb, rx_stats,
527 wds != NULL)) {
528 case AP_RX_CONTINUE_NOT_AUTHORIZED:
529 frame_authorized = 0;
530 break;
531 case AP_RX_CONTINUE:
532 frame_authorized = 1;
533 break;
534 case AP_RX_DROP:
535 goto rx_dropped;
536 case AP_RX_EXIT:
537 goto rx_exit;
538 }
539 }
540#endif
541
542 /* Nullfunc frames may have PS-bit set, so they must be passed to
543 * hostap_handle_sta_rx() before being dropped here. */
544 if (stype != IEEE80211_STYPE_DATA &&
545 stype != IEEE80211_STYPE_DATA_CFACK &&
546 stype != IEEE80211_STYPE_DATA_CFPOLL &&
547 stype != IEEE80211_STYPE_DATA_CFACKPOLL) {
548 if (stype != IEEE80211_STYPE_NULLFUNC)
549 IEEE80211_DEBUG_DROP(
550 "RX: dropped data frame "
551 "with no data (type=0x%02x, "
552 "subtype=0x%02x, len=%d)\n",
553 type, stype, skb->len);
554 goto rx_dropped;
555 }
556
557 /* skb: hdr + (possibly fragmented, possibly encrypted) payload */
558
559 if (ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
560 (keyidx = ieee80211_rx_frame_decrypt(ieee, skb, crypt)) < 0)
561 goto rx_dropped;
562
563 hdr = (struct ieee80211_hdr *) skb->data;
564
565 /* skb: hdr + (possibly fragmented) plaintext payload */
566 // PR: FIXME: hostap has additional conditions in the "if" below:
567 // ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
568 if ((frag != 0 || (fc & IEEE80211_FCTL_MOREFRAGS))) {
569 int flen;
570 struct sk_buff *frag_skb = ieee80211_frag_cache_get(ieee, hdr);
571 IEEE80211_DEBUG_FRAG("Rx Fragment received (%u)\n", frag);
572
573 if (!frag_skb) {
574 IEEE80211_DEBUG(IEEE80211_DL_RX | IEEE80211_DL_FRAG,
575 "Rx cannot get skb from fragment "
576 "cache (morefrag=%d seq=%u frag=%u)\n",
577 (fc & IEEE80211_FCTL_MOREFRAGS) != 0,
578 WLAN_GET_SEQ_SEQ(sc), frag);
579 goto rx_dropped;
580 }
581
582 flen = skb->len;
583 if (frag != 0)
584 flen -= hdrlen;
585
586 if (frag_skb->tail + flen > frag_skb->end) {
587 printk(KERN_WARNING "%s: host decrypted and "
588 "reassembled frame did not fit skb\n",
589 dev->name);
590 ieee80211_frag_cache_invalidate(ieee, hdr);
591 goto rx_dropped;
592 }
593
594 if (frag == 0) {
595 /* copy first fragment (including full headers) into
596 * beginning of the fragment cache skb */
597 memcpy(skb_put(frag_skb, flen), skb->data, flen);
598 } else {
599 /* append frame payload to the end of the fragment
600 * cache skb */
601 memcpy(skb_put(frag_skb, flen), skb->data + hdrlen,
602 flen);
603 }
604 dev_kfree_skb_any(skb);
605 skb = NULL;
606
607 if (fc & IEEE80211_FCTL_MOREFRAGS) {
608 /* more fragments expected - leave the skb in fragment
609 * cache for now; it will be delivered to upper layers
610 * after all fragments have been received */
611 goto rx_exit;
612 }
613
614 /* this was the last fragment and the frame will be
615 * delivered, so remove skb from fragment cache */
616 skb = frag_skb;
617 hdr = (struct ieee80211_hdr *) skb->data;
618 ieee80211_frag_cache_invalidate(ieee, hdr);
619 }
620
621	/* skb: hdr + (possibly reassembled) full MSDU payload; possibly still
622 * encrypted/authenticated */
623 if (ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
624 ieee80211_rx_frame_decrypt_msdu(ieee, skb, keyidx, crypt))
625 goto rx_dropped;
626
627 hdr = (struct ieee80211_hdr *) skb->data;
628 if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep) {
629 if (/*ieee->ieee802_1x &&*/
630 ieee80211_is_eapol_frame(ieee, skb)) {
631 /* pass unencrypted EAPOL frames even if encryption is
632 * configured */
633 } else {
634 IEEE80211_DEBUG_DROP(
635 "encryption configured, but RX "
636 "frame not encrypted (SA=" MAC_FMT ")\n",
637 MAC_ARG(hdr->addr2));
638 goto rx_dropped;
639 }
640 }
641
642	if (ieee->drop_unencrypted && !(fc & IEEE80211_FCTL_PROTECTED) &&
643 !ieee80211_is_eapol_frame(ieee, skb)) {
644 IEEE80211_DEBUG_DROP(
645 "dropped unencrypted RX data "
646 "frame from " MAC_FMT
647 " (drop_unencrypted=1)\n",
648 MAC_ARG(hdr->addr2));
649 goto rx_dropped;
650 }
651
652	/* skb: hdr + (possibly reassembled) full plaintext payload */
653
654 payload = skb->data + hdrlen;
655 ethertype = (payload[6] << 8) | payload[7];
656
657#ifdef NOT_YET
658 /* If IEEE 802.1X is used, check whether the port is authorized to send
659 * the received frame. */
660 if (ieee->ieee802_1x && ieee->iw_mode == IW_MODE_MASTER) {
661 if (ethertype == ETH_P_PAE) {
662 printk(KERN_DEBUG "%s: RX: IEEE 802.1X frame\n",
663 dev->name);
664 if (ieee->hostapd && ieee->apdev) {
665 /* Send IEEE 802.1X frames to the user
666 * space daemon for processing */
667 prism2_rx_80211(ieee->apdev, skb, rx_stats,
668 PRISM2_RX_MGMT);
669 ieee->apdevstats.rx_packets++;
670 ieee->apdevstats.rx_bytes += skb->len;
671 goto rx_exit;
672 }
673 } else if (!frame_authorized) {
674 printk(KERN_DEBUG "%s: dropped frame from "
675 "unauthorized port (IEEE 802.1X): "
676 "ethertype=0x%04x\n",
677 dev->name, ethertype);
678 goto rx_dropped;
679 }
680 }
681#endif
682
683 /* convert hdr + possible LLC headers into Ethernet header */
684 if (skb->len - hdrlen >= 8 &&
685 ((memcmp(payload, rfc1042_header, SNAP_SIZE) == 0 &&
686 ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
687 memcmp(payload, bridge_tunnel_header, SNAP_SIZE) == 0)) {
688 /* remove RFC1042 or Bridge-Tunnel encapsulation and
689 * replace EtherType */
690 skb_pull(skb, hdrlen + SNAP_SIZE);
691 memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
692 memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
693 } else {
694 u16 len;
695 /* Leave Ethernet header part of hdr and full payload */
696 skb_pull(skb, hdrlen);
697 len = htons(skb->len);
698 memcpy(skb_push(skb, 2), &len, 2);
699 memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
700 memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
701 }
702
703#ifdef NOT_YET
704 if (wds && ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
705 IEEE80211_FCTL_TODS) &&
706 skb->len >= ETH_HLEN + ETH_ALEN) {
707 /* Non-standard frame: get addr4 from its bogus location after
708 * the payload */
709 memcpy(skb->data + ETH_ALEN,
710 skb->data + skb->len - ETH_ALEN, ETH_ALEN);
711 skb_trim(skb, skb->len - ETH_ALEN);
712 }
713#endif
714
715 stats->rx_packets++;
716 stats->rx_bytes += skb->len;
717
718#ifdef NOT_YET
719 if (ieee->iw_mode == IW_MODE_MASTER && !wds &&
720 ieee->ap->bridge_packets) {
721 if (dst[0] & 0x01) {
722 /* copy multicast frame both to the higher layers and
723 * to the wireless media */
724 ieee->ap->bridged_multicast++;
725 skb2 = skb_clone(skb, GFP_ATOMIC);
726 if (skb2 == NULL)
727 printk(KERN_DEBUG "%s: skb_clone failed for "
728 "multicast frame\n", dev->name);
729 } else if (hostap_is_sta_assoc(ieee->ap, dst)) {
730 /* send frame directly to the associated STA using
731 * wireless media and not passing to higher layers */
732 ieee->ap->bridged_unicast++;
733 skb2 = skb;
734 skb = NULL;
735 }
736 }
737
738 if (skb2 != NULL) {
739 /* send to wireless media */
740 skb2->protocol = __constant_htons(ETH_P_802_3);
741 skb2->mac.raw = skb2->nh.raw = skb2->data;
742 /* skb2->nh.raw = skb2->data + ETH_HLEN; */
743 skb2->dev = dev;
744 dev_queue_xmit(skb2);
745 }
746
747#endif
748
749 if (skb) {
750 skb->protocol = eth_type_trans(skb, dev);
751 memset(skb->cb, 0, sizeof(skb->cb));
752 skb->dev = dev;
753 skb->ip_summed = CHECKSUM_NONE; /* 802.11 crc not sufficient */
754 netif_rx(skb);
755 }
756
757 rx_exit:
758#ifdef NOT_YET
759 if (sta)
760 hostap_handle_sta_release(sta);
761#endif
762 return 1;
763
764 rx_dropped:
765 stats->rx_dropped++;
766
767 /* Returning 0 indicates to caller that we have not handled the SKB--
768 * so it is still allocated and can be used again by underlying
769 * hardware as a DMA target */
770 return 0;
771}
772
773#define MGMT_FRAME_FIXED_PART_LENGTH 0x24
774
775static inline int ieee80211_is_ofdm_rate(u8 rate)
776{
777 switch (rate & ~IEEE80211_BASIC_RATE_MASK) {
778 case IEEE80211_OFDM_RATE_6MB:
779 case IEEE80211_OFDM_RATE_9MB:
780 case IEEE80211_OFDM_RATE_12MB:
781 case IEEE80211_OFDM_RATE_18MB:
782 case IEEE80211_OFDM_RATE_24MB:
783 case IEEE80211_OFDM_RATE_36MB:
784 case IEEE80211_OFDM_RATE_48MB:
785 case IEEE80211_OFDM_RATE_54MB:
786 return 1;
787 }
788 return 0;
789}
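For context, 802.11 supported-rates octets encode the rate in units of 500 kb/s, with the high bit flagging a basic (mandatory) rate -- which is what IEEE80211_BASIC_RATE_MASK is assumed to mask out above. A small decoding sketch, not part of the patch:

/* Illustrative sketch: decoding a supported-rates IE octet, assuming
 * the 802.11 convention of 500 kb/s units with bit 0x80 = basic rate. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t rate = 0x96;	/* hypothetical IE octet */
	int basic = rate & 0x80;
	unsigned int kbps = (rate & 0x7f) * 500;

	/* 0x96 -> "basic rate: 11.0 Mb/s" (11 Mb/s CCK) */
	printf("%s rate: %u.%u Mb/s\n", basic ? "basic" : "optional",
	       kbps / 1000, (kbps % 1000) / 100);
	return 0;
}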
790
791
792static inline int ieee80211_network_init(
793 struct ieee80211_device *ieee,
794 struct ieee80211_probe_response *beacon,
795 struct ieee80211_network *network,
796 struct ieee80211_rx_stats *stats)
797{
798#ifdef CONFIG_IEEE80211_DEBUG
799 char rates_str[64];
800 char *p;
801#endif
802 struct ieee80211_info_element *info_element;
803 u16 left;
804 u8 i;
805
806 /* Pull out fixed field data */
807 memcpy(network->bssid, beacon->header.addr3, ETH_ALEN);
808 network->capability = beacon->capability;
809 network->last_scanned = jiffies;
810 network->time_stamp[0] = beacon->time_stamp[0];
811 network->time_stamp[1] = beacon->time_stamp[1];
812 network->beacon_interval = beacon->beacon_interval;
813 /* Where to pull this? beacon->listen_interval;*/
814 network->listen_interval = 0x0A;
815 network->rates_len = network->rates_ex_len = 0;
816 network->last_associate = 0;
817 network->ssid_len = 0;
818 network->flags = 0;
819 network->atim_window = 0;
820
821 if (stats->freq == IEEE80211_52GHZ_BAND) {
822 /* for A band (No DS info) */
823 network->channel = stats->received_channel;
824 } else
825 network->flags |= NETWORK_HAS_CCK;
826
827 network->wpa_ie_len = 0;
828 network->rsn_ie_len = 0;
829
830 info_element = &beacon->info_element;
831 left = stats->len - ((void *)info_element - (void *)beacon);
832 while (left >= sizeof(struct ieee80211_info_element_hdr)) {
833 if (sizeof(struct ieee80211_info_element_hdr) + info_element->len > left) {
834 IEEE80211_DEBUG_SCAN("SCAN: parse failed: info_element->len + 2 > left : info_element->len+2=%Zd left=%d.\n",
835				     info_element->len + sizeof(struct ieee80211_info_element_hdr),
836 left);
837 return 1;
838 }
839
840 switch (info_element->id) {
841 case MFIE_TYPE_SSID:
842 if (ieee80211_is_empty_essid(info_element->data,
843 info_element->len)) {
844 network->flags |= NETWORK_EMPTY_ESSID;
845 break;
846 }
847
848 network->ssid_len = min(info_element->len,
849 (u8)IW_ESSID_MAX_SIZE);
850 memcpy(network->ssid, info_element->data, network->ssid_len);
851 if (network->ssid_len < IW_ESSID_MAX_SIZE)
852 memset(network->ssid + network->ssid_len, 0,
853 IW_ESSID_MAX_SIZE - network->ssid_len);
854
855 IEEE80211_DEBUG_SCAN("MFIE_TYPE_SSID: '%s' len=%d.\n",
856 network->ssid, network->ssid_len);
857 break;
858
859 case MFIE_TYPE_RATES:
860#ifdef CONFIG_IEEE80211_DEBUG
861 p = rates_str;
862#endif
863 network->rates_len = min(info_element->len, MAX_RATES_LENGTH);
864 for (i = 0; i < network->rates_len; i++) {
865 network->rates[i] = info_element->data[i];
866#ifdef CONFIG_IEEE80211_DEBUG
867 p += snprintf(p, sizeof(rates_str) - (p - rates_str), "%02X ", network->rates[i]);
868#endif
869 if (ieee80211_is_ofdm_rate(info_element->data[i])) {
870 network->flags |= NETWORK_HAS_OFDM;
871 if (info_element->data[i] &
872 IEEE80211_BASIC_RATE_MASK)
873 network->flags &=
874 ~NETWORK_HAS_CCK;
875 }
876 }
877
878 IEEE80211_DEBUG_SCAN("MFIE_TYPE_RATES: '%s' (%d)\n",
879 rates_str, network->rates_len);
880 break;
881
882 case MFIE_TYPE_RATES_EX:
883#ifdef CONFIG_IEEE80211_DEBUG
884 p = rates_str;
885#endif
886 network->rates_ex_len = min(info_element->len, MAX_RATES_EX_LENGTH);
887 for (i = 0; i < network->rates_ex_len; i++) {
888 network->rates_ex[i] = info_element->data[i];
889#ifdef CONFIG_IEEE80211_DEBUG
890				p += snprintf(p, sizeof(rates_str) - (p - rates_str), "%02X ", network->rates_ex[i]);
891#endif
892 if (ieee80211_is_ofdm_rate(info_element->data[i])) {
893 network->flags |= NETWORK_HAS_OFDM;
894 if (info_element->data[i] &
895 IEEE80211_BASIC_RATE_MASK)
896 network->flags &=
897 ~NETWORK_HAS_CCK;
898 }
899 }
900
901 IEEE80211_DEBUG_SCAN("MFIE_TYPE_RATES_EX: '%s' (%d)\n",
902 rates_str, network->rates_ex_len);
903 break;
904
905 case MFIE_TYPE_DS_SET:
906 IEEE80211_DEBUG_SCAN("MFIE_TYPE_DS_SET: %d\n",
907 info_element->data[0]);
908 if (stats->freq == IEEE80211_24GHZ_BAND)
909 network->channel = info_element->data[0];
910 break;
911
912 case MFIE_TYPE_FH_SET:
913 IEEE80211_DEBUG_SCAN("MFIE_TYPE_FH_SET: ignored\n");
914 break;
915
916 case MFIE_TYPE_CF_SET:
917 IEEE80211_DEBUG_SCAN("MFIE_TYPE_CF_SET: ignored\n");
918 break;
919
920 case MFIE_TYPE_TIM:
921 IEEE80211_DEBUG_SCAN("MFIE_TYPE_TIM: ignored\n");
922 break;
923
924 case MFIE_TYPE_IBSS_SET:
925 IEEE80211_DEBUG_SCAN("MFIE_TYPE_IBSS_SET: ignored\n");
926 break;
927
928 case MFIE_TYPE_CHALLENGE:
929 IEEE80211_DEBUG_SCAN("MFIE_TYPE_CHALLENGE: ignored\n");
930 break;
931
932 case MFIE_TYPE_GENERIC:
933 IEEE80211_DEBUG_SCAN("MFIE_TYPE_GENERIC: %d bytes\n",
934 info_element->len);
935 if (info_element->len >= 4 &&
936 info_element->data[0] == 0x00 &&
937 info_element->data[1] == 0x50 &&
938 info_element->data[2] == 0xf2 &&
939 info_element->data[3] == 0x01) {
940 network->wpa_ie_len = min(info_element->len + 2,
941 MAX_WPA_IE_LEN);
942 memcpy(network->wpa_ie, info_element,
943 network->wpa_ie_len);
944 }
945 break;
946
947 case MFIE_TYPE_RSN:
948 IEEE80211_DEBUG_SCAN("MFIE_TYPE_RSN: %d bytes\n",
949 info_element->len);
950 network->rsn_ie_len = min(info_element->len + 2,
951 MAX_WPA_IE_LEN);
952 memcpy(network->rsn_ie, info_element,
953 network->rsn_ie_len);
954 break;
955
956 default:
957 IEEE80211_DEBUG_SCAN("unsupported IE %d\n",
958 info_element->id);
959 break;
960 }
961
962 left -= sizeof(struct ieee80211_info_element_hdr) +
963 info_element->len;
964 info_element = (struct ieee80211_info_element *)
965 &info_element->data[info_element->len];
966 }
967
968 network->mode = 0;
969 if (stats->freq == IEEE80211_52GHZ_BAND)
970 network->mode = IEEE_A;
971 else {
972 if (network->flags & NETWORK_HAS_OFDM)
973 network->mode |= IEEE_G;
974 if (network->flags & NETWORK_HAS_CCK)
975 network->mode |= IEEE_B;
976 }
977
978 if (network->mode == 0) {
979 IEEE80211_DEBUG_SCAN("Filtered out '%s (" MAC_FMT ")' "
980 "network.\n",
981 escape_essid(network->ssid,
982 network->ssid_len),
983 MAC_ARG(network->bssid));
984 return 1;
985 }
986
987 if (ieee80211_is_empty_essid(network->ssid, network->ssid_len))
988 network->flags |= NETWORK_EMPTY_ESSID;
989
990 memcpy(&network->stats, stats, sizeof(network->stats));
991
992 return 0;
993}
994
995static inline int is_same_network(struct ieee80211_network *src,
996 struct ieee80211_network *dst)
997{
998 /* A network is only a duplicate if the channel, BSSID, and ESSID
999 * all match. We treat all <hidden> with the same BSSID and channel
1000 * as one network */
1001 return ((src->ssid_len == dst->ssid_len) &&
1002 (src->channel == dst->channel) &&
1003 !memcmp(src->bssid, dst->bssid, ETH_ALEN) &&
1004 !memcmp(src->ssid, dst->ssid, src->ssid_len));
1005}
1006
1007static inline void update_network(struct ieee80211_network *dst,
1008 struct ieee80211_network *src)
1009{
1010 memcpy(&dst->stats, &src->stats, sizeof(struct ieee80211_rx_stats));
1011 dst->capability = src->capability;
1012 memcpy(dst->rates, src->rates, src->rates_len);
1013 dst->rates_len = src->rates_len;
1014 memcpy(dst->rates_ex, src->rates_ex, src->rates_ex_len);
1015 dst->rates_ex_len = src->rates_ex_len;
1016
1017 dst->mode = src->mode;
1018 dst->flags = src->flags;
1019 dst->time_stamp[0] = src->time_stamp[0];
1020 dst->time_stamp[1] = src->time_stamp[1];
1021
1022 dst->beacon_interval = src->beacon_interval;
1023 dst->listen_interval = src->listen_interval;
1024 dst->atim_window = src->atim_window;
1025
1026 memcpy(dst->wpa_ie, src->wpa_ie, src->wpa_ie_len);
1027 dst->wpa_ie_len = src->wpa_ie_len;
1028 memcpy(dst->rsn_ie, src->rsn_ie, src->rsn_ie_len);
1029 dst->rsn_ie_len = src->rsn_ie_len;
1030
1031 dst->last_scanned = jiffies;
1032 /* dst->last_associate is not overwritten */
1033}
1034
1035static inline void ieee80211_process_probe_response(
1036 struct ieee80211_device *ieee,
1037 struct ieee80211_probe_response *beacon,
1038 struct ieee80211_rx_stats *stats)
1039{
1040 struct ieee80211_network network;
1041 struct ieee80211_network *target;
1042 struct ieee80211_network *oldest = NULL;
1043#ifdef CONFIG_IEEE80211_DEBUG
1044 struct ieee80211_info_element *info_element = &beacon->info_element;
1045#endif
1046 unsigned long flags;
1047
1048 IEEE80211_DEBUG_SCAN(
1049 "'%s' (" MAC_FMT "): %c%c%c%c %c%c%c%c-%c%c%c%c %c%c%c%c\n",
1050 escape_essid(info_element->data, info_element->len),
1051 MAC_ARG(beacon->header.addr3),
1052 (beacon->capability & (1<<0xf)) ? '1' : '0',
1053 (beacon->capability & (1<<0xe)) ? '1' : '0',
1054 (beacon->capability & (1<<0xd)) ? '1' : '0',
1055 (beacon->capability & (1<<0xc)) ? '1' : '0',
1056 (beacon->capability & (1<<0xb)) ? '1' : '0',
1057 (beacon->capability & (1<<0xa)) ? '1' : '0',
1058 (beacon->capability & (1<<0x9)) ? '1' : '0',
1059 (beacon->capability & (1<<0x8)) ? '1' : '0',
1060 (beacon->capability & (1<<0x7)) ? '1' : '0',
1061 (beacon->capability & (1<<0x6)) ? '1' : '0',
1062 (beacon->capability & (1<<0x5)) ? '1' : '0',
1063 (beacon->capability & (1<<0x4)) ? '1' : '0',
1064 (beacon->capability & (1<<0x3)) ? '1' : '0',
1065 (beacon->capability & (1<<0x2)) ? '1' : '0',
1066 (beacon->capability & (1<<0x1)) ? '1' : '0',
1067 (beacon->capability & (1<<0x0)) ? '1' : '0');
1068
1069 if (ieee80211_network_init(ieee, beacon, &network, stats)) {
1070 IEEE80211_DEBUG_SCAN("Dropped '%s' (" MAC_FMT ") via %s.\n",
1071 escape_essid(info_element->data,
1072 info_element->len),
1073 MAC_ARG(beacon->header.addr3),
1074 WLAN_FC_GET_STYPE(beacon->header.frame_ctl) ==
1075 IEEE80211_STYPE_PROBE_RESP ?
1076 "PROBE RESPONSE" : "BEACON");
1077 return;
1078 }
1079
1080 /* The network parsed correctly -- so now we scan our known networks
1081 * to see if we can find it in our list.
1082 *
1083	 * NOTE: This search is definitely not optimized.  Once it's doing
1084 * the "right thing" we'll optimize it for efficiency if
1085 * necessary */
1086
1087 /* Search for this entry in the list and update it if it is
1088 * already there. */
1089
1090 spin_lock_irqsave(&ieee->lock, flags);
1091
1092 list_for_each_entry(target, &ieee->network_list, list) {
1093 if (is_same_network(target, &network))
1094 break;
1095
1096 if ((oldest == NULL) ||
1097 (target->last_scanned < oldest->last_scanned))
1098 oldest = target;
1099 }
1100
1101 /* If we didn't find a match, then get a new network slot to initialize
1102 * with this beacon's information */
1103 if (&target->list == &ieee->network_list) {
1104 if (list_empty(&ieee->network_free_list)) {
1105 /* If there are no more slots, expire the oldest */
1106 list_del(&oldest->list);
1107 target = oldest;
1108 IEEE80211_DEBUG_SCAN("Expired '%s' (" MAC_FMT ") from "
1109 "network list.\n",
1110 escape_essid(target->ssid,
1111 target->ssid_len),
1112 MAC_ARG(target->bssid));
1113 } else {
1114 /* Otherwise just pull from the free list */
1115 target = list_entry(ieee->network_free_list.next,
1116 struct ieee80211_network, list);
1117 list_del(ieee->network_free_list.next);
1118 }
1119
1120
1121#ifdef CONFIG_IEEE80211_DEBUG
1122 IEEE80211_DEBUG_SCAN("Adding '%s' (" MAC_FMT ") via %s.\n",
1123 escape_essid(network.ssid,
1124 network.ssid_len),
1125 MAC_ARG(network.bssid),
1126 WLAN_FC_GET_STYPE(beacon->header.frame_ctl) ==
1127 IEEE80211_STYPE_PROBE_RESP ?
1128 "PROBE RESPONSE" : "BEACON");
1129#endif
1130 memcpy(target, &network, sizeof(*target));
1131 list_add_tail(&target->list, &ieee->network_list);
1132 } else {
1133 IEEE80211_DEBUG_SCAN("Updating '%s' (" MAC_FMT ") via %s.\n",
1134 escape_essid(target->ssid,
1135 target->ssid_len),
1136 MAC_ARG(target->bssid),
1137 WLAN_FC_GET_STYPE(beacon->header.frame_ctl) ==
1138 IEEE80211_STYPE_PROBE_RESP ?
1139 "PROBE RESPONSE" : "BEACON");
1140 update_network(target, &network);
1141 }
1142
1143 spin_unlock_irqrestore(&ieee->lock, flags);
1144}
1145
1146void ieee80211_rx_mgt(struct ieee80211_device *ieee,
1147 struct ieee80211_hdr *header,
1148 struct ieee80211_rx_stats *stats)
1149{
1150	switch (WLAN_FC_GET_STYPE(le16_to_cpu(header->frame_ctl))) {
1151 case IEEE80211_STYPE_ASSOC_RESP:
1152 IEEE80211_DEBUG_MGMT("received ASSOCIATION RESPONSE (%d)\n",
1153 WLAN_FC_GET_STYPE(header->frame_ctl));
1154 break;
1155
1156 case IEEE80211_STYPE_REASSOC_RESP:
1157 IEEE80211_DEBUG_MGMT("received REASSOCIATION RESPONSE (%d)\n",
1158 WLAN_FC_GET_STYPE(header->frame_ctl));
1159 break;
1160
1161 case IEEE80211_STYPE_PROBE_RESP:
1162 IEEE80211_DEBUG_MGMT("received PROBE RESPONSE (%d)\n",
1163 WLAN_FC_GET_STYPE(header->frame_ctl));
1164 IEEE80211_DEBUG_SCAN("Probe response\n");
1165 ieee80211_process_probe_response(
1166 ieee, (struct ieee80211_probe_response *)header, stats);
1167 break;
1168
1169 case IEEE80211_STYPE_BEACON:
1170 IEEE80211_DEBUG_MGMT("received BEACON (%d)\n",
1171 WLAN_FC_GET_STYPE(header->frame_ctl));
1172 IEEE80211_DEBUG_SCAN("Beacon\n");
1173 ieee80211_process_probe_response(
1174 ieee, (struct ieee80211_probe_response *)header, stats);
1175 break;
1176
1177 default:
1178 IEEE80211_DEBUG_MGMT("received UNKNOWN (%d)\n",
1179 WLAN_FC_GET_STYPE(header->frame_ctl));
1180 IEEE80211_WARNING("%s: Unknown management packet: %d\n",
1181 ieee->dev->name,
1182 WLAN_FC_GET_STYPE(header->frame_ctl));
1183 break;
1184 }
1185}
1186
1187
1188EXPORT_SYMBOL(ieee80211_rx_mgt);
1189EXPORT_SYMBOL(ieee80211_rx);
diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c
new file mode 100644
index 000000000000..b7ea3e25e25d
--- /dev/null
+++ b/net/ieee80211/ieee80211_tx.c
@@ -0,0 +1,438 @@
1/******************************************************************************
2
3 Copyright(c) 2003 - 2004 Intel Corporation. All rights reserved.
4
5 This program is free software; you can redistribute it and/or modify it
6 under the terms of version 2 of the GNU General Public License as
7 published by the Free Software Foundation.
8
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 more details.
13
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc., 59
16 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17
18 The full GNU General Public License is included in this distribution in the
19 file called LICENSE.
20
21 Contact Information:
22 James P. Ketrenos <ipw2100-admin@linux.intel.com>
23 Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
24
25******************************************************************************/
26#include <linux/compiler.h>
27#include <linux/config.h>
28#include <linux/errno.h>
29#include <linux/if_arp.h>
30#include <linux/in6.h>
31#include <linux/in.h>
32#include <linux/ip.h>
33#include <linux/kernel.h>
34#include <linux/module.h>
35#include <linux/netdevice.h>
36#include <linux/proc_fs.h>
37#include <linux/skbuff.h>
38#include <linux/slab.h>
39#include <linux/tcp.h>
40#include <linux/types.h>
41#include <linux/version.h>
42#include <linux/wireless.h>
43#include <linux/etherdevice.h>
44#include <asm/uaccess.h>
45
46#include <net/ieee80211.h>
47
48
49/*
50
51
52802.11 Data Frame
53
54 ,-------------------------------------------------------------------.
55Bytes | 2 | 2 | 6 | 6 | 6 | 2 | 0..2312 | 4 |
56 |------|------|---------|---------|---------|------|---------|------|
57Desc. | ctrl | dura | DA/RA | TA | SA | Sequ | Frame | fcs |
58 | | tion | (BSSID) | | | ence | data | |
59 `--------------------------------------------------| |------'
60Total: 28 non-data bytes `----.----'
61 |
62 .- 'Frame data' expands to <---------------------------'
63 |
64 V
65 ,---------------------------------------------------.
66Bytes | 1 | 1 | 1 | 3 | 2 | 0-2304 |
67 |------|------|---------|----------|------|---------|
68Desc. | SNAP | SNAP | Control |Eth Tunnel| Type | IP |
69 | DSAP | SSAP | | | | Packet |
70 | 0xAA | 0xAA |0x03 (UI)|0x00-00-F8| | |
71 `-----------------------------------------| |
72Total: 8 non-data bytes `----.----'
73 |
74 .- 'IP Packet' expands, if WEP enabled, to <--'
75 |
76 V
77 ,-----------------------.
78Bytes | 4 | 0-2296 | 4 |
79 |-----|-----------|-----|
80Desc. | IV | Encrypted | ICV |
81 | | IP Packet | |
82 `-----------------------'
83Total: 8 non-data bytes
84
85
86802.3 Ethernet Data Frame
87
88 ,-----------------------------------------.
89Bytes | 6 | 6 | 2 | Variable | 4 |
90 |-------|-------|------|-----------|------|
91Desc. | Dest. | Source| Type | IP Packet | fcs |
92 | MAC | MAC | | | |
93 `-----------------------------------------'
94Total: 18 non-data bytes
95
96In the event that fragmentation is required, the incoming payload is split into
97N parts of size ieee->fts. The first fragment contains the SNAP header and the
98remaining packets are just data.
99
100If encryption is enabled, each fragment payload size is reduced by enough space
101to add the prefix and postfix (IV and ICV totalling 8 bytes in the case of WEP).
102So if you have 1500 bytes of payload with ieee->fts set to 500, without
103encryption it will take 3 frames. With WEP it will take 4 frames as the
104payload of each frame is reduced to 492 bytes.
105
106* SKB visualization
107*
108* ,- skb->data
109* |
110* | ETHERNET HEADER ,-<-- PAYLOAD
111* | | 14 bytes from skb->data
112* | 2 bytes for Type --> ,T. | (sizeof ethhdr)
113* | | | |
114* |,-Dest.--. ,--Src.---. | | |
115* | 6 bytes| | 6 bytes | | | |
116* v | | | | | |
117* 0 | v 1 | v | v 2
118* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
119* ^ | ^ | ^ |
120* | | | | | |
121* | | | | `T' <---- 2 bytes for Type
122* | | | |
123* | | '---SNAP--' <-------- 6 bytes for SNAP
124* | |
125* `-IV--' <-------------------- 4 bytes for IV (WEP)
126*
127* SNAP HEADER
128*
129*/
130
131static u8 P802_1H_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0xf8 };
132static u8 RFC1042_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0x00 };
133
134static inline int ieee80211_put_snap(u8 *data, u16 h_proto)
135{
136 struct ieee80211_snap_hdr *snap;
137 u8 *oui;
138
139 snap = (struct ieee80211_snap_hdr *)data;
140 snap->dsap = 0xaa;
141 snap->ssap = 0xaa;
142 snap->ctrl = 0x03;
143
144 if (h_proto == 0x8137 || h_proto == 0x80f3)
145 oui = P802_1H_OUI;
146 else
147 oui = RFC1042_OUI;
148 snap->oui[0] = oui[0];
149 snap->oui[1] = oui[1];
150 snap->oui[2] = oui[2];
151
152 *(u16 *)(data + SNAP_SIZE) = htons(h_proto);
153
154 return SNAP_SIZE + sizeof(u16);
155}
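As a concrete example, for an IPv4 payload (h_proto 0x0800) the eight bytes written above form the RFC 1042 encapsulation. A sketch of the resulting octets, not part of the patch:

/* Illustrative sketch: the 8 bytes ieee80211_put_snap() emits for
 * EtherType 0x0800 (IPv4) -- the RFC 1042 LLC/SNAP encapsulation. */
static const u8 snap_ip_example[8] = {
	0xaa, 0xaa,		/* DSAP, SSAP */
	0x03,			/* control: UI */
	0x00, 0x00, 0x00,	/* RFC 1042 OUI */
	0x08, 0x00,		/* EtherType ETH_P_IP, big-endian */
};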
156
157static inline int ieee80211_encrypt_fragment(
158 struct ieee80211_device *ieee,
159 struct sk_buff *frag,
160 int hdr_len)
161{
162 struct ieee80211_crypt_data* crypt = ieee->crypt[ieee->tx_keyidx];
163 int res;
164
165#ifdef CONFIG_IEEE80211_CRYPT_TKIP
166 struct ieee80211_hdr *header;
167
168 if (ieee->tkip_countermeasures &&
169 crypt && crypt->ops && strcmp(crypt->ops->name, "TKIP") == 0) {
170 header = (struct ieee80211_hdr *) frag->data;
171 if (net_ratelimit()) {
172 printk(KERN_DEBUG "%s: TKIP countermeasures: dropped "
173 "TX packet to " MAC_FMT "\n",
174 ieee->dev->name, MAC_ARG(header->addr1));
175 }
176 return -1;
177 }
178#endif
179 /* To encrypt, frame format is:
180 * IV (4 bytes), clear payload (including SNAP), ICV (4 bytes) */
181
182 // PR: FIXME: Copied from hostap. Check fragmentation/MSDU/MPDU encryption.
183 /* Host-based IEEE 802.11 fragmentation for TX is not yet supported, so
184 * call both MSDU and MPDU encryption functions from here. */
185 atomic_inc(&crypt->refcnt);
186 res = 0;
187 if (crypt->ops->encrypt_msdu)
188 res = crypt->ops->encrypt_msdu(frag, hdr_len, crypt->priv);
189 if (res == 0 && crypt->ops->encrypt_mpdu)
190 res = crypt->ops->encrypt_mpdu(frag, hdr_len, crypt->priv);
191
192 atomic_dec(&crypt->refcnt);
193 if (res < 0) {
194 printk(KERN_INFO "%s: Encryption failed: len=%d.\n",
195 ieee->dev->name, frag->len);
196 ieee->ieee_stats.tx_discards++;
197 return -1;
198 }
199
200 return 0;
201}
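Callers must leave room for the cipher's prefix and postfix before this runs. A hedged sketch (not part of the patch) of the per-fragment skb sizing implied by the crypto-ops fields declared earlier in this patch (extra_prefix_len/extra_postfix_len of 4/4 for WEP, 8/12 for TKIP):

/* Illustrative sketch: size a fragment skb so encrypt_msdu/encrypt_mpdu
 * have the head- and tailroom their ops structure advertises. */
static inline int frag_skb_size(struct ieee80211_crypt_data *crypt,
				int hdr_len, int payload_len)
{
	int size = hdr_len + payload_len;

	if (crypt && crypt->ops)
		size += crypt->ops->extra_prefix_len +
			crypt->ops->extra_postfix_len;
	return size;	/* e.g. pass to dev_alloc_skb() */
}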
202
203
204void ieee80211_txb_free(struct ieee80211_txb *txb) {
205 int i;
206 if (unlikely(!txb))
207 return;
208 for (i = 0; i < txb->nr_frags; i++)
209 if (txb->fragments[i])
210 dev_kfree_skb_any(txb->fragments[i]);
211 kfree(txb);
212}
213
214static struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size,
215 int gfp_mask)
216{
217 struct ieee80211_txb *txb;
218 int i;
219 txb = kmalloc(
220		sizeof(struct ieee80211_txb) + (sizeof(struct sk_buff *) * nr_frags),
221 gfp_mask);
222 if (!txb)
223 return NULL;
224
225 memset(txb, 0, sizeof(struct ieee80211_txb));
226 txb->nr_frags = nr_frags;
227 txb->frag_size = txb_size;
228
229 for (i = 0; i < nr_frags; i++) {
230 txb->fragments[i] = dev_alloc_skb(txb_size);
231 if (unlikely(!txb->fragments[i])) {
232 i--;
233 break;
234 }
235 }
236 if (unlikely(i != nr_frags)) {
237 while (i >= 0)
238 dev_kfree_skb_any(txb->fragments[i--]);
239 kfree(txb);
240 return NULL;
241 }
242 return txb;
243}
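/*
 * Usage sketch (hypothetical caller): the allocator either returns a
 * TXB with all nr_frags skbs in place or NULL with nothing leaked, so
 * a caller only ever pairs one alloc with one free.
 */
#if 0	/* illustration only */
	struct ieee80211_txb *txb = ieee80211_alloc_txb(3, 1500, GFP_ATOMIC);
	if (!txb)
		return -ENOMEM;
	/* ... fill txb->fragments[0..2] ... */
	ieee80211_txb_free(txb);
#endif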
244
245/* SKBs are added to the ieee->tx_queue. */
246int ieee80211_xmit(struct sk_buff *skb,
247 struct net_device *dev)
248{
249 struct ieee80211_device *ieee = netdev_priv(dev);
250 struct ieee80211_txb *txb = NULL;
251 struct ieee80211_hdr *frag_hdr;
252 int i, bytes_per_frag, nr_frags, bytes_last_frag, frag_size;
253 unsigned long flags;
254 struct net_device_stats *stats = &ieee->stats;
255 int ether_type, encrypt;
256 int bytes, fc, hdr_len;
257 struct sk_buff *skb_frag;
258 struct ieee80211_hdr header = { /* Ensure zero initialized */
259 .duration_id = 0,
260 .seq_ctl = 0
261 };
262 u8 dest[ETH_ALEN], src[ETH_ALEN];
263
264 struct ieee80211_crypt_data* crypt;
265
266 spin_lock_irqsave(&ieee->lock, flags);
267
268	/* If there is no driver handler to take the TXB, don't bother
269 * creating it... */
270 if (!ieee->hard_start_xmit) {
271 printk(KERN_WARNING "%s: No xmit handler.\n",
272 ieee->dev->name);
273 goto success;
274 }
275
276 if (unlikely(skb->len < SNAP_SIZE + sizeof(u16))) {
277 printk(KERN_WARNING "%s: skb too small (%d).\n",
278 ieee->dev->name, skb->len);
279 goto success;
280 }
281
282 ether_type = ntohs(((struct ethhdr *)skb->data)->h_proto);
283
284 crypt = ieee->crypt[ieee->tx_keyidx];
285
286 encrypt = !(ether_type == ETH_P_PAE && ieee->ieee802_1x) &&
287 ieee->host_encrypt && crypt && crypt->ops;
288
289 if (!encrypt && ieee->ieee802_1x &&
290 ieee->drop_unencrypted && ether_type != ETH_P_PAE) {
291 stats->tx_dropped++;
292 goto success;
293 }
294
295 /* Save source and destination addresses */
296 memcpy(&dest, skb->data, ETH_ALEN);
297 memcpy(&src, skb->data+ETH_ALEN, ETH_ALEN);
298
299 /* Advance the SKB to the start of the payload */
300 skb_pull(skb, sizeof(struct ethhdr));
301
302 /* Determine total amount of storage required for TXB packets */
303 bytes = skb->len + SNAP_SIZE + sizeof(u16);
304
305 if (encrypt)
306 fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA |
307 IEEE80211_FCTL_PROTECTED;
308 else
309 fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA;
310
311 if (ieee->iw_mode == IW_MODE_INFRA) {
312 fc |= IEEE80211_FCTL_TODS;
313 /* To DS: Addr1 = BSSID, Addr2 = SA,
314 Addr3 = DA */
315 memcpy(&header.addr1, ieee->bssid, ETH_ALEN);
316 memcpy(&header.addr2, &src, ETH_ALEN);
317 memcpy(&header.addr3, &dest, ETH_ALEN);
318 } else if (ieee->iw_mode == IW_MODE_ADHOC) {
319 /* not From/To DS: Addr1 = DA, Addr2 = SA,
320 Addr3 = BSSID */
321 memcpy(&header.addr1, dest, ETH_ALEN);
322 memcpy(&header.addr2, src, ETH_ALEN);
323 memcpy(&header.addr3, ieee->bssid, ETH_ALEN);
324 }
325 header.frame_ctl = cpu_to_le16(fc);
326 hdr_len = IEEE80211_3ADDR_LEN;
327
328 /* Determine fragmentation size based on destination (multicast
329 * and broadcast are not fragmented) */
330 if (is_multicast_ether_addr(dest) ||
331 is_broadcast_ether_addr(dest))
332 frag_size = MAX_FRAG_THRESHOLD;
333 else
334 frag_size = ieee->fts;
335
336	/* Determine amount of payload per fragment.  Regardless of whether
337	 * this stack provides the full 802.11 header, one will
338 * eventually be affixed to this fragment -- so we must account for
339 * it when determining the amount of payload space. */
340 bytes_per_frag = frag_size - IEEE80211_3ADDR_LEN;
341 if (ieee->config &
342 (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
343 bytes_per_frag -= IEEE80211_FCS_LEN;
344
345	/* Each fragment may need room for the encryption prefix/postfix */
346 if (encrypt)
347 bytes_per_frag -= crypt->ops->extra_prefix_len +
348 crypt->ops->extra_postfix_len;
349
350	/* Number of fragments is the total payload bytes divided by
351	 * the payload bytes per fragment */
352 nr_frags = bytes / bytes_per_frag;
353 bytes_last_frag = bytes % bytes_per_frag;
354 if (bytes_last_frag)
355 nr_frags++;
356 else
357 bytes_last_frag = bytes_per_frag;
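	/* Worked example (illustrative numbers): with bytes == 1508
	 * (1500-byte payload + 8-byte SNAP/type) and bytes_per_frag ==
	 * 500, 1508 / 500 = 3 with an 8-byte remainder, so nr_frags
	 * becomes 4 and the last fragment carries 8 bytes. */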
358
359 /* When we allocate the TXB we allocate enough space for the reserve
360 * and full fragment bytes (bytes_per_frag doesn't include prefix,
361 * postfix, header, FCS, etc.) */
362 txb = ieee80211_alloc_txb(nr_frags, frag_size, GFP_ATOMIC);
363 if (unlikely(!txb)) {
364 printk(KERN_WARNING "%s: Could not allocate TXB\n",
365 ieee->dev->name);
366 goto failed;
367 }
368 txb->encrypted = encrypt;
369 txb->payload_size = bytes;
370
371 for (i = 0; i < nr_frags; i++) {
372 skb_frag = txb->fragments[i];
373
374 if (encrypt)
375 skb_reserve(skb_frag, crypt->ops->extra_prefix_len);
376
377 frag_hdr = (struct ieee80211_hdr *)skb_put(skb_frag, hdr_len);
378 memcpy(frag_hdr, &header, hdr_len);
379
380 /* If this is not the last fragment, then add the MOREFRAGS
381 * bit to the frame control */
382 if (i != nr_frags - 1) {
383 frag_hdr->frame_ctl = cpu_to_le16(
384 fc | IEEE80211_FCTL_MOREFRAGS);
385 bytes = bytes_per_frag;
386 } else {
387 /* The last fragment takes the remaining length */
388 bytes = bytes_last_frag;
389 }
390
391 /* Put a SNAP header on the first fragment */
392 if (i == 0) {
393 ieee80211_put_snap(
394 skb_put(skb_frag, SNAP_SIZE + sizeof(u16)),
395 ether_type);
396 bytes -= SNAP_SIZE + sizeof(u16);
397 }
398
399 memcpy(skb_put(skb_frag, bytes), skb->data, bytes);
400
401 /* Advance the SKB... */
402 skb_pull(skb, bytes);
403
404 /* Encryption routine will move the header forward in order
405 * to insert the IV between the header and the payload */
406 if (encrypt)
407 ieee80211_encrypt_fragment(ieee, skb_frag, hdr_len);
408 if (ieee->config &
409 (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
410 skb_put(skb_frag, 4);
411 }
412
413
414 success:
415 spin_unlock_irqrestore(&ieee->lock, flags);
416
417 dev_kfree_skb_any(skb);
418
419 if (txb) {
420 if ((*ieee->hard_start_xmit)(txb, dev) == 0) {
421 stats->tx_packets++;
422 stats->tx_bytes += txb->payload_size;
423 return 0;
424 }
425 ieee80211_txb_free(txb);
426 }
427
428 return 0;
429
430 failed:
431 spin_unlock_irqrestore(&ieee->lock, flags);
432 netif_stop_queue(dev);
433 stats->tx_errors++;
434 return 1;
435
436}
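/*
 * Note on the hard_start_xmit contract as used above: the callback is
 * handed the assembled TXB and returns 0 once it has taken ownership;
 * any nonzero return leaves the TXB with this layer, which frees it.
 * A hypothetical driver hook:
 */
#if 0	/* illustration only */
static int example_hard_start_xmit(struct ieee80211_txb *txb,
				   struct net_device *dev)
{
	/* queue txb->fragments[0 .. txb->nr_frags - 1] to hardware */
	return 0;	/* accepted; the driver now owns the TXB */
}
#endif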
437
438EXPORT_SYMBOL(ieee80211_txb_free);
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c
new file mode 100644
index 000000000000..2cd571c525a9
--- /dev/null
+++ b/net/ieee80211/ieee80211_wx.c
@@ -0,0 +1,471 @@
1/******************************************************************************
2
3 Copyright(c) 2004 Intel Corporation. All rights reserved.
4
5 Portions of this file are based on the WEP enablement code provided by the
6 Host AP project hostap-drivers v0.1.3
7 Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
8 <jkmaline@cc.hut.fi>
9 Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
10
11 This program is free software; you can redistribute it and/or modify it
12 under the terms of version 2 of the GNU General Public License as
13 published by the Free Software Foundation.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 59
22 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23
24 The full GNU General Public License is included in this distribution in the
25 file called LICENSE.
26
27 Contact Information:
28 James P. Ketrenos <ipw2100-admin@linux.intel.com>
29 Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
30
31******************************************************************************/
32#include <linux/wireless.h>
33#include <linux/version.h>
34#include <linux/kmod.h>
35#include <linux/module.h>
36
37#include <net/ieee80211.h>
38static const char *ieee80211_modes[] = {
39 "?", "a", "b", "ab", "g", "ag", "bg", "abg"
40};
41
42#define MAX_CUSTOM_LEN 64
43static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee,
44 char *start, char *stop,
45 struct ieee80211_network *network)
46{
47 char custom[MAX_CUSTOM_LEN];
48 char *p;
49 struct iw_event iwe;
50 int i, j;
51 u8 max_rate, rate;
52
53 /* First entry *MUST* be the AP MAC address */
54 iwe.cmd = SIOCGIWAP;
55 iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
56 memcpy(iwe.u.ap_addr.sa_data, network->bssid, ETH_ALEN);
57 start = iwe_stream_add_event(start, stop, &iwe, IW_EV_ADDR_LEN);
58
59 /* Remaining entries will be displayed in the order we provide them */
60
61 /* Add the ESSID */
62 iwe.cmd = SIOCGIWESSID;
63 iwe.u.data.flags = 1;
64 if (network->flags & NETWORK_EMPTY_ESSID) {
65 iwe.u.data.length = sizeof("<hidden>");
66 start = iwe_stream_add_point(start, stop, &iwe, "<hidden>");
67 } else {
68 iwe.u.data.length = min(network->ssid_len, (u8)32);
69 start = iwe_stream_add_point(start, stop, &iwe, network->ssid);
70 }
71
72 /* Add the protocol name */
73 iwe.cmd = SIOCGIWNAME;
74 snprintf(iwe.u.name, IFNAMSIZ, "IEEE 802.11%s", ieee80211_modes[network->mode]);
75 start = iwe_stream_add_event(start, stop, &iwe, IW_EV_CHAR_LEN);
76
77 /* Add mode */
78 iwe.cmd = SIOCGIWMODE;
79 if (network->capability &
80 (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)) {
81 if (network->capability & WLAN_CAPABILITY_ESS)
82 iwe.u.mode = IW_MODE_MASTER;
83 else
84 iwe.u.mode = IW_MODE_ADHOC;
85
86 start = iwe_stream_add_event(start, stop, &iwe,
87 IW_EV_UINT_LEN);
88 }
89
90 /* Add frequency/channel */
91 iwe.cmd = SIOCGIWFREQ;
92/* iwe.u.freq.m = ieee80211_frequency(network->channel, network->mode);
93 iwe.u.freq.e = 3; */
94 iwe.u.freq.m = network->channel;
95 iwe.u.freq.e = 0;
96 iwe.u.freq.i = 0;
97 start = iwe_stream_add_event(start, stop, &iwe, IW_EV_FREQ_LEN);
98
99 /* Add encryption capability */
100 iwe.cmd = SIOCGIWENCODE;
101 if (network->capability & WLAN_CAPABILITY_PRIVACY)
102 iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
103 else
104 iwe.u.data.flags = IW_ENCODE_DISABLED;
105 iwe.u.data.length = 0;
106 start = iwe_stream_add_point(start, stop, &iwe, network->ssid);
107
108 /* Add basic and extended rates */
109 max_rate = 0;
110 p = custom;
111 p += snprintf(p, MAX_CUSTOM_LEN - (p - custom), " Rates (Mb/s): ");
112 for (i = 0, j = 0; i < network->rates_len; ) {
113 if (j < network->rates_ex_len &&
114 ((network->rates_ex[j] & 0x7F) <
115 (network->rates[i] & 0x7F)))
116 rate = network->rates_ex[j++] & 0x7F;
117 else
118 rate = network->rates[i++] & 0x7F;
119 if (rate > max_rate)
120 max_rate = rate;
121 p += snprintf(p, MAX_CUSTOM_LEN - (p - custom),
122 "%d%s ", rate >> 1, (rate & 1) ? ".5" : "");
123 }
124 for (; j < network->rates_ex_len; j++) {
125 rate = network->rates_ex[j] & 0x7F;
126 p += snprintf(p, MAX_CUSTOM_LEN - (p - custom),
127 "%d%s ", rate >> 1, (rate & 1) ? ".5" : "");
128 if (rate > max_rate)
129 max_rate = rate;
130 }
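	/* The rate bytes come from the (extended) supported-rates
	 * elements, which encode speeds in 500 kb/s units with the top
	 * bit marking "basic" rates (masked off via & 0x7F): hence
	 * rate >> 1 for whole Mb/s, bit 0 for the ".5" fraction, and
	 * max_rate * 500000 for the bit-rate value below. */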
131
132 iwe.cmd = SIOCGIWRATE;
133 iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;
134 iwe.u.bitrate.value = max_rate * 500000;
135 start = iwe_stream_add_event(start, stop, &iwe,
136 IW_EV_PARAM_LEN);
137
138 iwe.cmd = IWEVCUSTOM;
139 iwe.u.data.length = p - custom;
140 if (iwe.u.data.length)
141 start = iwe_stream_add_point(start, stop, &iwe, custom);
142
143 /* Add quality statistics */
144 /* TODO: Fix these values... */
145 iwe.cmd = IWEVQUAL;
146 iwe.u.qual.qual = network->stats.signal;
147 iwe.u.qual.level = network->stats.rssi;
148 iwe.u.qual.noise = network->stats.noise;
149 iwe.u.qual.updated = network->stats.mask & IEEE80211_STATMASK_WEMASK;
150 if (!(network->stats.mask & IEEE80211_STATMASK_RSSI))
151 iwe.u.qual.updated |= IW_QUAL_LEVEL_INVALID;
152 if (!(network->stats.mask & IEEE80211_STATMASK_NOISE))
153 iwe.u.qual.updated |= IW_QUAL_NOISE_INVALID;
154 if (!(network->stats.mask & IEEE80211_STATMASK_SIGNAL))
155 iwe.u.qual.updated |= IW_QUAL_QUAL_INVALID;
156
157 start = iwe_stream_add_event(start, stop, &iwe, IW_EV_QUAL_LEN);
158
159 iwe.cmd = IWEVCUSTOM;
160 p = custom;
161
162 iwe.u.data.length = p - custom;
163 if (iwe.u.data.length)
164 start = iwe_stream_add_point(start, stop, &iwe, custom);
165
166 if (ieee->wpa_enabled && network->wpa_ie_len){
167 char buf[MAX_WPA_IE_LEN * 2 + 30];
168
169 u8 *p = buf;
170 p += sprintf(p, "wpa_ie=");
171 for (i = 0; i < network->wpa_ie_len; i++) {
172 p += sprintf(p, "%02x", network->wpa_ie[i]);
173 }
174
175 memset(&iwe, 0, sizeof(iwe));
176 iwe.cmd = IWEVCUSTOM;
177 iwe.u.data.length = strlen(buf);
178 start = iwe_stream_add_point(start, stop, &iwe, buf);
179 }
180
181 if (ieee->wpa_enabled && network->rsn_ie_len){
182 char buf[MAX_WPA_IE_LEN * 2 + 30];
183
184 u8 *p = buf;
185 p += sprintf(p, "rsn_ie=");
186 for (i = 0; i < network->rsn_ie_len; i++) {
187 p += sprintf(p, "%02x", network->rsn_ie[i]);
188 }
189
190 memset(&iwe, 0, sizeof(iwe));
191 iwe.cmd = IWEVCUSTOM;
192 iwe.u.data.length = strlen(buf);
193 start = iwe_stream_add_point(start, stop, &iwe, buf);
194 }
195
196 /* Add EXTRA: Age to display seconds since last beacon/probe response
197 * for given network. */
198 iwe.cmd = IWEVCUSTOM;
199 p = custom;
200 p += snprintf(p, MAX_CUSTOM_LEN - (p - custom),
201		      " Last beacon: %lums ago", (jiffies - network->last_scanned) * 1000UL / HZ);
202 iwe.u.data.length = p - custom;
203 if (iwe.u.data.length)
204 start = iwe_stream_add_point(start, stop, &iwe, custom);
205
206
207 return start;
208}
209
210int ieee80211_wx_get_scan(struct ieee80211_device *ieee,
211 struct iw_request_info *info,
212 union iwreq_data *wrqu, char *extra)
213{
214 struct ieee80211_network *network;
215 unsigned long flags;
216
217 char *ev = extra;
218 char *stop = ev + IW_SCAN_MAX_DATA;
219 int i = 0;
220
221 IEEE80211_DEBUG_WX("Getting scan\n");
222
223 spin_lock_irqsave(&ieee->lock, flags);
224
225 list_for_each_entry(network, &ieee->network_list, list) {
226 i++;
227 if (ieee->scan_age == 0 ||
228 time_after(network->last_scanned + ieee->scan_age, jiffies))
229 ev = ipw2100_translate_scan(ieee, ev, stop, network);
230 else
231 IEEE80211_DEBUG_SCAN(
232 "Not showing network '%s ("
233 MAC_FMT ")' due to age (%lums).\n",
234 escape_essid(network->ssid,
235 network->ssid_len),
236 MAC_ARG(network->bssid),
237				(jiffies - network->last_scanned) * 1000UL / HZ);
238 }
239
240 spin_unlock_irqrestore(&ieee->lock, flags);
241
242 wrqu->data.length = ev - extra;
243 wrqu->data.flags = 0;
244
245 IEEE80211_DEBUG_WX("exit: %d networks returned.\n", i);
246
247 return 0;
248}
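/*
 * Aside: time_after(network->last_scanned + ieee->scan_age, jiffies)
 * above holds while the entry is younger than scan_age jiffies, and
 * time_after() compares with wraparound-safe signed arithmetic, so the
 * test stays correct across a jiffies rollover; scan_age == 0 disables
 * aging and reports every cached network.
 */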
249
250int ieee80211_wx_set_encode(struct ieee80211_device *ieee,
251 struct iw_request_info *info,
252 union iwreq_data *wrqu, char *keybuf)
253{
254 struct iw_point *erq = &(wrqu->encoding);
255 struct net_device *dev = ieee->dev;
256 struct ieee80211_security sec = {
257 .flags = 0
258 };
259 int i, key, key_provided, len;
260 struct ieee80211_crypt_data **crypt;
261
262 IEEE80211_DEBUG_WX("SET_ENCODE\n");
263
264 key = erq->flags & IW_ENCODE_INDEX;
265 if (key) {
266 if (key > WEP_KEYS)
267 return -EINVAL;
268 key--;
269 key_provided = 1;
270 } else {
271 key_provided = 0;
272 key = ieee->tx_keyidx;
273 }
274
275 IEEE80211_DEBUG_WX("Key: %d [%s]\n", key, key_provided ?
276 "provided" : "default");
277
278 crypt = &ieee->crypt[key];
279
280 if (erq->flags & IW_ENCODE_DISABLED) {
281 if (key_provided && *crypt) {
282 IEEE80211_DEBUG_WX("Disabling encryption on key %d.\n",
283 key);
284 ieee80211_crypt_delayed_deinit(ieee, crypt);
285 } else
286 IEEE80211_DEBUG_WX("Disabling encryption.\n");
287
288 /* Check all the keys to see if any are still configured,
289 * and if no key index was provided, de-init them all */
290 for (i = 0; i < WEP_KEYS; i++) {
291 if (ieee->crypt[i] != NULL) {
292 if (key_provided)
293 break;
294 ieee80211_crypt_delayed_deinit(
295 ieee, &ieee->crypt[i]);
296 }
297 }
298
299 if (i == WEP_KEYS) {
300 sec.enabled = 0;
301 sec.level = SEC_LEVEL_0;
302 sec.flags |= SEC_ENABLED | SEC_LEVEL;
303 }
304
305 goto done;
306 }
307
310 sec.enabled = 1;
311 sec.flags |= SEC_ENABLED;
312
313 if (*crypt != NULL && (*crypt)->ops != NULL &&
314 strcmp((*crypt)->ops->name, "WEP") != 0) {
315 /* changing to use WEP; deinit previously used algorithm
316 * on this key */
317 ieee80211_crypt_delayed_deinit(ieee, crypt);
318 }
319
320 if (*crypt == NULL) {
321 struct ieee80211_crypt_data *new_crypt;
322
323 /* take WEP into use */
324 new_crypt = kmalloc(sizeof(struct ieee80211_crypt_data),
325 GFP_KERNEL);
326 if (new_crypt == NULL)
327 return -ENOMEM;
328 memset(new_crypt, 0, sizeof(struct ieee80211_crypt_data));
329 new_crypt->ops = ieee80211_get_crypto_ops("WEP");
330 if (!new_crypt->ops) {
331 request_module("ieee80211_crypt_wep");
332 new_crypt->ops = ieee80211_get_crypto_ops("WEP");
333 }
334
335 if (new_crypt->ops && try_module_get(new_crypt->ops->owner))
336 new_crypt->priv = new_crypt->ops->init(key);
337
338 if (!new_crypt->ops || !new_crypt->priv) {
339 kfree(new_crypt);
340 new_crypt = NULL;
341
342 printk(KERN_WARNING "%s: could not initialize WEP: "
343 "load module ieee80211_crypt_wep\n",
344 dev->name);
345 return -EOPNOTSUPP;
346 }
347 *crypt = new_crypt;
348 }
349
350 /* If a new key was provided, set it up */
351 if (erq->length > 0) {
352 len = erq->length <= 5 ? 5 : 13;
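		/* Keys are padded up to one of the two standard WEP
		 * sizes: 5 bytes (40-bit) or 13 bytes (104-bit); both
		 * fall under SEC_LEVEL_1, set at the end of this
		 * function. */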
353 memcpy(sec.keys[key], keybuf, erq->length);
354 if (len > erq->length)
355 memset(sec.keys[key] + erq->length, 0,
356 len - erq->length);
357 IEEE80211_DEBUG_WX("Setting key %d to '%s' (%d:%d bytes)\n",
358 key, escape_essid(sec.keys[key], len),
359 erq->length, len);
360 sec.key_sizes[key] = len;
361 (*crypt)->ops->set_key(sec.keys[key], len, NULL,
362 (*crypt)->priv);
363 sec.flags |= (1 << key);
364 /* This ensures a key will be activated if no key is
365		 * explicitly set */
366 if (key == sec.active_key)
367 sec.flags |= SEC_ACTIVE_KEY;
368 } else {
369 len = (*crypt)->ops->get_key(sec.keys[key], WEP_KEY_LEN,
370 NULL, (*crypt)->priv);
371 if (len == 0) {
372 /* Set a default key of all 0 */
373 IEEE80211_DEBUG_WX("Setting key %d to all zero.\n",
374 key);
375 memset(sec.keys[key], 0, 13);
376 (*crypt)->ops->set_key(sec.keys[key], 13, NULL,
377 (*crypt)->priv);
378 sec.key_sizes[key] = 13;
379 sec.flags |= (1 << key);
380 }
381
382 /* No key data - just set the default TX key index */
383 if (key_provided) {
384 IEEE80211_DEBUG_WX(
385 "Setting key %d to default Tx key.\n", key);
386 ieee->tx_keyidx = key;
387 sec.active_key = key;
388 sec.flags |= SEC_ACTIVE_KEY;
389 }
390 }
391
392 done:
393 ieee->open_wep = !(erq->flags & IW_ENCODE_RESTRICTED);
394 sec.auth_mode = ieee->open_wep ? WLAN_AUTH_OPEN : WLAN_AUTH_SHARED_KEY;
395 sec.flags |= SEC_AUTH_MODE;
396 IEEE80211_DEBUG_WX("Auth: %s\n", sec.auth_mode == WLAN_AUTH_OPEN ?
397 "OPEN" : "SHARED KEY");
398
399 /* For now we just support WEP, so only set that security level...
400 * TODO: When WPA is added this is one place that needs to change */
401 sec.flags |= SEC_LEVEL;
402 sec.level = SEC_LEVEL_1; /* 40 and 104 bit WEP */
403
404 if (ieee->set_security)
405 ieee->set_security(dev, &sec);
406
407 /* Do not reset port if card is in Managed mode since resetting will
408	 * generate a new IEEE 802.11 authentication, which may end up looping
409	 * with IEEE 802.1X.  If your hardware requires a reset after WEP
410	 * configuration (for example, Prism2), implement reset_port in
411	 * the callback structure used to initialize the 802.11 stack. */
412 if (ieee->reset_on_keychange &&
413 ieee->iw_mode != IW_MODE_INFRA &&
414 ieee->reset_port && ieee->reset_port(dev)) {
415 printk(KERN_DEBUG "%s: reset_port failed\n", dev->name);
416 return -EINVAL;
417 }
418 return 0;
419}
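/*
 * Sketch of how a driver wires the callbacks consumed above when it
 * sets up its ieee80211_device (all example names are hypothetical):
 */
#if 0	/* illustration only */
	ieee->set_security = example_set_security; /* push keys/level to HW */
	ieee->reset_port = example_reset_port;     /* HW needing reset after rekey */
	ieee->reset_on_keychange = 1;              /* e.g. Prism2-class hardware */
#endif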
420
421int ieee80211_wx_get_encode(struct ieee80211_device *ieee,
422 struct iw_request_info *info,
423 union iwreq_data *wrqu, char *keybuf)
424{
425 struct iw_point *erq = &(wrqu->encoding);
426 int len, key;
427 struct ieee80211_crypt_data *crypt;
428
429 IEEE80211_DEBUG_WX("GET_ENCODE\n");
430
431 key = erq->flags & IW_ENCODE_INDEX;
432 if (key) {
433 if (key > WEP_KEYS)
434 return -EINVAL;
435 key--;
436 } else
437 key = ieee->tx_keyidx;
438
439 crypt = ieee->crypt[key];
440 erq->flags = key + 1;
441
442 if (crypt == NULL || crypt->ops == NULL) {
443 erq->length = 0;
444 erq->flags |= IW_ENCODE_DISABLED;
445 return 0;
446 }
447
448 if (strcmp(crypt->ops->name, "WEP") != 0) {
449 /* only WEP is supported with wireless extensions, so just
450 * report that encryption is used */
451 erq->length = 0;
452 erq->flags |= IW_ENCODE_ENABLED;
453 return 0;
454 }
455
456 len = crypt->ops->get_key(keybuf, WEP_KEY_LEN, NULL, crypt->priv);
457 erq->length = (len >= 0 ? len : 0);
458
459 erq->flags |= IW_ENCODE_ENABLED;
460
461 if (ieee->open_wep)
462 erq->flags |= IW_ENCODE_OPEN;
463 else
464 erq->flags |= IW_ENCODE_RESTRICTED;
465
466 return 0;
467}
468
469EXPORT_SYMBOL(ieee80211_wx_get_scan);
470EXPORT_SYMBOL(ieee80211_wx_set_encode);
471EXPORT_SYMBOL(ieee80211_wx_get_encode);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 3e63123f7bbd..e55136ae09f4 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -3,7 +3,6 @@
3# 3#
4config IP_MULTICAST 4config IP_MULTICAST
5 bool "IP: multicasting" 5 bool "IP: multicasting"
6 depends on INET
7 help 6 help
8 This is code for addressing several networked computers at once, 7 This is code for addressing several networked computers at once,
9 enlarging your kernel by about 2 KB. You need multicasting if you 8 enlarging your kernel by about 2 KB. You need multicasting if you
@@ -17,7 +16,6 @@ config IP_MULTICAST
17 16
18config IP_ADVANCED_ROUTER 17config IP_ADVANCED_ROUTER
19 bool "IP: advanced router" 18 bool "IP: advanced router"
20 depends on INET
21 ---help--- 19 ---help---
22 If you intend to run your Linux box mostly as a router, i.e. as a 20 If you intend to run your Linux box mostly as a router, i.e. as a
23 computer that forwards and redistributes network packets, say Y; you 21 computer that forwards and redistributes network packets, say Y; you
@@ -56,9 +54,9 @@ config IP_ADVANCED_ROUTER
56choice 54choice
57 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" 55 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
58 depends on IP_ADVANCED_ROUTER 56 depends on IP_ADVANCED_ROUTER
59 default IP_FIB_HASH 57 default ASK_IP_FIB_HASH
60 58
61config IP_FIB_HASH 59config ASK_IP_FIB_HASH
62 bool "FIB_HASH" 60 bool "FIB_HASH"
63 ---help--- 61 ---help---
64 Current FIB is very proven and good enough for most users. 62 Current FIB is very proven and good enough for most users.
@@ -84,12 +82,8 @@ config IP_FIB_TRIE
84 82
85endchoice 83endchoice
86 84
87# If the user does not enable advanced routing, he gets the safe
88# default of the fib-hash algorithm.
89config IP_FIB_HASH 85config IP_FIB_HASH
90 bool 86 def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
91 depends on !IP_ADVANCED_ROUTER
92 default y
93 87
94config IP_MULTIPLE_TABLES 88config IP_MULTIPLE_TABLES
95 bool "IP: policy routing" 89 bool "IP: policy routing"
@@ -130,7 +124,7 @@ config IP_ROUTE_MULTIPATH
130 124
131config IP_ROUTE_MULTIPATH_CACHED 125config IP_ROUTE_MULTIPATH_CACHED
132 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)" 126 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
133 depends on: IP_ROUTE_MULTIPATH 127 depends on IP_ROUTE_MULTIPATH
134 help 128 help
135 Normally, equal cost multipath routing is not supported by the 129 Normally, equal cost multipath routing is not supported by the
136 routing cache. If you say Y here, alternative routes are cached 130 routing cache. If you say Y here, alternative routes are cached
@@ -183,7 +177,6 @@ config IP_ROUTE_VERBOSE
183 177
184config IP_PNP 178config IP_PNP
185 bool "IP: kernel level autoconfiguration" 179 bool "IP: kernel level autoconfiguration"
186 depends on INET
187 help 180 help
188 This enables automatic configuration of IP addresses of devices and 181 This enables automatic configuration of IP addresses of devices and
189 of the routing table during kernel boot, based on either information 182 of the routing table during kernel boot, based on either information
@@ -242,8 +235,6 @@ config IP_PNP_RARP
242# bool ' IP: ARP support' CONFIG_IP_PNP_ARP 235# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
243config NET_IPIP 236config NET_IPIP
244 tristate "IP: tunneling" 237 tristate "IP: tunneling"
245 depends on INET
246 select INET_TUNNEL
247 ---help--- 238 ---help---
248 Tunneling means encapsulating data of one protocol type within 239 Tunneling means encapsulating data of one protocol type within
249 another protocol and sending it over a channel that understands the 240 another protocol and sending it over a channel that understands the
@@ -260,8 +251,6 @@ config NET_IPIP
260 251
261config NET_IPGRE 252config NET_IPGRE
262 tristate "IP: GRE tunnels over IP" 253 tristate "IP: GRE tunnels over IP"
263 depends on INET
264 select XFRM
265 help 254 help
266 Tunneling means encapsulating data of one protocol type within 255 Tunneling means encapsulating data of one protocol type within
267 another protocol and sending it over a channel that understands the 256 another protocol and sending it over a channel that understands the
@@ -319,7 +308,7 @@ config IP_PIMSM_V2
319 308
320config ARPD 309config ARPD
321 bool "IP: ARP daemon support (EXPERIMENTAL)" 310 bool "IP: ARP daemon support (EXPERIMENTAL)"
322 depends on INET && EXPERIMENTAL 311 depends on EXPERIMENTAL
323 ---help--- 312 ---help---
324 Normally, the kernel maintains an internal cache which maps IP 313 Normally, the kernel maintains an internal cache which maps IP
325 addresses to hardware addresses on the local network, so that 314 addresses to hardware addresses on the local network, so that
@@ -344,7 +333,6 @@ config ARPD
344 333
345config SYN_COOKIES 334config SYN_COOKIES
346 bool "IP: TCP syncookie support (disabled per default)" 335 bool "IP: TCP syncookie support (disabled per default)"
347 depends on INET
348 ---help--- 336 ---help---
349 Normal TCP/IP networking is open to an attack known as "SYN 337 Normal TCP/IP networking is open to an attack known as "SYN
350 flooding". This denial-of-service attack prevents legitimate remote 338 flooding". This denial-of-service attack prevents legitimate remote
@@ -381,7 +369,6 @@ config SYN_COOKIES
381 369
382config INET_AH 370config INET_AH
383 tristate "IP: AH transformation" 371 tristate "IP: AH transformation"
384 depends on INET
385 select XFRM 372 select XFRM
386 select CRYPTO 373 select CRYPTO
387 select CRYPTO_HMAC 374 select CRYPTO_HMAC
@@ -394,7 +381,6 @@ config INET_AH
394 381
395config INET_ESP 382config INET_ESP
396 tristate "IP: ESP transformation" 383 tristate "IP: ESP transformation"
397 depends on INET
398 select XFRM 384 select XFRM
399 select CRYPTO 385 select CRYPTO
400 select CRYPTO_HMAC 386 select CRYPTO_HMAC
@@ -408,7 +394,6 @@ config INET_ESP
408 394
409config INET_IPCOMP 395config INET_IPCOMP
410 tristate "IP: IPComp transformation" 396 tristate "IP: IPComp transformation"
411 depends on INET
412 select XFRM 397 select XFRM
413 select INET_TUNNEL 398 select INET_TUNNEL
414 select CRYPTO 399 select CRYPTO
@@ -421,7 +406,6 @@ config INET_IPCOMP
421 406
422config INET_TUNNEL 407config INET_TUNNEL
423 tristate "IP: tunnel transformation" 408 tristate "IP: tunnel transformation"
424 depends on INET
425 select XFRM 409 select XFRM
426 ---help--- 410 ---help---
427 Support for generic IP tunnel transformation, which is required by 411 Support for generic IP tunnel transformation, which is required by
@@ -429,25 +413,22 @@ config INET_TUNNEL
429 413
430 If unsure, say Y. 414 If unsure, say Y.
431 415
432config IP_TCPDIAG 416config INET_DIAG
433 tristate "IP: TCP socket monitoring interface" 417 tristate "INET: socket monitoring interface"
434 depends on INET
435 default y 418 default y
436 ---help--- 419 ---help---
437 Support for TCP socket monitoring interface used by native Linux 420 Support for INET (TCP, DCCP, etc) socket monitoring interface used by
438 tools such as ss. ss is included in iproute2, currently downloadable 421 native Linux tools such as ss. ss is included in iproute2, currently
439 at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support 422 downloadable at <http://developer.osdl.org/dev/iproute2>.
440 and have selected IPv6 as a module, you need to build this as a
441 module too.
442 423
443 If unsure, say Y. 424 If unsure, say Y.
444 425
445config IP_TCPDIAG_IPV6 426config INET_TCP_DIAG
446 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) 427 depends on INET_DIAG
428 def_tristate INET_DIAG
447 429
448config TCP_CONG_ADVANCED 430config TCP_CONG_ADVANCED
449 bool "TCP: advanced congestion control" 431 bool "TCP: advanced congestion control"
450 depends on INET
451 ---help--- 432 ---help---
452 Support for selection of various TCP congestion control 433 Support for selection of various TCP congestion control
453 modules. 434 modules.
@@ -463,7 +444,6 @@ menu "TCP congestion control"
463 444
464config TCP_CONG_BIC 445config TCP_CONG_BIC
465 tristate "Binary Increase Congestion (BIC) control" 446 tristate "Binary Increase Congestion (BIC) control"
466 depends on INET
467 default y 447 default y
468 ---help--- 448 ---help---
469 BIC-TCP is a sender-side only change that ensures a linear RTT 449 BIC-TCP is a sender-side only change that ensures a linear RTT
@@ -478,7 +458,6 @@ config TCP_CONG_BIC
478 458
479config TCP_CONG_WESTWOOD 459config TCP_CONG_WESTWOOD
480 tristate "TCP Westwood+" 460 tristate "TCP Westwood+"
481 depends on INET
482 default m 461 default m
483 ---help--- 462 ---help---
484 TCP Westwood+ is a sender-side only modification of the TCP Reno 463 TCP Westwood+ is a sender-side only modification of the TCP Reno
@@ -493,7 +472,6 @@ config TCP_CONG_WESTWOOD
493 472
494config TCP_CONG_HTCP 473config TCP_CONG_HTCP
495 tristate "H-TCP" 474 tristate "H-TCP"
496 depends on INET
497 default m 475 default m
498 ---help--- 476 ---help---
499 H-TCP is a send-side only modifications of the TCP Reno 477 H-TCP is a send-side only modifications of the TCP Reno
@@ -505,7 +483,7 @@ config TCP_CONG_HTCP
505 483
506config TCP_CONG_HSTCP 484config TCP_CONG_HSTCP
507 tristate "High Speed TCP" 485 tristate "High Speed TCP"
508 depends on INET && EXPERIMENTAL 486 depends on EXPERIMENTAL
509 default n 487 default n
510 ---help--- 488 ---help---
511 Sally Floyd's High Speed TCP (RFC 3649) congestion control. 489 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
@@ -516,7 +494,7 @@ config TCP_CONG_HSTCP
516 494
517config TCP_CONG_HYBLA 495config TCP_CONG_HYBLA
518 tristate "TCP-Hybla congestion control algorithm" 496 tristate "TCP-Hybla congestion control algorithm"
519 depends on INET && EXPERIMENTAL 497 depends on EXPERIMENTAL
520 default n 498 default n
521 ---help--- 499 ---help---
522 TCP-Hybla is a sender-side only change that eliminates penalization of 500 TCP-Hybla is a sender-side only change that eliminates penalization of
@@ -526,7 +504,7 @@ config TCP_CONG_HYBLA
526 504
527config TCP_CONG_VEGAS 505config TCP_CONG_VEGAS
528 tristate "TCP Vegas" 506 tristate "TCP Vegas"
529 depends on INET && EXPERIMENTAL 507 depends on EXPERIMENTAL
530 default n 508 default n
531 ---help--- 509 ---help---
532 TCP Vegas is a sender-side only change to TCP that anticipates 510 TCP Vegas is a sender-side only change to TCP that anticipates
@@ -537,7 +515,7 @@ config TCP_CONG_VEGAS
537 515
538config TCP_CONG_SCALABLE 516config TCP_CONG_SCALABLE
539 tristate "Scalable TCP" 517 tristate "Scalable TCP"
540 depends on INET && EXPERIMENTAL 518 depends on EXPERIMENTAL
541 default n 519 default n
542 ---help--- 520 ---help---
543 Scalable TCP is a sender-side only change to TCP which uses a 521 Scalable TCP is a sender-side only change to TCP which uses a
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 5718cdb3a61e..f0435d00db6b 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -2,13 +2,14 @@
2# Makefile for the Linux TCP/IP (INET) layer. 2# Makefile for the Linux TCP/IP (INET) layer.
3# 3#
4 4
5obj-y := utils.o route.o inetpeer.o protocol.o \ 5obj-y := route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o inet_hashtables.o \
8 inet_timewait_sock.o inet_connection_sock.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
9 tcp_minisocks.o tcp_cong.o \ 10 tcp_minisocks.o tcp_cong.o \
10 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 11 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
11 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o 12 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
12 13
13obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o 14obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
14obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o 15obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
@@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
29obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o 30obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
30obj-$(CONFIG_NETFILTER) += netfilter/ 31obj-$(CONFIG_NETFILTER) += netfilter/
31obj-$(CONFIG_IP_VS) += ipvs/ 32obj-$(CONFIG_IP_VS) += ipvs/
32obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 33obj-$(CONFIG_INET_DIAG) += inet_diag.o
33obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 34obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
35obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
34obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 36obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
35obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o 37obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
36obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o 38obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ef7468376ae6..bf147f8db399 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
99#include <net/arp.h> 99#include <net/arp.h>
100#include <net/route.h> 100#include <net/route.h>
101#include <net/ip_fib.h> 101#include <net/ip_fib.h>
102#include <net/inet_connection_sock.h>
102#include <net/tcp.h> 103#include <net/tcp.h>
103#include <net/udp.h> 104#include <net/udp.h>
104#include <linux/skbuff.h> 105#include <linux/skbuff.h>
@@ -112,11 +113,7 @@
112#include <linux/mroute.h> 113#include <linux/mroute.h>
113#endif 114#endif
114 115
115DEFINE_SNMP_STAT(struct linux_mib, net_statistics); 116DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
116
117#ifdef INET_REFCNT_DEBUG
118atomic_t inet_sock_nr;
119#endif
120 117
121extern void ip_mc_drop_socket(struct sock *sk); 118extern void ip_mc_drop_socket(struct sock *sk);
122 119
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk)
153 if (inet->opt) 150 if (inet->opt)
154 kfree(inet->opt); 151 kfree(inet->opt);
155 dst_release(sk->sk_dst_cache); 152 dst_release(sk->sk_dst_cache);
156#ifdef INET_REFCNT_DEBUG 153 sk_refcnt_debug_dec(sk);
157 atomic_dec(&inet_sock_nr);
158 printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
159 sk, atomic_read(&inet_sock_nr));
160#endif
161} 154}
162 155
163/* 156/*
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog)
210 * we can only allow the backlog to be adjusted. 203 * we can only allow the backlog to be adjusted.
211 */ 204 */
212 if (old_state != TCP_LISTEN) { 205 if (old_state != TCP_LISTEN) {
213 err = tcp_listen_start(sk); 206 err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
214 if (err) 207 if (err)
215 goto out; 208 goto out;
216 } 209 }
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
235 struct proto *answer_prot; 228 struct proto *answer_prot;
236 unsigned char answer_flags; 229 unsigned char answer_flags;
237 char answer_no_check; 230 char answer_no_check;
238 int err; 231 int try_loading_module = 0;
232 int err = -ESOCKTNOSUPPORT;
239 233
240 sock->state = SS_UNCONNECTED; 234 sock->state = SS_UNCONNECTED;
241 235
242 /* Look for the requested type/protocol pair. */ 236 /* Look for the requested type/protocol pair. */
243 answer = NULL; 237 answer = NULL;
238lookup_protocol:
244 rcu_read_lock(); 239 rcu_read_lock();
245 list_for_each_rcu(p, &inetsw[sock->type]) { 240 list_for_each_rcu(p, &inetsw[sock->type]) {
246 answer = list_entry(p, struct inet_protosw, list); 241 answer = list_entry(p, struct inet_protosw, list);
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
261 answer = NULL; 256 answer = NULL;
262 } 257 }
263 258
264 err = -ESOCKTNOSUPPORT; 259 if (unlikely(answer == NULL)) {
265 if (!answer) 260 if (try_loading_module < 2) {
266 goto out_rcu_unlock; 261 rcu_read_unlock();
262 /*
263 * Be more specific, e.g. net-pf-2-proto-132-type-1
264 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
265 */
266 if (++try_loading_module == 1)
267 request_module("net-pf-%d-proto-%d-type-%d",
268 PF_INET, protocol, sock->type);
269 /*
270 * Fall back to generic, e.g. net-pf-2-proto-132
271 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
272 */
273 else
274 request_module("net-pf-%d-proto-%d",
275 PF_INET, protocol);
276 goto lookup_protocol;
277 } else
278 goto out_rcu_unlock;
279 }
280
267 err = -EPERM; 281 err = -EPERM;
268 if (answer->capability > 0 && !capable(answer->capability)) 282 if (answer->capability > 0 && !capable(answer->capability))
269 goto out_rcu_unlock; 283 goto out_rcu_unlock;
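/*
 * Aside: a protocol module makes itself reachable by the
 * request_module() strings above via MODULE_ALIAS.  Using the SCTP
 * example from the comment (PF_INET == 2, IPPROTO_SCTP == 132,
 * SOCK_STREAM == 1), such a module would carry lines like:
 *
 *	MODULE_ALIAS("net-pf-2-proto-132-type-1");
 *	MODULE_ALIAS("net-pf-2-proto-132");
 *
 * (illustrative; consult the module itself for its actual aliases)
 */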
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol)
317 inet->mc_index = 0; 331 inet->mc_index = 0;
318 inet->mc_list = NULL; 332 inet->mc_list = NULL;
319 333
320#ifdef INET_REFCNT_DEBUG 334 sk_refcnt_debug_inc(sk);
321 atomic_inc(&inet_sock_nr);
322#endif
323 335
324 if (inet->num) { 336 if (inet->num) {
325 /* It assumes that any protocol which allows 337 /* It assumes that any protocol which allows
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = {
847 .owner = THIS_MODULE, 859 .owner = THIS_MODULE,
848}; 860};
849 861
850
851extern void tcp_init(void);
852extern void tcp_v4_init(struct net_proto_family *);
853
854/* Upon startup we insert all the elements in inetsw_array[] into 862/* Upon startup we insert all the elements in inetsw_array[] into
855 * the linked list inetsw. 863 * the linked list inetsw.
856 */ 864 */
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p)
961 } 969 }
962} 970}
963 971
972/*
973 * Shall we try to damage output packets if routing dev changes?
974 */
975
976int sysctl_ip_dynaddr;
977
978static int inet_sk_reselect_saddr(struct sock *sk)
979{
980 struct inet_sock *inet = inet_sk(sk);
981 int err;
982 struct rtable *rt;
983 __u32 old_saddr = inet->saddr;
984 __u32 new_saddr;
985 __u32 daddr = inet->daddr;
986
987 if (inet->opt && inet->opt->srr)
988 daddr = inet->opt->faddr;
989
990 /* Query new route. */
991 err = ip_route_connect(&rt, daddr, 0,
992 RT_CONN_FLAGS(sk),
993 sk->sk_bound_dev_if,
994 sk->sk_protocol,
995 inet->sport, inet->dport, sk);
996 if (err)
997 return err;
998
999 sk_setup_caps(sk, &rt->u.dst);
1000
1001 new_saddr = rt->rt_src;
1002
1003 if (new_saddr == old_saddr)
1004 return 0;
1005
1006 if (sysctl_ip_dynaddr > 1) {
1007 printk(KERN_INFO "%s(): shifting inet->"
1008 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1009 __FUNCTION__,
1010 NIPQUAD(old_saddr),
1011 NIPQUAD(new_saddr));
1012 }
1013
1014 inet->saddr = inet->rcv_saddr = new_saddr;
1015
1016 /*
1017 * XXX The only one ugly spot where we need to
1018 * XXX really change the sockets identity after
1019 * XXX it has entered the hashes. -DaveM
1020 *
1021 * Besides that, it does not check for connection
1022 * uniqueness. Wait for troubles.
1023 */
1024 __sk_prot_rehash(sk);
1025 return 0;
1026}
1027
1028int inet_sk_rebuild_header(struct sock *sk)
1029{
1030 struct inet_sock *inet = inet_sk(sk);
1031 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1032 u32 daddr;
1033 int err;
1034
1035 /* Route is OK, nothing to do. */
1036 if (rt)
1037 return 0;
1038
1039 /* Reroute. */
1040 daddr = inet->daddr;
1041 if (inet->opt && inet->opt->srr)
1042 daddr = inet->opt->faddr;
1043{
1044 struct flowi fl = {
1045 .oif = sk->sk_bound_dev_if,
1046 .nl_u = {
1047 .ip4_u = {
1048 .daddr = daddr,
1049 .saddr = inet->saddr,
1050 .tos = RT_CONN_FLAGS(sk),
1051 },
1052 },
1053 .proto = sk->sk_protocol,
1054 .uli_u = {
1055 .ports = {
1056 .sport = inet->sport,
1057 .dport = inet->dport,
1058 },
1059 },
1060 };
1061
1062 err = ip_route_output_flow(&rt, &fl, sk, 0);
1063}
1064 if (!err)
1065 sk_setup_caps(sk, &rt->u.dst);
1066 else {
1067 /* Routing failed... */
1068 sk->sk_route_caps = 0;
1069 /*
1070 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
1071 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
1072 */
1073 if (!sysctl_ip_dynaddr ||
1074 sk->sk_state != TCP_SYN_SENT ||
1075 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1076 (err = inet_sk_reselect_saddr(sk)) != 0)
1077 sk->sk_err_soft = -err;
1078 }
1079
1080 return err;
1081}
1082
1083EXPORT_SYMBOL(inet_sk_rebuild_header);
1084
964#ifdef CONFIG_IP_MULTICAST 1085#ifdef CONFIG_IP_MULTICAST
965static struct net_protocol igmp_protocol = { 1086static struct net_protocol igmp_protocol = {
966 .handler = igmp_rcv, 1087 .handler = igmp_rcv,
@@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void)
1007} 1128}
1008 1129
1009static int ipv4_proc_init(void); 1130static int ipv4_proc_init(void);
1010extern void ipfrag_init(void);
1011 1131
1012/* 1132/*
1013 * IP protocol layer initialiser 1133 * IP protocol layer initialiser
@@ -1128,19 +1248,10 @@ module_init(inet_init);
1128/* ------------------------------------------------------------------------ */ 1248/* ------------------------------------------------------------------------ */
1129 1249
1130#ifdef CONFIG_PROC_FS 1250#ifdef CONFIG_PROC_FS
1131extern int fib_proc_init(void);
1132extern void fib_proc_exit(void);
1133#ifdef CONFIG_IP_FIB_TRIE 1251#ifdef CONFIG_IP_FIB_TRIE
1134extern int fib_stat_proc_init(void); 1252extern int fib_stat_proc_init(void);
1135extern void fib_stat_proc_exit(void); 1253extern void fib_stat_proc_exit(void);
1136#endif 1254#endif
1137extern int ip_misc_proc_init(void);
1138extern int raw_proc_init(void);
1139extern void raw_proc_exit(void);
1140extern int tcp4_proc_init(void);
1141extern void tcp4_proc_exit(void);
1142extern int udp4_proc_init(void);
1143extern void udp4_proc_exit(void);
1144 1255
1145static int __init ipv4_proc_init(void) 1256static int __init ipv4_proc_init(void)
1146{ 1257{
@@ -1157,7 +1268,7 @@ static int __init ipv4_proc_init(void)
1157#ifdef CONFIG_IP_FIB_TRIE 1268#ifdef CONFIG_IP_FIB_TRIE
1158 if (fib_stat_proc_init()) 1269 if (fib_stat_proc_init())
1159 goto out_fib_stat; 1270 goto out_fib_stat;
1160 #endif 1271#endif
1161 if (ip_misc_proc_init()) 1272 if (ip_misc_proc_init())
1162 goto out_misc; 1273 goto out_misc;
1163out: 1274out:
@@ -1205,7 +1316,3 @@ EXPORT_SYMBOL(inet_stream_ops);
1205EXPORT_SYMBOL(inet_unregister_protosw); 1316EXPORT_SYMBOL(inet_unregister_protosw);
1206EXPORT_SYMBOL(net_statistics); 1317EXPORT_SYMBOL(net_statistics);
1207EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); 1318EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
1208
1209#ifdef INET_REFCNT_DEBUG
1210EXPORT_SYMBOL(inet_sock_nr);
1211#endif
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 514c85b2631a..035ad2c9e1ba 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -263,10 +263,8 @@ static int ah_init_state(struct xfrm_state *x)
263 263
264error: 264error:
265 if (ahp) { 265 if (ahp) {
266 if (ahp->work_icv) 266 kfree(ahp->work_icv);
267 kfree(ahp->work_icv); 267 crypto_free_tfm(ahp->tfm);
268 if (ahp->tfm)
269 crypto_free_tfm(ahp->tfm);
270 kfree(ahp); 268 kfree(ahp);
271 } 269 }
272 return -EINVAL; 270 return -EINVAL;
@@ -279,14 +277,10 @@ static void ah_destroy(struct xfrm_state *x)
279 if (!ahp) 277 if (!ahp)
280 return; 278 return;
281 279
282 if (ahp->work_icv) { 280 kfree(ahp->work_icv);
283 kfree(ahp->work_icv); 281 ahp->work_icv = NULL;
284 ahp->work_icv = NULL; 282 crypto_free_tfm(ahp->tfm);
285 } 283 ahp->tfm = NULL;
286 if (ahp->tfm) {
287 crypto_free_tfm(ahp->tfm);
288 ahp->tfm = NULL;
289 }
290 kfree(ahp); 284 kfree(ahp);
291} 285}
292 286
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a642fd612853..8bf312bdea13 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip,
700static void parp_redo(struct sk_buff *skb) 700static void parp_redo(struct sk_buff *skb)
701{ 701{
702 nf_reset(skb); 702 nf_reset(skb);
703 arp_rcv(skb, skb->dev, NULL); 703 arp_rcv(skb, skb->dev, NULL, skb->dev);
704} 704}
705 705
706/* 706/*
@@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb)
865 if (n) 865 if (n)
866 neigh_release(n); 866 neigh_release(n);
867 867
868 if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || 868 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
869 skb->pkt_type == PACKET_HOST || 869 skb->pkt_type == PACKET_HOST ||
870 in_dev->arp_parms->proxy_delay == 0) { 870 in_dev->arp_parms->proxy_delay == 0) {
871 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 871 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
@@ -927,7 +927,7 @@ out:
927 * Receive an arp request from the device layer. 927 * Receive an arp request from the device layer.
928 */ 928 */
929 929
930int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 930int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
931{ 931{
932 struct arphdr *arp; 932 struct arphdr *arp;
933 933
@@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
948 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 948 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
949 goto out_of_mem; 949 goto out_of_mem;
950 950
951 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
952
951 return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); 953 return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
952 954
953freeskb: 955freeskb:
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b1db561f2542..c1b42b5257f8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,9 +16,10 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ip.h> 17#include <linux/ip.h>
18#include <linux/in.h> 18#include <linux/in.h>
19#include <net/ip.h>
19#include <net/sock.h> 20#include <net/sock.h>
20#include <net/tcp.h>
21#include <net/route.h> 21#include <net/route.h>
22#include <net/tcp_states.h>
22 23
23int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 24int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
24{ 25{
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d8a10e3dd77d..ba2895ae8151 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
1111 struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); 1111 struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
1112 1112
1113 if (!skb) 1113 if (!skb)
1114 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); 1114 netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS);
1115 else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { 1115 else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
1116 kfree_skb(skb); 1116 kfree_skb(skb);
1117 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); 1117 netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL);
1118 } else { 1118 } else {
1119 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; 1119 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL);
1120 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
1121 } 1120 }
1122} 1121}
1123 1122
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ba57446d5d1f..1b5a09d1b90b 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
331 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); 331 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
332 if (!x) 332 if (!x)
333 return; 333 return;
334 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 334 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
335 ntohl(esph->spi), ntohl(iph->daddr))); 335 ntohl(esph->spi), ntohl(iph->daddr));
336 xfrm_state_put(x); 336 xfrm_state_put(x);
337} 337}
338 338
@@ -343,22 +343,14 @@ static void esp_destroy(struct xfrm_state *x)
343 if (!esp) 343 if (!esp)
344 return; 344 return;
345 345
346 if (esp->conf.tfm) { 346 crypto_free_tfm(esp->conf.tfm);
347 crypto_free_tfm(esp->conf.tfm); 347 esp->conf.tfm = NULL;
348 esp->conf.tfm = NULL; 348 kfree(esp->conf.ivec);
349 } 349 esp->conf.ivec = NULL;
350 if (esp->conf.ivec) { 350 crypto_free_tfm(esp->auth.tfm);
351 kfree(esp->conf.ivec); 351 esp->auth.tfm = NULL;
352 esp->conf.ivec = NULL; 352 kfree(esp->auth.work_icv);
353 } 353 esp->auth.work_icv = NULL;
354 if (esp->auth.tfm) {
355 crypto_free_tfm(esp->auth.tfm);
356 esp->auth.tfm = NULL;
357 }
358 if (esp->auth.work_icv) {
359 kfree(esp->auth.work_icv);
360 esp->auth.work_icv = NULL;
361 }
362 kfree(esp); 354 kfree(esp);
363} 355}
364 356
@@ -395,10 +387,10 @@ static int esp_init_state(struct xfrm_state *x)
395 387
396 if (aalg_desc->uinfo.auth.icv_fullbits/8 != 388 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
397 crypto_tfm_alg_digestsize(esp->auth.tfm)) { 389 crypto_tfm_alg_digestsize(esp->auth.tfm)) {
398 NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", 390 NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
399 x->aalg->alg_name, 391 x->aalg->alg_name,
400 crypto_tfm_alg_digestsize(esp->auth.tfm), 392 crypto_tfm_alg_digestsize(esp->auth.tfm),
401 aalg_desc->uinfo.auth.icv_fullbits/8)); 393 aalg_desc->uinfo.auth.icv_fullbits/8);
402 goto error; 394 goto error;
403 } 395 }
404 396
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cd8e45ab9580..4e1379f71269 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len)
558 nl_fib_lookup(frn, tb); 558 nl_fib_lookup(frn, tb);
559 559
560 pid = nlh->nlmsg_pid; /*pid of sending process */ 560 pid = nlh->nlmsg_pid; /*pid of sending process */
561 NETLINK_CB(skb).groups = 0; /* not in mcast group */
562 NETLINK_CB(skb).pid = 0; /* from kernel */ 561 NETLINK_CB(skb).pid = 0; /* from kernel */
563 NETLINK_CB(skb).dst_pid = pid; 562 NETLINK_CB(skb).dst_pid = pid;
564 NETLINK_CB(skb).dst_groups = 0; /* unicast */ 563 NETLINK_CB(skb).dst_group = 0; /* unicast */
565 netlink_unicast(sk, skb, pid, MSG_DONTWAIT); 564 netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
566} 565}
567 566
568static void nl_fib_lookup_init(void) 567static void nl_fib_lookup_init(void)
569{ 568{
570 netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input); 569 netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
571} 570}
572 571
573static void fib_disable_ip(struct net_device *dev, int force) 572static void fib_disable_ip(struct net_device *dev, int force)
@@ -662,5 +661,4 @@ void __init ip_fib_init(void)
662} 661}
663 662
664EXPORT_SYMBOL(inet_addr_type); 663EXPORT_SYMBOL(inet_addr_type);
665EXPORT_SYMBOL(ip_dev_find);
666EXPORT_SYMBOL(ip_rt_ioctl); 664EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b10d6bb5ef3d..2a8c9afc3695 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
45 45
46#include "fib_lookup.h" 46#include "fib_lookup.h"
47 47
48static kmem_cache_t *fn_hash_kmem; 48static kmem_cache_t *fn_hash_kmem __read_mostly;
49static kmem_cache_t *fn_alias_kmem; 49static kmem_cache_t *fn_alias_kmem __read_mostly;
50 50
51struct fib_node { 51struct fib_node {
52 struct hlist_node fn_hash; 52 struct hlist_node fn_hash;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b729d97cfa93..ef6609ea0eb7 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
7 7
8struct fib_alias { 8struct fib_alias {
9 struct list_head fa_list; 9 struct list_head fa_list;
10 struct rcu_head rcu;
10 struct fib_info *fa_info; 11 struct fib_info *fa_info;
11 u8 fa_tos; 12 u8 fa_tos;
12 u8 fa_type; 13 u8 fa_type;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c886b28ba9f5..d41219e8037c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
290 kfree_skb(skb); 290 kfree_skb(skb);
291 return; 291 return;
292 } 292 }
293 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; 293 NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
294 if (n->nlmsg_flags&NLM_F_ECHO) 294 if (n->nlmsg_flags&NLM_F_ECHO)
295 atomic_inc(&skb->users); 295 atomic_inc(&skb->users);
296 netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); 296 netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
297 if (n->nlmsg_flags&NLM_F_ECHO) 297 if (n->nlmsg_flags&NLM_F_ECHO)
298 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); 298 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
299} 299}
@@ -593,10 +593,13 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
593 struct hlist_head *new_laddrhash, 593 struct hlist_head *new_laddrhash,
594 unsigned int new_size) 594 unsigned int new_size)
595{ 595{
596 struct hlist_head *old_info_hash, *old_laddrhash;
596 unsigned int old_size = fib_hash_size; 597 unsigned int old_size = fib_hash_size;
597 unsigned int i; 598 unsigned int i, bytes;
598 599
599 write_lock(&fib_info_lock); 600 write_lock(&fib_info_lock);
601 old_info_hash = fib_info_hash;
602 old_laddrhash = fib_info_laddrhash;
600 fib_hash_size = new_size; 603 fib_hash_size = new_size;
601 604
602 for (i = 0; i < old_size; i++) { 605 for (i = 0; i < old_size; i++) {
@@ -636,6 +639,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
636 fib_info_laddrhash = new_laddrhash; 639 fib_info_laddrhash = new_laddrhash;
637 640
638 write_unlock(&fib_info_lock); 641 write_unlock(&fib_info_lock);
642
643 bytes = old_size * sizeof(struct hlist_head *);
644 fib_hash_free(old_info_hash, bytes);
645 fib_hash_free(old_laddrhash, bytes);
639} 646}
640 647
641struct fib_info * 648struct fib_info *
@@ -847,6 +854,7 @@ failure:
847 return NULL; 854 return NULL;
848} 855}
849 856
857/* Note! fib_semantic_match intentionally uses RCU list functions. */
850int fib_semantic_match(struct list_head *head, const struct flowi *flp, 858int fib_semantic_match(struct list_head *head, const struct flowi *flp,
851 struct fib_result *res, __u32 zone, __u32 mask, 859 struct fib_result *res, __u32 zone, __u32 mask,
852 int prefixlen) 860 int prefixlen)
@@ -854,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
854 struct fib_alias *fa; 862 struct fib_alias *fa;
855 int nh_sel = 0; 863 int nh_sel = 0;
856 864
857 list_for_each_entry(fa, head, fa_list) { 865 list_for_each_entry_rcu(fa, head, fa_list) {
858 int err; 866 int err;
859 867
860 if (fa->fa_tos && 868 if (fa->fa_tos &&
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4be234c7d8c3..b2dea4e5da77 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.325" 46#define VERSION "0.402"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -62,6 +62,7 @@
62#include <linux/netdevice.h> 62#include <linux/netdevice.h>
63#include <linux/if_arp.h> 63#include <linux/if_arp.h>
64#include <linux/proc_fs.h> 64#include <linux/proc_fs.h>
65#include <linux/rcupdate.h>
65#include <linux/skbuff.h> 66#include <linux/skbuff.h>
66#include <linux/netlink.h> 67#include <linux/netlink.h>
67#include <linux/init.h> 68#include <linux/init.h>
@@ -77,56 +78,55 @@
77#undef CONFIG_IP_FIB_TRIE_STATS 78#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384 79#define MAX_CHILDS 16384
79 80
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key)) 81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) 82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) 83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84 84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key; 85typedef unsigned int t_key;
88 86
89#define T_TNODE 0 87#define T_TNODE 0
90#define T_LEAF 1 88#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL 89#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \ 90#define NODE_PARENT(node) \
93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) 91 ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
94#define NODE_SET_PARENT(_node, _ptr) \ 92
95((_node)->_parent = (((unsigned long)(_ptr)) | \ 93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
96 ((_node)->_parent & NODE_TYPE_MASK))) 94
97#define NODE_INIT_PARENT(_node, _type) \ 95#define NODE_SET_PARENT(node, ptr) \
98((_node)->_parent = (_type)) 96 rcu_assign_pointer((node)->parent, \
99#define NODE_TYPE(_node) \ 97 ((unsigned long)(ptr)) | NODE_TYPE(node))
100((_node)->_parent & NODE_TYPE_MASK) 98
101 99#define IS_TNODE(n) (!(n->parent & T_LEAF))
102#define IS_TNODE(n) (!(n->_parent & T_LEAF)) 100#define IS_LEAF(n) (n->parent & T_LEAF)
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104 101
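The reworked macros keep the old trick of storing the node type (T_LEAF/T_TNODE) in the low bit of the parent word, which is free because tnodes and leaves are at least word-aligned, and they now route updates of that word through rcu_assign_pointer()/rcu_dereference(). A plain C sketch of just the tagging part (pack/unpack are made-up names for illustration):

    #include <assert.h>
    #include <stdint.h>

    #define TYPE_MASK 0x1UL	/* low bit: leaf or internal node */

    /* Pack a type bit into the low bit of an aligned pointer; the
     * bit is free because the structs are at least word-aligned. */
    static uintptr_t pack(void *ptr, unsigned long type)
    {
    	assert(((uintptr_t)ptr & TYPE_MASK) == 0);
    	return (uintptr_t)ptr | type;
    }

    static void *unpack_ptr(uintptr_t word)
    {
    	return (void *)(word & ~TYPE_MASK);
    }

    static unsigned long unpack_type(uintptr_t word)
    {
    	return word & TYPE_MASK;
    }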
105struct node { 102struct node {
106 t_key key; 103 t_key key;
107 unsigned long _parent; 104 unsigned long parent;
108}; 105};
109 106
110struct leaf { 107struct leaf {
111 t_key key; 108 t_key key;
112 unsigned long _parent; 109 unsigned long parent;
113 struct hlist_head list; 110 struct hlist_head list;
111 struct rcu_head rcu;
114}; 112};
115 113
116struct leaf_info { 114struct leaf_info {
117 struct hlist_node hlist; 115 struct hlist_node hlist;
116 struct rcu_head rcu;
118 int plen; 117 int plen;
119 struct list_head falh; 118 struct list_head falh;
120}; 119};
121 120
122struct tnode { 121struct tnode {
123 t_key key; 122 t_key key;
124 unsigned long _parent; 123 unsigned long parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ 124 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ 125 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */ 126 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */ 127 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0]; 128 struct rcu_head rcu;
129 struct node *child[0];
130}; 130};
131 131
132#ifdef CONFIG_IP_FIB_TRIE_STATS 132#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -147,116 +147,76 @@ struct trie_stat {
147 unsigned int leaves; 147 unsigned int leaves;
148 unsigned int nullpointers; 148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS]; 149 unsigned int nodesizes[MAX_CHILDS];
150}; 150};
151 151
152struct trie { 152struct trie {
153 struct node *trie; 153 struct node *trie;
154#ifdef CONFIG_IP_FIB_TRIE_STATS 154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats; 155 struct trie_use_stats stats;
156#endif 156#endif
157 int size; 157 int size;
158 unsigned int revision; 158 unsigned int revision;
159}; 159};
160 160
161static int trie_debug = 0;
162
163static int tnode_full(struct tnode *tn, struct node *n);
164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 161static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166static int tnode_child_length(struct tnode *tn);
167static struct node *resize(struct trie *t, struct tnode *tn); 163static struct node *resize(struct trie *t, struct tnode *tn);
168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); 164static struct tnode *inflate(struct trie *t, struct tnode *tn);
169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); 165static struct tnode *halve(struct trie *t, struct tnode *tn);
170static void tnode_free(struct tnode *tn); 166static void tnode_free(struct tnode *tn);
171static void trie_dump_seq(struct seq_file *seq, struct trie *t); 167static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173extern int fib_detect_death(struct fib_info *fi, int order,
174 struct fib_info **last_resort, int *last_idx, int *dflt);
175 168
176extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, 169static kmem_cache_t *fn_alias_kmem __read_mostly;
177 struct nlmsghdr *n, struct netlink_skb_parms *req);
178
179static kmem_cache_t *fn_alias_kmem;
180static struct trie *trie_local = NULL, *trie_main = NULL; 170static struct trie *trie_local = NULL, *trie_main = NULL;
181 171
182static void trie_bug(char *err)
183{
184 printk("Trie Bug: %s\n", err);
185 BUG();
186}
187 172
188static inline struct node *tnode_get_child(struct tnode *tn, int i) 173/* rcu_read_lock must be held by the caller on the read side */
174
175static inline struct node *tnode_get_child(struct tnode *tn, int i)
189{ 176{
190 if (i >= 1<<tn->bits) 177 BUG_ON(i >= 1 << tn->bits);
191 trie_bug("tnode_get_child");
192 178
193 return tn->child[i]; 179 return rcu_dereference(tn->child[i]);
194} 180}
195 181
196static inline int tnode_child_length(struct tnode *tn) 182static inline int tnode_child_length(const struct tnode *tn)
197{ 183{
198 return 1<<tn->bits; 184 return 1 << tn->bits;
199} 185}
200 186
201/*
202 _________________________________________________________________
203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204 ----------------------------------------------------------------
205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
206
207 _________________________________________________________________
208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209 -----------------------------------------------------------------
210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
211
212 tp->pos = 7
213 tp->bits = 3
214 n->pos = 15
215 n->bits=4
216 KEYLENGTH=32
217*/
218
219static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 187static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
220{ 188{
221 if (offset < KEYLENGTH) 189 if (offset < KEYLENGTH)
222 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 190 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
223 else 191 else
224 return 0; 192 return 0;
225} 193}
226 194
227static inline int tkey_equals(t_key a, t_key b) 195static inline int tkey_equals(t_key a, t_key b)
228{ 196{
229 return a == b; 197 return a == b;
230} 198}
231 199
232static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) 200static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
233{ 201{
234 if (bits == 0 || offset >= KEYLENGTH) 202 if (bits == 0 || offset >= KEYLENGTH)
235 return 1; 203 return 1;
236 bits = bits > KEYLENGTH ? KEYLENGTH : bits; 204 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; 205 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
238} 206}
239 207
240static inline int tkey_mismatch(t_key a, int offset, t_key b) 208static inline int tkey_mismatch(t_key a, int offset, t_key b)
241{ 209{
242 t_key diff = a ^ b; 210 t_key diff = a ^ b;
243 int i = offset; 211 int i = offset;
244 212
245 if(!diff) 213 if (!diff)
246 return 0; 214 return 0;
247 while((diff << i) >> (KEYLENGTH-1) == 0) 215 while ((diff << i) >> (KEYLENGTH-1) == 0)
248 i++; 216 i++;
249 return i; 217 return i;
250} 218}
251 219
252/* Candidate for fib_semantics */
253
254static void fn_free_alias(struct fib_alias *fa)
255{
256 fib_release_info(fa->fa_info);
257 kmem_cache_free(fn_alias_kmem, fa);
258}
259
260/* 220/*
261 To understand this stuff, an understanding of keys and all their bits is 221 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not 222 necessary. Every node in the trie has a key associated with it, but not
@@ -295,7 +255,7 @@ static void fn_free_alias(struct fib_alias *fa)
295 tp->pos = 7 255 tp->pos = 7
296 tp->bits = 3 256 tp->bits = 3
297 n->pos = 15 257 n->pos = 15
298 n->bits=4 258 n->bits = 4
299 259
300 First, let's just ignore the bits that come before the parent tp, that is 260 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do 261 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
@@ -314,65 +274,71 @@ static void fn_free_alias(struct fib_alias *fa)
314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into 274 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315 n's child array, and will of course be different for each child. 275 n's child array, and will of course be different for each child.
316 276
277
317 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown 278 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
318 at this point. 279 at this point.
319 280
320*/ 281*/
321 282
322static void check_tnode(struct tnode *tn) 283static inline void check_tnode(const struct tnode *tn)
323{ 284{
324 if(tn && tn->pos+tn->bits > 32) { 285 WARN_ON(tn && tn->pos+tn->bits > 32);
325 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
326 }
327} 286}
328 287
329static int halve_threshold = 25; 288static int halve_threshold = 25;
330static int inflate_threshold = 50; 289static int inflate_threshold = 50;
331 290
332static struct leaf *leaf_new(void) 291
292static void __alias_free_mem(struct rcu_head *head)
333{ 293{
334 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); 294 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
335 if(l) { 295 kmem_cache_free(fn_alias_kmem, fa);
336 NODE_INIT_PARENT(l, T_LEAF);
337 INIT_HLIST_HEAD(&l->list);
338 }
339 return l;
340} 296}
341 297
342static struct leaf_info *leaf_info_new(int plen) 298static inline void alias_free_mem_rcu(struct fib_alias *fa)
343{ 299{
344 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 300 call_rcu(&fa->rcu, __alias_free_mem);
345 if(li) { 301}
346 li->plen = plen; 302
347 INIT_LIST_HEAD(&li->falh); 303static void __leaf_free_rcu(struct rcu_head *head)
348 } 304{
349 return li; 305 kfree(container_of(head, struct leaf, rcu));
306}
307
308static inline void free_leaf(struct leaf *leaf)
309{
310 call_rcu(&leaf->rcu, __leaf_free_rcu);
350} 311}
351 312
352static inline void free_leaf(struct leaf *l) 313static void __leaf_info_free_rcu(struct rcu_head *head)
353{ 314{
354 kfree(l); 315 kfree(container_of(head, struct leaf_info, rcu));
355} 316}
356 317
357static inline void free_leaf_info(struct leaf_info *li) 318static inline void free_leaf_info(struct leaf_info *leaf)
358{ 319{
359 kfree(li); 320 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
360} 321}
361 322
362static struct tnode *tnode_alloc(unsigned int size) 323static struct tnode *tnode_alloc(unsigned int size)
363{ 324{
364 if (size <= PAGE_SIZE) { 325 struct page *pages;
365 return kmalloc(size, GFP_KERNEL); 326
366 } else { 327 if (size <= PAGE_SIZE)
367 return (struct tnode *) 328 return kcalloc(size, 1, GFP_KERNEL);
368 __get_free_pages(GFP_KERNEL, get_order(size)); 329
369 } 330 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
331 if (!pages)
332 return NULL;
333
334 return page_address(pages);
370} 335}
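tnode_alloc() now returns zeroed memory and picks the allocator by size: slab for anything that fits in a page, whole pages otherwise, since child arrays of wide tnodes can far exceed what kmalloc handles gracefully. Roughly this, as a standalone hedged sketch (big_zalloc is an invented name):

    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <linux/slab.h>

    /* Zeroed allocation that falls back to whole pages when the
     * request outgrows the slab allocator. */
    static void *big_zalloc(size_t size)
    {
    	struct page *pages;

    	if (size <= PAGE_SIZE)
    		return kcalloc(1, size, GFP_KERNEL);

    	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
    	if (!pages)
    		return NULL;

    	return page_address(pages);
    }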
371 336
372static void __tnode_free(struct tnode *tn) 337static void __tnode_free_rcu(struct rcu_head *head)
373{ 338{
339 struct tnode *tn = container_of(head, struct tnode, rcu);
374 unsigned int size = sizeof(struct tnode) + 340 unsigned int size = sizeof(struct tnode) +
375 (1<<tn->bits) * sizeof(struct node *); 341 (1 << tn->bits) * sizeof(struct node *);
376 342
377 if (size <= PAGE_SIZE) 343 if (size <= PAGE_SIZE)
378 kfree(tn); 344 kfree(tn);
@@ -380,45 +346,50 @@ static void __tnode_free(struct tnode *tn)
380 free_pages((unsigned long)tn, get_order(size)); 346 free_pages((unsigned long)tn, get_order(size));
381} 347}
382 348
349static inline void tnode_free(struct tnode *tn)
350{
351 call_rcu(&tn->rcu, __tnode_free_rcu);
352}
353
354static struct leaf *leaf_new(void)
355{
356 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
357 if (l) {
358 l->parent = T_LEAF;
359 INIT_HLIST_HEAD(&l->list);
360 }
361 return l;
362}
363
364static struct leaf_info *leaf_info_new(int plen)
365{
366 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
367 if (li) {
368 li->plen = plen;
369 INIT_LIST_HEAD(&li->falh);
370 }
371 return li;
372}
373
383static struct tnode* tnode_new(t_key key, int pos, int bits) 374static struct tnode* tnode_new(t_key key, int pos, int bits)
384{ 375{
385 int nchildren = 1<<bits; 376 int nchildren = 1<<bits;
386 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 377 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
387 struct tnode *tn = tnode_alloc(sz); 378 struct tnode *tn = tnode_alloc(sz);
388 379
389 if(tn) { 380 if (tn) {
390 memset(tn, 0, sz); 381 memset(tn, 0, sz);
391 NODE_INIT_PARENT(tn, T_TNODE); 382 tn->parent = T_TNODE;
392 tn->pos = pos; 383 tn->pos = pos;
393 tn->bits = bits; 384 tn->bits = bits;
394 tn->key = key; 385 tn->key = key;
395 tn->full_children = 0; 386 tn->full_children = 0;
396 tn->empty_children = 1<<bits; 387 tn->empty_children = 1<<bits;
397 } 388 }
398 if(trie_debug > 0)
399 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
400 (unsigned int) (sizeof(struct node) * 1<<bits));
401 return tn;
402}
403 389
404static void tnode_free(struct tnode *tn) 390 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
405{ 391 (unsigned int) (sizeof(struct node) * 1<<bits));
406 if(!tn) { 392 return tn;
407 trie_bug("tnode_free\n");
408 }
409 if(IS_LEAF(tn)) {
410 free_leaf((struct leaf *)tn);
411 if(trie_debug > 0 )
412 printk("FL %p \n", tn);
413 }
414 else if(IS_TNODE(tn)) {
415 __tnode_free(tn);
416 if(trie_debug > 0 )
417 printk("FT %p \n", tn);
418 }
419 else {
420 trie_bug("tnode_free\n");
421 }
422} 393}
423 394
424/* 395/*
@@ -426,70 +397,65 @@ static void tnode_free(struct tnode *tn)
426 * and no bits are skipped. See discussion in dyntree paper p. 6 397 * and no bits are skipped. See discussion in dyntree paper p. 6
427 */ 398 */
428 399
429static inline int tnode_full(struct tnode *tn, struct node *n) 400static inline int tnode_full(const struct tnode *tn, const struct node *n)
430{ 401{
431 if(n == NULL || IS_LEAF(n)) 402 if (n == NULL || IS_LEAF(n))
432 return 0; 403 return 0;
433 404
434 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 405 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
435} 406}
436 407
437static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) 408static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
438{ 409{
439 tnode_put_child_reorg(tn, i, n, -1); 410 tnode_put_child_reorg(tn, i, n, -1);
440} 411}
441 412
442 /* 413 /*
443 * Add a child at position i overwriting the old value. 414 * Add a child at position i overwriting the old value.
444 * Update the value of full_children and empty_children. 415 * Update the value of full_children and empty_children.
445 */ 416 */
446 417
447static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) 418static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
448{ 419{
449 struct node *chi; 420 struct node *chi = tn->child[i];
450 int isfull; 421 int isfull;
451 422
452 if(i >= 1<<tn->bits) { 423 BUG_ON(i >= 1<<tn->bits);
453 printk("bits=%d, i=%d\n", tn->bits, i); 424
454 trie_bug("tnode_put_child_reorg bits");
455 }
456 write_lock_bh(&fib_lock);
457 chi = tn->child[i];
458 425
459 /* update emptyChildren */ 426 /* update emptyChildren */
460 if (n == NULL && chi != NULL) 427 if (n == NULL && chi != NULL)
461 tn->empty_children++; 428 tn->empty_children++;
462 else if (n != NULL && chi == NULL) 429 else if (n != NULL && chi == NULL)
463 tn->empty_children--; 430 tn->empty_children--;
464 431
465 /* update fullChildren */ 432 /* update fullChildren */
466 if (wasfull == -1) 433 if (wasfull == -1)
467 wasfull = tnode_full(tn, chi); 434 wasfull = tnode_full(tn, chi);
468 435
469 isfull = tnode_full(tn, n); 436 isfull = tnode_full(tn, n);
470 if (wasfull && !isfull) 437 if (wasfull && !isfull)
471 tn->full_children--; 438 tn->full_children--;
472 439 else if (!wasfull && isfull)
473 else if (!wasfull && isfull)
474 tn->full_children++; 440 tn->full_children++;
475 if(n)
476 NODE_SET_PARENT(n, tn);
477 441
478 tn->child[i] = n; 442 if (n)
479 write_unlock_bh(&fib_lock); 443 NODE_SET_PARENT(n, tn);
444
445 rcu_assign_pointer(tn->child[i], n);
480} 446}
481 447
482static struct node *resize(struct trie *t, struct tnode *tn) 448static struct node *resize(struct trie *t, struct tnode *tn)
483{ 449{
484 int i; 450 int i;
485 int err = 0; 451 int err = 0;
452 struct tnode *old_tn;
486 453
487 if (!tn) 454 if (!tn)
488 return NULL; 455 return NULL;
489 456
490 if(trie_debug) 457 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
491 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 458 tn, inflate_threshold, halve_threshold);
492 tn, inflate_threshold, halve_threshold);
493 459
494 /* No children */ 460 /* No children */
495 if (tn->empty_children == tnode_child_length(tn)) { 461 if (tn->empty_children == tnode_child_length(tn)) {
@@ -499,95 +465,92 @@ static struct node *resize(struct trie *t, struct tnode *tn)
499 /* One child */ 465 /* One child */
500 if (tn->empty_children == tnode_child_length(tn) - 1) 466 if (tn->empty_children == tnode_child_length(tn) - 1)
501 for (i = 0; i < tnode_child_length(tn); i++) { 467 for (i = 0; i < tnode_child_length(tn); i++) {
468 struct node *n;
502 469
503 write_lock_bh(&fib_lock); 470 n = tn->child[i];
504 if (tn->child[i] != NULL) { 471 if (!n)
505 472 continue;
506 /* compress one level */
507 struct node *n = tn->child[i];
508 if(n)
509 NODE_INIT_PARENT(n, NODE_TYPE(n));
510 473
511 write_unlock_bh(&fib_lock); 474 /* compress one level */
512 tnode_free(tn); 475 NODE_SET_PARENT(n, NULL);
513 return n; 476 tnode_free(tn);
514 } 477 return n;
515 write_unlock_bh(&fib_lock);
516 } 478 }
517 /* 479 /*
518 * Double as long as the resulting node has a number of 480 * Double as long as the resulting node has a number of
519 * nonempty nodes that are above the threshold. 481 * nonempty nodes that are above the threshold.
520 */ 482 */
521 483
522 /* 484 /*
523 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of 485 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
524 * the Helsinki University of Technology and Matti Tikkanen of Nokia 486 * the Helsinki University of Technology and Matti Tikkanen of Nokia
525 * Telecommunications, page 6: 487 * Telecommunications, page 6:
526 * "A node is doubled if the ratio of non-empty children to all 488 * "A node is doubled if the ratio of non-empty children to all
527 * children in the *doubled* node is at least 'high'." 489 * children in the *doubled* node is at least 'high'."
528 * 490 *
529 * 'high' in this instance is the variable 'inflate_threshold'. It 491 * 'high' in this instance is the variable 'inflate_threshold'. It
530 * is expressed as a percentage, so we multiply it with 492 * is expressed as a percentage, so we multiply it with
531 * tnode_child_length() and instead of multiplying by 2 (since the 493 * tnode_child_length() and instead of multiplying by 2 (since the
532 * child array will be doubled by inflate()) and multiplying 494 * child array will be doubled by inflate()) and multiplying
533 * the left-hand side by 100 (to handle the percentage thing) we 495 * the left-hand side by 100 (to handle the percentage thing) we
534 * multiply the left-hand side by 50. 496 * multiply the left-hand side by 50.
535 * 497 *
536 * The left-hand side may look a bit weird: tnode_child_length(tn) 498 * The left-hand side may look a bit weird: tnode_child_length(tn)
537 * - tn->empty_children is of course the number of non-null children 499 * - tn->empty_children is of course the number of non-null children
538 * in the current node. tn->full_children is the number of "full" 500 * in the current node. tn->full_children is the number of "full"
539 * children, that is non-null tnodes with a skip value of 0. 501 * children, that is non-null tnodes with a skip value of 0.
540 * All of those will be doubled in the resulting inflated tnode, so 502 * All of those will be doubled in the resulting inflated tnode, so
541 * we just count them one extra time here. 503 * we just count them one extra time here.
542 * 504 *
543 * A clearer way to write this would be: 505 * A clearer way to write this would be:
544 * 506 *
545 * to_be_doubled = tn->full_children; 507 * to_be_doubled = tn->full_children;
546 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - 508 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
547 * tn->full_children; 509 * tn->full_children;
548 * 510 *
549 * new_child_length = tnode_child_length(tn) * 2; 511 * new_child_length = tnode_child_length(tn) * 2;
550 * 512 *
551 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / 513 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
552 * new_child_length; 514 * new_child_length;
553 * if (new_fill_factor >= inflate_threshold) 515 * if (new_fill_factor >= inflate_threshold)
554 * 516 *
555 * ...and so on, tho it would mess up the while() loop. 517 * ...and so on, tho it would mess up the while () loop.
556 * 518 *
557 * anyway, 519 * anyway,
558 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= 520 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
559 * inflate_threshold 521 * inflate_threshold
560 * 522 *
561 * avoid a division: 523 * avoid a division:
562 * 100 * (not_to_be_doubled + 2*to_be_doubled) >= 524 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
563 * inflate_threshold * new_child_length 525 * inflate_threshold * new_child_length
564 * 526 *
565 * expand not_to_be_doubled and to_be_doubled, and shorten: 527 * expand not_to_be_doubled and to_be_doubled, and shorten:
566 * 100 * (tnode_child_length(tn) - tn->empty_children + 528 * 100 * (tnode_child_length(tn) - tn->empty_children +
567 * tn->full_children ) >= inflate_threshold * new_child_length 529 * tn->full_children) >= inflate_threshold * new_child_length
568 * 530 *
569 * expand new_child_length: 531 * expand new_child_length:
570 * 100 * (tnode_child_length(tn) - tn->empty_children + 532 * 100 * (tnode_child_length(tn) - tn->empty_children +
571 * tn->full_children ) >= 533 * tn->full_children) >=
572 * inflate_threshold * tnode_child_length(tn) * 2 534 * inflate_threshold * tnode_child_length(tn) * 2
573 * 535 *
574 * shorten again: 536 * shorten again:
575 * 50 * (tn->full_children + tnode_child_length(tn) - 537 * 50 * (tn->full_children + tnode_child_length(tn) -
576 * tn->empty_children ) >= inflate_threshold * 538 * tn->empty_children) >= inflate_threshold *
577 * tnode_child_length(tn) 539 * tnode_child_length(tn)
578 * 540 *
579 */ 541 */
580 542
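To make the final inequality concrete (illustrative numbers, not taken from the source): consider a tnode with bits = 4, so tnode_child_length(tn) = 16, with empty_children = 6, full_children = 3, and inflate_threshold = 50. Then 50 * (3 + 16 - 6) = 650, which is below 50 * 16 = 800, so the node is left alone. With only 2 empty slots and 6 full children, 50 * (6 + 16 - 2) = 1000 >= 800 and inflate() runs.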
581 check_tnode(tn); 543 check_tnode(tn);
582 544
583 err = 0; 545 err = 0;
584 while ((tn->full_children > 0 && 546 while ((tn->full_children > 0 &&
585 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 547 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
586 inflate_threshold * tnode_child_length(tn))) { 548 inflate_threshold * tnode_child_length(tn))) {
587 549
588 tn = inflate(t, tn, &err); 550 old_tn = tn;
589 551 tn = inflate(t, tn);
590 if(err) { 552 if (IS_ERR(tn)) {
553 tn = old_tn;
591#ifdef CONFIG_IP_FIB_TRIE_STATS 554#ifdef CONFIG_IP_FIB_TRIE_STATS
592 t->stats.resize_node_skipped++; 555 t->stats.resize_node_skipped++;
593#endif 556#endif
@@ -607,9 +570,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
607 100 * (tnode_child_length(tn) - tn->empty_children) < 570 100 * (tnode_child_length(tn) - tn->empty_children) <
608 halve_threshold * tnode_child_length(tn)) { 571 halve_threshold * tnode_child_length(tn)) {
609 572
610 tn = halve(t, tn, &err); 573 old_tn = tn;
611 574 tn = halve(t, tn);
612 if(err) { 575 if (IS_ERR(tn)) {
576 tn = old_tn;
613#ifdef CONFIG_IP_FIB_TRIE_STATS 577#ifdef CONFIG_IP_FIB_TRIE_STATS
614 t->stats.resize_node_skipped++; 578 t->stats.resize_node_skipped++;
615#endif 579#endif
@@ -617,55 +581,48 @@ static struct node *resize(struct trie *t, struct tnode *tn)
617 } 581 }
618 } 582 }
619 583
620
621 /* Only one child remains */
622 584
585 /* Only one child remains */
623 if (tn->empty_children == tnode_child_length(tn) - 1) 586 if (tn->empty_children == tnode_child_length(tn) - 1)
624 for (i = 0; i < tnode_child_length(tn); i++) { 587 for (i = 0; i < tnode_child_length(tn); i++) {
625 588 struct node *n;
626 write_lock_bh(&fib_lock); 589
627 if (tn->child[i] != NULL) { 590 n = tn->child[i];
628 /* compress one level */ 591 if (!n)
629 struct node *n = tn->child[i]; 592 continue;
630 593
631 if(n) 594 /* compress one level */
632 NODE_INIT_PARENT(n, NODE_TYPE(n)); 595
633 596 NODE_SET_PARENT(n, NULL);
634 write_unlock_bh(&fib_lock); 597 tnode_free(tn);
635 tnode_free(tn); 598 return n;
636 return n;
637 }
638 write_unlock_bh(&fib_lock);
639 } 599 }
640 600
641 return (struct node *) tn; 601 return (struct node *) tn;
642} 602}
643 603
644static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) 604static struct tnode *inflate(struct trie *t, struct tnode *tn)
645{ 605{
646 struct tnode *inode; 606 struct tnode *inode;
647 struct tnode *oldtnode = tn; 607 struct tnode *oldtnode = tn;
648 int olen = tnode_child_length(tn); 608 int olen = tnode_child_length(tn);
649 int i; 609 int i;
650 610
651 if(trie_debug) 611 pr_debug("In inflate\n");
652 printk("In inflate\n");
653 612
654 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 613 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
655 614
656 if (!tn) { 615 if (!tn)
657 *err = -ENOMEM; 616 return ERR_PTR(-ENOMEM);
658 return oldtnode;
659 }
660 617
661 /* 618 /*
662 * Preallocate and store tnodes before the actual work so we 619 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation 620 * don't get into an inconsistent state if memory allocation
664 * fails. In case of failure we return the oldnode and inflate 621 * fails. In case of failure we return the oldnode and inflate
665 * of tnode is ignored. 622 * of tnode is ignored.
666 */ 623 */
667 624
668 for(i = 0; i < olen; i++) { 625 for (i = 0; i < olen; i++) {
669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); 626 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670 627
671 if (inode && 628 if (inode &&
@@ -673,56 +630,40 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
673 inode->pos == oldtnode->pos + oldtnode->bits && 630 inode->pos == oldtnode->pos + oldtnode->bits &&
674 inode->bits > 1) { 631 inode->bits > 1) {
675 struct tnode *left, *right; 632 struct tnode *left, *right;
676
677 t_key m = TKEY_GET_MASK(inode->pos, 1); 633 t_key m = TKEY_GET_MASK(inode->pos, 1);
678 634
679 left = tnode_new(inode->key&(~m), inode->pos + 1, 635 left = tnode_new(inode->key&(~m), inode->pos + 1,
680 inode->bits - 1); 636 inode->bits - 1);
637 if (!left)
638 goto nomem;
681 639
682 if(!left) {
683 *err = -ENOMEM;
684 break;
685 }
686
687 right = tnode_new(inode->key|m, inode->pos + 1, 640 right = tnode_new(inode->key|m, inode->pos + 1,
688 inode->bits - 1); 641 inode->bits - 1);
689 642
690 if(!right) { 643 if (!right) {
691 *err = -ENOMEM; 644 tnode_free(left);
692 break; 645 goto nomem;
693 } 646 }
694 647
695 put_child(t, tn, 2*i, (struct node *) left); 648 put_child(t, tn, 2*i, (struct node *) left);
696 put_child(t, tn, 2*i+1, (struct node *) right); 649 put_child(t, tn, 2*i+1, (struct node *) right);
697 } 650 }
698 } 651 }
699 652
700 if(*err) { 653 for (i = 0; i < olen; i++) {
701 int size = tnode_child_length(tn);
702 int j;
703
704 for(j = 0; j < size; j++)
705 if( tn->child[j])
706 tnode_free((struct tnode *)tn->child[j]);
707
708 tnode_free(tn);
709
710 *err = -ENOMEM;
711 return oldtnode;
712 }
713
714 for(i = 0; i < olen; i++) {
715 struct node *node = tnode_get_child(oldtnode, i); 654 struct node *node = tnode_get_child(oldtnode, i);
716 655 struct tnode *left, *right;
656 int size, j;
657
717 /* An empty child */ 658 /* An empty child */
718 if (node == NULL) 659 if (node == NULL)
719 continue; 660 continue;
720 661
721 /* A leaf or an internal node with skipped bits */ 662 /* A leaf or an internal node with skipped bits */
722 663
723 if(IS_LEAF(node) || ((struct tnode *) node)->pos > 664 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
724 tn->pos + tn->bits - 1) { 665 tn->pos + tn->bits - 1) {
725 if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 666 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
726 1) == 0) 667 1) == 0)
727 put_child(t, tn, 2*i, node); 668 put_child(t, tn, 2*i, node);
728 else 669 else
@@ -738,207 +679,212 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
738 put_child(t, tn, 2*i+1, inode->child[1]); 679 put_child(t, tn, 2*i+1, inode->child[1]);
739 680
740 tnode_free(inode); 681 tnode_free(inode);
682 continue;
741 } 683 }
742 684
743 /* An internal node with more than two children */ 685 /* An internal node with more than two children */
744 else { 686
745 struct tnode *left, *right; 687 /* We will replace this node 'inode' with two new
746 int size, j; 688 * ones, 'left' and 'right', each with half of the
747 689 * original children. The two new nodes will have
748 /* We will replace this node 'inode' with two new 690 * a position one bit further down the key and this
749 * ones, 'left' and 'right', each with half of the 691 * means that the "significant" part of their keys
750 * original children. The two new nodes will have 692 * (see the discussion near the top of this file)
751 * a position one bit further down the key and this 693 * will differ by one bit, which will be "0" in
752 * means that the "significant" part of their keys 694 * left's key and "1" in right's key. Since we are
753 * (see the discussion near the top of this file) 695 * moving the key position by one step, the bit that
754 * will differ by one bit, which will be "0" in 696 * we are moving away from - the bit at position
755 * left's key and "1" in right's key. Since we are 697 * (inode->pos) - is the one that will differ between
756 * moving the key position by one step, the bit that 698 * left and right. So... we synthesize that bit in the
757 * we are moving away from - the bit at position 699 * two new keys.
758 * (inode->pos) - is the one that will differ between 700 * The mask 'm' below will be a single "one" bit at
759 * left and right. So... we synthesize that bit in the 701 * the position (inode->pos)
760 * two new keys. 702 */
761 * The mask 'm' below will be a single "one" bit at
762 * the position (inode->pos)
763 */
764
765 /* Use the old key, but set the new significant
766 * bit to zero.
767 */
768 703
769 left = (struct tnode *) tnode_get_child(tn, 2*i); 704 /* Use the old key, but set the new significant
770 put_child(t, tn, 2*i, NULL); 705 * bit to zero.
706 */
771 707
772 if(!left) 708 left = (struct tnode *) tnode_get_child(tn, 2*i);
773 BUG(); 709 put_child(t, tn, 2*i, NULL);
774 710
775 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 711 BUG_ON(!left);
776 put_child(t, tn, 2*i+1, NULL);
777 712
778 if(!right) 713 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
779 BUG(); 714 put_child(t, tn, 2*i+1, NULL);
780 715
781 size = tnode_child_length(left); 716 BUG_ON(!right);
782 for(j = 0; j < size; j++) {
783 put_child(t, left, j, inode->child[j]);
784 put_child(t, right, j, inode->child[j + size]);
785 }
786 put_child(t, tn, 2*i, resize(t, left));
787 put_child(t, tn, 2*i+1, resize(t, right));
788 717
789 tnode_free(inode); 718 size = tnode_child_length(left);
719 for (j = 0; j < size; j++) {
720 put_child(t, left, j, inode->child[j]);
721 put_child(t, right, j, inode->child[j + size]);
790 } 722 }
723 put_child(t, tn, 2*i, resize(t, left));
724 put_child(t, tn, 2*i+1, resize(t, right));
725
726 tnode_free(inode);
791 } 727 }
792 tnode_free(oldtnode); 728 tnode_free(oldtnode);
793 return tn; 729 return tn;
730nomem:
731 {
732 int size = tnode_child_length(tn);
733 int j;
734
735 for (j = 0; j < size; j++)
736 if (tn->child[j])
737 tnode_free((struct tnode *)tn->child[j]);
738
739 tnode_free(tn);
740
741 return ERR_PTR(-ENOMEM);
742 }
794} 743}
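inflate() and halve() now report failure through the returned pointer itself instead of an int *err out-parameter: ERR_PTR() encodes a negative errno in the pointer and the caller tests it with IS_ERR(), as resize() does above when it restores old_tn. The convention in miniature, with an invented struct blob:

    #include <linux/err.h>
    #include <linux/slab.h>

    struct blob {
    	int len;
    };

    /* Encode -errno in the returned pointer; the caller checks
     * IS_ERR() and can keep using its old object on failure. */
    static struct blob *blob_new(int len)
    {
    	struct blob *b = kmalloc(sizeof(*b), GFP_KERNEL);

    	if (!b)
    		return ERR_PTR(-ENOMEM);

    	b->len = len;
    	return b;
    }

A caller mirrors resize(): new = blob_new(n); if IS_ERR(new), keep the old pointer (optionally extracting the errno with PTR_ERR(new)), otherwise adopt the new one.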
795 744
796static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) 745static struct tnode *halve(struct trie *t, struct tnode *tn)
797{ 746{
798 struct tnode *oldtnode = tn; 747 struct tnode *oldtnode = tn;
799 struct node *left, *right; 748 struct node *left, *right;
800 int i; 749 int i;
801 int olen = tnode_child_length(tn); 750 int olen = tnode_child_length(tn);
802 751
803 if(trie_debug) printk("In halve\n"); 752 pr_debug("In halve\n");
804
805 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
806 753
807 if (!tn) { 754 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808 *err = -ENOMEM; 755
809 return oldtnode; 756 if (!tn)
810 } 757 return ERR_PTR(-ENOMEM);
811 758
812 /* 759 /*
813 * Preallocate and store tnodes before the actual work so we 760 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation 761 * don't get into an inconsistent state if memory allocation
815 * fails. In case of failure we return the oldnode and halve 762 * fails. In case of failure we return the oldnode and halve
816 * of tnode is ignored. 763 * of tnode is ignored.
817 */ 764 */
818 765
819 for(i = 0; i < olen; i += 2) { 766 for (i = 0; i < olen; i += 2) {
820 left = tnode_get_child(oldtnode, i); 767 left = tnode_get_child(oldtnode, i);
821 right = tnode_get_child(oldtnode, i+1); 768 right = tnode_get_child(oldtnode, i+1);
822 769
823 /* Two nonempty children */ 770 /* Two nonempty children */
824 if( left && right) { 771 if (left && right) {
825 struct tnode *newBinNode = 772 struct tnode *newn;
826 tnode_new(left->key, tn->pos + tn->bits, 1);
827 773
828 if(!newBinNode) { 774 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
829 *err = -ENOMEM;
830 break;
831 }
832 put_child(t, tn, i/2, (struct node *)newBinNode);
833 }
834 }
835 775
836 if(*err) { 776 if (!newn)
837 int size = tnode_child_length(tn); 777 goto nomem;
838 int j;
839 778
840 for(j = 0; j < size; j++) 779 put_child(t, tn, i/2, (struct node *)newn);
841 if( tn->child[j]) 780 }
842 tnode_free((struct tnode *)tn->child[j]);
843 781
844 tnode_free(tn);
845
846 *err = -ENOMEM;
847 return oldtnode;
848 } 782 }
849 783
850 for(i = 0; i < olen; i += 2) { 784 for (i = 0; i < olen; i += 2) {
785 struct tnode *newBinNode;
786
851 left = tnode_get_child(oldtnode, i); 787 left = tnode_get_child(oldtnode, i);
852 right = tnode_get_child(oldtnode, i+1); 788 right = tnode_get_child(oldtnode, i+1);
853 789
854 /* At least one of the children is empty */ 790 /* At least one of the children is empty */
855 if (left == NULL) { 791 if (left == NULL) {
856 if (right == NULL) /* Both are empty */ 792 if (right == NULL) /* Both are empty */
857 continue; 793 continue;
858 put_child(t, tn, i/2, right); 794 put_child(t, tn, i/2, right);
859 } else if (right == NULL) 795 continue;
860 put_child(t, tn, i/2, left); 796 }
861
862 /* Two nonempty children */
863 else {
864 struct tnode *newBinNode =
865 (struct tnode *) tnode_get_child(tn, i/2);
866 put_child(t, tn, i/2, NULL);
867
868 if(!newBinNode)
869 BUG();
870 797
871 put_child(t, newBinNode, 0, left); 798 if (right == NULL) {
872 put_child(t, newBinNode, 1, right); 799 put_child(t, tn, i/2, left);
873 put_child(t, tn, i/2, resize(t, newBinNode)); 800 continue;
874 } 801 }
802
803 /* Two nonempty children */
804 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
805 put_child(t, tn, i/2, NULL);
806 put_child(t, newBinNode, 0, left);
807 put_child(t, newBinNode, 1, right);
808 put_child(t, tn, i/2, resize(t, newBinNode));
875 } 809 }
876 tnode_free(oldtnode); 810 tnode_free(oldtnode);
877 return tn; 811 return tn;
812nomem:
813 {
814 int size = tnode_child_length(tn);
815 int j;
816
817 for (j = 0; j < size; j++)
818 if (tn->child[j])
819 tnode_free((struct tnode *)tn->child[j]);
820
821 tnode_free(tn);
822
823 return ERR_PTR(-ENOMEM);
824 }
878} 825}
879 826
880static void *trie_init(struct trie *t) 827static void trie_init(struct trie *t)
881{ 828{
882 if(t) { 829 if (!t)
883 t->size = 0; 830 return;
884 t->trie = NULL; 831
885 t->revision = 0; 832 t->size = 0;
833 rcu_assign_pointer(t->trie, NULL);
834 t->revision = 0;
886#ifdef CONFIG_IP_FIB_TRIE_STATS 835#ifdef CONFIG_IP_FIB_TRIE_STATS
887 memset(&t->stats, 0, sizeof(struct trie_use_stats)); 836 memset(&t->stats, 0, sizeof(struct trie_use_stats));
888#endif 837#endif
889 }
890 return t;
891} 838}
892 839
840/* The read side must use rcu_read_lock; currently the dump
 841 routines do, via get_fa_head and dump */
842
893static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) 843static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
894{ 844{
895 struct hlist_node *node; 845 struct hlist_node *node;
896 struct leaf_info *li; 846 struct leaf_info *li;
897 847
898 hlist_for_each_entry(li, node, head, hlist) { 848 hlist_for_each_entry_rcu(li, node, head, hlist)
899 849 if (li->plen == plen)
900 if ( li->plen == plen )
901 return li; 850 return li;
902 } 851
903 return NULL; 852 return NULL;
904} 853}
905 854
906static inline struct list_head * get_fa_head(struct leaf *l, int plen) 855static inline struct list_head * get_fa_head(struct leaf *l, int plen)
907{ 856{
908 struct list_head *fa_head=NULL;
909 struct leaf_info *li = find_leaf_info(&l->list, plen); 857 struct leaf_info *li = find_leaf_info(&l->list, plen);
910 858
911 if(li) 859 if (!li)
912 fa_head = &li->falh; 860 return NULL;
913 861
914 return fa_head; 862 return &li->falh;
915} 863}
916 864
917static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 865static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
918{ 866{
919 struct leaf_info *li=NULL, *last=NULL; 867 struct leaf_info *li = NULL, *last = NULL;
920 struct hlist_node *node, *tmp; 868 struct hlist_node *node;
921 869
922 write_lock_bh(&fib_lock); 870 if (hlist_empty(head)) {
923 871 hlist_add_head_rcu(&new->hlist, head);
924 if(hlist_empty(head)) 872 } else {
925 hlist_add_head(&new->hlist, head); 873 hlist_for_each_entry(li, node, head, hlist) {
926 else { 874 if (new->plen > li->plen)
927 hlist_for_each_entry_safe(li, node, tmp, head, hlist) { 875 break;
928 876
929 if (new->plen > li->plen) 877 last = li;
930 break; 878 }
931 879 if (last)
932 last = li; 880 hlist_add_after_rcu(&last->hlist, &new->hlist);
933 } 881 else
934 if(last) 882 hlist_add_before_rcu(&new->hlist, &li->hlist);
935 hlist_add_after(&last->hlist, &new->hlist); 883 }
936 else
937 hlist_add_before(&new->hlist, &li->hlist);
938 }
939 write_unlock_bh(&fib_lock);
940} 884}
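The _rcu insertion helpers used here exist so that a concurrent lockless reader sees the list either before or after the insertion, never a half-linked node: the new element is fully initialized first and only then published with an ordered pointer store. The underlying publish/subscribe pair, reduced to a single pointer (struct box and the function names are illustrative):

    #include <linux/rcupdate.h>

    struct box {
    	int ready;
    };

    static struct box *global_box;

    /* Writer: initialize fully, then publish. rcu_assign_pointer()
     * orders the stores so a reader that sees the pointer also
     * sees the initialized field. */
    static void publish(struct box *b)
    {
    	b->ready = 1;
    	rcu_assign_pointer(global_box, b);
    }

    /* Reader: rcu_dereference() pairs with the ordered publish. */
    static int peek(void)
    {
    	struct box *b;
    	int r = 0;

    	rcu_read_lock();
    	b = rcu_dereference(global_box);
    	if (b)
    		r = b->ready;
    	rcu_read_unlock();
    	return r;
    }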
941 885
886/* rcu_read_lock must be held by the caller on the read side */
887
942static struct leaf * 888static struct leaf *
943fib_find_node(struct trie *t, u32 key) 889fib_find_node(struct trie *t, u32 key)
944{ 890{
@@ -947,73 +893,57 @@ fib_find_node(struct trie *t, u32 key)
947 struct node *n; 893 struct node *n;
948 894
949 pos = 0; 895 pos = 0;
950 n=t->trie; 896 n = rcu_dereference(t->trie);
951 897
952 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 898 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
953 tn = (struct tnode *) n; 899 tn = (struct tnode *) n;
954 900
955 check_tnode(tn); 901 check_tnode(tn);
956 902
957 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 903 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
958 pos=tn->pos + tn->bits; 904 pos = tn->pos + tn->bits;
959 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 905 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
960 } 906 } else
961 else
962 break; 907 break;
963 } 908 }
964 /* Case we have found a leaf. Compare prefixes */ 909 /* Case we have found a leaf. Compare prefixes */
965 910
966 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 911 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
967 struct leaf *l = (struct leaf *) n; 912 return (struct leaf *)n;
968 return l; 913
969 }
970 return NULL; 914 return NULL;
971} 915}
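Since fib_find_node() dereferences the trie root with rcu_dereference() and takes no lock itself, a read-side caller must bracket it with rcu_read_lock()/rcu_read_unlock() and must not hold the returned leaf past the unlock. A sketch of the expected calling pattern (not a verbatim caller from this file):

    struct leaf *l;

    rcu_read_lock();
    l = fib_find_node(t, key);
    if (l) {
    	/* inspect the leaf here; it is only guaranteed to stay
    	 * alive until the matching rcu_read_unlock() */
    }
    rcu_read_unlock();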
972 916
973static struct node *trie_rebalance(struct trie *t, struct tnode *tn) 917static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
974{ 918{
975 int i = 0;
976 int wasfull; 919 int wasfull;
977 t_key cindex, key; 920 t_key cindex, key;
978 struct tnode *tp = NULL; 921 struct tnode *tp = NULL;
979 922
980 if(!tn)
981 BUG();
982
983 key = tn->key; 923 key = tn->key;
984 i = 0;
985 924
986 while (tn != NULL && NODE_PARENT(tn) != NULL) { 925 while (tn != NULL && NODE_PARENT(tn) != NULL) {
987 926
988 if( i > 10 ) {
989 printk("Rebalance tn=%p \n", tn);
990 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
991
992 printk("Rebalance tp=%p \n", tp);
993 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
994 }
995
996 if( i > 12 ) BUG();
997 i++;
998
999 tp = NODE_PARENT(tn); 927 tp = NODE_PARENT(tn);
1000 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 928 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1001 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 929 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1002 tn = (struct tnode *) resize (t, (struct tnode *)tn); 930 tn = (struct tnode *) resize (t, (struct tnode *)tn);
1003 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); 931 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
1004 932
1005 if(!NODE_PARENT(tn)) 933 if (!NODE_PARENT(tn))
1006 break; 934 break;
1007 935
1008 tn = NODE_PARENT(tn); 936 tn = NODE_PARENT(tn);
1009 } 937 }
1010 /* Handle last (top) tnode */ 938 /* Handle last (top) tnode */
1011 if (IS_TNODE(tn)) 939 if (IS_TNODE(tn))
1012 tn = (struct tnode*) resize(t, (struct tnode *)tn); 940 tn = (struct tnode*) resize(t, (struct tnode *)tn);
1013 941
1014 return (struct node*) tn; 942 return (struct node*) tn;
1015} 943}
1016 944
945/* only used from the updater side */
946
1017static struct list_head * 947static struct list_head *
1018fib_insert_node(struct trie *t, int *err, u32 key, int plen) 948fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1019{ 949{
@@ -1022,68 +952,62 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1022 struct node *n; 952 struct node *n;
1023 struct leaf *l; 953 struct leaf *l;
1024 int missbit; 954 int missbit;
1025 struct list_head *fa_head=NULL; 955 struct list_head *fa_head = NULL;
1026 struct leaf_info *li; 956 struct leaf_info *li;
1027 t_key cindex; 957 t_key cindex;
1028 958
1029 pos = 0; 959 pos = 0;
1030 n=t->trie; 960 n = t->trie;
1031 961
1032 /* If we point to NULL, stop. Either the tree is empty and we should 962 /* If we point to NULL, stop. Either the tree is empty and we should
1033 * just put a new leaf in if, or we have reached an empty child slot, 963 * just put a new leaf in if, or we have reached an empty child slot,
1034 * and we should just put our new leaf in that. 964 * and we should just put our new leaf in that.
1035 * If we point to a T_TNODE, check if it matches our key. Note that 965 * If we point to a T_TNODE, check if it matches our key. Note that
1036 * a T_TNODE might be skipping any number of bits - its 'pos' need 966 * a T_TNODE might be skipping any number of bits - its 'pos' need
1037 * not be the parent's 'pos'+'bits'! 967 * not be the parent's 'pos'+'bits'!
1038 * 968 *
1039 * If it does match the current key, get pos/bits from it, extract 969 * If it does match the current key, get pos/bits from it, extract
1040 * the index from our key, push the T_TNODE and walk the tree. 970 * the index from our key, push the T_TNODE and walk the tree.
1041 * 971 *
1042 * If it doesn't, we have to replace it with a new T_TNODE. 972 * If it doesn't, we have to replace it with a new T_TNODE.
1043 * 973 *
1044 * If we point to a T_LEAF, it might or might not have the same key 974 * If we point to a T_LEAF, it might or might not have the same key
1045 * as we do. If it does, just change the value, update the T_LEAF's 975 * as we do. If it does, just change the value, update the T_LEAF's
1046 * value, and return it. 976 * value, and return it.
1047 * If it doesn't, we need to replace it with a T_TNODE. 977 * If it doesn't, we need to replace it with a T_TNODE.
1048 */ 978 */
1049 979
1050 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 980 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1051 tn = (struct tnode *) n; 981 tn = (struct tnode *) n;
1052 982
1053 check_tnode(tn); 983 check_tnode(tn);
1054 984
1055 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 985 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1056 tp = tn; 986 tp = tn;
1057 pos=tn->pos + tn->bits; 987 pos = tn->pos + tn->bits;
1058 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 988 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1059 989
1060 if(n && NODE_PARENT(n) != tn) { 990 BUG_ON(n && NODE_PARENT(n) != tn);
1061 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 991 } else
1062 BUG();
1063 }
1064 }
1065 else
1066 break; 992 break;
1067 } 993 }
1068 994
1069 /* 995 /*
1070 * n ----> NULL, LEAF or TNODE 996 * n ----> NULL, LEAF or TNODE
1071 * 997 *
1072 * tp is n's (parent) ----> NULL or TNODE 998 * tp is n's (parent) ----> NULL or TNODE
1073 */ 999 */
1074 1000
1075 if(tp && IS_LEAF(tp)) 1001 BUG_ON(tp && IS_LEAF(tp));
1076 BUG();
1077
1078 1002
1079 /* Case 1: n is a leaf. Compare prefixes */ 1003 /* Case 1: n is a leaf. Compare prefixes */
1080 1004
1081 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 1005 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1082 struct leaf *l = ( struct leaf *) n; 1006 struct leaf *l = (struct leaf *) n;
1083 1007
1084 li = leaf_info_new(plen); 1008 li = leaf_info_new(plen);
1085 1009
1086 if(! li) { 1010 if (!li) {
1087 *err = -ENOMEM; 1011 *err = -ENOMEM;
1088 goto err; 1012 goto err;
1089 } 1013 }
@@ -1095,7 +1019,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1095 t->size++; 1019 t->size++;
1096 l = leaf_new(); 1020 l = leaf_new();
1097 1021
1098 if(! l) { 1022 if (!l) {
1099 *err = -ENOMEM; 1023 *err = -ENOMEM;
1100 goto err; 1024 goto err;
1101 } 1025 }
@@ -1103,7 +1027,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1103 l->key = key; 1027 l->key = key;
1104 li = leaf_info_new(plen); 1028 li = leaf_info_new(plen);
1105 1029
1106 if(! li) { 1030 if (!li) {
1107 tnode_free((struct tnode *) l); 1031 tnode_free((struct tnode *) l);
1108 *err = -ENOMEM; 1032 *err = -ENOMEM;
1109 goto err; 1033 goto err;
@@ -1112,70 +1036,65 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1112 fa_head = &li->falh; 1036 fa_head = &li->falh;
1113 insert_leaf_info(&l->list, li); 1037 insert_leaf_info(&l->list, li);
1114 1038
1115 /* Case 2: n is NULL, and will just insert a new leaf */
1116 if (t->trie && n == NULL) { 1039 if (t->trie && n == NULL) {
1040 /* Case 2: n is NULL, and will just insert a new leaf */
1117 1041
1118 NODE_SET_PARENT(l, tp); 1042 NODE_SET_PARENT(l, tp);
1119
1120 if (!tp)
1121 BUG();
1122 1043
1123 else { 1044 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1124 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1045 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1125 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1046 } else {
1126 } 1047 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1127 } 1048 /*
1128 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1049 * Add a new tnode here
1129 else {
1130 /*
1131 * Add a new tnode here
1132 * first tnode need some special handling 1050 * first tnode need some special handling
1133 */ 1051 */
1134 1052
1135 if (tp) 1053 if (tp)
1136 pos=tp->pos+tp->bits; 1054 pos = tp->pos+tp->bits;
1137 else 1055 else
1138 pos=0; 1056 pos = 0;
1139 if(n) { 1057
1058 if (n) {
1140 newpos = tkey_mismatch(key, pos, n->key); 1059 newpos = tkey_mismatch(key, pos, n->key);
1141 tn = tnode_new(n->key, newpos, 1); 1060 tn = tnode_new(n->key, newpos, 1);
1142 } 1061 } else {
1143 else {
1144 newpos = 0; 1062 newpos = 0;
1145 tn = tnode_new(key, newpos, 1); /* First tnode */ 1063 tn = tnode_new(key, newpos, 1); /* First tnode */
1146 } 1064 }
1147 1065
1148 if(!tn) { 1066 if (!tn) {
1149 free_leaf_info(li); 1067 free_leaf_info(li);
1150 tnode_free((struct tnode *) l); 1068 tnode_free((struct tnode *) l);
1151 *err = -ENOMEM; 1069 *err = -ENOMEM;
1152 goto err; 1070 goto err;
1153 } 1071 }
1154 1072
1155 NODE_SET_PARENT(tn, tp); 1073 NODE_SET_PARENT(tn, tp);
1156 1074
1157 missbit=tkey_extract_bits(key, newpos, 1); 1075 missbit = tkey_extract_bits(key, newpos, 1);
1158 put_child(t, tn, missbit, (struct node *)l); 1076 put_child(t, tn, missbit, (struct node *)l);
1159 put_child(t, tn, 1-missbit, n); 1077 put_child(t, tn, 1-missbit, n);
1160 1078
1161 if(tp) { 1079 if (tp) {
1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1080 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1163 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); 1081 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1164 } 1082 } else {
1165 else { 1083 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1166 t->trie = (struct node*) tn; /* First tnode */
1167 tp = tn; 1084 tp = tn;
1168 } 1085 }
1169 } 1086 }
1170 if(tp && tp->pos+tp->bits > 32) { 1087
1171 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1088 if (tp && tp->pos + tp->bits > 32)
1089 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1172 tp, tp->pos, tp->bits, key, plen); 1090 tp, tp->pos, tp->bits, key, plen);
1173 } 1091
1174 /* Rebalance the trie */ 1092 /* Rebalance the trie */
1175 t->trie = trie_rebalance(t, tp); 1093
1094 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1176done: 1095done:
1177 t->revision++; 1096 t->revision++;
1178err:; 1097err:
1179 return fa_head; 1098 return fa_head;
1180} 1099}
1181 1100
@@ -1185,7 +1104,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1185{ 1104{
1186 struct trie *t = (struct trie *) tb->tb_data; 1105 struct trie *t = (struct trie *) tb->tb_data;
1187 struct fib_alias *fa, *new_fa; 1106 struct fib_alias *fa, *new_fa;
1188 struct list_head *fa_head=NULL; 1107 struct list_head *fa_head = NULL;
1189 struct fib_info *fi; 1108 struct fib_info *fi;
1190 int plen = r->rtm_dst_len; 1109 int plen = r->rtm_dst_len;
1191 int type = r->rtm_type; 1110 int type = r->rtm_type;
@@ -1198,28 +1117,29 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1198 return -EINVAL; 1117 return -EINVAL;
1199 1118
1200 key = 0; 1119 key = 0;
1201 if (rta->rta_dst) 1120 if (rta->rta_dst)
1202 memcpy(&key, rta->rta_dst, 4); 1121 memcpy(&key, rta->rta_dst, 4);
1203 1122
1204 key = ntohl(key); 1123 key = ntohl(key);
1205 1124
1206 if(trie_debug) 1125 pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1207 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1208 1126
1209 mask = ntohl( inet_make_mask(plen) ); 1127 mask = ntohl(inet_make_mask(plen));
1210 1128
1211 if(key & ~mask) 1129 if (key & ~mask)
1212 return -EINVAL; 1130 return -EINVAL;
1213 1131
1214 key = key & mask; 1132 key = key & mask;
1215 1133
1216 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) 1134 fi = fib_create_info(r, rta, nlhdr, &err);
1135
1136 if (!fi)
1217 goto err; 1137 goto err;
1218 1138
1219 l = fib_find_node(t, key); 1139 l = fib_find_node(t, key);
1220 fa = NULL; 1140 fa = NULL;
1221 1141
1222 if(l) { 1142 if (l) {
1223 fa_head = get_fa_head(l, plen); 1143 fa_head = get_fa_head(l, plen);
1224 fa = fib_find_alias(fa_head, tos, fi->fib_priority); 1144 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1225 } 1145 }
@@ -1235,8 +1155,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1235 * and we need to allocate a new one of those as well. 1155 * and we need to allocate a new one of those as well.
1236 */ 1156 */
1237 1157
1238 if (fa && 1158 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1239 fa->fa_info->fib_priority == fi->fib_priority) {
1240 struct fib_alias *fa_orig; 1159 struct fib_alias *fa_orig;
1241 1160
1242 err = -EEXIST; 1161 err = -EEXIST;
@@ -1247,22 +1166,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1247 struct fib_info *fi_drop; 1166 struct fib_info *fi_drop;
1248 u8 state; 1167 u8 state;
1249 1168
1250 write_lock_bh(&fib_lock); 1169 err = -ENOBUFS;
1170 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1171 if (new_fa == NULL)
1172 goto out;
1251 1173
1252 fi_drop = fa->fa_info; 1174 fi_drop = fa->fa_info;
1253 fa->fa_info = fi; 1175 new_fa->fa_tos = fa->fa_tos;
1254 fa->fa_type = type; 1176 new_fa->fa_info = fi;
1255 fa->fa_scope = r->rtm_scope; 1177 new_fa->fa_type = type;
1178 new_fa->fa_scope = r->rtm_scope;
1256 state = fa->fa_state; 1179 state = fa->fa_state;
1257 fa->fa_state &= ~FA_S_ACCESSED; 1180 new_fa->fa_state &= ~FA_S_ACCESSED;
1258 1181
1259 write_unlock_bh(&fib_lock); 1182 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1183 alias_free_mem_rcu(fa);
1260 1184
1261 fib_release_info(fi_drop); 1185 fib_release_info(fi_drop);
1262 if (state & FA_S_ACCESSED) 1186 if (state & FA_S_ACCESSED)
1263 rt_cache_flush(-1); 1187 rt_cache_flush(-1);
1264 1188
1265 goto succeeded; 1189 goto succeeded;
1266 } 1190 }
1267 /* Error if we find a perfect match which 1191 /* Error if we find a perfect match which
1268 * uses the same scope, type, and nexthop 1192 * uses the same scope, type, and nexthop
@@ -1284,7 +1208,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1284 fa = fa_orig; 1208 fa = fa_orig;
1285 } 1209 }
1286 err = -ENOENT; 1210 err = -ENOENT;
1287 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) 1211 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1288 goto out; 1212 goto out;
1289 1213
1290 err = -ENOBUFS; 1214 err = -ENOBUFS;
@@ -1297,26 +1221,19 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1297 new_fa->fa_type = type; 1221 new_fa->fa_type = type;
1298 new_fa->fa_scope = r->rtm_scope; 1222 new_fa->fa_scope = r->rtm_scope;
1299 new_fa->fa_state = 0; 1223 new_fa->fa_state = 0;
1300#if 0
1301 new_fa->dst = NULL;
1302#endif
1303 /* 1224 /*
1304 * Insert new entry to the list. 1225 * Insert new entry to the list.
1305 */ 1226 */
1306 1227
1307 if(!fa_head) { 1228 if (!fa_head) {
1308 fa_head = fib_insert_node(t, &err, key, plen); 1229 fa_head = fib_insert_node(t, &err, key, plen);
1309 err = 0; 1230 err = 0;
1310 if(err) 1231 if (err)
1311 goto out_free_new_fa; 1232 goto out_free_new_fa;
1312 } 1233 }
1313 1234
1314 write_lock_bh(&fib_lock); 1235 list_add_tail_rcu(&new_fa->fa_list,
1315 1236 (fa ? &fa->fa_list : fa_head));
1316 list_add_tail(&new_fa->fa_list,
1317 (fa ? &fa->fa_list : fa_head));
1318
1319 write_unlock_bh(&fib_lock);
1320 1237
1321 rt_cache_flush(-1); 1238 rt_cache_flush(-1);
1322 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1239 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
@@ -1327,38 +1244,40 @@ out_free_new_fa:
1327 kmem_cache_free(fn_alias_kmem, new_fa); 1244 kmem_cache_free(fn_alias_kmem, new_fa);
1328out: 1245out:
1329 fib_release_info(fi); 1246 fib_release_info(fi);
1330err:; 1247err:
1331 return err; 1248 return err;
1332} 1249}
1333 1250
1334static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, 1251
1335 struct fib_result *res, int *err) 1252/* should be called with rcu_read_lock held */
1253static inline int check_leaf(struct trie *t, struct leaf *l,
1254 t_key key, int *plen, const struct flowi *flp,
1255 struct fib_result *res)
1336{ 1256{
1337 int i; 1257 int err, i;
1338 t_key mask; 1258 t_key mask;
1339 struct leaf_info *li; 1259 struct leaf_info *li;
1340 struct hlist_head *hhead = &l->list; 1260 struct hlist_head *hhead = &l->list;
1341 struct hlist_node *node; 1261 struct hlist_node *node;
1342
1343 hlist_for_each_entry(li, node, hhead, hlist) {
1344 1262
1263 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1345 i = li->plen; 1264 i = li->plen;
1346 mask = ntohl(inet_make_mask(i)); 1265 mask = ntohl(inet_make_mask(i));
1347 if (l->key != (key & mask)) 1266 if (l->key != (key & mask))
1348 continue; 1267 continue;
1349 1268
1350 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { 1269 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1351 *plen = i; 1270 *plen = i;
1352#ifdef CONFIG_IP_FIB_TRIE_STATS 1271#ifdef CONFIG_IP_FIB_TRIE_STATS
1353 t->stats.semantic_match_passed++; 1272 t->stats.semantic_match_passed++;
1354#endif 1273#endif
1355 return 1; 1274 return err;
1356 } 1275 }
1357#ifdef CONFIG_IP_FIB_TRIE_STATS 1276#ifdef CONFIG_IP_FIB_TRIE_STATS
1358 t->stats.semantic_match_miss++; 1277 t->stats.semantic_match_miss++;
1359#endif 1278#endif
1360 } 1279 }
1361 return 0; 1280 return 1;
1362} 1281}
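Note the changed contract above: check_leaf() now returns fib_semantic_match()'s result directly, where a value <= 0 is definitive (0 for a hit, a negative errno for a semantic failure) and 1 means "no match at this leaf, keep walking". Callers branch on the sign instead of filling an *err out-parameter, schematically:

    ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res);
    if (ret <= 0)
            goto found;     /* success (0) or hard error (< 0) */
    goto backtrace;         /* 1: continue the trie walk */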
1363 1282
1364static int 1283static int
@@ -1369,14 +1288,18 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1369 struct node *n; 1288 struct node *n;
1370 struct tnode *pn; 1289 struct tnode *pn;
1371 int pos, bits; 1290 int pos, bits;
1372 t_key key=ntohl(flp->fl4_dst); 1291 t_key key = ntohl(flp->fl4_dst);
1373 int chopped_off; 1292 int chopped_off;
1374 t_key cindex = 0; 1293 t_key cindex = 0;
1375 int current_prefix_length = KEYLENGTH; 1294 int current_prefix_length = KEYLENGTH;
1376 n = t->trie; 1295 struct tnode *cn;
1296 t_key node_prefix, key_prefix, pref_mismatch;
1297 int mp;
1377 1298
1378 read_lock(&fib_lock); 1299 rcu_read_lock();
1379 if(!n) 1300
1301 n = rcu_dereference(t->trie);
1302 if (!n)
1380 goto failed; 1303 goto failed;
1381 1304
1382#ifdef CONFIG_IP_FIB_TRIE_STATS 1305#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -1385,19 +1308,18 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1385 1308
1386 /* Just a leaf? */ 1309 /* Just a leaf? */
1387 if (IS_LEAF(n)) { 1310 if (IS_LEAF(n)) {
1388 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) ) 1311 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1389 goto found; 1312 goto found;
1390 goto failed; 1313 goto failed;
1391 } 1314 }
1392 pn = (struct tnode *) n; 1315 pn = (struct tnode *) n;
1393 chopped_off = 0; 1316 chopped_off = 0;
1394
1395 while (pn) {
1396 1317
1318 while (pn) {
1397 pos = pn->pos; 1319 pos = pn->pos;
1398 bits = pn->bits; 1320 bits = pn->bits;
1399 1321
1400 if(!chopped_off) 1322 if (!chopped_off)
1401 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); 1323 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1402 1324
1403 n = tnode_get_child(pn, cindex); 1325 n = tnode_get_child(pn, cindex);
@@ -1409,130 +1331,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1409 goto backtrace; 1331 goto backtrace;
1410 } 1332 }
1411 1333
1412 if (IS_TNODE(n)) { 1334 if (IS_LEAF(n)) {
1335 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1336 goto found;
1337 else
1338 goto backtrace;
1339 }
1340
1413#define HL_OPTIMIZE 1341#define HL_OPTIMIZE
1414#ifdef HL_OPTIMIZE 1342#ifdef HL_OPTIMIZE
1415 struct tnode *cn = (struct tnode *)n; 1343 cn = (struct tnode *)n;
1416 t_key node_prefix, key_prefix, pref_mismatch;
1417 int mp;
1418 1344
1419 /* 1345 /*
1420 * It's a tnode, and we can do some extra checks here if we 1346 * It's a tnode, and we can do some extra checks here if we
1421 * like, to avoid descending into a dead-end branch. 1347 * like, to avoid descending into a dead-end branch.
1422 * This tnode is in the parent's child array at index 1348 * This tnode is in the parent's child array at index
1423 * key[p_pos..p_pos+p_bits] but potentially with some bits 1349 * key[p_pos..p_pos+p_bits] but potentially with some bits
1424 * chopped off, so in reality the index may be just a 1350 * chopped off, so in reality the index may be just a
1425 * subprefix, padded with zero at the end. 1351 * subprefix, padded with zero at the end.
1426 * We can also take a look at any skipped bits in this 1352 * We can also take a look at any skipped bits in this
1427	 * tnode - everything up to p_pos is supposed to be ok,	1353	 * and the non-chopped bits of the index (see previous
1428	 * and the non-chopped bits of the index (see previous	1354	 * paragraph) are also guaranteed ok, but the rest is
1429 * paragraph) are also guaranteed ok, but the rest is 1355 * paragraph) are also guaranteed ok, but the rest is
1430 * considered unknown. 1356 * considered unknown.
1431 * 1357 *
1432 * The skipped bits are key[pos+bits..cn->pos]. 1358 * The skipped bits are key[pos+bits..cn->pos].
1433 */ 1359 */
1434
1435 /* If current_prefix_length < pos+bits, we are already doing
1436 * actual prefix matching, which means everything from
1437 * pos+(bits-chopped_off) onward must be zero along some
1438 * branch of this subtree - otherwise there is *no* valid
1439 * prefix present. Here we can only check the skipped
1440 * bits. Remember, since we have already indexed into the
1441	 * parent's child array, we know that the bits we chopped off	1356	 * considered unknown.
1442 * *are* zero.
1443 */
1444 1360
1445 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ 1361 /* If current_prefix_length < pos+bits, we are already doing
1446 1362 * actual prefix matching, which means everything from
1447 if (current_prefix_length < pos+bits) { 1363 * pos+(bits-chopped_off) onward must be zero along some
1448 if (tkey_extract_bits(cn->key, current_prefix_length, 1364 * branch of this subtree - otherwise there is *no* valid
1449 cn->pos - current_prefix_length) != 0 || 1365 * prefix present. Here we can only check the skipped
1450 !(cn->child[0])) 1366 * bits. Remember, since we have already indexed into the
1451	 goto backtrace;	1367	 * parent's child array, we know that the bits we chopped off
1452 } 1368 * *are* zero.
1369 */
1453 1370
1454 /* 1371 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1455 * If chopped_off=0, the index is fully validated and we
1456 * only need to look at the skipped bits for this, the new,
1457 * tnode. What we actually want to do is to find out if
1458 * these skipped bits match our key perfectly, or if we will
1459 * have to count on finding a matching prefix further down,
1460 * because if we do, we would like to have some way of
1461 * verifying the existence of such a prefix at this point.
1462 */
1463 1372
1464 /* The only thing we can do at this point is to verify that 1373 if (current_prefix_length < pos+bits) {
1465 * any such matching prefix can indeed be a prefix to our 1374 if (tkey_extract_bits(cn->key, current_prefix_length,
1466 * key, and if the bits in the node we are inspecting that 1375 cn->pos - current_prefix_length) != 0 ||
1467 * do not match our key are not ZERO, this cannot be true. 1376 !(cn->child[0]))
1468 * Thus, find out where there is a mismatch (before cn->pos) 1377 goto backtrace;
1469 * and verify that all the mismatching bits are zero in the 1378 }
1470 * new tnode's key.
1471 */
1472 1379
1473 /* Note: We aren't very concerned about the piece of the key 1380 /*
1474	 * that precedes pn->pos+pn->bits, since these have already been	1381	 * If chopped_off=0, the index is fully validated and we
1475 * checked. The bits after cn->pos aren't checked since these are 1382 * only need to look at the skipped bits for this, the new,
1476 * by definition "unknown" at this point. Thus, what we want to 1383 * tnode. What we actually want to do is to find out if
1477 * see is if we are about to enter the "prefix matching" state, 1384 * these skipped bits match our key perfectly, or if we will
1478 * and in that case verify that the skipped bits that will prevail 1385 * have to count on finding a matching prefix further down,
1479 * throughout this subtree are zero, as they have to be if we are 1386 * because if we do, we would like to have some way of
1480 * to find a matching prefix. 1387 * verifying the existence of such a prefix at this point.
1481 */ 1388 */
1482 1389
1483 node_prefix = MASK_PFX(cn->key, cn->pos); 1390 /* The only thing we can do at this point is to verify that
1484 key_prefix = MASK_PFX(key, cn->pos); 1391 * any such matching prefix can indeed be a prefix to our
1485 pref_mismatch = key_prefix^node_prefix; 1392 * key, and if the bits in the node we are inspecting that
1486 mp = 0; 1393 * do not match our key are not ZERO, this cannot be true.
1394 * Thus, find out where there is a mismatch (before cn->pos)
1395 * and verify that all the mismatching bits are zero in the
1396 * new tnode's key.
1397 */
1487 1398
1488 /* In short: If skipped bits in this node do not match the search 1399 /* Note: We aren't very concerned about the piece of the key
1489	 * key, enter the "prefix matching" state directly.	1400	 * that precedes pn->pos+pn->bits, since these have already been
1490 */ 1401 * checked. The bits after cn->pos aren't checked since these are
1491 if (pref_mismatch) { 1402 * by definition "unknown" at this point. Thus, what we want to
1492 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1403 * see is if we are about to enter the "prefix matching" state,
1493 mp++; 1404 * and in that case verify that the skipped bits that will prevail
1494 pref_mismatch = pref_mismatch <<1; 1405 * throughout this subtree are zero, as they have to be if we are
1495 } 1406 * to find a matching prefix.
1496 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); 1407 */
1497 1408
1498 if (key_prefix != 0) 1409 node_prefix = MASK_PFX(cn->key, cn->pos);
1499 goto backtrace; 1410 key_prefix = MASK_PFX(key, cn->pos);
1500 1411 pref_mismatch = key_prefix^node_prefix;
1501 if (current_prefix_length >= cn->pos) 1412 mp = 0;
1502 current_prefix_length=mp; 1413
1503 } 1414 /* In short: If skipped bits in this node do not match the search
1415	 * key, enter the "prefix matching" state directly.
1416 */
1417 if (pref_mismatch) {
1418 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1419 mp++;
1420 pref_mismatch = pref_mismatch <<1;
1421 }
1422 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1423
1424 if (key_prefix != 0)
1425 goto backtrace;
1426
1427 if (current_prefix_length >= cn->pos)
1428 current_prefix_length = mp;
1429 }
1504#endif 1430#endif
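An aside on the scan just above, as a self-contained userspace illustration: pref_mismatch is shifted left until its most significant bit is set, so mp ends up as the index, counted from the MSB, of the first bit where the key and the node's prefix disagree. The kernel only enters the loop when pref_mismatch is nonzero, so it terminates; the values below are invented for the example:

    #include <stdio.h>

    #define KEYLENGTH 32

    int main(void)
    {
            unsigned int node_prefix = 0x0a000000; /* 10.0.0.0   */
            unsigned int key_prefix  = 0x0a800000; /* 10.128.0.0 */
            unsigned int pref_mismatch = key_prefix ^ node_prefix;
            int mp = 0;

            while (!(pref_mismatch & (1u << (KEYLENGTH - 1)))) {
                    mp++;
                    pref_mismatch <<= 1;
            }
            printf("first mismatching bit: %d\n", mp); /* prints 8 */
            return 0;
    }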
1505 pn = (struct tnode *)n; /* Descend */ 1431 pn = (struct tnode *)n; /* Descend */
1506 chopped_off = 0; 1432 chopped_off = 0;
1507 continue; 1433 continue;
1508 } 1434
1509 if (IS_LEAF(n)) {
1510 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1511 goto found;
1512 }
1513backtrace: 1435backtrace:
1514 chopped_off++; 1436 chopped_off++;
1515 1437
1516	 /* As zeros don't change the child key (cindex) */	1438	 /* As zeros don't change the child key (cindex) */
1517 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { 1439 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1518 chopped_off++; 1440 chopped_off++;
1519 }
1520 1441
1521 /* Decrease current_... with bits chopped off */ 1442 /* Decrease current_... with bits chopped off */
1522 if (current_prefix_length > pn->pos + pn->bits - chopped_off) 1443 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1523 current_prefix_length = pn->pos + pn->bits - chopped_off; 1444 current_prefix_length = pn->pos + pn->bits - chopped_off;
1524 1445
1525 /* 1446 /*
1526	 * Either we do the actual chop-off accordingly or, if we have	1447	 * Either we do the actual chop-off accordingly or, if we have
1527	 * chopped off all bits in this tnode, walk up to our parent.	1448	 * chopped off all bits in this tnode, walk up to our parent.
1528 */ 1449 */
1529 1450
1530 if(chopped_off <= pn->bits) 1451 if (chopped_off <= pn->bits) {
1531 cindex &= ~(1 << (chopped_off-1)); 1452 cindex &= ~(1 << (chopped_off-1));
1532 else { 1453 } else {
1533 if( NODE_PARENT(pn) == NULL) 1454 if (NODE_PARENT(pn) == NULL)
1534 goto failed; 1455 goto failed;
1535 1456
1536 /* Get Child's index */ 1457 /* Get Child's index */
1537 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); 1458 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1538 pn = NODE_PARENT(pn); 1459 pn = NODE_PARENT(pn);
@@ -1542,15 +1463,16 @@ backtrace:
1542 t->stats.backtrack++; 1463 t->stats.backtrack++;
1543#endif 1464#endif
1544 goto backtrace; 1465 goto backtrace;
1545 } 1466 }
1546 } 1467 }
1547failed: 1468failed:
1548 ret = 1; 1469 ret = 1;
1549found: 1470found:
1550 read_unlock(&fib_lock); 1471 rcu_read_unlock();
1551 return ret; 1472 return ret;
1552} 1473}
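The converted lookup follows the canonical RCU read-side recipe: the whole walk runs between rcu_read_lock() and rcu_read_unlock(), and every shared pointer crossed on the way down is loaded through rcu_dereference() (tnode_get_child() is assumed to do so internally after this patch). Stripped to a skeleton over the surrounding fib_trie types:

    static int lookup_skeleton(struct trie *t, t_key key)
    {
            struct node *n;
            int ret = 1;            /* 1 == not found, as above */

            rcu_read_lock();
            n = rcu_dereference(t->trie);
            while (n && IS_TNODE(n)) {
                    struct tnode *tn = (struct tnode *)n;
                    int cindex = tkey_extract_bits(key, tn->pos, tn->bits);

                    n = rcu_dereference(tn->child[cindex]);
            }
            /* ... check_leaf() / backtrace as in fn_trie_lookup ... */
            rcu_read_unlock();
            return ret;
    }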
1553 1474
1475/* only called from updater side */
1554static int trie_leaf_remove(struct trie *t, t_key key) 1476static int trie_leaf_remove(struct trie *t, t_key key)
1555{ 1477{
1556 t_key cindex; 1478 t_key cindex;
@@ -1558,54 +1480,51 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1558 struct node *n = t->trie; 1480 struct node *n = t->trie;
1559 struct leaf *l; 1481 struct leaf *l;
1560 1482
1561 if(trie_debug) 1483 pr_debug("entering trie_leaf_remove(%p)\n", n);
1562 printk("entering trie_leaf_remove(%p)\n", n);
1563 1484
1564	 /* Note that in the case of skipped bits, those bits are *not* checked!	1485	 /* Note that in the case of skipped bits, those bits are *not* checked!
1565 * When we finish this, we will have NULL or a T_LEAF, and the 1486 * When we finish this, we will have NULL or a T_LEAF, and the
1566 * T_LEAF may or may not match our key. 1487 * T_LEAF may or may not match our key.
1567 */ 1488 */
1568 1489
1569 while (n != NULL && IS_TNODE(n)) { 1490 while (n != NULL && IS_TNODE(n)) {
1570 struct tnode *tn = (struct tnode *) n; 1491 struct tnode *tn = (struct tnode *) n;
1571 check_tnode(tn); 1492 check_tnode(tn);
1572 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); 1493 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1573 1494
1574 if(n && NODE_PARENT(n) != tn) { 1495 BUG_ON(n && NODE_PARENT(n) != tn);
1575 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 1496 }
1576 BUG();
1577 }
1578 }
1579 l = (struct leaf *) n; 1497 l = (struct leaf *) n;
1580 1498
1581 if(!n || !tkey_equals(l->key, key)) 1499 if (!n || !tkey_equals(l->key, key))
1582 return 0; 1500 return 0;
1583 1501
1584 /* 1502 /*
1585 * Key found. 1503 * Key found.
1586 * Remove the leaf and rebalance the tree 1504 * Remove the leaf and rebalance the tree
1587 */ 1505 */
1588 1506
1589 t->revision++; 1507 t->revision++;
1590 t->size--; 1508 t->size--;
1591 1509
1510 preempt_disable();
1592 tp = NODE_PARENT(n); 1511 tp = NODE_PARENT(n);
1593 tnode_free((struct tnode *) n); 1512 tnode_free((struct tnode *) n);
1594 1513
1595 if(tp) { 1514 if (tp) {
1596 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1515 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1597 put_child(t, (struct tnode *)tp, cindex, NULL); 1516 put_child(t, (struct tnode *)tp, cindex, NULL);
1598 t->trie = trie_rebalance(t, tp); 1517 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1599 } 1518 } else
1600 else 1519 rcu_assign_pointer(t->trie, NULL);
1601 t->trie = NULL; 1520 preempt_enable();
1602 1521
1603 return 1; 1522 return 1;
1604} 1523}
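On the update side, rcu_assign_pointer() is what makes this safe: it orders all the stores that initialise the rebalanced subtree before the store to t->trie, so a concurrent rcu_dereference(t->trie) can never observe a half-built root. In miniature (hypothetical helper name):

    static void trie_publish_root(struct trie *t, struct node *new_root)
    {
            /* everything reachable from new_root is initialised first */
            rcu_assign_pointer(t->trie, new_root);
    }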
1605 1524
1606static int 1525static int
1607fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, 1526fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1608 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) 1527 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1609{ 1528{
1610 struct trie *t = (struct trie *) tb->tb_data; 1529 struct trie *t = (struct trie *) tb->tb_data;
1611 u32 key, mask; 1530 u32 key, mask;
@@ -1614,24 +1533,26 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1614 struct fib_alias *fa, *fa_to_delete; 1533 struct fib_alias *fa, *fa_to_delete;
1615 struct list_head *fa_head; 1534 struct list_head *fa_head;
1616 struct leaf *l; 1535 struct leaf *l;
1536 struct leaf_info *li;
1537
1617 1538
1618 if (plen > 32) 1539 if (plen > 32)
1619 return -EINVAL; 1540 return -EINVAL;
1620 1541
1621 key = 0; 1542 key = 0;
1622 if (rta->rta_dst) 1543 if (rta->rta_dst)
1623 memcpy(&key, rta->rta_dst, 4); 1544 memcpy(&key, rta->rta_dst, 4);
1624 1545
1625 key = ntohl(key); 1546 key = ntohl(key);
1626 mask = ntohl( inet_make_mask(plen) ); 1547 mask = ntohl(inet_make_mask(plen));
1627 1548
1628 if(key & ~mask) 1549 if (key & ~mask)
1629 return -EINVAL; 1550 return -EINVAL;
1630 1551
1631 key = key & mask; 1552 key = key & mask;
1632 l = fib_find_node(t, key); 1553 l = fib_find_node(t, key);
1633 1554
1634 if(!l) 1555 if (!l)
1635 return -ESRCH; 1556 return -ESRCH;
1636 1557
1637 fa_head = get_fa_head(l, plen); 1558 fa_head = get_fa_head(l, plen);
@@ -1640,11 +1561,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1640 if (!fa) 1561 if (!fa)
1641 return -ESRCH; 1562 return -ESRCH;
1642 1563
1643 if (trie_debug) 1564 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1644 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1645 1565
1646 fa_to_delete = NULL; 1566 fa_to_delete = NULL;
1647 fa_head = fa->fa_list.prev; 1567 fa_head = fa->fa_list.prev;
1568
1648 list_for_each_entry(fa, fa_head, fa_list) { 1569 list_for_each_entry(fa, fa_head, fa_list) {
1649 struct fib_info *fi = fa->fa_info; 1570 struct fib_info *fi = fa->fa_info;
1650 1571
@@ -1663,39 +1584,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1663 } 1584 }
1664 } 1585 }
1665 1586
1666 if (fa_to_delete) { 1587 if (!fa_to_delete)
1667 int kill_li = 0; 1588 return -ESRCH;
1668 struct leaf_info *li;
1669
1670 fa = fa_to_delete;
1671 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1672 1589
1673 l = fib_find_node(t, key); 1590 fa = fa_to_delete;
1674 li = find_leaf_info(&l->list, plen); 1591 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1675 1592
1676 write_lock_bh(&fib_lock); 1593 l = fib_find_node(t, key);
1594 li = find_leaf_info(&l->list, plen);
1677 1595
1678 list_del(&fa->fa_list); 1596 list_del_rcu(&fa->fa_list);
1679 1597
1680 if(list_empty(fa_head)) { 1598 if (list_empty(fa_head)) {
1681 hlist_del(&li->hlist); 1599 hlist_del_rcu(&li->hlist);
1682 kill_li = 1; 1600 free_leaf_info(li);
1683 } 1601 }
1684 write_unlock_bh(&fib_lock);
1685
1686 if(kill_li)
1687 free_leaf_info(li);
1688 1602
1689 if(hlist_empty(&l->list)) 1603 if (hlist_empty(&l->list))
1690 trie_leaf_remove(t, key); 1604 trie_leaf_remove(t, key);
1691 1605
1692 if (fa->fa_state & FA_S_ACCESSED) 1606 if (fa->fa_state & FA_S_ACCESSED)
1693 rt_cache_flush(-1); 1607 rt_cache_flush(-1);
1694 1608
1695 fn_free_alias(fa); 1609 fib_release_info(fa->fa_info);
1696 return 0; 1610 alias_free_mem_rcu(fa);
1697 } 1611 return 0;
1698 return -ESRCH;
1699} 1612}
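Deletion uses the same two-phase discipline as the replace path: list_del_rcu()/hlist_del_rcu() unlink the entry while leaving its forward pointers intact for readers still traversing it, and alias_free_mem_rcu(), a call_rcu()-based free, reclaims the memory only after a grace period. With the hypothetical struct item from the earlier sketch:

    static void item_delete(struct item *it)
    {
            list_del_rcu(&it->list);           /* readers in flight stay safe */
            call_rcu(&it->rcu, item_free_rcu); /* free after a grace period */
    }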
1700 1613
1701static int trie_flush_list(struct trie *t, struct list_head *head) 1614static int trie_flush_list(struct trie *t, struct list_head *head)
@@ -1705,14 +1618,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
1705 1618
1706 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1619 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1707 struct fib_info *fi = fa->fa_info; 1620 struct fib_info *fi = fa->fa_info;
1708
1709 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1710
1711 write_lock_bh(&fib_lock);
1712 list_del(&fa->fa_list);
1713 write_unlock_bh(&fib_lock);
1714 1621
1715 fn_free_alias(fa); 1622 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1623 list_del_rcu(&fa->fa_list);
1624 fib_release_info(fa->fa_info);
1625 alias_free_mem_rcu(fa);
1716 found++; 1626 found++;
1717 } 1627 }
1718 } 1628 }
@@ -1727,71 +1637,71 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
1727 struct leaf_info *li = NULL; 1637 struct leaf_info *li = NULL;
1728 1638
1729 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { 1639 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1730
1731 found += trie_flush_list(t, &li->falh); 1640 found += trie_flush_list(t, &li->falh);
1732 1641
1733 if (list_empty(&li->falh)) { 1642 if (list_empty(&li->falh)) {
1734 1643 hlist_del_rcu(&li->hlist);
1735 write_lock_bh(&fib_lock);
1736 hlist_del(&li->hlist);
1737 write_unlock_bh(&fib_lock);
1738
1739 free_leaf_info(li); 1644 free_leaf_info(li);
1740 } 1645 }
1741 } 1646 }
1742 return found; 1647 return found;
1743} 1648}
1744 1649
1650/* rcu_read_lock needs to be held by the caller from the read side */
1651
1745static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) 1652static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1746{ 1653{
1747 struct node *c = (struct node *) thisleaf; 1654 struct node *c = (struct node *) thisleaf;
1748 struct tnode *p; 1655 struct tnode *p;
1749 int idx; 1656 int idx;
1657 struct node *trie = rcu_dereference(t->trie);
1750 1658
1751 if(c == NULL) { 1659 if (c == NULL) {
1752 if(t->trie == NULL) 1660 if (trie == NULL)
1753 return NULL; 1661 return NULL;
1754 1662
1755 if (IS_LEAF(t->trie)) /* trie w. just a leaf */ 1663 if (IS_LEAF(trie)) /* trie w. just a leaf */
1756 return (struct leaf *) t->trie; 1664 return (struct leaf *) trie;
1757 1665
1758 p = (struct tnode*) t->trie; /* Start */ 1666 p = (struct tnode*) trie; /* Start */
1759 } 1667 } else
1760 else
1761 p = (struct tnode *) NODE_PARENT(c); 1668 p = (struct tnode *) NODE_PARENT(c);
1669
1762 while (p) { 1670 while (p) {
1763 int pos, last; 1671 int pos, last;
1764 1672
1765 /* Find the next child of the parent */ 1673 /* Find the next child of the parent */
1766 if(c) 1674 if (c)
1767 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); 1675 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1768 else 1676 else
1769 pos = 0; 1677 pos = 0;
1770 1678
1771 last = 1 << p->bits; 1679 last = 1 << p->bits;
1772 for(idx = pos; idx < last ; idx++) { 1680 for (idx = pos; idx < last ; idx++) {
1773 if( p->child[idx]) { 1681 c = rcu_dereference(p->child[idx]);
1774 1682
1775	 /* Descend if tnode */	1683	 if (!c)
1776 1684 continue;
1777 while (IS_TNODE(p->child[idx])) { 1685
1778	 p = (struct tnode*) p->child[idx];	1686	 /* Descend if tnode */
1779 idx = 0; 1687 while (IS_TNODE(c)) {
1780 1688 p = (struct tnode *) c;
1781 /* Rightmost non-NULL branch */ 1689 idx = 0;
1782 if( p && IS_TNODE(p) ) 1690
1783 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++; 1691 /* Rightmost non-NULL branch */
1784 1692 if (p && IS_TNODE(p))
1785 /* Done with this tnode? */ 1693 while (!(c = rcu_dereference(p->child[idx]))
1786 if( idx >= (1 << p->bits) || p->child[idx] == NULL ) 1694 && idx < (1<<p->bits)) idx++;
1787 goto up; 1695
1788 } 1696 /* Done with this tnode? */
1789 return (struct leaf*) p->child[idx]; 1697 if (idx >= (1 << p->bits) || !c)
1698 goto up;
1790 } 1699 }
1700 return (struct leaf *) c;
1791 } 1701 }
1792up: 1702up:
1793 /* No more children go up one step */ 1703 /* No more children go up one step */
1794 c = (struct node*) p; 1704 c = (struct node *) p;
1795 p = (struct tnode *) NODE_PARENT(p); 1705 p = (struct tnode *) NODE_PARENT(p);
1796 } 1706 }
1797 return NULL; /* Ready. Root of trie */ 1707 return NULL; /* Ready. Root of trie */
@@ -1805,23 +1715,24 @@ static int fn_trie_flush(struct fib_table *tb)
1805 1715
1806 t->revision++; 1716 t->revision++;
1807 1717
1808 for (h=0; (l = nextleaf(t, l)) != NULL; h++) { 1718 rcu_read_lock();
1719 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1809 found += trie_flush_leaf(t, l); 1720 found += trie_flush_leaf(t, l);
1810 1721
1811 if (ll && hlist_empty(&ll->list)) 1722 if (ll && hlist_empty(&ll->list))
1812 trie_leaf_remove(t, ll->key); 1723 trie_leaf_remove(t, ll->key);
1813 ll = l; 1724 ll = l;
1814 } 1725 }
1726 rcu_read_unlock();
1815 1727
1816 if (ll && hlist_empty(&ll->list)) 1728 if (ll && hlist_empty(&ll->list))
1817 trie_leaf_remove(t, ll->key); 1729 trie_leaf_remove(t, ll->key);
1818 1730
1819 if(trie_debug) 1731 pr_debug("trie_flush found=%d\n", found);
1820 printk("trie_flush found=%d\n", found);
1821 return found; 1732 return found;
1822} 1733}
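fn_trie_flush() prunes lazily, one leaf behind the iterator: removing the current leaf would break the parent walk that nextleaf() relies on, so each pass deletes the previous leaf once the cursor has moved past it, with one final check after the loop. The idiom in outline, flush() standing in for trie_flush_leaf():

    l = ll = NULL;
    while ((l = nextleaf(t, l)) != NULL) {
            found += flush(l);
            if (ll && hlist_empty(&ll->list))
                    trie_leaf_remove(t, ll->key);   /* cursor has moved on */
            ll = l;
    }
    if (ll && hlist_empty(&ll->list))
            trie_leaf_remove(t, ll->key);           /* the last leaf */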
1823 1734
1824static int trie_last_dflt=-1; 1735static int trie_last_dflt = -1;
1825 1736
1826static void 1737static void
1827fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 1738fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
@@ -1838,33 +1749,33 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1838 last_resort = NULL; 1749 last_resort = NULL;
1839 order = -1; 1750 order = -1;
1840 1751
1841 read_lock(&fib_lock); 1752 rcu_read_lock();
1842 1753
1843 l = fib_find_node(t, 0); 1754 l = fib_find_node(t, 0);
1844 if(!l) 1755 if (!l)
1845 goto out; 1756 goto out;
1846 1757
1847 fa_head = get_fa_head(l, 0); 1758 fa_head = get_fa_head(l, 0);
1848 if(!fa_head) 1759 if (!fa_head)
1849 goto out; 1760 goto out;
1850 1761
1851 if (list_empty(fa_head)) 1762 if (list_empty(fa_head))
1852 goto out; 1763 goto out;
1853 1764
1854 list_for_each_entry(fa, fa_head, fa_list) { 1765 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1855 struct fib_info *next_fi = fa->fa_info; 1766 struct fib_info *next_fi = fa->fa_info;
1856 1767
1857 if (fa->fa_scope != res->scope || 1768 if (fa->fa_scope != res->scope ||
1858 fa->fa_type != RTN_UNICAST) 1769 fa->fa_type != RTN_UNICAST)
1859 continue; 1770 continue;
1860 1771
1861 if (next_fi->fib_priority > res->fi->fib_priority) 1772 if (next_fi->fib_priority > res->fi->fib_priority)
1862 break; 1773 break;
1863 if (!next_fi->fib_nh[0].nh_gw || 1774 if (!next_fi->fib_nh[0].nh_gw ||
1864 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1775 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1865 continue; 1776 continue;
1866 fa->fa_state |= FA_S_ACCESSED; 1777 fa->fa_state |= FA_S_ACCESSED;
1867 1778
1868 if (fi == NULL) { 1779 if (fi == NULL) {
1869 if (next_fi != res->fi) 1780 if (next_fi != res->fi)
1870 break; 1781 break;
@@ -1902,21 +1813,23 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1902 } 1813 }
1903 trie_last_dflt = last_idx; 1814 trie_last_dflt = last_idx;
1904 out:; 1815 out:;
1905 read_unlock(&fib_lock); 1816 rcu_read_unlock();
1906} 1817}
1907 1818
1908static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 1819static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1909 struct sk_buff *skb, struct netlink_callback *cb) 1820 struct sk_buff *skb, struct netlink_callback *cb)
1910{ 1821{
1911 int i, s_i; 1822 int i, s_i;
1912 struct fib_alias *fa; 1823 struct fib_alias *fa;
1913 1824
1914 u32 xkey=htonl(key); 1825 u32 xkey = htonl(key);
1915 1826
1916 s_i=cb->args[3]; 1827 s_i = cb->args[3];
1917 i = 0; 1828 i = 0;
1918 1829
1919	 list_for_each_entry(fa, fah, fa_list) {	1830	 /* rcu_read_lock is held by the caller */
1831
1832 list_for_each_entry_rcu(fa, fah, fa_list) {
1920 if (i < s_i) { 1833 if (i < s_i) {
1921 i++; 1834 i++;
1922 continue; 1835 continue;
@@ -1944,23 +1857,23 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
1944 fa->fa_info, 0) < 0) { 1857 fa->fa_info, 0) < 0) {
1945 cb->args[3] = i; 1858 cb->args[3] = i;
1946 return -1; 1859 return -1;
1947 } 1860 }
1948 i++; 1861 i++;
1949 } 1862 }
1950 cb->args[3]=i; 1863 cb->args[3] = i;
1951 return skb->len; 1864 return skb->len;
1952} 1865}
1953 1866
1954static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, 1867static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1955 struct netlink_callback *cb) 1868 struct netlink_callback *cb)
1956{ 1869{
1957 int h, s_h; 1870 int h, s_h;
1958 struct list_head *fa_head; 1871 struct list_head *fa_head;
1959 struct leaf *l = NULL; 1872 struct leaf *l = NULL;
1960 s_h=cb->args[2];
1961 1873
1962 for (h=0; (l = nextleaf(t, l)) != NULL; h++) { 1874 s_h = cb->args[2];
1963 1875
1876 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1964 if (h < s_h) 1877 if (h < s_h)
1965 continue; 1878 continue;
1966 if (h > s_h) 1879 if (h > s_h)
@@ -1968,19 +1881,19 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
1968 sizeof(cb->args) - 3*sizeof(cb->args[0])); 1881 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1969 1882
1970 fa_head = get_fa_head(l, plen); 1883 fa_head = get_fa_head(l, plen);
1971 1884
1972 if(!fa_head) 1885 if (!fa_head)
1973 continue; 1886 continue;
1974 1887
1975 if(list_empty(fa_head)) 1888 if (list_empty(fa_head))
1976 continue; 1889 continue;
1977 1890
1978 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { 1891 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1979 cb->args[2]=h; 1892 cb->args[2] = h;
1980 return -1; 1893 return -1;
1981 } 1894 }
1982 } 1895 }
1983 cb->args[2]=h; 1896 cb->args[2] = h;
1984 return skb->len; 1897 return skb->len;
1985} 1898}
1986 1899
@@ -1991,25 +1904,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
1991 1904
1992 s_m = cb->args[1]; 1905 s_m = cb->args[1];
1993 1906
1994 read_lock(&fib_lock); 1907 rcu_read_lock();
1995 for (m=0; m<=32; m++) { 1908 for (m = 0; m <= 32; m++) {
1996
1997 if (m < s_m) 1909 if (m < s_m)
1998 continue; 1910 continue;
1999 if (m > s_m) 1911 if (m > s_m)
2000 memset(&cb->args[2], 0, 1912 memset(&cb->args[2], 0,
2001 sizeof(cb->args) - 2*sizeof(cb->args[0])); 1913 sizeof(cb->args) - 2*sizeof(cb->args[0]));
2002 1914
2003 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { 1915 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2004 cb->args[1] = m; 1916 cb->args[1] = m;
2005 goto out; 1917 goto out;
2006 } 1918 }
2007 } 1919 }
2008 read_unlock(&fib_lock); 1920 rcu_read_unlock();
2009 cb->args[1] = m; 1921 cb->args[1] = m;
2010 return skb->len; 1922 return skb->len;
2011 out: 1923out:
2012 read_unlock(&fib_lock); 1924 rcu_read_unlock();
2013 return -1; 1925 return -1;
2014} 1926}
2015 1927
@@ -2048,10 +1960,10 @@ struct fib_table * __init fib_hash_init(int id)
2048 1960
2049 trie_init(t); 1961 trie_init(t);
2050 1962
2051 if (id == RT_TABLE_LOCAL) 1963 if (id == RT_TABLE_LOCAL)
2052 trie_local=t; 1964 trie_local = t;
2053 else if (id == RT_TABLE_MAIN) 1965 else if (id == RT_TABLE_MAIN)
2054 trie_main=t; 1966 trie_main = t;
2055 1967
2056 if (id == RT_TABLE_LOCAL) 1968 if (id == RT_TABLE_LOCAL)
2057 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); 1969 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
@@ -2063,7 +1975,8 @@ struct fib_table * __init fib_hash_init(int id)
2063 1975
2064static void putspace_seq(struct seq_file *seq, int n) 1976static void putspace_seq(struct seq_file *seq, int n)
2065{ 1977{
2066 while (n--) seq_printf(seq, " "); 1978 while (n--)
1979 seq_printf(seq, " ");
2067} 1980}
2068 1981
2069static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) 1982static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
@@ -2072,7 +1985,7 @@ static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2072 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0"); 1985 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2073} 1986}
2074 1987
2075static void printnode_seq(struct seq_file *seq, int indent, struct node *n, 1988static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2076 int pend, int cindex, int bits) 1989 int pend, int cindex, int bits)
2077{ 1990{
2078 putspace_seq(seq, indent); 1991 putspace_seq(seq, indent);
@@ -2084,49 +1997,41 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2084 seq_printf(seq, "%d/", cindex); 1997 seq_printf(seq, "%d/", cindex);
2085 printbin_seq(seq, cindex, bits); 1998 printbin_seq(seq, cindex, bits);
2086 seq_printf(seq, ": "); 1999 seq_printf(seq, ": ");
2087 } 2000 } else
2088 else
2089 seq_printf(seq, "<root>: "); 2001 seq_printf(seq, "<root>: ");
2090 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); 2002 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2091 2003
2092 if (IS_LEAF(n))
2093 seq_printf(seq, "key=%d.%d.%d.%d\n",
2094 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2095 else {
2096 int plen=((struct tnode *)n)->pos;
2097 t_key prf=MASK_PFX(n->key, plen);
2098 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2099 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2100 }
2101 if (IS_LEAF(n)) { 2004 if (IS_LEAF(n)) {
2102 struct leaf *l=(struct leaf *)n; 2005 struct leaf *l = (struct leaf *)n;
2103 struct fib_alias *fa; 2006 struct fib_alias *fa;
2104 int i; 2007 int i;
2105 for (i=32; i>=0; i--) 2008
2106 if(find_leaf_info(&l->list, i)) { 2009 seq_printf(seq, "key=%d.%d.%d.%d\n",
2107 2010 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2011
2012 for (i = 32; i >= 0; i--)
2013 if (find_leaf_info(&l->list, i)) {
2108 struct list_head *fa_head = get_fa_head(l, i); 2014 struct list_head *fa_head = get_fa_head(l, i);
2109 2015
2110 if(!fa_head) 2016 if (!fa_head)
2111 continue; 2017 continue;
2112 2018
2113 if(list_empty(fa_head)) 2019 if (list_empty(fa_head))
2114 continue; 2020 continue;
2115 2021
2116 putspace_seq(seq, indent+2); 2022 putspace_seq(seq, indent+2);
2117 seq_printf(seq, "{/%d...dumping}\n", i); 2023 seq_printf(seq, "{/%d...dumping}\n", i);
2118 2024
2119 2025 list_for_each_entry_rcu(fa, fa_head, fa_list) {
2120 list_for_each_entry(fa, fa_head, fa_list) {
2121 putspace_seq(seq, indent+2); 2026 putspace_seq(seq, indent+2);
2122 if (fa->fa_info->fib_nh == NULL) {
2123 seq_printf(seq, "Error _fib_nh=NULL\n");
2124 continue;
2125 }
2126 if (fa->fa_info == NULL) { 2027 if (fa->fa_info == NULL) {
2127 seq_printf(seq, "Error fa_info=NULL\n"); 2028 seq_printf(seq, "Error fa_info=NULL\n");
2128 continue; 2029 continue;
2129 } 2030 }
2031 if (fa->fa_info->fib_nh == NULL) {
2032 seq_printf(seq, "Error _fib_nh=NULL\n");
2033 continue;
2034 }
2130 2035
2131 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", 2036 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2132 fa->fa_type, 2037 fa->fa_type,
@@ -2134,11 +2039,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2134 fa->fa_tos); 2039 fa->fa_tos);
2135 } 2040 }
2136 } 2041 }
2137 } 2042 } else {
2138 else if (IS_TNODE(n)) { 2043 struct tnode *tn = (struct tnode *)n;
2139 struct tnode *tn=(struct tnode *)n; 2044 int plen = ((struct tnode *)n)->pos;
2045 t_key prf = MASK_PFX(n->key, plen);
2046
2047 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2048 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2049
2140 putspace_seq(seq, indent); seq_printf(seq, "| "); 2050 putspace_seq(seq, indent); seq_printf(seq, "| ");
2141 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); 2051 seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
2142 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); 2052 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2143 seq_printf(seq, "}\n"); 2053 seq_printf(seq, "}\n");
2144 putspace_seq(seq, indent); seq_printf(seq, "| "); 2054 putspace_seq(seq, indent); seq_printf(seq, "| ");
@@ -2152,194 +2062,196 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2152 2062
2153static void trie_dump_seq(struct seq_file *seq, struct trie *t) 2063static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2154{ 2064{
2155 struct node *n=t->trie; 2065 struct node *n;
2156 int cindex=0; 2066 int cindex = 0;
2157 int indent=1; 2067 int indent = 1;
2158 int pend=0; 2068 int pend = 0;
2159 int depth = 0; 2069 int depth = 0;
2070 struct tnode *tn;
2160 2071
2161 read_lock(&fib_lock); 2072 rcu_read_lock();
2162 2073 n = rcu_dereference(t->trie);
2163 seq_printf(seq, "------ trie_dump of t=%p ------\n", t); 2074 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2164 if (n) {
2165 printnode_seq(seq, indent, n, pend, cindex, 0);
2166 if (IS_TNODE(n)) {
2167 struct tnode *tn=(struct tnode *)n;
2168 pend = tn->pos+tn->bits;
2169 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2170 indent += 3;
2171 depth++;
2172
2173 while (tn && cindex < (1 << tn->bits)) {
2174 if (tn->child[cindex]) {
2175
2176 /* Got a child */
2177
2178 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2179 if (IS_LEAF(tn->child[cindex])) {
2180 cindex++;
2181
2182 }
2183 else {
2184 /*
2185	 * New tnode. Descend one level
2186 */
2187
2188 depth++;
2189 n=tn->child[cindex];
2190 tn=(struct tnode *)n;
2191 pend=tn->pos+tn->bits;
2192 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2193 indent+=3;
2194 cindex=0;
2195 }
2196 }
2197 else
2198 cindex++;
2199 2075
2076 if (!n) {
2077 seq_printf(seq, "------ trie is empty\n");
2078
2079 rcu_read_unlock();
2080 return;
2081 }
2082
2083 printnode_seq(seq, indent, n, pend, cindex, 0);
2084
2085 if (!IS_TNODE(n)) {
2086 rcu_read_unlock();
2087 return;
2088 }
2089
2090 tn = (struct tnode *)n;
2091 pend = tn->pos+tn->bits;
2092 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2093 indent += 3;
2094 depth++;
2095
2096 while (tn && cindex < (1 << tn->bits)) {
2097 struct node *child = rcu_dereference(tn->child[cindex]);
2098 if (!child)
2099 cindex++;
2100 else {
2101 /* Got a child */
2102 printnode_seq(seq, indent, child, pend,
2103 cindex, tn->bits);
2104
2105 if (IS_LEAF(child))
2106 cindex++;
2107
2108 else {
2200 /* 2109 /*
2201	 * Test if we are done	2110	 * New tnode. Descend one level
2202 */ 2111 */
2203
2204 while (cindex >= (1 << tn->bits)) {
2205 2112
2206 /* 2113 depth++;
2207 * Move upwards and test for root 2114 n = child;
2208 * pop off all traversed nodes 2115 tn = (struct tnode *)n;
2209 */ 2116 pend = tn->pos+tn->bits;
2210 2117 putspace_seq(seq, indent);
2211 if (NODE_PARENT(tn) == NULL) { 2118 seq_printf(seq, "\\--\n");
2212 tn = NULL; 2119 indent += 3;
2213 n = NULL; 2120 cindex = 0;
2214 break;
2215 }
2216 else {
2217 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2218 tn = NODE_PARENT(tn);
2219 cindex++;
2220 n=(struct node *)tn;
2221 pend=tn->pos+tn->bits;
2222 indent-=3;
2223 depth--;
2224 }
2225 }
2226 } 2121 }
2227 } 2122 }
2228 else n = NULL;
2229 }
2230 else seq_printf(seq, "------ trie is empty\n");
2231 2123
2232 read_unlock(&fib_lock); 2124 /*
2125 * Test if we are done
2126 */
2127
2128 while (cindex >= (1 << tn->bits)) {
2129 /*
2130 * Move upwards and test for root
2131 * pop off all traversed nodes
2132 */
2133
2134 if (NODE_PARENT(tn) == NULL) {
2135 tn = NULL;
2136 break;
2137 }
2138
2139 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2140 cindex++;
2141 tn = NODE_PARENT(tn);
2142 pend = tn->pos + tn->bits;
2143 indent -= 3;
2144 depth--;
2145 }
2146 }
2147 rcu_read_unlock();
2233} 2148}
2234 2149
2235static struct trie_stat *trie_stat_new(void) 2150static struct trie_stat *trie_stat_new(void)
2236{ 2151{
2237 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); 2152 struct trie_stat *s;
2238 int i; 2153 int i;
2239 2154
2240 if(s) { 2155 s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2241 s->totdepth = 0; 2156 if (!s)
2242 s->maxdepth = 0; 2157 return NULL;
2243 s->tnodes = 0; 2158
2244 s->leaves = 0; 2159 s->totdepth = 0;
2245 s->nullpointers = 0; 2160 s->maxdepth = 0;
2246 2161 s->tnodes = 0;
2247 for(i=0; i< MAX_CHILDS; i++) 2162 s->leaves = 0;
2248 s->nodesizes[i] = 0; 2163 s->nullpointers = 0;
2249 } 2164
2165 for (i = 0; i < MAX_CHILDS; i++)
2166 s->nodesizes[i] = 0;
2167
2250 return s; 2168 return s;
2251} 2169}
2252 2170
2253static struct trie_stat *trie_collect_stats(struct trie *t) 2171static struct trie_stat *trie_collect_stats(struct trie *t)
2254{ 2172{
2255 struct node *n=t->trie; 2173 struct node *n;
2256 struct trie_stat *s = trie_stat_new(); 2174 struct trie_stat *s = trie_stat_new();
2257 int cindex = 0; 2175 int cindex = 0;
2258 int indent = 1;
2259 int pend = 0; 2176 int pend = 0;
2260 int depth = 0; 2177 int depth = 0;
2261 2178
2262 read_lock(&fib_lock); 2179 if (!s)
2180 return NULL;
2263 2181
2264 if (s) { 2182 rcu_read_lock();
2265 if (n) { 2183 n = rcu_dereference(t->trie);
2266 if (IS_TNODE(n)) {
2267 struct tnode *tn = (struct tnode *)n;
2268 pend=tn->pos+tn->bits;
2269 indent += 3;
2270 s->nodesizes[tn->bits]++;
2271 depth++;
2272 2184
2273 while (tn && cindex < (1 << tn->bits)) { 2185 if (!n)
2274 if (tn->child[cindex]) { 2186 return s;
2275 /* Got a child */ 2187
2276 2188 if (IS_TNODE(n)) {
2277 if (IS_LEAF(tn->child[cindex])) { 2189 struct tnode *tn = (struct tnode *)n;
2278 cindex++; 2190 pend = tn->pos+tn->bits;
2279 2191 s->nodesizes[tn->bits]++;
2280 /* stats */ 2192 depth++;
2281 if (depth > s->maxdepth)
2282 s->maxdepth = depth;
2283 s->totdepth += depth;
2284 s->leaves++;
2285 }
2286
2287 else {
2288 /*
2289	 * New tnode. Descend one level
2290 */
2291
2292 s->tnodes++;
2293 s->nodesizes[tn->bits]++;
2294 depth++;
2295
2296 n = tn->child[cindex];
2297 tn = (struct tnode *)n;
2298 pend = tn->pos+tn->bits;
2299
2300 indent += 3;
2301 cindex = 0;
2302 }
2303 }
2304 else {
2305 cindex++;
2306 s->nullpointers++;
2307 }
2308 2193
2194 while (tn && cindex < (1 << tn->bits)) {
2195 struct node *ch = rcu_dereference(tn->child[cindex]);
2196 if (ch) {
2197
2198 /* Got a child */
2199
2200 if (IS_LEAF(tn->child[cindex])) {
2201 cindex++;
2202
2203 /* stats */
2204 if (depth > s->maxdepth)
2205 s->maxdepth = depth;
2206 s->totdepth += depth;
2207 s->leaves++;
2208 } else {
2309 /* 2209 /*
2310	 * Test if we are done	2210	 * New tnode. Descend one level
2311 */ 2211 */
2312 2212
2313 while (cindex >= (1 << tn->bits)) { 2213 s->tnodes++;
2314 2214 s->nodesizes[tn->bits]++;
2315 /* 2215 depth++;
2316 * Move upwards and test for root 2216
2317 * pop off all traversed nodes 2217 n = ch;
2318 */ 2218 tn = (struct tnode *)n;
2319 2219 pend = tn->pos+tn->bits;
2320 2220
2321 if (NODE_PARENT(tn) == NULL) { 2221 cindex = 0;
2322 tn = NULL;
2323 n = NULL;
2324 break;
2325 }
2326 else {
2327 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2328 tn = NODE_PARENT(tn);
2329 cindex++;
2330 n = (struct node *)tn;
2331 pend=tn->pos+tn->bits;
2332 indent -= 3;
2333 depth--;
2334 }
2335 }
2336 } 2222 }
2223 } else {
2224 cindex++;
2225 s->nullpointers++;
2337 } 2226 }
2338 else n = NULL; 2227
2228 /*
2229 * Test if we are done
2230 */
2231
2232 while (cindex >= (1 << tn->bits)) {
2233 /*
2234 * Move upwards and test for root
2235 * pop off all traversed nodes
2236 */
2237
2238 if (NODE_PARENT(tn) == NULL) {
2239 tn = NULL;
2240 n = NULL;
2241 break;
2242 }
2243
2244 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2245 tn = NODE_PARENT(tn);
2246 cindex++;
2247 n = (struct node *)tn;
2248 pend = tn->pos+tn->bits;
2249 depth--;
2250 }
2339 } 2251 }
2340 } 2252 }
2341 2253
2342 read_unlock(&fib_lock); 2254 rcu_read_unlock();
2343 return s; 2255 return s;
2344} 2256}
2345 2257
@@ -2357,17 +2269,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2357 2269
2358static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) 2270static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2359{ 2271{
2360 void *v = NULL; 2272 if (!ip_fib_main_table)
2273 return NULL;
2361 2274
2362 if (ip_fib_main_table) 2275 if (*pos)
2363 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; 2276 return fib_triestat_get_next(seq);
2364 return v; 2277 else
2278 return SEQ_START_TOKEN;
2365} 2279}
2366 2280
2367static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2281static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2368{ 2282{
2369 ++*pos; 2283 ++*pos;
2370 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); 2284 if (v == SEQ_START_TOKEN)
2285 return fib_triestat_get_first(seq);
2286 else
2287 return fib_triestat_get_next(seq);
2371} 2288}
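Both seq_file hooks spell out the usual SEQ_START_TOKEN protocol: ->start hands back the token while *pos == 0 so that ->show can emit a header line, and ->next maps the token to the first real record as it bumps *pos. The skeleton of the idiom, with get_first()/get_next() as placeholders:

    static void *ex_start(struct seq_file *seq, loff_t *pos)
    {
            return *pos ? get_next(seq) : SEQ_START_TOKEN;
    }

    static void *ex_next(struct seq_file *seq, void *v, loff_t *pos)
    {
            ++*pos;
            return v == SEQ_START_TOKEN ? get_first(seq) : get_next(seq);
    }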
2372 2289
2373static void fib_triestat_seq_stop(struct seq_file *seq, void *v) 2290static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
@@ -2375,7 +2292,7 @@ static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2375 2292
2376} 2293}
2377 2294
2378/* 2295/*
2379 * This outputs /proc/net/fib_triestats 2296 * This outputs /proc/net/fib_triestats
2380 * 2297 *
2381 * It always works in backward compatibility mode. 2298 * It always works in backward compatibility mode.
@@ -2386,22 +2303,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2386{ 2303{
2387 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ 2304 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2388 int i, max, pointers; 2305 int i, max, pointers;
2389 struct trie_stat *stat; 2306 struct trie_stat *stat;
2390 int avdepth; 2307 int avdepth;
2391 2308
2392 stat = trie_collect_stats(t); 2309 stat = trie_collect_stats(t);
2393 2310
2394 bytes=0; 2311 bytes = 0;
2395 seq_printf(seq, "trie=%p\n", t); 2312 seq_printf(seq, "trie=%p\n", t);
2396 2313
2397 if (stat) { 2314 if (stat) {
2398 if (stat->leaves) 2315 if (stat->leaves)
2399 avdepth=stat->totdepth*100 / stat->leaves; 2316 avdepth = stat->totdepth*100 / stat->leaves;
2400 else 2317 else
2401 avdepth=0; 2318 avdepth = 0;
2402 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); 2319 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
2403 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); 2320 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2404 2321
2405 seq_printf(seq, "Leaves: %d\n", stat->leaves); 2322 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2406 bytes += sizeof(struct leaf) * stat->leaves; 2323 bytes += sizeof(struct leaf) * stat->leaves;
2407 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); 2324 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
@@ -2413,7 +2330,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2413 max--; 2330 max--;
2414 pointers = 0; 2331 pointers = 0;
2415 2332
2416 for (i = 1; i <= max; i++) 2333 for (i = 1; i <= max; i++)
2417 if (stat->nodesizes[i] != 0) { 2334 if (stat->nodesizes[i] != 0) {
2418 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); 2335 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2419 pointers += (1<<i) * stat->nodesizes[i]; 2336 pointers += (1<<i) * stat->nodesizes[i];
@@ -2444,30 +2361,28 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2444static int fib_triestat_seq_show(struct seq_file *seq, void *v) 2361static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2445{ 2362{
2446 char bf[128]; 2363 char bf[128];
2447 2364
2448 if (v == SEQ_START_TOKEN) { 2365 if (v == SEQ_START_TOKEN) {
2449 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", 2366 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2450 sizeof(struct leaf), sizeof(struct tnode)); 2367 sizeof(struct leaf), sizeof(struct tnode));
2451 if (trie_local) 2368 if (trie_local)
2452 collect_and_show(trie_local, seq); 2369 collect_and_show(trie_local, seq);
2453 2370
2454 if (trie_main) 2371 if (trie_main)
2455 collect_and_show(trie_main, seq); 2372 collect_and_show(trie_main, seq);
2456 } 2373 } else {
2457 else { 2374 snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2458 snprintf(bf, sizeof(bf), 2375
2459 "*\t%08X\t%08X", 200, 400);
2460
2461 seq_printf(seq, "%-127s\n", bf); 2376 seq_printf(seq, "%-127s\n", bf);
2462 } 2377 }
2463 return 0; 2378 return 0;
2464} 2379}
2465 2380
2466static struct seq_operations fib_triestat_seq_ops = { 2381static struct seq_operations fib_triestat_seq_ops = {
2467 .start = fib_triestat_seq_start, 2382 .start = fib_triestat_seq_start,
2468 .next = fib_triestat_seq_next, 2383 .next = fib_triestat_seq_next,
2469 .stop = fib_triestat_seq_stop, 2384 .stop = fib_triestat_seq_stop,
2470 .show = fib_triestat_seq_show, 2385 .show = fib_triestat_seq_show,
2471}; 2386};
2472 2387
2473static int fib_triestat_seq_open(struct inode *inode, struct file *file) 2388static int fib_triestat_seq_open(struct inode *inode, struct file *file)
@@ -2479,7 +2394,7 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2479 if (rc) 2394 if (rc)
2480 goto out_kfree; 2395 goto out_kfree;
2481 2396
2482 seq = file->private_data; 2397 seq = file->private_data;
2483out: 2398out:
2484 return rc; 2399 return rc;
2485out_kfree: 2400out_kfree:
@@ -2487,11 +2402,11 @@ out_kfree:
2487} 2402}
2488 2403
2489static struct file_operations fib_triestat_seq_fops = { 2404static struct file_operations fib_triestat_seq_fops = {
2490 .owner = THIS_MODULE, 2405 .owner = THIS_MODULE,
2491 .open = fib_triestat_seq_open, 2406 .open = fib_triestat_seq_open,
2492 .read = seq_read, 2407 .read = seq_read,
2493 .llseek = seq_lseek, 2408 .llseek = seq_lseek,
2494 .release = seq_release_private, 2409 .release = seq_release_private,
2495}; 2410};
2496 2411
2497int __init fib_stat_proc_init(void) 2412int __init fib_stat_proc_init(void)
@@ -2518,25 +2433,30 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2518 2433
2519static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) 2434static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2520{ 2435{
2521 void *v = NULL; 2436 if (!ip_fib_main_table)
2437 return NULL;
2522 2438
2523 if (ip_fib_main_table) 2439 if (*pos)
2524 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; 2440 return fib_trie_get_next(seq);
2525 return v; 2441 else
2442 return SEQ_START_TOKEN;
2526} 2443}
2527 2444
2528static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2445static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2529{ 2446{
2530 ++*pos; 2447 ++*pos;
2531 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq); 2448 if (v == SEQ_START_TOKEN)
2449 return fib_trie_get_first(seq);
2450 else
2451 return fib_trie_get_next(seq);
2452
2532} 2453}
2533 2454
2534static void fib_trie_seq_stop(struct seq_file *seq, void *v) 2455static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2535{ 2456{
2536
2537} 2457}
2538 2458
2539/* 2459/*
2540 * This outputs /proc/net/fib_trie. 2460 * This outputs /proc/net/fib_trie.
2541 * 2461 *
2542 * It always works in backward compatibility mode. 2462 * It always works in backward compatibility mode.
@@ -2548,14 +2468,12 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2548 char bf[128]; 2468 char bf[128];
2549 2469
2550 if (v == SEQ_START_TOKEN) { 2470 if (v == SEQ_START_TOKEN) {
2551 if (trie_local) 2471 if (trie_local)
2552 trie_dump_seq(seq, trie_local); 2472 trie_dump_seq(seq, trie_local);
2553 2473
2554 if (trie_main) 2474 if (trie_main)
2555 trie_dump_seq(seq, trie_main); 2475 trie_dump_seq(seq, trie_main);
2556 } 2476 } else {
2557
2558 else {
2559 snprintf(bf, sizeof(bf), 2477 snprintf(bf, sizeof(bf),
2560 "*\t%08X\t%08X", 200, 400); 2478 "*\t%08X\t%08X", 200, 400);
2561 seq_printf(seq, "%-127s\n", bf); 2479 seq_printf(seq, "%-127s\n", bf);
@@ -2565,10 +2483,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2565} 2483}
2566 2484
2567static struct seq_operations fib_trie_seq_ops = { 2485static struct seq_operations fib_trie_seq_ops = {
2568 .start = fib_trie_seq_start, 2486 .start = fib_trie_seq_start,
2569 .next = fib_trie_seq_next, 2487 .next = fib_trie_seq_next,
2570 .stop = fib_trie_seq_stop, 2488 .stop = fib_trie_seq_stop,
2571 .show = fib_trie_seq_show, 2489 .show = fib_trie_seq_show,
2572}; 2490};
2573 2491
2574static int fib_trie_seq_open(struct inode *inode, struct file *file) 2492static int fib_trie_seq_open(struct inode *inode, struct file *file)
@@ -2580,7 +2498,7 @@ static int fib_trie_seq_open(struct inode *inode, struct file *file)
2580 if (rc) 2498 if (rc)
2581 goto out_kfree; 2499 goto out_kfree;
2582 2500
2583 seq = file->private_data; 2501 seq = file->private_data;
2584out: 2502out:
2585 return rc; 2503 return rc;
2586out_kfree: 2504out_kfree:
@@ -2588,11 +2506,11 @@ out_kfree:
2588} 2506}
2589 2507
2590static struct file_operations fib_trie_seq_fops = { 2508static struct file_operations fib_trie_seq_fops = {
2591 .owner = THIS_MODULE, 2509 .owner = THIS_MODULE,
2592 .open = fib_trie_seq_open, 2510 .open = fib_trie_seq_open,
2593 .read = seq_read, 2511 .read = seq_read,
2594 .llseek = seq_lseek, 2512 .llseek = seq_lseek,
2595 .release = seq_release_private, 2513 .release= seq_release_private,
2596}; 2514};
2597 2515
2598int __init fib_proc_init(void) 2516int __init fib_proc_init(void)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 279f57abfecb..24eb56ae1b5a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -114,7 +114,7 @@ struct icmp_bxm {
114/* 114/*
115 * Statistics 115 * Statistics
116 */ 116 */
117DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); 117DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
118 118
119/* An array of errno for error messages from dest unreach. */ 119/* An array of errno for error messages from dest unreach. */
120/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ 120/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -349,12 +349,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
349{ 349{
350 struct sk_buff *skb; 350 struct sk_buff *skb;
351 351
352 ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, 352 if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
353 icmp_param->data_len+icmp_param->head_len, 353 icmp_param->data_len+icmp_param->head_len,
354 icmp_param->head_len, 354 icmp_param->head_len,
355 ipc, rt, MSG_DONTWAIT); 355 ipc, rt, MSG_DONTWAIT) < 0)
356 356 ip_flush_pending_frames(icmp_socket->sk);
357 if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { 357 else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
358 struct icmphdr *icmph = skb->h.icmph; 358 struct icmphdr *icmph = skb->h.icmph;
359 unsigned int csum = 0; 359 unsigned int csum = 0;
360 struct sk_buff *skb1; 360 struct sk_buff *skb1;
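The icmp_push_reply() hunk above fixes an unchecked ip_append_data(): on failure, whatever was already corked on the socket must be flushed rather than checksummed and sent. The corrected shape, as a hedged sketch where finish_and_push() is a placeholder for the checksum-and-send tail of the real function:

    if (ip_append_data(sk, icmp_glue_bits, icmp_param,
                       len, hlen, ipc, rt, MSG_DONTWAIT) < 0)
            ip_flush_pending_frames(sk);
    else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL)
            finish_and_push(skb);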
@@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb)
627 break; 627 break;
628 case ICMP_FRAG_NEEDED: 628 case ICMP_FRAG_NEEDED:
629 if (ipv4_config.no_pmtu_disc) { 629 if (ipv4_config.no_pmtu_disc) {
630 LIMIT_NETDEBUG( 630 LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
631 printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
632 "fragmentation needed " 631 "fragmentation needed "
633 "and DF set.\n", 632 "and DF set.\n",
634 NIPQUAD(iph->daddr))); 633 NIPQUAD(iph->daddr));
635 } else { 634 } else {
636 info = ip_rt_frag_needed(iph, 635 info = ip_rt_frag_needed(iph,
637 ntohs(icmph->un.frag.mtu)); 636 ntohs(icmph->un.frag.mtu));
@@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb)
640 } 639 }
641 break; 640 break;
642 case ICMP_SR_FAILED: 641 case ICMP_SR_FAILED:
643 LIMIT_NETDEBUG( 642 LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
644 printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
645 "Route Failed.\n", 643 "Route Failed.\n",
646 NIPQUAD(iph->daddr))); 644 NIPQUAD(iph->daddr));
647 break; 645 break;
648 default: 646 default:
649 break; 647 break;
@@ -936,8 +934,7 @@ int icmp_rcv(struct sk_buff *skb)
936 case CHECKSUM_HW: 934 case CHECKSUM_HW:
937 if (!(u16)csum_fold(skb->csum)) 935 if (!(u16)csum_fold(skb->csum))
938 break; 936 break;
939 NETDEBUG(if (net_ratelimit()) 937 LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
940 printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
941 case CHECKSUM_NONE: 938 case CHECKSUM_NONE:
942 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) 939 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
943 goto error; 940 goto error;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5088f90835ae..44607f4767b8 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb)
904 case IGMP_MTRACE_RESP: 904 case IGMP_MTRACE_RESP:
905 break; 905 break;
906 default: 906 default:
907 NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); 907 NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
908 } 908 }
909 in_dev_put(in_dev); 909 in_dev_put(in_dev);
910 kfree_skb(skb); 910 kfree_skb(skb);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 000000000000..fe3c6d3d0c91
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Support for INET connection oriented protocols.
7 *
8 * Authors: See the TCP sources
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or(at your option) any later version.
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/jhash.h>
19
20#include <net/inet_connection_sock.h>
21#include <net/inet_hashtables.h>
22#include <net/inet_timewait_sock.h>
23#include <net/ip.h>
24#include <net/route.h>
25#include <net/tcp_states.h>
26#include <net/xfrm.h>
27
28#ifdef INET_CSK_DEBUG
29const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
30EXPORT_SYMBOL(inet_csk_timer_bug_msg);
31#endif
32
33/*
34 * This array holds the first and last local port number.
35 * For high-usage systems, use sysctl to change this to
36 * 32768-61000
37 */
38int sysctl_local_port_range[2] = { 1024, 4999 };
39
40static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
41{
42 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
43 struct sock *sk2;
44 struct hlist_node *node;
45 int reuse = sk->sk_reuse;
46
47 sk_for_each_bound(sk2, node, &tb->owners) {
48 if (sk != sk2 &&
49 !inet_v6_ipv6only(sk2) &&
50 (!sk->sk_bound_dev_if ||
51 !sk2->sk_bound_dev_if ||
52 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
53 if (!reuse || !sk2->sk_reuse ||
54 sk2->sk_state == TCP_LISTEN) {
55 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
56 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
57 sk2_rcv_saddr == sk_rcv_saddr)
58 break;
59 }
60 }
61 }
62 return node != NULL;
63}
64
65/* Obtain a reference to a local port for the given sock,
66 * if snum is zero it means select any available local port.
67 */
68int inet_csk_get_port(struct inet_hashinfo *hashinfo,
69 struct sock *sk, unsigned short snum)
70{
71 struct inet_bind_hashbucket *head;
72 struct hlist_node *node;
73 struct inet_bind_bucket *tb;
74 int ret;
75
76 local_bh_disable();
77 if (!snum) {
78 int low = sysctl_local_port_range[0];
79 int high = sysctl_local_port_range[1];
80 int remaining = (high - low) + 1;
81 int rover;
82
83 spin_lock(&hashinfo->portalloc_lock);
84 if (hashinfo->port_rover < low)
85 rover = low;
86 else
87 rover = hashinfo->port_rover;
88 do {
89 rover++;
90 if (rover > high)
91 rover = low;
92 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
93 spin_lock(&head->lock);
94 inet_bind_bucket_for_each(tb, node, &head->chain)
95 if (tb->port == rover)
96 goto next;
97 break;
98 next:
99 spin_unlock(&head->lock);
100 } while (--remaining > 0);
101 hashinfo->port_rover = rover;
102 spin_unlock(&hashinfo->portalloc_lock);
103
104 /* Exhausted local port range during search? It is not
105 * possible for us to be holding one of the bind hash
106 * locks if this test triggers, because if 'remaining'
107 * drops to zero, we broke out of the do/while loop at
108 * the top level, not from the 'break;' statement.
109 */
110 ret = 1;
111 if (remaining <= 0)
112 goto fail;
113
114 /* OK, here is the one we will use. HEAD is
115	 * non-NULL and we hold its mutex.
116 */
117 snum = rover;
118 } else {
119 head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
120 spin_lock(&head->lock);
121 inet_bind_bucket_for_each(tb, node, &head->chain)
122 if (tb->port == snum)
123 goto tb_found;
124 }
125 tb = NULL;
126 goto tb_not_found;
127tb_found:
128 if (!hlist_empty(&tb->owners)) {
129 if (sk->sk_reuse > 1)
130 goto success;
131 if (tb->fastreuse > 0 &&
132 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
133 goto success;
134 } else {
135 ret = 1;
136 if (inet_csk_bind_conflict(sk, tb))
137 goto fail_unlock;
138 }
139 }
140tb_not_found:
141 ret = 1;
142 if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
143 goto fail_unlock;
144 if (hlist_empty(&tb->owners)) {
145 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
146 tb->fastreuse = 1;
147 else
148 tb->fastreuse = 0;
149 } else if (tb->fastreuse &&
150 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
151 tb->fastreuse = 0;
152success:
153 if (!inet_csk(sk)->icsk_bind_hash)
154 inet_bind_hash(sk, tb, snum);
155 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
156 ret = 0;
157
158fail_unlock:
159 spin_unlock(&head->lock);
160fail:
161 local_bh_enable();
162 return ret;
163}
164
165EXPORT_SYMBOL_GPL(inet_csk_get_port);
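The anonymous-bind path above rotates a "rover" through the local port range, skipping ports that already own a bind bucket, and remembers where it stopped for the next caller. The same search over a toy in-use table, in user space (hypothetical names, no locking or hash chains):

#include <stdbool.h>
#include <stdio.h>

static bool port_in_use[65536];    /* toy stand-in for the bind hash     */
static int port_rover = 1023;      /* persists like hashinfo->port_rover */

/* Return a free port in [low, high], or -1 when the range is exhausted. */
static int pick_local_port(int low, int high)
{
	int remaining = high - low + 1;
	int rover = port_rover < low ? low : port_rover;

	do {
		if (++rover > high)
			rover = low;          /* wrap around the range */
		if (!port_in_use[rover]) {
			port_rover = rover;   /* resume here next time */
			return rover;
		}
	} while (--remaining > 0);
	return -1;                            /* every port is taken   */
}

int main(void)
{
	port_in_use[1024] = port_in_use[1025] = true;
	printf("got port %d\n", pick_local_port(1024, 4999));  /* 1026 */
	return 0;
}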
166
167/*
168 * Wait for an incoming connection, avoid race conditions. This must be called
169 * with the socket locked.
170 */
171static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
172{
173 struct inet_connection_sock *icsk = inet_csk(sk);
174 DEFINE_WAIT(wait);
175 int err;
176
177 /*
178 * True wake-one mechanism for incoming connections: only
179 * one process gets woken up, not the 'whole herd'.
180 * Since we do not 'race & poll' for established sockets
181 * anymore, the common case will execute the loop only once.
182 *
183	 * Subtle issue: a waiter added with "add_wait_queue_exclusive()" goes
184 * after any current non-exclusive waiters, and we know that
185 * it will always _stay_ after any new non-exclusive waiters
186 * because all non-exclusive waiters are added at the
187 * beginning of the wait-queue. As such, it's ok to "drop"
188 * our exclusiveness temporarily when we get woken up without
189 * having to remove and re-insert us on the wait queue.
190 */
191 for (;;) {
192 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
193 TASK_INTERRUPTIBLE);
194 release_sock(sk);
195 if (reqsk_queue_empty(&icsk->icsk_accept_queue))
196 timeo = schedule_timeout(timeo);
197 lock_sock(sk);
198 err = 0;
199 if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
200 break;
201 err = -EINVAL;
202 if (sk->sk_state != TCP_LISTEN)
203 break;
204 err = sock_intr_errno(timeo);
205 if (signal_pending(current))
206 break;
207 err = -EAGAIN;
208 if (!timeo)
209 break;
210 }
211 finish_wait(sk->sk_sleep, &wait);
212 return err;
213}
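The comment above describes a wake-one discipline: exclusive waiters queue behind non-exclusive ones, so one incoming connection wakes exactly one acceptor instead of the whole herd. A loose user-space analogue with POSIX condition variables (not the kernel wait-queue API; pthread_cond_signal wakes at least one waiter, which is the same discipline in spirit):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_nonempty = PTHREAD_COND_INITIALIZER;
static int pending;                        /* toy accept-queue depth */

static void post_connection(void)
{
	pthread_mutex_lock(&q_lock);
	pending++;
	pthread_cond_signal(&q_nonempty);  /* wake one acceptor, not all */
	pthread_mutex_unlock(&q_lock);
}

static void wait_for_connection(void)
{
	pthread_mutex_lock(&q_lock);
	while (pending == 0)               /* re-check after every wakeup */
		pthread_cond_wait(&q_nonempty, &q_lock);
	pending--;
	pthread_mutex_unlock(&q_lock);
}

int main(void)
{
	post_connection();
	wait_for_connection();             /* returns at once: one pending */
	printf("accepted\n");
	return 0;
}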
214
215/*
216 * This will accept the next outstanding connection.
217 */
218struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
219{
220 struct inet_connection_sock *icsk = inet_csk(sk);
221 struct sock *newsk;
222 int error;
223
224 lock_sock(sk);
225
226 /* We need to make sure that this socket is listening,
227 * and that it has something pending.
228 */
229 error = -EINVAL;
230 if (sk->sk_state != TCP_LISTEN)
231 goto out_err;
232
233 /* Find already established connection */
234 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
235 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
236
237 /* If this is a non blocking socket don't sleep */
238 error = -EAGAIN;
239 if (!timeo)
240 goto out_err;
241
242 error = inet_csk_wait_for_connect(sk, timeo);
243 if (error)
244 goto out_err;
245 }
246
247 newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
248 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
249out:
250 release_sock(sk);
251 return newsk;
252out_err:
253 newsk = NULL;
254 *err = error;
255 goto out;
256}
257
258EXPORT_SYMBOL(inet_csk_accept);
259
260/*
261 * Using different timers for retransmit, delayed acks and probes
262	 * We may wish to use just one timer maintaining a list of expiry jiffies
263 * to optimize.
264 */
265void inet_csk_init_xmit_timers(struct sock *sk,
266 void (*retransmit_handler)(unsigned long),
267 void (*delack_handler)(unsigned long),
268 void (*keepalive_handler)(unsigned long))
269{
270 struct inet_connection_sock *icsk = inet_csk(sk);
271
272 init_timer(&icsk->icsk_retransmit_timer);
273 init_timer(&icsk->icsk_delack_timer);
274 init_timer(&sk->sk_timer);
275
276 icsk->icsk_retransmit_timer.function = retransmit_handler;
277 icsk->icsk_delack_timer.function = delack_handler;
278 sk->sk_timer.function = keepalive_handler;
279
280 icsk->icsk_retransmit_timer.data =
281 icsk->icsk_delack_timer.data =
282 sk->sk_timer.data = (unsigned long)sk;
283
284 icsk->icsk_pending = icsk->icsk_ack.pending = 0;
285}
286
287EXPORT_SYMBOL(inet_csk_init_xmit_timers);
288
289void inet_csk_clear_xmit_timers(struct sock *sk)
290{
291 struct inet_connection_sock *icsk = inet_csk(sk);
292
293 icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
294
295 sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
296 sk_stop_timer(sk, &icsk->icsk_delack_timer);
297 sk_stop_timer(sk, &sk->sk_timer);
298}
299
300EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
301
302void inet_csk_delete_keepalive_timer(struct sock *sk)
303{
304 sk_stop_timer(sk, &sk->sk_timer);
305}
306
307EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
308
309void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
310{
311 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
312}
313
314EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
315
316struct dst_entry* inet_csk_route_req(struct sock *sk,
317 const struct request_sock *req)
318{
319 struct rtable *rt;
320 const struct inet_request_sock *ireq = inet_rsk(req);
321 struct ip_options *opt = inet_rsk(req)->opt;
322 struct flowi fl = { .oif = sk->sk_bound_dev_if,
323 .nl_u = { .ip4_u =
324 { .daddr = ((opt && opt->srr) ?
325 opt->faddr :
326 ireq->rmt_addr),
327 .saddr = ireq->loc_addr,
328 .tos = RT_CONN_FLAGS(sk) } },
329 .proto = sk->sk_protocol,
330 .uli_u = { .ports =
331 { .sport = inet_sk(sk)->sport,
332 .dport = ireq->rmt_port } } };
333
334 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
335 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
336 return NULL;
337 }
338 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
339 ip_rt_put(rt);
340 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
341 return NULL;
342 }
343 return &rt->u.dst;
344}
345
346EXPORT_SYMBOL_GPL(inet_csk_route_req);
347
348static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
349 const u32 rnd, const u16 synq_hsize)
350{
351 return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
352}
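inet_synq_hash() mixes the peer address and port with a per-listener random seed, then masks with the table size; the mask only works as a modulus because nr_table_entries is a power of two. A user-space sketch with a toy mixer standing in for jhash_2words (the mixer is an assumption for illustration; any decent two-word hash fits):

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for jhash_2words(). */
static uint32_t mix2(uint32_t a, uint32_t b, uint32_t seed)
{
	uint32_t h = seed ^ a;

	h = (h ^ (h >> 16)) * 0x45d9f3b;
	h ^= b;
	h = (h ^ (h >> 16)) * 0x45d9f3b;
	return h ^ (h >> 16);
}

static uint32_t synq_hash(uint32_t raddr, uint16_t rport,
			  uint32_t rnd, uint16_t synq_hsize)
{
	/* synq_hsize must be a power of two for the mask to be a modulus */
	return mix2(raddr, rport, rnd) & (synq_hsize - 1);
}

int main(void)
{
	printf("bucket %u\n", synq_hash(0x0a000001, 40000, 0xdeadbeef, 512));
	return 0;
}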
353
354#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
355#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
356#else
357#define AF_INET_FAMILY(fam) 1
358#endif
359
360struct request_sock *inet_csk_search_req(const struct sock *sk,
361 struct request_sock ***prevp,
362 const __u16 rport, const __u32 raddr,
363 const __u32 laddr)
364{
365 const struct inet_connection_sock *icsk = inet_csk(sk);
366 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
367 struct request_sock *req, **prev;
368
369 for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
370 lopt->nr_table_entries)];
371 (req = *prev) != NULL;
372 prev = &req->dl_next) {
373 const struct inet_request_sock *ireq = inet_rsk(req);
374
375 if (ireq->rmt_port == rport &&
376 ireq->rmt_addr == raddr &&
377 ireq->loc_addr == laddr &&
378 AF_INET_FAMILY(req->rsk_ops->family)) {
379 BUG_TRAP(!req->sk);
380 *prevp = prev;
381 break;
382 }
383 }
384
385 return req;
386}
387
388EXPORT_SYMBOL_GPL(inet_csk_search_req);
389
390void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
391 const unsigned timeout)
392{
393 struct inet_connection_sock *icsk = inet_csk(sk);
394 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
395 const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
396 lopt->hash_rnd, lopt->nr_table_entries);
397
398 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
399 inet_csk_reqsk_queue_added(sk, timeout);
400}
401
402/* Only thing we need from tcp.h */
403extern int sysctl_tcp_synack_retries;
404
405EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
406
407void inet_csk_reqsk_queue_prune(struct sock *parent,
408 const unsigned long interval,
409 const unsigned long timeout,
410 const unsigned long max_rto)
411{
412 struct inet_connection_sock *icsk = inet_csk(parent);
413 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
414 struct listen_sock *lopt = queue->listen_opt;
415 int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
416 int thresh = max_retries;
417 unsigned long now = jiffies;
418 struct request_sock **reqp, *req;
419 int i, budget;
420
421 if (lopt == NULL || lopt->qlen == 0)
422 return;
423
424	/* Normally all the openreqs are young and become mature
425	 * (i.e. converted to established sockets) within the first timeout.
426	 * If the synack was not acknowledged for 3 seconds, it means
427	 * one of the following things: the synack was lost, the ack was lost,
428	 * rtt is high, or nobody planned to ack (i.e. synflood).
429	 * When the server is a bit loaded, the queue is populated with old
430	 * open requests, reducing the effective size of the queue.
431	 * When the server is well loaded, the queue size reduces to zero
432	 * after several minutes of work. That is not a synflood,
433	 * it is normal operation. The solution is to prune entries
434	 * that are too old, overriding the normal timeout, when
435	 * the situation becomes dangerous.
436	 *
437	 * Essentially, we reserve half of the room for young
438	 * embryos, and abort old ones without pity if old
439	 * ones are about to clog our table.
440 */
441 if (lopt->qlen>>(lopt->max_qlen_log-1)) {
442 int young = (lopt->qlen_young<<1);
443
444 while (thresh > 2) {
445 if (lopt->qlen < young)
446 break;
447 thresh--;
448 young <<= 1;
449 }
450 }
451
452 if (queue->rskq_defer_accept)
453 max_retries = queue->rskq_defer_accept;
454
455 budget = 2 * (lopt->nr_table_entries / (timeout / interval));
456 i = lopt->clock_hand;
457
458 do {
459 reqp=&lopt->syn_table[i];
460 while ((req = *reqp) != NULL) {
461 if (time_after_eq(now, req->expires)) {
462 if ((req->retrans < thresh ||
463 (inet_rsk(req)->acked && req->retrans < max_retries))
464 && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
465 unsigned long timeo;
466
467 if (req->retrans++ == 0)
468 lopt->qlen_young--;
469 timeo = min((timeout << req->retrans), max_rto);
470 req->expires = now + timeo;
471 reqp = &req->dl_next;
472 continue;
473 }
474
475 /* Drop this request */
476 inet_csk_reqsk_queue_unlink(parent, req, reqp);
477 reqsk_queue_removed(queue, req);
478 reqsk_free(req);
479 continue;
480 }
481 reqp = &req->dl_next;
482 }
483
484 i = (i + 1) & (lopt->nr_table_entries - 1);
485
486 } while (--budget > 0);
487
488 lopt->clock_hand = i;
489
490 if (lopt->qlen)
491 inet_csk_reset_keepalive_timer(parent, interval);
492}
493
494EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
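When the queue is more than half full, the loop above lowers the retransmit threshold one step for every doubling by which old entries outnumber young ones, so a synflood sheds stale embryos fast. That adaptation in isolation, with toy numbers (a sketch, not the kernel structures):

#include <stdio.h>

/* Mirror of the threshold loop in inet_csk_reqsk_queue_prune(): while
 * old (non-young) requests dominate a more-than-half-full queue, lower
 * the retransmit count an entry may reach before it is dropped. */
static int prune_thresh(int qlen, int max_qlen_log, int qlen_young,
			int max_retries)
{
	int thresh = max_retries;

	if (qlen >> (max_qlen_log - 1)) {          /* over half full? */
		int young = qlen_young << 1;

		while (thresh > 2) {
			if (qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}
	return thresh;
}

int main(void)
{
	/* 900 of 1024 slots used, only 50 young: thresh drops from 5 to 2 */
	printf("thresh = %d\n", prune_thresh(900, 10, 50, 5));
	return 0;
}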
495
496struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
497 const unsigned int __nocast priority)
498{
499 struct sock *newsk = sk_clone(sk, priority);
500
501 if (newsk != NULL) {
502 struct inet_connection_sock *newicsk = inet_csk(newsk);
503
504 newsk->sk_state = TCP_SYN_RECV;
505 newicsk->icsk_bind_hash = NULL;
506
507 inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
508 newsk->sk_write_space = sk_stream_write_space;
509
510 newicsk->icsk_retransmits = 0;
511 newicsk->icsk_backoff = 0;
512 newicsk->icsk_probes_out = 0;
513
514 /* Deinitialize accept_queue to trap illegal accesses. */
515 memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
516 }
517 return newsk;
518}
519
520EXPORT_SYMBOL_GPL(inet_csk_clone);
521
522/*
523 * At this point, there should be no process reference to this
524 * socket, and thus no user references at all. Therefore we
525 * can assume the socket waitqueue is inactive and nobody will
526 * try to jump onto it.
527 */
528void inet_csk_destroy_sock(struct sock *sk)
529{
530 BUG_TRAP(sk->sk_state == TCP_CLOSE);
531 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
532
533 /* It cannot be in hash table! */
534 BUG_TRAP(sk_unhashed(sk));
535
536	 /* If inet_sk(sk)->num is not 0, it must be bound */
537 BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
538
539 sk->sk_prot->destroy(sk);
540
541 sk_stream_kill_queues(sk);
542
543 xfrm_sk_free_policy(sk);
544
545 sk_refcnt_debug_release(sk);
546
547 atomic_dec(sk->sk_prot->orphan_count);
548 sock_put(sk);
549}
550
551EXPORT_SYMBOL(inet_csk_destroy_sock);
552
553int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
554{
555 struct inet_sock *inet = inet_sk(sk);
556 struct inet_connection_sock *icsk = inet_csk(sk);
557 int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
558
559 if (rc != 0)
560 return rc;
561
562 sk->sk_max_ack_backlog = 0;
563 sk->sk_ack_backlog = 0;
564 inet_csk_delack_init(sk);
565
566 /* There is race window here: we announce ourselves listening,
567 * but this transition is still not validated by get_port().
568 * It is OK, because this socket enters to hash table only
569 * after validation is complete.
570 */
571 sk->sk_state = TCP_LISTEN;
572 if (!sk->sk_prot->get_port(sk, inet->num)) {
573 inet->sport = htons(inet->num);
574
575 sk_dst_reset(sk);
576 sk->sk_prot->hash(sk);
577
578 return 0;
579 }
580
581 sk->sk_state = TCP_CLOSE;
582 __reqsk_queue_destroy(&icsk->icsk_accept_queue);
583 return -EADDRINUSE;
584}
585
586EXPORT_SYMBOL_GPL(inet_csk_listen_start);
587
588/*
589 * This routine closes sockets which have been at least partially
590 * opened, but not yet accepted.
591 */
592void inet_csk_listen_stop(struct sock *sk)
593{
594 struct inet_connection_sock *icsk = inet_csk(sk);
595 struct request_sock *acc_req;
596 struct request_sock *req;
597
598 inet_csk_delete_keepalive_timer(sk);
599
600 /* make all the listen_opt local to us */
601 acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
602
603 /* Following specs, it would be better either to send FIN
604 * (and enter FIN-WAIT-1, it is normal close)
605 * or to send active reset (abort).
606	 * Certainly, it is pretty dangerous during a synflood, but that is
607	 * a bad justification for our negligence 8)
608	 * To be honest, we are not able to implement either
609 * of the variants now. --ANK
610 */
611 reqsk_queue_destroy(&icsk->icsk_accept_queue);
612
613 while ((req = acc_req) != NULL) {
614 struct sock *child = req->sk;
615
616 acc_req = req->dl_next;
617
618 local_bh_disable();
619 bh_lock_sock(child);
620 BUG_TRAP(!sock_owned_by_user(child));
621 sock_hold(child);
622
623 sk->sk_prot->disconnect(child, O_NONBLOCK);
624
625 sock_orphan(child);
626
627 atomic_inc(sk->sk_prot->orphan_count);
628
629 inet_csk_destroy_sock(child);
630
631 bh_unlock_sock(child);
632 local_bh_enable();
633 sock_put(child);
634
635 sk_acceptq_removed(sk);
636 __reqsk_free(req);
637 }
638 BUG_TRAP(!sk->sk_ack_backlog);
639}
640
641EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 000000000000..71f3c7350c6e
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,868 @@
1/*
2 * inet_diag.c Module for monitoring INET transport protocols sockets.
3 *
4 * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
5 *
6 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/fcntl.h>
18#include <linux/random.h>
19#include <linux/cache.h>
20#include <linux/init.h>
21#include <linux/time.h>
22
23#include <net/icmp.h>
24#include <net/tcp.h>
25#include <net/ipv6.h>
26#include <net/inet_common.h>
27#include <net/inet_connection_sock.h>
28#include <net/inet_hashtables.h>
29#include <net/inet_timewait_sock.h>
30#include <net/inet6_hashtables.h>
31
32#include <linux/inet.h>
33#include <linux/stddef.h>
34
35#include <linux/inet_diag.h>
36
37static const struct inet_diag_handler **inet_diag_table;
38
39struct inet_diag_entry {
40 u32 *saddr;
41 u32 *daddr;
42 u16 sport;
43 u16 dport;
44 u16 family;
45 u16 userlocks;
46};
47
48static struct sock *idiagnl;
49
50#define INET_DIAG_PUT(skb, attrtype, attrlen) \
51 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
52
53static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
54 int ext, u32 pid, u32 seq, u16 nlmsg_flags,
55 const struct nlmsghdr *unlh)
56{
57 const struct inet_sock *inet = inet_sk(sk);
58 const struct inet_connection_sock *icsk = inet_csk(sk);
59 struct inet_diag_msg *r;
60 struct nlmsghdr *nlh;
61 void *info = NULL;
62 struct inet_diag_meminfo *minfo = NULL;
63 unsigned char *b = skb->tail;
64 const struct inet_diag_handler *handler;
65
66 handler = inet_diag_table[unlh->nlmsg_type];
67 BUG_ON(handler == NULL);
68
69 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
70 nlh->nlmsg_flags = nlmsg_flags;
71
72 r = NLMSG_DATA(nlh);
73 if (sk->sk_state != TCP_TIME_WAIT) {
74 if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
75 minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO,
76 sizeof(*minfo));
77 if (ext & (1 << (INET_DIAG_INFO - 1)))
78 info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
79 handler->idiag_info_size);
80
81 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
82 size_t len = strlen(icsk->icsk_ca_ops->name);
83 strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
84 icsk->icsk_ca_ops->name);
85 }
86 }
87 r->idiag_family = sk->sk_family;
88 r->idiag_state = sk->sk_state;
89 r->idiag_timer = 0;
90 r->idiag_retrans = 0;
91
92 r->id.idiag_if = sk->sk_bound_dev_if;
93 r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
94 r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
95
96 if (r->idiag_state == TCP_TIME_WAIT) {
97 const struct inet_timewait_sock *tw = inet_twsk(sk);
98 long tmo = tw->tw_ttd - jiffies;
99 if (tmo < 0)
100 tmo = 0;
101
102 r->id.idiag_sport = tw->tw_sport;
103 r->id.idiag_dport = tw->tw_dport;
104 r->id.idiag_src[0] = tw->tw_rcv_saddr;
105 r->id.idiag_dst[0] = tw->tw_daddr;
106 r->idiag_state = tw->tw_substate;
107 r->idiag_timer = 3;
108 r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;
109 r->idiag_rqueue = 0;
110 r->idiag_wqueue = 0;
111 r->idiag_uid = 0;
112 r->idiag_inode = 0;
113#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
114 if (r->idiag_family == AF_INET6) {
115 const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
116
117 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
118 &tcp6tw->tw_v6_rcv_saddr);
119 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
120 &tcp6tw->tw_v6_daddr);
121 }
122#endif
123 nlh->nlmsg_len = skb->tail - b;
124 return skb->len;
125 }
126
127 r->id.idiag_sport = inet->sport;
128 r->id.idiag_dport = inet->dport;
129 r->id.idiag_src[0] = inet->rcv_saddr;
130 r->id.idiag_dst[0] = inet->daddr;
131
132#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
133 if (r->idiag_family == AF_INET6) {
134 struct ipv6_pinfo *np = inet6_sk(sk);
135
136 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
137 &np->rcv_saddr);
138 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
139 &np->daddr);
140 }
141#endif
142
143#define EXPIRES_IN_MS(tmo) ((((tmo) - jiffies) * 1000 + HZ - 1) / HZ)
144
145 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
146 r->idiag_timer = 1;
147 r->idiag_retrans = icsk->icsk_retransmits;
148 r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
149 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
150 r->idiag_timer = 4;
151 r->idiag_retrans = icsk->icsk_probes_out;
152 r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
153 } else if (timer_pending(&sk->sk_timer)) {
154 r->idiag_timer = 2;
155 r->idiag_retrans = icsk->icsk_probes_out;
156 r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
157 } else {
158 r->idiag_timer = 0;
159 r->idiag_expires = 0;
160 }
161#undef EXPIRES_IN_MS
162
163 r->idiag_uid = sock_i_uid(sk);
164 r->idiag_inode = sock_i_ino(sk);
165
166 if (minfo) {
167 minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
168 minfo->idiag_wmem = sk->sk_wmem_queued;
169 minfo->idiag_fmem = sk->sk_forward_alloc;
170 minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
171 }
172
173 handler->idiag_get_info(sk, r, info);
174
175 if (sk->sk_state < TCP_TIME_WAIT &&
176 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
177 icsk->icsk_ca_ops->get_info(sk, ext, skb);
178
179 nlh->nlmsg_len = skb->tail - b;
180 return skb->len;
181
182rtattr_failure:
183nlmsg_failure:
184 skb_trim(skb, b - skb->data);
185 return -1;
186}
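The idiag_cookie pair above splits the socket pointer into two u32 halves, shifting by 31 and then by 1 because a single >> 32 would be undefined on builds where unsigned long is 32 bits wide. A user-space round trip of the same trick:

#include <stdint.h>
#include <stdio.h>

/* Split a pointer-sized value into two u32 cookies the way
 * inet_diag_fill() does; on 32-bit builds cookie[1] is simply 0. */
static void ptr_to_cookie(unsigned long p, uint32_t cookie[2])
{
	cookie[0] = (uint32_t)p;
	cookie[1] = (uint32_t)((p >> 31) >> 1);  /* defined even at 32 bits */
}

static unsigned long cookie_to_ptr(const uint32_t cookie[2])
{
	return ((unsigned long)cookie[1] << 31 << 1) | cookie[0];
}

int main(void)
{
	uint32_t c[2];
	unsigned long p = (unsigned long)&c;     /* any pointer-sized value */

	ptr_to_cookie(p, c);
	printf("round trip ok: %d\n", cookie_to_ptr(c) == p);
	return 0;
}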
187
188static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
189{
190 int err;
191 struct sock *sk;
192 struct inet_diag_req *req = NLMSG_DATA(nlh);
193 struct sk_buff *rep;
194 struct inet_hashinfo *hashinfo;
195 const struct inet_diag_handler *handler;
196
197 handler = inet_diag_table[nlh->nlmsg_type];
198 BUG_ON(handler == NULL);
199 hashinfo = handler->idiag_hashinfo;
200
201 if (req->idiag_family == AF_INET) {
202 sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
203 req->id.idiag_dport, req->id.idiag_src[0],
204 req->id.idiag_sport, req->id.idiag_if);
205 }
206#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
207 else if (req->idiag_family == AF_INET6) {
208 sk = inet6_lookup(hashinfo,
209 (struct in6_addr *)req->id.idiag_dst,
210 req->id.idiag_dport,
211 (struct in6_addr *)req->id.idiag_src,
212 req->id.idiag_sport,
213 req->id.idiag_if);
214 }
215#endif
216 else {
217 return -EINVAL;
218 }
219
220 if (sk == NULL)
221 return -ENOENT;
222
223 err = -ESTALE;
224 if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
225 req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
226 ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
227 (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
228 goto out;
229
230 err = -ENOMEM;
231 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
232 sizeof(struct inet_diag_meminfo) +
233 handler->idiag_info_size + 64)),
234 GFP_KERNEL);
235 if (!rep)
236 goto out;
237
238 if (inet_diag_fill(rep, sk, req->idiag_ext,
239 NETLINK_CB(in_skb).pid,
240 nlh->nlmsg_seq, 0, nlh) <= 0)
241 BUG();
242
243 err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
244 MSG_DONTWAIT);
245 if (err > 0)
246 err = 0;
247
248out:
249 if (sk) {
250 if (sk->sk_state == TCP_TIME_WAIT)
251 inet_twsk_put((struct inet_timewait_sock *)sk);
252 else
253 sock_put(sk);
254 }
255 return err;
256}
257
258static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
259{
260 int words = bits >> 5;
261
262 bits &= 0x1f;
263
264 if (words) {
265 if (memcmp(a1, a2, words << 2))
266 return 0;
267 }
268 if (bits) {
269 __u32 w1, w2;
270 __u32 mask;
271
272 w1 = a1[words];
273 w2 = a2[words];
274
275 mask = htonl((0xffffffff) << (32 - bits));
276
277 if ((w1 ^ w2) & mask)
278 return 0;
279 }
280
281 return 1;
282}
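bitstring_match() compares the first prefix_len bits: whole 32-bit words via memcmp, then a masked tail word, building the mask with htonl so it lines up with addresses stored in network byte order. A quick user-space check with toy IPv4 addresses:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same logic as bitstring_match(): addresses are arrays of 32-bit
 * words in network byte order, compared on the leading `bits` bits. */
static int prefix_match(const uint32_t *a1, const uint32_t *a2, int bits)
{
	int words = bits >> 5;

	if (words && memcmp(a1, a2, words << 2))
		return 0;
	bits &= 0x1f;
	if (bits) {
		uint32_t mask = htonl(0xffffffffu << (32 - bits));

		if ((a1[words] ^ a2[words]) & mask)
			return 0;
	}
	return 1;
}

int main(void)
{
	uint32_t a = htonl(0x0a010203);    /* 10.1.2.3   */
	uint32_t b = htonl(0x0a0105ff);    /* 10.1.5.255 */

	printf("/16: %d  /24: %d\n", prefix_match(&a, &b, 16),
	       prefix_match(&a, &b, 24));  /* prints 1 then 0 */
	return 0;
}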
283
284
285static int inet_diag_bc_run(const void *bc, int len,
286 const struct inet_diag_entry *entry)
287{
288 while (len > 0) {
289 int yes = 1;
290 const struct inet_diag_bc_op *op = bc;
291
292 switch (op->code) {
293 case INET_DIAG_BC_NOP:
294 break;
295 case INET_DIAG_BC_JMP:
296 yes = 0;
297 break;
298 case INET_DIAG_BC_S_GE:
299 yes = entry->sport >= op[1].no;
300 break;
301 case INET_DIAG_BC_S_LE:
302 yes = entry->dport <= op[1].no;
303 break;
304 case INET_DIAG_BC_D_GE:
305 yes = entry->dport >= op[1].no;
306 break;
307 case INET_DIAG_BC_D_LE:
308 yes = entry->dport <= op[1].no;
309 break;
310 case INET_DIAG_BC_AUTO:
311 yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
312 break;
313 case INET_DIAG_BC_S_COND:
314 case INET_DIAG_BC_D_COND: {
315 struct inet_diag_hostcond *cond;
316 u32 *addr;
317
318 cond = (struct inet_diag_hostcond *)(op + 1);
319 if (cond->port != -1 &&
320 cond->port != (op->code == INET_DIAG_BC_S_COND ?
321 entry->sport : entry->dport)) {
322 yes = 0;
323 break;
324 }
325
326 if (cond->prefix_len == 0)
327 break;
328
329 if (op->code == INET_DIAG_BC_S_COND)
330 addr = entry->saddr;
331 else
332 addr = entry->daddr;
333
334 if (bitstring_match(addr, cond->addr, cond->prefix_len))
335 break;
336 if (entry->family == AF_INET6 &&
337 cond->family == AF_INET) {
338 if (addr[0] == 0 && addr[1] == 0 &&
339 addr[2] == htonl(0xffff) &&
340 bitstring_match(addr + 3, cond->addr,
341 cond->prefix_len))
342 break;
343 }
344 yes = 0;
345 break;
346 }
347 }
348
349 if (yes) {
350 len -= op->yes;
351 bc += op->yes;
352 } else {
353 len -= op->no;
354 bc += op->no;
355 }
356 }
357 return (len == 0);
358}
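Each op in the filter bytecode carries two byte offsets, yes and no; the interpreter only ever moves forward by one of them, and a program accepts when len lands exactly on 0 (overshooting to a negative len rejects). A toy "sport >= 1024" program, mirroring the inet_diag_bc_op layout of u8 code, u8 yes, u16 no:

#include <stdio.h>

struct bc_op { unsigned char code, yes; unsigned short no; };

enum { BC_NOP, BC_S_GE };                  /* toy subset of the op set */

/* Walk bytecode the way inet_diag_bc_run() does. */
static int bc_run(const struct bc_op *prog, int len, int sport)
{
	const char *bc = (const char *)prog;

	while (len > 0) {
		const struct bc_op *op = (const struct bc_op *)bc;
		int yes = 1;

		switch (op->code) {
		case BC_NOP:
			break;
		case BC_S_GE:
			yes = sport >= op[1].no;  /* operand rides in next slot */
			break;
		}
		bc  += yes ? op->yes : op->no;
		len -= yes ? op->yes : op->no;
	}
	return len == 0;                          /* exactly 0 = accept */
}

int main(void)
{
	struct bc_op prog[3] = {
		{ BC_S_GE, 8, 16 },  /* no = 16 > remaining 12: reject    */
		{ 0, 0, 1024 },      /* operand slot                      */
		{ BC_NOP, 4, 4 },    /* accept: consumes the last 4 bytes */
	};

	printf("%d %d\n", bc_run(prog, 12, 2000), bc_run(prog, 12, 80)); /* 1 0 */
	return 0;
}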
359
360static int valid_cc(const void *bc, int len, int cc)
361{
362 while (len >= 0) {
363 const struct inet_diag_bc_op *op = bc;
364
365 if (cc > len)
366 return 0;
367 if (cc == len)
368 return 1;
369 if (op->yes < 4)
370 return 0;
371 len -= op->yes;
372 bc += op->yes;
373 }
374 return 0;
375}
376
377static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
378{
379 const unsigned char *bc = bytecode;
380 int len = bytecode_len;
381
382 while (len > 0) {
383 struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
384
385//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
386 switch (op->code) {
387 case INET_DIAG_BC_AUTO:
388 case INET_DIAG_BC_S_COND:
389 case INET_DIAG_BC_D_COND:
390 case INET_DIAG_BC_S_GE:
391 case INET_DIAG_BC_S_LE:
392 case INET_DIAG_BC_D_GE:
393 case INET_DIAG_BC_D_LE:
394 if (op->yes < 4 || op->yes > len + 4)
395 return -EINVAL;
396 case INET_DIAG_BC_JMP:
397 if (op->no < 4 || op->no > len + 4)
398 return -EINVAL;
399 if (op->no < len &&
400 !valid_cc(bytecode, bytecode_len, len - op->no))
401 return -EINVAL;
402 break;
403 case INET_DIAG_BC_NOP:
404 if (op->yes < 4 || op->yes > len + 4)
405 return -EINVAL;
406 break;
407 default:
408 return -EINVAL;
409 }
410 bc += op->yes;
411 len -= op->yes;
412 }
413 return len == 0 ? 0 : -EINVAL;
414}
415
416static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk,
417 struct netlink_callback *cb)
418{
419 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
420
421 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
422 struct inet_diag_entry entry;
423 struct rtattr *bc = (struct rtattr *)(r + 1);
424 struct inet_sock *inet = inet_sk(sk);
425
426 entry.family = sk->sk_family;
427#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
428 if (entry.family == AF_INET6) {
429 struct ipv6_pinfo *np = inet6_sk(sk);
430
431 entry.saddr = np->rcv_saddr.s6_addr32;
432 entry.daddr = np->daddr.s6_addr32;
433 } else
434#endif
435 {
436 entry.saddr = &inet->rcv_saddr;
437 entry.daddr = &inet->daddr;
438 }
439 entry.sport = inet->num;
440 entry.dport = ntohs(inet->dport);
441 entry.userlocks = sk->sk_userlocks;
442
443 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
444 return 0;
445 }
446
447 return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid,
448 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
449}
450
451static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
452 struct request_sock *req,
453 u32 pid, u32 seq,
454 const struct nlmsghdr *unlh)
455{
456 const struct inet_request_sock *ireq = inet_rsk(req);
457 struct inet_sock *inet = inet_sk(sk);
458 unsigned char *b = skb->tail;
459 struct inet_diag_msg *r;
460 struct nlmsghdr *nlh;
461 long tmo;
462
463 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
464 nlh->nlmsg_flags = NLM_F_MULTI;
465 r = NLMSG_DATA(nlh);
466
467 r->idiag_family = sk->sk_family;
468 r->idiag_state = TCP_SYN_RECV;
469 r->idiag_timer = 1;
470 r->idiag_retrans = req->retrans;
471
472 r->id.idiag_if = sk->sk_bound_dev_if;
473 r->id.idiag_cookie[0] = (u32)(unsigned long)req;
474 r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
475
476 tmo = req->expires - jiffies;
477 if (tmo < 0)
478 tmo = 0;
479
480 r->id.idiag_sport = inet->sport;
481 r->id.idiag_dport = ireq->rmt_port;
482 r->id.idiag_src[0] = ireq->loc_addr;
483 r->id.idiag_dst[0] = ireq->rmt_addr;
484 r->idiag_expires = jiffies_to_msecs(tmo);
485 r->idiag_rqueue = 0;
486 r->idiag_wqueue = 0;
487 r->idiag_uid = sock_i_uid(sk);
488 r->idiag_inode = 0;
489#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
490 if (r->idiag_family == AF_INET6) {
491 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
492 &tcp6_rsk(req)->loc_addr);
493 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
494 &tcp6_rsk(req)->rmt_addr);
495 }
496#endif
497 nlh->nlmsg_len = skb->tail - b;
498
499 return skb->len;
500
501nlmsg_failure:
502 skb_trim(skb, b - skb->data);
503 return -1;
504}
505
506static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
507 struct netlink_callback *cb)
508{
509 struct inet_diag_entry entry;
510 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
511 struct inet_connection_sock *icsk = inet_csk(sk);
512 struct listen_sock *lopt;
513 struct rtattr *bc = NULL;
514 struct inet_sock *inet = inet_sk(sk);
515 int j, s_j;
516 int reqnum, s_reqnum;
517 int err = 0;
518
519 s_j = cb->args[3];
520 s_reqnum = cb->args[4];
521
522 if (s_j > 0)
523 s_j--;
524
525 entry.family = sk->sk_family;
526
527 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
528
529 lopt = icsk->icsk_accept_queue.listen_opt;
530 if (!lopt || !lopt->qlen)
531 goto out;
532
533 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
534 bc = (struct rtattr *)(r + 1);
535 entry.sport = inet->num;
536 entry.userlocks = sk->sk_userlocks;
537 }
538
539 for (j = s_j; j < lopt->nr_table_entries; j++) {
540 struct request_sock *req, *head = lopt->syn_table[j];
541
542 reqnum = 0;
543 for (req = head; req; reqnum++, req = req->dl_next) {
544 struct inet_request_sock *ireq = inet_rsk(req);
545
546 if (reqnum < s_reqnum)
547 continue;
548 if (r->id.idiag_dport != ireq->rmt_port &&
549 r->id.idiag_dport)
550 continue;
551
552 if (bc) {
553 entry.saddr =
554#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
555 (entry.family == AF_INET6) ?
556 tcp6_rsk(req)->loc_addr.s6_addr32 :
557#endif
558 &ireq->loc_addr;
559 entry.daddr =
560#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
561 (entry.family == AF_INET6) ?
562 tcp6_rsk(req)->rmt_addr.s6_addr32 :
563#endif
564 &ireq->rmt_addr;
565 entry.dport = ntohs(ireq->rmt_port);
566
567 if (!inet_diag_bc_run(RTA_DATA(bc),
568 RTA_PAYLOAD(bc), &entry))
569 continue;
570 }
571
572 err = inet_diag_fill_req(skb, sk, req,
573 NETLINK_CB(cb->skb).pid,
574 cb->nlh->nlmsg_seq, cb->nlh);
575 if (err < 0) {
576 cb->args[3] = j + 1;
577 cb->args[4] = reqnum;
578 goto out;
579 }
580 }
581
582 s_reqnum = 0;
583 }
584
585out:
586 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
587
588 return err;
589}
590
591static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
592{
593 int i, num;
594 int s_i, s_num;
595 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
596 const struct inet_diag_handler *handler;
597 struct inet_hashinfo *hashinfo;
598
599 handler = inet_diag_table[cb->nlh->nlmsg_type];
600 BUG_ON(handler == NULL);
601 hashinfo = handler->idiag_hashinfo;
602
603 s_i = cb->args[1];
604 s_num = num = cb->args[2];
605
606 if (cb->args[0] == 0) {
607 if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
608 goto skip_listen_ht;
609
610 inet_listen_lock(hashinfo);
611 for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
612 struct sock *sk;
613 struct hlist_node *node;
614
615 num = 0;
616 sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
617 struct inet_sock *inet = inet_sk(sk);
618
619 if (num < s_num) {
620 num++;
621 continue;
622 }
623
624 if (r->id.idiag_sport != inet->sport &&
625 r->id.idiag_sport)
626 goto next_listen;
627
628 if (!(r->idiag_states & TCPF_LISTEN) ||
629 r->id.idiag_dport ||
630 cb->args[3] > 0)
631 goto syn_recv;
632
633 if (inet_diag_dump_sock(skb, sk, cb) < 0) {
634 inet_listen_unlock(hashinfo);
635 goto done;
636 }
637
638syn_recv:
639 if (!(r->idiag_states & TCPF_SYN_RECV))
640 goto next_listen;
641
642 if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
643 inet_listen_unlock(hashinfo);
644 goto done;
645 }
646
647next_listen:
648 cb->args[3] = 0;
649 cb->args[4] = 0;
650 ++num;
651 }
652
653 s_num = 0;
654 cb->args[3] = 0;
655 cb->args[4] = 0;
656 }
657 inet_listen_unlock(hashinfo);
658skip_listen_ht:
659 cb->args[0] = 1;
660 s_i = num = s_num = 0;
661 }
662
663 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
664 return skb->len;
665
666 for (i = s_i; i < hashinfo->ehash_size; i++) {
667 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
668 struct sock *sk;
669 struct hlist_node *node;
670
671 if (i > s_i)
672 s_num = 0;
673
674 read_lock_bh(&head->lock);
675
676 num = 0;
677 sk_for_each(sk, node, &head->chain) {
678 struct inet_sock *inet = inet_sk(sk);
679
680 if (num < s_num)
681 goto next_normal;
682 if (!(r->idiag_states & (1 << sk->sk_state)))
683 goto next_normal;
684 if (r->id.idiag_sport != inet->sport &&
685 r->id.idiag_sport)
686 goto next_normal;
687 if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
688 goto next_normal;
689 if (inet_diag_dump_sock(skb, sk, cb) < 0) {
690 read_unlock_bh(&head->lock);
691 goto done;
692 }
693next_normal:
694 ++num;
695 }
696
697 if (r->idiag_states & TCPF_TIME_WAIT) {
698 sk_for_each(sk, node,
699 &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
700 struct inet_sock *inet = inet_sk(sk);
701
702 if (num < s_num)
703 goto next_dying;
704 if (r->id.idiag_sport != inet->sport &&
705 r->id.idiag_sport)
706 goto next_dying;
707 if (r->id.idiag_dport != inet->dport &&
708 r->id.idiag_dport)
709 goto next_dying;
710 if (inet_diag_dump_sock(skb, sk, cb) < 0) {
711 read_unlock_bh(&head->lock);
712 goto done;
713 }
714next_dying:
715 ++num;
716 }
717 }
718 read_unlock_bh(&head->lock);
719 }
720
721done:
722 cb->args[1] = i;
723 cb->args[2] = num;
724 return skb->len;
725}
726
727static int inet_diag_dump_done(struct netlink_callback *cb)
728{
729 return 0;
730}
731
732
733static __inline__ int
734inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
735{
736 if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
737 return 0;
738
739 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
740 goto err_inval;
741
742 if (inet_diag_table[nlh->nlmsg_type] == NULL)
743 return -ENOENT;
744
745 if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
746 goto err_inval;
747
748 if (nlh->nlmsg_flags&NLM_F_DUMP) {
749 if (nlh->nlmsg_len >
750 (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) {
751 struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
752 sizeof(struct inet_diag_req));
753 if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
754 rta->rta_len < 8 ||
755 rta->rta_len >
756 (nlh->nlmsg_len -
757 NLMSG_SPACE(sizeof(struct inet_diag_req))))
758 goto err_inval;
759 if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
760 goto err_inval;
761 }
762 return netlink_dump_start(idiagnl, skb, nlh,
763 inet_diag_dump,
764 inet_diag_dump_done);
765 } else {
766 return inet_diag_get_exact(skb, nlh);
767 }
768
769err_inval:
770 return -EINVAL;
771}
772
773
774static inline void inet_diag_rcv_skb(struct sk_buff *skb)
775{
776 int err;
777 struct nlmsghdr * nlh;
778
779 if (skb->len >= NLMSG_SPACE(0)) {
780 nlh = (struct nlmsghdr *)skb->data;
781 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
782 return;
783 err = inet_diag_rcv_msg(skb, nlh);
784 if (err || nlh->nlmsg_flags & NLM_F_ACK)
785 netlink_ack(skb, nlh, err);
786 }
787}
788
789static void inet_diag_rcv(struct sock *sk, int len)
790{
791 struct sk_buff *skb;
792 unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
793
794 while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
795 inet_diag_rcv_skb(skb);
796 kfree_skb(skb);
797 }
798}
799
800static DEFINE_SPINLOCK(inet_diag_register_lock);
801
802int inet_diag_register(const struct inet_diag_handler *h)
803{
804 const __u16 type = h->idiag_type;
805 int err = -EINVAL;
806
807 if (type >= INET_DIAG_GETSOCK_MAX)
808 goto out;
809
810 spin_lock(&inet_diag_register_lock);
811 err = -EEXIST;
812 if (inet_diag_table[type] == NULL) {
813 inet_diag_table[type] = h;
814 err = 0;
815 }
816 spin_unlock(&inet_diag_register_lock);
817out:
818 return err;
819}
820EXPORT_SYMBOL_GPL(inet_diag_register);
821
822void inet_diag_unregister(const struct inet_diag_handler *h)
823{
824 const __u16 type = h->idiag_type;
825
826 if (type >= INET_DIAG_GETSOCK_MAX)
827 return;
828
829 spin_lock(&inet_diag_register_lock);
830 inet_diag_table[type] = NULL;
831 spin_unlock(&inet_diag_register_lock);
832
833 synchronize_rcu();
834}
835EXPORT_SYMBOL_GPL(inet_diag_unregister);
836
837static int __init inet_diag_init(void)
838{
839 const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
840 sizeof(struct inet_diag_handler *));
841 int err = -ENOMEM;
842
843 inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL);
844 if (!inet_diag_table)
845 goto out;
846
847 memset(inet_diag_table, 0, inet_diag_table_size);
848 idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
849 THIS_MODULE);
850 if (idiagnl == NULL)
851 goto out_free_table;
852 err = 0;
853out:
854 return err;
855out_free_table:
856 kfree(inet_diag_table);
857 goto out;
858}
859
860static void __exit inet_diag_exit(void)
861{
862 sock_release(idiagnl->sk_socket);
863 kfree(inet_diag_table);
864}
865
866module_init(inet_diag_init);
867module_exit(inet_diag_exit);
868MODULE_LICENSE("GPL");
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 000000000000..e8d29fe736d2
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,165 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic INET transport hashtables
7 *
8 * Authors: Lotsa people, from code originally in tcp
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/sched.h>
19#include <linux/slab.h>
20#include <linux/wait.h>
21
22#include <net/inet_connection_sock.h>
23#include <net/inet_hashtables.h>
24
25/*
26 * Allocate and initialize a new local port bind bucket.
27 * The bindhash mutex for snum's hash chain must be held here.
28 */
29struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
30 struct inet_bind_hashbucket *head,
31 const unsigned short snum)
32{
33 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
34
35 if (tb != NULL) {
36 tb->port = snum;
37 tb->fastreuse = 0;
38 INIT_HLIST_HEAD(&tb->owners);
39 hlist_add_head(&tb->node, &head->chain);
40 }
41 return tb;
42}
43
44EXPORT_SYMBOL(inet_bind_bucket_create);
45
46/*
47 * Caller must hold hashbucket lock for this tb with local BH disabled
48 */
49void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
50{
51 if (hlist_empty(&tb->owners)) {
52 __hlist_del(&tb->node);
53 kmem_cache_free(cachep, tb);
54 }
55}
56
57void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
58 const unsigned short snum)
59{
60 inet_sk(sk)->num = snum;
61 sk_add_bind_node(sk, &tb->owners);
62 inet_csk(sk)->icsk_bind_hash = tb;
63}
64
65EXPORT_SYMBOL(inet_bind_hash);
66
67/*
68 * Get rid of any references to a local port held by the given sock.
69 */
70static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
71{
72 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
73 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
74 struct inet_bind_bucket *tb;
75
76 spin_lock(&head->lock);
77 tb = inet_csk(sk)->icsk_bind_hash;
78 __sk_del_bind_node(sk);
79 inet_csk(sk)->icsk_bind_hash = NULL;
80 inet_sk(sk)->num = 0;
81 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
82 spin_unlock(&head->lock);
83}
84
85void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
86{
87 local_bh_disable();
88 __inet_put_port(hashinfo, sk);
89 local_bh_enable();
90}
91
92EXPORT_SYMBOL(inet_put_port);
93
94/*
95 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
96 * Look: when several writers sleep and the reader wakes them up, all but one
97 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
98 * this, _but_ remember, it adds useless work on UP machines (a wakeup on each
99 * exclusive lock release). It should really be ifdefed.
100 */
101void inet_listen_wlock(struct inet_hashinfo *hashinfo)
102{
103 write_lock(&hashinfo->lhash_lock);
104
105 if (atomic_read(&hashinfo->lhash_users)) {
106 DEFINE_WAIT(wait);
107
108 for (;;) {
109 prepare_to_wait_exclusive(&hashinfo->lhash_wait,
110 &wait, TASK_UNINTERRUPTIBLE);
111 if (!atomic_read(&hashinfo->lhash_users))
112 break;
113 write_unlock_bh(&hashinfo->lhash_lock);
114 schedule();
115 write_lock_bh(&hashinfo->lhash_lock);
116 }
117
118 finish_wait(&hashinfo->lhash_wait, &wait);
119 }
120}
121
122EXPORT_SYMBOL(inet_listen_wlock);
123
124/*
125 * Don't inline this cruft. There are some nice properties to exploit here. The
126 * BSD API does not allow a listening sock to specify the remote port nor the
127 * remote address for the connection. So always assume those are both
128 * wildcarded during the search since they can never be otherwise.
129 */
130struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
131 const unsigned short hnum, const int dif)
132{
133 struct sock *result = NULL, *sk;
134 const struct hlist_node *node;
135 int hiscore = -1;
136
137 sk_for_each(sk, node, head) {
138 const struct inet_sock *inet = inet_sk(sk);
139
140 if (inet->num == hnum && !ipv6_only_sock(sk)) {
141 const __u32 rcv_saddr = inet->rcv_saddr;
142 int score = sk->sk_family == PF_INET ? 1 : 0;
143
144 if (rcv_saddr) {
145 if (rcv_saddr != daddr)
146 continue;
147 score += 2;
148 }
149 if (sk->sk_bound_dev_if) {
150 if (sk->sk_bound_dev_if != dif)
151 continue;
152 score += 2;
153 }
154 if (score == 5)
155 return sk;
156 if (score > hiscore) {
157 hiscore = score;
158 result = sk;
159 }
160 }
161 }
162 return result;
163}
164
165EXPORT_SYMBOL_GPL(__inet_lookup_listener);
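The lookup above scores candidates rather than taking the first hit: 1 point for a plain AF_INET socket, 2 more for a matching bound address, 2 more for a matching bound device; a perfect 5 short-circuits, otherwise the best score wins, so a specific bind always beats a wildcard. The weighting in isolation (toy types, user space):

#include <stdint.h>
#include <stdio.h>

struct listener {
	uint32_t rcv_saddr;    /* 0 = wildcard bind     */
	int      bound_dev;    /* 0 = any device        */
	int      is_inet;      /* plain PF_INET socket? */
};

/* Score one listener against an incoming (daddr, dif); -1 = no match.
 * Mirrors the weighting in __inet_lookup_listener(). */
static int listener_score(const struct listener *l, uint32_t daddr, int dif)
{
	int score = l->is_inet ? 1 : 0;

	if (l->rcv_saddr) {
		if (l->rcv_saddr != daddr)
			return -1;
		score += 2;
	}
	if (l->bound_dev) {
		if (l->bound_dev != dif)
			return -1;
		score += 2;
	}
	return score;                  /* 5 is a perfect, early-exit match */
}

int main(void)
{
	struct listener wild  = { 0,          0, 1 };
	struct listener bound = { 0x0a000001, 0, 1 };

	printf("wildcard=%d bound=%d\n",
	       listener_score(&wild,  0x0a000001, 2),
	       listener_score(&bound, 0x0a000001, 2));   /* 1 vs 3 */
	return 0;
}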
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 000000000000..4d1502a49852
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,384 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic TIME_WAIT sockets functions
7 *
8 * From code originally in TCP
9 */
10
11#include <linux/config.h>
12
13#include <net/inet_hashtables.h>
14#include <net/inet_timewait_sock.h>
15#include <net/ip.h>
16
17/* Must be called with locally disabled BHs. */
18void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
19{
20 struct inet_bind_hashbucket *bhead;
21 struct inet_bind_bucket *tb;
22 /* Unlink from established hashes. */
23 struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent];
24
25 write_lock(&ehead->lock);
26 if (hlist_unhashed(&tw->tw_node)) {
27 write_unlock(&ehead->lock);
28 return;
29 }
30 __hlist_del(&tw->tw_node);
31 sk_node_init(&tw->tw_node);
32 write_unlock(&ehead->lock);
33
34 /* Disassociate with bind bucket. */
35 bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
36 spin_lock(&bhead->lock);
37 tb = tw->tw_tb;
38 __hlist_del(&tw->tw_bind_node);
39 tw->tw_tb = NULL;
40 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
41 spin_unlock(&bhead->lock);
42#ifdef SOCK_REFCNT_DEBUG
43 if (atomic_read(&tw->tw_refcnt) != 1) {
44 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
45 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
46 }
47#endif
48 inet_twsk_put(tw);
49}
50
51EXPORT_SYMBOL_GPL(__inet_twsk_kill);
52
53/*
54 * Enter the time wait state. This is called with locally disabled BH.
55 * Essentially we whip up a timewait bucket, copy the relevant info into it
56 * from the SK, and mess with hash chains and list linkage.
57 */
58void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
59 struct inet_hashinfo *hashinfo)
60{
61 const struct inet_sock *inet = inet_sk(sk);
62 const struct inet_connection_sock *icsk = inet_csk(sk);
63 struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent];
64 struct inet_bind_hashbucket *bhead;
65 /* Step 1: Put TW into bind hash. Original socket stays there too.
66	 Note that any socket with inet->num != 0 MUST be bound in
67 binding cache, even if it is closed.
68 */
69 bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
70 spin_lock(&bhead->lock);
71 tw->tw_tb = icsk->icsk_bind_hash;
72 BUG_TRAP(icsk->icsk_bind_hash);
73 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
74 spin_unlock(&bhead->lock);
75
76 write_lock(&ehead->lock);
77
78 /* Step 2: Remove SK from established hash. */
79 if (__sk_del_node_init(sk))
80 sock_prot_dec_use(sk->sk_prot);
81
82 /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
83 inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
84 atomic_inc(&tw->tw_refcnt);
85
86 write_unlock(&ehead->lock);
87}
88
89EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
90
91struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
92{
93 struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
94 SLAB_ATOMIC);
95 if (tw != NULL) {
96 const struct inet_sock *inet = inet_sk(sk);
97
98 /* Give us an identity. */
99 tw->tw_daddr = inet->daddr;
100 tw->tw_rcv_saddr = inet->rcv_saddr;
101 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
102 tw->tw_num = inet->num;
103 tw->tw_state = TCP_TIME_WAIT;
104 tw->tw_substate = state;
105 tw->tw_sport = inet->sport;
106 tw->tw_dport = inet->dport;
107 tw->tw_family = sk->sk_family;
108 tw->tw_reuse = sk->sk_reuse;
109 tw->tw_hashent = sk->sk_hashent;
110 tw->tw_ipv6only = 0;
111 tw->tw_prot = sk->sk_prot_creator;
112 atomic_set(&tw->tw_refcnt, 1);
113 inet_twsk_dead_node_init(tw);
114 }
115
116 return tw;
117}
118
119EXPORT_SYMBOL_GPL(inet_twsk_alloc);
120
121/* Returns non-zero if quota exceeded. */
122static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
123 const int slot)
124{
125 struct inet_timewait_sock *tw;
126 struct hlist_node *node;
127 unsigned int killed;
128 int ret;
129
130	/* NOTE: compare this to the previous version where the lock
131	 * was released after detaching the chain. It was racy,
132	 * because tw buckets are scheduled in a non-serialized context
133	 * in 2.3 (with netfilter), and with softnet it is common, because
134	 * soft irqs are not sequenced.
135 */
136 killed = 0;
137 ret = 0;
138rescan:
139 inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
140 __inet_twsk_del_dead_node(tw);
141 spin_unlock(&twdr->death_lock);
142 __inet_twsk_kill(tw, twdr->hashinfo);
143 inet_twsk_put(tw);
144 killed++;
145 spin_lock(&twdr->death_lock);
146 if (killed > INET_TWDR_TWKILL_QUOTA) {
147 ret = 1;
148 break;
149 }
150
151 /* While we dropped twdr->death_lock, another cpu may have
152 * killed off the next TW bucket in the list, therefore
153 * do a fresh re-read of the hlist head node with the
154 * lock reacquired. We still use the hlist traversal
155 * macro in order to get the prefetches.
156 */
157 goto rescan;
158 }
159
160 twdr->tw_count -= killed;
161 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
162
163 return ret;
164}
165
166void inet_twdr_hangman(unsigned long data)
167{
168 struct inet_timewait_death_row *twdr;
169	 unsigned int need_timer;
170
171 twdr = (struct inet_timewait_death_row *)data;
172 spin_lock(&twdr->death_lock);
173
174 if (twdr->tw_count == 0)
175 goto out;
176
177 need_timer = 0;
178 if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
179 twdr->thread_slots |= (1 << twdr->slot);
180 mb();
181 schedule_work(&twdr->twkill_work);
182 need_timer = 1;
183 } else {
184 /* We purged the entire slot, anything left? */
185 if (twdr->tw_count)
186 need_timer = 1;
187 }
188 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
189 if (need_timer)
190 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
191out:
192 spin_unlock(&twdr->death_lock);
193}
194
195EXPORT_SYMBOL_GPL(inet_twdr_hangman);
196
197extern void twkill_slots_invalid(void);
198
199void inet_twdr_twkill_work(void *data)
200{
201 struct inet_timewait_death_row *twdr = data;
202 int i;
203
204 if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
205 twkill_slots_invalid();
206
207 while (twdr->thread_slots) {
208 spin_lock_bh(&twdr->death_lock);
209 for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
210 if (!(twdr->thread_slots & (1 << i)))
211 continue;
212
213 while (inet_twdr_do_twkill_work(twdr, i) != 0) {
214 if (need_resched()) {
215 spin_unlock_bh(&twdr->death_lock);
216 schedule();
217 spin_lock_bh(&twdr->death_lock);
218 }
219 }
220
221 twdr->thread_slots &= ~(1 << i);
222 }
223 spin_unlock_bh(&twdr->death_lock);
224 }
225}
226
227EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
228
229/* These are always called from BH context. See callers in
230 * tcp_input.c to verify this.
231 */
232
233/* This is for handling early-kills of TIME_WAIT sockets. */
234void inet_twsk_deschedule(struct inet_timewait_sock *tw,
235 struct inet_timewait_death_row *twdr)
236{
237 spin_lock(&twdr->death_lock);
238 if (inet_twsk_del_dead_node(tw)) {
239 inet_twsk_put(tw);
240 if (--twdr->tw_count == 0)
241 del_timer(&twdr->tw_timer);
242 }
243 spin_unlock(&twdr->death_lock);
244 __inet_twsk_kill(tw, twdr->hashinfo);
245}
246
247EXPORT_SYMBOL(inet_twsk_deschedule);
248
249void inet_twsk_schedule(struct inet_timewait_sock *tw,
250 struct inet_timewait_death_row *twdr,
251 const int timeo, const int timewait_len)
252{
253 struct hlist_head *list;
254 int slot;
255
256 /* timeout := RTO * 3.5
257 *
258 * 3.5 = 1+2+0.5 to wait for two retransmits.
259 *
260	 * RATIONALE: if a FIN arrived and we entered TIME-WAIT state,
261	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
262	 * FINs (or previous segments) are lost, the probability of such an
263	 * event is p^(N+1), where p is the probability of losing a single
264	 * packet, and the time to detect the loss is about RTO*(2^N - 1)
265	 * with exponential backoff. The normal timewait length is calculated
266	 * so that we wait at least for one retransmitted FIN (the maximal
267	 * RTO is 120 sec). [ BTW Linux, following BSD, violates this
268	 * requirement, waiting only 60 sec; we should wait at least 240 secs.
269	 * Well, 240 consumes too much in the way of resources 8)
270	 * ]
271	 * This interval is not reduced, so as to catch old duplicates and
272	 * responses to our wandering segments living for two MSLs.
273 * However, if we use PAWS to detect
274 * old duplicates, we can reduce the interval to bounds required
275 * by RTO, rather than MSL. So, if peer understands PAWS, we
276 * kill tw bucket after 3.5*RTO (it is important that this number
277 * is greater than TS tick!) and detect old duplicates with help
278 * of PAWS.
279 */
280 slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
281
282 spin_lock(&twdr->death_lock);
283
284 /* Unlink it, if it was scheduled */
285 if (inet_twsk_del_dead_node(tw))
286 twdr->tw_count--;
287 else
288 atomic_inc(&tw->tw_refcnt);
289
290 if (slot >= INET_TWDR_RECYCLE_SLOTS) {
291 /* Schedule to slow timer */
292 if (timeo >= timewait_len) {
293 slot = INET_TWDR_TWKILL_SLOTS - 1;
294 } else {
295 slot = (timeo + twdr->period - 1) / twdr->period;
296 if (slot >= INET_TWDR_TWKILL_SLOTS)
297 slot = INET_TWDR_TWKILL_SLOTS - 1;
298 }
299 tw->tw_ttd = jiffies + timeo;
300 slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
301 list = &twdr->cells[slot];
302 } else {
303 tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
304
305 if (twdr->twcal_hand < 0) {
306 twdr->twcal_hand = 0;
307 twdr->twcal_jiffie = jiffies;
308 twdr->twcal_timer.expires = twdr->twcal_jiffie +
309 (slot << INET_TWDR_RECYCLE_TICK);
310 add_timer(&twdr->twcal_timer);
311 } else {
312 if (time_after(twdr->twcal_timer.expires,
313 jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
314 mod_timer(&twdr->twcal_timer,
315 jiffies + (slot << INET_TWDR_RECYCLE_TICK));
316 slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
317 }
318 list = &twdr->twcal_row[slot];
319 }
320
321 hlist_add_head(&tw->tw_death_node, list);
322
323 if (twdr->tw_count++ == 0)
324 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
325 spin_unlock(&twdr->death_lock);
326}
327
328EXPORT_SYMBOL_GPL(inet_twsk_schedule);
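inet_twsk_schedule() rounds the timeout up to whole recycle ticks; short timeouts go to the fine-grained twcal wheel, longer ones are rescaled into slow-timer slots of twdr->period jiffies each. The slot arithmetic in isolation (the constants below are illustrative stand-ins for the INET_TWDR_* values, an assumption for the sketch):

#include <stdio.h>

#define RECYCLE_TICK   7      /* toy: one fine slot = 128 jiffies */
#define RECYCLE_SLOTS  32     /* fine-grained twcal wheel         */
#define TWKILL_SLOTS   8      /* coarse slow-timer wheel          */

/* Reproduce the slot choice of inet_twsk_schedule(). */
static void pick_slot(int timeo, int period, int timewait_len)
{
	int slot = (timeo + (1 << RECYCLE_TICK) - 1) >> RECYCLE_TICK;

	if (slot >= RECYCLE_SLOTS) {
		if (timeo >= timewait_len)
			slot = TWKILL_SLOTS - 1;
		else {
			slot = (timeo + period - 1) / period;  /* round up */
			if (slot >= TWKILL_SLOTS)
				slot = TWKILL_SLOTS - 1;
		}
		printf("slow wheel, slot %d\n", slot);
	} else {
		printf("fine wheel, slot %d (%d jiffies)\n",
		       slot, slot << RECYCLE_TICK);
	}
}

int main(void)
{
	pick_slot(300, 7500, 60000);    /* short RTO-based timeout */
	pick_slot(60000, 7500, 60000);  /* full 60 s TIME-WAIT     */
	return 0;
}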
329
330void inet_twdr_twcal_tick(unsigned long data)
331{
332 struct inet_timewait_death_row *twdr;
333 int n, slot;
334 unsigned long j;
335 unsigned long now = jiffies;
336 int killed = 0;
337 int adv = 0;
338
339 twdr = (struct inet_timewait_death_row *)data;
340
341 spin_lock(&twdr->death_lock);
342 if (twdr->twcal_hand < 0)
343 goto out;
344
345 slot = twdr->twcal_hand;
346 j = twdr->twcal_jiffie;
347
348 for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
349 if (time_before_eq(j, now)) {
350 struct hlist_node *node, *safe;
351 struct inet_timewait_sock *tw;
352
353 inet_twsk_for_each_inmate_safe(tw, node, safe,
354 &twdr->twcal_row[slot]) {
355 __inet_twsk_del_dead_node(tw);
356 __inet_twsk_kill(tw, twdr->hashinfo);
357 inet_twsk_put(tw);
358 killed++;
359 }
360 } else {
361 if (!adv) {
362 adv = 1;
363 twdr->twcal_jiffie = j;
364 twdr->twcal_hand = slot;
365 }
366
367 if (!hlist_empty(&twdr->twcal_row[slot])) {
368 mod_timer(&twdr->twcal_timer, j);
369 goto out;
370 }
371 }
372 j += 1 << INET_TWDR_RECYCLE_TICK;
373 slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
374 }
375 twdr->twcal_hand = -1;
376
377out:
378 if ((twdr->tw_count -= killed) == 0)
379 del_timer(&twdr->tw_timer);
380 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
381 spin_unlock(&twdr->death_lock);
382}
383
384EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 95473953c406..f84ba9c96551 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -20,6 +20,7 @@
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/net.h> 22#include <linux/net.h>
23#include <net/ip.h>
23#include <net/inetpeer.h> 24#include <net/inetpeer.h>
24 25
25/* 26/*
@@ -72,7 +73,7 @@
72/* Exported for inet_getid inline function. */ 73/* Exported for inet_getid inline function. */
73DEFINE_SPINLOCK(inet_peer_idlock); 74DEFINE_SPINLOCK(inet_peer_idlock);
74 75
75static kmem_cache_t *peer_cachep; 76static kmem_cache_t *peer_cachep __read_mostly;
76 77
77#define node_height(x) x->avl_height 78#define node_height(x) x->avl_height
78static struct inet_peer peer_fake_node = { 79static struct inet_peer peer_fake_node = {
@@ -450,11 +451,12 @@ static void peer_check_expire(unsigned long dummy)
450 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 451 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
451 * interval depending on the total number of entries (more entries, 452 * interval depending on the total number of entries (more entries,
452 * less interval). */ 453 * less interval). */
453 peer_periodic_timer.expires = jiffies 454 if (peer_total >= inet_peer_threshold)
454 + inet_peer_gc_maxtime 455 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
455 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 456 else
456 peer_total / inet_peer_threshold * HZ; 457 peer_periodic_timer.expires = jiffies
458 + inet_peer_gc_maxtime
459 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
460 peer_total / inet_peer_threshold * HZ;
457 add_timer(&peer_periodic_timer); 461 add_timer(&peer_periodic_timer);
458} 462}
459
460EXPORT_SYMBOL(inet_peer_idlock);
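The hunk above adds a clamp: once the pool crosses inet_peer_threshold, the GC timer always refires after inet_peer_gc_mintime, while below the threshold the original linear interpolation is kept. A minimal sketch of that schedule (the function name is illustrative; the kernel's /HZ ... *HZ ordering exists to keep the 32-bit multiplication from overflowing):

static unsigned long peer_gc_interval(unsigned long mintime,
                                      unsigned long maxtime,
                                      int total, int threshold)
{
        if (total >= threshold)
                return mintime;   /* pool full: collect as often as allowed */
        /* empty pool -> maxtime, full pool -> mintime, linear in between */
        return maxtime - (maxtime - mintime) * total / threshold;
}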
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 77094aac6c28..0923add122b4 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb)
76 * that reaches zero, we must reply with an ICMP control message telling 76 * that reaches zero, we must reply with an ICMP control message telling
77 * that the packet's lifetime expired. 77 * that the packet's lifetime expired.
78 */ 78 */
79 79 if (skb->nh.iph->ttl <= 1)
80 iph = skb->nh.iph;
81
82 if (iph->ttl <= 1)
83 goto too_many_hops; 80 goto too_many_hops;
84 81
85 if (!xfrm4_route_forward(skb)) 82 if (!xfrm4_route_forward(skb))
86 goto drop; 83 goto drop;
87 84
88 iph = skb->nh.iph;
89 rt = (struct rtable*)skb->dst; 85 rt = (struct rtable*)skb->dst;
90 86
91 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 87 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7f68e27eb4ea..9e6e683cc34d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
377 return ip_frag_intern(hash, qp); 377 return ip_frag_intern(hash, qp);
378 378
379out_nomem: 379out_nomem:
380 NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); 380 LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
381 return NULL; 381 return NULL;
382} 382}
383 383
@@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
533 if (skb->dev) 533 if (skb->dev)
534 qp->iif = skb->dev->ifindex; 534 qp->iif = skb->dev->ifindex;
535 skb->dev = NULL; 535 skb->dev = NULL;
536 qp->stamp = skb->stamp; 536 skb_get_timestamp(skb, &qp->stamp);
537 qp->meat += skb->len; 537 qp->meat += skb->len;
538 atomic_add(skb->truesize, &ip_frag_mem); 538 atomic_add(skb->truesize, &ip_frag_mem);
539 if (offset == 0) 539 if (offset == 0)
@@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
615 615
616 head->next = NULL; 616 head->next = NULL;
617 head->dev = dev; 617 head->dev = dev;
618 head->stamp = qp->stamp; 618 skb_set_timestamp(head, &qp->stamp);
619 619
620 iph = head->nh.iph; 620 iph = head->nh.iph;
621 iph->frag_off = 0; 621 iph->frag_off = 0;
@@ -625,10 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
625 return head; 625 return head;
626 626
627out_nomem: 627out_nomem:
628 NETDEBUG(if (net_ratelimit()) 628 LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
629 printk(KERN_ERR 629 "queue %p\n", qp);
630 "IP: queue_glue: no memory for gluing queue %p\n",
631 qp));
632 goto out_fail; 630 goto out_fail;
633out_oversize: 631out_oversize:
634 if (net_ratelimit()) 632 if (net_ratelimit())
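The two timestamp hunks above stop poking skb->stamp directly and go through accessors instead. A usage sketch, assuming the 2.6.14-era signatures that copy through a struct timeval:

        struct timeval stamp;

        skb_get_timestamp(skb, &stamp);   /* copy the arrival time out of the fragment */
        /* ... fragment sits in the reassembly queue ... */
        skb_set_timestamp(head, &stamp);  /* stamp the reassembled packet with it */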
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 884835522224..f0d5740d7e22 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -290,7 +290,6 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
290 290
291 dev_hold(dev); 291 dev_hold(dev);
292 ipgre_tunnel_link(nt); 292 ipgre_tunnel_link(nt);
293 /* Do not decrement MOD_USE_COUNT here. */
294 return nt; 293 return nt;
295 294
296failed: 295failed:
@@ -1277,12 +1276,28 @@ err1:
1277 goto out; 1276 goto out;
1278} 1277}
1279 1278
1280static void ipgre_fini(void) 1279static void __exit ipgre_destroy_tunnels(void)
1280{
1281 int prio;
1282
1283 for (prio = 0; prio < 4; prio++) {
1284 int h;
1285 for (h = 0; h < HASH_SIZE; h++) {
1286 struct ip_tunnel *t;
1287 while ((t = tunnels[prio][h]) != NULL)
1288 unregister_netdevice(t->dev);
1289 }
1290 }
1291}
1292
1293static void __exit ipgre_fini(void)
1281{ 1294{
1282 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1295 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1283 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1296 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1284 1297
1285 unregister_netdev(ipgre_fb_tunnel_dev); 1298 rtnl_lock();
1299 ipgre_destroy_tunnels();
1300 rtnl_unlock();
1286} 1301}
1287 1302
1288module_init(ipgre_init); 1303module_init(ipgre_init);
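The new exit path walks every tunnel under a single RTNL acquisition; unregister_netdevice() expects the caller to already hold the lock, where the older unregister_netdev() took it once per device. The shape of the teardown, condensed from ipgre_destroy_tunnels()/ipgre_fini() above (the while loop terminates because the device's uninit hook unlinks it from tunnels[][]):

        rtnl_lock();
        for (prio = 0; prio < 4; prio++)
                for (h = 0; h < HASH_SIZE; h++)
                        while ((t = tunnels[prio][h]) != NULL)
                                unregister_netdevice(t->dev);
        rtnl_unlock();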
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c703528e0bcd..473d0f2b2e0d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -150,7 +150,7 @@
150 * SNMP management statistics 150 * SNMP management statistics
151 */ 151 */
152 152
153DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); 153DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
154 154
155/* 155/*
156 * Process Router Attention IP option 156 * Process Router Attention IP option
@@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
225 /* If there may be a raw socket we must check - if not we 225 /* If there may be a raw socket we must check - if not we
226 * couldn't care less 226 * couldn't care less
227 */ 227 */
228 if (raw_sk) 228 if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash))
229 raw_v4_input(skb, skb->nh.iph, hash); 229 raw_sk = NULL;
230 230
231 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { 231 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
232 int ret; 232 int ret;
@@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb)
279 ip_local_deliver_finish); 279 ip_local_deliver_finish);
280} 280}
281 281
282static inline int ip_rcv_finish(struct sk_buff *skb) 282static inline int ip_rcv_options(struct sk_buff *skb)
283{ 283{
284 struct ip_options *opt;
285 struct iphdr *iph;
284 struct net_device *dev = skb->dev; 286 struct net_device *dev = skb->dev;
287
288 /* It looks like overkill, because not all
289 IP options require packet mangling.
290 But it is the easiest way for now, especially taking
291 into account that the combination of IP options
292 and a running sniffer is an extremely rare condition.
293 --ANK (980813)
294 */
295 if (skb_cow(skb, skb_headroom(skb))) {
296 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
297 goto drop;
298 }
299
300 iph = skb->nh.iph;
301
302 if (ip_options_compile(NULL, skb)) {
303 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
304 goto drop;
305 }
306
307 opt = &(IPCB(skb)->opt);
308 if (unlikely(opt->srr)) {
309 struct in_device *in_dev = in_dev_get(dev);
310 if (in_dev) {
311 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
312 if (IN_DEV_LOG_MARTIANS(in_dev) &&
313 net_ratelimit())
314 printk(KERN_INFO "source route option "
315 "%u.%u.%u.%u -> %u.%u.%u.%u\n",
316 NIPQUAD(iph->saddr),
317 NIPQUAD(iph->daddr));
318 in_dev_put(in_dev);
319 goto drop;
320 }
321
322 in_dev_put(in_dev);
323 }
324
325 if (ip_options_rcv_srr(skb))
326 goto drop;
327 }
328
329 return 0;
330drop:
331 return -1;
332}
333
334static inline int ip_rcv_finish(struct sk_buff *skb)
335{
285 struct iphdr *iph = skb->nh.iph; 336 struct iphdr *iph = skb->nh.iph;
286 int err;
287 337
288 /* 338 /*
289 * Initialise the virtual path cache for the packet. It describes 339 * Initialise the virtual path cache for the packet. It describes
290 * how the packet travels inside Linux networking. 340 * how the packet travels inside Linux networking.
291 */ 341 */
292 if (skb->dst == NULL) { 342 if (likely(skb->dst == NULL)) {
293 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { 343 int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
344 skb->dev);
345 if (unlikely(err)) {
294 if (err == -EHOSTUNREACH) 346 if (err == -EHOSTUNREACH)
295 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); 347 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
296 goto drop; 348 goto drop;
@@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
298 } 350 }
299 351
300#ifdef CONFIG_NET_CLS_ROUTE 352#ifdef CONFIG_NET_CLS_ROUTE
301 if (skb->dst->tclassid) { 353 if (unlikely(skb->dst->tclassid)) {
302 struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); 354 struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
303 u32 idx = skb->dst->tclassid; 355 u32 idx = skb->dst->tclassid;
304 st[idx&0xFF].o_packets++; 356 st[idx&0xFF].o_packets++;
@@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
308 } 360 }
309#endif 361#endif
310 362
311 if (iph->ihl > 5) { 363 if (iph->ihl > 5 && ip_rcv_options(skb))
312 struct ip_options *opt; 364 goto drop;
313
314 /* It looks like overkill, because not all
315 IP options require packet mangling.
316 But it is the easiest way for now, especially taking
317 into account that the combination of IP options
318 and a running sniffer is an extremely rare condition.
319 --ANK (980813)
320 */
321
322 if (skb_cow(skb, skb_headroom(skb))) {
323 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
324 goto drop;
325 }
326 iph = skb->nh.iph;
327
328 if (ip_options_compile(NULL, skb))
329 goto inhdr_error;
330
331 opt = &(IPCB(skb)->opt);
332 if (opt->srr) {
333 struct in_device *in_dev = in_dev_get(dev);
334 if (in_dev) {
335 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
336 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
337 printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
338 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
339 in_dev_put(in_dev);
340 goto drop;
341 }
342 in_dev_put(in_dev);
343 }
344 if (ip_options_rcv_srr(skb))
345 goto drop;
346 }
347 }
348 365
349 return dst_input(skb); 366 return dst_input(skb);
350 367
351inhdr_error:
352 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
353drop: 368drop:
354 kfree_skb(skb); 369 kfree_skb(skb);
355 return NET_RX_DROP; 370 return NET_RX_DROP;
@@ -358,9 +373,10 @@ drop:
358/* 373/*
359 * Main IP Receive routine. 374 * Main IP Receive routine.
360 */ 375 */
361int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 376int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
362{ 377{
363 struct iphdr *iph; 378 struct iphdr *iph;
379 u32 len;
364 380
365 /* When the interface is in promisc. mode, drop all the crap 381 /* When the interface is in promisc. mode, drop all the crap
366 * that it receives, do not try to analyse it. 382 * that it receives, do not try to analyse it.
@@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
392 */ 408 */
393 409
394 if (iph->ihl < 5 || iph->version != 4) 410 if (iph->ihl < 5 || iph->version != 4)
395 goto inhdr_error; 411 goto inhdr_error;
396 412
397 if (!pskb_may_pull(skb, iph->ihl*4)) 413 if (!pskb_may_pull(skb, iph->ihl*4))
398 goto inhdr_error; 414 goto inhdr_error;
399 415
400 iph = skb->nh.iph; 416 iph = skb->nh.iph;
401 417
402 if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) 418 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
403 goto inhdr_error; 419 goto inhdr_error;
404 420
405 { 421 len = ntohs(iph->tot_len);
406 __u32 len = ntohs(iph->tot_len); 422 if (skb->len < len || len < (iph->ihl*4))
407 if (skb->len < len || len < (iph->ihl<<2)) 423 goto inhdr_error;
408 goto inhdr_error;
409 424
410 /* Our transport medium may have padded the buffer out. Now we know it 425 /* Our transport medium may have padded the buffer out. Now we know it
411 * is IP we can trim to the true length of the frame. 426 * is IP we can trim to the true length of the frame.
412 * Note this now means skb->len holds ntohs(iph->tot_len). 427 * Note this now means skb->len holds ntohs(iph->tot_len).
413 */ 428 */
414 if (pskb_trim_rcsum(skb, len)) { 429 if (pskb_trim_rcsum(skb, len)) {
415 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); 430 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
416 goto drop; 431 goto drop;
417 }
418 } 432 }
419 433
420 return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, 434 return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
@@ -428,5 +442,4 @@ out:
428 return NET_RX_DROP; 442 return NET_RX_DROP;
429} 443}
430 444
431EXPORT_SYMBOL(ip_rcv);
432EXPORT_SYMBOL(ip_statistics); 445EXPORT_SYMBOL(ip_statistics);
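Condensed, the validation sequence ip_rcv() performs above is (a sketch for orientation, not the exact kernel code; all error paths collapsed into a single return):

static int ipv4_header_ok(struct sk_buff *skb)
{
        struct iphdr *iph;
        u32 len;

        if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                return 0;
        iph = skb->nh.iph;
        if (iph->ihl < 5 || iph->version != 4)    /* minimal header, IPv4 only */
                return 0;
        if (!pskb_may_pull(skb, iph->ihl * 4))    /* full header incl. options */
                return 0;
        iph = skb->nh.iph;                        /* pull may have moved the data */
        if (ip_fast_csum((u8 *)iph, iph->ihl))    /* header checksum must be zero */
                return 0;
        len = ntohs(iph->tot_len);
        if (skb->len < len || len < iph->ihl * 4) /* truncated or bogus length */
                return 0;
        return pskb_trim_rcsum(skb, len) == 0;    /* strip link-layer padding */
}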
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 6d89f3f3e701..bce4e875193b 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt)
489 } 489 }
490} 490}
491 491
492int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) 492static struct ip_options *ip_options_get_alloc(const int optlen)
493{ 493{
494 struct ip_options *opt; 494 struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3),
495 GFP_KERNEL);
496 if (opt)
497 memset(opt, 0, sizeof(*opt));
498 return opt;
499}
495 500
496 opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); 501static int ip_options_get_finish(struct ip_options **optp,
497 if (!opt) 502 struct ip_options *opt, int optlen)
498 return -ENOMEM; 503{
499 memset(opt, 0, sizeof(struct ip_options));
500 if (optlen) {
501 if (user) {
502 if (copy_from_user(opt->__data, data, optlen)) {
503 kfree(opt);
504 return -EFAULT;
505 }
506 } else
507 memcpy(opt->__data, data, optlen);
508 }
509 while (optlen & 3) 504 while (optlen & 3)
510 opt->__data[optlen++] = IPOPT_END; 505 opt->__data[optlen++] = IPOPT_END;
511 opt->optlen = optlen; 506 opt->optlen = optlen;
@@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in
521 return 0; 516 return 0;
522} 517}
523 518
519int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen)
520{
521 struct ip_options *opt = ip_options_get_alloc(optlen);
522
523 if (!opt)
524 return -ENOMEM;
525 if (optlen && copy_from_user(opt->__data, data, optlen)) {
526 kfree(opt);
527 return -EFAULT;
528 }
529 return ip_options_get_finish(optp, opt, optlen);
530}
531
532int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen)
533{
534 struct ip_options *opt = ip_options_get_alloc(optlen);
535
536 if (!opt)
537 return -ENOMEM;
538 if (optlen)
539 memcpy(opt->__data, data, optlen);
540 return ip_options_get_finish(optp, opt, optlen);
541}
542
524void ip_forward_options(struct sk_buff *skb) 543void ip_forward_options(struct sk_buff *skb)
525{ 544{
526 struct ip_options * opt = &(IPCB(skb)->opt); 545 struct ip_options * opt = &(IPCB(skb)->opt);
@@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb)
620 } 639 }
621 return 0; 640 return 0;
622} 641}
623
624EXPORT_SYMBOL(ip_options_compile);
625EXPORT_SYMBOL(ip_options_undo);
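The split above trades the old boolean `user' flag for two entry points, so the setsockopt() variant can carry a __user-annotated pointer while both share ip_options_get_alloc() and ip_options_get_finish(). Call sites (taken from the ip_sockglue.c hunks below) then read:

        /* option bytes already in kernel memory, e.g. the cmsg path: */
        err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), optlen);

        /* option bytes handed in from user space via setsockopt(): */
        err = ip_options_get_from_user(&opt, optval, optlen);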
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9de83e6e0f1d..3f1a263e1249 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -69,13 +69,10 @@
69#include <net/ip.h> 69#include <net/ip.h>
70#include <net/protocol.h> 70#include <net/protocol.h>
71#include <net/route.h> 71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/udp.h>
74#include <linux/skbuff.h> 72#include <linux/skbuff.h>
75#include <net/sock.h> 73#include <net/sock.h>
76#include <net/arp.h> 74#include <net/arp.h>
77#include <net/icmp.h> 75#include <net/icmp.h>
78#include <net/raw.h>
79#include <net/checksum.h> 76#include <net/checksum.h>
80#include <net/inetpeer.h> 77#include <net/inetpeer.h>
81#include <net/checksum.h> 78#include <net/checksum.h>
@@ -84,12 +81,8 @@
84#include <linux/netfilter_bridge.h> 81#include <linux/netfilter_bridge.h>
85#include <linux/mroute.h> 82#include <linux/mroute.h>
86#include <linux/netlink.h> 83#include <linux/netlink.h>
84#include <linux/tcp.h>
87 85
88/*
89 * Shall we try to damage output packets if routing dev changes?
90 */
91
92int sysctl_ip_dynaddr;
93int sysctl_ip_default_ttl = IPDEFTTL; 86int sysctl_ip_default_ttl = IPDEFTTL;
94 87
95/* Generate a checksum for an outgoing IP datagram. */ 88/* Generate a checksum for an outgoing IP datagram. */
@@ -107,7 +100,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 100 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 101 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 102 BUG_TRAP(newskb->dst);
110 nf_reset(newskb);
111 netif_rx(newskb); 103 netif_rx(newskb);
112 return 0; 104 return 0;
113} 105}
@@ -166,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
166 dst_output); 158 dst_output);
167} 159}
168 160
161EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
162
169static inline int ip_finish_output2(struct sk_buff *skb) 163static inline int ip_finish_output2(struct sk_buff *skb)
170{ 164{
171 struct dst_entry *dst = skb->dst; 165 struct dst_entry *dst = skb->dst;
@@ -188,14 +182,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 skb = skb2; 182 skb = skb2;
189 } 183 }
190 184
191#ifdef CONFIG_BRIDGE_NETFILTER
192 /* bridge-netfilter defers calling some IP hooks to the bridge layer
193 * and still needs the conntrack reference.
194 */
195 if (skb->nf_bridge == NULL)
196#endif
197 nf_reset(skb);
198
199 if (hh) { 185 if (hh) {
200 int hh_alen; 186 int hh_alen;
201 187
@@ -214,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
214 return -EINVAL; 200 return -EINVAL;
215} 201}
216 202
217int ip_finish_output(struct sk_buff *skb) 203static inline int ip_finish_output(struct sk_buff *skb)
218{ 204{
219 struct net_device *dev = skb->dst->dev; 205 struct net_device *dev = skb->dst->dev;
220 206
@@ -338,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
338 if (ip_route_output_flow(&rt, &fl, sk, 0)) 324 if (ip_route_output_flow(&rt, &fl, sk, 0))
339 goto no_route; 325 goto no_route;
340 } 326 }
341 __sk_dst_set(sk, &rt->u.dst); 327 sk_setup_caps(sk, &rt->u.dst);
342 tcp_v4_setup_caps(sk, &rt->u.dst);
343 } 328 }
344 skb->dst = dst_clone(&rt->u.dst); 329 skb->dst = dst_clone(&rt->u.dst);
345 330
@@ -401,7 +386,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
401#endif 386#endif
402#ifdef CONFIG_NETFILTER 387#ifdef CONFIG_NETFILTER
403 to->nfmark = from->nfmark; 388 to->nfmark = from->nfmark;
404 to->nfcache = from->nfcache;
405 /* Connection association is same as pre-frag packet */ 389 /* Connection association is same as pre-frag packet */
406 nf_conntrack_put(to->nfct); 390 nf_conntrack_put(to->nfct);
407 to->nfct = from->nfct; 391 to->nfct = from->nfct;
@@ -589,7 +573,7 @@ slow_path:
589 */ 573 */
590 574
591 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { 575 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
592 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); 576 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
593 err = -ENOMEM; 577 err = -ENOMEM;
594 goto fail; 578 goto fail;
595 } 579 }
@@ -1338,12 +1322,7 @@ void __init ip_init(void)
1338#endif 1322#endif
1339} 1323}
1340 1324
1341EXPORT_SYMBOL(ip_finish_output);
1342EXPORT_SYMBOL(ip_fragment); 1325EXPORT_SYMBOL(ip_fragment);
1343EXPORT_SYMBOL(ip_generic_getfrag); 1326EXPORT_SYMBOL(ip_generic_getfrag);
1344EXPORT_SYMBOL(ip_queue_xmit); 1327EXPORT_SYMBOL(ip_queue_xmit);
1345EXPORT_SYMBOL(ip_send_check); 1328EXPORT_SYMBOL(ip_send_check);
1346
1347#ifdef CONFIG_SYSCTL
1348EXPORT_SYMBOL(sysctl_ip_default_ttl);
1349#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc7c481d0d79..2f0b47da5b37 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
153 switch (cmsg->cmsg_type) { 153 switch (cmsg->cmsg_type) {
154 case IP_RETOPTS: 154 case IP_RETOPTS:
155 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); 155 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
156 err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); 156 err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
157 if (err) 157 if (err)
158 return err; 158 return err;
159 break; 159 break;
@@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
425 struct ip_options * opt = NULL; 425 struct ip_options * opt = NULL;
426 if (optlen > 40 || optlen < 0) 426 if (optlen > 40 || optlen < 0)
427 goto e_inval; 427 goto e_inval;
428 err = ip_options_get(&opt, optval, optlen, 1); 428 err = ip_options_get_from_user(&opt, optval, optlen);
429 if (err) 429 if (err)
430 break; 430 break;
431 if (sk->sk_type == SOCK_STREAM) { 431 if (sk->sk_type == SOCK_STREAM) {
@@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
614 } 614 }
615 case IP_MSFILTER: 615 case IP_MSFILTER:
616 { 616 {
617 extern int sysctl_optmem_max;
618 extern int sysctl_igmp_max_msf; 617 extern int sysctl_igmp_max_msf;
619 struct ip_msfilter *msf; 618 struct ip_msfilter *msf;
620 619
@@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
769 } 768 }
770 case MCAST_MSFILTER: 769 case MCAST_MSFILTER:
771 { 770 {
772 extern int sysctl_optmem_max;
773 extern int sysctl_igmp_max_msf; 771 extern int sysctl_igmp_max_msf;
774 struct sockaddr_in *psin; 772 struct sockaddr_in *psin;
775 struct ip_msfilter *msf = NULL; 773 struct ip_msfilter *msf = NULL;
@@ -848,6 +846,9 @@ mc_msf_out:
848 846
849 case IP_IPSEC_POLICY: 847 case IP_IPSEC_POLICY:
850 case IP_XFRM_POLICY: 848 case IP_XFRM_POLICY:
849 err = -EPERM;
850 if (!capable(CAP_NET_ADMIN))
851 break;
851 err = xfrm_user_policy(sk, optname, optval, optlen); 852 err = xfrm_user_policy(sk, optname, optval, optlen);
852 break; 853 break;
853 854
@@ -1087,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1087 1088
1088EXPORT_SYMBOL(ip_cmsg_recv); 1089EXPORT_SYMBOL(ip_cmsg_recv);
1089 1090
1090#ifdef CONFIG_IP_SCTP_MODULE
1091EXPORT_SYMBOL(ip_getsockopt); 1091EXPORT_SYMBOL(ip_getsockopt);
1092EXPORT_SYMBOL(ip_setsockopt); 1092EXPORT_SYMBOL(ip_setsockopt);
1093#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 2065944fd9e5..fc718df17b40 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
214 spi, IPPROTO_COMP, AF_INET); 214 spi, IPPROTO_COMP, AF_INET);
215 if (!x) 215 if (!x)
216 return; 216 return;
217 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", 217 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
218 spi, NIPQUAD(iph->daddr))); 218 spi, NIPQUAD(iph->daddr));
219 xfrm_state_put(x); 219 xfrm_state_put(x);
220} 220}
221 221
@@ -345,8 +345,7 @@ static void ipcomp_free_tfms(struct crypto_tfm **tfms)
345 345
346 for_each_cpu(cpu) { 346 for_each_cpu(cpu) {
347 struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu); 347 struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
348 if (tfm) 348 crypto_free_tfm(tfm);
349 crypto_free_tfm(tfm);
350 } 349 }
351 free_percpu(tfms); 350 free_percpu(tfms);
352} 351}
@@ -358,7 +357,7 @@ static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
358 int cpu; 357 int cpu;
359 358
360 /* This can be any valid CPU ID so we don't need locking. */ 359 /* This can be any valid CPU ID so we don't need locking. */
361 cpu = smp_processor_id(); 360 cpu = raw_smp_processor_id();
362 361
363 list_for_each_entry(pos, &ipcomp_tfms_list, list) { 362 list_for_each_entry(pos, &ipcomp_tfms_list, list) {
364 struct crypto_tfm *tfm; 363 struct crypto_tfm *tfm;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d2bf8e1930a3..953129d392d2 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -54,6 +54,7 @@
54#include <linux/major.h> 54#include <linux/major.h>
55#include <linux/root_dev.h> 55#include <linux/root_dev.h>
56#include <linux/delay.h> 56#include <linux/delay.h>
57#include <linux/nfs_fs.h>
57#include <net/arp.h> 58#include <net/arp.h>
58#include <net/ip.h> 59#include <net/ip.h>
59#include <net/ipconfig.h> 60#include <net/ipconfig.h>
@@ -393,7 +394,7 @@ static int __init ic_defaults(void)
393 394
394#ifdef IPCONFIG_RARP 395#ifdef IPCONFIG_RARP
395 396
396static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); 397static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
397 398
398static struct packet_type rarp_packet_type __initdata = { 399static struct packet_type rarp_packet_type __initdata = {
399 .type = __constant_htons(ETH_P_RARP), 400 .type = __constant_htons(ETH_P_RARP),
@@ -414,7 +415,7 @@ static inline void ic_rarp_cleanup(void)
414 * Process received RARP packet. 415 * Process received RARP packet.
415 */ 416 */
416static int __init 417static int __init
417ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 418ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
418{ 419{
419 struct arphdr *rarp; 420 struct arphdr *rarp;
420 unsigned char *rarp_ptr; 421 unsigned char *rarp_ptr;
@@ -555,7 +556,7 @@ struct bootp_pkt { /* BOOTP packet format */
555#define DHCPRELEASE 7 556#define DHCPRELEASE 7
556#define DHCPINFORM 8 557#define DHCPINFORM 8
557 558
558static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); 559static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
559 560
560static struct packet_type bootp_packet_type __initdata = { 561static struct packet_type bootp_packet_type __initdata = {
561 .type = __constant_htons(ETH_P_IP), 562 .type = __constant_htons(ETH_P_IP),
@@ -823,7 +824,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
823/* 824/*
824 * Receive BOOTP reply. 825 * Receive BOOTP reply.
825 */ 826 */
826static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 827static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
827{ 828{
828 struct bootp_pkt *b; 829 struct bootp_pkt *b;
829 struct iphdr *h; 830 struct iphdr *h;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 68a78731f722..c05c1df0bb04 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -255,7 +255,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c
255 255
256 dev_hold(dev); 256 dev_hold(dev);
257 ipip_tunnel_link(nt); 257 ipip_tunnel_link(nt);
258 /* Do not decrement MOD_USE_COUNT here. */
259 return nt; 258 return nt;
260 259
261failed: 260failed:
@@ -273,7 +272,7 @@ static void ipip_tunnel_uninit(struct net_device *dev)
273 dev_put(dev); 272 dev_put(dev);
274} 273}
275 274
276static void ipip_err(struct sk_buff *skb, void *__unused) 275static void ipip_err(struct sk_buff *skb, u32 info)
277{ 276{
278#ifndef I_WISH_WORLD_WERE_PERFECT 277#ifndef I_WISH_WORLD_WERE_PERFECT
279 278
@@ -852,11 +851,39 @@ static int __init ipip_fb_tunnel_init(struct net_device *dev)
852 return 0; 851 return 0;
853} 852}
854 853
854#ifdef CONFIG_INET_TUNNEL
855static struct xfrm_tunnel ipip_handler = { 855static struct xfrm_tunnel ipip_handler = {
856 .handler = ipip_rcv, 856 .handler = ipip_rcv,
857 .err_handler = ipip_err, 857 .err_handler = ipip_err,
858}; 858};
859 859
860static inline int ipip_register(void)
861{
862 return xfrm4_tunnel_register(&ipip_handler);
863}
864
865static inline int ipip_unregister(void)
866{
867 return xfrm4_tunnel_deregister(&ipip_handler);
868}
869#else
870static struct net_protocol ipip_protocol = {
871 .handler = ipip_rcv,
872 .err_handler = ipip_err,
873 .no_policy = 1,
874};
875
876static inline int ipip_register(void)
877{
878 return inet_add_protocol(&ipip_protocol, IPPROTO_IPIP);
879}
880
881static inline int ipip_unregister(void)
882{
883 return inet_del_protocol(&ipip_protocol, IPPROTO_IPIP);
884}
885#endif
886
860static char banner[] __initdata = 887static char banner[] __initdata =
861 KERN_INFO "IPv4 over IPv4 tunneling driver\n"; 888 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
862 889
@@ -866,7 +893,7 @@ static int __init ipip_init(void)
866 893
867 printk(banner); 894 printk(banner);
868 895
869 if (xfrm4_tunnel_register(&ipip_handler) < 0) { 896 if (ipip_register() < 0) {
870 printk(KERN_INFO "ipip init: can't register tunnel\n"); 897 printk(KERN_INFO "ipip init: can't register tunnel\n");
871 return -EAGAIN; 898 return -EAGAIN;
872 } 899 }
@@ -888,16 +915,33 @@ static int __init ipip_init(void)
888 err2: 915 err2:
889 free_netdev(ipip_fb_tunnel_dev); 916 free_netdev(ipip_fb_tunnel_dev);
890 err1: 917 err1:
891 xfrm4_tunnel_deregister(&ipip_handler); 918 ipip_unregister();
892 goto out; 919 goto out;
893} 920}
894 921
922static void __exit ipip_destroy_tunnels(void)
923{
924 int prio;
925
926 for (prio = 1; prio < 4; prio++) {
927 int h;
928 for (h = 0; h < HASH_SIZE; h++) {
929 struct ip_tunnel *t;
930 while ((t = tunnels[prio][h]) != NULL)
931 unregister_netdevice(t->dev);
932 }
933 }
934}
935
895static void __exit ipip_fini(void) 936static void __exit ipip_fini(void)
896{ 937{
897 if (xfrm4_tunnel_deregister(&ipip_handler) < 0) 938 if (ipip_unregister() < 0)
898 printk(KERN_INFO "ipip close: can't deregister tunnel\n"); 939 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
899 940
900 unregister_netdev(ipip_fb_tunnel_dev); 941 rtnl_lock();
942 ipip_destroy_tunnels();
943 unregister_netdevice(ipip_fb_tunnel_dev);
944 rtnl_unlock();
901} 945}
902 946
903module_init(ipip_init); 947module_init(ipip_init);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7833d920bdba..9dbf5909f3a6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
103 In this case data path is free of exclusive locks at all. 103 In this case data path is free of exclusive locks at all.
104 */ 104 */
105 105
106static kmem_cache_t *mrt_cachep; 106static kmem_cache_t *mrt_cachep __read_mostly;
107 107
108static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); 108static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); 109static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
@@ -362,7 +362,7 @@ out:
362 362
363/* Fill oifs list. It is called under write locked mrt_lock. */ 363/* Fill oifs list. It is called under write locked mrt_lock. */
364 364
365static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) 365static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
366{ 366{
367 int vifi; 367 int vifi;
368 368
@@ -727,7 +727,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
727 if (c != NULL) { 727 if (c != NULL) {
728 write_lock_bh(&mrt_lock); 728 write_lock_bh(&mrt_lock);
729 c->mfc_parent = mfc->mfcc_parent; 729 c->mfc_parent = mfc->mfcc_parent;
730 ipmr_update_threshoulds(c, mfc->mfcc_ttls); 730 ipmr_update_thresholds(c, mfc->mfcc_ttls);
731 if (!mrtsock) 731 if (!mrtsock)
732 c->mfc_flags |= MFC_STATIC; 732 c->mfc_flags |= MFC_STATIC;
733 write_unlock_bh(&mrt_lock); 733 write_unlock_bh(&mrt_lock);
@@ -744,7 +744,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
744 c->mfc_origin=mfc->mfcc_origin.s_addr; 744 c->mfc_origin=mfc->mfcc_origin.s_addr;
745 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; 745 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
746 c->mfc_parent=mfc->mfcc_parent; 746 c->mfc_parent=mfc->mfcc_parent;
747 ipmr_update_threshoulds(c, mfc->mfcc_ttls); 747 ipmr_update_thresholds(c, mfc->mfcc_ttls);
748 if (!mrtsock) 748 if (!mrtsock)
749 c->mfc_flags |= MFC_STATIC; 749 c->mfc_flags |= MFC_STATIC;
750 750
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4b64bb..c9820bfc493a 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
2# IP Virtual Server configuration 2# IP Virtual Server configuration
3# 3#
4menu "IP: Virtual Server Configuration" 4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER 5 depends on NETFILTER
6 6
7config IP_VS 7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)" 8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER 9 depends on NETFILTER
10 ---help--- 10 ---help---
11 IP Virtual Server support will let you build a high-performance 11 IP Virtual Server support will let you build a high-performance
12 virtual server based on a cluster of two or more real servers. This 12 virtual server based on a cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d9212addd193..6e092dadb388 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -26,6 +26,7 @@
26#include <linux/in.h> 26#include <linux/in.h>
27#include <linux/ip.h> 27#include <linux/ip.h>
28#include <net/protocol.h> 28#include <net/protocol.h>
29#include <net/tcp.h>
29#include <asm/system.h> 30#include <asm/system.h>
30#include <linux/stat.h> 31#include <linux/stat.h>
31#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 9f16ab309106..e11952ea17af 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -40,7 +40,7 @@
40static struct list_head *ip_vs_conn_tab; 40static struct list_head *ip_vs_conn_tab;
41 41
42/* SLAB cache for IPVS connections */ 42/* SLAB cache for IPVS connections */
43static kmem_cache_t *ip_vs_conn_cachep; 43static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
44 44
45/* counter for current IPVS connections */ 45/* counter for current IPVS connections */
46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); 46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
@@ -758,7 +758,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
758 return 1; 758 return 1;
759} 759}
760 760
761 761/* Called from keventd and must protect itself from softirqs */
762void ip_vs_random_dropentry(void) 762void ip_vs_random_dropentry(void)
763{ 763{
764 int idx; 764 int idx;
@@ -773,7 +773,7 @@ void ip_vs_random_dropentry(void)
773 /* 773 /*
774 * Lock is actually needed in this loop. 774 * Lock is actually needed in this loop.
775 */ 775 */
776 ct_write_lock(hash); 776 ct_write_lock_bh(hash);
777 777
778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) 779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -806,7 +806,7 @@ void ip_vs_random_dropentry(void)
806 ip_vs_conn_expire_now(cp->control); 806 ip_vs_conn_expire_now(cp->control);
807 } 807 }
808 } 808 }
809 ct_write_unlock(hash); 809 ct_write_unlock_bh(hash);
810 } 810 }
811} 811}
812 812
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 5fb257dd07cb..3ac7eeca04ac 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -22,6 +22,7 @@
22 * 22 *
23 * Changes: 23 * Changes:
24 * Paul `Rusty' Russell properly handle non-linear skbs 24 * Paul `Rusty' Russell properly handle non-linear skbs
25 * Harald Welte don't use nfcache
25 * 26 *
26 */ 27 */
27 28
@@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
529 const struct net_device *out, 530 const struct net_device *out,
530 int (*okfn)(struct sk_buff *)) 531 int (*okfn)(struct sk_buff *))
531{ 532{
532 if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) 533 if (!((*pskb)->ipvs_property))
533 return NF_ACCEPT; 534 return NF_ACCEPT;
534 535
535 /* The packet was sent from IPVS, exit this chain */ 536 /* The packet was sent from IPVS, exit this chain */
@@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
701 /* do the statistics and put it back */ 702 /* do the statistics and put it back */
702 ip_vs_out_stats(cp, skb); 703 ip_vs_out_stats(cp, skb);
703 704
704 skb->nfcache |= NFC_IPVS_PROPERTY; 705 skb->ipvs_property = 1;
705 verdict = NF_ACCEPT; 706 verdict = NF_ACCEPT;
706 707
707 out: 708 out:
@@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
739 740
740 EnterFunction(11); 741 EnterFunction(11);
741 742
742 if (skb->nfcache & NFC_IPVS_PROPERTY) 743 if (skb->ipvs_property)
743 return NF_ACCEPT; 744 return NF_ACCEPT;
744 745
745 iph = skb->nh.iph; 746 iph = skb->nh.iph;
@@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
821 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); 822 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
822 ip_vs_conn_put(cp); 823 ip_vs_conn_put(cp);
823 824
824 skb->nfcache |= NFC_IPVS_PROPERTY; 825 skb->ipvs_property = 1;
825 826
826 LeaveFunction(11); 827 LeaveFunction(11);
827 return NF_ACCEPT; 828 return NF_ACCEPT;
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 12a82e91d22a..2d66848e7aa0 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
90#endif 90#endif
91 91
92/* 92/*
93 * update_defense_level is called from keventd and from sysctl. 93 * update_defense_level is called from keventd and from sysctl,
94 * so it needs to protect itself from softirqs
94 */ 95 */
95static void update_defense_level(void) 96static void update_defense_level(void)
96{ 97{
@@ -110,6 +111,8 @@ static void update_defense_level(void)
110 111
111 nomem = (availmem < sysctl_ip_vs_amemthresh); 112 nomem = (availmem < sysctl_ip_vs_amemthresh);
112 113
114 local_bh_disable();
115
113 /* drop_entry */ 116 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock); 117 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) { 118 switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
206 if (to_change >= 0) 209 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); 210 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock); 211 write_unlock(&__ip_vs_securetcp_lock);
212
213 local_bh_enable();
209} 214}
210 215
211 216
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1360 /* Restore the correct value */ 1365 /* Restore the correct value */
1361 *valp = val; 1366 *valp = val;
1362 } else { 1367 } else {
1363 local_bh_disable();
1364 update_defense_level(); 1368 update_defense_level();
1365 local_bh_enable();
1366 } 1369 }
1367 } 1370 }
1368 return rc; 1371 return rc;
@@ -1595,7 +1598,7 @@ static ctl_table vs_table[] = {
1595 { .ctl_name = 0 } 1598 { .ctl_name = 0 }
1596}; 1599};
1597 1600
1598static ctl_table ipv4_table[] = { 1601static ctl_table ipvs_ipv4_table[] = {
1599 { 1602 {
1600 .ctl_name = NET_IPV4, 1603 .ctl_name = NET_IPV4,
1601 .procname = "ipv4", 1604 .procname = "ipv4",
@@ -1610,7 +1613,7 @@ static ctl_table vs_root_table[] = {
1610 .ctl_name = CTL_NET, 1613 .ctl_name = CTL_NET,
1611 .procname = "net", 1614 .procname = "net",
1612 .mode = 0555, 1615 .mode = 0555,
1613 .child = ipv4_table, 1616 .child = ipvs_ipv4_table,
1614 }, 1617 },
1615 { .ctl_name = 0 } 1618 { .ctl_name = 0 }
1616}; 1619};
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index c035838b780a..561cda326fa8 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -131,7 +131,7 @@ static ctl_table vs_table[] = {
131 { .ctl_name = 0 } 131 { .ctl_name = 0 }
132}; 132};
133 133
134static ctl_table ipv4_table[] = { 134static ctl_table ipvs_ipv4_table[] = {
135 { 135 {
136 .ctl_name = NET_IPV4, 136 .ctl_name = NET_IPV4,
137 .procname = "ipv4", 137 .procname = "ipv4",
@@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = {
146 .ctl_name = CTL_NET, 146 .ctl_name = CTL_NET,
147 .procname = "net", 147 .procname = "net",
148 .mode = 0555, 148 .mode = 0555,
149 .child = ipv4_table 149 .child = ipvs_ipv4_table
150 }, 150 },
151 { .ctl_name = 0 } 151 { .ctl_name = 0 }
152}; 152};
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 22b5dd55d271..ce456dbf09a5 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -320,7 +320,7 @@ static ctl_table vs_table[] = {
320 { .ctl_name = 0 } 320 { .ctl_name = 0 }
321}; 321};
322 322
323static ctl_table ipv4_table[] = { 323static ctl_table ipvs_ipv4_table[] = {
324 { 324 {
325 .ctl_name = NET_IPV4, 325 .ctl_name = NET_IPV4,
326 .procname = "ipv4", 326 .procname = "ipv4",
@@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = {
335 .ctl_name = CTL_NET, 335 .ctl_name = CTL_NET,
336 .procname = "net", 336 .procname = "net",
337 .mode = 0555, 337 .mode = 0555,
338 .child = ipv4_table 338 .child = ipvs_ipv4_table
339 }, 339 },
340 { .ctl_name = 0 } 340 { .ctl_name = 0 }
341}; 341};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index e65de675da74..c19408973c09 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
604} 604}
605 605
606 606
607static void tcp_init(struct ip_vs_protocol *pp) 607static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
608{ 608{
609 IP_VS_INIT_HASH_TABLE(tcp_apps); 609 IP_VS_INIT_HASH_TABLE(tcp_apps);
610 pp->timeout_table = tcp_timeouts; 610 pp->timeout_table = tcp_timeouts;
611} 611}
612 612
613 613
614static void tcp_exit(struct ip_vs_protocol *pp) 614static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
615{ 615{
616} 616}
617 617
@@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
621 .protocol = IPPROTO_TCP, 621 .protocol = IPPROTO_TCP,
622 .dont_defrag = 0, 622 .dont_defrag = 0,
623 .appcnt = ATOMIC_INIT(0), 623 .appcnt = ATOMIC_INIT(0),
624 .init = tcp_init, 624 .init = ip_vs_tcp_init,
625 .exit = tcp_exit, 625 .exit = ip_vs_tcp_exit,
626 .register_app = tcp_register_app, 626 .register_app = tcp_register_app,
627 .unregister_app = tcp_unregister_app, 627 .unregister_app = tcp_unregister_app,
628 .conn_schedule = tcp_conn_schedule, 628 .conn_schedule = tcp_conn_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index a8512a3fd08a..3b87482049cf 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
127 127
128#define IP_VS_XMIT(skb, rt) \ 128#define IP_VS_XMIT(skb, rt) \
129do { \ 129do { \
130 (skb)->nfcache |= NFC_IPVS_PROPERTY; \ 130 (skb)->ipvs_property = 1; \
131 (skb)->ip_summed = CHECKSUM_NONE; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \
132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
133 (rt)->u.dst.dev, dst_output); \ 133 (rt)->u.dst.dev, dst_output); \
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index c9cf8726051d..db67373f9b34 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this,
107 return NOTIFY_DONE; 107 return NOTIFY_DONE;
108} 108}
109 109
110struct notifier_block drr_dev_notifier = { 110static struct notifier_block drr_dev_notifier = {
111 .notifier_call = drr_dev_event, 111 .notifier_call = drr_dev_event,
112}; 112};
113 113
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 000000000000..ae0779d82c5d
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,139 @@
1/* IPv4 specific functions of netfilter core */
2
3#include <linux/config.h>
4#ifdef CONFIG_NETFILTER
5
6#include <linux/kernel.h>
7#include <linux/netfilter.h>
8#include <linux/netfilter_ipv4.h>
9
10#include <linux/tcp.h>
11#include <linux/udp.h>
12#include <linux/icmp.h>
13#include <net/route.h>
14#include <linux/ip.h>
15
16/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
17int ip_route_me_harder(struct sk_buff **pskb)
18{
19 struct iphdr *iph = (*pskb)->nh.iph;
20 struct rtable *rt;
21 struct flowi fl = {};
22 struct dst_entry *odst;
23 unsigned int hh_len;
24
25 /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
26 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
27 */
28 if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
29 fl.nl_u.ip4_u.daddr = iph->daddr;
30 fl.nl_u.ip4_u.saddr = iph->saddr;
31 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
32 fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
33#ifdef CONFIG_IP_ROUTE_FWMARK
34 fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
35#endif
36 fl.proto = iph->protocol;
37 if (ip_route_output_key(&rt, &fl) != 0)
38 return -1;
39
40 /* Drop old route. */
41 dst_release((*pskb)->dst);
42 (*pskb)->dst = &rt->u.dst;
43 } else {
44 /* non-local src, find valid iif to satisfy
45 * rp-filter when calling ip_route_input. */
46 fl.nl_u.ip4_u.daddr = iph->saddr;
47 if (ip_route_output_key(&rt, &fl) != 0)
48 return -1;
49
50 odst = (*pskb)->dst;
51 if (ip_route_input(*pskb, iph->daddr, iph->saddr,
52 RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
53 dst_release(&rt->u.dst);
54 return -1;
55 }
56 dst_release(&rt->u.dst);
57 dst_release(odst);
58 }
59
60 if ((*pskb)->dst->error)
61 return -1;
62
63 /* Change in oif may mean change in hh_len. */
64 hh_len = (*pskb)->dst->dev->hard_header_len;
65 if (skb_headroom(*pskb) < hh_len) {
66 struct sk_buff *nskb;
67
68 nskb = skb_realloc_headroom(*pskb, hh_len);
69 if (!nskb)
70 return -1;
71 if ((*pskb)->sk)
72 skb_set_owner_w(nskb, (*pskb)->sk);
73 kfree_skb(*pskb);
74 *pskb = nskb;
75 }
76
77 return 0;
78}
79EXPORT_SYMBOL(ip_route_me_harder);
80
81/*
82 * Extra routing may be needed on local out, as the QUEUE target never
83 * returns control to the table.
84 */
85
86struct ip_rt_info {
87 u_int32_t daddr;
88 u_int32_t saddr;
89 u_int8_t tos;
90};
91
92static void queue_save(const struct sk_buff *skb, struct nf_info *info)
93{
94 struct ip_rt_info *rt_info = nf_info_reroute(info);
95
96 if (info->hook == NF_IP_LOCAL_OUT) {
97 const struct iphdr *iph = skb->nh.iph;
98
99 rt_info->tos = iph->tos;
100 rt_info->daddr = iph->daddr;
101 rt_info->saddr = iph->saddr;
102 }
103}
104
105static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info)
106{
107 const struct ip_rt_info *rt_info = nf_info_reroute(info);
108
109 if (info->hook == NF_IP_LOCAL_OUT) {
110 struct iphdr *iph = (*pskb)->nh.iph;
111
112 if (!(iph->tos == rt_info->tos
113 && iph->daddr == rt_info->daddr
114 && iph->saddr == rt_info->saddr))
115 return ip_route_me_harder(pskb);
116 }
117 return 0;
118}
119
120static struct nf_queue_rerouter ip_reroute = {
121 .rer_size = sizeof(struct ip_rt_info),
122 .save = queue_save,
123 .reroute = queue_reroute,
124};
125
126static int init(void)
127{
128 return nf_register_queue_rerouter(PF_INET, &ip_reroute);
129}
130
131static void fini(void)
132{
133 nf_unregister_queue_rerouter(PF_INET);
134}
135
136module_init(init);
137module_exit(fini);
138
139#endif /* CONFIG_NETFILTER */
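A caller-side sketch for orientation (a hypothetical LOCAL_OUT hook body, not code from this file): once a hook has rewritten the addresses, the route cached in skb->dst no longer matches the header, so the packet must be re-routed or dropped:

        iph->daddr = new_daddr;              /* hypothetical address rewrite */
        ip_send_check(iph);                  /* header changed: recompute the checksum */
        if (ip_route_me_harder(pskb) != 0)
                return NF_DROP;              /* no valid route for the mangled packet */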
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 46d4cb1c06f0..e046f5521814 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK
40 of packets, but this mark value is kept in the conntrack session 40 of packets, but this mark value is kept in the conntrack session
41 instead of the individual packets. 41 instead of the individual packets.
42 42
43config IP_NF_CONNTRACK_EVENTS
44 bool "Connection tracking events"
45 depends on IP_NF_CONNTRACK
46 help
47 If this option is enabled, the connection tracking code will
48 provide a notifier chain that can be used by other kernel code
49 to get notified about changes in the connection tracking state.
50
51 If unsure, say `N'.
52
43config IP_NF_CT_PROTO_SCTP 53config IP_NF_CT_PROTO_SCTP
44 tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' 54 tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
45 depends on IP_NF_CONNTRACK && EXPERIMENTAL 55 depends on IP_NF_CONNTRACK && EXPERIMENTAL
@@ -100,11 +110,15 @@ config IP_NF_AMANDA
100 To compile it as a module, choose M here. If unsure, say Y. 110 To compile it as a module, choose M here. If unsure, say Y.
101 111
102config IP_NF_QUEUE 112config IP_NF_QUEUE
103 tristate "Userspace queueing via NETLINK" 113 tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
104 help 114 help
105 Netfilter has the ability to queue packets to user space: the 115 Netfilter has the ability to queue packets to user space: the
106 netlink device can be used to access them using this driver. 116 netlink device can be used to access them using this driver.
107 117
118 This option enables the old IPv4-only "ip_queue" implementation
119 which has been obsoleted by the new "nfnetlink_queue" code (see
120 CONFIG_NETFILTER_NETLINK_QUEUE).
121
108 To compile it as a module, choose M here. If unsure, say N. 122 To compile it as a module, choose M here. If unsure, say N.
109 123
110config IP_NF_IPTABLES 124config IP_NF_IPTABLES
@@ -340,6 +354,17 @@ config IP_NF_MATCH_SCTP
340 If you want to compile it as a module, say M here and read 354 If you want to compile it as a module, say M here and read
341 <file:Documentation/modules.txt>. If unsure, say `N'. 355 <file:Documentation/modules.txt>. If unsure, say `N'.
342 356
357config IP_NF_MATCH_DCCP
358 tristate 'DCCP protocol match support'
359 depends on IP_NF_IPTABLES
360 help
361 With this option enabled, you will be able to use the iptables
362 `dccp' match in order to match on DCCP source/destination ports
363 and DCCP flags.
364
365 If you want to compile it as a module, say M here and read
366 <file:Documentation/modules.txt>. If unsure, say `N'.
367
343config IP_NF_MATCH_COMMENT 368config IP_NF_MATCH_COMMENT
344 tristate 'comment match support' 369 tristate 'comment match support'
345 depends on IP_NF_IPTABLES 370 depends on IP_NF_IPTABLES
@@ -361,6 +386,16 @@ config IP_NF_MATCH_CONNMARK
361 <file:Documentation/modules.txt>. The module will be called 386 <file:Documentation/modules.txt>. The module will be called
362 ipt_connmark.o. If unsure, say `N'. 387 ipt_connmark.o. If unsure, say `N'.
363 388
389config IP_NF_MATCH_CONNBYTES
390 tristate 'Connection byte/packet counter match support'
391 depends on IP_NF_CT_ACCT && IP_NF_IPTABLES
392 help
393 This option adds a `connbytes' match, which allows you to match the
394 number of bytes and/or packets for each direction within a connection.
395
396 If you want to compile it as a module, say M here and read
397 <file:Documentation/modules.txt>. If unsure, say `N'.
398
364config IP_NF_MATCH_HASHLIMIT 399config IP_NF_MATCH_HASHLIMIT
365 tristate 'hashlimit match support' 400 tristate 'hashlimit match support'
366 depends on IP_NF_IPTABLES 401 depends on IP_NF_IPTABLES
@@ -375,6 +410,19 @@ config IP_NF_MATCH_HASHLIMIT
375 destination IP' or `500pps from any given source IP' with a single 410 destination IP' or `500pps from any given source IP' with a single
376 IPtables rule. 411 IPtables rule.
377 412
413config IP_NF_MATCH_STRING
414 tristate 'string match support'
415 depends on IP_NF_IPTABLES
416 select TEXTSEARCH
417 select TEXTSEARCH_KMP
418 select TEXTSEARCH_BM
419 select TEXTSEARCH_FSM
420 help
421 This option adds a `string' match, which allows you to look for
422 pattern matches in packets.
423
424 To compile it as a module, choose M here. If unsure, say N.
425
378# `filter', generic and specific targets 426# `filter', generic and specific targets
379config IP_NF_FILTER 427config IP_NF_FILTER
380 tristate "Packet filtering" 428 tristate "Packet filtering"
@@ -616,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY
616 664
617 To compile it as a module, choose M here. If unsure, say N. 665 To compile it as a module, choose M here. If unsure, say N.
618 666
667config IP_NF_TARGET_TTL
668 tristate 'TTL target support'
669 depends on IP_NF_MANGLE
670 help
671 This option adds a `TTL' target, which enables the user to modify
672 the TTL value of the IP header.
673
674 While it is safe to decrement/lower the TTL, this target also enables
675 functionality to increment and set the TTL value of the IP header to
676 arbitrary values. This is EXTREMELY DANGEROUS since you can easily
677 create immortal packets that loop forever on the network.
678
679 To compile it as a module, choose M here. If unsure, say N.
680
619config IP_NF_TARGET_CONNMARK 681config IP_NF_TARGET_CONNMARK
620 tristate 'CONNMARK target support' 682 tristate 'CONNMARK target support'
621 depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE 683 depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
@@ -692,5 +754,11 @@ config IP_NF_ARP_MANGLE
692 Allows altering the ARP packet payload: source and destination 754 Allows altering the ARP packet payload: source and destination
693 hardware and network addresses. 755 hardware and network addresses.
694 756
757config IP_NF_CONNTRACK_NETLINK
758 tristate 'Connection tracking netlink interface'
759 depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
760 help
761 This option enables support for a netlink-based userspace interface.
762
695endmenu 763endmenu
696 764
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 45796d5924dd..a7bd38f50522 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe
 # connection tracking
 obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
 
+# conntrack netlink interface
+obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
+
+
 # SCTP protocol connection tracking
 obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
 
@@ -38,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
 obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
 obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
 obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o
 obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
 obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
 obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
@@ -54,11 +59,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
 obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
 obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
 obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o
 obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
 obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
 obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
 obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
+obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
 
 # targets
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
@@ -78,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
 obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
 
 # generic ARP tables
 obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
@@ -87,3 +95,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
 obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
 
 obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index a78a320eee08..be4c9eb3243f 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
 static char *conns[] = { "DATA ", "MESG ", "INDEX " };
 
 /* This is slow, but it's simple. --RR */
-static char amanda_buffer[65536];
+static char *amanda_buffer;
 static DEFINE_SPINLOCK(amanda_buffer_lock);
 
 unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
@@ -101,14 +101,13 @@ static int help(struct sk_buff **pskb,
 		if (port == 0 || len > 5)
 			break;
 
-		exp = ip_conntrack_expect_alloc();
+		exp = ip_conntrack_expect_alloc(ct);
 		if (exp == NULL) {
 			ret = NF_DROP;
 			goto out;
 		}
 
 		exp->expectfn = NULL;
-		exp->master = ct;
 
 		exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
 		exp->tuple.src.u.tcp.port = 0;
@@ -126,10 +125,9 @@ static int help(struct sk_buff **pskb,
 			ret = ip_nat_amanda_hook(pskb, ctinfo,
 						 tmp - amanda_buffer,
 						 len, exp);
-		else if (ip_conntrack_expect_related(exp) != 0) {
-			ip_conntrack_expect_free(exp);
+		else if (ip_conntrack_expect_related(exp) != 0)
 			ret = NF_DROP;
-		}
+		ip_conntrack_expect_put(exp);
 	}
 
 out:
@@ -155,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = {
 static void __exit fini(void)
 {
 	ip_conntrack_helper_unregister(&amanda_helper);
+	kfree(amanda_buffer);
 }
 
 static int __init init(void)
 {
-	return ip_conntrack_helper_register(&amanda_helper);
+	int ret;
+
+	amanda_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!amanda_buffer)
+		return -ENOMEM;
+
+	ret = ip_conntrack_helper_register(&amanda_helper);
+	if (ret < 0) {
+		kfree(amanda_buffer);
+		return ret;
+	}
+	return 0;
+
+
 }
 
 module_init(init);
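The amanda hunks above are the template for every helper converted in this patch: ip_conntrack_expect_alloc() now takes the master conntrack and returns an object with one reference held by the creator, and the helper drops that reference unconditionally once it is done, whether or not the expectation was accepted. Condensed into one sketch, using only functions from this patch:

static int example_expect(struct sk_buff **pskb, struct ip_conntrack *ct)
{
	struct ip_conntrack_expect *exp;
	int ret = NF_ACCEPT;

	exp = ip_conntrack_expect_alloc(ct);	/* takes a reference on ct */
	if (exp == NULL)
		return NF_DROP;

	exp->expectfn = NULL;
	/* ... fill in exp->tuple and exp->mask as the helpers above do ... */

	if (ip_conntrack_expect_related(exp) != 0)
		ret = NF_DROP;
	ip_conntrack_expect_put(exp);	/* drop the creator's reference */
	return ret;
}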
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 4b78ebeb6635..a0648600190e 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -37,6 +37,7 @@
 #include <linux/err.h>
 #include <linux/percpu.h>
 #include <linux/moduleparam.h>
+#include <linux/notifier.h>
 
 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
    registrations, conntrack timers*/
@@ -49,7 +50,7 @@
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
-#define IP_CONNTRACK_VERSION	"2.1"
+#define IP_CONNTRACK_VERSION	"2.3"
 
 #if 0
 #define DEBUGP printk
@@ -69,21 +70,80 @@ static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
 int ip_conntrack_max;
 struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
-static kmem_cache_t *ip_conntrack_expect_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
 struct ip_conntrack ip_conntrack_untracked;
 unsigned int ip_ct_log_invalid;
 static LIST_HEAD(unconfirmed);
 static int ip_conntrack_vmalloc;
 
-DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+static unsigned int ip_conntrack_next_id = 1;
+static unsigned int ip_conntrack_expect_next_id = 1;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct notifier_block *ip_conntrack_chain;
+struct notifier_block *ip_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
+{
+	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+	if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+		notifier_call_chain(&ip_conntrack_chain, ecache->events,
+				    ecache->ct);
+	ecache->events = 0;
+	ip_conntrack_put(ecache->ct);
+	ecache->ct = NULL;
+}
+
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
+{
+	struct ip_conntrack_ecache *ecache;
+
+	local_bh_disable();
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	if (ecache->ct == ct)
+		__ip_ct_deliver_cached_events(ecache);
+	local_bh_enable();
+}
+
+void __ip_ct_event_cache_init(struct ip_conntrack *ct)
+{
+	struct ip_conntrack_ecache *ecache;
+
+	/* take care of delivering potentially old events */
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	BUG_ON(ecache->ct == ct);
+	if (ecache->ct)
+		__ip_ct_deliver_cached_events(ecache);
+	/* initialize for this conntrack/packet */
+	ecache->ct = ct;
+	nf_conntrack_get(&ct->ct_general);
+}
 
-void
-ip_conntrack_put(struct ip_conntrack *ct)
+/* flush the event cache - touches other CPU's data and must not be called while
+ * packets are still passing through the code */
+static void ip_ct_event_cache_flush(void)
 {
-	IP_NF_ASSERT(ct);
-	nf_conntrack_put(&ct->ct_general);
+	struct ip_conntrack_ecache *ecache;
+	int cpu;
+
+	for_each_cpu(cpu) {
+		ecache = &per_cpu(ip_conntrack_ecache, cpu);
+		if (ecache->ct)
+			ip_conntrack_put(ecache->ct);
+	}
 }
+#else
+static inline void ip_ct_event_cache_flush(void) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
 
 static int ip_conntrack_hash_rnd_initted;
 static unsigned int ip_conntrack_hash_rnd;
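ip_conntrack_chain above is a plain notifier_block chain, so a listener attaches with the standard notifier API; ctnetlink (added later in this patch) is the in-tree consumer. A sketch, assuming the stock notifier_chain_register() of this kernel generation (any dedicated registration wrapper is outside the hunks shown here):

/* Sketch of a conntrack-event listener.  Assumes the plain notifier API
 * of this era; a dedicated registration helper, if any, is not shown
 * in these hunks. */
static int my_ct_event(struct notifier_block *this, unsigned long events,
		       void *ptr)
{
	struct ip_conntrack *ct = ptr;

	if (events & IPCT_NEW)
		DEBUGP("new conntrack %p\n", ct);
	return NOTIFY_DONE;
}

static struct notifier_block my_ct_nb = {
	.notifier_call	= my_ct_event,
};

/* notifier_chain_register(&ip_conntrack_chain, &my_ct_nb); */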
@@ -137,20 +197,20 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
 
 
 /* ip_conntrack_expect helper functions */
-static void destroy_expect(struct ip_conntrack_expect *exp)
+static void unlink_expect(struct ip_conntrack_expect *exp)
 {
-	ip_conntrack_put(exp->master);
+	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
 	IP_NF_ASSERT(!timer_pending(&exp->timeout));
-	kmem_cache_free(ip_conntrack_expect_cachep, exp);
+	list_del(&exp->list);
 	CONNTRACK_STAT_INC(expect_delete);
+	exp->master->expecting--;
+	ip_conntrack_expect_put(exp);
 }
 
-static void unlink_expect(struct ip_conntrack_expect *exp)
+void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
 {
-	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
-	list_del(&exp->list);
-	/* Logically in destroy_expect, but we hold the lock here. */
-	exp->master->expecting--;
+	unlink_expect(exp);
+	ip_conntrack_expect_put(exp);
 }
 
 static void expectation_timed_out(unsigned long ul_expect)
@@ -160,7 +220,34 @@ static void expectation_timed_out(unsigned long ul_expect)
 	write_lock_bh(&ip_conntrack_lock);
 	unlink_expect(exp);
 	write_unlock_bh(&ip_conntrack_lock);
-	destroy_expect(exp);
+	ip_conntrack_expect_put(exp);
+}
+
+struct ip_conntrack_expect *
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_expect *i;
+
+	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+			atomic_inc(&i->use);
+			return i;
+		}
+	}
+	return NULL;
+}
+
+/* Just find a expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_expect *i;
+
+	read_lock_bh(&ip_conntrack_lock);
+	i = __ip_conntrack_expect_find(tuple);
+	read_unlock_bh(&ip_conntrack_lock);
+
+	return i;
 }
 
 /* If an expectation for this connection is found, it gets delete from
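Note that __ip_conntrack_expect_find() bumps exp->use before returning, so every successful ip_conntrack_expect_find_get() must be paired with ip_conntrack_expect_put(). A usage sketch:

static int tuple_is_expected(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *exp;

	exp = ip_conntrack_expect_find_get(tuple);
	if (!exp)
		return 0;
	/* ... inspect exp->master or exp->tuple here ... */
	ip_conntrack_expect_put(exp);	/* balance find_get's reference */
	return 1;
}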
@@ -187,7 +274,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
 }
 
 /* delete all expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct)
+void ip_ct_remove_expectations(struct ip_conntrack *ct)
 {
 	struct ip_conntrack_expect *i, *tmp;
 
@@ -198,7 +285,7 @@ static void remove_expectations(struct ip_conntrack *ct)
 	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
 		if (i->master == ct && del_timer(&i->timeout)) {
 			unlink_expect(i);
-			destroy_expect(i);
+			ip_conntrack_expect_put(i);
 		}
 	}
 }
@@ -217,7 +304,7 @@ clean_from_lists(struct ip_conntrack *ct)
 	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
 
 	/* Destroy all pending expectations */
-	remove_expectations(ct);
+	ip_ct_remove_expectations(ct);
 }
 
 static void
@@ -230,10 +317,13 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
 	IP_NF_ASSERT(!timer_pending(&ct->timeout));
 
+	ip_conntrack_event(IPCT_DESTROY, ct);
+	set_bit(IPS_DYING_BIT, &ct->status);
+
 	/* To make sure we don't get any weird locking issues here:
 	 * destroy_conntrack() MUST NOT be called with a write lock
 	 * to ip_conntrack_lock!!! -HW */
-	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+	proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
 	if (proto && proto->destroy)
 		proto->destroy(ct);
 
@@ -245,7 +335,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	 * except TFTP can create an expectation on the first packet,
 	 * before connection is in the list, so we need to clean here,
 	 * too. */
-	remove_expectations(ct);
+	ip_ct_remove_expectations(ct);
 
 	/* We overload first tuple to link into unconfirmed list. */
 	if (!is_confirmed(ct)) {
@@ -260,8 +350,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	ip_conntrack_put(ct->master);
 
 	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
-	kmem_cache_free(ip_conntrack_cachep, ct);
-	atomic_dec(&ip_conntrack_count);
+	ip_conntrack_free(ct);
 }
 
 static void death_by_timeout(unsigned long ul_conntrack)
@@ -287,7 +376,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
 		&& ip_ct_tuple_equal(tuple, &i->tuple);
 }
 
-static struct ip_conntrack_tuple_hash *
+struct ip_conntrack_tuple_hash *
 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
 		    const struct ip_conntrack *ignored_conntrack)
 {
@@ -322,6 +411,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
 	return h;
 }
 
+static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
+					unsigned int hash,
+					unsigned int repl_hash)
+{
+	ct->id = ++ip_conntrack_next_id;
+	list_prepend(&ip_conntrack_hash[hash],
+		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_prepend(&ip_conntrack_hash[repl_hash],
+		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
+}
+
+void ip_conntrack_hash_insert(struct ip_conntrack *ct)
+{
+	unsigned int hash, repl_hash;
+
+	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	write_lock_bh(&ip_conntrack_lock);
+	__ip_conntrack_hash_insert(ct, hash, repl_hash);
+	write_unlock_bh(&ip_conntrack_lock);
+}
+
 /* Confirm a connection given skb; places it in hash table */
 int
 __ip_conntrack_confirm(struct sk_buff **pskb)
@@ -368,10 +480,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 		/* Remove from unconfirmed list */
 		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
 
-		list_prepend(&ip_conntrack_hash[hash],
-			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-		list_prepend(&ip_conntrack_hash[repl_hash],
-			     &ct->tuplehash[IP_CT_DIR_REPLY]);
+		__ip_conntrack_hash_insert(ct, hash, repl_hash);
 		/* Timer relative to confirmation time, not original
 		   setting time, otherwise we'd get timer wrap in
 		   weird delay cases. */
@@ -381,6 +490,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
 		CONNTRACK_STAT_INC(insert);
 		write_unlock_bh(&ip_conntrack_lock);
+		if (ct->helper)
+			ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+			ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+		ip_conntrack_event_cache(master_ct(ct) ?
+					 IPCT_RELATED : IPCT_NEW, *pskb);
+
 		return NF_ACCEPT;
 	}
 
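The calls added above illustrate the intended flow of the new event cache: hook code records event bits cheaply in per-CPU state, and they are delivered to the notifier chain in one batch once the packet is done. Schematically, with both functions taken from this patch:

static void example_event_flow(struct sk_buff **pskb, struct ip_conntrack *ct)
{
	/* hooks record events cheaply against the current packet's ct */
	ip_conntrack_event_cache(IPCT_STATUS, *pskb);
	/* ... further hooks may cache more event bits ... */

	/* delivery happens once, batching all cached bits into a single
	 * notifier_call_chain() invocation */
	ip_ct_deliver_cached_events(ct);
}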
@@ -445,34 +564,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
 	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
 }
 
-static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *
+__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
 {
 	return LIST_FIND(&helpers, helper_cmp,
 			 struct ip_conntrack_helper *,
 			 tuple);
 }
 
-/* Allocate a new conntrack: we return -ENOMEM if classification
-   failed due to stress.  Otherwise it really is unclassifiable. */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(const struct ip_conntrack_tuple *tuple,
-	       struct ip_conntrack_protocol *protocol,
-	       struct sk_buff *skb)
+struct ip_conntrack_helper *
+ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_helper *helper;
+
+	/* need ip_conntrack_lock to assure that helper exists until
+	 * try_module_get() is called */
+	read_lock_bh(&ip_conntrack_lock);
+
+	helper = __ip_conntrack_helper_find(tuple);
+	if (helper) {
+		/* need to increase module usage count to assure helper will
+		 * not go away while the caller is e.g. busy putting a
+		 * conntrack in the hash that uses the helper */
+		if (!try_module_get(helper->me))
+			helper = NULL;
+	}
+
+	read_unlock_bh(&ip_conntrack_lock);
+
+	return helper;
+}
+
+void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
+{
+	module_put(helper->me);
+}
+
+struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol)
+{
+	return ip_ct_protos[protocol];
+}
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol)
+{
+	struct ip_conntrack_protocol *p;
+
+	preempt_disable();
+	p = __ip_conntrack_proto_find(protocol);
+	if (p) {
+		if (!try_module_get(p->me))
+			p = &ip_conntrack_generic_protocol;
+	}
+	preempt_enable();
+
+	return p;
+}
+
+void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
+{
+	module_put(p->me);
+}
+
+struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+					struct ip_conntrack_tuple *repl)
 {
 	struct ip_conntrack *conntrack;
-	struct ip_conntrack_tuple repl_tuple;
-	size_t hash;
-	struct ip_conntrack_expect *exp;
 
 	if (!ip_conntrack_hash_rnd_initted) {
 		get_random_bytes(&ip_conntrack_hash_rnd, 4);
 		ip_conntrack_hash_rnd_initted = 1;
 	}
 
-	hash = hash_conntrack(tuple);
-
 	if (ip_conntrack_max
 	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+		unsigned int hash = hash_conntrack(orig);
 		/* Try dropping from this hash chain. */
 		if (!early_drop(&ip_conntrack_hash[hash])) {
 			if (net_ratelimit())
@@ -483,11 +652,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 		}
 	}
 
-	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
-		DEBUGP("Can't invert tuple.\n");
-		return NULL;
-	}
-
 	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
 	if (!conntrack) {
 		DEBUGP("Can't allocate conntrack.\n");
@@ -497,17 +661,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	memset(conntrack, 0, sizeof(*conntrack));
 	atomic_set(&conntrack->ct_general.use, 1);
 	conntrack->ct_general.destroy = destroy_conntrack;
-	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
-	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
-	if (!protocol->new(conntrack, skb)) {
-		kmem_cache_free(ip_conntrack_cachep, conntrack);
-		return NULL;
-	}
+	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
 	/* Don't set timer yet: wait for confirmation */
 	init_timer(&conntrack->timeout);
 	conntrack->timeout.data = (unsigned long)conntrack;
 	conntrack->timeout.function = death_by_timeout;
 
+	atomic_inc(&ip_conntrack_count);
+
+	return conntrack;
+}
+
+void
+ip_conntrack_free(struct ip_conntrack *conntrack)
+{
+	atomic_dec(&ip_conntrack_count);
+	kmem_cache_free(ip_conntrack_cachep, conntrack);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ * failed due to stress.  Otherwise it really is unclassifiable */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(struct ip_conntrack_tuple *tuple,
+	       struct ip_conntrack_protocol *protocol,
+	       struct sk_buff *skb)
+{
+	struct ip_conntrack *conntrack;
+	struct ip_conntrack_tuple repl_tuple;
+	struct ip_conntrack_expect *exp;
+
+	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+		DEBUGP("Can't invert tuple.\n");
+		return NULL;
+	}
+
+	conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+	if (conntrack == NULL || IS_ERR(conntrack))
+		return (struct ip_conntrack_tuple_hash *)conntrack;
+
+	if (!protocol->new(conntrack, skb)) {
+		ip_conntrack_free(conntrack);
+		return NULL;
+	}
+
 	write_lock_bh(&ip_conntrack_lock);
 	exp = find_expectation(tuple);
 
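Splitting allocation out of init_conntrack() creates a packet-less path for building entries, which the ctnetlink code added below presumably relies on. A sketch of that path using only functions introduced in this file (the tuples are assumed to be filled in by the caller):

/* Sketch: creating a conntrack entry without a triggering packet, as a
 * userspace-driven caller (e.g. ctnetlink) could. */
static int create_entry(struct ip_conntrack_tuple *orig,
			struct ip_conntrack_tuple *repl)
{
	struct ip_conntrack *ct;

	ct = ip_conntrack_alloc(orig, repl);
	if (ct == NULL || IS_ERR(ct))
		return -ENOMEM;

	/* ... set status bits, timeout and helper as needed ... */
	ip_conntrack_hash_insert(ct);	/* assigns ct->id, links both tuples */
	return 0;
}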
@@ -517,13 +714,18 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 		/* Welcome, Mr. Bond.  We've been expecting you... */
 		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
 		conntrack->master = exp->master;
-#if CONFIG_IP_NF_CONNTRACK_MARK
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
 		conntrack->mark = exp->master->mark;
 #endif
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+		/* this is ugly, but there is no other place where to put it */
+		conntrack->nat.masq_index = exp->master->nat.masq_index;
+#endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
 	} else {
-		conntrack->helper = ip_ct_find_helper(&repl_tuple);
+		conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
 
 		CONNTRACK_STAT_INC(new);
 	}
@@ -531,13 +733,12 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	/* Overload tuple linked list to put us in unconfirmed list. */
 	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
 
-	atomic_inc(&ip_conntrack_count);
 	write_unlock_bh(&ip_conntrack_lock);
 
 	if (exp) {
 		if (exp->expectfn)
 			exp->expectfn(conntrack, exp);
-		destroy_expect(exp);
+		ip_conntrack_expect_put(exp);
 	}
 
 	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
@@ -609,7 +810,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
 	struct ip_conntrack_protocol *proto;
-	int set_reply;
+	int set_reply = 0;
 	int ret;
 
 	/* Previously seen (loopback or untracked)?  Ignore. */
@@ -627,9 +828,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 		return NF_DROP;
 	}
 
-	/* FIXME: Do this right please. --RR */
-	(*pskb)->nfcache |= NFC_UNKNOWN;
-
 /* Doesn't cover locally-generated broadcast, so not worth it. */
 #if 0
 	/* Ignore broadcast: no `connection'. */
@@ -645,7 +843,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 	}
 #endif
 
-	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+	proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
 
 	/* It may be an special packet, error, unclean...
 	 * inverse of the return code tells to the netfilter
@@ -681,8 +879,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 		return -ret;
 	}
 
-	if (set_reply)
-		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+		ip_conntrack_event_cache(IPCT_STATUS, *pskb);
 
 	return ret;
 }
@@ -691,7 +889,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse,
 		   const struct ip_conntrack_tuple *orig)
 {
 	return ip_ct_invert_tuple(inverse, orig,
-				  ip_ct_find_proto(orig->dst.protonum));
+				  __ip_conntrack_proto_find(orig->dst.protonum));
 }
 
 /* Would two expected things clash? */
@@ -729,14 +927,14 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
 		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
 			unlink_expect(i);
 			write_unlock_bh(&ip_conntrack_lock);
-			destroy_expect(i);
+			ip_conntrack_expect_put(i);
 			return;
 		}
 	}
 	write_unlock_bh(&ip_conntrack_lock);
 }
 
-struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
 {
 	struct ip_conntrack_expect *new;
 
@@ -745,18 +943,23 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
 		DEBUGP("expect_related: OOM allocating expect\n");
 		return NULL;
 	}
-	new->master = NULL;
+	new->master = me;
+	atomic_inc(&new->master->ct_general.use);
+	atomic_set(&new->use, 1);
 	return new;
 }
 
-void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
 {
-	kmem_cache_free(ip_conntrack_expect_cachep, expect);
+	if (atomic_dec_and_test(&exp->use)) {
+		ip_conntrack_put(exp->master);
+		kmem_cache_free(ip_conntrack_expect_cachep, exp);
+	}
 }
 
 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 {
-	atomic_inc(&exp->master->ct_general.use);
+	atomic_inc(&exp->use);
 	exp->master->expecting++;
 	list_add(&exp->list, &ip_conntrack_expect_list);
 
@@ -766,6 +969,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
 	add_timer(&exp->timeout);
 
+	exp->id = ++ip_conntrack_expect_next_id;
+	atomic_inc(&exp->use);
 	CONNTRACK_STAT_INC(expect_create);
 }
 
@@ -778,7 +983,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
 		if (i->master == master) {
 			if (del_timer(&i->timeout)) {
 				unlink_expect(i);
-				destroy_expect(i);
+				ip_conntrack_expect_put(i);
 			}
 			break;
 		}
@@ -810,8 +1015,6 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 		/* Refresh timer: if it's dying, ignore.. */
 		if (refresh_timer(i)) {
 			ret = 0;
-			/* We don't need the one they've given us. */
-			ip_conntrack_expect_free(expect);
 			goto out;
 		}
 	} else if (expect_clash(i, expect)) {
@@ -826,6 +1029,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 	evict_oldest_expect(expect->master);
 
 	ip_conntrack_expect_insert(expect);
+	ip_conntrack_expect_event(IPEXP_NEW, expect);
 	ret = 0;
 out:
 	write_unlock_bh(&ip_conntrack_lock);
@@ -846,7 +1050,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
 
 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 	if (!conntrack->master && conntrack->expecting == 0)
-		conntrack->helper = ip_ct_find_helper(newreply);
+		conntrack->helper = __ip_conntrack_helper_find(newreply);
 	write_unlock_bh(&ip_conntrack_lock);
 }
 
@@ -860,11 +1064,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 	return 0;
 }
 
+struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *name)
+{
+	struct ip_conntrack_helper *h;
+
+	list_for_each_entry(h, &helpers, list) {
+		if (!strcmp(h->name, name))
+			return h;
+	}
+
+	return NULL;
+}
+
 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
 			 const struct ip_conntrack_helper *me)
 {
-	if (tuplehash_to_ctrack(i)->helper == me)
+	if (tuplehash_to_ctrack(i)->helper == me) {
+		ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
 		tuplehash_to_ctrack(i)->helper = NULL;
+	}
 	return 0;
 }
 
@@ -881,7 +1100,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
 		if (exp->master->helper == me && del_timer(&exp->timeout)) {
 			unlink_expect(exp);
-			destroy_expect(exp);
+			ip_conntrack_expect_put(exp);
 		}
 	}
 	/* Get rid of expecteds, set helpers to NULL. */
@@ -926,12 +1145,46 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
 		if (del_timer(&ct->timeout)) {
 			ct->timeout.expires = jiffies + extra_jiffies;
 			add_timer(&ct->timeout);
+			ip_conntrack_event_cache(IPCT_REFRESH, skb);
 		}
 		ct_add_counters(ct, ctinfo, skb);
 		write_unlock_bh(&ip_conntrack_lock);
 	}
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+			       const struct ip_conntrack_tuple *tuple)
+{
+	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+		&tuple->src.u.tcp.port);
+	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+		&tuple->dst.u.tcp.port);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+			       struct ip_conntrack_tuple *t)
+{
+	if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
+		return -EINVAL;
+
+	t->src.u.tcp.port =
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
+	t->dst.u.tcp.port =
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+
+	return 0;
+}
+#endif
+
 /* Returns new sk_buff, or NULL */
 struct sk_buff *
 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
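These converters let any port-based tracker expose its tuples over ctnetlink without depending on the netlink module. A sketch of the wiring; tuple_to_nfattr is the field name this patch itself dereferences, while the parse-direction field name is an assumption:

/* Sketch: wiring the generic port converters into a protocol tracker.
 * The nfattr_to_tuple field name is an assumption; tuple_to_nfattr is
 * dereferenced by the ctnetlink code below. */
struct ip_conntrack_protocol my_port_proto = {
	/* ... the usual pkt_to_tuple/invert_tuple/packet callbacks ... */
	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
};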
@@ -942,10 +1195,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 	skb = ip_defrag(skb, user);
 	local_bh_enable();
 
-	if (skb) {
+	if (skb)
 		ip_send_check(skb->nh.iph);
-		skb->nfcache |= NFC_ALTERED;
-	}
 	return skb;
 }
 
@@ -1095,23 +1346,31 @@ static void free_conntrack_hash(void)
 			   * ip_conntrack_htable_size));
 }
 
-/* Mishearing the voices in his head, our hero wonders how he's
-   supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
+void ip_conntrack_flush()
 {
-	ip_ct_attach = NULL;
 	/* This makes sure all current packets have passed through
 	   netfilter framework.  Roll on, two-stage module
 	   delete... */
 	synchronize_net();
 
+	ip_ct_event_cache_flush();
  i_see_dead_people:
 	ip_ct_iterate_cleanup(kill_all, NULL);
 	if (atomic_read(&ip_conntrack_count) != 0) {
 		schedule();
 		goto i_see_dead_people;
 	}
+	/* wait until all references to ip_conntrack_untracked are dropped */
+	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
+		schedule();
+}
 
+/* Mishearing the voices in his head, our hero wonders how he's
+   supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+	ip_ct_attach = NULL;
+	ip_conntrack_flush();
 	kmem_cache_destroy(ip_conntrack_cachep);
 	kmem_cache_destroy(ip_conntrack_expect_cachep);
 	free_conntrack_hash();
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index fea6dd2a00b6..3a2627db1729 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
 MODULE_DESCRIPTION("ftp connection tracking helper");
 
 /* This is slow, but it's simple. --RR */
-static char ftp_buffer[65536];
-
+static char *ftp_buffer;
 static DEFINE_SPINLOCK(ip_ftp_lock);
 
 #define MAX_PORTS 8
@@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
 }
 
 /* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+			  struct sk_buff *skb)
 {
 	unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
 
@@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
 			oldest = i;
 	}
 
-	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
 		info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
-	else if (oldest != NUM_SEQ_TO_REMEMBER)
+		ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+	} else if (oldest != NUM_SEQ_TO_REMEMBER) {
 		info->seq_aft_nl[dir][oldest] = nl_seq;
+		ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+	}
 }
 
 static int help(struct sk_buff **pskb,
@@ -376,7 +379,7 @@ static int help(struct sk_buff **pskb,
 		fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
 
 	/* Allocate expectation which will be inserted */
-	exp = ip_conntrack_expect_alloc();
+	exp = ip_conntrack_expect_alloc(ct);
 	if (exp == NULL) {
 		ret = NF_DROP;
 		goto out;
@@ -403,8 +406,7 @@ static int help(struct sk_buff **pskb,
 	   networks, or the packet filter itself). */
 	if (!loose) {
 		ret = NF_ACCEPT;
-		ip_conntrack_expect_free(exp);
-		goto out_update_nl;
+		goto out_put_expect;
 	}
 	exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
 				 | (array[2] << 8) | array[3]);
@@ -419,7 +421,6 @@ static int help(struct sk_buff **pskb,
 		  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
 
 	exp->expectfn = NULL;
-	exp->master = ct;
 
 	/* Now, NAT might want to mangle the packet, and register the
 	 * (possibly changed) expectation itself. */
@@ -428,18 +429,20 @@ static int help(struct sk_buff **pskb,
 				       matchoff, matchlen, exp, &seq);
 	else {
 		/* Can't expect this?  Best to drop packet now. */
-		if (ip_conntrack_expect_related(exp) != 0) {
-			ip_conntrack_expect_free(exp);
+		if (ip_conntrack_expect_related(exp) != 0)
 			ret = NF_DROP;
-		} else
+		else
 			ret = NF_ACCEPT;
 	}
 
+out_put_expect:
+	ip_conntrack_expect_put(exp);
+
 out_update_nl:
 	/* Now if this ends in \n, update ftp info. Seq may have been
 	 * adjusted by NAT code. */
 	if (ends_in_nl)
-		update_nl_seq(seq, ct_ftp_info,dir);
+		update_nl_seq(seq, ct_ftp_info,dir, *pskb);
  out:
 	spin_unlock_bh(&ip_ftp_lock);
 	return ret;
@@ -457,6 +460,8 @@ static void fini(void)
 					ports[i]);
 		ip_conntrack_helper_unregister(&ftp[i]);
 	}
+
+	kfree(ftp_buffer);
 }
 
 static int __init init(void)
@@ -464,6 +469,10 @@ static int __init init(void)
 	int i, ret;
 	char *tmpname;
 
+	ftp_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!ftp_buffer)
+		return -ENOMEM;
+
 	if (ports_c == 0)
 		ports[ports_c++] = FTP_PORT;
 
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index cd98772cc332..25438eec21a1 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -39,7 +39,7 @@ static int ports_c;
 static int max_dcc_channels = 8;
 static unsigned int dcc_timeout = 300;
 /* This is slow, but it's simple. --RR */
-static char irc_buffer[65536];
+static char *irc_buffer;
 static DEFINE_SPINLOCK(irc_buffer_lock);
 
 unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
@@ -197,7 +197,7 @@ static int help(struct sk_buff **pskb,
 				continue;
 			}
 
-			exp = ip_conntrack_expect_alloc();
+			exp = ip_conntrack_expect_alloc(ct);
 			if (exp == NULL) {
 				ret = NF_DROP;
 				goto out;
@@ -221,16 +221,14 @@ static int help(struct sk_buff **pskb,
 				  { { 0, { 0 } },
 				    { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
 			exp->expectfn = NULL;
-			exp->master = ct;
 			if (ip_nat_irc_hook)
 				ret = ip_nat_irc_hook(pskb, ctinfo,
 						      addr_beg_p - ib_ptr,
 						      addr_end_p - addr_beg_p,
 						      exp);
-			else if (ip_conntrack_expect_related(exp) != 0) {
-				ip_conntrack_expect_free(exp);
+			else if (ip_conntrack_expect_related(exp) != 0)
 				ret = NF_DROP;
-			}
+			ip_conntrack_expect_put(exp);
 			goto out;
 		} /* for .. NUM_DCCPROTO */
 	} /* while data < ... */
@@ -259,6 +257,10 @@ static int __init init(void)
 		printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
 		return -EBUSY;
 	}
+
+	irc_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!irc_buffer)
+		return -ENOMEM;
 
 	/* If no port given, default to standard irc port */
 	if (ports_c == 0)
@@ -306,6 +308,7 @@ static void fini(void)
 					ports[i]);
 		ip_conntrack_helper_unregister(&irc_helpers[i]);
 	}
+	kfree(irc_buffer);
 }
 
 module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
new file mode 100644
index 000000000000..a4e9278db4ed
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -0,0 +1,1579 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * I've reworked this stuff to use attributes instead of conntrack
+ * structures. 5.44 am. I need more tea. --pablo 05/07/11.
+ *
+ * Initial connection tracking via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.90";
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+
+static inline int
+ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+			    const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_protocol *proto;
+
+	NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+
+	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+	if (proto && proto->tuple_to_nfattr)
+		return proto->tuple_to_nfattr(skb, tuple);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_tuples(struct sk_buff *skb,
+		      const struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *nest_parms;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
+	NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip);
+	NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip);
+	NFA_NEST_END(skb, nest_parms);
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
+	ctnetlink_dump_tuples_proto(skb, tuple);
+	NFA_NEST_END(skb, nest_parms);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t status = htonl((u_int32_t) ct->status);
+	NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	long timeout_l = ct->timeout.expires - jiffies;
+	u_int32_t timeout;
+
+	if (timeout_l < 0)
+		timeout = 0;
+	else
+		timeout = htonl(timeout_l / HZ);
+
+	NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+
+	struct nfattr *nest_proto;
+	int ret;
+
+	if (!proto || !proto->to_nfattr)
+		return 0;
+
+	nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
+
+	ret = proto->to_nfattr(skb, nest_proto, ct);
+
+	ip_conntrack_proto_put(proto);
+
+	NFA_NEST_END(skb, nest_proto);
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	struct nfattr *nest_helper;
+
+	if (!ct->helper)
+		return 0;
+
+	nest_helper = NFA_NEST(skb, CTA_HELP);
+	NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name);
+
+	if (ct->helper->to_nfattr)
+		ct->helper->to_nfattr(skb, ct);
+
+	NFA_NEST_END(skb, nest_helper);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static inline int
+ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
+			enum ip_conntrack_dir dir)
+{
+	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+	struct nfattr *nest_count = NFA_NEST(skb, type);
+	u_int64_t tmp;
+
+	tmp = cpu_to_be64(ct->counters[dir].packets);
+	NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp);
+
+	tmp = cpu_to_be64(ct->counters[dir].bytes);
+	NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp);
+
+	NFA_NEST_END(skb, nest_count);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_counters(a, b, c) (0)
+#endif
+
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t mark = htonl(ct->mark);
+
+	NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t id = htonl(ct->id);
+	NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	unsigned int use = htonl(atomic_read(&ct->ct_general.use));
+
+	NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
+
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		    int event, int nowait,
+		    const struct ip_conntrack *ct)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nfattr *nest_parms;
+	unsigned char *b;
+
+	b = skb->tail;
+
+	event |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = (nowait && pid) ? NLM_F_MULTI : 0;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version      = NFNETLINK_V0;
+	nfmsg->res_id       = 0;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+
+	if (ctnetlink_dump_status(skb, ct) < 0 ||
+	    ctnetlink_dump_timeout(skb, ct) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_mark(skb, ct) < 0 ||
+	    ctnetlink_dump_id(skb, ct) < 0 ||
+	    ctnetlink_dump_use(skb, ct) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
288
289#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
290static int ctnetlink_conntrack_event(struct notifier_block *this,
291 unsigned long events, void *ptr)
292{
293 struct nlmsghdr *nlh;
294 struct nfgenmsg *nfmsg;
295 struct nfattr *nest_parms;
296 struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
297 struct sk_buff *skb;
298 unsigned int type;
299 unsigned char *b;
300 unsigned int flags = 0, group;
301
302 /* ignore our fake conntrack entry */
303 if (ct == &ip_conntrack_untracked)
304 return NOTIFY_DONE;
305
306 if (events & IPCT_DESTROY) {
307 type = IPCTNL_MSG_CT_DELETE;
308 group = NFNLGRP_CONNTRACK_DESTROY;
309 goto alloc_skb;
310 }
311 if (events & (IPCT_NEW | IPCT_RELATED)) {
312 type = IPCTNL_MSG_CT_NEW;
313 flags = NLM_F_CREATE|NLM_F_EXCL;
314 /* dump everything */
315 events = ~0UL;
316 group = NFNLGRP_CONNTRACK_NEW;
317 goto alloc_skb;
318 }
319 if (events & (IPCT_STATUS |
320 IPCT_PROTOINFO |
321 IPCT_HELPER |
322 IPCT_HELPINFO |
323 IPCT_NATINFO)) {
324 type = IPCTNL_MSG_CT_NEW;
325 group = NFNLGRP_CONNTRACK_UPDATE;
326 goto alloc_skb;
327 }
328
329 return NOTIFY_DONE;
330
331alloc_skb:
332 /* FIXME: Check if there are any listeners before, don't hurt performance */
333
334 skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
335 if (!skb)
336 return NOTIFY_DONE;
337
338 b = skb->tail;
339
340 type |= NFNL_SUBSYS_CTNETLINK << 8;
341 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
342 nfmsg = NLMSG_DATA(nlh);
343
344 nlh->nlmsg_flags = flags;
345 nfmsg->nfgen_family = AF_INET;
346 nfmsg->version = NFNETLINK_V0;
347 nfmsg->res_id = 0;
348
349 nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
350 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
351 goto nfattr_failure;
352 NFA_NEST_END(skb, nest_parms);
353
354 nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
355 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
356 goto nfattr_failure;
357 NFA_NEST_END(skb, nest_parms);
358
359 /* NAT stuff is now a status flag */
360 if ((events & IPCT_STATUS || events & IPCT_NATINFO)
361 && ctnetlink_dump_status(skb, ct) < 0)
362 goto nfattr_failure;
363 if (events & IPCT_REFRESH
364 && ctnetlink_dump_timeout(skb, ct) < 0)
365 goto nfattr_failure;
366 if (events & IPCT_PROTOINFO
367 && ctnetlink_dump_protoinfo(skb, ct) < 0)
368 goto nfattr_failure;
369 if (events & IPCT_HELPINFO
370 && ctnetlink_dump_helpinfo(skb, ct) < 0)
371 goto nfattr_failure;
372
373 if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
374 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
375 goto nfattr_failure;
376
377 nlh->nlmsg_len = skb->tail - b;
378 nfnetlink_send(skb, 0, group, 0);
379 return NOTIFY_DONE;
380
381nlmsg_failure:
382nfattr_failure:
383 kfree_skb(skb);
384 return NOTIFY_DONE;
385}
386#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
387
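The handler above fans events out to three nfnetlink multicast groups (NEW, UPDATE, DESTROY). For context only, a userspace listener of this era would subscribe roughly as below; this sketch is an assumption about the userspace API, not part of the patch:

/* Userspace sketch (assumed API, not part of this patch): subscribe to
 * the conntrack event groups the handler above sends to. */
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/netfilter/nfnetlink.h>

static int open_ct_events(void)
{
	struct sockaddr_nl snl = { .nl_family = AF_NETLINK };
	int grp = NFNLGRP_CONNTRACK_NEW;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);

	if (fd < 0 || bind(fd, (struct sockaddr *)&snl, sizeof(snl)) < 0)
		return -1;
	/* one setsockopt per group of interest */
	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &grp, sizeof(grp));
	return fd;	/* recv() now yields IPCTNL_MSG_CT_* messages */
}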
388static int ctnetlink_done(struct netlink_callback *cb)
389{
390 DEBUGP("entered %s\n", __FUNCTION__);
391 return 0;
392}
393
394static int
395ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
396{
397 struct ip_conntrack *ct = NULL;
398 struct ip_conntrack_tuple_hash *h;
399 struct list_head *i;
400 u_int32_t *id = (u_int32_t *) &cb->args[1];
401
402 DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
403 cb->args[0], *id);
404
405 read_lock_bh(&ip_conntrack_lock);
406 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
407 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
408 h = (struct ip_conntrack_tuple_hash *) i;
409 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
410 continue;
411 ct = tuplehash_to_ctrack(h);
412 if (ct->id <= *id)
413 continue;
414 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
415 cb->nlh->nlmsg_seq,
416 IPCTNL_MSG_CT_NEW,
417 1, ct) < 0)
418 goto out;
419 *id = ct->id;
420 }
421 }
422out:
423 read_unlock_bh(&ip_conntrack_lock);
424
425 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
426
427 return skb->len;
428}
429
430#ifdef CONFIG_IP_NF_CT_ACCT
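/* Same as ctnetlink_dump_table(), but takes the write lock because it
 * zeroes the per-conntrack counters after dumping them (CTRZERO). */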
431static int
432ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
433{
434 struct ip_conntrack *ct = NULL;
435 struct ip_conntrack_tuple_hash *h;
436 struct list_head *i;
437 u_int32_t *id = (u_int32_t *) &cb->args[1];
438
439	DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
440 cb->args[0], *id);
441
442 write_lock_bh(&ip_conntrack_lock);
443 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
444 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
445 h = (struct ip_conntrack_tuple_hash *) i;
446 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
447 continue;
448 ct = tuplehash_to_ctrack(h);
449 if (ct->id <= *id)
450 continue;
451 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
452 cb->nlh->nlmsg_seq,
453 IPCTNL_MSG_CT_NEW,
454 1, ct) < 0)
455 goto out;
456 *id = ct->id;
457
458 memset(&ct->counters, 0, sizeof(ct->counters));
459 }
460 }
461out:
462 write_unlock_bh(&ip_conntrack_lock);
463
464 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
465
466 return skb->len;
467}
468#endif
469
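/* Minimum payload sizes for the attributes parsed below, enforced via
 * nfattr_bad_size() before any attribute data is dereferenced. */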
470static const int cta_min_ip[CTA_IP_MAX] = {
471 [CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
472 [CTA_IP_V4_DST-1] = sizeof(u_int32_t),
473};
474
475static inline int
476ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
477{
478 struct nfattr *tb[CTA_IP_MAX];
479
480 DEBUGP("entered %s\n", __FUNCTION__);
481
483 if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
484 goto nfattr_failure;
485
486 if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
487 return -EINVAL;
488
489 if (!tb[CTA_IP_V4_SRC-1])
490 return -EINVAL;
491 tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
492
493 if (!tb[CTA_IP_V4_DST-1])
494 return -EINVAL;
495 tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
496
497 DEBUGP("leaving\n");
498
499 return 0;
500
501nfattr_failure:
502 return -1;
503}
504
505static const int cta_min_proto[CTA_PROTO_MAX] = {
506 [CTA_PROTO_NUM-1] = sizeof(u_int16_t),
507 [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
508 [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
509 [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
510 [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
511 [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t),
512};
513
514static inline int
515ctnetlink_parse_tuple_proto(struct nfattr *attr,
516 struct ip_conntrack_tuple *tuple)
517{
518 struct nfattr *tb[CTA_PROTO_MAX];
519 struct ip_conntrack_protocol *proto;
520 int ret = 0;
521
522 DEBUGP("entered %s\n", __FUNCTION__);
523
524 if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
525 goto nfattr_failure;
526
527 if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
528 return -EINVAL;
529
530 if (!tb[CTA_PROTO_NUM-1])
531 return -EINVAL;
532 tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
533
534 proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
535
536 if (likely(proto && proto->nfattr_to_tuple)) {
537 ret = proto->nfattr_to_tuple(tb, tuple);
538 ip_conntrack_proto_put(proto);
539 }
540
541 return ret;
542
543nfattr_failure:
544 return -1;
545}
546
547static inline int
548ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
549 enum ctattr_tuple type)
550{
551 struct nfattr *tb[CTA_TUPLE_MAX];
552 int err;
553
554 DEBUGP("entered %s\n", __FUNCTION__);
555
556 memset(tuple, 0, sizeof(*tuple));
557
558 if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
559 goto nfattr_failure;
560
561 if (!tb[CTA_TUPLE_IP-1])
562 return -EINVAL;
563
564 err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
565 if (err < 0)
566 return err;
567
568 if (!tb[CTA_TUPLE_PROTO-1])
569 return -EINVAL;
570
571 err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
572 if (err < 0)
573 return err;
574
575	/* reply tuples get DIR_REPLY, orig and expect tuples get DIR_ORIGINAL */
576 if (type == CTA_TUPLE_REPLY)
577 tuple->dst.dir = IP_CT_DIR_REPLY;
578 else
579 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
580
581 DUMP_TUPLE(tuple);
582
583 DEBUGP("leaving\n");
584
585 return 0;
586
587nfattr_failure:
588 return -1;
589}
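/* Illustrative sketch (not taken verbatim from any header): the nested
 * attribute layout the parser above expects, e.g. for a TCP or UDP tuple:
 *
 *	CTA_TUPLE_ORIG
 *	  CTA_TUPLE_IP
 *	    CTA_IP_V4_SRC	(u_int32_t)
 *	    CTA_IP_V4_DST	(u_int32_t)
 *	  CTA_TUPLE_PROTO
 *	    CTA_PROTO_NUM	(u_int16_t)
 *	    CTA_PROTO_SRC_PORT	(u_int16_t)
 *	    CTA_PROTO_DST_PORT	(u_int16_t)
 */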
590
591#ifdef CONFIG_IP_NF_NAT_NEEDED
592static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
593 [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
594 [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
595};
596
597static int ctnetlink_parse_nat_proto(struct nfattr *attr,
598 const struct ip_conntrack *ct,
599 struct ip_nat_range *range)
600{
601 struct nfattr *tb[CTA_PROTONAT_MAX];
602 struct ip_nat_protocol *npt;
603
604 DEBUGP("entered %s\n", __FUNCTION__);
605
606 if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
607 goto nfattr_failure;
608
609 if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
610 goto nfattr_failure;
611
612 npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
613 if (!npt)
614 return 0;
615
616 if (!npt->nfattr_to_range) {
617 ip_nat_proto_put(npt);
618 return 0;
619 }
620
621 /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
622 if (npt->nfattr_to_range(tb, range) > 0)
623 range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
624
625 ip_nat_proto_put(npt);
626
627 DEBUGP("leaving\n");
628 return 0;
629
630nfattr_failure:
631 return -1;
632}
633
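/* Build an ip_nat_range from the CTA_NAT attribute: an IP range plus an
 * optional protocol-specific part such as a port range. */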
634static inline int
635ctnetlink_parse_nat(struct nfattr *cda[],
636 const struct ip_conntrack *ct, struct ip_nat_range *range)
637{
638 struct nfattr *tb[CTA_NAT_MAX];
639 int err;
640
641 DEBUGP("entered %s\n", __FUNCTION__);
642
643 memset(range, 0, sizeof(*range));
644
645 if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
646 goto nfattr_failure;
647
648 if (tb[CTA_NAT_MINIP-1])
649 range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
650
651 if (!tb[CTA_NAT_MAXIP-1])
652 range->max_ip = range->min_ip;
653 else
654 range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
655
656 if (range->min_ip)
657 range->flags |= IP_NAT_RANGE_MAP_IPS;
658
659 if (!tb[CTA_NAT_PROTO-1])
660 return 0;
661
662 err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
663 if (err < 0)
664 return err;
665
666 DEBUGP("leaving\n");
667 return 0;
668
669nfattr_failure:
670 return -1;
671}
672#endif
673
674static inline int
675ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
676{
677 struct nfattr *tb[CTA_HELP_MAX];
678
679 DEBUGP("entered %s\n", __FUNCTION__);
680
681 if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
682 goto nfattr_failure;
683
684 if (!tb[CTA_HELP_NAME-1])
685 return -EINVAL;
686
687 *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
688
689 return 0;
690
691nfattr_failure:
692 return -1;
693}
694
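/* Delete a conntrack selected by its original or reply tuple, optionally
 * cross-checked against CTA_ID; with no tuple given at all, flush the
 * whole table. */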
695static int
696ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
697 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
698{
699 struct ip_conntrack_tuple_hash *h;
700 struct ip_conntrack_tuple tuple;
701 struct ip_conntrack *ct;
702 int err = 0;
703
704 DEBUGP("entered %s\n", __FUNCTION__);
705
706 if (cda[CTA_TUPLE_ORIG-1])
707 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
708 else if (cda[CTA_TUPLE_REPLY-1])
709 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
710 else {
711 /* Flush the whole table */
712 ip_conntrack_flush();
713 return 0;
714 }
715
716 if (err < 0)
717 return err;
718
719 h = ip_conntrack_find_get(&tuple, NULL);
720 if (!h) {
721 DEBUGP("tuple not found in conntrack hash\n");
722 return -ENOENT;
723 }
724
725 ct = tuplehash_to_ctrack(h);
726
727 if (cda[CTA_ID-1]) {
728 u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
729 if (ct->id != id) {
730 ip_conntrack_put(ct);
731 return -ENOENT;
732 }
733 }
734 if (del_timer(&ct->timeout)) {
735 ip_conntrack_put(ct);
736 ct->timeout.function((unsigned long)ct);
737 return 0;
738 }
739 ip_conntrack_put(ct);
740 DEBUGP("leaving\n");
741
742 return 0;
743}
744
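/* Look up a single conntrack by tuple and unicast it back to the
 * requester; with NLM_F_DUMP, hand the whole table to
 * netlink_dump_start() instead. */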
745static int
746ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
747 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
748{
749 struct ip_conntrack_tuple_hash *h;
750 struct ip_conntrack_tuple tuple;
751 struct ip_conntrack *ct;
752 struct sk_buff *skb2 = NULL;
753 int err = 0;
754
755 DEBUGP("entered %s\n", __FUNCTION__);
756
757 if (nlh->nlmsg_flags & NLM_F_DUMP) {
758 struct nfgenmsg *msg = NLMSG_DATA(nlh);
759 u32 rlen;
760
761 if (msg->nfgen_family != AF_INET)
762 return -EAFNOSUPPORT;
763
764 if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
765 IPCTNL_MSG_CT_GET_CTRZERO) {
766#ifdef CONFIG_IP_NF_CT_ACCT
767 if ((*errp = netlink_dump_start(ctnl, skb, nlh,
768 ctnetlink_dump_table_w,
769 ctnetlink_done)) != 0)
770 return -EINVAL;
771#else
772			return -EOPNOTSUPP;
773#endif
774 } else {
775 if ((*errp = netlink_dump_start(ctnl, skb, nlh,
776 ctnetlink_dump_table,
777 ctnetlink_done)) != 0)
778 return -EINVAL;
779 }
780
781 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
782 if (rlen > skb->len)
783 rlen = skb->len;
784 skb_pull(skb, rlen);
785 return 0;
786 }
787
788 if (cda[CTA_TUPLE_ORIG-1])
789 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
790 else if (cda[CTA_TUPLE_REPLY-1])
791 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
792 else
793 return -EINVAL;
794
795 if (err < 0)
796 return err;
797
798 h = ip_conntrack_find_get(&tuple, NULL);
799 if (!h) {
800 DEBUGP("tuple not found in conntrack hash");
801 return -ENOENT;
802 }
803 DEBUGP("tuple found\n");
804 ct = tuplehash_to_ctrack(h);
805
806 err = -ENOMEM;
807 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
808 if (!skb2) {
809 ip_conntrack_put(ct);
810 return -ENOMEM;
811 }
812 NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
813
814 err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
815 IPCTNL_MSG_CT_NEW, 1, ct);
816 ip_conntrack_put(ct);
817 if (err <= 0)
818 goto out;
819
820 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
821 if (err < 0)
822		return err; /* netlink_unicast() has already freed skb2 */
823
824 DEBUGP("leaving\n");
825 return 0;
826
827out:
828 if (skb2)
829 kfree_skb(skb2);
830 return -1;
831}
832
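/* Update ct->status from userspace.  EXPECTED/CONFIRMED/DYING are
 * unchangeable, and SEEN_REPLY/ASSURED may only be set, never cleared;
 * NAT setup is handled here as well, since it is carried as status bits. */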
833static inline int
834ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
835{
836 unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]);
837 d = ct->status ^ status;
838
839 if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
840 /* unchangeable */
841 return -EINVAL;
842
843 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
844 /* SEEN_REPLY bit can only be set */
845 return -EINVAL;
846
848 if (d & IPS_ASSURED && !(status & IPS_ASSURED))
849 /* ASSURED bit can only be set */
850 return -EINVAL;
851
852 if (cda[CTA_NAT-1]) {
853#ifndef CONFIG_IP_NF_NAT_NEEDED
854 return -EINVAL;
855#else
856 unsigned int hooknum;
857 struct ip_nat_range range;
858
859 if (ctnetlink_parse_nat(cda, ct, &range) < 0)
860 return -EINVAL;
861
862 DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n",
863 NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
864		       ntohs(range.min.all), ntohs(range.max.all));
865
866 /* This is tricky but it works. ip_nat_setup_info needs the
867 * hook number as parameter, so let's do the correct
868 * conversion and run away */
869 if (status & IPS_SRC_NAT_DONE)
870 hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
871 else if (status & IPS_DST_NAT_DONE)
872 hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */
873 else
874 return -EINVAL; /* Missing NAT flags */
875
876 DEBUGP("NAT status: %lu\n",
877 status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
878
879 if (ip_nat_initialized(ct, hooknum))
880 return -EEXIST;
881 ip_nat_setup_info(ct, &range, hooknum);
882
883 DEBUGP("NAT status after setup_info: %lu\n",
884 ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
885#endif
886 }
887
888 /* Be careful here, modifying NAT bits can screw up things,
889 * so don't let users modify them directly if they don't pass
890 * ip_nat_range. */
891 ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
892 return 0;
893}
894
895
896static inline int
897ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
898{
899 struct ip_conntrack_helper *helper;
900 char *helpname;
901 int err;
902
903 DEBUGP("entered %s\n", __FUNCTION__);
904
905 /* don't change helper of sibling connections */
906 if (ct->master)
907 return -EINVAL;
908
909 err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
910 if (err < 0)
911 return err;
912
913 helper = __ip_conntrack_helper_find_byname(helpname);
914 if (!helper) {
915 if (!strcmp(helpname, ""))
916 helper = NULL;
917 else
918 return -EINVAL;
919 }
920
921 if (ct->helper) {
922 if (!helper) {
923 /* we had a helper before ... */
924 ip_ct_remove_expectations(ct);
925 ct->helper = NULL;
926 } else {
927 /* need to zero data of old helper */
928 memset(&ct->help, 0, sizeof(ct->help));
929 }
930 }
931
932 ct->helper = helper;
933
934 return 0;
935}
936
937static inline int
938ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
939{
940 u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
941
942 if (!del_timer(&ct->timeout))
943 return -ETIME;
944
945 ct->timeout.expires = jiffies + timeout * HZ;
946 add_timer(&ct->timeout);
947
948 return 0;
949}
950
951static int
952ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
953{
954 int err;
955
956 DEBUGP("entered %s\n", __FUNCTION__);
957
958 if (cda[CTA_HELP-1]) {
959 err = ctnetlink_change_helper(ct, cda);
960 if (err < 0)
961 return err;
962 }
963
964 if (cda[CTA_TIMEOUT-1]) {
965 err = ctnetlink_change_timeout(ct, cda);
966 if (err < 0)
967 return err;
968 }
969
970 if (cda[CTA_STATUS-1]) {
971 err = ctnetlink_change_status(ct, cda);
972 if (err < 0)
973 return err;
974 }
975
976 DEBUGP("all done\n");
977 return 0;
978}
979
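/* Create a fresh conntrack from the parsed tuples.  CTA_TIMEOUT is
 * mandatory; the entry goes live once its timer is armed and it is
 * inserted into the hash. */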
980static int
981ctnetlink_create_conntrack(struct nfattr *cda[],
982 struct ip_conntrack_tuple *otuple,
983 struct ip_conntrack_tuple *rtuple)
984{
985 struct ip_conntrack *ct;
986 int err = -EINVAL;
987
988 DEBUGP("entered %s\n", __FUNCTION__);
989
990 ct = ip_conntrack_alloc(otuple, rtuple);
991 if (ct == NULL || IS_ERR(ct))
992 return -ENOMEM;
993
994 if (!cda[CTA_TIMEOUT-1])
995 goto err;
996 ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
997
998 ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
999 ct->status |= IPS_CONFIRMED;
1000
1001 err = ctnetlink_change_status(ct, cda);
1002 if (err < 0)
1003 goto err;
1004
1005 ct->helper = ip_conntrack_helper_find_get(rtuple);
1006
1007 add_timer(&ct->timeout);
1008 ip_conntrack_hash_insert(ct);
1009
1010 if (ct->helper)
1011 ip_conntrack_helper_put(ct->helper);
1012
1013 DEBUGP("conntrack with id %u inserted\n", ct->id);
1014 return 0;
1015
1016err:
1017 ip_conntrack_free(ct);
1018 return err;
1019}
1020
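/* IPCTNL_MSG_CT_NEW handler: create the conntrack if the tuple is
 * unknown (NLM_F_CREATE), otherwise update the existing entry unless
 * NLM_F_EXCL asks us to fail with EEXIST. */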
1021static int
1022ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
1023 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1024{
1025 struct ip_conntrack_tuple otuple, rtuple;
1026 struct ip_conntrack_tuple_hash *h = NULL;
1027 int err = 0;
1028
1029 DEBUGP("entered %s\n", __FUNCTION__);
1030
1031 if (cda[CTA_TUPLE_ORIG-1]) {
1032 err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
1033 if (err < 0)
1034 return err;
1035 }
1036
1037 if (cda[CTA_TUPLE_REPLY-1]) {
1038 err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
1039 if (err < 0)
1040 return err;
1041 }
1042
1043 write_lock_bh(&ip_conntrack_lock);
1044 if (cda[CTA_TUPLE_ORIG-1])
1045 h = __ip_conntrack_find(&otuple, NULL);
1046 else if (cda[CTA_TUPLE_REPLY-1])
1047 h = __ip_conntrack_find(&rtuple, NULL);
1048
1049 if (h == NULL) {
1050 write_unlock_bh(&ip_conntrack_lock);
1051 DEBUGP("no such conntrack, create new\n");
1052 err = -ENOENT;
1053 if (nlh->nlmsg_flags & NLM_F_CREATE)
1054 err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
1055 return err;
1056 }
1057 /* implicit 'else' */
1058
1059 /* we only allow nat config for new conntracks */
1060 if (cda[CTA_NAT-1]) {
1061 err = -EINVAL;
1062 goto out_unlock;
1063 }
1064
1065 /* We manipulate the conntrack inside the global conntrack table lock,
1066 * so there's no need to increase the refcount */
1067 DEBUGP("conntrack found\n");
1068 err = -EEXIST;
1069 if (!(nlh->nlmsg_flags & NLM_F_EXCL))
1070 err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
1071
1072out_unlock:
1073 write_unlock_bh(&ip_conntrack_lock);
1074 return err;
1075}
1076
1077/***********************************************************************
1078 * EXPECT
1079 ***********************************************************************/
1080
1081static inline int
1082ctnetlink_exp_dump_tuple(struct sk_buff *skb,
1083 const struct ip_conntrack_tuple *tuple,
1084 enum ctattr_expect type)
1085{
1086 struct nfattr *nest_parms = NFA_NEST(skb, type);
1087
1088 if (ctnetlink_dump_tuples(skb, tuple) < 0)
1089 goto nfattr_failure;
1090
1091 NFA_NEST_END(skb, nest_parms);
1092
1093 return 0;
1094
1095nfattr_failure:
1096 return -1;
1097}
1098
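/* Dump one expectation: its tuple, mask, master tuple, remaining
 * timeout and id. */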
1099static inline int
1100ctnetlink_exp_dump_expect(struct sk_buff *skb,
1101 const struct ip_conntrack_expect *exp)
1102{
1103 struct ip_conntrack *master = exp->master;
1104 u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
1105 u_int32_t id = htonl(exp->id);
1106
1107 if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
1108 goto nfattr_failure;
1109 if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
1110 goto nfattr_failure;
1111 if (ctnetlink_exp_dump_tuple(skb,
1112 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1113 CTA_EXPECT_MASTER) < 0)
1114 goto nfattr_failure;
1115
1116 NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
1117 NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
1118
1119 return 0;
1120
1121nfattr_failure:
1122 return -1;
1123}
1124
1125static int
1126ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
1127 int event,
1128 int nowait,
1129 const struct ip_conntrack_expect *exp)
1130{
1131 struct nlmsghdr *nlh;
1132 struct nfgenmsg *nfmsg;
1133 unsigned char *b;
1134
1135 b = skb->tail;
1136
1137 event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
1138 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
1139 nfmsg = NLMSG_DATA(nlh);
1140
1141 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1142 nfmsg->nfgen_family = AF_INET;
1143 nfmsg->version = NFNETLINK_V0;
1144 nfmsg->res_id = 0;
1145
1146 if (ctnetlink_exp_dump_expect(skb, exp) < 0)
1147 goto nfattr_failure;
1148
1149 nlh->nlmsg_len = skb->tail - b;
1150 return skb->len;
1151
1152nlmsg_failure:
1153nfattr_failure:
1154 skb_trim(skb, b - skb->data);
1155 return -1;
1156}
1157
1158#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1159static int ctnetlink_expect_event(struct notifier_block *this,
1160 unsigned long events, void *ptr)
1161{
1162 struct nlmsghdr *nlh;
1163 struct nfgenmsg *nfmsg;
1164 struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
1165 struct sk_buff *skb;
1166 unsigned int type;
1167 unsigned char *b;
1168 int flags = 0;
1170
1171 if (events & IPEXP_NEW) {
1172 type = IPCTNL_MSG_EXP_NEW;
1173 flags = NLM_F_CREATE|NLM_F_EXCL;
1174 } else
1175 return NOTIFY_DONE;
1176
1177 skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
1178 if (!skb)
1179 return NOTIFY_DONE;
1180
1181 b = skb->tail;
1182
1183 type |= NFNL_SUBSYS_CTNETLINK << 8;
1184 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
1185 nfmsg = NLMSG_DATA(nlh);
1186
1187 nlh->nlmsg_flags = flags;
1188 nfmsg->nfgen_family = AF_INET;
1189 nfmsg->version = NFNETLINK_V0;
1190 nfmsg->res_id = 0;
1191
1192 if (ctnetlink_exp_dump_expect(skb, exp) < 0)
1193 goto nfattr_failure;
1194
1195 nlh->nlmsg_len = skb->tail - b;
1197 nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
1198 return NOTIFY_DONE;
1199
1200nlmsg_failure:
1201nfattr_failure:
1202 kfree_skb(skb);
1203 return NOTIFY_DONE;
1204}
1205#endif
1206
1207static int
1208ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
1209{
1210 struct ip_conntrack_expect *exp = NULL;
1211 struct list_head *i;
1212 u_int32_t *id = (u_int32_t *) &cb->args[0];
1213
1214	DEBUGP("entered %s, last id=%u\n", __FUNCTION__, *id);
1215
1216 read_lock_bh(&ip_conntrack_lock);
1217 list_for_each_prev(i, &ip_conntrack_expect_list) {
1218 exp = (struct ip_conntrack_expect *) i;
1219 if (exp->id <= *id)
1220 continue;
1221 if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
1222 cb->nlh->nlmsg_seq,
1223 IPCTNL_MSG_EXP_NEW,
1224 1, exp) < 0)
1225 goto out;
1226 *id = exp->id;
1227 }
1228out:
1229 read_unlock_bh(&ip_conntrack_lock);
1230
1231	DEBUGP("leaving, last id=%u\n", *id);
1232
1233 return skb->len;
1234}
1235
1236static int
1237ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
1238 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1239{
1240 struct ip_conntrack_tuple tuple;
1241 struct ip_conntrack_expect *exp;
1242 struct sk_buff *skb2;
1243 int err = 0;
1244
1245 DEBUGP("entered %s\n", __FUNCTION__);
1246
1247 if (nlh->nlmsg_flags & NLM_F_DUMP) {
1248 struct nfgenmsg *msg = NLMSG_DATA(nlh);
1249 u32 rlen;
1250
1251 if (msg->nfgen_family != AF_INET)
1252 return -EAFNOSUPPORT;
1253
1254 if ((*errp = netlink_dump_start(ctnl, skb, nlh,
1255 ctnetlink_exp_dump_table,
1256 ctnetlink_done)) != 0)
1257 return -EINVAL;
1258 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
1259 if (rlen > skb->len)
1260 rlen = skb->len;
1261 skb_pull(skb, rlen);
1262 return 0;
1263 }
1264
1265 if (cda[CTA_EXPECT_MASTER-1])
1266 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
1267 else
1268 return -EINVAL;
1269
1270 if (err < 0)
1271 return err;
1272
1273 exp = ip_conntrack_expect_find_get(&tuple);
1274 if (!exp)
1275 return -ENOENT;
1276
1277 err = -ENOMEM;
1278 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1279 if (!skb2)
1280 goto out;
1281 NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
1282
1283 err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
1284 nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
1285 1, exp);
1286 if (err <= 0)
1287 goto out;
1288
1289 ip_conntrack_expect_put(exp);
1290
1291 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
1292 if (err < 0)
1293		return err; /* netlink_unicast() has already freed skb2 */
1294
1295 return err;
1296
1297out:
1298 ip_conntrack_expect_put(exp);
1300 if (skb2)
1301 kfree_skb(skb2);
1302 return err;
1303}
1304
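/* Delete expectations: a single one by tuple (optionally cross-checked
 * against CTA_EXPECT_ID), all expectations of one helper by name, or,
 * with no attribute given, every expectation in the list. */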
1305static int
1306ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
1307 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1308{
1309 struct ip_conntrack_expect *exp, *tmp;
1310 struct ip_conntrack_tuple tuple;
1311 struct ip_conntrack_helper *h;
1312 int err;
1313
1314 if (cda[CTA_EXPECT_TUPLE-1]) {
1315 /* delete a single expect by tuple */
1316 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1317 if (err < 0)
1318 return err;
1319
1320 /* bump usage count to 2 */
1321 exp = ip_conntrack_expect_find_get(&tuple);
1322 if (!exp)
1323 return -ENOENT;
1324
1325 if (cda[CTA_EXPECT_ID-1]) {
1326 u_int32_t id =
1327 *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
1328 if (exp->id != ntohl(id)) {
1329 ip_conntrack_expect_put(exp);
1330 return -ENOENT;
1331 }
1332 }
1333
1334 /* after list removal, usage count == 1 */
1335 ip_conntrack_unexpect_related(exp);
1336 /* have to put what we 'get' above.
1337 * after this line usage count == 0 */
1338 ip_conntrack_expect_put(exp);
1339 } else if (cda[CTA_EXPECT_HELP_NAME-1]) {
1340 char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
1341
1342 /* delete all expectations for this helper */
1343 write_lock_bh(&ip_conntrack_lock);
1344 h = __ip_conntrack_helper_find_byname(name);
1345 if (!h) {
1346 write_unlock_bh(&ip_conntrack_lock);
1347 return -EINVAL;
1348 }
1349 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
1350 list) {
1351 if (exp->master->helper == h
1352 && del_timer(&exp->timeout))
1353 __ip_ct_expect_unlink_destroy(exp);
1354 }
1355		write_unlock_bh(&ip_conntrack_lock);
1356 } else {
1357		/* This basically means we have to flush everything */
1358 write_lock_bh(&ip_conntrack_lock);
1359 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
1360 list) {
1361 if (del_timer(&exp->timeout))
1362 __ip_ct_expect_unlink_destroy(exp);
1363 }
1364 write_unlock_bh(&ip_conntrack_lock);
1365 }
1366
1367 return 0;
1368}

1369static int
1370ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
1371{
1372 return -EOPNOTSUPP;
1373}
1374
1375static int
1376ctnetlink_create_expect(struct nfattr *cda[])
1377{
1378 struct ip_conntrack_tuple tuple, mask, master_tuple;
1379 struct ip_conntrack_tuple_hash *h = NULL;
1380 struct ip_conntrack_expect *exp;
1381 struct ip_conntrack *ct;
1382 int err = 0;
1383
1384 DEBUGP("entered %s\n", __FUNCTION__);
1385
1386 /* caller guarantees that those three CTA_EXPECT_* exist */
1387 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1388 if (err < 0)
1389 return err;
1390 err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
1391 if (err < 0)
1392 return err;
1393 err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
1394 if (err < 0)
1395 return err;
1396
1397 /* Look for master conntrack of this expectation */
1398 h = ip_conntrack_find_get(&master_tuple, NULL);
1399 if (!h)
1400 return -ENOENT;
1401 ct = tuplehash_to_ctrack(h);
1402
1403 if (!ct->helper) {
1404 /* such conntrack hasn't got any helper, abort */
1405 err = -EINVAL;
1406 goto out;
1407 }
1408
1409 exp = ip_conntrack_expect_alloc(ct);
1410 if (!exp) {
1411 err = -ENOMEM;
1412 goto out;
1413 }
1414
1415 exp->expectfn = NULL;
1416 exp->master = ct;
1417 memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
1418 memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
1419
1420 err = ip_conntrack_expect_related(exp);
1421 ip_conntrack_expect_put(exp);
1422
1423out:
1424 ip_conntrack_put(tuplehash_to_ctrack(h));
1425 return err;
1426}
1427
1428static int
1429ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
1430 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1431{
1432 struct ip_conntrack_tuple tuple;
1433 struct ip_conntrack_expect *exp;
1434 int err = 0;
1435
1436 DEBUGP("entered %s\n", __FUNCTION__);
1437
1438 if (!cda[CTA_EXPECT_TUPLE-1]
1439 || !cda[CTA_EXPECT_MASK-1]
1440 || !cda[CTA_EXPECT_MASTER-1])
1441 return -EINVAL;
1442
1443 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1444 if (err < 0)
1445 return err;
1446
1447 write_lock_bh(&ip_conntrack_lock);
1448 exp = __ip_conntrack_expect_find(&tuple);
1449
1450 if (!exp) {
1451 write_unlock_bh(&ip_conntrack_lock);
1452 err = -ENOENT;
1453 if (nlh->nlmsg_flags & NLM_F_CREATE)
1454 err = ctnetlink_create_expect(cda);
1455 return err;
1456 }
1457
1458 err = -EEXIST;
1459 if (!(nlh->nlmsg_flags & NLM_F_EXCL))
1460 err = ctnetlink_change_expect(exp, cda);
1461 write_unlock_bh(&ip_conntrack_lock);
1462
1463 DEBUGP("leaving\n");
1464
1465 return err;
1466}
1467
1468#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1469static struct notifier_block ctnl_notifier = {
1470 .notifier_call = ctnetlink_conntrack_event,
1471};
1472
1473static struct notifier_block ctnl_notifier_exp = {
1474 .notifier_call = ctnetlink_expect_event,
1475};
1476#endif
1477
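/* Message type -> handler dispatch tables for the two nfnetlink
 * subsystems; every operation requires CAP_NET_ADMIN. */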
1478static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
1479 [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
1480 .attr_count = CTA_MAX,
1481 .cap_required = CAP_NET_ADMIN },
1482 [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
1483 .attr_count = CTA_MAX,
1484 .cap_required = CAP_NET_ADMIN },
1485 [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
1486 .attr_count = CTA_MAX,
1487 .cap_required = CAP_NET_ADMIN },
1488 [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
1489 .attr_count = CTA_MAX,
1490 .cap_required = CAP_NET_ADMIN },
1491};
1492
1493static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
1494 [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
1495 .attr_count = CTA_EXPECT_MAX,
1496 .cap_required = CAP_NET_ADMIN },
1497 [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
1498 .attr_count = CTA_EXPECT_MAX,
1499 .cap_required = CAP_NET_ADMIN },
1500 [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
1501 .attr_count = CTA_EXPECT_MAX,
1502 .cap_required = CAP_NET_ADMIN },
1503};
1504
1505static struct nfnetlink_subsystem ctnl_subsys = {
1506 .name = "conntrack",
1507 .subsys_id = NFNL_SUBSYS_CTNETLINK,
1508 .cb_count = IPCTNL_MSG_MAX,
1509 .cb = ctnl_cb,
1510};
1511
1512static struct nfnetlink_subsystem ctnl_exp_subsys = {
1513 .name = "conntrack_expect",
1514 .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
1515 .cb_count = IPCTNL_MSG_EXP_MAX,
1516 .cb = ctnl_exp_cb,
1517};
1518
1519static int __init ctnetlink_init(void)
1520{
1521 int ret;
1522
1523 printk("ctnetlink v%s: registering with nfnetlink.\n", version);
1524 ret = nfnetlink_subsys_register(&ctnl_subsys);
1525 if (ret < 0) {
1526 printk("ctnetlink_init: cannot register with nfnetlink.\n");
1527 goto err_out;
1528 }
1529
1530 ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
1531 if (ret < 0) {
1532 printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
1533 goto err_unreg_subsys;
1534 }
1535
1536#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1537 ret = ip_conntrack_register_notifier(&ctnl_notifier);
1538 if (ret < 0) {
1539 printk("ctnetlink_init: cannot register notifier.\n");
1540 goto err_unreg_exp_subsys;
1541 }
1542
1543 ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
1544 if (ret < 0) {
1545		printk("ctnetlink_init: cannot register expect notifier.\n");
1546 goto err_unreg_notifier;
1547 }
1548#endif
1549
1550 return 0;
1551
1552#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1553err_unreg_notifier:
1554 ip_conntrack_unregister_notifier(&ctnl_notifier);
1555err_unreg_exp_subsys:
1556 nfnetlink_subsys_unregister(&ctnl_exp_subsys);
1557#endif
1558err_unreg_subsys:
1559 nfnetlink_subsys_unregister(&ctnl_subsys);
1560err_out:
1561 return ret;
1562}
1563
1564static void __exit ctnetlink_exit(void)
1565{
1566 printk("ctnetlink: unregistering from nfnetlink.\n");
1567
1568#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1569	ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp);
1570 ip_conntrack_unregister_notifier(&ctnl_notifier);
1571#endif
1572
1573 nfnetlink_subsys_unregister(&ctnl_exp_subsys);
1574 nfnetlink_subsys_unregister(&ctnl_subsys);
1576}
1577
1578module_init(ctnetlink_init);
1579module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 602c74db3252..838d1d69b36e 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct,
 		ct->timeout.function((unsigned long)ct);
 	} else {
 		atomic_inc(&ct->proto.icmp.count);
+		ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
 		ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
 	}
 
 	return NF_ACCEPT;
 }
 
+static u_int8_t valid_new[] = {
+	[ICMP_ECHO] = 1,
+	[ICMP_TIMESTAMP] = 1,
+	[ICMP_INFO_REQUEST] = 1,
+	[ICMP_ADDRESS] = 1
+};
+
 /* Called when a new connection for this protocol found. */
 static int icmp_new(struct ip_conntrack *conntrack,
 		    const struct sk_buff *skb)
 {
-	static u_int8_t valid_new[]
-		= { [ICMP_ECHO] = 1,
-		    [ICMP_TIMESTAMP] = 1,
-		    [ICMP_INFO_REQUEST] = 1,
-		    [ICMP_ADDRESS] = 1 };
-
 	if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
 	    || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
 		/* Can't create a new ICMP `conn' with this. */
@@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb,
 		return NF_ACCEPT;
 	}
 
-	innerproto = ip_ct_find_proto(inside->ip.protocol);
+	innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
 	dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
 	/* Are they talking about one of our connections? */
 	if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
 		DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
+		ip_conntrack_proto_put(innerproto);
 		return NF_ACCEPT;
 	}
 
@@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb,
 	   been preserved inside the ICMP. */
 	if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
 		DEBUGP("icmp_error_track: Can't invert tuple\n");
+		ip_conntrack_proto_put(innerproto);
 		return NF_ACCEPT;
 	}
+	ip_conntrack_proto_put(innerproto);
 
 	*ctinfo = IP_CT_RELATED;
 
@@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
 	if (icmph == NULL) {
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 		if (!(u16)csum_fold(skb->csum))
 			break;
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: bad HW ICMP checksum ");
 		return -NF_ACCEPT;
 	case CHECKSUM_NONE:
 		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
 			if (LOG_INVALID(IPPROTO_ICMP))
-				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+				nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 					      "ip_ct_icmp: bad ICMP checksum ");
 			return -NF_ACCEPT;
 		}
@@ -249,7 +254,7 @@ checksum_skipped:
 	 */
 	if (icmph->type > NR_ICMP_TYPES) {
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: invalid ICMP type ");
 		return -NF_ACCEPT;
 	}
@@ -265,6 +270,47 @@ checksum_skipped:
 		return icmp_error_message(skb, ctinfo, hooknum);
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int icmp_tuple_to_nfattr(struct sk_buff *skb,
+				const struct ip_conntrack_tuple *t)
+{
+	NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
+		&t->src.u.icmp.id);
+	NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
+		&t->dst.u.icmp.type);
+	NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
+		&t->dst.u.icmp.code);
+
+	if (t->dst.u.icmp.type >= sizeof(valid_new)
+	    || !valid_new[t->dst.u.icmp.type])
+		return -EINVAL;
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static int icmp_nfattr_to_tuple(struct nfattr *tb[],
+				struct ip_conntrack_tuple *tuple)
+{
+	if (!tb[CTA_PROTO_ICMP_TYPE-1]
+	    || !tb[CTA_PROTO_ICMP_CODE-1]
+	    || !tb[CTA_PROTO_ICMP_ID-1])
+		return -1;
+
+	tuple->dst.u.icmp.type =
+		*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
+	tuple->dst.u.icmp.code =
+		*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
+	tuple->src.u.icmp.id =
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+
+	return 0;
+}
+#endif
+
 struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
 {
 	.proto			= IPPROTO_ICMP,
@@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
 	.packet			= icmp_packet,
 	.new			= icmp_new,
 	.error			= icmp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr	= icmp_tuple_to_nfattr,
+	.nfattr_to_tuple	= icmp_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index 31d75390bf12..a875f35e576d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 	}
 
 	conntrack->proto.sctp.state = newconntrack;
+	if (oldsctpstate != newconntrack)
+		ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
 	write_unlock_bh(&sctp_lock);
 	}
 
@@ -503,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
 	.packet		= sctp_packet,
 	.new		= sctp_new,
 	.destroy	= NULL,
-	.me		= THIS_MODULE
+	.me		= THIS_MODULE,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
 };
 
 #ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 809dfed766d4..f23ef1f88c46 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s,
 	return seq_printf(s, "%s ", tcp_conntrack_names[state]);
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
+			 const struct ip_conntrack *ct)
+{
+	read_lock_bh(&tcp_lock);
+	NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
+		&ct->proto.tcp.state);
+	read_unlock_bh(&tcp_lock);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#endif
+
 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
 {
 	if (tcph->rst) return TCP_RST_SET;
@@ -699,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 		res = 1;
 	} else {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: %s ",
 			    before(seq, sender->td_maxend + 1) ?
 			    after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -798,7 +815,7 @@ static int tcp_error(struct sk_buff *skb,
 				sizeof(_tcph), &_tcph);
 	if (th == NULL) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -806,7 +823,7 @@ static int tcp_error(struct sk_buff *skb,
 	/* Not whole TCP header or malformed packet */
 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: truncated/malformed packet ");
 		return -NF_ACCEPT;
 	}
@@ -823,7 +840,7 @@ static int tcp_error(struct sk_buff *skb,
 	      skb->ip_summed == CHECKSUM_HW ? skb->csum
 	      : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: bad TCP checksum ");
 		return -NF_ACCEPT;
 	}
@@ -832,7 +849,7 @@ static int tcp_error(struct sk_buff *skb,
 	tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
 	if (!tcp_valid_flags[tcpflags]) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: invalid TCP flag combination ");
 		return -NF_ACCEPT;
 	}
@@ -880,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 			 */
 			write_unlock_bh(&tcp_lock);
 			if (LOG_INVALID(IPPROTO_TCP))
 				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
-					"ip_ct_tcp: killing out of sync session ");
+					      NULL, "ip_ct_tcp: "
+					      "killing out of sync session ");
 			if (del_timer(&conntrack->timeout))
 				conntrack->timeout.function((unsigned long)
 							    conntrack);
@@ -895,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 
 		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_tcp: invalid packet ignored ");
 		return NF_ACCEPT;
 	case TCP_CONNTRACK_MAX:
@@ -905,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 			 old_state);
 		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_tcp: invalid state ");
 		return -NF_ACCEPT;
 	case TCP_CONNTRACK_SYN_SENT:
@@ -926,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 			write_unlock_bh(&tcp_lock);
 			if (LOG_INVALID(IPPROTO_TCP))
 				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
-					"ip_ct_tcp: invalid SYN");
+					      NULL, "ip_ct_tcp: invalid SYN");
 			return -NF_ACCEPT;
 		}
 	case TCP_CONNTRACK_CLOSE:
@@ -973,6 +991,10 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 	   ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
 	write_unlock_bh(&tcp_lock);
 
+	ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+	if (new_state != old_state)
+		ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
+
 	if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
 		/* If only reply is a RST, we can consider ourselves not to
 		   have an established connection: this is a fairly common
@@ -1096,4 +1118,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
 	.packet			= tcp_packet,
 	.new			= tcp_new,
 	.error			= tcp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.to_nfattr		= tcp_to_nfattr,
+	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 8c1eaba098d4..f2dcac7c7660 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack,
 		ip_ct_refresh_acct(conntrack, ctinfo, skb,
 				   ip_ct_udp_timeout_stream);
 		/* Also, more likely to be important, and not a probe */
-		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+			ip_conntrack_event_cache(IPCT_STATUS, skb);
 	} else
 		ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
 
@@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
 	if (hdr == NULL) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_udp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	/* Truncated/malformed packets */
 	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_udp: truncated/malformed packet ");
 		return -NF_ACCEPT;
 	}
@@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	      skb->ip_summed == CHECKSUM_HW ? skb->csum
 	      : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_udp: bad UDP checksum ");
 		return -NF_ACCEPT;
 	}
@@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp =
 	.packet			= udp_packet,
 	.new			= udp_new,
 	.error			= udp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 42dc95102873..ee5895afd0c3 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -5,7 +5,7 @@
 */
 
 /* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
@@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (DIRECTION(hash))
 		return 0;
 
-	proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
-			       .tuple.dst.protonum);
+	proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
 	IP_NF_ASSERT(proto);
 
 	if (seq_printf(s, "%-8s %u %ld ",
@@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		return -ENOSPC;
 
 #if defined(CONFIG_IP_NF_CONNTRACK_MARK)
-	if (seq_printf(s, "mark=%lu ", conntrack->mark))
+	if (seq_printf(s, "mark=%u ", conntrack->mark))
 		return -ENOSPC;
 #endif
 
@@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
 	seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
 
 	print_tuple(s, &expect->tuple,
-		    ip_ct_find_proto(expect->tuple.dst.protonum));
+		    __ip_conntrack_proto_find(expect->tuple.dst.protonum));
 	return seq_putc(s, '\n');
 }
 
@@ -432,6 +431,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
 					  const struct net_device *out,
 					  int (*okfn)(struct sk_buff *))
 {
+#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
+	/* Previously seen (loopback)?  Ignore.  Do this before
+	   fragment check. */
+	if ((*pskb)->nfct)
+		return NF_ACCEPT;
+#endif
+
 	/* Gather fragments. */
 	if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
 		*pskb = ip_ct_gather_frags(*pskb,
@@ -882,6 +888,7 @@ static int init_or_cleanup(int init)
 	return ret;
 
  cleanup:
+	synchronize_net();
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(ip_ct_sysctl_header);
  cleanup_localinops:
@@ -964,6 +971,14 @@ void need_ip_conntrack(void)
 {
 }
 
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+EXPORT_SYMBOL_GPL(ip_conntrack_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
+EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
+EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
+EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+#endif
 EXPORT_SYMBOL(ip_conntrack_protocol_register);
 EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
 EXPORT_SYMBOL(ip_ct_get_tuple);
@@ -975,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register);
 EXPORT_SYMBOL(ip_conntrack_helper_unregister);
 EXPORT_SYMBOL(ip_ct_iterate_cleanup);
 EXPORT_SYMBOL(ip_ct_refresh_acct);
-EXPORT_SYMBOL(ip_ct_protos);
-EXPORT_SYMBOL(ip_ct_find_proto);
+
 EXPORT_SYMBOL(ip_conntrack_expect_alloc);
-EXPORT_SYMBOL(ip_conntrack_expect_free);
+EXPORT_SYMBOL(ip_conntrack_expect_put);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get);
 EXPORT_SYMBOL(ip_conntrack_expect_related);
 EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
+EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
+EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy);
+
 EXPORT_SYMBOL(ip_conntrack_tuple_taken);
 EXPORT_SYMBOL(ip_ct_gather_frags);
 EXPORT_SYMBOL(ip_conntrack_htable_size);
@@ -988,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock);
 EXPORT_SYMBOL(ip_conntrack_hash);
 EXPORT_SYMBOL(ip_conntrack_untracked);
 EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
-EXPORT_SYMBOL_GPL(ip_conntrack_put);
 #ifdef CONFIG_IP_NF_NAT_NEEDED
 EXPORT_SYMBOL(ip_conntrack_tcp_update);
 #endif
+
+EXPORT_SYMBOL_GPL(ip_conntrack_flush);
+EXPORT_SYMBOL_GPL(__ip_conntrack_find);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
+EXPORT_SYMBOL_GPL(ip_conntrack_free);
+EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
+
+EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
+EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
+#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index 992fac3e36ee..f8ff170f390a 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -65,7 +65,7 @@ static int tftp_help(struct sk_buff **pskb,
65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
67 67
68 exp = ip_conntrack_expect_alloc(); 68 exp = ip_conntrack_expect_alloc(ct);
69 if (exp == NULL) 69 if (exp == NULL)
70 return NF_DROP; 70 return NF_DROP;
71 71
@@ -75,17 +75,15 @@ static int tftp_help(struct sk_buff **pskb,
75 exp->mask.dst.u.udp.port = 0xffff; 75 exp->mask.dst.u.udp.port = 0xffff;
76 exp->mask.dst.protonum = 0xff; 76 exp->mask.dst.protonum = 0xff;
77 exp->expectfn = NULL; 77 exp->expectfn = NULL;
78 exp->master = ct;
79 78
80 DEBUGP("expect: "); 79 DEBUGP("expect: ");
81 DUMP_TUPLE(&exp->tuple); 80 DUMP_TUPLE(&exp->tuple);
82 DUMP_TUPLE(&exp->mask); 81 DUMP_TUPLE(&exp->mask);
83 if (ip_nat_tftp_hook) 82 if (ip_nat_tftp_hook)
84 ret = ip_nat_tftp_hook(pskb, ctinfo, exp); 83 ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
85 else if (ip_conntrack_expect_related(exp) != 0) { 84 else if (ip_conntrack_expect_related(exp) != 0)
86 ip_conntrack_expect_free(exp);
87 ret = NF_DROP; 85 ret = NF_DROP;
88 } 86 ip_conntrack_expect_put(exp);
89 break; 87 break;
90 case TFTP_OPCODE_DATA: 88 case TFTP_OPCODE_DATA:
91 case TFTP_OPCODE_ACK: 89 case TFTP_OPCODE_ACK:
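
The tftp hunk above is the template for every helper touched in this series: expectations are now refcounted, so ip_conntrack_expect_free() disappears in favour of ip_conntrack_expect_put(), and the master conntrack is passed to the allocator instead of being assigned by hand. A minimal sketch of the resulting calling convention, assuming only the signatures exported earlier in this patch:

    /* Hedged sketch, not a real helper: the alloc/related/put pattern
     * the conversion above establishes. */
    static int example_expect(struct sk_buff **pskb,
                              struct ip_conntrack *ct,
                              enum ip_conntrack_info ctinfo)
    {
            struct ip_conntrack_expect *exp;
            int ret = NF_ACCEPT;

            exp = ip_conntrack_expect_alloc(ct);    /* master set here now */
            if (exp == NULL)
                    return NF_DROP;

            /* ... fill in exp->tuple, exp->mask, exp->expectfn ... */

            if (ip_conntrack_expect_related(exp) != 0)
                    ret = NF_DROP;

            /* Always drop our reference; on success the conntrack core
             * holds its own, so there is no explicit free on the error
             * path any more. */
            ip_conntrack_expect_put(exp);
            return ret;
    }
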
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
index da1f412583ed..706c8074f422 100644
--- a/net/ipv4/netfilter/ip_nat_amanda.c
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -56,10 +56,8 @@ static unsigned int help(struct sk_buff **pskb,
56 break; 56 break;
57 } 57 }
58 58
59 if (port == 0) { 59 if (port == 0)
60 ip_conntrack_expect_free(exp);
61 return NF_DROP; 60 return NF_DROP;
62 }
63 61
64 sprintf(buffer, "%u", port); 62 sprintf(buffer, "%u", port);
65 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, 63 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 739b6dde1c82..1adedb743f60 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock);
47static unsigned int ip_nat_htable_size; 47static unsigned int ip_nat_htable_size;
48 48
49static struct list_head *bysource; 49static struct list_head *bysource;
50
51#define MAX_IP_NAT_PROTO 256
50struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; 52struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
51 53
54static inline struct ip_nat_protocol *
55__ip_nat_proto_find(u_int8_t protonum)
56{
57 return ip_nat_protos[protonum];
58}
59
60struct ip_nat_protocol *
61ip_nat_proto_find_get(u_int8_t protonum)
62{
63 struct ip_nat_protocol *p;
64
65 /* we need to disable preemption to make sure 'p' doesn't get
66 * removed until we've grabbed the reference */
67 preempt_disable();
68 p = __ip_nat_proto_find(protonum);
69 if (p) {
70 if (!try_module_get(p->me))
71 p = &ip_nat_unknown_protocol;
72 }
73 preempt_enable();
74
75 return p;
76}
77
78void
79ip_nat_proto_put(struct ip_nat_protocol *p)
80{
81 module_put(p->me);
82}
52 83
53/* We keep an extra hash for each conntrack, for fast searching. */ 84/* We keep an extra hash for each conntrack, for fast searching. */
54static inline unsigned int 85static inline unsigned int
@@ -103,7 +134,8 @@ static int
103in_range(const struct ip_conntrack_tuple *tuple, 134in_range(const struct ip_conntrack_tuple *tuple,
104 const struct ip_nat_range *range) 135 const struct ip_nat_range *range)
105{ 136{
106 struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); 137 struct ip_nat_protocol *proto =
138 __ip_nat_proto_find(tuple->dst.protonum);
107 139
108 /* If we are supposed to map IPs, then we must be in the 140 /* If we are supposed to map IPs, then we must be in the
109 range specified, otherwise let this drag us onto a new src IP. */ 141 range specified, otherwise let this drag us onto a new src IP. */
@@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
216 struct ip_conntrack *conntrack, 248 struct ip_conntrack *conntrack,
217 enum ip_nat_manip_type maniptype) 249 enum ip_nat_manip_type maniptype)
218{ 250{
219 struct ip_nat_protocol *proto 251 struct ip_nat_protocol *proto;
220 = ip_nat_find_proto(orig_tuple->dst.protonum);
221 252
222 /* 1) If this srcip/proto/src-proto-part is currently mapped, 253 /* 1) If this srcip/proto/src-proto-part is currently mapped,
223 and that same mapping gives a unique tuple within the given 254 and that same mapping gives a unique tuple within the given
@@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
242 /* 3) The per-protocol part of the manip is made to map into 273 /* 3) The per-protocol part of the manip is made to map into
243 the range to make a unique tuple. */ 274 the range to make a unique tuple. */
244 275
276 proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
277
245 /* Only bother mapping if it's not already in range and unique */ 278 /* Only bother mapping if it's not already in range and unique */
246 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) 279 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
247 || proto->in_range(tuple, maniptype, &range->min, &range->max)) 280 || proto->in_range(tuple, maniptype, &range->min, &range->max))
248 && !ip_nat_used_tuple(tuple, conntrack)) 281 && !ip_nat_used_tuple(tuple, conntrack)) {
282 ip_nat_proto_put(proto);
249 return; 283 return;
284 }
250 285
 251 /* Last chance: get protocol to try to obtain unique tuple. */ 286
252 proto->unique_tuple(tuple, range, maniptype, conntrack); 287 proto->unique_tuple(tuple, range, maniptype, conntrack);
288
289 ip_nat_proto_put(proto);
253} 290}
254 291
255unsigned int 292unsigned int
@@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto,
320 enum ip_nat_manip_type maniptype) 357 enum ip_nat_manip_type maniptype)
321{ 358{
322 struct iphdr *iph; 359 struct iphdr *iph;
360 struct ip_nat_protocol *p;
323 361
324 (*pskb)->nfcache |= NFC_ALTERED; 362 if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
325 if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
326 return 0; 363 return 0;
327 364
328 iph = (void *)(*pskb)->data + iphdroff; 365 iph = (void *)(*pskb)->data + iphdroff;
329 366
 330 /* Manipulate protocol part. */ 367
331 if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, 368 p = ip_nat_proto_find_get(proto);
332 target, maniptype)) 369 if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
370 ip_nat_proto_put(p);
333 return 0; 371 return 0;
372 }
373 ip_nat_proto_put(p);
334 374
335 iph = (void *)(*pskb)->data + iphdroff; 375 iph = (void *)(*pskb)->data + iphdroff;
336 376
@@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
391 struct ip_conntrack_tuple inner, target; 431 struct ip_conntrack_tuple inner, target;
392 int hdrlen = (*pskb)->nh.iph->ihl * 4; 432 int hdrlen = (*pskb)->nh.iph->ihl * 4;
393 433
394 if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) 434 if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
395 return 0; 435 return 0;
396 436
397 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; 437 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb,
426 466
427 if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + 467 if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
428 sizeof(struct icmphdr) + inside->ip.ihl*4, 468 sizeof(struct icmphdr) + inside->ip.ihl*4,
429 &inner, ip_ct_find_proto(inside->ip.protocol))) 469 &inner,
470 __ip_conntrack_proto_find(inside->ip.protocol)))
430 return 0; 471 return 0;
431 472
432 /* Change inner back to look like incoming packet. We do the 473 /* Change inner back to look like incoming packet. We do the
@@ -496,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
496 synchronize_net(); 537 synchronize_net();
497} 538}
498 539
540#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
541 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
542int
543ip_nat_port_range_to_nfattr(struct sk_buff *skb,
544 const struct ip_nat_range *range)
545{
546 NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
547 &range->min.tcp.port);
548 NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
549 &range->max.tcp.port);
550
551 return 0;
552
553nfattr_failure:
554 return -1;
555}
556
557int
558ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
559{
560 int ret = 0;
561
562 /* we have to return whether we actually parsed something or not */
563
564 if (tb[CTA_PROTONAT_PORT_MIN-1]) {
565 ret = 1;
566 range->min.tcp.port =
567 *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
568 }
569
570 if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
571 if (ret)
572 range->max.tcp.port = range->min.tcp.port;
573 } else {
574 ret = 1;
575 range->max.tcp.port =
576 *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
577 }
578
579 return ret;
580}
581#endif
582
499int __init ip_nat_init(void) 583int __init ip_nat_init(void)
500{ 584{
501 size_t i; 585 size_t i;
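
Two things are worth noting about the lookup helpers introduced above: __ip_nat_proto_find() is the bare fast path for callers that already run in a context where the protocol cannot be unregistered underneath them, while ip_nat_proto_find_get() never returns NULL, falling back to ip_nat_unknown_protocol when try_module_get() fails. A sketch of the pairing discipline every caller must follow, assuming the signatures defined above:

    /* Hedged sketch: every find_get must be matched by a put on every
     * return path, because the reference pins the protocol module. */
    static int example_manip(struct sk_buff **pskb, u_int8_t protonum,
                             unsigned int iphdroff,
                             const struct ip_conntrack_tuple *target,
                             enum ip_nat_manip_type maniptype)
    {
            struct ip_nat_protocol *p = ip_nat_proto_find_get(protonum);
            int ok = p->manip_pkt(pskb, iphdroff, target, maniptype);

            ip_nat_proto_put(p);    /* paired put, success or failure */
            return ok;
    }
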
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
index c6000e794ad6..d83757a70d9f 100644
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -143,10 +143,8 @@ static unsigned int ip_nat_ftp(struct sk_buff **pskb,
143 break; 143 break;
144 } 144 }
145 145
146 if (port == 0) { 146 if (port == 0)
147 ip_conntrack_expect_free(exp);
148 return NF_DROP; 147 return NF_DROP;
149 }
150 148
151 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, 149 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
152 seq)) { 150 seq)) {
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 158f34f32c04..d2dd5d313556 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
168 struct tcphdr *tcph; 168 struct tcphdr *tcph;
169 int datalen; 169 int datalen;
170 170
171 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 171 if (!skb_make_writable(pskb, (*pskb)->len))
172 return 0; 172 return 0;
173 173
174 if (rep_len > match_len 174 if (rep_len > match_len
@@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
228 match_offset + match_len) 228 match_offset + match_len)
229 return 0; 229 return 0;
230 230
231 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 231 if (!skb_make_writable(pskb, (*pskb)->len))
232 return 0; 232 return 0;
233 233
234 if (rep_len > match_len 234 if (rep_len > match_len
@@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
315 optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); 315 optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
316 optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; 316 optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
317 317
318 if (!skb_ip_make_writable(pskb, optend)) 318 if (!skb_make_writable(pskb, optend))
319 return 0; 319 return 0;
320 320
321 dir = CTINFO2DIR(ctinfo); 321 dir = CTINFO2DIR(ctinfo);
@@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
363 this_way = &ct->nat.info.seq[dir]; 363 this_way = &ct->nat.info.seq[dir];
364 other_way = &ct->nat.info.seq[!dir]; 364 other_way = &ct->nat.info.seq[!dir];
365 365
366 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) 366 if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
367 return 0; 367 return 0;
368 368
369 tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; 369 tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
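
The rename from skb_ip_make_writable() to skb_make_writable() recurs throughout this series; the contract is unchanged. A sketch of the usage pattern, assuming the renamed helper behaves like its predecessor:

    /* Hedged sketch: make the first 'len' bytes writable (this may
     * unshare or reallocate the skb), then re-derive cached pointers. */
    static int example_mangle(struct sk_buff **pskb, unsigned int len)
    {
            struct iphdr *iph;

            if (!skb_make_writable(pskb, len))
                    return 0;                /* allocation failed: bail */

            iph = (*pskb)->nh.iph;           /* re-fetch after unshare */
            /* ... modify header bytes below 'len' ... */
            return 1;
    }
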
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
index 9c1ca3381d56..de31942babe3 100644
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -65,10 +65,8 @@ static unsigned int help(struct sk_buff **pskb,
65 break; 65 break;
66 } 66 }
67 67
68 if (port == 0) { 68 if (port == 0)
69 ip_conntrack_expect_free(exp);
70 return NF_DROP; 69 return NF_DROP;
71 }
72 70
73 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 71 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
74 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 72 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index a558cf0eee8a..938719043999 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -35,16 +35,17 @@ icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
35 const struct ip_conntrack *conntrack) 35 const struct ip_conntrack *conntrack)
36{ 36{
37 static u_int16_t id; 37 static u_int16_t id;
38 unsigned int range_size 38 unsigned int range_size;
39 = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
40 unsigned int i; 39 unsigned int i;
41 40
41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
42 /* If no range specified... */ 42 /* If no range specified... */
43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) 43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF; 44 range_size = 0xFFFF;
45 45
46 for (i = 0; i < range_size; i++, id++) { 46 for (i = 0; i < range_size; i++, id++) {
47 tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); 47 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
48 (id % range_size));
48 if (!ip_nat_used_tuple(tuple, conntrack)) 49 if (!ip_nat_used_tuple(tuple, conntrack))
49 return 1; 50 return 1;
50 } 51 }
@@ -61,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb,
61 struct icmphdr *hdr; 62 struct icmphdr *hdr;
62 unsigned int hdroff = iphdroff + iph->ihl*4; 63 unsigned int hdroff = iphdroff + iph->ihl*4;
63 64
64 if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) 65 if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
65 return 0; 66 return 0;
66 67
67 hdr = (struct icmphdr *)((*pskb)->data + hdroff); 68 hdr = (struct icmphdr *)((*pskb)->data + hdroff);
@@ -105,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range)
105 else return 0; 106 else return 0;
106} 107}
107 108
108struct ip_nat_protocol ip_nat_protocol_icmp 109struct ip_nat_protocol ip_nat_protocol_icmp = {
109= { "ICMP", IPPROTO_ICMP, 110 .name = "ICMP",
110 icmp_manip_pkt, 111 .protonum = IPPROTO_ICMP,
111 icmp_in_range, 112 .me = THIS_MODULE,
112 icmp_unique_tuple, 113 .manip_pkt = icmp_manip_pkt,
113 icmp_print, 114 .in_range = icmp_in_range,
114 icmp_print_range 115 .unique_tuple = icmp_unique_tuple,
116 .print = icmp_print,
117 .print_range = icmp_print_range,
118#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
119 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
120 .range_to_nfattr = ip_nat_port_range_to_nfattr,
121 .nfattr_to_range = ip_nat_port_nfattr_to_range,
122#endif
115}; 123};
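
The icmp_unique_tuple() change above is a byte-order fix, not a restructuring: range->min/max.icmp.id are stored in network byte order, so subtracting them directly yields a nonsense range size on little-endian hosts. A standalone demonstration (userspace C, little-endian host assumed):

    #include <stdio.h>
    #include <arpa/inet.h>

    int main(void)
    {
            unsigned short min = htons(1), max = htons(100);

            /* old computation: arithmetic on network-order values */
            unsigned int wrong = (unsigned int)max - min + 1;
            /* fixed computation: convert to host order first */
            unsigned int right = ntohs(max) - ntohs(min) + 1;

            printf("wrong=%u right=%u\n", wrong, right);
            return 0;       /* prints wrong=25345 right=100 on x86 */
    }
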
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a91cfceff272..1d381bf68574 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -12,6 +12,7 @@
12#include <linux/ip.h> 12#include <linux/ip.h>
13#include <linux/tcp.h> 13#include <linux/tcp.h>
14#include <linux/if.h> 14#include <linux/if.h>
15#include <linux/netfilter/nfnetlink_conntrack.h>
15#include <linux/netfilter_ipv4/ip_nat.h> 16#include <linux/netfilter_ipv4/ip_nat.h>
16#include <linux/netfilter_ipv4/ip_nat_rule.h> 17#include <linux/netfilter_ipv4/ip_nat_rule.h>
17#include <linux/netfilter_ipv4/ip_nat_protocol.h> 18#include <linux/netfilter_ipv4/ip_nat_protocol.h>
@@ -40,7 +41,8 @@ tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
40 enum ip_nat_manip_type maniptype, 41 enum ip_nat_manip_type maniptype,
41 const struct ip_conntrack *conntrack) 42 const struct ip_conntrack *conntrack)
42{ 43{
43 static u_int16_t port, *portptr; 44 static u_int16_t port;
45 u_int16_t *portptr;
44 unsigned int range_size, min, i; 46 unsigned int range_size, min, i;
45 47
46 if (maniptype == IP_NAT_MANIP_SRC) 48 if (maniptype == IP_NAT_MANIP_SRC)
@@ -101,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb,
101 if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) 103 if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
102 hdrsize = sizeof(struct tcphdr); 104 hdrsize = sizeof(struct tcphdr);
103 105
104 if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) 106 if (!skb_make_writable(pskb, hdroff + hdrsize))
105 return 0; 107 return 0;
106 108
107 iph = (struct iphdr *)((*pskb)->data + iphdroff); 109 iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -168,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range)
168 else return 0; 170 else return 0;
169} 171}
170 172
171struct ip_nat_protocol ip_nat_protocol_tcp 173struct ip_nat_protocol ip_nat_protocol_tcp = {
172= { "TCP", IPPROTO_TCP, 174 .name = "TCP",
173 tcp_manip_pkt, 175 .protonum = IPPROTO_TCP,
174 tcp_in_range, 176 .me = THIS_MODULE,
175 tcp_unique_tuple, 177 .manip_pkt = tcp_manip_pkt,
176 tcp_print, 178 .in_range = tcp_in_range,
177 tcp_print_range 179 .unique_tuple = tcp_unique_tuple,
180 .print = tcp_print,
181 .print_range = tcp_print_range,
182#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
183 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
184 .range_to_nfattr = ip_nat_port_range_to_nfattr,
185 .nfattr_to_range = ip_nat_port_nfattr_to_range,
186#endif
178}; 187};
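
The one-line split of "static u_int16_t port, *portptr;" above fixes a classic declarator pitfall: a storage-class specifier applies to every declarator in the list, so portptr had silently been static (shared between concurrent callers) when only the round-robin port counter was meant to persist. A standalone illustration:

    #include <stdio.h>

    static void pick(unsigned short *out)
    {
            static unsigned short port;   /* persistent counter: intended */
            unsigned short *portptr;      /* per-call scratch: keep auto  */

            portptr = out;
            *portptr = ++port;
    }

    int main(void)
    {
            unsigned short a, b;
            pick(&a);
            pick(&b);
            printf("%u %u\n", a, b);      /* prints 1 2 */
            return 0;
    }
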
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index c669e3b5f5d0..c4906e1aa24a 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -41,7 +41,8 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple,
41 enum ip_nat_manip_type maniptype, 41 enum ip_nat_manip_type maniptype,
42 const struct ip_conntrack *conntrack) 42 const struct ip_conntrack *conntrack)
43{ 43{
44 static u_int16_t port, *portptr; 44 static u_int16_t port;
45 u_int16_t *portptr;
45 unsigned int range_size, min, i; 46 unsigned int range_size, min, i;
46 47
47 if (maniptype == IP_NAT_MANIP_SRC) 48 if (maniptype == IP_NAT_MANIP_SRC)
@@ -93,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb,
93 u32 oldip, newip; 94 u32 oldip, newip;
94 u16 *portptr, newport; 95 u16 *portptr, newport;
95 96
96 if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) 97 if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
97 return 0; 98 return 0;
98 99
99 iph = (struct iphdr *)((*pskb)->data + iphdroff); 100 iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -155,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range)
155 else return 0; 156 else return 0;
156} 157}
157 158
158struct ip_nat_protocol ip_nat_protocol_udp 159struct ip_nat_protocol ip_nat_protocol_udp = {
159= { "UDP", IPPROTO_UDP, 160 .name = "UDP",
160 udp_manip_pkt, 161 .protonum = IPPROTO_UDP,
161 udp_in_range, 162 .me = THIS_MODULE,
162 udp_unique_tuple, 163 .manip_pkt = udp_manip_pkt,
163 udp_print, 164 .in_range = udp_in_range,
164 udp_print_range 165 .unique_tuple = udp_unique_tuple,
166 .print = udp_print,
167 .print_range = udp_print_range,
168#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
169 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
170 .range_to_nfattr = ip_nat_port_range_to_nfattr,
171 .nfattr_to_range = ip_nat_port_nfattr_to_range,
172#endif
165}; 173};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index f5525bd58d16..99bbef56f84e 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
61} 61}
62 62
63struct ip_nat_protocol ip_nat_unknown_protocol = { 63struct ip_nat_protocol ip_nat_unknown_protocol = {
64 "unknown", 0, 64 .name = "unknown",
65 unknown_manip_pkt, 65 .me = THIS_MODULE,
66 unknown_in_range, 66 .manip_pkt = unknown_manip_pkt,
67 unknown_unique_tuple, 67 .in_range = unknown_in_range,
68 unknown_print, 68 .unique_tuple = unknown_unique_tuple,
69 unknown_print_range 69 .print = unknown_print,
70 .print_range = unknown_print_range
70}; 71};
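
All four ip_nat_protocol definitions in this series move from positional to C99 designated initializers, which is what makes it safe to grow the struct with .me and the conditional netlink callbacks: fields bind by name and anything unnamed is zeroed. A standalone illustration with a hypothetical struct:

    #include <stdio.h>

    struct proto {
            const char *name;
            unsigned int protonum;
            void *me;                     /* field added later */
            int (*in_range)(int);
    };

    static int always(int x) { (void)x; return 1; }

    static struct proto p = {
            .name     = "demo",
            .protonum = 6,
            .in_range = always,           /* unaffected by the new field */
    };

    int main(void)
    {
            printf("%s %u me=%p\n", p.name, p.protonum, p.me);
            return 0;                     /* me stays NULL */
    }
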
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 2a48b6e635ae..93b2c5111bb2 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb,
1275 return NF_DROP; 1275 return NF_DROP;
1276 } 1276 }
1277 1277
1278 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 1278 if (!skb_make_writable(pskb, (*pskb)->len))
1279 return NF_DROP; 1279 return NF_DROP;
1280 1280
1281 spin_lock_bh(&snmp_lock); 1281 spin_lock_bh(&snmp_lock);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index bc59d0d6e89e..89db052add81 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum,
73 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off 73 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
74 & htons(IP_MF|IP_OFFSET))); 74 & htons(IP_MF|IP_OFFSET)));
75 75
76 (*pskb)->nfcache |= NFC_UNKNOWN;
77
78 /* If we had a hardware checksum before, it's now invalid */ 76 /* If we had a hardware checksum before, it's now invalid */
79 if ((*pskb)->ip_summed == CHECKSUM_HW) 77 if ((*pskb)->ip_summed == CHECKSUM_HW)
80 if (skb_checksum_help(*pskb, (out == NULL))) 78 if (skb_checksum_help(*pskb, (out == NULL)))
@@ -102,6 +100,10 @@ ip_nat_fn(unsigned int hooknum,
102 return NF_ACCEPT; 100 return NF_ACCEPT;
103 } 101 }
104 102
103 /* Don't try to NAT if this packet is not conntracked */
104 if (ct == &ip_conntrack_untracked)
105 return NF_ACCEPT;
106
105 switch (ctinfo) { 107 switch (ctinfo) {
106 case IP_CT_RELATED: 108 case IP_CT_RELATED:
107 case IP_CT_RELATED+IP_CT_IS_REPLY: 109 case IP_CT_RELATED+IP_CT_IS_REPLY:
@@ -392,6 +394,8 @@ module_exit(fini);
392EXPORT_SYMBOL(ip_nat_setup_info); 394EXPORT_SYMBOL(ip_nat_setup_info);
393EXPORT_SYMBOL(ip_nat_protocol_register); 395EXPORT_SYMBOL(ip_nat_protocol_register);
394EXPORT_SYMBOL(ip_nat_protocol_unregister); 396EXPORT_SYMBOL(ip_nat_protocol_unregister);
397EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
398EXPORT_SYMBOL_GPL(ip_nat_proto_put);
395EXPORT_SYMBOL(ip_nat_cheat_check); 399EXPORT_SYMBOL(ip_nat_cheat_check);
396EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); 400EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
397EXPORT_SYMBOL(ip_nat_mangle_udp_packet); 401EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
index 0343e0d64674..2215317c76b7 100644
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -45,10 +45,8 @@ static unsigned int help(struct sk_buff **pskb,
45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; 45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_REPLY; 46 exp->dir = IP_CT_DIR_REPLY;
47 exp->expectfn = ip_nat_follow_master; 47 exp->expectfn = ip_nat_follow_master;
48 if (ip_conntrack_expect_related(exp) != 0) { 48 if (ip_conntrack_expect_related(exp) != 0)
49 ip_conntrack_expect_free(exp);
50 return NF_DROP; 49 return NF_DROP;
51 }
52 return NF_ACCEPT; 50 return NF_ACCEPT;
53} 51}
54 52
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index eda1fba431a4..d54f14d926f6 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -43,17 +43,10 @@
43#define NET_IPQ_QMAX 2088 43#define NET_IPQ_QMAX 2088
44#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" 44#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
45 45
46struct ipq_rt_info {
47 __u8 tos;
48 __u32 daddr;
49 __u32 saddr;
50};
51
52struct ipq_queue_entry { 46struct ipq_queue_entry {
53 struct list_head list; 47 struct list_head list;
54 struct nf_info *info; 48 struct nf_info *info;
55 struct sk_buff *skb; 49 struct sk_buff *skb;
56 struct ipq_rt_info rt_info;
57}; 50};
58 51
59typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); 52typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -214,6 +207,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
214 break; 207 break;
215 208
216 case IPQ_COPY_PACKET: 209 case IPQ_COPY_PACKET:
210 if (entry->skb->ip_summed == CHECKSUM_HW &&
211 (*errp = skb_checksum_help(entry->skb,
212 entry->info->outdev == NULL))) {
213 read_unlock_bh(&queue_lock);
214 return NULL;
215 }
217 if (copy_range == 0 || copy_range > entry->skb->len) 216 if (copy_range == 0 || copy_range > entry->skb->len)
218 data_len = entry->skb->len; 217 data_len = entry->skb->len;
219 else 218 else
@@ -241,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
241 240
242 pmsg->packet_id = (unsigned long )entry; 241 pmsg->packet_id = (unsigned long )entry;
243 pmsg->data_len = data_len; 242 pmsg->data_len = data_len;
244 pmsg->timestamp_sec = entry->skb->stamp.tv_sec; 243 pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
245 pmsg->timestamp_usec = entry->skb->stamp.tv_usec; 244 pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
246 pmsg->mark = entry->skb->nfmark; 245 pmsg->mark = entry->skb->nfmark;
247 pmsg->hook = entry->info->hook; 246 pmsg->hook = entry->info->hook;
248 pmsg->hw_protocol = entry->skb->protocol; 247 pmsg->hw_protocol = entry->skb->protocol;
@@ -281,7 +280,8 @@ nlmsg_failure:
281} 280}
282 281
283static int 282static int
284ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) 283ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
284 unsigned int queuenum, void *data)
285{ 285{
286 int status = -EINVAL; 286 int status = -EINVAL;
287 struct sk_buff *nskb; 287 struct sk_buff *nskb;
@@ -299,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
299 entry->info = info; 299 entry->info = info;
300 entry->skb = skb; 300 entry->skb = skb;
301 301
302 if (entry->info->hook == NF_IP_LOCAL_OUT) {
303 struct iphdr *iph = skb->nh.iph;
304
305 entry->rt_info.tos = iph->tos;
306 entry->rt_info.daddr = iph->daddr;
307 entry->rt_info.saddr = iph->saddr;
308 }
309
310 nskb = ipq_build_packet_message(entry, &status); 302 nskb = ipq_build_packet_message(entry, &status);
311 if (nskb == NULL) 303 if (nskb == NULL)
312 goto err_out_free; 304 goto err_out_free;
@@ -382,23 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
382 } 374 }
383 skb_put(e->skb, diff); 375 skb_put(e->skb, diff);
384 } 376 }
385 if (!skb_ip_make_writable(&e->skb, v->data_len)) 377 if (!skb_make_writable(&e->skb, v->data_len))
386 return -ENOMEM; 378 return -ENOMEM;
387 memcpy(e->skb->data, v->payload, v->data_len); 379 memcpy(e->skb->data, v->payload, v->data_len);
388 e->skb->nfcache |= NFC_ALTERED; 380 e->skb->ip_summed = CHECKSUM_NONE;
389 381
390 /*
391 * Extra routing may needed on local out, as the QUEUE target never
392 * returns control to the table.
393 */
394 if (e->info->hook == NF_IP_LOCAL_OUT) {
395 struct iphdr *iph = e->skb->nh.iph;
396
397 if (!(iph->tos == e->rt_info.tos
398 && iph->daddr == e->rt_info.daddr
399 && iph->saddr == e->rt_info.saddr))
400 return ip_route_me_harder(&e->skb);
401 }
402 return 0; 382 return 0;
403} 383}
404 384
@@ -676,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
676} 656}
677#endif /* CONFIG_PROC_FS */ 657#endif /* CONFIG_PROC_FS */
678 658
659static struct nf_queue_handler nfqh = {
660 .name = "ip_queue",
661 .outfn = &ipq_enqueue_packet,
662};
663
679static int 664static int
680init_or_cleanup(int init) 665init_or_cleanup(int init)
681{ 666{
@@ -686,7 +671,8 @@ init_or_cleanup(int init)
686 goto cleanup; 671 goto cleanup;
687 672
688 netlink_register_notifier(&ipq_nl_notifier); 673 netlink_register_notifier(&ipq_nl_notifier);
689 ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); 674 ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
675 THIS_MODULE);
690 if (ipqnl == NULL) { 676 if (ipqnl == NULL) {
691 printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); 677 printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
692 goto cleanup_netlink_notifier; 678 goto cleanup_netlink_notifier;
@@ -703,7 +689,7 @@ init_or_cleanup(int init)
703 register_netdevice_notifier(&ipq_dev_notifier); 689 register_netdevice_notifier(&ipq_dev_notifier);
704 ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); 690 ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
705 691
706 status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); 692 status = nf_register_queue_handler(PF_INET, &nfqh);
707 if (status < 0) { 693 if (status < 0) {
708 printk(KERN_ERR "ip_queue: failed to register queue handler\n"); 694 printk(KERN_ERR "ip_queue: failed to register queue handler\n");
709 goto cleanup_sysctl; 695 goto cleanup_sysctl;
@@ -711,7 +697,7 @@ init_or_cleanup(int init)
711 return status; 697 return status;
712 698
713cleanup: 699cleanup:
714 nf_unregister_queue_handler(PF_INET); 700 nf_unregister_queue_handlers(&nfqh);
715 synchronize_net(); 701 synchronize_net();
716 ipq_flush(NF_DROP); 702 ipq_flush(NF_DROP);
717 703
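
ip_queue is converted from a bare function-pointer registration to the new struct nf_queue_handler interface, and the enqueue callback grows a queuenum argument so a single handler can serve several queues. A hedged sketch of the new convention; the handler name and callback body here are placeholders, not part of the patch:

    static int example_outfn(struct sk_buff *skb, struct nf_info *info,
                             unsigned int queuenum, void *data)
    {
            /* queue the packet for queue 'queuenum', or reinject it */
            return 0;
    }

    static struct nf_queue_handler example_qh = {
            .name  = "example_queue",     /* hypothetical */
            .outfn = &example_outfn,
    };

    static int example_register(void)
    {
            return nf_register_queue_handler(PF_INET, &example_qh);
    }

    static void example_unregister(void)
    {
            /* unregistration is now by handler rather than by family,
             * as the plural name suggests */
            nf_unregister_queue_handlers(&example_qh);
            synchronize_net();
    }
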
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c88dfcd38c56..eef99a1b5de6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb,
312 do { 312 do {
313 IP_NF_ASSERT(e); 313 IP_NF_ASSERT(e);
314 IP_NF_ASSERT(back); 314 IP_NF_ASSERT(back);
315 (*pskb)->nfcache |= e->nfcache;
316 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { 315 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
317 struct ipt_entry_target *t; 316 struct ipt_entry_target *t;
318 317
@@ -341,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb,
341 back->comefrom); 340 back->comefrom);
342 continue; 341 continue;
343 } 342 }
344 if (table_base + v 343 if (table_base + v != (void *)e + e->next_offset
345 != (void *)e + e->next_offset) { 344 && !(e->ip.flags & IPT_F_GOTO)) {
346 /* Save old back ptr in next entry */ 345 /* Save old back ptr in next entry */
347 struct ipt_entry *next 346 struct ipt_entry *next
348 = (void *)e + e->next_offset; 347 = (void *)e + e->next_offset;
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
index 9842e6e23184..dab78d8bd494 100644
--- a/net/ipv4/netfilter/ipt_CLASSIFY.c
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -32,10 +32,8 @@ target(struct sk_buff **pskb,
32{ 32{
33 const struct ipt_classify_target_info *clinfo = targinfo; 33 const struct ipt_classify_target_info *clinfo = targinfo;
34 34
35 if((*pskb)->priority != clinfo->priority) { 35 if((*pskb)->priority != clinfo->priority)
36 (*pskb)->priority = clinfo->priority; 36 (*pskb)->priority = clinfo->priority;
37 (*pskb)->nfcache |= NFC_ALTERED;
38 }
39 37
40 return IPT_CONTINUE; 38 return IPT_CONTINUE;
41} 39}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6706d3a1bc4f..7d38913754b1 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -144,7 +144,7 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
144 memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); 144 memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
145 c->num_total_nodes = i->num_total_nodes; 145 c->num_total_nodes = i->num_total_nodes;
146 c->num_local_nodes = i->num_local_nodes; 146 c->num_local_nodes = i->num_local_nodes;
147 memcpy(&c->local_nodes, &i->local_nodes, sizeof(&c->local_nodes)); 147 memcpy(&c->local_nodes, &i->local_nodes, sizeof(c->local_nodes));
148 c->hash_mode = i->hash_mode; 148 c->hash_mode = i->hash_mode;
149 c->hash_initval = i->hash_initval; 149 c->hash_initval = i->hash_initval;
150 atomic_set(&c->refcount, 1); 150 atomic_set(&c->refcount, 1);
@@ -367,7 +367,7 @@ target(struct sk_buff **pskb,
367#ifdef DEBUG_CLUSTERP 367#ifdef DEBUG_CLUSTERP
368 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 368 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
369#endif 369#endif
370 DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); 370 DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
371 if (!clusterip_responsible(cipinfo->config, hash)) { 371 if (!clusterip_responsible(cipinfo->config, hash)) {
372 DEBUGP("not responsible\n"); 372 DEBUGP("not responsible\n");
373 return NF_DROP; 373 return NF_DROP;
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 30ddd3e18eb7..134638021339 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -40,9 +40,9 @@ target(struct sk_buff **pskb,
40 void *userinfo) 40 void *userinfo)
41{ 41{
42 const struct ipt_connmark_target_info *markinfo = targinfo; 42 const struct ipt_connmark_target_info *markinfo = targinfo;
43 unsigned long diff; 43 u_int32_t diff;
44 unsigned long nfmark; 44 u_int32_t nfmark;
45 unsigned long newmark; 45 u_int32_t newmark;
46 46
47 enum ip_conntrack_info ctinfo; 47 enum ip_conntrack_info ctinfo;
48 struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); 48 struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
@@ -61,10 +61,8 @@ target(struct sk_buff **pskb,
61 case IPT_CONNMARK_RESTORE: 61 case IPT_CONNMARK_RESTORE:
62 nfmark = (*pskb)->nfmark; 62 nfmark = (*pskb)->nfmark;
63 diff = (ct->mark ^ nfmark) & markinfo->mask; 63 diff = (ct->mark ^ nfmark) & markinfo->mask;
64 if (diff != 0) { 64 if (diff != 0)
65 (*pskb)->nfmark = nfmark ^ diff; 65 (*pskb)->nfmark = nfmark ^ diff;
66 (*pskb)->nfcache |= NFC_ALTERED;
67 }
68 break; 66 break;
69 } 67 }
70 } 68 }
@@ -94,6 +92,11 @@ checkentry(const char *tablename,
94 } 92 }
95 } 93 }
96 94
95 if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
96 printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
97 return 0;
98 }
99
97 return 1; 100 return 1;
98} 101}
99 102
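
The new checkentry test above exists because the mark in the userspace-visible info struct is an unsigned long, which is 64 bits on many hosts, while skb->nfmark and ct->mark are 32 bits after this patch, so oversized values must be rejected at rule-load time. A standalone illustration (unsigned long long is used so the demo behaves the same on 32-bit hosts):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long mark = 0x1ffffffffULL;  /* needs 33 bits */

            if (mark > 0xffffffffULL)
                    printf("rejected: only 32bit mark supported\n");
            else
                    printf("accepted: 0x%llx\n", mark);
            return 0;
    }
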
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
index 3ea4509099f9..6e319570a28c 100644
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -39,7 +39,7 @@ target(struct sk_buff **pskb,
39 if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { 39 if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
40 u_int16_t diffs[2]; 40 u_int16_t diffs[2];
41 41
42 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) 42 if (!skb_make_writable(pskb, sizeof(struct iphdr)))
43 return NF_DROP; 43 return NF_DROP;
44 44
45 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; 45 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -51,7 +51,6 @@ target(struct sk_buff **pskb,
51 sizeof(diffs), 51 sizeof(diffs),
52 (*pskb)->nh.iph->check 52 (*pskb)->nh.iph->check
53 ^ 0xFFFF)); 53 ^ 0xFFFF));
54 (*pskb)->nfcache |= NFC_ALTERED;
55 } 54 }
56 return IPT_CONTINUE; 55 return IPT_CONTINUE;
57} 56}
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index ada9911118e9..a1319693f648 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
31 != (einfo->ip_ect & IPT_ECN_IP_MASK)) { 31 != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
32 u_int16_t diffs[2]; 32 u_int16_t diffs[2];
33 33
34 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) 34 if (!skb_make_writable(pskb, sizeof(struct iphdr)))
35 return 0; 35 return 0;
36 36
37 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; 37 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
43 sizeof(diffs), 43 sizeof(diffs),
44 (*pskb)->nh.iph->check 44 (*pskb)->nh.iph->check
45 ^0xFFFF)); 45 ^0xFFFF));
46 (*pskb)->nfcache |= NFC_ALTERED;
47 } 46 }
48 return 1; 47 return 1;
49} 48}
@@ -61,16 +60,20 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
61 if (!tcph) 60 if (!tcph)
62 return 0; 61 return 0;
63 62
64 if (!(einfo->operation & IPT_ECN_OP_SET_ECE 63 if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
65 || tcph->ece == einfo->proto.tcp.ece) 64 tcph->ece == einfo->proto.tcp.ece) &&
66 && (!(einfo->operation & IPT_ECN_OP_SET_CWR 65 ((!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
67 || tcph->cwr == einfo->proto.tcp.cwr))) 66 tcph->cwr == einfo->proto.tcp.cwr)))
68 return 1; 67 return 1;
69 68
70 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) 69 if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
71 return 0; 70 return 0;
72 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; 71 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
73 72
73 if ((*pskb)->ip_summed == CHECKSUM_HW &&
74 skb_checksum_help(*pskb, inward))
75 return 0;
76
74 diffs[0] = ((u_int16_t *)tcph)[6]; 77 diffs[0] = ((u_int16_t *)tcph)[6];
75 if (einfo->operation & IPT_ECN_OP_SET_ECE) 78 if (einfo->operation & IPT_ECN_OP_SET_ECE)
76 tcph->ece = einfo->proto.tcp.ece; 79 tcph->ece = einfo->proto.tcp.ece;
@@ -79,14 +82,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
79 diffs[1] = ((u_int16_t *)tcph)[6]; 82 diffs[1] = ((u_int16_t *)tcph)[6];
80 diffs[0] = diffs[0] ^ 0xFFFF; 83 diffs[0] = diffs[0] ^ 0xFFFF;
81 84
82 if ((*pskb)->ip_summed != CHECKSUM_HW) 85 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY)
83 tcph->check = csum_fold(csum_partial((char *)diffs, 86 tcph->check = csum_fold(csum_partial((char *)diffs,
84 sizeof(diffs), 87 sizeof(diffs),
85 tcph->check^0xFFFF)); 88 tcph->check^0xFFFF));
86 else
87 if (skb_checksum_help(*pskb, inward))
88 return 0;
89 (*pskb)->nfcache |= NFC_ALTERED;
90 return 1; 89 return 1;
91} 90}
92 91
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ef08733d26da..92ed050fac69 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -27,10 +27,6 @@ MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
28MODULE_DESCRIPTION("iptables syslog logging module"); 28MODULE_DESCRIPTION("iptables syslog logging module");
29 29
30static unsigned int nflog = 1;
31module_param(nflog, int, 0400);
32MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
33
34#if 0 30#if 0
35#define DEBUGP printk 31#define DEBUGP printk
36#else 32#else
@@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
41static DEFINE_SPINLOCK(log_lock); 37static DEFINE_SPINLOCK(log_lock);
42 38
43/* One level of recursion won't kill us */ 39/* One level of recursion won't kill us */
44static void dump_packet(const struct ipt_log_info *info, 40static void dump_packet(const struct nf_loginfo *info,
45 const struct sk_buff *skb, 41 const struct sk_buff *skb,
46 unsigned int iphoff) 42 unsigned int iphoff)
47{ 43{
48 struct iphdr _iph, *ih; 44 struct iphdr _iph, *ih;
45 unsigned int logflags;
46
47 if (info->type == NF_LOG_TYPE_LOG)
48 logflags = info->u.log.logflags;
49 else
50 logflags = NF_LOG_MASK;
49 51
50 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); 52 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
51 if (ih == NULL) { 53 if (ih == NULL) {
@@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info,
76 if (ntohs(ih->frag_off) & IP_OFFSET) 78 if (ntohs(ih->frag_off) & IP_OFFSET)
77 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 79 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
78 80
79 if ((info->logflags & IPT_LOG_IPOPT) 81 if ((logflags & IPT_LOG_IPOPT)
80 && ih->ihl * 4 > sizeof(struct iphdr)) { 82 && ih->ihl * 4 > sizeof(struct iphdr)) {
81 unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; 83 unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
82 unsigned int i, optsize; 84 unsigned int i, optsize;
@@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info,
119 printk("SPT=%u DPT=%u ", 121 printk("SPT=%u DPT=%u ",
120 ntohs(th->source), ntohs(th->dest)); 122 ntohs(th->source), ntohs(th->dest));
121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 123 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
122 if (info->logflags & IPT_LOG_TCPSEQ) 124 if (logflags & IPT_LOG_TCPSEQ)
123 printk("SEQ=%u ACK=%u ", 125 printk("SEQ=%u ACK=%u ",
124 ntohl(th->seq), ntohl(th->ack_seq)); 126 ntohl(th->seq), ntohl(th->ack_seq));
125 /* Max length: 13 "WINDOW=65535 " */ 127 /* Max length: 13 "WINDOW=65535 " */
@@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info,
146 /* Max length: 11 "URGP=65535 " */ 148 /* Max length: 11 "URGP=65535 " */
147 printk("URGP=%u ", ntohs(th->urg_ptr)); 149 printk("URGP=%u ", ntohs(th->urg_ptr));
148 150
149 if ((info->logflags & IPT_LOG_TCPOPT) 151 if ((logflags & IPT_LOG_TCPOPT)
150 && th->doff * 4 > sizeof(struct tcphdr)) { 152 && th->doff * 4 > sizeof(struct tcphdr)) {
151 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; 153 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
152 unsigned char *op; 154 unsigned char *op;
@@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info,
328 } 330 }
329 331
330 /* Max length: 15 "UID=4294967295 " */ 332 /* Max length: 15 "UID=4294967295 " */
331 if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { 333 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
332 read_lock_bh(&skb->sk->sk_callback_lock); 334 read_lock_bh(&skb->sk->sk_callback_lock);
333 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 335 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
334 printk("UID=%u ", skb->sk->sk_socket->file->f_uid); 336 printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info,
349 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 351 /* maxlen = 230+ 91 + 230 + 252 = 803 */
350} 352}
351 353
354struct nf_loginfo default_loginfo = {
355 .type = NF_LOG_TYPE_LOG,
356 .u = {
357 .log = {
358 .level = 0,
359 .logflags = NF_LOG_MASK,
360 },
361 },
362};
363
352static void 364static void
353ipt_log_packet(unsigned int hooknum, 365ipt_log_packet(unsigned int pf,
366 unsigned int hooknum,
354 const struct sk_buff *skb, 367 const struct sk_buff *skb,
355 const struct net_device *in, 368 const struct net_device *in,
356 const struct net_device *out, 369 const struct net_device *out,
357 const struct ipt_log_info *loginfo, 370 const struct nf_loginfo *loginfo,
358 const char *level_string,
359 const char *prefix) 371 const char *prefix)
360{ 372{
373 if (!loginfo)
374 loginfo = &default_loginfo;
375
361 spin_lock_bh(&log_lock); 376 spin_lock_bh(&log_lock);
362 printk(level_string); 377 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
363 printk("%sIN=%s OUT=%s ", 378 prefix,
364 prefix == NULL ? loginfo->prefix : prefix,
365 in ? in->name : "", 379 in ? in->name : "",
366 out ? out->name : ""); 380 out ? out->name : "");
367#ifdef CONFIG_BRIDGE_NETFILTER 381#ifdef CONFIG_BRIDGE_NETFILTER
@@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb,
405 void *userinfo) 419 void *userinfo)
406{ 420{
407 const struct ipt_log_info *loginfo = targinfo; 421 const struct ipt_log_info *loginfo = targinfo;
408 char level_string[4] = "< >"; 422 struct nf_loginfo li;
409 423
410 level_string[1] = '0' + (loginfo->level % 8); 424 li.type = NF_LOG_TYPE_LOG;
411 ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); 425 li.u.log.level = loginfo->level;
426 li.u.log.logflags = loginfo->logflags;
412 427
413 return IPT_CONTINUE; 428 nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix);
414}
415 429
416static void 430 return IPT_CONTINUE;
417ipt_logfn(unsigned int hooknum,
418 const struct sk_buff *skb,
419 const struct net_device *in,
420 const struct net_device *out,
421 const char *prefix)
422{
423 struct ipt_log_info loginfo = {
424 .level = 0,
425 .logflags = IPT_LOG_MASK,
426 .prefix = ""
427 };
428
429 ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
430} 431}
431 432
432static int ipt_log_checkentry(const char *tablename, 433static int ipt_log_checkentry(const char *tablename,
@@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = {
464 .me = THIS_MODULE, 465 .me = THIS_MODULE,
465}; 466};
466 467
468static struct nf_logger ipt_log_logger ={
469 .name = "ipt_LOG",
470 .logfn = &ipt_log_packet,
471 .me = THIS_MODULE,
472};
473
467static int __init init(void) 474static int __init init(void)
468{ 475{
469 if (ipt_register_target(&ipt_log_reg)) 476 if (ipt_register_target(&ipt_log_reg))
470 return -EINVAL; 477 return -EINVAL;
471 if (nflog) 478 if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
472 nf_log_register(PF_INET, &ipt_logfn); 479 printk(KERN_WARNING "ipt_LOG: not logging via system console "
480 "since somebody else already registered for PF_INET\n");
481 /* we cannot make module load fail here, since otherwise
482 * iptables userspace would abort */
483 }
473 484
474 return 0; 485 return 0;
475} 486}
476 487
477static void __exit fini(void) 488static void __exit fini(void)
478{ 489{
479 if (nflog) 490 nf_log_unregister_logger(&ipt_log_logger);
480 nf_log_unregister(PF_INET, &ipt_logfn);
481 ipt_unregister_target(&ipt_log_reg); 491 ipt_unregister_target(&ipt_log_reg);
482} 492}
483 493
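
The LOG target now splits into two halves: the iptables target packs its rule options into a struct nf_loginfo and calls nf_log_packet(), while the console printer registers as a struct nf_logger backend for PF_INET. A hedged sketch of the backend contract, with placeholder names; default_loginfo refers to the fallback defined in the hunk above:

    static void example_logfn(unsigned int pf, unsigned int hooknum,
                              const struct sk_buff *skb,
                              const struct net_device *in,
                              const struct net_device *out,
                              const struct nf_loginfo *li,
                              const char *prefix)
    {
            if (!li)
                    li = &default_loginfo;     /* callers may pass NULL */
            printk("<%d>%s...\n", li->u.log.level, prefix);
    }

    static struct nf_logger example_logger = {
            .name  = "example_LOG",            /* hypothetical */
            .logfn = &example_logfn,
            .me    = THIS_MODULE,
    };

    /* Only one backend owns a family at a time, so a failed
     * nf_log_register(PF_INET, &example_logger) is tolerated rather
     * than failing module load, exactly as init() above does. */
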
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
index 33c6f9b63b8d..52b4f2c296bf 100644
--- a/net/ipv4/netfilter/ipt_MARK.c
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb,
29{ 29{
30 const struct ipt_mark_target_info *markinfo = targinfo; 30 const struct ipt_mark_target_info *markinfo = targinfo;
31 31
32 if((*pskb)->nfmark != markinfo->mark) { 32 if((*pskb)->nfmark != markinfo->mark)
33 (*pskb)->nfmark = markinfo->mark; 33 (*pskb)->nfmark = markinfo->mark;
34 (*pskb)->nfcache |= NFC_ALTERED; 34
35 }
36 return IPT_CONTINUE; 35 return IPT_CONTINUE;
37} 36}
38 37
@@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb,
61 break; 60 break;
62 } 61 }
63 62
64 if((*pskb)->nfmark != mark) { 63 if((*pskb)->nfmark != mark)
65 (*pskb)->nfmark = mark; 64 (*pskb)->nfmark = mark;
66 (*pskb)->nfcache |= NFC_ALTERED; 65
67 }
68 return IPT_CONTINUE; 66 return IPT_CONTINUE;
69} 67}
70 68
@@ -76,6 +74,8 @@ checkentry_v0(const char *tablename,
76 unsigned int targinfosize, 74 unsigned int targinfosize,
77 unsigned int hook_mask) 75 unsigned int hook_mask)
78{ 76{
77 struct ipt_mark_target_info *markinfo = targinfo;
78
79 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { 79 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
80 printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", 80 printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
81 targinfosize, 81 targinfosize,
@@ -88,6 +88,11 @@ checkentry_v0(const char *tablename,
88 return 0; 88 return 0;
89 } 89 }
90 90
91 if (markinfo->mark > 0xffffffff) {
92 printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
93 return 0;
94 }
95
91 return 1; 96 return 1;
92} 97}
93 98
@@ -120,6 +125,11 @@ checkentry_v1(const char *tablename,
120 return 0; 125 return 0;
121 } 126 }
122 127
128 if (markinfo->mark > 0xffffffff) {
129 printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
130 return 0;
131 }
132
123 return 1; 133 return 1;
124} 134}
125 135
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 91e74502c3d3..2f3e181c8e97 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb,
86 86
87 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); 87 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
88 88
89 /* FIXME: For the moment, don't do local packets, breaks
90 testsuite for 2.3.49 --RR */
91 if ((*pskb)->sk)
92 return NF_ACCEPT;
93
94 ct = ip_conntrack_get(*pskb, &ctinfo); 89 ct = ip_conntrack_get(*pskb, &ctinfo);
95 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED 90 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
96 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 91 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 06254b29d034..e6e7b6095363 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -46,7 +46,8 @@ check(const char *tablename,
46 DEBUGP(MODULENAME":check: size %u.\n", targinfosize); 46 DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
47 return 0; 47 return 0;
48 } 48 }
49 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { 49 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
50 (1 << NF_IP_LOCAL_OUT))) {
50 DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); 51 DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
51 return 0; 52 return 0;
52 } 53 }
@@ -76,12 +77,13 @@ target(struct sk_buff **pskb,
76 struct ip_nat_range newrange; 77 struct ip_nat_range newrange;
77 78
78 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING 79 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
79 || hooknum == NF_IP_POST_ROUTING); 80 || hooknum == NF_IP_POST_ROUTING
81 || hooknum == NF_IP_LOCAL_OUT);
80 ct = ip_conntrack_get(*pskb, &ctinfo); 82 ct = ip_conntrack_get(*pskb, &ctinfo);
81 83
82 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); 84 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
83 85
84 if (hooknum == NF_IP_PRE_ROUTING) 86 if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
85 new_ip = (*pskb)->nh.iph->daddr & ~netmask; 87 new_ip = (*pskb)->nh.iph->daddr & ~netmask;
86 else 88 else
87 new_ip = (*pskb)->nh.iph->saddr & ~netmask; 89 new_ip = (*pskb)->nh.iph->saddr & ~netmask;
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c
new file mode 100644
index 000000000000..3cedc9be8807
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NFQUEUE.c
@@ -0,0 +1,70 @@
1/* iptables module for using new netfilter netlink queue
2 *
3 * (C) 2005 by Harald Welte <laforge@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13
14#include <linux/netfilter.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("iptables NFQUEUE target");
20MODULE_LICENSE("GPL");
21
22static unsigned int
23target(struct sk_buff **pskb,
24 const struct net_device *in,
25 const struct net_device *out,
26 unsigned int hooknum,
27 const void *targinfo,
28 void *userinfo)
29{
30 const struct ipt_NFQ_info *tinfo = targinfo;
31
32 return NF_QUEUE_NR(tinfo->queuenum);
33}
34
35static int
36checkentry(const char *tablename,
37 const struct ipt_entry *e,
38 void *targinfo,
39 unsigned int targinfosize,
40 unsigned int hook_mask)
41{
42 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) {
43 printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
44 targinfosize,
45 IPT_ALIGN(sizeof(struct ipt_NFQ_info)));
46 return 0;
47 }
48
49 return 1;
50}
51
52static struct ipt_target ipt_NFQ_reg = {
53 .name = "NFQUEUE",
54 .target = target,
55 .checkentry = checkentry,
56 .me = THIS_MODULE,
57};
58
59static int __init init(void)
60{
61 return ipt_register_target(&ipt_NFQ_reg);
62}
63
64static void __exit fini(void)
65{
66 ipt_unregister_target(&ipt_NFQ_reg);
67}
68
69module_init(init);
70module_exit(fini);
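
The interesting line in the new target is its return value: NF_QUEUE_NR(tinfo->queuenum) packs the queue number into the upper bits of the verdict word so the core can route the packet to the right nfnetlink queue. A standalone demo of the encoding; the constants mirror what linux/netfilter.h defines at this point in the series, which is an assumption worth checking:

    #include <stdio.h>

    #define NF_QUEUE          3
    #define NF_VERDICT_BITS   16
    #define NF_VERDICT_QMASK  0xffff0000U
    #define NF_QUEUE_NR(x) \
            ((((x) << NF_VERDICT_BITS) & NF_VERDICT_QMASK) | NF_QUEUE)

    int main(void)
    {
            unsigned int v = NF_QUEUE_NR(5);
            printf("verdict=0x%08x queue=%u\n", v, v >> NF_VERDICT_BITS);
            return 0;     /* verdict=0x00050003 queue=5 */
    }
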
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 915696446020..f115a84a4ac6 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
156 156
157 /* This packet will not be the same as the other: clear nf fields */ 157 /* This packet will not be the same as the other: clear nf fields */
158 nf_reset(nskb); 158 nf_reset(nskb);
159 nskb->nfcache = 0;
160 nskb->nfmark = 0; 159 nskb->nfmark = 0;
161#ifdef CONFIG_BRIDGE_NETFILTER 160#ifdef CONFIG_BRIDGE_NETFILTER
162 nf_bridge_put(nskb->nf_bridge); 161 nf_bridge_put(nskb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 1049050b2bfb..8db70d6908c3 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -58,7 +58,11 @@ ipt_tcpmss_target(struct sk_buff **pskb,
58 unsigned int i; 58 unsigned int i;
59 u_int8_t *opt; 59 u_int8_t *opt;
60 60
61 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 61 if (!skb_make_writable(pskb, (*pskb)->len))
62 return NF_DROP;
63
64 if ((*pskb)->ip_summed == CHECKSUM_HW &&
65 skb_checksum_help(*pskb, out == NULL))
62 return NF_DROP; 66 return NF_DROP;
63 67
64 iph = (*pskb)->nh.iph; 68 iph = (*pskb)->nh.iph;
@@ -186,10 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
186 newmss); 190 newmss);
187 191
188 retmodified: 192 retmodified:
189 /* We never hw checksum SYN packets. */
190 BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
191
192 (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
193 return IPT_CONTINUE; 193 return IPT_CONTINUE;
194} 194}
195 195
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 85c70d240f8b..deadb36d4428 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -33,7 +33,7 @@ target(struct sk_buff **pskb,
33 if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { 33 if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
34 u_int16_t diffs[2]; 34 u_int16_t diffs[2];
35 35
36 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) 36 if (!skb_make_writable(pskb, sizeof(struct iphdr)))
37 return NF_DROP; 37 return NF_DROP;
38 38
39 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; 39 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -46,7 +46,6 @@ target(struct sk_buff **pskb,
46 sizeof(diffs), 46 sizeof(diffs),
47 (*pskb)->nh.iph->check 47 (*pskb)->nh.iph->check
48 ^0xFFFF)); 48 ^0xFFFF));
49 (*pskb)->nfcache |= NFC_ALTERED;
50 } 49 }
51 return IPT_CONTINUE; 50 return IPT_CONTINUE;
52} 51}
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
new file mode 100644
index 000000000000..b9ae6a9382f3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -0,0 +1,119 @@
1/* TTL modification target for IP tables
2 * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13#include <net/checksum.h>
14
15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/netfilter_ipv4/ipt_TTL.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("IP tables TTL modification module");
20MODULE_LICENSE("GPL");
21
22static unsigned int
23ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in,
24 const struct net_device *out, unsigned int hooknum,
25 const void *targinfo, void *userinfo)
26{
27 struct iphdr *iph;
28 const struct ipt_TTL_info *info = targinfo;
29 u_int16_t diffs[2];
30 int new_ttl;
31
32 if (!skb_make_writable(pskb, (*pskb)->len))
33 return NF_DROP;
34
35 iph = (*pskb)->nh.iph;
36
37 switch (info->mode) {
38 case IPT_TTL_SET:
39 new_ttl = info->ttl;
40 break;
41 case IPT_TTL_INC:
42 new_ttl = iph->ttl + info->ttl;
43 if (new_ttl > 255)
44 new_ttl = 255;
45 break;
46 case IPT_TTL_DEC:
47 new_ttl = iph->ttl - info->ttl;
48 if (new_ttl < 0)
49 new_ttl = 0;
50 break;
51 default:
52 new_ttl = iph->ttl;
53 break;
54 }
55
56 if (new_ttl != iph->ttl) {
57 diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF;
58 iph->ttl = new_ttl;
59 diffs[1] = htons(((unsigned)iph->ttl) << 8);
60 iph->check = csum_fold(csum_partial((char *)diffs,
61 sizeof(diffs),
62 iph->check^0xFFFF));
63 }
64
65 return IPT_CONTINUE;
66}
67
68static int ipt_ttl_checkentry(const char *tablename,
69 const struct ipt_entry *e,
70 void *targinfo,
71 unsigned int targinfosize,
72 unsigned int hook_mask)
73{
74 struct ipt_TTL_info *info = targinfo;
75
76 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) {
77 printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n",
78 targinfosize,
79 IPT_ALIGN(sizeof(struct ipt_TTL_info)));
80 return 0;
81 }
82
83 if (strcmp(tablename, "mangle")) {
84 printk(KERN_WARNING "ipt_TTL: can only be called from "
85 "\"mangle\" table, not \"%s\"\n", tablename);
86 return 0;
87 }
88
89 if (info->mode > IPT_TTL_MAXMODE) {
90 printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
91 info->mode);
92 return 0;
93 }
94
95 if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
96 return 0;
97
98 return 1;
99}
100
101static struct ipt_target ipt_TTL = {
102 .name = "TTL",
103 .target = ipt_ttl_target,
104 .checkentry = ipt_ttl_checkentry,
105 .me = THIS_MODULE,
106};
107
108static int __init init(void)
109{
110 return ipt_register_target(&ipt_TTL);
111}
112
113static void __exit fini(void)
114{
115 ipt_unregister_target(&ipt_TTL);
116}
117
118module_init(init);
119module_exit(fini);
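
The csum_fold(csum_partial(diffs, ...)) dance in ipt_ttl_target() is the kernel idiom for an RFC 1624 incremental checksum update: rather than re-summing the whole IP header, fold the one's complement of the old 16-bit word and the new word into the existing checksum. A standalone model of the arithmetic (the TTL shares its 16-bit header word with the protocol field):

    #include <stdio.h>

    static unsigned short csum_update(unsigned short check,
                                      unsigned short old16,
                                      unsigned short new16)
    {
            unsigned long sum = (unsigned short)~check;

            sum += (unsigned short)~old16;    /* retract the old word  */
            sum += new16;                     /* account for the new   */
            while (sum >> 16)                 /* fold carries back in  */
                    sum = (sum & 0xffff) + (sum >> 16);
            return (unsigned short)~sum;
    }

    int main(void)
    {
            /* e.g. TTL 64 -> 63, protocol 6 (TCP) in the low byte */
            unsigned short old16 = (64 << 8) | 6;
            unsigned short new16 = (63 << 8) | 6;
            unsigned short check = 0xb1e6;    /* arbitrary prior checksum */

            printf("0x%04x -> 0x%04x\n", check,
                   csum_update(check, old16, new16));
            return 0;
    }
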
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 52a0076302a7..e2c14f3cb2fc 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -62,6 +62,7 @@
62MODULE_LICENSE("GPL"); 62MODULE_LICENSE("GPL");
63MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); 63MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
64MODULE_DESCRIPTION("iptables userspace logging module"); 64MODULE_DESCRIPTION("iptables userspace logging module");
65MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
65 66
66#define ULOG_NL_EVENT 111 /* Harald's favorite number */ 67#define ULOG_NL_EVENT 111 /* Harald's favorite number */
67#define ULOG_MAXNLGROUPS 32 /* number of nlgroups */ 68#define ULOG_MAXNLGROUPS 32 /* number of nlgroups */
@@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum)
115 if (ub->qlen > 1) 116 if (ub->qlen > 1)
116 ub->lastnlh->nlmsg_type = NLMSG_DONE; 117 ub->lastnlh->nlmsg_type = NLMSG_DONE;
117 118
118 NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); 119 NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
119 DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", 120 DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
120 ub->qlen, nlgroupnum); 121 ub->qlen, nlgroupnum + 1);
121 netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); 122 netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
122 123
123 ub->qlen = 0; 124 ub->qlen = 0;
124 ub->skb = NULL; 125 ub->skb = NULL;
@@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
219 pm = NLMSG_DATA(nlh); 220 pm = NLMSG_DATA(nlh);
220 221
221 /* We might not have a timestamp, get one */ 222 /* We might not have a timestamp, get one */
222 if (skb->stamp.tv_sec == 0) 223 if (skb->tstamp.off_sec == 0)
223 do_gettimeofday((struct timeval *)&skb->stamp); 224 __net_timestamp((struct sk_buff *)skb);
224 225
225 /* copy hook, prefix, timestamp, payload, etc. */ 226 /* copy hook, prefix, timestamp, payload, etc. */
226 pm->data_len = copy_len; 227 pm->data_len = copy_len;
227 pm->timestamp_sec = skb->stamp.tv_sec; 228 pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
228 pm->timestamp_usec = skb->stamp.tv_usec; 229 pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
229 pm->mark = skb->nfmark; 230 pm->mark = skb->nfmark;
230 pm->hook = hooknum; 231 pm->hook = hooknum;
231 if (prefix != NULL) 232 if (prefix != NULL)
@@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb,
303 return IPT_CONTINUE; 304 return IPT_CONTINUE;
304} 305}
305 306
306static void ipt_logfn(unsigned int hooknum, 307static void ipt_logfn(unsigned int pf,
308 unsigned int hooknum,
307 const struct sk_buff *skb, 309 const struct sk_buff *skb,
308 const struct net_device *in, 310 const struct net_device *in,
309 const struct net_device *out, 311 const struct net_device *out,
312 const struct nf_loginfo *li,
310 const char *prefix) 313 const char *prefix)
311{ 314{
312 struct ipt_ulog_info loginfo = { 315 struct ipt_ulog_info loginfo;
313 .nl_group = ULOG_DEFAULT_NLGROUP, 316
314 .copy_range = 0, 317 if (!li || li->type != NF_LOG_TYPE_ULOG) {
315 .qthreshold = ULOG_DEFAULT_QTHRESHOLD, 318 loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
316 .prefix = "" 319 loginfo.copy_range = 0;
317 }; 320 loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
321 loginfo.prefix[0] = '\0';
322 } else {
323 loginfo.nl_group = li->u.ulog.group;
324 loginfo.copy_range = li->u.ulog.copy_len;
325 loginfo.qthreshold = li->u.ulog.qthreshold;
326 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
327 }
318 328
319 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); 329 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
320} 330}
@@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = {
354 .me = THIS_MODULE, 364 .me = THIS_MODULE,
355}; 365};
356 366
367static struct nf_logger ipt_ulog_logger = {
368 .name = "ipt_ULOG",
369 .logfn = &ipt_logfn,
370 .me = THIS_MODULE,
371};
372
357static int __init init(void) 373static int __init init(void)
358{ 374{
359 int i; 375 int i;
@@ -372,7 +388,8 @@ static int __init init(void)
372 ulog_buffers[i].timer.data = i; 388 ulog_buffers[i].timer.data = i;
373 } 389 }
374 390
375 nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); 391 nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
392 THIS_MODULE);
376 if (!nflognl) 393 if (!nflognl)
377 return -ENOMEM; 394 return -ENOMEM;
378 395
@@ -381,7 +398,7 @@ static int __init init(void)
381 return -EINVAL; 398 return -EINVAL;
382 } 399 }
383 if (nflog) 400 if (nflog)
384 nf_log_register(PF_INET, &ipt_logfn); 401 nf_log_register(PF_INET, &ipt_ulog_logger);
385 402
386 return 0; 403 return 0;
387} 404}
@@ -394,7 +411,7 @@ static void __exit fini(void)
394 DEBUGP("ipt_ULOG: cleanup_module\n"); 411 DEBUGP("ipt_ULOG: cleanup_module\n");
395 412
396 if (nflog) 413 if (nflog)
397 nf_log_unregister(PF_INET, &ipt_logfn); 414 nf_log_unregister_logger(&ipt_ulog_logger);
398 ipt_unregister_target(&ipt_ulog_reg); 415 ipt_unregister_target(&ipt_ulog_reg);
399 sock_release(nflognl->sk_socket); 416 sock_release(nflognl->sk_socket);
400 417
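
These ULOG hunks track the netlink multicast rework: netlink_kernel_create() now takes the number of groups, and netlink_broadcast() takes a 1-based group number instead of a bitmask. Userspace subscribers still select groups through the bitmask in sockaddr_nl.nl_groups. A minimal listener sketch, assuming the rule uses netlink group 1 (the default; error handling trimmed):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef NETLINK_NFLOG
#define NETLINK_NFLOG 5
#endif

int main(void)
{
	struct sockaddr_nl addr;
	char buf[65536];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_NFLOG);

	if (fd < 0)
		return 1;

	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;
	addr.nl_groups = 1 << (1 - 1);	/* bitmask: subscribe to group 1 */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	for (;;) {
		ssize_t n = recv(fd, buf, sizeof(buf), 0);

		if (n <= 0)
			break;
		printf("received %zd bytes of ULOG data\n", n);
	}
	close(fd);
	return 0;
}
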
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
new file mode 100644
index 000000000000..df4a42c6da22
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -0,0 +1,162 @@
1/* Kernel module to match connection tracking byte counter.
2 * GPL (C) 2002 Martin Devera (devik@cdi.cz).
3 *
4 * 2004-07-20 Harald Welte <laforge@netfilter.org>
5 * - reimplemented to use per-connection accounting counters
6 * - add functionality to match number of packets
7 * - add functionality to match average packet size
8 * - add support to match directions separately
9 *
10 */
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ip_conntrack.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_ipv4/ipt_connbytes.h>
16
17#include <asm/div64.h>
18#include <asm/bitops.h>
19
20MODULE_LICENSE("GPL");
21MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
22MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
23
24/* 64bit divisor, dividend and result. dynamic precision */
25static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
26{
27 u_int32_t d = divisor;
28
29 if (divisor > 0xffffffffULL) {
30 unsigned int shift = fls(divisor >> 32);
31
32 d = divisor >> shift;
33 dividend >>= shift;
34 }
35
36 do_div(dividend, d);
37 return dividend;
38}
39
40static int
41match(const struct sk_buff *skb,
42 const struct net_device *in,
43 const struct net_device *out,
44 const void *matchinfo,
45 int offset,
46 int *hotdrop)
47{
48 const struct ipt_connbytes_info *sinfo = matchinfo;
49 enum ip_conntrack_info ctinfo;
50 struct ip_conntrack *ct;
51 u_int64_t what = 0; /* initialize to make gcc happy */
52
53 if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
54 return 0; /* no match */
55
56 switch (sinfo->what) {
57 case IPT_CONNBYTES_PKTS:
58 switch (sinfo->direction) {
59 case IPT_CONNBYTES_DIR_ORIGINAL:
60 what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
61 break;
62 case IPT_CONNBYTES_DIR_REPLY:
63 what = ct->counters[IP_CT_DIR_REPLY].packets;
64 break;
65 case IPT_CONNBYTES_DIR_BOTH:
66 what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
67 what += ct->counters[IP_CT_DIR_REPLY].packets;
68 break;
69 }
70 break;
71 case IPT_CONNBYTES_BYTES:
72 switch (sinfo->direction) {
73 case IPT_CONNBYTES_DIR_ORIGINAL:
74 what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
75 break;
76 case IPT_CONNBYTES_DIR_REPLY:
77 what = ct->counters[IP_CT_DIR_REPLY].bytes;
78 break;
79 case IPT_CONNBYTES_DIR_BOTH:
80 what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
81 what += ct->counters[IP_CT_DIR_REPLY].bytes;
82 break;
83 }
84 break;
85 case IPT_CONNBYTES_AVGPKT:
86 switch (sinfo->direction) {
87 case IPT_CONNBYTES_DIR_ORIGINAL:
88 what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
89 ct->counters[IP_CT_DIR_ORIGINAL].packets);
90 break;
91 case IPT_CONNBYTES_DIR_REPLY:
92 what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
93 ct->counters[IP_CT_DIR_REPLY].packets);
94 break;
95 case IPT_CONNBYTES_DIR_BOTH:
96 {
97 u_int64_t bytes;
98 u_int64_t pkts;
99 bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
100 ct->counters[IP_CT_DIR_REPLY].bytes;
101 pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+
102 ct->counters[IP_CT_DIR_REPLY].packets;
103
104 /* FIXME_THEORETICAL: what to do if sum
105 * overflows ? */
106
107 what = div64_64(bytes, pkts);
108 }
109 break;
110 }
111 break;
112 }
113
114 if (sinfo->count.to)
115 return (what <= sinfo->count.to && what >= sinfo->count.from);
116 else
117 return (what >= sinfo->count.from);
118}
119
120static int check(const char *tablename,
121 const struct ipt_ip *ip,
122 void *matchinfo,
123 unsigned int matchsize,
124 unsigned int hook_mask)
125{
126 const struct ipt_connbytes_info *sinfo = matchinfo;
127
128 if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info)))
129 return 0;
130
131 if (sinfo->what != IPT_CONNBYTES_PKTS &&
132 sinfo->what != IPT_CONNBYTES_BYTES &&
133 sinfo->what != IPT_CONNBYTES_AVGPKT)
134 return 0;
135
136 if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL &&
137 sinfo->direction != IPT_CONNBYTES_DIR_REPLY &&
138 sinfo->direction != IPT_CONNBYTES_DIR_BOTH)
139 return 0;
140
141 return 1;
142}
143
144static struct ipt_match state_match = {
145 .name = "connbytes",
146 .match = &match,
147 .checkentry = &check,
148 .me = THIS_MODULE
149};
150
151static int __init init(void)
152{
153 return ipt_register_match(&state_match);
154}
155
156static void __exit fini(void)
157{
158 ipt_unregister_match(&state_match);
159}
160
161module_init(init);
162module_exit(fini);
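
div64_64() above works around do_div() only accepting a 32-bit divisor: when the divisor is wider, both operands are shifted right until it fits, which keeps the quotient proportionally close at the cost of low-order precision. A user-space sketch with a worked example (the helper names are local to this example):

#include <stdint.h>
#include <stdio.h>

/* 1-based index of the highest set bit, like the kernel's fls(). */
static int fls32(uint32_t x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
	if (divisor > 0xffffffffULL) {
		unsigned int shift = fls32(divisor >> 32);

		divisor >>= shift;	/* now fits in 32 bits */
		dividend >>= shift;	/* keep the ratio, lose low bits */
	}
	return dividend / (uint32_t)divisor;	/* stands in for do_div() */
}

int main(void)
{
	/* Exact when the divisor already fits in 32 bits: 244140625. */
	printf("%llu\n", (unsigned long long)div64_64(1000000000000ULL, 4096));
	/* Approximate otherwise: prints 536870912, exact value is 536870911. */
	printf("%llu\n", (unsigned long long)div64_64(1ULL << 62,
						      (1ULL << 33) + 3));
	return 0;
}
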
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index 2706f96cea55..bf8de47ce004 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -54,9 +54,16 @@ checkentry(const char *tablename,
54 unsigned int matchsize, 54 unsigned int matchsize,
55 unsigned int hook_mask) 55 unsigned int hook_mask)
56{ 56{
57 struct ipt_connmark_info *cm =
58 (struct ipt_connmark_info *)matchinfo;
57 if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) 59 if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
58 return 0; 60 return 0;
59 61
62 if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
63 printk(KERN_WARNING "connmark: only supports 32bit mark\n");
64 return 0;
65 }
66
60 return 1; 67 return 1;
61} 68}
62 69
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c
new file mode 100644
index 000000000000..ad3278bba6c1
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dccp.c
@@ -0,0 +1,176 @@
1/*
2 * iptables module for DCCP protocol header matching
3 *
4 * (C) 2005 by Harald Welte <laforge@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/spinlock.h>
14#include <net/ip.h>
15#include <linux/dccp.h>
16
17#include <linux/netfilter_ipv4/ip_tables.h>
18#include <linux/netfilter_ipv4/ipt_dccp.h>
19
20#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
21 || (!!((invflag) & (option)) ^ (cond)))
22
23static unsigned char *dccp_optbuf;
24static DEFINE_SPINLOCK(dccp_buflock);
25
26static inline int
27dccp_find_option(u_int8_t option,
28 const struct sk_buff *skb,
29 const struct dccp_hdr *dh,
30 int *hotdrop)
31{
32 /* dccph_doff is 8 bits, i.e. at most 255 * 4 bytes of header */
33 unsigned char *op;
34 unsigned int optoff = __dccp_hdr_len(dh);
35 unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
36 unsigned int i;
37
38 if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
39 *hotdrop = 1;
40 return 0;
41 }
42
43 if (!optlen)
44 return 0;
45
46 spin_lock_bh(&dccp_buflock);
47 op = skb_header_pointer(skb,
48 skb->nh.iph->ihl*4 + optoff,
49 optlen, dccp_optbuf);
50 if (op == NULL) {
51 /* If we don't have the whole header, drop packet. */
52 spin_unlock_bh(&dccp_buflock);
53 *hotdrop = 1;
54 return 0;
55 }
56
57 for (i = 0; i < optlen; ) {
58 if (op[i] == option) {
59 spin_unlock_bh(&dccp_buflock);
60 return 1;
61 }
62
63 if (op[i] < 2)
64 i++;
65 else
66 i += op[i+1]?:1;
67 }
68
69 spin_unlock_bh(&dccp_buflock);
70 return 0;
71}
72
73
74static inline int
75match_types(const struct dccp_hdr *dh, u_int16_t typemask)
76{
77 return (typemask & (1 << dh->dccph_type));
78}
79
80static inline int
81match_option(u_int8_t option, const struct sk_buff *skb,
82 const struct dccp_hdr *dh, int *hotdrop)
83{
84 return dccp_find_option(option, skb, dh, hotdrop);
85}
86
87static int
88match(const struct sk_buff *skb,
89 const struct net_device *in,
90 const struct net_device *out,
91 const void *matchinfo,
92 int offset,
93 int *hotdrop)
94{
95 const struct ipt_dccp_info *info =
96 (const struct ipt_dccp_info *)matchinfo;
97 struct dccp_hdr _dh, *dh;
98
99 if (offset)
100 return 0;
101
102 dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh);
103 if (dh == NULL) {
104 *hotdrop = 1;
105 return 0;
106 }
107
108 return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0])
109 && (ntohs(dh->dccph_sport) <= info->spts[1])),
110 IPT_DCCP_SRC_PORTS, info->flags, info->invflags)
111 && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0])
112 && (ntohs(dh->dccph_dport) <= info->dpts[1])),
113 IPT_DCCP_DEST_PORTS, info->flags, info->invflags)
114 && DCCHECK(match_types(dh, info->typemask),
115 IPT_DCCP_TYPE, info->flags, info->invflags)
116 && DCCHECK(match_option(info->option, skb, dh, hotdrop),
117 IPT_DCCP_OPTION, info->flags, info->invflags);
118}
119
120static int
121checkentry(const char *tablename,
122 const struct ipt_ip *ip,
123 void *matchinfo,
124 unsigned int matchsize,
125 unsigned int hook_mask)
126{
127 const struct ipt_dccp_info *info;
128
129 info = (const struct ipt_dccp_info *)matchinfo;
130
131 return ip->proto == IPPROTO_DCCP
132 && !(ip->invflags & IPT_INV_PROTO)
133 && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info))
134 && !(info->flags & ~IPT_DCCP_VALID_FLAGS)
135 && !(info->invflags & ~IPT_DCCP_VALID_FLAGS)
136 && !(info->invflags & ~info->flags);
137}
138
139static struct ipt_match dccp_match =
140{
141 .name = "dccp",
142 .match = &match,
143 .checkentry = &checkentry,
144 .me = THIS_MODULE,
145};
146
147static int __init init(void)
148{
149 int ret;
150
151 /* doff is 8 bits, so the maximum option size is (4*256). Don't put
152 * this in BSS since DaveM is worried about locked TLBs for kernel
153 * BSS. */
154 dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
155 if (!dccp_optbuf)
156 return -ENOMEM;
157 ret = ipt_register_match(&dccp_match);
158 if (ret)
159 kfree(dccp_optbuf);
160
161 return ret;
162}
163
164static void __exit fini(void)
165{
166 ipt_unregister_match(&dccp_match);
167 kfree(dccp_optbuf);
168}
169
170module_init(init);
171module_exit(fini);
172
173MODULE_LICENSE("GPL");
174MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
175MODULE_DESCRIPTION("Match for DCCP protocol packets");
176
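
The option scan in dccp_find_option() is a standard TLV walk: DCCP option types 0 and 1 are single padding/mandatory bytes, everything else carries a length byte that counts the type and length octets themselves, and the op[i+1] ?: 1 idiom clamps a bogus zero length to 1 so a malformed packet cannot stall the loop. A user-space sketch of the same walk over well-formed options:

#include <stdint.h>
#include <stdio.h>

static int find_option(const uint8_t *op, unsigned int optlen, uint8_t option)
{
	unsigned int i = 0;

	while (i < optlen) {
		if (op[i] == option)
			return 1;
		if (op[i] < 2)		/* padding/mandatory: single byte */
			i++;
		else			/* TLV: advance by the length byte */
			i += op[i + 1] ? op[i + 1] : 1;
	}
	return 0;
}

int main(void)
{
	/* padding, option 32 (length 3: type, len, one data byte), padding */
	const uint8_t opts[] = { 0, 32, 3, 0xaa, 0 };

	printf("option 32: %d\n", find_option(opts, sizeof(opts), 32)); /* 1 */
	printf("option 36: %d\n", find_option(opts, sizeof(opts), 36)); /* 0 */
	return 0;
}
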
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 564b49bfebcf..2dd1cccbdab9 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -94,7 +94,7 @@ struct ipt_hashlimit_htable {
94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ 94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ 95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
96static HLIST_HEAD(hashlimit_htables); 96static HLIST_HEAD(hashlimit_htables);
97static kmem_cache_t *hashlimit_cachep; 97static kmem_cache_t *hashlimit_cachep __read_mostly;
98 98
99static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) 99static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
100{ 100{
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
index 8955728127b9..00bef6cdd3f8 100644
--- a/net/ipv4/netfilter/ipt_mark.c
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -37,9 +37,16 @@ checkentry(const char *tablename,
37 unsigned int matchsize, 37 unsigned int matchsize,
38 unsigned int hook_mask) 38 unsigned int hook_mask)
39{ 39{
40 struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo;
41
40 if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) 42 if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
41 return 0; 43 return 0;
42 44
45 if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
46 printk(KERN_WARNING "mark: only supports 32bit mark\n");
47 return 0;
48 }
49
43 return 1; 50 return 1;
44} 51}
45 52
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 3b9065e06381..c1889f88262b 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
21MODULE_DESCRIPTION("iptables owner match"); 21MODULE_DESCRIPTION("iptables owner match");
22 22
23static int 23static int
24match_comm(const struct sk_buff *skb, const char *comm)
25{
26 struct task_struct *g, *p;
27 struct files_struct *files;
28 int i;
29
30 read_lock(&tasklist_lock);
31 do_each_thread(g, p) {
32 if(strncmp(p->comm, comm, sizeof(p->comm)))
33 continue;
34
35 task_lock(p);
36 files = p->files;
37 if(files) {
38 spin_lock(&files->file_lock);
39 for (i=0; i < files->max_fds; i++) {
40 if (fcheck_files(files, i) ==
41 skb->sk->sk_socket->file) {
42 spin_unlock(&files->file_lock);
43 task_unlock(p);
44 read_unlock(&tasklist_lock);
45 return 1;
46 }
47 }
48 spin_unlock(&files->file_lock);
49 }
50 task_unlock(p);
51 } while_each_thread(g, p);
52 read_unlock(&tasklist_lock);
53 return 0;
54}
55
56static int
57match_pid(const struct sk_buff *skb, pid_t pid)
58{
59 struct task_struct *p;
60 struct files_struct *files;
61 int i;
62
63 read_lock(&tasklist_lock);
64 p = find_task_by_pid(pid);
65 if (!p)
66 goto out;
67 task_lock(p);
68 files = p->files;
69 if(files) {
70 spin_lock(&files->file_lock);
71 for (i=0; i < files->max_fds; i++) {
72 if (fcheck_files(files, i) ==
73 skb->sk->sk_socket->file) {
74 spin_unlock(&files->file_lock);
75 task_unlock(p);
76 read_unlock(&tasklist_lock);
77 return 1;
78 }
79 }
80 spin_unlock(&files->file_lock);
81 }
82 task_unlock(p);
83out:
84 read_unlock(&tasklist_lock);
85 return 0;
86}
87
88static int
89match_sid(const struct sk_buff *skb, pid_t sid)
90{
91 struct task_struct *g, *p;
92 struct file *file = skb->sk->sk_socket->file;
93 int i, found=0;
94
95 read_lock(&tasklist_lock);
96 do_each_thread(g, p) {
97 struct files_struct *files;
98 if (p->signal->session != sid)
99 continue;
100
101 task_lock(p);
102 files = p->files;
103 if (files) {
104 spin_lock(&files->file_lock);
105 for (i=0; i < files->max_fds; i++) {
106 if (fcheck_files(files, i) == file) {
107 found = 1;
108 break;
109 }
110 }
111 spin_unlock(&files->file_lock);
112 }
113 task_unlock(p);
114 if (found)
115 goto out;
116 } while_each_thread(g, p);
117out:
118 read_unlock(&tasklist_lock);
119
120 return found;
121}
122
123static int
124match(const struct sk_buff *skb, 24match(const struct sk_buff *skb,
125 const struct net_device *in, 25 const struct net_device *in,
126 const struct net_device *out, 26 const struct net_device *out,
@@ -145,24 +45,6 @@ match(const struct sk_buff *skb,
145 return 0; 45 return 0;
146 } 46 }
147 47
148 if(info->match & IPT_OWNER_PID) {
149 if (!match_pid(skb, info->pid) ^
150 !!(info->invert & IPT_OWNER_PID))
151 return 0;
152 }
153
154 if(info->match & IPT_OWNER_SID) {
155 if (!match_sid(skb, info->sid) ^
156 !!(info->invert & IPT_OWNER_SID))
157 return 0;
158 }
159
160 if(info->match & IPT_OWNER_COMM) {
161 if (!match_comm(skb, info->comm) ^
162 !!(info->invert & IPT_OWNER_COMM))
163 return 0;
164 }
165
166 return 1; 48 return 1;
167} 49}
168 50
@@ -173,6 +55,8 @@ checkentry(const char *tablename,
173 unsigned int matchsize, 55 unsigned int matchsize,
174 unsigned int hook_mask) 56 unsigned int hook_mask)
175{ 57{
58 const struct ipt_owner_info *info = matchinfo;
59
176 if (hook_mask 60 if (hook_mask
177 & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { 61 & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
178 printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); 62 printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -184,15 +68,13 @@ checkentry(const char *tablename,
184 IPT_ALIGN(sizeof(struct ipt_owner_info))); 68 IPT_ALIGN(sizeof(struct ipt_owner_info)));
185 return 0; 69 return 0;
186 } 70 }
187#ifdef CONFIG_SMP 71
188 /* files->file_lock can not be used in a BH */ 72 if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
189 if (((struct ipt_owner_info *)matchinfo)->match 73 printk("ipt_owner: pid, sid and command matching "
190 & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { 74 "not supported anymore\n");
191 printk("ipt_owner: pid, sid and command matching is broken "
192 "on SMP.\n");
193 return 0; 75 return 0;
194 } 76 }
195#endif 77
196 return 1; 78 return 1;
197} 79}
198 80
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c
new file mode 100644
index 000000000000..b5def204d798
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_string.c
@@ -0,0 +1,91 @@
1/* String matching match for iptables
2 *
3 * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/kernel.h>
13#include <linux/skbuff.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_ipv4/ipt_string.h>
16#include <linux/textsearch.h>
17
18MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
19MODULE_DESCRIPTION("IP tables string match module");
20MODULE_LICENSE("GPL");
21
22static int match(const struct sk_buff *skb,
23 const struct net_device *in,
24 const struct net_device *out,
25 const void *matchinfo,
26 int offset,
27 int *hotdrop)
28{
29 struct ts_state state;
30 struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo;
31
32 memset(&state, 0, sizeof(struct ts_state));
33
34 return (skb_find_text((struct sk_buff *)skb, conf->from_offset,
35 conf->to_offset, conf->config, &state)
36 != UINT_MAX) && !conf->invert;
37}
38
39#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m)
40
41static int checkentry(const char *tablename,
42 const struct ipt_ip *ip,
43 void *matchinfo,
44 unsigned int matchsize,
45 unsigned int hook_mask)
46{
47 struct ipt_string_info *conf = matchinfo;
48 struct ts_config *ts_conf;
49
50 if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info)))
51 return 0;
52
53 /* Damn, can't handle this case properly with iptables... */
54 if (conf->from_offset > conf->to_offset)
55 return 0;
56
57 ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
58 GFP_KERNEL, TS_AUTOLOAD);
59 if (IS_ERR(ts_conf))
60 return 0;
61
62 conf->config = ts_conf;
63
64 return 1;
65}
66
67static void destroy(void *matchinfo, unsigned int matchsize)
68{
69 textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
70}
71
72static struct ipt_match string_match = {
73 .name = "string",
74 .match = match,
75 .checkentry = checkentry,
76 .destroy = destroy,
77 .me = THIS_MODULE
78};
79
80static int __init init(void)
81{
82 return ipt_register_match(&string_match);
83}
84
85static void __exit fini(void)
86{
87 ipt_unregister_match(&string_match);
88}
89
90module_init(init);
91module_exit(fini);
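
ipt_string leans on the kernel textsearch API: textsearch_prepare() compiles the pattern once per rule in checkentry(), and skb_find_text() later runs the precompiled ts_config over the (possibly fragmented) skb. A kernel-style sketch of the same API over a plain linear buffer; a demonstration module, not part of the patch:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/textsearch.h>

static int __init ts_demo_init(void)
{
	static const char haystack[] = "GET /index.html HTTP/1.0";
	struct ts_config *conf;
	struct ts_state state;
	unsigned int pos;

	/* Compile once, as ipt_string does at rule insertion time. */
	conf = textsearch_prepare("bm", "index", 5, GFP_KERNEL, TS_AUTOLOAD);
	if (IS_ERR(conf))
		return PTR_ERR(conf);

	/* Scan a linear buffer; skb_find_text() feeds skb fragments
	 * through the same state machine. */
	memset(&state, 0, sizeof(state));
	pos = textsearch_find_continuous(conf, &state, haystack,
					 sizeof(haystack) - 1);
	printk(KERN_INFO "ts_demo: match at %u\n", pos); /* UINT_MAX if absent */

	textsearch_destroy(conf);
	return 0;
}

static void __exit ts_demo_exit(void)
{
}

module_init(ts_demo_init);
module_exit(ts_demo_exit);
MODULE_LICENSE("GPL");
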
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 912bbcc7f415..f7943ba1f43c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto)
59 */ 59 */
60static int sockstat_seq_show(struct seq_file *seq, void *v) 60static int sockstat_seq_show(struct seq_file *seq, void *v)
61{ 61{
62 /* From net/socket.c */
63 extern void socket_seq_show(struct seq_file *seq);
64
65 socket_seq_show(seq); 62 socket_seq_show(seq);
66 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 63 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
67 fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), 64 fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
68 tcp_tw_count, atomic_read(&tcp_sockets_allocated), 65 tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
69 atomic_read(&tcp_memory_allocated)); 66 atomic_read(&tcp_memory_allocated));
70 seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); 67 seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
71 seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); 68 seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0db405a869f2..291831e792af 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -40,7 +40,6 @@
40#include <linux/timer.h> 40#include <linux/timer.h>
41#include <net/ip.h> 41#include <net/ip.h>
42#include <net/protocol.h> 42#include <net/protocol.h>
43#include <net/tcp.h>
44#include <linux/skbuff.h> 43#include <linux/skbuff.h>
45#include <net/sock.h> 44#include <net/sock.h>
46#include <net/icmp.h> 45#include <net/icmp.h>
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index d1835b1bc8c4..304bb0a1d4f0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -59,7 +59,6 @@
59#include <linux/netdevice.h> 59#include <linux/netdevice.h>
60#include <linux/in_route.h> 60#include <linux/in_route.h>
61#include <linux/route.h> 61#include <linux/route.h>
62#include <linux/tcp.h>
63#include <linux/skbuff.h> 62#include <linux/skbuff.h>
64#include <net/dst.h> 63#include <net/dst.h>
65#include <net/sock.h> 64#include <net/sock.h>
@@ -71,6 +70,7 @@
71#include <net/udp.h> 70#include <net/udp.h>
72#include <net/raw.h> 71#include <net/raw.h>
73#include <net/snmp.h> 72#include <net/snmp.h>
73#include <net/tcp_states.h>
74#include <net/inet_common.h> 74#include <net/inet_common.h>
75#include <net/checksum.h> 75#include <net/checksum.h>
76#include <net/xfrm.h> 76#include <net/xfrm.h>
@@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
150 * RFC 1122: SHOULD pass TOS value up to the transport layer. 150 * RFC 1122: SHOULD pass TOS value up to the transport layer.
151 * -> It does. And not only TOS, but all IP header. 151 * -> It does. And not only TOS, but all IP header.
152 */ 152 */
153void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) 153int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
154{ 154{
155 struct sock *sk; 155 struct sock *sk;
156 struct hlist_head *head; 156 struct hlist_head *head;
157 int delivered = 0;
157 158
158 read_lock(&raw_v4_lock); 159 read_lock(&raw_v4_lock);
159 head = &raw_v4_htable[hash]; 160 head = &raw_v4_htable[hash];
@@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
164 skb->dev->ifindex); 165 skb->dev->ifindex);
165 166
166 while (sk) { 167 while (sk) {
168 delivered = 1;
167 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { 169 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
168 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); 170 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
169 171
@@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
177 } 179 }
178out: 180out:
179 read_unlock(&raw_v4_lock); 181 read_unlock(&raw_v4_lock);
182 return delivered;
180} 183}
181 184
182void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) 185void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 726ea5e8180a..8c0b14e3beec 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -240,7 +240,9 @@ static unsigned rt_hash_mask;
240static int rt_hash_log; 240static int rt_hash_log;
241static unsigned int rt_hash_rnd; 241static unsigned int rt_hash_rnd;
242 242
243struct rt_cache_stat *rt_cache_stat; 243static struct rt_cache_stat *rt_cache_stat;
244#define RT_CACHE_STAT_INC(field) \
245 (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
244 246
245static int rt_intern_hash(unsigned hash, struct rtable *rth, 247static int rt_intern_hash(unsigned hash, struct rtable *rth,
246 struct rtable **res); 248 struct rtable **res);
@@ -1685,7 +1687,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1687 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1686 "%u.%u.%u.%u, on dev %s\n", 1688 "%u.%u.%u.%u, on dev %s\n",
1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1689 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1688 if (dev->hard_header_len) { 1690 if (dev->hard_header_len && skb->mac.raw) {
1689 int i; 1691 int i;
1690 unsigned char *p = skb->mac.raw; 1692 unsigned char *p = skb->mac.raw;
1691 printk(KERN_WARNING "ll header: "); 1693 printk(KERN_WARNING "ll header: ");
@@ -2600,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2600 return ip_route_output_slow(rp, flp); 2602 return ip_route_output_slow(rp, flp);
2601} 2603}
2602 2604
2605EXPORT_SYMBOL_GPL(__ip_route_output_key);
2606
2603int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) 2607int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2604{ 2608{
2605 int err; 2609 int err;
@@ -2618,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
2618 return 0; 2622 return 0;
2619} 2623}
2620 2624
2625EXPORT_SYMBOL_GPL(ip_route_output_flow);
2626
2621int ip_route_output_key(struct rtable **rp, struct flowi *flp) 2627int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622{ 2628{
2623 return ip_route_output_flow(rp, flp, NULL, 0); 2629 return ip_route_output_flow(rp, flp, NULL, 0);
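
rt_cache_stat goes static here and RT_CACHE_STAT_INC() bumps the calling CPU's private copy through per_cpu_ptr(), so the hot path needs no atomics or locks; readers pay instead by summing all copies. A sketch of that pattern with illustrative names (not the ones in route.c; the iterator name follows later kernels):

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

struct demo_stat {
	unsigned int hits;
};

static struct demo_stat *demo_stat;	/* one instance per possible CPU */

/* Lockless on the increment side: each CPU touches only its own copy. */
#define DEMO_STAT_INC(field) \
	(per_cpu_ptr(demo_stat, raw_smp_processor_id())->field++)

static int demo_stat_init(void)
{
	demo_stat = alloc_percpu(struct demo_stat);
	return demo_stat ? 0 : -ENOMEM;
}

/* Readers sum across CPUs; the result is approximate while updates run. */
static unsigned int demo_stat_sum_hits(void)
{
	unsigned int sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(demo_stat, cpu)->hits;
	return sum;
}
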
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 72d014442185..a34e60ea48a1 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
170} 170}
171 171
172extern struct request_sock_ops tcp_request_sock_ops;
173
174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 172static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
175 struct request_sock *req, 173 struct request_sock *req,
176 struct dst_entry *dst) 174 struct dst_entry *dst)
@@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
180 178
181 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); 179 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
182 if (child) 180 if (child)
183 tcp_acceptq_queue(sk, req, child); 181 inet_csk_reqsk_queue_add(sk, req, child);
184 else 182 else
185 reqsk_free(req); 183 reqsk_free(req);
186 184
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e32894532416..652685623519 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -11,7 +11,9 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sysctl.h> 12#include <linux/sysctl.h>
13#include <linux/config.h> 13#include <linux/config.h>
14#include <linux/igmp.h>
14#include <net/snmp.h> 15#include <net/snmp.h>
16#include <net/icmp.h>
15#include <net/ip.h> 17#include <net/ip.h>
16#include <net/route.h> 18#include <net/route.h>
17#include <net/tcp.h> 19#include <net/tcp.h>
@@ -19,36 +21,6 @@
19/* From af_inet.c */ 21/* From af_inet.c */
20extern int sysctl_ip_nonlocal_bind; 22extern int sysctl_ip_nonlocal_bind;
21 23
22/* From icmp.c */
23extern int sysctl_icmp_echo_ignore_all;
24extern int sysctl_icmp_echo_ignore_broadcasts;
25extern int sysctl_icmp_ignore_bogus_error_responses;
26extern int sysctl_icmp_errors_use_inbound_ifaddr;
27
28/* From ip_fragment.c */
29extern int sysctl_ipfrag_low_thresh;
30extern int sysctl_ipfrag_high_thresh;
31extern int sysctl_ipfrag_time;
32extern int sysctl_ipfrag_secret_interval;
33
34/* From ip_output.c */
35extern int sysctl_ip_dynaddr;
36
37/* From icmp.c */
38extern int sysctl_icmp_ratelimit;
39extern int sysctl_icmp_ratemask;
40
41/* From igmp.c */
42extern int sysctl_igmp_max_memberships;
43extern int sysctl_igmp_max_msf;
44
45/* From inetpeer.c */
46extern int inet_peer_threshold;
47extern int inet_peer_minttl;
48extern int inet_peer_maxttl;
49extern int inet_peer_gc_mintime;
50extern int inet_peer_gc_maxtime;
51
52#ifdef CONFIG_SYSCTL 24#ifdef CONFIG_SYSCTL
53static int tcp_retr1_max = 255; 25static int tcp_retr1_max = 255;
54static int ip_local_port_range_min[] = { 1, 1 }; 26static int ip_local_port_range_min[] = { 1, 1 };
@@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 };
57 29
58struct ipv4_config ipv4_config; 30struct ipv4_config ipv4_config;
59 31
60extern ctl_table ipv4_route_table[];
61
62#ifdef CONFIG_SYSCTL 32#ifdef CONFIG_SYSCTL
63 33
64static 34static
@@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
136 return ret; 106 return ret;
137} 107}
138 108
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, 109static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
140 void __user *oldval, size_t __user *oldlenp, 110 int nlen, void __user *oldval,
141 void __user *newval, size_t newlen, 111 size_t __user *oldlenp,
142 void **context) 112 void __user *newval, size_t newlen,
113 void **context)
143{ 114{
144 char val[TCP_CA_NAME_MAX]; 115 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = { 116 ctl_table tbl = {
@@ -259,7 +230,7 @@ ctl_table ipv4_table[] = {
259 { 230 {
260 .ctl_name = NET_TCP_MAX_TW_BUCKETS, 231 .ctl_name = NET_TCP_MAX_TW_BUCKETS,
261 .procname = "tcp_max_tw_buckets", 232 .procname = "tcp_max_tw_buckets",
262 .data = &sysctl_tcp_max_tw_buckets, 233 .data = &tcp_death_row.sysctl_max_tw_buckets,
263 .maxlen = sizeof(int), 234 .maxlen = sizeof(int),
264 .mode = 0644, 235 .mode = 0644,
265 .proc_handler = &proc_dointvec 236 .proc_handler = &proc_dointvec
@@ -363,7 +334,7 @@ ctl_table ipv4_table[] = {
363 { 334 {
364 .ctl_name = NET_TCP_TW_RECYCLE, 335 .ctl_name = NET_TCP_TW_RECYCLE,
365 .procname = "tcp_tw_recycle", 336 .procname = "tcp_tw_recycle",
366 .data = &sysctl_tcp_tw_recycle, 337 .data = &tcp_death_row.sysctl_tw_recycle,
367 .maxlen = sizeof(int), 338 .maxlen = sizeof(int),
368 .mode = 0644, 339 .mode = 0644,
369 .proc_handler = &proc_dointvec 340 .proc_handler = &proc_dointvec
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ddb6ce4ecff2..f3f0013a9580 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,13 +269,12 @@
269 269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); 272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
273
274kmem_cache_t *tcp_bucket_cachep;
275kmem_cache_t *tcp_timewait_cachep;
276 273
277atomic_t tcp_orphan_count = ATOMIC_INIT(0); 274atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278 275
276EXPORT_SYMBOL_GPL(tcp_orphan_count);
277
279int sysctl_tcp_mem[3]; 278int sysctl_tcp_mem[3];
280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; 279int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; 280int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
@@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void)
311EXPORT_SYMBOL(tcp_enter_memory_pressure); 310EXPORT_SYMBOL(tcp_enter_memory_pressure);
312 311
313/* 312/*
314 * LISTEN is a special case for poll..
315 */
316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317 poll_table *wait)
318{
319 return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320}
321
322/*
323 * Wait for a TCP event. 313 * Wait for a TCP event.
324 * 314 *
325 * Note that we don't need to lock the socket, as the upper poll layers 315 * Note that we don't need to lock the socket, as the upper poll layers
@@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
334 324
335 poll_wait(file, sk->sk_sleep, wait); 325 poll_wait(file, sk->sk_sleep, wait);
336 if (sk->sk_state == TCP_LISTEN) 326 if (sk->sk_state == TCP_LISTEN)
337 return tcp_listen_poll(sk, wait); 327 return inet_csk_listen_poll(sk);
338 328
339 /* Socket is not locked. We are protected from async events 329 /* Socket is not locked. We are protected from async events
340 by poll logic and correct handling of state changes 330 by poll logic and correct handling of state changes
@@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
457 return put_user(answ, (int __user *)arg); 447 return put_user(answ, (int __user *)arg);
458} 448}
459 449
460
461int tcp_listen_start(struct sock *sk)
462{
463 struct inet_sock *inet = inet_sk(sk);
464 struct tcp_sock *tp = tcp_sk(sk);
465 int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467 if (rc != 0)
468 return rc;
469
470 sk->sk_max_ack_backlog = 0;
471 sk->sk_ack_backlog = 0;
472 tcp_delack_init(tp);
473
474 /* There is race window here: we announce ourselves listening,
475 * but this transition is still not validated by get_port().
476 * It is OK, because this socket enters to hash table only
477 * after validation is complete.
478 */
479 sk->sk_state = TCP_LISTEN;
480 if (!sk->sk_prot->get_port(sk, inet->num)) {
481 inet->sport = htons(inet->num);
482
483 sk_dst_reset(sk);
484 sk->sk_prot->hash(sk);
485
486 return 0;
487 }
488
489 sk->sk_state = TCP_CLOSE;
490 reqsk_queue_destroy(&tp->accept_queue);
491 return -EADDRINUSE;
492}
493
494/*
495 * This routine closes sockets which have been at least partially
496 * opened, but not yet accepted.
497 */
498
499static void tcp_listen_stop (struct sock *sk)
500{
501 struct tcp_sock *tp = tcp_sk(sk);
502 struct listen_sock *lopt;
503 struct request_sock *acc_req;
504 struct request_sock *req;
505 int i;
506
507 tcp_delete_keepalive_timer(sk);
508
509 /* make all the listen_opt local to us */
510 lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511 acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513 if (lopt->qlen) {
514 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515 while ((req = lopt->syn_table[i]) != NULL) {
516 lopt->syn_table[i] = req->dl_next;
517 lopt->qlen--;
518 reqsk_free(req);
519
520 /* Following specs, it would be better either to send FIN
521 * (and enter FIN-WAIT-1, it is normal close)
522 * or to send active reset (abort).
523 * Certainly, it is pretty dangerous while synflood, but it is
524 * bad justification for our negligence 8)
525 * To be honest, we are not able to make either
526 * of the variants now. --ANK
527 */
528 }
529 }
530 }
531 BUG_TRAP(!lopt->qlen);
532
533 kfree(lopt);
534
535 while ((req = acc_req) != NULL) {
536 struct sock *child = req->sk;
537
538 acc_req = req->dl_next;
539
540 local_bh_disable();
541 bh_lock_sock(child);
542 BUG_TRAP(!sock_owned_by_user(child));
543 sock_hold(child);
544
545 tcp_disconnect(child, O_NONBLOCK);
546
547 sock_orphan(child);
548
549 atomic_inc(&tcp_orphan_count);
550
551 tcp_destroy_sock(child);
552
553 bh_unlock_sock(child);
554 local_bh_enable();
555 sock_put(child);
556
557 sk_acceptq_removed(sk);
558 __reqsk_free(req);
559 }
560 BUG_TRAP(!sk->sk_ack_backlog);
561}
562
563static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 450static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564{ 451{
565 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 452 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
@@ -584,7 +471,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
584 sk_charge_skb(sk, skb); 471 sk_charge_skb(sk, skb);
585 if (!sk->sk_send_head) 472 if (!sk->sk_send_head)
586 sk->sk_send_head = skb; 473 sk->sk_send_head = skb;
587 else if (tp->nonagle&TCP_NAGLE_PUSH) 474 if (tp->nonagle & TCP_NAGLE_PUSH)
588 tp->nonagle &= ~TCP_NAGLE_PUSH; 475 tp->nonagle &= ~TCP_NAGLE_PUSH;
589} 476}
590 477
@@ -665,8 +552,7 @@ new_segment:
665 tcp_mark_push(tp, skb); 552 tcp_mark_push(tp, skb);
666 goto new_segment; 553 goto new_segment;
667 } 554 }
668 if (sk->sk_forward_alloc < copy && 555 if (!sk_stream_wmem_schedule(sk, copy))
669 !sk_stream_mem_schedule(sk, copy, 0))
670 goto wait_for_memory; 556 goto wait_for_memory;
671 557
672 if (can_coalesce) { 558 if (can_coalesce) {
@@ -883,19 +769,23 @@ new_segment:
883 if (off == PAGE_SIZE) { 769 if (off == PAGE_SIZE) {
884 put_page(page); 770 put_page(page);
885 TCP_PAGE(sk) = page = NULL; 771 TCP_PAGE(sk) = page = NULL;
772 off = 0;
886 } 773 }
887 } 774 } else
775 off = 0;
776
777 if (copy > PAGE_SIZE - off)
778 copy = PAGE_SIZE - off;
779
780 if (!sk_stream_wmem_schedule(sk, copy))
781 goto wait_for_memory;
888 782
889 if (!page) { 783 if (!page) {
890 /* Allocate new cache page. */ 784 /* Allocate new cache page. */
891 if (!(page = sk_stream_alloc_page(sk))) 785 if (!(page = sk_stream_alloc_page(sk)))
892 goto wait_for_memory; 786 goto wait_for_memory;
893 off = 0;
894 } 787 }
895 788
896 if (copy > PAGE_SIZE - off)
897 copy = PAGE_SIZE - off;
898
899 /* Time to copy data. We are close to 789 /* Time to copy data. We are close to
900 * the end! */ 790 * the end! */
901 err = skb_copy_to_page(sk, from, skb, page, 791 err = skb_copy_to_page(sk, from, skb, page,
@@ -975,7 +865,7 @@ do_fault:
975 if (!skb->len) { 865 if (!skb->len) {
976 if (sk->sk_send_head == skb) 866 if (sk->sk_send_head == skb)
977 sk->sk_send_head = NULL; 867 sk->sk_send_head = NULL;
978 __skb_unlink(skb, skb->list); 868 __skb_unlink(skb, &sk->sk_write_queue);
979 sk_stream_free_skb(sk, skb); 869 sk_stream_free_skb(sk, skb);
980 } 870 }
981 871
@@ -1057,20 +947,21 @@ static void cleanup_rbuf(struct sock *sk, int copied)
1057 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); 947 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1058#endif 948#endif
1059 949
1060 if (tcp_ack_scheduled(tp)) { 950 if (inet_csk_ack_scheduled(sk)) {
951 const struct inet_connection_sock *icsk = inet_csk(sk);
1061 /* Delayed ACKs frequently hit locked sockets during bulk 952 /* Delayed ACKs frequently hit locked sockets during bulk
1062 * receive. */ 953 * receive. */
1063 if (tp->ack.blocked || 954 if (icsk->icsk_ack.blocked ||
1064 /* Once-per-two-segments ACK was not sent by tcp_input.c */ 955 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1065 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || 956 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1066 /* 957 /*
1067 * If this read emptied read buffer, we send ACK, if 958 * If this read emptied read buffer, we send ACK, if
1068 * connection is not bidirectional, user drained 959 * connection is not bidirectional, user drained
1069 * receive buffer and there was a small segment 960 * receive buffer and there was a small segment
1070 * in queue. 961 * in queue.
1071 */ 962 */
1072 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && 963 (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1073 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) 964 !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1074 time_to_ack = 1; 965 time_to_ack = 1;
1075 } 966 }
1076 967
@@ -1572,40 +1463,6 @@ void tcp_shutdown(struct sock *sk, int how)
1572 } 1463 }
1573} 1464}
1574 1465
1575/*
1576 * At this point, there should be no process reference to this
1577 * socket, and thus no user references at all. Therefore we
1578 * can assume the socket waitqueue is inactive and nobody will
1579 * try to jump onto it.
1580 */
1581void tcp_destroy_sock(struct sock *sk)
1582{
1583 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1584 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1585
1586 /* It cannot be in hash table! */
1587 BUG_TRAP(sk_unhashed(sk));
1588
1589 /* If it has not 0 inet_sk(sk)->num, it must be bound */
1590 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1591
1592 sk->sk_prot->destroy(sk);
1593
1594 sk_stream_kill_queues(sk);
1595
1596 xfrm_sk_free_policy(sk);
1597
1598#ifdef INET_REFCNT_DEBUG
1599 if (atomic_read(&sk->sk_refcnt) != 1) {
1600 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1601 sk, atomic_read(&sk->sk_refcnt));
1602 }
1603#endif
1604
1605 atomic_dec(&tcp_orphan_count);
1606 sock_put(sk);
1607}
1608
1609void tcp_close(struct sock *sk, long timeout) 1466void tcp_close(struct sock *sk, long timeout)
1610{ 1467{
1611 struct sk_buff *skb; 1468 struct sk_buff *skb;
@@ -1618,7 +1475,7 @@ void tcp_close(struct sock *sk, long timeout)
1618 tcp_set_state(sk, TCP_CLOSE); 1475 tcp_set_state(sk, TCP_CLOSE);
1619 1476
1620 /* Special case. */ 1477 /* Special case. */
1621 tcp_listen_stop(sk); 1478 inet_csk_listen_stop(sk);
1622 1479
1623 goto adjudge_to_death; 1480 goto adjudge_to_death;
1624 } 1481 }
@@ -1721,12 +1578,12 @@ adjudge_to_death:
1721 tcp_send_active_reset(sk, GFP_ATOMIC); 1578 tcp_send_active_reset(sk, GFP_ATOMIC);
1722 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); 1579 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1723 } else { 1580 } else {
1724 int tmo = tcp_fin_time(tp); 1581 const int tmo = tcp_fin_time(sk);
1725 1582
1726 if (tmo > TCP_TIMEWAIT_LEN) { 1583 if (tmo > TCP_TIMEWAIT_LEN) {
1727 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); 1584 inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1728 } else { 1585 } else {
1729 atomic_inc(&tcp_orphan_count); 1586 atomic_inc(sk->sk_prot->orphan_count);
1730 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 1587 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1731 goto out; 1588 goto out;
1732 } 1589 }
@@ -1734,7 +1591,7 @@ adjudge_to_death:
1734 } 1591 }
1735 if (sk->sk_state != TCP_CLOSE) { 1592 if (sk->sk_state != TCP_CLOSE) {
1736 sk_stream_mem_reclaim(sk); 1593 sk_stream_mem_reclaim(sk);
1737 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || 1594 if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1738 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && 1595 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1739 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { 1596 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1740 if (net_ratelimit()) 1597 if (net_ratelimit())
@@ -1745,10 +1602,10 @@ adjudge_to_death:
1745 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); 1602 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1746 } 1603 }
1747 } 1604 }
1748 atomic_inc(&tcp_orphan_count); 1605 atomic_inc(sk->sk_prot->orphan_count);
1749 1606
1750 if (sk->sk_state == TCP_CLOSE) 1607 if (sk->sk_state == TCP_CLOSE)
1751 tcp_destroy_sock(sk); 1608 inet_csk_destroy_sock(sk);
1752 /* Otherwise, socket is reprieved until protocol close. */ 1609 /* Otherwise, socket is reprieved until protocol close. */
1753 1610
1754out: 1611out:
@@ -1769,6 +1626,7 @@ static inline int tcp_need_reset(int state)
1769int tcp_disconnect(struct sock *sk, int flags) 1626int tcp_disconnect(struct sock *sk, int flags)
1770{ 1627{
1771 struct inet_sock *inet = inet_sk(sk); 1628 struct inet_sock *inet = inet_sk(sk);
1629 struct inet_connection_sock *icsk = inet_csk(sk);
1772 struct tcp_sock *tp = tcp_sk(sk); 1630 struct tcp_sock *tp = tcp_sk(sk);
1773 int err = 0; 1631 int err = 0;
1774 int old_state = sk->sk_state; 1632 int old_state = sk->sk_state;
@@ -1778,7 +1636,7 @@ int tcp_disconnect(struct sock *sk, int flags)
1778 1636
1779 /* ABORT function of RFC793 */ 1637 /* ABORT function of RFC793 */
1780 if (old_state == TCP_LISTEN) { 1638 if (old_state == TCP_LISTEN) {
1781 tcp_listen_stop(sk); 1639 inet_csk_listen_stop(sk);
1782 } else if (tcp_need_reset(old_state) || 1640 } else if (tcp_need_reset(old_state) ||
1783 (tp->snd_nxt != tp->write_seq && 1641 (tp->snd_nxt != tp->write_seq &&
1784 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 1642 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -1805,125 +1663,34 @@ int tcp_disconnect(struct sock *sk, int flags)
1805 tp->srtt = 0; 1663 tp->srtt = 0;
1806 if ((tp->write_seq += tp->max_window + 2) == 0) 1664 if ((tp->write_seq += tp->max_window + 2) == 0)
1807 tp->write_seq = 1; 1665 tp->write_seq = 1;
1808 tp->backoff = 0; 1666 icsk->icsk_backoff = 0;
1809 tp->snd_cwnd = 2; 1667 tp->snd_cwnd = 2;
1810 tp->probes_out = 0; 1668 icsk->icsk_probes_out = 0;
1811 tp->packets_out = 0; 1669 tp->packets_out = 0;
1812 tp->snd_ssthresh = 0x7fffffff; 1670 tp->snd_ssthresh = 0x7fffffff;
1813 tp->snd_cwnd_cnt = 0; 1671 tp->snd_cwnd_cnt = 0;
1814 tcp_set_ca_state(tp, TCP_CA_Open); 1672 tcp_set_ca_state(sk, TCP_CA_Open);
1815 tcp_clear_retrans(tp); 1673 tcp_clear_retrans(tp);
1816 tcp_delack_init(tp); 1674 inet_csk_delack_init(sk);
1817 sk->sk_send_head = NULL; 1675 sk->sk_send_head = NULL;
1818 tp->rx_opt.saw_tstamp = 0; 1676 tp->rx_opt.saw_tstamp = 0;
1819 tcp_sack_reset(&tp->rx_opt); 1677 tcp_sack_reset(&tp->rx_opt);
1820 __sk_dst_reset(sk); 1678 __sk_dst_reset(sk);
1821 1679
1822 BUG_TRAP(!inet->num || tp->bind_hash); 1680 BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1823 1681
1824 sk->sk_error_report(sk); 1682 sk->sk_error_report(sk);
1825 return err; 1683 return err;
1826} 1684}
1827 1685
1828/* 1686/*
1829 * Wait for an incoming connection, avoid race
1830 * conditions. This must be called with the socket locked.
1831 */
1832static int wait_for_connect(struct sock *sk, long timeo)
1833{
1834 struct tcp_sock *tp = tcp_sk(sk);
1835 DEFINE_WAIT(wait);
1836 int err;
1837
1838 /*
1839 * True wake-one mechanism for incoming connections: only
1840 * one process gets woken up, not the 'whole herd'.
1841 * Since we do not 'race & poll' for established sockets
1842 * anymore, the common case will execute the loop only once.
1843 *
1844 * Subtle issue: "add_wait_queue_exclusive()" will be added
1845 * after any current non-exclusive waiters, and we know that
1846 * it will always _stay_ after any new non-exclusive waiters
1847 * because all non-exclusive waiters are added at the
1848 * beginning of the wait-queue. As such, it's ok to "drop"
1849 * our exclusiveness temporarily when we get woken up without
1850 * having to remove and re-insert us on the wait queue.
1851 */
1852 for (;;) {
1853 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1854 TASK_INTERRUPTIBLE);
1855 release_sock(sk);
1856 if (reqsk_queue_empty(&tp->accept_queue))
1857 timeo = schedule_timeout(timeo);
1858 lock_sock(sk);
1859 err = 0;
1860 if (!reqsk_queue_empty(&tp->accept_queue))
1861 break;
1862 err = -EINVAL;
1863 if (sk->sk_state != TCP_LISTEN)
1864 break;
1865 err = sock_intr_errno(timeo);
1866 if (signal_pending(current))
1867 break;
1868 err = -EAGAIN;
1869 if (!timeo)
1870 break;
1871 }
1872 finish_wait(sk->sk_sleep, &wait);
1873 return err;
1874}
1875
1876/*
1877 * This will accept the next outstanding connection.
1878 */
1879
1880struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1881{
1882 struct tcp_sock *tp = tcp_sk(sk);
1883 struct sock *newsk;
1884 int error;
1885
1886 lock_sock(sk);
1887
1888 /* We need to make sure that this socket is listening,
1889 * and that it has something pending.
1890 */
1891 error = -EINVAL;
1892 if (sk->sk_state != TCP_LISTEN)
1893 goto out_err;
1894
1895 /* Find already established connection */
1896 if (reqsk_queue_empty(&tp->accept_queue)) {
1897 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1898
1899 /* If this is a non blocking socket don't sleep */
1900 error = -EAGAIN;
1901 if (!timeo)
1902 goto out_err;
1903
1904 error = wait_for_connect(sk, timeo);
1905 if (error)
1906 goto out_err;
1907 }
1908
1909 newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1910 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1911out:
1912 release_sock(sk);
1913 return newsk;
1914out_err:
1915 newsk = NULL;
1916 *err = error;
1917 goto out;
1918}
1919
1920/*
1921 * Socket option code for TCP. 1687 * Socket option code for TCP.
1922 */ 1688 */
1923int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, 1689int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1924 int optlen) 1690 int optlen)
1925{ 1691{
1926 struct tcp_sock *tp = tcp_sk(sk); 1692 struct tcp_sock *tp = tcp_sk(sk);
1693 struct inet_connection_sock *icsk = inet_csk(sk);
1927 int val; 1694 int val;
1928 int err = 0; 1695 int err = 0;
1929 1696
@@ -1945,7 +1712,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1945 name[val] = 0; 1712 name[val] = 0;
1946 1713
1947 lock_sock(sk); 1714 lock_sock(sk);
1948 err = tcp_set_congestion_control(tp, name); 1715 err = tcp_set_congestion_control(sk, name);
1949 release_sock(sk); 1716 release_sock(sk);
1950 return err; 1717 return err;
1951 } 1718 }
@@ -2022,7 +1789,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2022 elapsed = tp->keepalive_time - elapsed; 1789 elapsed = tp->keepalive_time - elapsed;
2023 else 1790 else
2024 elapsed = 0; 1791 elapsed = 0;
2025 tcp_reset_keepalive_timer(sk, elapsed); 1792 inet_csk_reset_keepalive_timer(sk, elapsed);
2026 } 1793 }
2027 } 1794 }
2028 break; 1795 break;
@@ -2042,7 +1809,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2042 if (val < 1 || val > MAX_TCP_SYNCNT) 1809 if (val < 1 || val > MAX_TCP_SYNCNT)
2043 err = -EINVAL; 1810 err = -EINVAL;
2044 else 1811 else
2045 tp->syn_retries = val; 1812 icsk->icsk_syn_retries = val;
2046 break; 1813 break;
2047 1814
2048 case TCP_LINGER2: 1815 case TCP_LINGER2:
@@ -2055,15 +1822,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2055 break; 1822 break;
2056 1823
2057 case TCP_DEFER_ACCEPT: 1824 case TCP_DEFER_ACCEPT:
2058 tp->defer_accept = 0; 1825 icsk->icsk_accept_queue.rskq_defer_accept = 0;
2059 if (val > 0) { 1826 if (val > 0) {
2060 /* Translate value in seconds to number of 1827 /* Translate value in seconds to number of
2061 * retransmits */ 1828 * retransmits */
2062 while (tp->defer_accept < 32 && 1829 while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2063 val > ((TCP_TIMEOUT_INIT / HZ) << 1830 val > ((TCP_TIMEOUT_INIT / HZ) <<
2064 tp->defer_accept)) 1831 icsk->icsk_accept_queue.rskq_defer_accept))
2065 tp->defer_accept++; 1832 icsk->icsk_accept_queue.rskq_defer_accept++;
2066 tp->defer_accept++; 1833 icsk->icsk_accept_queue.rskq_defer_accept++;
2067 } 1834 }
2068 break; 1835 break;
2069 1836
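
The TCP_DEFER_ACCEPT hunk above converts a timeout in seconds into a count of SYN-ACK retransmits, since the retransmit timer backs off exponentially from TCP_TIMEOUT_INIT. A user-space sketch of the translation, assuming TCP_TIMEOUT_INIT/HZ is 3 seconds as in kernels of this era:

#include <stdio.h>

#define TIMEOUT_INIT_SECS 3	/* assumed TCP_TIMEOUT_INIT / HZ */

static int defer_accept_retrans(int val)
{
	int retrans = 0;

	/* Count doubling periods (3s, 6s, 12s, ...) until val is covered. */
	while (retrans < 32 && val > (TIMEOUT_INIT_SECS << retrans))
		retrans++;
	return retrans + 1;
}

int main(void)
{
	/* prints 1 3 6 for 1s, 10s and 60s respectively */
	printf("%d %d %d\n", defer_accept_retrans(1),
	       defer_accept_retrans(10), defer_accept_retrans(60));
	return 0;
}
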
@@ -2081,16 +1848,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2081 1848
2082 case TCP_QUICKACK: 1849 case TCP_QUICKACK:
2083 if (!val) { 1850 if (!val) {
2084 tp->ack.pingpong = 1; 1851 icsk->icsk_ack.pingpong = 1;
2085 } else { 1852 } else {
2086 tp->ack.pingpong = 0; 1853 icsk->icsk_ack.pingpong = 0;
2087 if ((1 << sk->sk_state) & 1854 if ((1 << sk->sk_state) &
2088 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && 1855 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2089 tcp_ack_scheduled(tp)) { 1856 inet_csk_ack_scheduled(sk)) {
2090 tp->ack.pending |= TCP_ACK_PUSHED; 1857 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2091 cleanup_rbuf(sk, 1); 1858 cleanup_rbuf(sk, 1);
2092 if (!(val & 1)) 1859 if (!(val & 1))
2093 tp->ack.pingpong = 1; 1860 icsk->icsk_ack.pingpong = 1;
2094 } 1861 }
2095 } 1862 }
2096 break; 1863 break;
@@ -2107,15 +1874,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2107void tcp_get_info(struct sock *sk, struct tcp_info *info) 1874void tcp_get_info(struct sock *sk, struct tcp_info *info)
2108{ 1875{
2109 struct tcp_sock *tp = tcp_sk(sk); 1876 struct tcp_sock *tp = tcp_sk(sk);
1877 const struct inet_connection_sock *icsk = inet_csk(sk);
2110 u32 now = tcp_time_stamp; 1878 u32 now = tcp_time_stamp;
2111 1879
2112 memset(info, 0, sizeof(*info)); 1880 memset(info, 0, sizeof(*info));
2113 1881
2114 info->tcpi_state = sk->sk_state; 1882 info->tcpi_state = sk->sk_state;
2115 info->tcpi_ca_state = tp->ca_state; 1883 info->tcpi_ca_state = icsk->icsk_ca_state;
2116 info->tcpi_retransmits = tp->retransmits; 1884 info->tcpi_retransmits = icsk->icsk_retransmits;
2117 info->tcpi_probes = tp->probes_out; 1885 info->tcpi_probes = icsk->icsk_probes_out;
2118 info->tcpi_backoff = tp->backoff; 1886 info->tcpi_backoff = icsk->icsk_backoff;
2119 1887
2120 if (tp->rx_opt.tstamp_ok) 1888 if (tp->rx_opt.tstamp_ok)
2121 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 1889 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
@@ -2130,10 +1898,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2130 if (tp->ecn_flags&TCP_ECN_OK) 1898 if (tp->ecn_flags&TCP_ECN_OK)
2131 info->tcpi_options |= TCPI_OPT_ECN; 1899 info->tcpi_options |= TCPI_OPT_ECN;
2132 1900
2133 info->tcpi_rto = jiffies_to_usecs(tp->rto); 1901 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 1902 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2135 info->tcpi_snd_mss = tp->mss_cache; 1903 info->tcpi_snd_mss = tp->mss_cache;
2136 info->tcpi_rcv_mss = tp->ack.rcv_mss; 1904 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2137 1905
2138 info->tcpi_unacked = tp->packets_out; 1906 info->tcpi_unacked = tp->packets_out;
2139 info->tcpi_sacked = tp->sacked_out; 1907 info->tcpi_sacked = tp->sacked_out;
@@ -2142,7 +1910,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2142 info->tcpi_fackets = tp->fackets_out; 1910 info->tcpi_fackets = tp->fackets_out;
2143 1911
2144 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); 1912 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2145 info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); 1913 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2146 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 1914 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2147 1915
2148 info->tcpi_pmtu = tp->pmtu_cookie; 1916 info->tcpi_pmtu = tp->pmtu_cookie;
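
tcp_get_info() now reads the RTO, backoff, retransmit and delayed-ACK
fields out of the inet_connection_sock, but the struct tcp_info it fills
is unchanged, and user space still fetches it with getsockopt(TCP_INFO).
A small sketch of the consumer side (fd is assumed to be a connected TCP
socket; field names are from <netinet/tcp.h>):

    #include <stdio.h>
    #include <string.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    static void dump_tcp_info(int fd)
    {
            struct tcp_info info;
            socklen_t len = sizeof(info);

            memset(&info, 0, sizeof(info));
            if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                    printf("rto=%uus ato=%uus snd_mss=%u rcv_mss=%u rexmits=%u\n",
                           info.tcpi_rto, info.tcpi_ato,
                           info.tcpi_snd_mss, info.tcpi_rcv_mss,
                           info.tcpi_retransmits);
    }
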
@@ -2165,6 +1933,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info);
2165int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, 1933int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2166 int __user *optlen) 1934 int __user *optlen)
2167{ 1935{
1936 struct inet_connection_sock *icsk = inet_csk(sk);
2168 struct tcp_sock *tp = tcp_sk(sk); 1937 struct tcp_sock *tp = tcp_sk(sk);
2169 int val, len; 1938 int val, len;
2170 1939
@@ -2202,7 +1971,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2202 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; 1971 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2203 break; 1972 break;
2204 case TCP_SYNCNT: 1973 case TCP_SYNCNT:
2205 val = tp->syn_retries ? : sysctl_tcp_syn_retries; 1974 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2206 break; 1975 break;
2207 case TCP_LINGER2: 1976 case TCP_LINGER2:
2208 val = tp->linger2; 1977 val = tp->linger2;
@@ -2210,8 +1979,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2210 val = (val ? : sysctl_tcp_fin_timeout) / HZ; 1979 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2211 break; 1980 break;
2212 case TCP_DEFER_ACCEPT: 1981 case TCP_DEFER_ACCEPT:
2213 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << 1982 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2214 (tp->defer_accept - 1)); 1983 ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2215 break; 1984 break;
2216 case TCP_WINDOW_CLAMP: 1985 case TCP_WINDOW_CLAMP:
2217 val = tp->window_clamp; 1986 val = tp->window_clamp;
@@ -2232,7 +2001,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2232 return 0; 2001 return 0;
2233 } 2002 }
2234 case TCP_QUICKACK: 2003 case TCP_QUICKACK:
2235 val = !tp->ack.pingpong; 2004 val = !icsk->icsk_ack.pingpong;
2236 break; 2005 break;
2237 2006
2238 case TCP_CONGESTION: 2007 case TCP_CONGESTION:
@@ -2241,7 +2010,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2241 len = min_t(unsigned int, len, TCP_CA_NAME_MAX); 2010 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242 if (put_user(len, optlen)) 2011 if (put_user(len, optlen))
2243 return -EFAULT; 2012 return -EFAULT;
2244 if (copy_to_user(optval, tp->ca_ops->name, len)) 2013 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2245 return -EFAULT; 2014 return -EFAULT;
2246 return 0; 2015 return 0;
2247 default: 2016 default:
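
The TCP_CONGESTION branch above reports the name of the socket's current
icsk_ca_ops, truncated to TCP_CA_NAME_MAX (16) bytes. A hedged sketch of
querying and switching the algorithm from user space -- it assumes the
installed headers expose TCP_CONGESTION (numeric value 13 in this kernel):

    #include <stdio.h>
    #include <string.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    #ifndef TCP_CONGESTION
    #define TCP_CONGESTION 13       /* assumed option number */
    #endif

    static void show_and_set_cc(int fd, const char *name)
    {
            char cur[16];           /* TCP_CA_NAME_MAX */
            socklen_t len = sizeof(cur);

            if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cur, &len) == 0)
                    printf("current: %.*s\n", (int)len, cur);
            if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                           name, strlen(name)) < 0)
                    perror("TCP_CONGESTION");
    }
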
@@ -2278,79 +2047,72 @@ void __init tcp_init(void)
2278 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), 2047 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2279 sizeof(skb->cb)); 2048 sizeof(skb->cb));
2280 2049
2281 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", 2050 tcp_hashinfo.bind_bucket_cachep =
2282 sizeof(struct tcp_bind_bucket), 2051 kmem_cache_create("tcp_bind_bucket",
2283 0, SLAB_HWCACHE_ALIGN, 2052 sizeof(struct inet_bind_bucket), 0,
2284 NULL, NULL); 2053 SLAB_HWCACHE_ALIGN, NULL, NULL);
2285 if (!tcp_bucket_cachep) 2054 if (!tcp_hashinfo.bind_bucket_cachep)
2286 panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); 2055 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287 2056
2288 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289 sizeof(struct tcp_tw_bucket),
2290 0, SLAB_HWCACHE_ALIGN,
2291 NULL, NULL);
2292 if (!tcp_timewait_cachep)
2293 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294
2295 /* Size and allocate the main established and bind bucket 2057 /* Size and allocate the main established and bind bucket
2296 * hash tables. 2058 * hash tables.
2297 * 2059 *
2298 * The methodology is similar to that of the buffer cache. 2060 * The methodology is similar to that of the buffer cache.
2299 */ 2061 */
2300 tcp_ehash = (struct tcp_ehash_bucket *) 2062 tcp_hashinfo.ehash =
2301 alloc_large_system_hash("TCP established", 2063 alloc_large_system_hash("TCP established",
2302 sizeof(struct tcp_ehash_bucket), 2064 sizeof(struct inet_ehash_bucket),
2303 thash_entries, 2065 thash_entries,
2304 (num_physpages >= 128 * 1024) ? 2066 (num_physpages >= 128 * 1024) ?
2305 (25 - PAGE_SHIFT) : 2067 (25 - PAGE_SHIFT) :
2306 (27 - PAGE_SHIFT), 2068 (27 - PAGE_SHIFT),
2307 HASH_HIGHMEM, 2069 HASH_HIGHMEM,
2308 &tcp_ehash_size, 2070 &tcp_hashinfo.ehash_size,
2309 NULL, 2071 NULL,
2310 0); 2072 0);
2311 tcp_ehash_size = (1 << tcp_ehash_size) >> 1; 2073 tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2312 for (i = 0; i < (tcp_ehash_size << 1); i++) { 2074 for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2313 rwlock_init(&tcp_ehash[i].lock); 2075 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2314 INIT_HLIST_HEAD(&tcp_ehash[i].chain); 2076 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2315 } 2077 }
2316 2078
2317 tcp_bhash = (struct tcp_bind_hashbucket *) 2079 tcp_hashinfo.bhash =
2318 alloc_large_system_hash("TCP bind", 2080 alloc_large_system_hash("TCP bind",
2319 sizeof(struct tcp_bind_hashbucket), 2081 sizeof(struct inet_bind_hashbucket),
2320 tcp_ehash_size, 2082 tcp_hashinfo.ehash_size,
2321 (num_physpages >= 128 * 1024) ? 2083 (num_physpages >= 128 * 1024) ?
2322 (25 - PAGE_SHIFT) : 2084 (25 - PAGE_SHIFT) :
2323 (27 - PAGE_SHIFT), 2085 (27 - PAGE_SHIFT),
2324 HASH_HIGHMEM, 2086 HASH_HIGHMEM,
2325 &tcp_bhash_size, 2087 &tcp_hashinfo.bhash_size,
2326 NULL, 2088 NULL,
2327 64 * 1024); 2089 64 * 1024);
2328 tcp_bhash_size = 1 << tcp_bhash_size; 2090 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2329 for (i = 0; i < tcp_bhash_size; i++) { 2091 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2330 spin_lock_init(&tcp_bhash[i].lock); 2092 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2331 INIT_HLIST_HEAD(&tcp_bhash[i].chain); 2093 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2332 } 2094 }
2333 2095
2334 /* Try to be a bit smarter and adjust defaults depending 2096 /* Try to be a bit smarter and adjust defaults depending
2335 * on available memory. 2097 * on available memory.
2336 */ 2098 */
2337 for (order = 0; ((1 << order) << PAGE_SHIFT) < 2099 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); 2100 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2339 order++) 2101 order++)
2340 ; 2102 ;
2341 if (order >= 4) { 2103 if (order >= 4) {
2342 sysctl_local_port_range[0] = 32768; 2104 sysctl_local_port_range[0] = 32768;
2343 sysctl_local_port_range[1] = 61000; 2105 sysctl_local_port_range[1] = 61000;
2344 sysctl_tcp_max_tw_buckets = 180000; 2106 tcp_death_row.sysctl_max_tw_buckets = 180000;
2345 sysctl_tcp_max_orphans = 4096 << (order - 4); 2107 sysctl_tcp_max_orphans = 4096 << (order - 4);
2346 sysctl_max_syn_backlog = 1024; 2108 sysctl_max_syn_backlog = 1024;
2347 } else if (order < 3) { 2109 } else if (order < 3) {
2348 sysctl_local_port_range[0] = 1024 * (3 - order); 2110 sysctl_local_port_range[0] = 1024 * (3 - order);
2349 sysctl_tcp_max_tw_buckets >>= (3 - order); 2111 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2350 sysctl_tcp_max_orphans >>= (3 - order); 2112 sysctl_tcp_max_orphans >>= (3 - order);
2351 sysctl_max_syn_backlog = 128; 2113 sysctl_max_syn_backlog = 128;
2352 } 2114 }
2353 tcp_port_rover = sysctl_local_port_range[0] - 1; 2115 tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
2354 2116
2355 sysctl_tcp_mem[0] = 768 << order; 2117 sysctl_tcp_mem[0] = 768 << order;
2356 sysctl_tcp_mem[1] = 1024 << order; 2118 sysctl_tcp_mem[1] = 1024 << order;
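
The sizing logic above deserves a gloss: order is the smallest page order
whose span covers the bind-hash table, and it doubles as a crude estimate
of machine memory -- order >= 4 widens the ephemeral port range to
32768..61000 and raises the TIME-WAIT and orphan limits, while order < 3
scales them down. A stand-alone sketch of the order search (PAGE_SHIFT
assumed to be 12, i.e. 4 KiB pages):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumed 4 KiB pages */

    /* Smallest order such that (1 << order) pages cover the table. */
    static int table_order(unsigned long nbuckets, unsigned long bucket_size)
    {
            int order;

            for (order = 0;
                 ((1UL << order) << PAGE_SHIFT) < nbuckets * bucket_size;
                 order++)
                    ;
            return order;
    }

    int main(void)
    {
            /* e.g. 65536 buckets of 16 bytes -> 1 MiB -> order 8 */
            printf("order = %d\n", table_order(65536, 16));
            return 0;
    }
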
@@ -2365,14 +2127,12 @@ void __init tcp_init(void)
2365 2127
2366 printk(KERN_INFO "TCP: Hash tables configured " 2128 printk(KERN_INFO "TCP: Hash tables configured "
2367 "(established %d bind %d)\n", 2129 "(established %d bind %d)\n",
2368 tcp_ehash_size << 1, tcp_bhash_size); 2130 tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2369 2131
2370 tcp_register_congestion_control(&tcp_reno); 2132 tcp_register_congestion_control(&tcp_reno);
2371} 2133}
2372 2134
2373EXPORT_SYMBOL(tcp_accept);
2374EXPORT_SYMBOL(tcp_close); 2135EXPORT_SYMBOL(tcp_close);
2375EXPORT_SYMBOL(tcp_destroy_sock);
2376EXPORT_SYMBOL(tcp_disconnect); 2136EXPORT_SYMBOL(tcp_disconnect);
2377EXPORT_SYMBOL(tcp_getsockopt); 2137EXPORT_SYMBOL(tcp_getsockopt);
2378EXPORT_SYMBOL(tcp_ioctl); 2138EXPORT_SYMBOL(tcp_ioctl);
@@ -2384,4 +2144,3 @@ EXPORT_SYMBOL(tcp_sendpage);
2384EXPORT_SYMBOL(tcp_setsockopt); 2144EXPORT_SYMBOL(tcp_setsockopt);
2385EXPORT_SYMBOL(tcp_shutdown); 2145EXPORT_SYMBOL(tcp_shutdown);
2386EXPORT_SYMBOL(tcp_statistics); 2146EXPORT_SYMBOL(tcp_statistics);
2387EXPORT_SYMBOL(tcp_timewait_cachep);
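
Taken together, the net/ipv4/tcp.c changes drop the TCP-private
tcp_tw_bucket cache and the tcp_accept/tcp_destroy_sock exports because
accept-queue and TIME-WAIT handling now live in the generic
inet_connection_sock and inet_timewait layers, with all lookup state
gathered into tcp_hashinfo. The inet_csk()/tcp_sk() casts this relies on
are plain first-member layering; a toy user-space model (the struct
contents here are invented for illustration -- the real definitions live
in the inet headers):

    #include <stdio.h>

    /* Toy layering: each struct embeds the previous one first, so a
     * pointer to the outer object is also a valid pointer to every
     * inner layer. */
    struct sock { int sk_state; };
    struct inet_sock { struct sock sk; unsigned short sport; };
    struct inet_connection_sock {
            struct inet_sock icsk_inet;
            unsigned char    icsk_retransmits;
            unsigned long    icsk_ca_priv[16];  /* per-CA scratch area */
    };
    struct tcp_sock { struct inet_connection_sock inet_conn; int snd_cwnd; };

    static struct inet_connection_sock *inet_csk(struct sock *sk)
    {
            return (struct inet_connection_sock *)sk;
    }

    static void *inet_csk_ca(struct sock *sk)
    {
            return inet_csk(sk)->icsk_ca_priv;
    }

    int main(void)
    {
            static struct tcp_sock tp;
            struct sock *sk = (struct sock *)&tp;

            inet_csk(sk)->icsk_retransmits = 3;
            printf("%d %p\n", tp.inet_conn.icsk_retransmits, inet_csk_ca(sk));
            return 0;
    }
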
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ec38d45d6649..b940346de4e7 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca)
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT; 86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87} 87}
88 88
89static void bictcp_init(struct tcp_sock *tp) 89static void bictcp_init(struct sock *sk)
90{ 90{
91 bictcp_reset(tcp_ca(tp)); 91 bictcp_reset(inet_csk_ca(sk));
92 if (initial_ssthresh) 92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh; 93 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
94} 94}
95 95
96/* 96/*
@@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
156 156
157 157
158/* Detect low utilization in congestion avoidance */ 158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) 159static inline void bictcp_low_utilization(struct sock *sk, int flag)
160{ 160{
161 struct bictcp *ca = tcp_ca(tp); 161 const struct tcp_sock *tp = tcp_sk(sk);
162 struct bictcp *ca = inet_csk_ca(sk);
162 u32 dist, delay; 163 u32 dist, delay;
163 164
164 /* No time stamp */ 165 /* No time stamp */
@@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
208 209
209} 210}
210 211
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, 212static void bictcp_cong_avoid(struct sock *sk, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked) 213 u32 seq_rtt, u32 in_flight, int data_acked)
213{ 214{
214 struct bictcp *ca = tcp_ca(tp); 215 struct tcp_sock *tp = tcp_sk(sk);
216 struct bictcp *ca = inet_csk_ca(sk);
215 217
216 bictcp_low_utilization(tp, data_acked); 218 bictcp_low_utilization(sk, data_acked);
217 219
218 if (in_flight < tp->snd_cwnd) 220 if (in_flight < tp->snd_cwnd)
219 return; 221 return;
@@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
242 * behave like Reno until low_window is reached, 244 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly 245 * then increase congestion window slowly
244 */ 246 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) 247static u32 bictcp_recalc_ssthresh(struct sock *sk)
246{ 248{
247 struct bictcp *ca = tcp_ca(tp); 249 const struct tcp_sock *tp = tcp_sk(sk);
250 struct bictcp *ca = inet_csk_ca(sk);
248 251
249 ca->epoch_start = 0; /* end of epoch */ 252 ca->epoch_start = 0; /* end of epoch */
250 253
@@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); 272 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270} 273}
271 274
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp) 275static u32 bictcp_undo_cwnd(struct sock *sk)
273{ 276{
274 struct bictcp *ca = tcp_ca(tp); 277 const struct tcp_sock *tp = tcp_sk(sk);
275 278 const struct bictcp *ca = inet_csk_ca(sk);
276 return max(tp->snd_cwnd, ca->last_max_cwnd); 279 return max(tp->snd_cwnd, ca->last_max_cwnd);
277} 280}
278 281
279static u32 bictcp_min_cwnd(struct tcp_sock *tp) 282static u32 bictcp_min_cwnd(struct sock *sk)
280{ 283{
284 const struct tcp_sock *tp = tcp_sk(sk);
281 return tp->snd_ssthresh; 285 return tp->snd_ssthresh;
282} 286}
283 287
284static void bictcp_state(struct tcp_sock *tp, u8 new_state) 288static void bictcp_state(struct sock *sk, u8 new_state)
285{ 289{
286 if (new_state == TCP_CA_Loss) 290 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp)); 291 bictcp_reset(inet_csk_ca(sk));
288} 292}
289 293
290/* Track delayed acknowledgement ratio using sliding window 294/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16 295 * ratio = (15*ratio + sample) / 16
292 */ 296 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt) 297static void bictcp_acked(struct sock *sk, u32 cnt)
294{ 298{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) { 299 const struct inet_connection_sock *icsk = inet_csk(sk);
296 struct bictcp *ca = tcp_ca(tp); 300
301 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
302 struct bictcp *ca = inet_csk_ca(sk);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 303 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt; 304 ca->delayed_ack += cnt;
299 } 305 }
@@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = {
314 320
315static int __init bictcp_register(void) 321static int __init bictcp_register(void)
316{ 322{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); 323 BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp); 324 return tcp_register_congestion_control(&bictcp);
319} 325}
320 326
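
The tcp_bic.c conversion is mechanical but sets the template: every hook
now receives a struct sock * and digs out the tcp_sock and its private
area itself, and the registration guard switches from TCP_CA_PRIV_SIZE to
ICSK_CA_PRIV_SIZE because the scratch space moved into the connection
sock. The resulting hook shape, as a declaration-only sketch with
stand-in typedefs (the congestion modules below get the identical
treatment):

    typedef unsigned int  u32;
    typedef unsigned char u8;
    struct sock;

    /* Post-conversion congestion-control hooks: all take struct sock *,
     * where they used to take struct tcp_sock *. */
    struct cc_ops_sketch {
            void (*init)(struct sock *sk);
            u32  (*ssthresh)(struct sock *sk);
            u32  (*min_cwnd)(struct sock *sk);
            u32  (*undo_cwnd)(struct sock *sk);
            void (*cong_avoid)(struct sock *sk, u32 ack, u32 rtt,
                               u32 in_flight, int data_acked);
            void (*set_state)(struct sock *sk, u8 new_state);
            void (*pkts_acked)(struct sock *sk, u32 cnt);
            const char *name;
    };
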
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4970d10a7785..bbf2d6624e89 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); 73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74 74
75/* Assign choice of congestion control. */ 75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp) 76void tcp_init_congestion_control(struct sock *sk)
77{ 77{
78 struct inet_connection_sock *icsk = inet_csk(sk);
78 struct tcp_congestion_ops *ca; 79 struct tcp_congestion_ops *ca;
79 80
80 if (tp->ca_ops != &tcp_init_congestion_ops) 81 if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
81 return; 82 return;
82 83
83 rcu_read_lock(); 84 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 85 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) { 86 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca; 87 icsk->icsk_ca_ops = ca;
87 break; 88 break;
88 } 89 }
89 90
90 } 91 }
91 rcu_read_unlock(); 92 rcu_read_unlock();
92 93
93 if (tp->ca_ops->init) 94 if (icsk->icsk_ca_ops->init)
94 tp->ca_ops->init(tp); 95 icsk->icsk_ca_ops->init(sk);
95} 96}
96 97
97/* Manage refcounts on socket close. */ 98/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp) 99void tcp_cleanup_congestion_control(struct sock *sk)
99{ 100{
100 if (tp->ca_ops->release) 101 struct inet_connection_sock *icsk = inet_csk(sk);
101 tp->ca_ops->release(tp); 102
102 module_put(tp->ca_ops->owner); 103 if (icsk->icsk_ca_ops->release)
104 icsk->icsk_ca_ops->release(sk);
105 module_put(icsk->icsk_ca_ops->owner);
103} 106}
104 107
105/* Used by sysctl to change default congestion control */ 108/* Used by sysctl to change default congestion control */
@@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name)
143} 146}
144 147
145/* Change congestion control for socket */ 148/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) 149int tcp_set_congestion_control(struct sock *sk, const char *name)
147{ 150{
151 struct inet_connection_sock *icsk = inet_csk(sk);
148 struct tcp_congestion_ops *ca; 152 struct tcp_congestion_ops *ca;
149 int err = 0; 153 int err = 0;
150 154
151 rcu_read_lock(); 155 rcu_read_lock();
152 ca = tcp_ca_find(name); 156 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops) 157 if (ca == icsk->icsk_ca_ops)
154 goto out; 158 goto out;
155 159
156 if (!ca) 160 if (!ca)
@@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
160 err = -EBUSY; 164 err = -EBUSY;
161 165
162 else { 166 else {
163 tcp_cleanup_congestion_control(tp); 167 tcp_cleanup_congestion_control(sk);
164 tp->ca_ops = ca; 168 icsk->icsk_ca_ops = ca;
165 if (tp->ca_ops->init) 169 if (icsk->icsk_ca_ops->init)
166 tp->ca_ops->init(tp); 170 icsk->icsk_ca_ops->init(sk);
167 } 171 }
168 out: 172 out:
169 rcu_read_unlock(); 173 rcu_read_unlock();
@@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
177/* This is Jacobson's slow start and congestion avoidance. 181/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328. 182 * SIGCOMM '88, p. 328.
179 */ 183 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, 184void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
181 int flag) 185 int flag)
182{ 186{
187 struct tcp_sock *tp = tcp_sk(sk);
188
183 if (in_flight < tp->snd_cwnd) 189 if (in_flight < tp->snd_cwnd)
184 return; 190 return;
185 191
@@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 208EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
203 209
204/* Slow start threshold is half the congestion window (min 2) */ 210/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp) 211u32 tcp_reno_ssthresh(struct sock *sk)
206{ 212{
213 const struct tcp_sock *tp = tcp_sk(sk);
207 return max(tp->snd_cwnd >> 1U, 2U); 214 return max(tp->snd_cwnd >> 1U, 2U);
208} 215}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); 216EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210 217
211/* Lower bound on congestion window. */ 218/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp) 219u32 tcp_reno_min_cwnd(struct sock *sk)
213{ 220{
221 const struct tcp_sock *tp = tcp_sk(sk);
214 return tp->snd_ssthresh/2; 222 return tp->snd_ssthresh/2;
215} 223}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); 224EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
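
tcp_cong.c keeps its lifecycle intact -- look the ops up under
rcu_read_lock(), release the old module's reference through
tcp_cleanup_congestion_control(), install and init the new one -- merely
keyed off the icsk now. The exported Reno fallbacks are one-liners of
integer arithmetic; a stand-alone restatement:

    #include <stdio.h>

    /* Slow-start threshold: half the congestion window, minimum 2. */
    static unsigned reno_ssthresh(unsigned snd_cwnd)
    {
            unsigned half = snd_cwnd >> 1;
            return half > 2 ? half : 2;
    }

    /* Lower bound on the congestion window after loss. */
    static unsigned reno_min_cwnd(unsigned snd_ssthresh)
    {
            return snd_ssthresh / 2;
    }

    int main(void)
    {
            printf("cwnd 10 -> ssthresh %u, min_cwnd %u\n",
                   reno_ssthresh(10), reno_min_cwnd(reno_ssthresh(10)));
            return 0;
    }
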
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index f66945cb158f..c148c1081880 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * tcp_diag.c Module for monitoring TCP sockets. 2 * tcp_diag.c Module for monitoring TCP transport protocols sockets.
3 * 3 *
4 * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ 4 * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
5 * 5 *
@@ -12,779 +12,43 @@
12 */ 12 */
13 13
14#include <linux/config.h> 14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/fcntl.h>
18#include <linux/random.h>
19#include <linux/cache.h>
20#include <linux/init.h>
21#include <linux/time.h>
22
23#include <net/icmp.h>
24#include <net/tcp.h>
25#include <net/ipv6.h>
26#include <net/inet_common.h>
27
28#include <linux/inet.h>
29#include <linux/stddef.h>
30
31#include <linux/tcp_diag.h>
32 15
33struct tcpdiag_entry 16#include <linux/module.h>
34{ 17#include <linux/inet_diag.h>
35 u32 *saddr;
36 u32 *daddr;
37 u16 sport;
38 u16 dport;
39 u16 family;
40 u16 userlocks;
41};
42 18
43static struct sock *tcpnl; 19#include <linux/tcp.h>
44 20
45#define TCPDIAG_PUT(skb, attrtype, attrlen) \ 21#include <net/tcp.h>
46 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
47 22
48static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, 23static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
49 int ext, u32 pid, u32 seq, u16 nlmsg_flags) 24 void *_info)
50{ 25{
51 struct inet_sock *inet = inet_sk(sk); 26 const struct tcp_sock *tp = tcp_sk(sk);
52 struct tcp_sock *tp = tcp_sk(sk); 27 struct tcp_info *info = _info;
53 struct tcpdiagmsg *r;
54 struct nlmsghdr *nlh;
55 struct tcp_info *info = NULL;
56 struct tcpdiag_meminfo *minfo = NULL;
57 unsigned char *b = skb->tail;
58
59 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
60 nlh->nlmsg_flags = nlmsg_flags;
61 r = NLMSG_DATA(nlh);
62 if (sk->sk_state != TCP_TIME_WAIT) {
63 if (ext & (1<<(TCPDIAG_MEMINFO-1)))
64 minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
65 if (ext & (1<<(TCPDIAG_INFO-1)))
66 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
67
68 if (ext & (1<<(TCPDIAG_CONG-1))) {
69 size_t len = strlen(tp->ca_ops->name);
70 strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
71 tp->ca_ops->name);
72 }
73 }
74 r->tcpdiag_family = sk->sk_family;
75 r->tcpdiag_state = sk->sk_state;
76 r->tcpdiag_timer = 0;
77 r->tcpdiag_retrans = 0;
78
79 r->id.tcpdiag_if = sk->sk_bound_dev_if;
80 r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
81 r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
82
83 if (r->tcpdiag_state == TCP_TIME_WAIT) {
84 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
85 long tmo = tw->tw_ttd - jiffies;
86 if (tmo < 0)
87 tmo = 0;
88
89 r->id.tcpdiag_sport = tw->tw_sport;
90 r->id.tcpdiag_dport = tw->tw_dport;
91 r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
92 r->id.tcpdiag_dst[0] = tw->tw_daddr;
93 r->tcpdiag_state = tw->tw_substate;
94 r->tcpdiag_timer = 3;
95 r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
96 r->tcpdiag_rqueue = 0;
97 r->tcpdiag_wqueue = 0;
98 r->tcpdiag_uid = 0;
99 r->tcpdiag_inode = 0;
100#ifdef CONFIG_IP_TCPDIAG_IPV6
101 if (r->tcpdiag_family == AF_INET6) {
102 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
103 &tw->tw_v6_rcv_saddr);
104 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
105 &tw->tw_v6_daddr);
106 }
107#endif
108 nlh->nlmsg_len = skb->tail - b;
109 return skb->len;
110 }
111
112 r->id.tcpdiag_sport = inet->sport;
113 r->id.tcpdiag_dport = inet->dport;
114 r->id.tcpdiag_src[0] = inet->rcv_saddr;
115 r->id.tcpdiag_dst[0] = inet->daddr;
116
117#ifdef CONFIG_IP_TCPDIAG_IPV6
118 if (r->tcpdiag_family == AF_INET6) {
119 struct ipv6_pinfo *np = inet6_sk(sk);
120
121 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
122 &np->rcv_saddr);
123 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
124 &np->daddr);
125 }
126#endif
127
128#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
129
130 if (tp->pending == TCP_TIME_RETRANS) {
131 r->tcpdiag_timer = 1;
132 r->tcpdiag_retrans = tp->retransmits;
133 r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
134 } else if (tp->pending == TCP_TIME_PROBE0) {
135 r->tcpdiag_timer = 4;
136 r->tcpdiag_retrans = tp->probes_out;
137 r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
138 } else if (timer_pending(&sk->sk_timer)) {
139 r->tcpdiag_timer = 2;
140 r->tcpdiag_retrans = tp->probes_out;
141 r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
142 } else {
143 r->tcpdiag_timer = 0;
144 r->tcpdiag_expires = 0;
145 }
146#undef EXPIRES_IN_MS
147 28
148 r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; 29 r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
149 r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; 30 r->idiag_wqueue = tp->write_seq - tp->snd_una;
150 r->tcpdiag_uid = sock_i_uid(sk); 31 if (info != NULL)
151 r->tcpdiag_inode = sock_i_ino(sk);
152
153 if (minfo) {
154 minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
155 minfo->tcpdiag_wmem = sk->sk_wmem_queued;
156 minfo->tcpdiag_fmem = sk->sk_forward_alloc;
157 minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
158 }
159
160 if (info)
161 tcp_get_info(sk, info); 32 tcp_get_info(sk, info);
162
163 if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
164 tp->ca_ops->get_info(tp, ext, skb);
165
166 nlh->nlmsg_len = skb->tail - b;
167 return skb->len;
168
169rtattr_failure:
170nlmsg_failure:
171 skb_trim(skb, b - skb->data);
172 return -1;
173}
174
175extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
176 int dif);
177#ifdef CONFIG_IP_TCPDIAG_IPV6
178extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
179 struct in6_addr *daddr, u16 dport,
180 int dif);
181#else
182static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
183 struct in6_addr *daddr, u16 dport,
184 int dif)
185{
186 return NULL;
187}
188#endif
189
190static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
191{
192 int err;
193 struct sock *sk;
194 struct tcpdiagreq *req = NLMSG_DATA(nlh);
195 struct sk_buff *rep;
196
197 if (req->tcpdiag_family == AF_INET) {
198 sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
199 req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
200 req->id.tcpdiag_if);
201 }
202#ifdef CONFIG_IP_TCPDIAG_IPV6
203 else if (req->tcpdiag_family == AF_INET6) {
204 sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
205 (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
206 req->id.tcpdiag_if);
207 }
208#endif
209 else {
210 return -EINVAL;
211 }
212
213 if (sk == NULL)
214 return -ENOENT;
215
216 err = -ESTALE;
217 if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
218 req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
219 ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
220 (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
221 goto out;
222
223 err = -ENOMEM;
224 rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
225 sizeof(struct tcpdiag_meminfo)+
226 sizeof(struct tcp_info)+64), GFP_KERNEL);
227 if (!rep)
228 goto out;
229
230 if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
231 NETLINK_CB(in_skb).pid,
232 nlh->nlmsg_seq, 0) <= 0)
233 BUG();
234
235 err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
236 if (err > 0)
237 err = 0;
238
239out:
240 if (sk) {
241 if (sk->sk_state == TCP_TIME_WAIT)
242 tcp_tw_put((struct tcp_tw_bucket*)sk);
243 else
244 sock_put(sk);
245 }
246 return err;
247}
248
249static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
250{
251 int words = bits >> 5;
252
253 bits &= 0x1f;
254
255 if (words) {
256 if (memcmp(a1, a2, words << 2))
257 return 0;
258 }
259 if (bits) {
260 __u32 w1, w2;
261 __u32 mask;
262
263 w1 = a1[words];
264 w2 = a2[words];
265
266 mask = htonl((0xffffffff) << (32 - bits));
267
268 if ((w1 ^ w2) & mask)
269 return 0;
270 }
271
272 return 1;
273}
274
275
276static int tcpdiag_bc_run(const void *bc, int len,
277 const struct tcpdiag_entry *entry)
278{
279 while (len > 0) {
280 int yes = 1;
281 const struct tcpdiag_bc_op *op = bc;
282
283 switch (op->code) {
284 case TCPDIAG_BC_NOP:
285 break;
286 case TCPDIAG_BC_JMP:
287 yes = 0;
288 break;
289 case TCPDIAG_BC_S_GE:
290 yes = entry->sport >= op[1].no;
291 break;
292 case TCPDIAG_BC_S_LE:
293 yes = entry->dport <= op[1].no;
294 break;
295 case TCPDIAG_BC_D_GE:
296 yes = entry->dport >= op[1].no;
297 break;
298 case TCPDIAG_BC_D_LE:
299 yes = entry->dport <= op[1].no;
300 break;
301 case TCPDIAG_BC_AUTO:
302 yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
303 break;
304 case TCPDIAG_BC_S_COND:
305 case TCPDIAG_BC_D_COND:
306 {
307 struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
308 u32 *addr;
309
310 if (cond->port != -1 &&
311 cond->port != (op->code == TCPDIAG_BC_S_COND ?
312 entry->sport : entry->dport)) {
313 yes = 0;
314 break;
315 }
316
317 if (cond->prefix_len == 0)
318 break;
319
320 if (op->code == TCPDIAG_BC_S_COND)
321 addr = entry->saddr;
322 else
323 addr = entry->daddr;
324
325 if (bitstring_match(addr, cond->addr, cond->prefix_len))
326 break;
327 if (entry->family == AF_INET6 &&
328 cond->family == AF_INET) {
329 if (addr[0] == 0 && addr[1] == 0 &&
330 addr[2] == htonl(0xffff) &&
331 bitstring_match(addr+3, cond->addr, cond->prefix_len))
332 break;
333 }
334 yes = 0;
335 break;
336 }
337 }
338
339 if (yes) {
340 len -= op->yes;
341 bc += op->yes;
342 } else {
343 len -= op->no;
344 bc += op->no;
345 }
346 }
347 return (len == 0);
348}
349
350static int valid_cc(const void *bc, int len, int cc)
351{
352 while (len >= 0) {
353 const struct tcpdiag_bc_op *op = bc;
354
355 if (cc > len)
356 return 0;
357 if (cc == len)
358 return 1;
359 if (op->yes < 4)
360 return 0;
361 len -= op->yes;
362 bc += op->yes;
363 }
364 return 0;
365}
366
367static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
368{
369 const unsigned char *bc = bytecode;
370 int len = bytecode_len;
371
372 while (len > 0) {
373 struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
374
375//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
376 switch (op->code) {
377 case TCPDIAG_BC_AUTO:
378 case TCPDIAG_BC_S_COND:
379 case TCPDIAG_BC_D_COND:
380 case TCPDIAG_BC_S_GE:
381 case TCPDIAG_BC_S_LE:
382 case TCPDIAG_BC_D_GE:
383 case TCPDIAG_BC_D_LE:
384 if (op->yes < 4 || op->yes > len+4)
385 return -EINVAL;
386 case TCPDIAG_BC_JMP:
387 if (op->no < 4 || op->no > len+4)
388 return -EINVAL;
389 if (op->no < len &&
390 !valid_cc(bytecode, bytecode_len, len-op->no))
391 return -EINVAL;
392 break;
393 case TCPDIAG_BC_NOP:
394 if (op->yes < 4 || op->yes > len+4)
395 return -EINVAL;
396 break;
397 default:
398 return -EINVAL;
399 }
400 bc += op->yes;
401 len -= op->yes;
402 }
403 return len == 0 ? 0 : -EINVAL;
404}
405
406static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
407 struct netlink_callback *cb)
408{
409 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
410
411 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
412 struct tcpdiag_entry entry;
413 struct rtattr *bc = (struct rtattr *)(r + 1);
414 struct inet_sock *inet = inet_sk(sk);
415
416 entry.family = sk->sk_family;
417#ifdef CONFIG_IP_TCPDIAG_IPV6
418 if (entry.family == AF_INET6) {
419 struct ipv6_pinfo *np = inet6_sk(sk);
420
421 entry.saddr = np->rcv_saddr.s6_addr32;
422 entry.daddr = np->daddr.s6_addr32;
423 } else
424#endif
425 {
426 entry.saddr = &inet->rcv_saddr;
427 entry.daddr = &inet->daddr;
428 }
429 entry.sport = inet->num;
430 entry.dport = ntohs(inet->dport);
431 entry.userlocks = sk->sk_userlocks;
432
433 if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
434 return 0;
435 }
436
437 return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
438 cb->nlh->nlmsg_seq, NLM_F_MULTI);
439} 33}
440 34
441static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, 35static struct inet_diag_handler tcp_diag_handler = {
442 struct request_sock *req, 36 .idiag_hashinfo = &tcp_hashinfo,
443 u32 pid, u32 seq) 37 .idiag_get_info = tcp_diag_get_info,
444{ 38 .idiag_type = TCPDIAG_GETSOCK,
445 const struct inet_request_sock *ireq = inet_rsk(req); 39 .idiag_info_size = sizeof(struct tcp_info),
446 struct inet_sock *inet = inet_sk(sk); 40};
447 unsigned char *b = skb->tail;
448 struct tcpdiagmsg *r;
449 struct nlmsghdr *nlh;
450 long tmo;
451
452 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
453 nlh->nlmsg_flags = NLM_F_MULTI;
454 r = NLMSG_DATA(nlh);
455
456 r->tcpdiag_family = sk->sk_family;
457 r->tcpdiag_state = TCP_SYN_RECV;
458 r->tcpdiag_timer = 1;
459 r->tcpdiag_retrans = req->retrans;
460
461 r->id.tcpdiag_if = sk->sk_bound_dev_if;
462 r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
463 r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
464
465 tmo = req->expires - jiffies;
466 if (tmo < 0)
467 tmo = 0;
468
469 r->id.tcpdiag_sport = inet->sport;
470 r->id.tcpdiag_dport = ireq->rmt_port;
471 r->id.tcpdiag_src[0] = ireq->loc_addr;
472 r->id.tcpdiag_dst[0] = ireq->rmt_addr;
473 r->tcpdiag_expires = jiffies_to_msecs(tmo),
474 r->tcpdiag_rqueue = 0;
475 r->tcpdiag_wqueue = 0;
476 r->tcpdiag_uid = sock_i_uid(sk);
477 r->tcpdiag_inode = 0;
478#ifdef CONFIG_IP_TCPDIAG_IPV6
479 if (r->tcpdiag_family == AF_INET6) {
480 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
481 &tcp6_rsk(req)->loc_addr);
482 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
483 &tcp6_rsk(req)->rmt_addr);
484 }
485#endif
486 nlh->nlmsg_len = skb->tail - b;
487
488 return skb->len;
489
490nlmsg_failure:
491 skb_trim(skb, b - skb->data);
492 return -1;
493}
494
495static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
496 struct netlink_callback *cb)
497{
498 struct tcpdiag_entry entry;
499 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
500 struct tcp_sock *tp = tcp_sk(sk);
501 struct listen_sock *lopt;
502 struct rtattr *bc = NULL;
503 struct inet_sock *inet = inet_sk(sk);
504 int j, s_j;
505 int reqnum, s_reqnum;
506 int err = 0;
507
508 s_j = cb->args[3];
509 s_reqnum = cb->args[4];
510
511 if (s_j > 0)
512 s_j--;
513
514 entry.family = sk->sk_family;
515
516 read_lock_bh(&tp->accept_queue.syn_wait_lock);
517
518 lopt = tp->accept_queue.listen_opt;
519 if (!lopt || !lopt->qlen)
520 goto out;
521
522 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
523 bc = (struct rtattr *)(r + 1);
524 entry.sport = inet->num;
525 entry.userlocks = sk->sk_userlocks;
526 }
527
528 for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
529 struct request_sock *req, *head = lopt->syn_table[j];
530
531 reqnum = 0;
532 for (req = head; req; reqnum++, req = req->dl_next) {
533 struct inet_request_sock *ireq = inet_rsk(req);
534
535 if (reqnum < s_reqnum)
536 continue;
537 if (r->id.tcpdiag_dport != ireq->rmt_port &&
538 r->id.tcpdiag_dport)
539 continue;
540
541 if (bc) {
542 entry.saddr =
543#ifdef CONFIG_IP_TCPDIAG_IPV6
544 (entry.family == AF_INET6) ?
545 tcp6_rsk(req)->loc_addr.s6_addr32 :
546#endif
547 &ireq->loc_addr;
548 entry.daddr =
549#ifdef CONFIG_IP_TCPDIAG_IPV6
550 (entry.family == AF_INET6) ?
551 tcp6_rsk(req)->rmt_addr.s6_addr32 :
552#endif
553 &ireq->rmt_addr;
554 entry.dport = ntohs(ireq->rmt_port);
555
556 if (!tcpdiag_bc_run(RTA_DATA(bc),
557 RTA_PAYLOAD(bc), &entry))
558 continue;
559 }
560
561 err = tcpdiag_fill_req(skb, sk, req,
562 NETLINK_CB(cb->skb).pid,
563 cb->nlh->nlmsg_seq);
564 if (err < 0) {
565 cb->args[3] = j + 1;
566 cb->args[4] = reqnum;
567 goto out;
568 }
569 }
570
571 s_reqnum = 0;
572 }
573
574out:
575 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
576
577 return err;
578}
579
580static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
581{
582 int i, num;
583 int s_i, s_num;
584 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
585
586 s_i = cb->args[1];
587 s_num = num = cb->args[2];
588
589 if (cb->args[0] == 0) {
590 if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
591 goto skip_listen_ht;
592 tcp_listen_lock();
593 for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
594 struct sock *sk;
595 struct hlist_node *node;
596
597 num = 0;
598 sk_for_each(sk, node, &tcp_listening_hash[i]) {
599 struct inet_sock *inet = inet_sk(sk);
600
601 if (num < s_num) {
602 num++;
603 continue;
604 }
605
606 if (r->id.tcpdiag_sport != inet->sport &&
607 r->id.tcpdiag_sport)
608 goto next_listen;
609
610 if (!(r->tcpdiag_states&TCPF_LISTEN) ||
611 r->id.tcpdiag_dport ||
612 cb->args[3] > 0)
613 goto syn_recv;
614
615 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
616 tcp_listen_unlock();
617 goto done;
618 }
619
620syn_recv:
621 if (!(r->tcpdiag_states&TCPF_SYN_RECV))
622 goto next_listen;
623
624 if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
625 tcp_listen_unlock();
626 goto done;
627 }
628
629next_listen:
630 cb->args[3] = 0;
631 cb->args[4] = 0;
632 ++num;
633 }
634
635 s_num = 0;
636 cb->args[3] = 0;
637 cb->args[4] = 0;
638 }
639 tcp_listen_unlock();
640skip_listen_ht:
641 cb->args[0] = 1;
642 s_i = num = s_num = 0;
643 }
644
645 if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
646 return skb->len;
647
648 for (i = s_i; i < tcp_ehash_size; i++) {
649 struct tcp_ehash_bucket *head = &tcp_ehash[i];
650 struct sock *sk;
651 struct hlist_node *node;
652
653 if (i > s_i)
654 s_num = 0;
655
656 read_lock_bh(&head->lock);
657
658 num = 0;
659 sk_for_each(sk, node, &head->chain) {
660 struct inet_sock *inet = inet_sk(sk);
661
662 if (num < s_num)
663 goto next_normal;
664 if (!(r->tcpdiag_states & (1 << sk->sk_state)))
665 goto next_normal;
666 if (r->id.tcpdiag_sport != inet->sport &&
667 r->id.tcpdiag_sport)
668 goto next_normal;
669 if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
670 goto next_normal;
671 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
672 read_unlock_bh(&head->lock);
673 goto done;
674 }
675next_normal:
676 ++num;
677 }
678
679 if (r->tcpdiag_states&TCPF_TIME_WAIT) {
680 sk_for_each(sk, node,
681 &tcp_ehash[i + tcp_ehash_size].chain) {
682 struct inet_sock *inet = inet_sk(sk);
683
684 if (num < s_num)
685 goto next_dying;
686 if (r->id.tcpdiag_sport != inet->sport &&
687 r->id.tcpdiag_sport)
688 goto next_dying;
689 if (r->id.tcpdiag_dport != inet->dport &&
690 r->id.tcpdiag_dport)
691 goto next_dying;
692 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
693 read_unlock_bh(&head->lock);
694 goto done;
695 }
696next_dying:
697 ++num;
698 }
699 }
700 read_unlock_bh(&head->lock);
701 }
702
703done:
704 cb->args[1] = i;
705 cb->args[2] = num;
706 return skb->len;
707}
708
709static int tcpdiag_dump_done(struct netlink_callback *cb)
710{
711 return 0;
712}
713
714
715static __inline__ int
716tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
717{
718 if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
719 return 0;
720
721 if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
722 goto err_inval;
723
724 if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
725 goto err_inval;
726
727 if (nlh->nlmsg_flags&NLM_F_DUMP) {
728 if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
729 struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
730 if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
731 rta->rta_len < 8 ||
732 rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
733 goto err_inval;
734 if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
735 goto err_inval;
736 }
737 return netlink_dump_start(tcpnl, skb, nlh,
738 tcpdiag_dump,
739 tcpdiag_dump_done);
740 } else {
741 return tcpdiag_get_exact(skb, nlh);
742 }
743
744err_inval:
745 return -EINVAL;
746}
747
748
749static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
750{
751 int err;
752 struct nlmsghdr * nlh;
753
754 if (skb->len >= NLMSG_SPACE(0)) {
755 nlh = (struct nlmsghdr *)skb->data;
756 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
757 return;
758 err = tcpdiag_rcv_msg(skb, nlh);
759 if (err || nlh->nlmsg_flags & NLM_F_ACK)
760 netlink_ack(skb, nlh, err);
761 }
762}
763
764static void tcpdiag_rcv(struct sock *sk, int len)
765{
766 struct sk_buff *skb;
767 unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
768
769 while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
770 tcpdiag_rcv_skb(skb);
771 kfree_skb(skb);
772 }
773}
774 41
775static int __init tcpdiag_init(void) 42static int __init tcp_diag_init(void)
776{ 43{
777 tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); 44 return inet_diag_register(&tcp_diag_handler);
778 if (tcpnl == NULL)
779 return -ENOMEM;
780 return 0;
781} 45}
782 46
783static void __exit tcpdiag_exit(void) 47static void __exit tcp_diag_exit(void)
784{ 48{
785 sock_release(tcpnl->sk_socket); 49 inet_diag_unregister(&tcp_diag_handler);
786} 50}
787 51
788module_init(tcpdiag_init); 52module_init(tcp_diag_init);
789module_exit(tcpdiag_exit); 53module_exit(tcp_diag_exit);
790MODULE_LICENSE("GPL"); 54MODULE_LICENSE("GPL");
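
The tcp_diag.c diff is the centrepiece of the series: roughly 750 lines
of netlink dump, request-socket walk and bytecode-filter machinery leave
this file for the new generic inet_diag module, and what remains is a
handler contributing only TCP's queue accounting plus tcp_get_info().
Another transport would plug in the same way; a sketch of the pattern --
the "mydiag" names are hypothetical, and as kernel code the fragment
only builds inside the tree:

    static void mydiag_get_info(struct sock *sk, struct inet_diag_msg *r,
                                void *_info)
    {
            /* fill r->idiag_rqueue, r->idiag_wqueue and *_info here */
    }

    static struct inet_diag_handler mydiag_handler = {
            .idiag_hashinfo  = &tcp_hashinfo,   /* transport's hash tables */
            .idiag_get_info  = mydiag_get_info,
            .idiag_type      = TCPDIAG_GETSOCK, /* netlink message type */
            .idiag_info_size = sizeof(struct tcp_info),
    };

    static int __init mydiag_init(void)
    {
            return inet_diag_register(&mydiag_handler);
    }

    static void __exit mydiag_exit(void)
    {
            inet_diag_unregister(&mydiag_handler);
    }
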
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 36c51f8136bf..6acc04bde080 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,9 +98,10 @@ struct hstcp {
98 u32 ai; 98 u32 ai;
99}; 99};
100 100
101static void hstcp_init(struct tcp_sock *tp) 101static void hstcp_init(struct sock *sk)
102{ 102{
103 struct hstcp *ca = tcp_ca(tp); 103 struct tcp_sock *tp = tcp_sk(sk);
104 struct hstcp *ca = inet_csk_ca(sk);
104 105
105 ca->ai = 0; 106 ca->ai = 0;
106 107
@@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp)
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); 110 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110} 111}
111 112
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, 113static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
113 u32 in_flight, int good) 114 u32 in_flight, int good)
114{ 115{
115 struct hstcp *ca = tcp_ca(tp); 116 struct tcp_sock *tp = tcp_sk(sk);
117 struct hstcp *ca = inet_csk_ca(sk);
116 118
117 if (in_flight < tp->snd_cwnd) 119 if (in_flight < tp->snd_cwnd)
118 return; 120 return;
@@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
143 } 145 }
144} 146}
145 147
146static u32 hstcp_ssthresh(struct tcp_sock *tp) 148static u32 hstcp_ssthresh(struct sock *sk)
147{ 149{
148 struct hstcp *ca = tcp_ca(tp); 150 const struct tcp_sock *tp = tcp_sk(sk);
151 const struct hstcp *ca = inet_csk_ca(sk);
149 152
150 /* Do multiplicative decrease */ 153 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); 154 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
@@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = {
164 167
165static int __init hstcp_register(void) 168static int __init hstcp_register(void)
166{ 169{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); 170 BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed); 171 return tcp_register_congestion_control(&tcp_highspeed);
169} 172}
170 173
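
HighSpeed TCP's table-driven decrease survives the conversion untouched:
hstcp_ssthresh() cuts cwnd by hstcp_aimd_vals[ca->ai].md, a fraction kept
in 1/256 fixed point (hence the >> 8). A worked restatement with an
illustrative md value:

    #include <stdio.h>

    /* Multiplicative decrease with md as a 1/256 fixed-point fraction;
     * md_256 = 128 (a 50% cut) is illustrative, not a value taken from
     * hstcp_aimd_vals[]. */
    static unsigned hstcp_md(unsigned cwnd, unsigned md_256)
    {
            unsigned decreased = cwnd - ((cwnd * md_256) >> 8);
            return decreased > 2 ? decreased : 2;
    }

    int main(void)
    {
            printf("cwnd 100, md 128/256 -> %u\n", hstcp_md(100, 128));
            return 0;
    }
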
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 40168275acf9..e47b37984e95 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca)
55 ca->snd_cwnd_cnt2 = 0; 55 ca->snd_cwnd_cnt2 = 0;
56} 56}
57 57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp) 58static u32 htcp_cwnd_undo(struct sock *sk)
59{ 59{
60 struct htcp *ca = tcp_ca(tp); 60 const struct tcp_sock *tp = tcp_sk(sk);
61 struct htcp *ca = inet_csk_ca(sk);
61 ca->ccount = ca->undo_ccount; 62 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT; 63 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB; 64 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); 65 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65} 66}
66 67
67static inline void measure_rtt(struct tcp_sock *tp) 68static inline void measure_rtt(struct sock *sk)
68{ 69{
69 struct htcp *ca = tcp_ca(tp); 70 const struct inet_connection_sock *icsk = inet_csk(sk);
71 const struct tcp_sock *tp = tcp_sk(sk);
72 struct htcp *ca = inet_csk_ca(sk);
70 u32 srtt = tp->srtt>>3; 73 u32 srtt = tp->srtt>>3;
71 74
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */ 75 /* keep track of minimum RTT seen so far, minRTT is zero at first */
@@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp)
74 ca->minRTT = srtt; 77 ca->minRTT = srtt;
75 78
76 /* max RTT */ 79 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { 80 if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT) 81 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT; 82 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) 83 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
@@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp)
82 } 85 }
83} 86}
84 87
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) 88static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked)
86{ 89{
87 struct htcp *ca = tcp_ca(tp); 90 const struct inet_connection_sock *icsk = inet_csk(sk);
91 const struct tcp_sock *tp = tcp_sk(sk);
92 struct htcp *ca = inet_csk_ca(sk);
88 u32 now = tcp_time_stamp; 93 u32 now = tcp_time_stamp;
89 94
90 /* achieved throughput calculations */ 95 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { 96 if (icsk->icsk_ca_state != TCP_CA_Open &&
97 icsk->icsk_ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0; 98 ca->packetcount = 0;
93 ca->lasttime = now; 99 ca->lasttime = now;
94 return; 100 return;
@@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca)
173 * that point do we really have a real sense of maxRTT (the queues en route 179 * that point do we really have a real sense of maxRTT (the queues en route
174 * were getting just too full now). 180 * were getting just too full now).
175 */ 181 */
176static void htcp_param_update(struct tcp_sock *tp) 182static void htcp_param_update(struct sock *sk)
177{ 183{
178 struct htcp *ca = tcp_ca(tp); 184 struct htcp *ca = inet_csk_ca(sk);
179 u32 minRTT = ca->minRTT; 185 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT; 186 u32 maxRTT = ca->maxRTT;
181 187
@@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; 193 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188} 194}
189 195
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) 196static u32 htcp_recalc_ssthresh(struct sock *sk)
191{ 197{
192 struct htcp *ca = tcp_ca(tp); 198 const struct tcp_sock *tp = tcp_sk(sk);
193 htcp_param_update(tp); 199 const struct htcp *ca = inet_csk_ca(sk);
200 htcp_param_update(sk);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U); 201 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195} 202}
196 203
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, 204static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked) 205 u32 in_flight, int data_acked)
199{ 206{
200 struct htcp *ca = tcp_ca(tp); 207 struct tcp_sock *tp = tcp_sk(sk);
208 struct htcp *ca = inet_csk_ca(sk);
201 209
202 if (in_flight < tp->snd_cwnd) 210 if (in_flight < tp->snd_cwnd)
203 return; 211 return;
@@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 215 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++; 216 tp->snd_cwnd++;
209 } else { 217 } else {
210 measure_rtt(tp); 218 measure_rtt(sk);
211 219
212 /* keep track of number of round-trip times since last backoff event */ 220 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { 221 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
@@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
229} 237}
230 238
231/* Lower bound on congestion window. */ 239/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp) 240static u32 htcp_min_cwnd(struct sock *sk)
233{ 241{
242 const struct tcp_sock *tp = tcp_sk(sk);
234 return tp->snd_ssthresh; 243 return tp->snd_ssthresh;
235} 244}
236 245
237 246
238static void htcp_init(struct tcp_sock *tp) 247static void htcp_init(struct sock *sk)
239{ 248{
240 struct htcp *ca = tcp_ca(tp); 249 struct htcp *ca = inet_csk_ca(sk);
241 250
242 memset(ca, 0, sizeof(struct htcp)); 251 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE; 252 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN; 253 ca->beta = BETA_MIN;
245} 254}
246 255
247static void htcp_state(struct tcp_sock *tp, u8 new_state) 256static void htcp_state(struct sock *sk, u8 new_state)
248{ 257{
249 switch (new_state) { 258 switch (new_state) {
250 case TCP_CA_CWR: 259 case TCP_CA_CWR:
251 case TCP_CA_Recovery: 260 case TCP_CA_Recovery:
252 case TCP_CA_Loss: 261 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp)); 262 htcp_reset(inet_csk_ca(sk));
254 break; 263 break;
255 } 264 }
256} 265}
@@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = {
269 278
270static int __init htcp_register(void) 279static int __init htcp_register(void)
271{ 280{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); 281 BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX); 282 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch) 283 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL; 284 htcp.pkts_acked = NULL;
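
H-TCP keeps its beta in 1/128 fixed point (the << 7 / >> 7 pairs above):
htcp_recalc_ssthresh() scales cwnd by beta/128, and htcp_cwnd_undo()
inverts the same scaling from snd_ssthresh. A numeric sketch with an
illustrative beta of 102/128 (roughly 0.8):

    #include <stdio.h>

    /* ssthresh = max(cwnd * beta / 128, 2), beta in 1/128 fixed point. */
    static unsigned htcp_ssthresh(unsigned cwnd, unsigned beta_128)
    {
            unsigned s = (cwnd * beta_128) >> 7;
            return s > 2 ? s : 2;
    }

    int main(void)
    {
            printf("cwnd 100, beta 102/128 -> ssthresh %u\n",
                   htcp_ssthresh(100, 102));
            return 0;
    }
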
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 13a66342c304..77add63623df 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
33 33
34 34
35/* This is called to refresh values for hybla parameters */ 35/* This is called to refresh values for hybla parameters */
36static inline void hybla_recalc_param (struct tcp_sock *tp) 36static inline void hybla_recalc_param (struct sock *sk)
37{ 37{
38 struct hybla *ca = tcp_ca(tp); 38 struct hybla *ca = inet_csk_ca(sk);
39 39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); 40 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3; 41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; 42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
43 ca->rho2 = ca->rho2_7ls >>7; 43 ca->rho2 = ca->rho2_7ls >>7;
44} 44}
45 45
46static void hybla_init(struct tcp_sock *tp) 46static void hybla_init(struct sock *sk)
47{ 47{
48 struct hybla *ca = tcp_ca(tp); 48 struct tcp_sock *tp = tcp_sk(sk);
49 struct hybla *ca = inet_csk_ca(sk);
49 50
50 ca->rho = 0; 51 ca->rho = 0;
51 ca->rho2 = 0; 52 ca->rho2 = 0;
@@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp)
57 tp->snd_cwnd_clamp = 65535; 58 tp->snd_cwnd_clamp = 65535;
58 59
59 /* 1st Rho measurement based on initial srtt */ 60 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp); 61 hybla_recalc_param(sk);
61 62
62 /* set minimum rtt as this is the 1st ever seen */ 63 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt; 64 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho; 65 tp->snd_cwnd = ca->rho;
65} 66}
66 67
67static void hybla_state(struct tcp_sock *tp, u8 ca_state) 68static void hybla_state(struct sock *sk, u8 ca_state)
68{ 69{
69 struct hybla *ca = tcp_ca(tp); 70 struct hybla *ca = inet_csk_ca(sk);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open); 71 ca->hybla_en = (ca_state == TCP_CA_Open);
72} 72}
73 73
@@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds)
86 * o Give cwnd a new value based on the model proposed 86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1 87 * o remember increments <1
88 */ 88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, 89static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
90 u32 in_flight, int flag) 90 u32 in_flight, int flag)
91{ 91{
92 struct hybla *ca = tcp_ca(tp); 92 struct tcp_sock *tp = tcp_sk(sk);
93 struct hybla *ca = inet_csk_ca(sk);
93 u32 increment, odd, rho_fractions; 94 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0; 95 int is_slowstart = 0;
95 96
96 /* Recalculate rho only if this srtt is the lowest */ 97 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){ 98 if (tp->srtt < ca->minrtt){
98 hybla_recalc_param(tp); 99 hybla_recalc_param(sk);
99 ca->minrtt = tp->srtt; 100 ca->minrtt = tp->srtt;
100 } 101 }
101 102
102 if (!ca->hybla_en) 103 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); 104 return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
104 105
105 if (in_flight < tp->snd_cwnd) 106 if (in_flight < tp->snd_cwnd)
106 return; 107 return;
107 108
108 if (ca->rho == 0) 109 if (ca->rho == 0)
109 hybla_recalc_param(tp); 110 hybla_recalc_param(sk);
110 111
111 rho_fractions = ca->rho_3ls - (ca->rho << 3); 112 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112 113
@@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = {
170 171
171static int __init hybla_register(void) 172static int __init hybla_register(void)
172{ 173{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); 174 BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla); 175 return tcp_register_congestion_control(&tcp_hybla);
175} 176}
176 177
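
Hybla's rho -- the connection's smoothed RTT over the reference rtt0
module parameter (milliseconds) -- is held in 3-bit fixed point as
rho_3ls and floored at 8, i.e. rho >= 1, so short-RTT flows fall back to
Reno-like growth. A stand-alone sketch of the computation (the jiffies
conversion is elided, inputs are in milliseconds, and the 25 ms rtt0 is
illustrative):

    #include <stdio.h>

    /* rho * 8, floored at 8 (rho >= 1), mirroring hybla_recalc_param(). */
    static unsigned rho_3ls(unsigned srtt_ms, unsigned rtt0_ms)
    {
            unsigned r = (srtt_ms << 3) / rtt0_ms;
            return r > 8 ? r : 8;
    }

    int main(void)
    {
            /* a 300 ms satellite path against a 25 ms reference */
            unsigned r = rho_3ls(300, 25);
            printf("rho_3ls = %u, rho = %u\n", r, r >> 3);
            return 0;
    }
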
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53a8a5399f1e..29222b964951 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1;
114/* Adapt the MSS value used to make delayed ack decision to the 114/* Adapt the MSS value used to make delayed ack decision to the
115 * real world. 115 * real world.
116 */ 116 */
117static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, 117static inline void tcp_measure_rcv_mss(struct sock *sk,
118 struct sk_buff *skb) 118 const struct sk_buff *skb)
119{ 119{
120 unsigned int len, lss; 120 struct inet_connection_sock *icsk = inet_csk(sk);
121 const unsigned int lss = icsk->icsk_ack.last_seg_size;
122 unsigned int len;
121 123
122 lss = tp->ack.last_seg_size; 124 icsk->icsk_ack.last_seg_size = 0;
123 tp->ack.last_seg_size = 0;
124 125
125 /* skb->len may jitter because of SACKs, even if peer 126 /* skb->len may jitter because of SACKs, even if peer
126 * sends good full-sized frames. 127 * sends good full-sized frames.
127 */ 128 */
128 len = skb->len; 129 len = skb->len;
129 if (len >= tp->ack.rcv_mss) { 130 if (len >= icsk->icsk_ack.rcv_mss) {
130 tp->ack.rcv_mss = len; 131 icsk->icsk_ack.rcv_mss = len;
131 } else { 132 } else {
132 /* Otherwise, we make more careful check taking into account, 133 /* Otherwise, we make more careful check taking into account,
133 * that SACKs block is variable. 134 * that SACKs block is variable.
@@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
147 * tcp header plus fixed timestamp option length. 148 * tcp header plus fixed timestamp option length.
148 * Resulting "len" is MSS free of SACK jitter. 149 * Resulting "len" is MSS free of SACK jitter.
149 */ 150 */
150 len -= tp->tcp_header_len; 151 len -= tcp_sk(sk)->tcp_header_len;
151 tp->ack.last_seg_size = len; 152 icsk->icsk_ack.last_seg_size = len;
152 if (len == lss) { 153 if (len == lss) {
153 tp->ack.rcv_mss = len; 154 icsk->icsk_ack.rcv_mss = len;
154 return; 155 return;
155 } 156 }
156 } 157 }
157 tp->ack.pending |= TCP_ACK_PUSHED; 158 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
158 } 159 }
159} 160}
160 161
161static void tcp_incr_quickack(struct tcp_sock *tp) 162static void tcp_incr_quickack(struct sock *sk)
162{ 163{
163 unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); 164 struct inet_connection_sock *icsk = inet_csk(sk);
165 unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
164 166
165 if (quickacks==0) 167 if (quickacks==0)
166 quickacks=2; 168 quickacks=2;
167 if (quickacks > tp->ack.quick) 169 if (quickacks > icsk->icsk_ack.quick)
168 tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 170 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
169} 171}
170 172
171void tcp_enter_quickack_mode(struct tcp_sock *tp) 173void tcp_enter_quickack_mode(struct sock *sk)
172{ 174{
173 tcp_incr_quickack(tp); 175 struct inet_connection_sock *icsk = inet_csk(sk);
174 tp->ack.pingpong = 0; 176 tcp_incr_quickack(sk);
175 tp->ack.ato = TCP_ATO_MIN; 177 icsk->icsk_ack.pingpong = 0;
178 icsk->icsk_ack.ato = TCP_ATO_MIN;
176} 179}
177 180
178/* Send ACKs quickly, if "quick" count is not exhausted 181/* Send ACKs quickly, if "quick" count is not exhausted
179 * and the session is not interactive. 182 * and the session is not interactive.
180 */ 183 */
181 184
182static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) 185static inline int tcp_in_quickack_mode(const struct sock *sk)
183{ 186{
184 return (tp->ack.quick && !tp->ack.pingpong); 187 const struct inet_connection_sock *icsk = inet_csk(sk);
188 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
185} 189}
186 190
187/* Buffer size and advertised window tuning. 191/* Buffer size and advertised window tuning.
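
tcp_incr_quickack(), just converted above, sizes the quick-ACK credit as
half a receive window's worth of MSS-sized segments, floored at 2 and
capped at TCP_MAX_QUICKACKS; while the credit lasts (and pingpong is off,
per tcp_in_quickack_mode()) ACKs go out without delay. A stand-alone
restatement, with TCP_MAX_QUICKACKS assumed to be 16, its conventional
value:

    #include <stdio.h>

    #define TCP_MAX_QUICKACKS 16    /* assumed */

    static unsigned quickack_credit(unsigned rcv_wnd, unsigned rcv_mss)
    {
            unsigned q = rcv_wnd / (2 * rcv_mss);

            if (q == 0)
                    q = 2;
            return q < TCP_MAX_QUICKACKS ? q : TCP_MAX_QUICKACKS;
    }

    int main(void)
    {
            /* 64 KiB window, 1460-byte segments: 22, capped at 16 */
            printf("%u\n", quickack_credit(65536, 1460));
            return 0;
    }
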
@@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk)
224 */ 228 */
225 229
226/* Slow part of check#2. */ 230/* Slow part of check#2. */
227static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, 231static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
228 struct sk_buff *skb) 232 const struct sk_buff *skb)
229{ 233{
230 /* Optimize this! */ 234 /* Optimize this! */
231 int truesize = tcp_win_from_space(skb->truesize)/2; 235 int truesize = tcp_win_from_space(skb->truesize)/2;
@@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
233 237
234 while (tp->rcv_ssthresh <= window) { 238 while (tp->rcv_ssthresh <= window) {
235 if (truesize <= skb->len) 239 if (truesize <= skb->len)
236 return 2*tp->ack.rcv_mss; 240 return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
237 241
238 truesize >>= 1; 242 truesize >>= 1;
239 window >>= 1; 243 window >>= 1;
@@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
260 264
261 if (incr) { 265 if (incr) {
262 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); 266 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
263 tp->ack.quick |= 1; 267 inet_csk(sk)->icsk_ack.quick |= 1;
264 } 268 }
265 } 269 }
266} 270}
@@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk)
321/* 5. Recalculate window clamp after socket hit its memory bounds. */ 325/* 5. Recalculate window clamp after socket hit its memory bounds. */
322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 326static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
323{ 327{
328 struct inet_connection_sock *icsk = inet_csk(sk);
324 struct sk_buff *skb; 329 struct sk_buff *skb;
325 unsigned int app_win = tp->rcv_nxt - tp->copied_seq; 330 unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
326 int ofo_win = 0; 331 int ofo_win = 0;
327 332
328 tp->ack.quick = 0; 333 icsk->icsk_ack.quick = 0;
329 334
330 skb_queue_walk(&tp->out_of_order_queue, skb) { 335 skb_queue_walk(&tp->out_of_order_queue, skb) {
331 ofo_win += skb->len; 336 ofo_win += skb->len;
@@ -346,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
346 app_win += ofo_win; 351 app_win += ofo_win;
347 if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) 352 if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
348 app_win >>= 1; 353 app_win >>= 1;
349 if (app_win > tp->ack.rcv_mss) 354 if (app_win > icsk->icsk_ack.rcv_mss)
350 app_win -= tp->ack.rcv_mss; 355 app_win -= icsk->icsk_ack.rcv_mss;
351 app_win = max(app_win, 2U*tp->advmss); 356 app_win = max(app_win, 2U*tp->advmss);
352 357
353 if (!ofo_win) 358 if (!ofo_win)
@@ -415,11 +420,12 @@ new_measure:
415 tp->rcv_rtt_est.time = tcp_time_stamp; 420 tp->rcv_rtt_est.time = tcp_time_stamp;
416} 421}
417 422
418static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) 423static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
419{ 424{
425 struct tcp_sock *tp = tcp_sk(sk);
420 if (tp->rx_opt.rcv_tsecr && 426 if (tp->rx_opt.rcv_tsecr &&
421 (TCP_SKB_CB(skb)->end_seq - 427 (TCP_SKB_CB(skb)->end_seq -
422 TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) 428 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
423 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); 429 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
424} 430}
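
tcp_rcv_rtt_measure_ts() accepts a timestamp-echo RTT sample only when the segment carries at least rcv_mss bytes, so pure ACKs and runt segments do not skew the receiver-side estimate. A hedged sketch of that guard, with a hypothetical flattened context struct standing in for tcp_sock/icsk.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical, flattened view of the fields the kernel consults. */
struct rtt_ctx {
	uint32_t rcv_tsecr; /* echoed timestamp from the peer */
	uint32_t rcv_mss;   /* estimated peer MSS             */
};

/* Returns an RTT sample in ticks, or -1 when the segment must be
 * ignored (no echoed timestamp, or payload smaller than one MSS).
 */
static long rcv_rtt_sample(const struct rtt_ctx *c, uint32_t now,
			   uint32_t seq, uint32_t end_seq)
{
	if (c->rcv_tsecr && end_seq - seq >= c->rcv_mss)
		return (long)(now - c->rcv_tsecr);
	return -1;
}

int main(void)
{
	struct rtt_ctx c = { .rcv_tsecr = 1000, .rcv_mss = 1448 };

	printf("full segment: %ld\n", rcv_rtt_sample(&c, 1042, 1, 1449));
	printf("pure ack:     %ld\n", rcv_rtt_sample(&c, 1042, 1, 1));
	return 0;
}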
425 431
@@ -492,41 +498,42 @@ new_measure:
492 */ 498 */
493static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) 499static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
494{ 500{
501 struct inet_connection_sock *icsk = inet_csk(sk);
495 u32 now; 502 u32 now;
496 503
497 tcp_schedule_ack(tp); 504 inet_csk_schedule_ack(sk);
498 505
499 tcp_measure_rcv_mss(tp, skb); 506 tcp_measure_rcv_mss(sk, skb);
500 507
501 tcp_rcv_rtt_measure(tp); 508 tcp_rcv_rtt_measure(tp);
502 509
503 now = tcp_time_stamp; 510 now = tcp_time_stamp;
504 511
505 if (!tp->ack.ato) { 512 if (!icsk->icsk_ack.ato) {
506 /* The _first_ data packet received, initialize 513 /* The _first_ data packet received, initialize
507 * delayed ACK engine. 514 * delayed ACK engine.
508 */ 515 */
509 tcp_incr_quickack(tp); 516 tcp_incr_quickack(sk);
510 tp->ack.ato = TCP_ATO_MIN; 517 icsk->icsk_ack.ato = TCP_ATO_MIN;
511 } else { 518 } else {
512 int m = now - tp->ack.lrcvtime; 519 int m = now - icsk->icsk_ack.lrcvtime;
513 520
514 if (m <= TCP_ATO_MIN/2) { 521 if (m <= TCP_ATO_MIN/2) {
515 /* The fastest case is the first. */ 522 /* The fastest case is the first. */
516 tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; 523 icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
517 } else if (m < tp->ack.ato) { 524 } else if (m < icsk->icsk_ack.ato) {
518 tp->ack.ato = (tp->ack.ato>>1) + m; 525 icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
519 if (tp->ack.ato > tp->rto) 526 if (icsk->icsk_ack.ato > icsk->icsk_rto)
520 tp->ack.ato = tp->rto; 527 icsk->icsk_ack.ato = icsk->icsk_rto;
521 } else if (m > tp->rto) { 528 } else if (m > icsk->icsk_rto) {
 522 /* Too long gap. Apparently sender failed to 529 /* Too long gap. Apparently sender failed to
523 * restart window, so that we send ACKs quickly. 530 * restart window, so that we send ACKs quickly.
524 */ 531 */
525 tcp_incr_quickack(tp); 532 tcp_incr_quickack(sk);
526 sk_stream_mem_reclaim(sk); 533 sk_stream_mem_reclaim(sk);
527 } 534 }
528 } 535 }
529 tp->ack.lrcvtime = now; 536 icsk->icsk_ack.lrcvtime = now;
530 537
531 TCP_ECN_check_ce(tp, skb); 538 TCP_ECN_check_ce(tp, skb);
532 539
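
The else-branch above adapts the delayed-ACK timeout (ato) to the observed inter-arrival gap m with a shift-based moving average, decaying toward TCP_ATO_MIN for fast senders and never exceeding the RTO. A standalone sketch of just that update; the 40-tick ATO_MIN is an assumed stand-in for the kernel's HZ/25.

#include <stdio.h>

#define ATO_MIN 40 /* assumed ~40ms in ticks, like the kernel's HZ/25 */

/* One step of the ato adaptation from tcp_event_data_recv():
 * m is the gap since the previous data arrival, rto the current RTO.
 * A gap between ato and rto leaves ato unchanged; a gap beyond the
 * RTO is handled elsewhere (the kernel re-enters quickack mode).
 */
static unsigned ato_update(unsigned ato, unsigned m, unsigned rto)
{
	if (m <= ATO_MIN / 2) {
		ato = (ato >> 1) + ATO_MIN / 2;    /* fast sender: decay */
	} else if (m < ato) {
		ato = (ato >> 1) + m;              /* track shorter gaps */
		if (ato > rto)
			ato = rto;                 /* never beyond RTO   */
	}
	return ato;
}

int main(void)
{
	unsigned ato = 200, rto = 300;
	unsigned gaps[] = { 10, 10, 150, 10 };

	for (unsigned i = 0; i < 4; i++) {
		ato = ato_update(ato, gaps[i], rto);
		printf("gap=%u -> ato=%u\n", gaps[i], ato);
	}
	return 0;
}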
@@ -543,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
543 * To save cycles in the RFC 1323 implementation it was better to break 550 * To save cycles in the RFC 1323 implementation it was better to break
544 * it up into three procedures. -- erics 551 * it up into three procedures. -- erics
545 */ 552 */
546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) 553static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
547{ 554{
555 struct tcp_sock *tp = tcp_sk(sk);
556 const struct inet_connection_sock *icsk = inet_csk(sk);
548 long m = mrtt; /* RTT */ 557 long m = mrtt; /* RTT */
549 558
550 /* The following amusing code comes from Jacobson's 559 /* The following amusing code comes from Jacobson's
@@ -604,15 +613,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
604 tp->rtt_seq = tp->snd_nxt; 613 tp->rtt_seq = tp->snd_nxt;
605 } 614 }
606 615
607 if (tp->ca_ops->rtt_sample) 616 if (icsk->icsk_ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt); 617 icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
609} 618}
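
tcp_rtt_estimator()'s body is elided by the hunk above; what it implements is the classic Jacobson/Karels smoothing in fixed point (srtt stored <<3, mdev <<2, so shifts give the 1/8 and 1/4 gains). The sketch below shows only that well-known core and omits the kernel's extra damping of shrinking deviations and its mdev_max/rttvar tracking, so treat it as an approximation, not the kernel routine.

#include <stdio.h>

struct rtt_est {
	long srtt; /* smoothed RTT, <<3    */
	long mdev; /* mean deviation, <<2  */
};

static void rtt_update(struct rtt_est *e, long m /* measured RTT */)
{
	if (e->srtt == 0) {             /* first sample seeds both */
		e->srtt = m << 3;
		e->mdev = m << 1;
		return;
	}
	m -= (e->srtt >> 3);            /* m is now the error      */
	e->srtt += m;                   /* srtt += error / 8       */
	if (m < 0)
		m = -m;
	m -= (e->mdev >> 2);
	e->mdev += m;                   /* mdev += |error| delta / 4 */
}

int main(void)
{
	struct rtt_est e = { 0, 0 };
	long samples[] = { 100, 120, 80, 110 };

	for (int i = 0; i < 4; i++) {
		rtt_update(&e, samples[i]);
		printf("m=%ld srtt=%ld mdev=%ld\n",
		       samples[i], e.srtt >> 3, e.mdev >> 2);
	}
	return 0;
}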
610 619
611/* Calculate rto without backoff. This is the second half of Van Jacobson's 620/* Calculate rto without backoff. This is the second half of Van Jacobson's
612 * routine referred to above. 621 * routine referred to above.
613 */ 622 */
614static inline void tcp_set_rto(struct tcp_sock *tp) 623static inline void tcp_set_rto(struct sock *sk)
615{ 624{
625 const struct tcp_sock *tp = tcp_sk(sk);
616 /* Old crap is replaced with new one. 8) 626 /* Old crap is replaced with new one. 8)
617 * 627 *
618 * More seriously: 628 * More seriously:
@@ -623,7 +633,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
623 * is invisible. Actually, Linux-2.4 also generates erratic 633 * is invisible. Actually, Linux-2.4 also generates erratic
 624 * ACKs in some circumstances. 634 * ACKs in some circumstances.
625 */ 635 */
626 tp->rto = (tp->srtt >> 3) + tp->rttvar; 636 inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
627 637
628 /* 2. Fixups made earlier cannot be right. 638 /* 2. Fixups made earlier cannot be right.
629 * If we do not estimate RTO correctly without them, 639 * If we do not estimate RTO correctly without them,
@@ -635,10 +645,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
635/* NOTE: clamping at TCP_RTO_MIN is not required, current algo 645/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
636 * guarantees that rto is higher. 646 * guarantees that rto is higher.
637 */ 647 */
638static inline void tcp_bound_rto(struct tcp_sock *tp) 648static inline void tcp_bound_rto(struct sock *sk)
639{ 649{
640 if (tp->rto > TCP_RTO_MAX) 650 if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
641 tp->rto = TCP_RTO_MAX; 651 inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
642} 652}
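
tcp_set_rto()/tcp_bound_rto() now write icsk_rto, but the formula is unchanged: RTO = srtt/8 + rttvar, capped at TCP_RTO_MAX (no lower clamp is needed because, per the comment above, the estimator already keeps the result above TCP_RTO_MIN). A sketch in the same fixed-point convention; the 120-second cap is an assumption matching the usual kernel value.

#include <stdio.h>

#define TCP_RTO_MAX (120 * 1000) /* assumed 120s at 1000 ticks/s */

/* srtt is kept <<3, rttvar in plain ticks, as in the sketch above. */
static unsigned compute_rto(unsigned srtt_x8, unsigned rttvar)
{
	unsigned rto = (srtt_x8 >> 3) + rttvar;

	return rto > TCP_RTO_MAX ? TCP_RTO_MAX : rto;
}

int main(void)
{
	printf("srtt=100 rttvar=50 -> rto=%u\n", compute_rto(100 << 3, 50));
	return 0;
}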
643 653
644/* Save metrics learned by this TCP session. 654/* Save metrics learned by this TCP session.
@@ -656,9 +666,10 @@ void tcp_update_metrics(struct sock *sk)
656 dst_confirm(dst); 666 dst_confirm(dst);
657 667
658 if (dst && (dst->flags&DST_HOST)) { 668 if (dst && (dst->flags&DST_HOST)) {
669 const struct inet_connection_sock *icsk = inet_csk(sk);
659 int m; 670 int m;
660 671
661 if (tp->backoff || !tp->srtt) { 672 if (icsk->icsk_backoff || !tp->srtt) {
662 /* This session failed to estimate rtt. Why? 673 /* This session failed to estimate rtt. Why?
663 * Probably, no packets returned in time. 674 * Probably, no packets returned in time.
664 * Reset our results. 675 * Reset our results.
@@ -707,7 +718,7 @@ void tcp_update_metrics(struct sock *sk)
707 tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) 718 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
708 dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; 719 dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
709 } else if (tp->snd_cwnd > tp->snd_ssthresh && 720 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
710 tp->ca_state == TCP_CA_Open) { 721 icsk->icsk_ca_state == TCP_CA_Open) {
711 /* Cong. avoidance phase, cwnd is reliable. */ 722 /* Cong. avoidance phase, cwnd is reliable. */
712 if (!dst_metric_locked(dst, RTAX_SSTHRESH)) 723 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
713 dst->metrics[RTAX_SSTHRESH-1] = 724 dst->metrics[RTAX_SSTHRESH-1] =
@@ -801,9 +812,9 @@ static void tcp_init_metrics(struct sock *sk)
801 tp->mdev = dst_metric(dst, RTAX_RTTVAR); 812 tp->mdev = dst_metric(dst, RTAX_RTTVAR);
802 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); 813 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
803 } 814 }
804 tcp_set_rto(tp); 815 tcp_set_rto(sk);
805 tcp_bound_rto(tp); 816 tcp_bound_rto(sk);
806 if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) 817 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
807 goto reset; 818 goto reset;
808 tp->snd_cwnd = tcp_init_cwnd(tp, dst); 819 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
809 tp->snd_cwnd_stamp = tcp_time_stamp; 820 tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -817,12 +828,14 @@ reset:
817 if (!tp->rx_opt.saw_tstamp && tp->srtt) { 828 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
818 tp->srtt = 0; 829 tp->srtt = 0;
819 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; 830 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
820 tp->rto = TCP_TIMEOUT_INIT; 831 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
821 } 832 }
822} 833}
823 834
824static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) 835static void tcp_update_reordering(struct sock *sk, const int metric,
836 const int ts)
825{ 837{
838 struct tcp_sock *tp = tcp_sk(sk);
826 if (metric > tp->reordering) { 839 if (metric > tp->reordering) {
827 tp->reordering = min(TCP_MAX_REORDERING, metric); 840 tp->reordering = min(TCP_MAX_REORDERING, metric);
828 841
@@ -837,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
837 NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); 850 NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
838#if FASTRETRANS_DEBUG > 1 851#if FASTRETRANS_DEBUG > 1
839 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", 852 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
840 tp->rx_opt.sack_ok, tp->ca_state, 853 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
841 tp->reordering, 854 tp->reordering,
842 tp->fackets_out, 855 tp->fackets_out,
843 tp->sacked_out, 856 tp->sacked_out,
@@ -899,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
899static int 912static int
900tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) 913tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
901{ 914{
915 const struct inet_connection_sock *icsk = inet_csk(sk);
902 struct tcp_sock *tp = tcp_sk(sk); 916 struct tcp_sock *tp = tcp_sk(sk);
903 unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; 917 unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
904 struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); 918 struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
@@ -909,14 +923,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
909 int flag = 0; 923 int flag = 0;
910 int i; 924 int i;
911 925
912 /* So, SACKs for already sent large segments will be lost.
913 * Not good, but alternative is to resegment the queue. */
914 if (sk->sk_route_caps & NETIF_F_TSO) {
915 sk->sk_route_caps &= ~NETIF_F_TSO;
916 sock_set_flag(sk, SOCK_NO_LARGESEND);
917 tp->mss_cache = tp->mss_cache;
918 }
919
920 if (!tp->sacked_out) 926 if (!tp->sacked_out)
921 tp->fackets_out = 0; 927 tp->fackets_out = 0;
922 prior_fackets = tp->fackets_out; 928 prior_fackets = tp->fackets_out;
@@ -964,20 +970,40 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
964 flag |= FLAG_DATA_LOST; 970 flag |= FLAG_DATA_LOST;
965 971
966 sk_stream_for_retrans_queue(skb, sk) { 972 sk_stream_for_retrans_queue(skb, sk) {
967 u8 sacked = TCP_SKB_CB(skb)->sacked; 973 int in_sack, pcount;
968 int in_sack; 974 u8 sacked;
969 975
970 /* The retransmission queue is always in order, so 976 /* The retransmission queue is always in order, so
971 * we can short-circuit the walk early. 977 * we can short-circuit the walk early.
972 */ 978 */
973 if(!before(TCP_SKB_CB(skb)->seq, end_seq)) 979 if (!before(TCP_SKB_CB(skb)->seq, end_seq))
974 break; 980 break;
975 981
976 fack_count += tcp_skb_pcount(skb); 982 pcount = tcp_skb_pcount(skb);
983
984 if (pcount > 1 &&
985 (after(start_seq, TCP_SKB_CB(skb)->seq) ||
986 before(end_seq, TCP_SKB_CB(skb)->end_seq))) {
987 unsigned int pkt_len;
988
989 if (after(start_seq, TCP_SKB_CB(skb)->seq))
990 pkt_len = (start_seq -
991 TCP_SKB_CB(skb)->seq);
992 else
993 pkt_len = (end_seq -
994 TCP_SKB_CB(skb)->seq);
995 if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
996 break;
997 pcount = tcp_skb_pcount(skb);
998 }
999
1000 fack_count += pcount;
977 1001
978 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && 1002 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
979 !before(end_seq, TCP_SKB_CB(skb)->end_seq); 1003 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
980 1004
1005 sacked = TCP_SKB_CB(skb)->sacked;
1006
981 /* Account D-SACK for retransmitted packet. */ 1007 /* Account D-SACK for retransmitted packet. */
982 if ((dup_sack && in_sack) && 1008 if ((dup_sack && in_sack) &&
983 (sacked & TCPCB_RETRANS) && 1009 (sacked & TCPCB_RETRANS) &&
@@ -1064,7 +1090,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1064 * we have to account for reordering! Ugly, 1090 * we have to account for reordering! Ugly,
1065 * but should help. 1091 * but should help.
1066 */ 1092 */
1067 if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { 1093 if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
1068 struct sk_buff *skb; 1094 struct sk_buff *skb;
1069 1095
1070 sk_stream_for_retrans_queue(skb, sk) { 1096 sk_stream_for_retrans_queue(skb, sk) {
@@ -1093,8 +1119,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1093 1119
1094 tp->left_out = tp->sacked_out + tp->lost_out; 1120 tp->left_out = tp->sacked_out + tp->lost_out;
1095 1121
1096 if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) 1122 if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
1097 tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); 1123 tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
1098 1124
1099#if FASTRETRANS_DEBUG > 0 1125#if FASTRETRANS_DEBUG > 0
1100 BUG_TRAP((int)tp->sacked_out >= 0); 1126 BUG_TRAP((int)tp->sacked_out >= 0);
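
The new block added to the tag loop in the previous hunk replaces the old blanket TSO-disable: when a SACK edge falls inside a multi-segment skb (pcount > 1), the skb is split at that edge with tcp_fragment() so per-segment tagging stays exact. Below is a sketch of the boundary arithmetic alone, with an explicit overlap guard that the kernel leaves implicit in the surrounding loop; the names are illustrative, not kernel API.

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe sequence comparisons, as in the kernel's before()/after(). */
static int before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int after(uint32_t a, uint32_t b)  { return before(b, a); }

/* Given an skb covering [seq, end_seq) and a SACK block [start, end),
 * return the byte offset at which the skb should be split so the SACK
 * edge lands on an skb boundary, or 0 when no split is needed.
 */
static uint32_t sack_split_len(uint32_t seq, uint32_t end_seq,
			       uint32_t start, uint32_t end)
{
	if (!before(start, end_seq) || !after(end, seq))
		return 0;                /* no overlap: nothing to split */
	if (after(start, seq))
		return start - seq;      /* SACK begins inside the skb */
	if (before(end, end_seq))
		return end - seq;        /* SACK ends inside the skb   */
	return 0;                        /* skb fully covered          */
}

int main(void)
{
	/* skb covers 1000..4000; SACK block covers 2000..5000. */
	printf("split at %u bytes\n",
	       (unsigned)sack_split_len(1000, 4000, 2000, 5000));
	return 0;
}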
@@ -1111,17 +1137,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1111 */ 1137 */
1112void tcp_enter_frto(struct sock *sk) 1138void tcp_enter_frto(struct sock *sk)
1113{ 1139{
1140 const struct inet_connection_sock *icsk = inet_csk(sk);
1114 struct tcp_sock *tp = tcp_sk(sk); 1141 struct tcp_sock *tp = tcp_sk(sk);
1115 struct sk_buff *skb; 1142 struct sk_buff *skb;
1116 1143
1117 tp->frto_counter = 1; 1144 tp->frto_counter = 1;
1118 1145
1119 if (tp->ca_state <= TCP_CA_Disorder || 1146 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1120 tp->snd_una == tp->high_seq || 1147 tp->snd_una == tp->high_seq ||
1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1148 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1122 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1149 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); 1150 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1124 tcp_ca_event(tp, CA_EVENT_FRTO); 1151 tcp_ca_event(sk, CA_EVENT_FRTO);
1125 } 1152 }
1126 1153
1127 /* Have to clear retransmission markers here to keep the bookkeeping 1154 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1138,7 +1165,7 @@ void tcp_enter_frto(struct sock *sk)
1138 } 1165 }
1139 tcp_sync_left_out(tp); 1166 tcp_sync_left_out(tp);
1140 1167
1141 tcp_set_ca_state(tp, TCP_CA_Open); 1168 tcp_set_ca_state(sk, TCP_CA_Open);
1142 tp->frto_highmark = tp->snd_nxt; 1169 tp->frto_highmark = tp->snd_nxt;
1143} 1170}
1144 1171
@@ -1184,7 +1211,7 @@ static void tcp_enter_frto_loss(struct sock *sk)
1184 1211
1185 tp->reordering = min_t(unsigned int, tp->reordering, 1212 tp->reordering = min_t(unsigned int, tp->reordering,
1186 sysctl_tcp_reordering); 1213 sysctl_tcp_reordering);
1187 tcp_set_ca_state(tp, TCP_CA_Loss); 1214 tcp_set_ca_state(sk, TCP_CA_Loss);
1188 tp->high_seq = tp->frto_highmark; 1215 tp->high_seq = tp->frto_highmark;
1189 TCP_ECN_queue_cwr(tp); 1216 TCP_ECN_queue_cwr(tp);
1190} 1217}
@@ -1208,16 +1235,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
1208 */ 1235 */
1209void tcp_enter_loss(struct sock *sk, int how) 1236void tcp_enter_loss(struct sock *sk, int how)
1210{ 1237{
1238 const struct inet_connection_sock *icsk = inet_csk(sk);
1211 struct tcp_sock *tp = tcp_sk(sk); 1239 struct tcp_sock *tp = tcp_sk(sk);
1212 struct sk_buff *skb; 1240 struct sk_buff *skb;
1213 int cnt = 0; 1241 int cnt = 0;
1214 1242
1215 /* Reduce ssthresh if it has not yet been made inside this window. */ 1243 /* Reduce ssthresh if it has not yet been made inside this window. */
1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1244 if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1245 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1218 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1246 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); 1247 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1220 tcp_ca_event(tp, CA_EVENT_LOSS); 1248 tcp_ca_event(sk, CA_EVENT_LOSS);
1221 } 1249 }
1222 tp->snd_cwnd = 1; 1250 tp->snd_cwnd = 1;
1223 tp->snd_cwnd_cnt = 0; 1251 tp->snd_cwnd_cnt = 0;
@@ -1248,12 +1276,12 @@ void tcp_enter_loss(struct sock *sk, int how)
1248 1276
1249 tp->reordering = min_t(unsigned int, tp->reordering, 1277 tp->reordering = min_t(unsigned int, tp->reordering,
1250 sysctl_tcp_reordering); 1278 sysctl_tcp_reordering);
1251 tcp_set_ca_state(tp, TCP_CA_Loss); 1279 tcp_set_ca_state(sk, TCP_CA_Loss);
1252 tp->high_seq = tp->snd_nxt; 1280 tp->high_seq = tp->snd_nxt;
1253 TCP_ECN_queue_cwr(tp); 1281 TCP_ECN_queue_cwr(tp);
1254} 1282}
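
tcp_enter_loss() shows the same conversion pattern: ca_state, the retransmit counter, and the ssthresh hook move behind icsk, while the Loss-state side effects stay as they were. A toy sketch of those side effects; the max(cwnd/2, 2) ssthresh rule is an assumption standing in for the congestion-control module's ssthresh() op, and the state test is collapsed from the kernel's three-way condition.

#include <stdio.h>

struct toy_conn {
	unsigned cwnd, ssthresh, prior_ssthresh;
	unsigned retransmits;
	int state; /* 0 = Open, 3 = Loss (illustrative numbering) */
};

/* Entering loss: save ssthresh if it has not already been reduced
 * in this window, collapse cwnd to 1, move to the Loss state.
 */
static void enter_loss(struct toy_conn *c)
{
	if (c->state == 0 || !c->retransmits) {
		c->prior_ssthresh = c->ssthresh;
		c->ssthresh = c->cwnd / 2 > 2 ? c->cwnd / 2 : 2;
	}
	c->cwnd = 1;
	c->state = 3;
}

int main(void)
{
	struct toy_conn c = { .cwnd = 20, .ssthresh = 64, .state = 0 };

	enter_loss(&c);
	printf("cwnd=%u ssthresh=%u state=%d\n", c.cwnd, c.ssthresh, c.state);
	return 0;
}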
1255 1283
1256static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) 1284static int tcp_check_sack_reneging(struct sock *sk)
1257{ 1285{
1258 struct sk_buff *skb; 1286 struct sk_buff *skb;
1259 1287
@@ -1265,12 +1293,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
1265 */ 1293 */
1266 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && 1294 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
1267 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { 1295 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1296 struct inet_connection_sock *icsk = inet_csk(sk);
1268 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); 1297 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
1269 1298
1270 tcp_enter_loss(sk, 1); 1299 tcp_enter_loss(sk, 1);
1271 tp->retransmits++; 1300 icsk->icsk_retransmits++;
1272 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); 1301 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
1273 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 1302 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1303 icsk->icsk_rto, TCP_RTO_MAX);
1274 return 1; 1304 return 1;
1275 } 1305 }
1276 return 0; 1306 return 0;
@@ -1281,15 +1311,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
1281 return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; 1311 return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
1282} 1312}
1283 1313
1284static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) 1314static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
1285{ 1315{
1286 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); 1316 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
1287} 1317}
1288 1318
1289static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) 1319static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
1290{ 1320{
1291 return tp->packets_out && 1321 return tp->packets_out &&
1292 tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); 1322 tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
1293} 1323}
1294 1324
1295/* Linux NewReno/SACK/FACK/ECN state machine. 1325/* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1423,8 +1453,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
 1423 * on the assumption of no reordering, interpret this as reordering. 1453 * on the assumption of no reordering, interpret this as reordering.
 1424 * The only other reason could be a bug in the receiver's TCP. 1454 * The only other reason could be a bug in the receiver's TCP.
1425 */ 1455 */
1426static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) 1456static void tcp_check_reno_reordering(struct sock *sk, const int addend)
1427{ 1457{
1458 struct tcp_sock *tp = tcp_sk(sk);
1428 u32 holes; 1459 u32 holes;
1429 1460
1430 holes = max(tp->lost_out, 1U); 1461 holes = max(tp->lost_out, 1U);
@@ -1432,16 +1463,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
1432 1463
1433 if ((tp->sacked_out + holes) > tp->packets_out) { 1464 if ((tp->sacked_out + holes) > tp->packets_out) {
1434 tp->sacked_out = tp->packets_out - holes; 1465 tp->sacked_out = tp->packets_out - holes;
1435 tcp_update_reordering(tp, tp->packets_out+addend, 0); 1466 tcp_update_reordering(sk, tp->packets_out + addend, 0);
1436 } 1467 }
1437} 1468}
1438 1469
1439/* Emulate SACKs for SACKless connection: account for a new dupack. */ 1470/* Emulate SACKs for SACKless connection: account for a new dupack. */
1440 1471
1441static void tcp_add_reno_sack(struct tcp_sock *tp) 1472static void tcp_add_reno_sack(struct sock *sk)
1442{ 1473{
1474 struct tcp_sock *tp = tcp_sk(sk);
1443 tp->sacked_out++; 1475 tp->sacked_out++;
1444 tcp_check_reno_reordering(tp, 0); 1476 tcp_check_reno_reordering(sk, 0);
1445 tcp_sync_left_out(tp); 1477 tcp_sync_left_out(tp);
1446} 1478}
1447 1479
@@ -1456,7 +1488,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke
1456 else 1488 else
1457 tp->sacked_out -= acked-1; 1489 tp->sacked_out -= acked-1;
1458 } 1490 }
1459 tcp_check_reno_reordering(tp, acked); 1491 tcp_check_reno_reordering(sk, acked);
1460 tcp_sync_left_out(tp); 1492 tcp_sync_left_out(tp);
1461} 1493}
1462 1494
@@ -1509,7 +1541,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
1509 struct sk_buff *skb; 1541 struct sk_buff *skb;
1510 1542
1511 sk_stream_for_retrans_queue(skb, sk) { 1543 sk_stream_for_retrans_queue(skb, sk) {
1512 if (tcp_skb_timedout(tp, skb) && 1544 if (tcp_skb_timedout(sk, skb) &&
1513 !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { 1545 !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1514 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1546 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1515 tp->lost_out += tcp_skb_pcount(skb); 1547 tp->lost_out += tcp_skb_pcount(skb);
@@ -1530,14 +1562,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1530} 1562}
1531 1563
1532/* Decrease cwnd each second ack. */ 1564/* Decrease cwnd each second ack. */
1533static void tcp_cwnd_down(struct tcp_sock *tp) 1565static void tcp_cwnd_down(struct sock *sk)
1534{ 1566{
1567 const struct inet_connection_sock *icsk = inet_csk(sk);
1568 struct tcp_sock *tp = tcp_sk(sk);
1535 int decr = tp->snd_cwnd_cnt + 1; 1569 int decr = tp->snd_cwnd_cnt + 1;
1536 1570
1537 tp->snd_cwnd_cnt = decr&1; 1571 tp->snd_cwnd_cnt = decr&1;
1538 decr >>= 1; 1572 decr >>= 1;
1539 1573
1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) 1574 if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
1541 tp->snd_cwnd -= decr; 1575 tp->snd_cwnd -= decr;
1542 1576
1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1577 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
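
tcp_cwnd_down() shaves one segment off cwnd on every second ACK during CWR, floored at the CC module's min_cwnd and capped at packets in flight plus one. A sketch with min_cwnd passed in as a plain number rather than the icsk_ca_ops->min_cwnd(sk) hook.

#include <stdio.h>

/* One tcp_cwnd_down() step: snd_cwnd_cnt counts ACKs; every second
 * ACK shaves one segment off cwnd, floored at min_cwnd and capped
 * at in_flight + 1 so we do not hold a window we cannot use.
 */
static void cwnd_down(unsigned *cwnd, unsigned *cwnd_cnt,
		      unsigned min_cwnd, unsigned in_flight)
{
	unsigned decr = *cwnd_cnt + 1;

	*cwnd_cnt = decr & 1;
	decr >>= 1;

	if (decr && *cwnd > min_cwnd)
		*cwnd -= decr;

	if (*cwnd > in_flight + 1)
		*cwnd = in_flight + 1;
}

int main(void)
{
	unsigned cwnd = 10, cnt = 0;

	for (int ack = 0; ack < 4; ack++) {
		cwnd_down(&cwnd, &cnt, 4, 20);
		printf("ack %d -> cwnd=%u\n", ack + 1, cwnd);
	}
	return 0;
}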
@@ -1571,11 +1605,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1571#define DBGUNDO(x...) do { } while (0) 1605#define DBGUNDO(x...) do { } while (0)
1572#endif 1606#endif
1573 1607
1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1608static void tcp_undo_cwr(struct sock *sk, const int undo)
1575{ 1609{
1610 struct tcp_sock *tp = tcp_sk(sk);
1611
1576 if (tp->prior_ssthresh) { 1612 if (tp->prior_ssthresh) {
1577 if (tp->ca_ops->undo_cwnd) 1613 const struct inet_connection_sock *icsk = inet_csk(sk);
1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); 1614
1615 if (icsk->icsk_ca_ops->undo_cwnd)
1616 tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
1579 else 1617 else
1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1618 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1581 1619
@@ -1603,9 +1641,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
1603 /* Happy end! We did not retransmit anything 1641 /* Happy end! We did not retransmit anything
1604 * or our original transmission succeeded. 1642 * or our original transmission succeeded.
1605 */ 1643 */
1606 DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); 1644 DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
1607 tcp_undo_cwr(tp, 1); 1645 tcp_undo_cwr(sk, 1);
1608 if (tp->ca_state == TCP_CA_Loss) 1646 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
1609 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); 1647 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
1610 else 1648 else
1611 NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); 1649 NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
@@ -1618,7 +1656,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
1618 tcp_moderate_cwnd(tp); 1656 tcp_moderate_cwnd(tp);
1619 return 1; 1657 return 1;
1620 } 1658 }
1621 tcp_set_ca_state(tp, TCP_CA_Open); 1659 tcp_set_ca_state(sk, TCP_CA_Open);
1622 return 0; 1660 return 0;
1623} 1661}
1624 1662
@@ -1627,7 +1665,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
1627{ 1665{
1628 if (tp->undo_marker && !tp->undo_retrans) { 1666 if (tp->undo_marker && !tp->undo_retrans) {
1629 DBGUNDO(sk, tp, "D-SACK"); 1667 DBGUNDO(sk, tp, "D-SACK");
1630 tcp_undo_cwr(tp, 1); 1668 tcp_undo_cwr(sk, 1);
1631 tp->undo_marker = 0; 1669 tp->undo_marker = 0;
1632 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); 1670 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
1633 } 1671 }
@@ -1648,10 +1686,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
1648 if (tp->retrans_out == 0) 1686 if (tp->retrans_out == 0)
1649 tp->retrans_stamp = 0; 1687 tp->retrans_stamp = 0;
1650 1688
1651 tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); 1689 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
1652 1690
1653 DBGUNDO(sk, tp, "Hoe"); 1691 DBGUNDO(sk, tp, "Hoe");
1654 tcp_undo_cwr(tp, 0); 1692 tcp_undo_cwr(sk, 0);
1655 NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); 1693 NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
1656 1694
1657 /* So... Do not make Hoe's retransmit yet. 1695 /* So... Do not make Hoe's retransmit yet.
@@ -1674,22 +1712,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1674 DBGUNDO(sk, tp, "partial loss"); 1712 DBGUNDO(sk, tp, "partial loss");
1675 tp->lost_out = 0; 1713 tp->lost_out = 0;
1676 tp->left_out = tp->sacked_out; 1714 tp->left_out = tp->sacked_out;
1677 tcp_undo_cwr(tp, 1); 1715 tcp_undo_cwr(sk, 1);
1678 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); 1716 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
1679 tp->retransmits = 0; 1717 inet_csk(sk)->icsk_retransmits = 0;
1680 tp->undo_marker = 0; 1718 tp->undo_marker = 0;
1681 if (!IsReno(tp)) 1719 if (!IsReno(tp))
1682 tcp_set_ca_state(tp, TCP_CA_Open); 1720 tcp_set_ca_state(sk, TCP_CA_Open);
1683 return 1; 1721 return 1;
1684 } 1722 }
1685 return 0; 1723 return 0;
1686} 1724}
1687 1725
1688static inline void tcp_complete_cwr(struct tcp_sock *tp) 1726static inline void tcp_complete_cwr(struct sock *sk)
1689{ 1727{
1728 struct tcp_sock *tp = tcp_sk(sk);
1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 1729 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1691 tp->snd_cwnd_stamp = tcp_time_stamp; 1730 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); 1731 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
1693} 1732}
1694 1733
1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1734static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1700,21 +1739,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
1700 tp->retrans_stamp = 0; 1739 tp->retrans_stamp = 0;
1701 1740
1702 if (flag&FLAG_ECE) 1741 if (flag&FLAG_ECE)
1703 tcp_enter_cwr(tp); 1742 tcp_enter_cwr(sk);
1704 1743
1705 if (tp->ca_state != TCP_CA_CWR) { 1744 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
1706 int state = TCP_CA_Open; 1745 int state = TCP_CA_Open;
1707 1746
1708 if (tp->left_out || tp->retrans_out || tp->undo_marker) 1747 if (tp->left_out || tp->retrans_out || tp->undo_marker)
1709 state = TCP_CA_Disorder; 1748 state = TCP_CA_Disorder;
1710 1749
1711 if (tp->ca_state != state) { 1750 if (inet_csk(sk)->icsk_ca_state != state) {
1712 tcp_set_ca_state(tp, state); 1751 tcp_set_ca_state(sk, state);
1713 tp->high_seq = tp->snd_nxt; 1752 tp->high_seq = tp->snd_nxt;
1714 } 1753 }
1715 tcp_moderate_cwnd(tp); 1754 tcp_moderate_cwnd(tp);
1716 } else { 1755 } else {
1717 tcp_cwnd_down(tp); 1756 tcp_cwnd_down(sk);
1718 } 1757 }
1719} 1758}
1720 1759
@@ -1733,6 +1772,7 @@ static void
1733tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, 1772tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1734 int prior_packets, int flag) 1773 int prior_packets, int flag)
1735{ 1774{
1775 struct inet_connection_sock *icsk = inet_csk(sk);
1736 struct tcp_sock *tp = tcp_sk(sk); 1776 struct tcp_sock *tp = tcp_sk(sk);
1737 int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); 1777 int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
1738 1778
@@ -1750,13 +1790,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1750 tp->prior_ssthresh = 0; 1790 tp->prior_ssthresh = 0;
1751 1791
1752 /* B. In all the states check for reneging SACKs. */ 1792 /* B. In all the states check for reneging SACKs. */
1753 if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) 1793 if (tp->sacked_out && tcp_check_sack_reneging(sk))
1754 return; 1794 return;
1755 1795
1756 /* C. Process data loss notification, provided it is valid. */ 1796 /* C. Process data loss notification, provided it is valid. */
1757 if ((flag&FLAG_DATA_LOST) && 1797 if ((flag&FLAG_DATA_LOST) &&
1758 before(tp->snd_una, tp->high_seq) && 1798 before(tp->snd_una, tp->high_seq) &&
1759 tp->ca_state != TCP_CA_Open && 1799 icsk->icsk_ca_state != TCP_CA_Open &&
1760 tp->fackets_out > tp->reordering) { 1800 tp->fackets_out > tp->reordering) {
1761 tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); 1801 tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
1762 NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); 1802 NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
@@ -1767,14 +1807,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1767 1807
1768 /* E. Check state exit conditions. State can be terminated 1808 /* E. Check state exit conditions. State can be terminated
1769 * when high_seq is ACKed. */ 1809 * when high_seq is ACKed. */
1770 if (tp->ca_state == TCP_CA_Open) { 1810 if (icsk->icsk_ca_state == TCP_CA_Open) {
1771 if (!sysctl_tcp_frto) 1811 if (!sysctl_tcp_frto)
1772 BUG_TRAP(tp->retrans_out == 0); 1812 BUG_TRAP(tp->retrans_out == 0);
1773 tp->retrans_stamp = 0; 1813 tp->retrans_stamp = 0;
1774 } else if (!before(tp->snd_una, tp->high_seq)) { 1814 } else if (!before(tp->snd_una, tp->high_seq)) {
1775 switch (tp->ca_state) { 1815 switch (icsk->icsk_ca_state) {
1776 case TCP_CA_Loss: 1816 case TCP_CA_Loss:
1777 tp->retransmits = 0; 1817 icsk->icsk_retransmits = 0;
1778 if (tcp_try_undo_recovery(sk, tp)) 1818 if (tcp_try_undo_recovery(sk, tp))
1779 return; 1819 return;
1780 break; 1820 break;
@@ -1783,8 +1823,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 1783 /* CWR is to be held until something *above* high_seq 1823 /* CWR is to be held until something *above* high_seq
 1784 * is ACKed, so the CWR bit reaches the receiver. */ 1824 * is ACKed, so the CWR bit reaches the receiver. */
1785 if (tp->snd_una != tp->high_seq) { 1825 if (tp->snd_una != tp->high_seq) {
1786 tcp_complete_cwr(tp); 1826 tcp_complete_cwr(sk);
1787 tcp_set_ca_state(tp, TCP_CA_Open); 1827 tcp_set_ca_state(sk, TCP_CA_Open);
1788 } 1828 }
1789 break; 1829 break;
1790 1830
@@ -1795,7 +1835,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1795 * catching for all duplicate ACKs. */ 1835 * catching for all duplicate ACKs. */
1796 IsReno(tp) || tp->snd_una != tp->high_seq) { 1836 IsReno(tp) || tp->snd_una != tp->high_seq) {
1797 tp->undo_marker = 0; 1837 tp->undo_marker = 0;
1798 tcp_set_ca_state(tp, TCP_CA_Open); 1838 tcp_set_ca_state(sk, TCP_CA_Open);
1799 } 1839 }
1800 break; 1840 break;
1801 1841
@@ -1804,17 +1844,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1804 tcp_reset_reno_sack(tp); 1844 tcp_reset_reno_sack(tp);
1805 if (tcp_try_undo_recovery(sk, tp)) 1845 if (tcp_try_undo_recovery(sk, tp))
1806 return; 1846 return;
1807 tcp_complete_cwr(tp); 1847 tcp_complete_cwr(sk);
1808 break; 1848 break;
1809 } 1849 }
1810 } 1850 }
1811 1851
1812 /* F. Process state. */ 1852 /* F. Process state. */
1813 switch (tp->ca_state) { 1853 switch (icsk->icsk_ca_state) {
1814 case TCP_CA_Recovery: 1854 case TCP_CA_Recovery:
1815 if (prior_snd_una == tp->snd_una) { 1855 if (prior_snd_una == tp->snd_una) {
1816 if (IsReno(tp) && is_dupack) 1856 if (IsReno(tp) && is_dupack)
1817 tcp_add_reno_sack(tp); 1857 tcp_add_reno_sack(sk);
1818 } else { 1858 } else {
1819 int acked = prior_packets - tp->packets_out; 1859 int acked = prior_packets - tp->packets_out;
1820 if (IsReno(tp)) 1860 if (IsReno(tp))
@@ -1824,13 +1864,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1824 break; 1864 break;
1825 case TCP_CA_Loss: 1865 case TCP_CA_Loss:
1826 if (flag&FLAG_DATA_ACKED) 1866 if (flag&FLAG_DATA_ACKED)
1827 tp->retransmits = 0; 1867 icsk->icsk_retransmits = 0;
1828 if (!tcp_try_undo_loss(sk, tp)) { 1868 if (!tcp_try_undo_loss(sk, tp)) {
1829 tcp_moderate_cwnd(tp); 1869 tcp_moderate_cwnd(tp);
1830 tcp_xmit_retransmit_queue(sk); 1870 tcp_xmit_retransmit_queue(sk);
1831 return; 1871 return;
1832 } 1872 }
1833 if (tp->ca_state != TCP_CA_Open) 1873 if (icsk->icsk_ca_state != TCP_CA_Open)
1834 return; 1874 return;
1835 /* Loss is undone; fall through to processing in Open state. */ 1875 /* Loss is undone; fall through to processing in Open state. */
1836 default: 1876 default:
@@ -1838,10 +1878,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1838 if (tp->snd_una != prior_snd_una) 1878 if (tp->snd_una != prior_snd_una)
1839 tcp_reset_reno_sack(tp); 1879 tcp_reset_reno_sack(tp);
1840 if (is_dupack) 1880 if (is_dupack)
1841 tcp_add_reno_sack(tp); 1881 tcp_add_reno_sack(sk);
1842 } 1882 }
1843 1883
1844 if (tp->ca_state == TCP_CA_Disorder) 1884 if (icsk->icsk_ca_state == TCP_CA_Disorder)
1845 tcp_try_undo_dsack(sk, tp); 1885 tcp_try_undo_dsack(sk, tp);
1846 1886
1847 if (!tcp_time_to_recover(sk, tp)) { 1887 if (!tcp_time_to_recover(sk, tp)) {
@@ -1861,30 +1901,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1861 tp->undo_marker = tp->snd_una; 1901 tp->undo_marker = tp->snd_una;
1862 tp->undo_retrans = tp->retrans_out; 1902 tp->undo_retrans = tp->retrans_out;
1863 1903
1864 if (tp->ca_state < TCP_CA_CWR) { 1904 if (icsk->icsk_ca_state < TCP_CA_CWR) {
1865 if (!(flag&FLAG_ECE)) 1905 if (!(flag&FLAG_ECE))
1866 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1906 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); 1907 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1868 TCP_ECN_queue_cwr(tp); 1908 TCP_ECN_queue_cwr(tp);
1869 } 1909 }
1870 1910
1871 tp->snd_cwnd_cnt = 0; 1911 tp->snd_cwnd_cnt = 0;
1872 tcp_set_ca_state(tp, TCP_CA_Recovery); 1912 tcp_set_ca_state(sk, TCP_CA_Recovery);
1873 } 1913 }
1874 1914
1875 if (is_dupack || tcp_head_timedout(sk, tp)) 1915 if (is_dupack || tcp_head_timedout(sk, tp))
1876 tcp_update_scoreboard(sk, tp); 1916 tcp_update_scoreboard(sk, tp);
1877 tcp_cwnd_down(tp); 1917 tcp_cwnd_down(sk);
1878 tcp_xmit_retransmit_queue(sk); 1918 tcp_xmit_retransmit_queue(sk);
1879} 1919}
1880 1920
1881/* Read draft-ietf-tcplw-high-performance before mucking 1921/* Read draft-ietf-tcplw-high-performance before mucking
 1882 * with this code. (Supersedes RFC1323) 1922 * with this code. (Supersedes RFC1323)
1883 */ 1923 */
1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) 1924static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
1885{ 1925{
1886 __u32 seq_rtt;
1887
1888 /* RTTM Rule: A TSecr value received in a segment is used to 1926 /* RTTM Rule: A TSecr value received in a segment is used to
1889 * update the averaged RTT measurement only if the segment 1927 * update the averaged RTT measurement only if the segment
1890 * acknowledges some new data, i.e., only if it advances the 1928 * acknowledges some new data, i.e., only if it advances the
@@ -1900,14 +1938,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1900 * answer arrives rto becomes 120 seconds! If at least one of segments 1938 * answer arrives rto becomes 120 seconds! If at least one of segments
1901 * in window is lost... Voila. --ANK (010210) 1939 * in window is lost... Voila. --ANK (010210)
1902 */ 1940 */
1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1941 struct tcp_sock *tp = tcp_sk(sk);
1904 tcp_rtt_estimator(tp, seq_rtt, usrtt); 1942 const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1905 tcp_set_rto(tp); 1943 tcp_rtt_estimator(sk, seq_rtt, usrtt);
1906 tp->backoff = 0; 1944 tcp_set_rto(sk);
1907 tcp_bound_rto(tp); 1945 inet_csk(sk)->icsk_backoff = 0;
1946 tcp_bound_rto(sk);
1908} 1947}
1909 1948
1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) 1949static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
1911{ 1950{
1912 /* We don't have a timestamp. Can only use 1951 /* We don't have a timestamp. Can only use
1913 * packets that are not retransmitted to determine 1952 * packets that are not retransmitted to determine
@@ -1921,27 +1960,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int
1921 if (flag & FLAG_RETRANS_DATA_ACKED) 1960 if (flag & FLAG_RETRANS_DATA_ACKED)
1922 return; 1961 return;
1923 1962
1924 tcp_rtt_estimator(tp, seq_rtt, usrtt); 1963 tcp_rtt_estimator(sk, seq_rtt, usrtt);
1925 tcp_set_rto(tp); 1964 tcp_set_rto(sk);
1926 tp->backoff = 0; 1965 inet_csk(sk)->icsk_backoff = 0;
1927 tcp_bound_rto(tp); 1966 tcp_bound_rto(sk);
1928} 1967}
1929 1968
1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1969static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
1931 int flag, s32 seq_rtt, u32 *usrtt) 1970 const s32 seq_rtt, u32 *usrtt)
1932{ 1971{
1972 const struct tcp_sock *tp = tcp_sk(sk);
1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1973 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1974 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
1935 tcp_ack_saw_tstamp(tp, usrtt, flag); 1975 tcp_ack_saw_tstamp(sk, usrtt, flag);
1936 else if (seq_rtt >= 0) 1976 else if (seq_rtt >= 0)
1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); 1977 tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
1938} 1978}
1939 1979
1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, 1980static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
1941 u32 in_flight, int good) 1981 u32 in_flight, int good)
1942{ 1982{
1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); 1983 const struct inet_connection_sock *icsk = inet_csk(sk);
1944 tp->snd_cwnd_stamp = tcp_time_stamp; 1984 icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
1985 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
1945} 1986}
1946 1987
1947/* Restart timer after forward progress on connection. 1988/* Restart timer after forward progress on connection.
@@ -1951,9 +1992,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
1951static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) 1992static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
1952{ 1993{
1953 if (!tp->packets_out) { 1994 if (!tp->packets_out) {
1954 tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); 1995 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
1955 } else { 1996 } else {
1956 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 1997 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
1957 } 1998 }
1958} 1999}
1959 2000
@@ -2068,9 +2109,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2068 seq_rtt = -1; 2109 seq_rtt = -1;
2069 } else if (seq_rtt < 0) 2110 } else if (seq_rtt < 0)
2070 seq_rtt = now - scb->when; 2111 seq_rtt = now - scb->when;
2071 if (seq_usrtt) 2112 if (seq_usrtt) {
2072 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 2113 struct timeval tv;
2073 + (usnow.tv_usec - skb->stamp.tv_usec); 2114
2115 skb_get_timestamp(skb, &tv);
2116 *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
2117 + (usnow.tv_usec - tv.tv_usec);
2118 }
2074 2119
2075 if (sacked & TCPCB_SACKED_ACKED) 2120 if (sacked & TCPCB_SACKED_ACKED)
2076 tp->sacked_out -= tcp_skb_pcount(skb); 2121 tp->sacked_out -= tcp_skb_pcount(skb);
@@ -2085,16 +2130,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2085 seq_rtt = now - scb->when; 2130 seq_rtt = now - scb->when;
2086 tcp_dec_pcount_approx(&tp->fackets_out, skb); 2131 tcp_dec_pcount_approx(&tp->fackets_out, skb);
2087 tcp_packets_out_dec(tp, skb); 2132 tcp_packets_out_dec(tp, skb);
2088 __skb_unlink(skb, skb->list); 2133 __skb_unlink(skb, &sk->sk_write_queue);
2089 sk_stream_free_skb(sk, skb); 2134 sk_stream_free_skb(sk, skb);
2090 } 2135 }
2091 2136
2092 if (acked&FLAG_ACKED) { 2137 if (acked&FLAG_ACKED) {
2093 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); 2138 const struct inet_connection_sock *icsk = inet_csk(sk);
2139 tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
2094 tcp_ack_packets_out(sk, tp); 2140 tcp_ack_packets_out(sk, tp);
2095 2141
2096 if (tp->ca_ops->pkts_acked) 2142 if (icsk->icsk_ca_ops->pkts_acked)
2097 tp->ca_ops->pkts_acked(tp, pkts_acked); 2143 icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
2098 } 2144 }
2099 2145
2100#if FASTRETRANS_DEBUG > 0 2146#if FASTRETRANS_DEBUG > 0
@@ -2102,19 +2148,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2102 BUG_TRAP((int)tp->lost_out >= 0); 2148 BUG_TRAP((int)tp->lost_out >= 0);
2103 BUG_TRAP((int)tp->retrans_out >= 0); 2149 BUG_TRAP((int)tp->retrans_out >= 0);
2104 if (!tp->packets_out && tp->rx_opt.sack_ok) { 2150 if (!tp->packets_out && tp->rx_opt.sack_ok) {
2151 const struct inet_connection_sock *icsk = inet_csk(sk);
2105 if (tp->lost_out) { 2152 if (tp->lost_out) {
2106 printk(KERN_DEBUG "Leak l=%u %d\n", 2153 printk(KERN_DEBUG "Leak l=%u %d\n",
2107 tp->lost_out, tp->ca_state); 2154 tp->lost_out, icsk->icsk_ca_state);
2108 tp->lost_out = 0; 2155 tp->lost_out = 0;
2109 } 2156 }
2110 if (tp->sacked_out) { 2157 if (tp->sacked_out) {
2111 printk(KERN_DEBUG "Leak s=%u %d\n", 2158 printk(KERN_DEBUG "Leak s=%u %d\n",
2112 tp->sacked_out, tp->ca_state); 2159 tp->sacked_out, icsk->icsk_ca_state);
2113 tp->sacked_out = 0; 2160 tp->sacked_out = 0;
2114 } 2161 }
2115 if (tp->retrans_out) { 2162 if (tp->retrans_out) {
2116 printk(KERN_DEBUG "Leak r=%u %d\n", 2163 printk(KERN_DEBUG "Leak r=%u %d\n",
2117 tp->retrans_out, tp->ca_state); 2164 tp->retrans_out, icsk->icsk_ca_state);
2118 tp->retrans_out = 0; 2165 tp->retrans_out = 0;
2119 } 2166 }
2120 } 2167 }
@@ -2125,40 +2172,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2125 2172
2126static void tcp_ack_probe(struct sock *sk) 2173static void tcp_ack_probe(struct sock *sk)
2127{ 2174{
2128 struct tcp_sock *tp = tcp_sk(sk); 2175 const struct tcp_sock *tp = tcp_sk(sk);
2176 struct inet_connection_sock *icsk = inet_csk(sk);
2129 2177
2130 /* Was it a usable window open? */ 2178 /* Was it a usable window open? */
2131 2179
2132 if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, 2180 if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
2133 tp->snd_una + tp->snd_wnd)) { 2181 tp->snd_una + tp->snd_wnd)) {
2134 tp->backoff = 0; 2182 icsk->icsk_backoff = 0;
2135 tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); 2183 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
2136 /* Socket must be waked up by subsequent tcp_data_snd_check(). 2184 /* Socket must be waked up by subsequent tcp_data_snd_check().
2137 * This function is not for random using! 2185 * This function is not for random using!
2138 */ 2186 */
2139 } else { 2187 } else {
2140 tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, 2188 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2141 min(tp->rto << tp->backoff, TCP_RTO_MAX)); 2189 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
2190 TCP_RTO_MAX);
2142 } 2191 }
2143} 2192}
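
tcp_ack_probe() either cancels the zero-window probe timer (the window opened) or re-arms it with exponential backoff, rto << backoff capped at TCP_RTO_MAX; note the converted inet_csk_reset_xmit_timer() now takes that cap explicitly. A sketch of the timeout series, with an assumed 120-second cap.

#include <stdio.h>

#define TCP_RTO_MAX (120 * 1000) /* assumed, in ms */

static unsigned probe_timeout(unsigned rto, unsigned backoff)
{
	unsigned long t = (unsigned long)rto << backoff;

	return t > TCP_RTO_MAX ? TCP_RTO_MAX : (unsigned)t;
}

int main(void)
{
	for (unsigned b = 0; b < 8; b++)
		printf("backoff=%u -> %ums\n", b, probe_timeout(3000, b));
	return 0;
}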
2144 2193
2145static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) 2194static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
2146{ 2195{
2147 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 2196 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
2148 tp->ca_state != TCP_CA_Open); 2197 inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
2149} 2198}
2150 2199
2151static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) 2200static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
2152{ 2201{
2202 const struct tcp_sock *tp = tcp_sk(sk);
2153 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 2203 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
2154 !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); 2204 !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
2155} 2205}
2156 2206
2157/* Check that window update is acceptable. 2207/* Check that window update is acceptable.
2158 * The function assumes that snd_una<=ack<=snd_next. 2208 * The function assumes that snd_una<=ack<=snd_next.
2159 */ 2209 */
2160static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, 2210static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
2161 u32 ack_seq, u32 nwin) 2211 const u32 ack_seq, const u32 nwin)
2162{ 2212{
2163 return (after(ack, tp->snd_una) || 2213 return (after(ack, tp->snd_una) ||
2164 after(ack_seq, tp->snd_wl1) || 2214 after(ack_seq, tp->snd_wl1) ||
@@ -2241,6 +2291,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2241/* This routine deals with incoming acks, but not outgoing ones. */ 2291/* This routine deals with incoming acks, but not outgoing ones. */
2242static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2292static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2243{ 2293{
2294 struct inet_connection_sock *icsk = inet_csk(sk);
2244 struct tcp_sock *tp = tcp_sk(sk); 2295 struct tcp_sock *tp = tcp_sk(sk);
2245 u32 prior_snd_una = tp->snd_una; 2296 u32 prior_snd_una = tp->snd_una;
2246 u32 ack_seq = TCP_SKB_CB(skb)->seq; 2297 u32 ack_seq = TCP_SKB_CB(skb)->seq;
@@ -2268,7 +2319,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2268 tp->snd_una = ack; 2319 tp->snd_una = ack;
2269 flag |= FLAG_WIN_UPDATE; 2320 flag |= FLAG_WIN_UPDATE;
2270 2321
2271 tcp_ca_event(tp, CA_EVENT_FAST_ACK); 2322 tcp_ca_event(sk, CA_EVENT_FAST_ACK);
2272 2323
2273 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2324 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2274 } else { 2325 } else {
@@ -2285,7 +2336,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2285 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2336 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2286 flag |= FLAG_ECE; 2337 flag |= FLAG_ECE;
2287 2338
2288 tcp_ca_event(tp, CA_EVENT_SLOW_ACK); 2339 tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
2289 } 2340 }
2290 2341
2291 /* We passed data and got it acked, remove any soft error 2342 /* We passed data and got it acked, remove any soft error
@@ -2301,19 +2352,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2301 2352
2302 /* See if we can take anything off of the retransmit queue. */ 2353 /* See if we can take anything off of the retransmit queue. */
2303 flag |= tcp_clean_rtx_queue(sk, &seq_rtt, 2354 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2304 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); 2355 icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
2305 2356
2306 if (tp->frto_counter) 2357 if (tp->frto_counter)
2307 tcp_process_frto(sk, prior_snd_una); 2358 tcp_process_frto(sk, prior_snd_una);
2308 2359
2309 if (tcp_ack_is_dubious(tp, flag)) { 2360 if (tcp_ack_is_dubious(sk, flag)) {
 2310 /* Advance CWND, if state allows this. */ 2361 /* Advance CWND, if state allows this. */
2311 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) 2362 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
2312 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); 2363 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
2313 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2364 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2314 } else { 2365 } else {
2315 if ((flag & FLAG_DATA_ACKED)) 2366 if ((flag & FLAG_DATA_ACKED))
2316 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); 2367 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
2317 } 2368 }
2318 2369
2319 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2370 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -2322,7 +2373,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2322 return 1; 2373 return 1;
2323 2374
2324no_queue: 2375no_queue:
2325 tp->probes_out = 0; 2376 icsk->icsk_probes_out = 0;
2326 2377
2327 /* If this ack opens up a zero window, clear backoff. It was 2378 /* If this ack opens up a zero window, clear backoff. It was
2328 * being used to time the probes, and is probably far higher than 2379 * being used to time the probes, and is probably far higher than
@@ -2500,8 +2551,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
2500 * up to bandwidth of 18Gigabit/sec. 8) ] 2551 * up to bandwidth of 18Gigabit/sec. 8) ]
2501 */ 2552 */
2502 2553
2503static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) 2554static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
2504{ 2555{
2556 struct tcp_sock *tp = tcp_sk(sk);
2505 struct tcphdr *th = skb->h.th; 2557 struct tcphdr *th = skb->h.th;
2506 u32 seq = TCP_SKB_CB(skb)->seq; 2558 u32 seq = TCP_SKB_CB(skb)->seq;
2507 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2559 u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -2516,14 +2568,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
2516 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && 2568 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
2517 2569
2518 /* 4. ... and sits in replay window. */ 2570 /* 4. ... and sits in replay window. */
2519 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); 2571 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
2520} 2572}
2521 2573
2522static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) 2574static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
2523{ 2575{
2576 const struct tcp_sock *tp = tcp_sk(sk);
2524 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && 2577 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
2525 xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && 2578 xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
2526 !tcp_disordered_ack(tp, skb)); 2579 !tcp_disordered_ack(sk, skb));
2527} 2580}
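
tcp_paws_discard() drops a segment whose timestamp lags the last accepted one by more than TCP_PAWS_WINDOW, provided the stored stamp is under 24 days old and the segment is not a benignly disordered ACK per tcp_disordered_ack(). A wrap-safe sketch of the core comparison; the disordered-ACK exemption is omitted, and the constants mirror the kernel's.

#include <stdint.h>
#include <stdio.h>

#define PAWS_WINDOW  1                    /* ticks, as in the kernel */
#define PAWS_24DAYS  (60 * 60 * 24 * 24)  /* seconds */

/* Core PAWS check: the received tsval must not lag the last accepted
 * ts_recent by more than the window, using signed 32-bit arithmetic
 * so timestamp wraparound behaves. State older than 24 days is
 * considered stale and never grounds for rejection.
 */
static int paws_reject(uint32_t ts_recent, uint32_t rcv_tsval,
		       long now_sec, long ts_recent_stamp)
{
	return (int32_t)(ts_recent - rcv_tsval) > PAWS_WINDOW &&
	       now_sec < ts_recent_stamp + PAWS_24DAYS;
}

int main(void)
{
	printf("fresh tsval: reject=%d\n", paws_reject(1000, 1005, 100, 90));
	printf("old tsval:   reject=%d\n", paws_reject(1000,  990, 100, 90));
	return 0;
}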
2528 2581
2529/* Check segment sequence number for validity. 2582/* Check segment sequence number for validity.
@@ -2586,7 +2639,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
 {
     struct tcp_sock *tp = tcp_sk(sk);

-    tcp_schedule_ack(tp);
+    inet_csk_schedule_ack(sk);

     sk->sk_shutdown |= RCV_SHUTDOWN;
     sock_set_flag(sk, SOCK_DONE);
@@ -2596,7 +2649,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
     case TCP_ESTABLISHED:
         /* Move to CLOSE_WAIT */
         tcp_set_state(sk, TCP_CLOSE_WAIT);
-        tp->ack.pingpong = 1;
+        inet_csk(sk)->icsk_ack.pingpong = 1;
         break;

     case TCP_CLOSE_WAIT:
@@ -2694,7 +2747,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
     if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
         before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
         NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
-        tcp_enter_quickack_mode(tp);
+        tcp_enter_quickack_mode(sk);

         if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
             u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -2853,7 +2906,7 @@ static void tcp_ofo_queue(struct sock *sk)

         if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
             SOCK_DEBUG(sk, "ofo packet was already received \n");
-            __skb_unlink(skb, skb->list);
+            __skb_unlink(skb, &tp->out_of_order_queue);
             __kfree_skb(skb);
             continue;
         }
@@ -2861,7 +2914,7 @@ static void tcp_ofo_queue(struct sock *sk)
                    tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
                    TCP_SKB_CB(skb)->end_seq);

-        __skb_unlink(skb, skb->list);
+        __skb_unlink(skb, &tp->out_of_order_queue);
         __skb_queue_tail(&sk->sk_receive_queue, skb);
         tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
         if(skb->h.th->fin)
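Every __skb_unlink() call in this file gains an explicit queue argument: sk_buff no longer carries a back-pointer (skb->list) to the queue it sits on, so the caller must name the queue. A sketch of the new helper, consistent with these call sites but written from memory rather than copied from the tree:

    static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
    {
        struct sk_buff *next = skb->next, *prev = skb->prev;

        list->qlen--;                   /* queue length lives in the head */
        next->prev = prev;              /* splice skb out of the ring */
        prev->next = next;
        skb->next = skb->prev = NULL;   /* skb no longer remembers a queue */
    }

Dropping the per-skb list pointer shrinks every sk_buff at the cost of threading the queue through a handful of call sites, which is what most of the hunks below do.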
@@ -2942,7 +2995,7 @@ queue_and_out:
              * gap in queue is filled.
              */
             if (skb_queue_empty(&tp->out_of_order_queue))
-                tp->ack.pingpong = 0;
+                inet_csk(sk)->icsk_ack.pingpong = 0;
         }

         if (tp->rx_opt.num_sacks)
@@ -2963,8 +3016,8 @@ queue_and_out:
         tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

 out_of_window:
-        tcp_enter_quickack_mode(tp);
-        tcp_schedule_ack(tp);
+        tcp_enter_quickack_mode(sk);
+        inet_csk_schedule_ack(sk);
 drop:
         __kfree_skb(skb);
         return;
@@ -2974,7 +3027,7 @@ drop:
     if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
         goto out_of_window;

-    tcp_enter_quickack_mode(tp);
+    tcp_enter_quickack_mode(sk);

     if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
         /* Partial packet, seq < rcv_next < end_seq */
@@ -3003,7 +3056,7 @@ drop:

     /* Disable header prediction. */
     tp->pred_flags = 0;
-    tcp_schedule_ack(tp);
+    inet_csk_schedule_ack(sk);

     SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
                tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -3027,7 +3080,7 @@ drop:
         u32 end_seq = TCP_SKB_CB(skb)->end_seq;

         if (seq == TCP_SKB_CB(skb1)->end_seq) {
-            __skb_append(skb1, skb);
+            __skb_append(skb1, skb, &tp->out_of_order_queue);

             if (!tp->rx_opt.num_sacks ||
                 tp->selective_acks[0].end_seq != seq)
@@ -3071,7 +3124,7 @@ drop:
                 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
                 break;
             }
-            __skb_unlink(skb1, skb1->list);
+            __skb_unlink(skb1, &tp->out_of_order_queue);
             tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
             __kfree_skb(skb1);
         }
@@ -3088,8 +3141,9 @@ add_sack:
  * simplifies code)
  */
 static void
-tcp_collapse(struct sock *sk, struct sk_buff *head,
-             struct sk_buff *tail, u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+             struct sk_buff *head, struct sk_buff *tail,
+             u32 start, u32 end)
 {
     struct sk_buff *skb;

@@ -3099,7 +3153,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
     /* No new bits? It is possible on ofo queue. */
     if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
         struct sk_buff *next = skb->next;
-        __skb_unlink(skb, skb->list);
+        __skb_unlink(skb, list);
         __kfree_skb(skb);
         NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
         skb = next;
@@ -3145,7 +3199,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
         nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
         memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
         TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-        __skb_insert(nskb, skb->prev, skb, skb->list);
+        __skb_insert(nskb, skb->prev, skb, list);
         sk_stream_set_owner_r(nskb, sk);

         /* Copy data, releasing collapsed skbs. */
@@ -3164,7 +3218,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
         }
         if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
             struct sk_buff *next = skb->next;
-            __skb_unlink(skb, skb->list);
+            __skb_unlink(skb, list);
             __kfree_skb(skb);
             NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
             skb = next;
@@ -3200,7 +3254,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
         if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
             after(TCP_SKB_CB(skb)->seq, end) ||
             before(TCP_SKB_CB(skb)->end_seq, start)) {
-            tcp_collapse(sk, head, skb, start, end);
+            tcp_collapse(sk, &tp->out_of_order_queue,
+                         head, skb, start, end);
             head = skb;
             if (skb == (struct sk_buff *)&tp->out_of_order_queue)
                 break;
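tcp_collapse() now receives the owning queue explicitly for the same reason as __skb_unlink(): with skb->list gone, the [head, tail) skb window alone no longer identifies the queue. The two call shapes after this change, taken directly from the call sites in this file (here and in tcp_prune_queue() below):

    /* Collapse a stretch of the out-of-order queue ... */
    tcp_collapse(sk, &tp->out_of_order_queue, head, skb, start, end);

    /* ... or the whole receive queue, from copied_seq to rcv_nxt. */
    tcp_collapse(sk, &sk->sk_receive_queue, sk->sk_receive_queue.next,
                 (struct sk_buff *)&sk->sk_receive_queue,
                 tp->copied_seq, tp->rcv_nxt);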
@@ -3237,7 +3292,8 @@ static int tcp_prune_queue(struct sock *sk)
     tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);

     tcp_collapse_ofo_queue(sk);
-    tcp_collapse(sk, sk->sk_receive_queue.next,
+    tcp_collapse(sk, &sk->sk_receive_queue,
+                 sk->sk_receive_queue.next,
                  (struct sk_buff*)&sk->sk_receive_queue,
                  tp->copied_seq, tp->rcv_nxt);
     sk_stream_mem_reclaim(sk);
@@ -3286,12 +3342,12 @@ void tcp_cwnd_application_limited(struct sock *sk)
 {
     struct tcp_sock *tp = tcp_sk(sk);

-    if (tp->ca_state == TCP_CA_Open &&
+    if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
         sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
         /* Limited by application or receiver window. */
         u32 win_used = max(tp->snd_cwnd_used, 2U);
         if (win_used < tp->snd_cwnd) {
-            tp->snd_ssthresh = tcp_current_ssthresh(tp);
+            tp->snd_ssthresh = tcp_current_ssthresh(sk);
             tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
         }
         tp->snd_cwnd_used = 0;
@@ -3370,13 +3426,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
     struct tcp_sock *tp = tcp_sk(sk);

     /* More than one full frame received... */
-    if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
+    if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
          /* ... and right edge of window advances far enough.
          * (tcp_recvmsg() will send ACK otherwise). Or...
          */
         && __tcp_select_window(sk) >= tp->rcv_wnd) ||
         /* We ACK each frame or... */
-        tcp_in_quickack_mode(tp) ||
+        tcp_in_quickack_mode(sk) ||
         /* We have out of order data. */
         (ofo_possible &&
          skb_peek(&tp->out_of_order_queue))) {
@@ -3390,8 +3446,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)

 static __inline__ void tcp_ack_snd_check(struct sock *sk)
 {
-    struct tcp_sock *tp = tcp_sk(sk);
-    if (!tcp_ack_scheduled(tp)) {
+    if (!inet_csk_ack_scheduled(sk)) {
         /* We sent a data segment already. */
         return;
     }
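tcp_ack_scheduled(tp) becomes inet_csk_ack_scheduled(sk) because the delayed-ACK bookkeeping moved into icsk_ack. A sketch of the pair of helpers, assuming the ICSK_ACK_SCHED flag mirrors the old TCP_ACK_SCHED bit:

    static inline void inet_csk_schedule_ack(struct sock *sk)
    {
        inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED;
    }

    static inline int inet_csk_ack_scheduled(const struct sock *sk)
    {
        return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED;
    }

Making these sk-based rather than tp-based is what lets the same delayed-ACK machinery serve DCCP as well as TCP.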
@@ -3462,7 +3517,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
         tp->copied_seq++;
         if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
-            __skb_unlink(skb, skb->list);
+            __skb_unlink(skb, &sk->sk_receive_queue);
             __kfree_skb(skb);
         }
     }
@@ -3645,7 +3700,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                 tp->rcv_nxt == tp->rcv_wup)
                 tcp_store_ts_recent(tp);

-            tcp_rcv_rtt_measure_ts(tp, skb);
+            tcp_rcv_rtt_measure_ts(sk, skb);

             /* We know that such packets are checksummed
              * on entry.
@@ -3678,7 +3733,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                     tp->rcv_nxt == tp->rcv_wup)
                     tcp_store_ts_recent(tp);

-                tcp_rcv_rtt_measure_ts(tp, skb);
+                tcp_rcv_rtt_measure_ts(sk, skb);

                 __skb_pull(skb, tcp_header_len);
                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3699,7 +3754,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                 tp->rcv_nxt == tp->rcv_wup)
                 tcp_store_ts_recent(tp);

-            tcp_rcv_rtt_measure_ts(tp, skb);
+            tcp_rcv_rtt_measure_ts(sk, skb);

             if ((int)skb->truesize > sk->sk_forward_alloc)
                 goto step5;
@@ -3719,7 +3774,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
             /* Well, only one small jumplet in fast path... */
             tcp_ack(sk, skb, FLAG_DATA);
             tcp_data_snd_check(sk, tp);
-            if (!tcp_ack_scheduled(tp))
+            if (!inet_csk_ack_scheduled(sk))
                 goto no_ack;
         }

@@ -3741,7 +3796,7 @@ slow_path:
      * RFC1323: H1. Apply PAWS check first.
      */
     if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
-        tcp_paws_discard(tp, skb)) {
+        tcp_paws_discard(sk, skb)) {
         if (!th->rst) {
             NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
             tcp_send_dupack(sk, skb);
@@ -3788,7 +3843,7 @@ step5:
     if(th->ack)
         tcp_ack(sk, skb, FLAG_SLOWPATH);

-    tcp_rcv_rtt_measure_ts(tp, skb);
+    tcp_rcv_rtt_measure_ts(sk, skb);

     /* Process urgent data. */
     tcp_urg(sk, skb, th);
@@ -3817,6 +3872,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
     tcp_parse_options(skb, &tp->rx_opt, 0);

     if (th->ack) {
+        struct inet_connection_sock *icsk;
         /* rfc793:
          * "If the state is SYN-SENT then
          *    first check the ACK bit
@@ -3920,7 +3976,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,

         tcp_init_metrics(sk);

-        tcp_init_congestion_control(tp);
+        tcp_init_congestion_control(sk);

         /* Prevent spurious tcp_cwnd_restart() on first data
          * packet.
@@ -3930,7 +3986,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
         tcp_init_buffer_space(sk);

         if (sock_flag(sk, SOCK_KEEPOPEN))
-            tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+            inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

         if (!tp->rx_opt.snd_wscale)
             __tcp_fast_path_on(tp, tp->snd_wnd);
@@ -3942,7 +3998,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
             sk_wake_async(sk, 0, POLL_OUT);
         }

-        if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
+        icsk = inet_csk(sk);
+
+        if (sk->sk_write_pending ||
+            icsk->icsk_accept_queue.rskq_defer_accept ||
+            icsk->icsk_ack.pingpong) {
             /* Save one ACK. Data will be ready after
              * several ticks, if write_pending is set.
              *
@@ -3950,12 +4010,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
              * look so _wonderfully_ clever, that I was not able
              * to stand against the temptation 8)     --ANK
              */
-            tcp_schedule_ack(tp);
-            tp->ack.lrcvtime = tcp_time_stamp;
-            tp->ack.ato = TCP_ATO_MIN;
-            tcp_incr_quickack(tp);
-            tcp_enter_quickack_mode(tp);
-            tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+            inet_csk_schedule_ack(sk);
+            icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+            icsk->icsk_ack.ato = TCP_ATO_MIN;
+            tcp_incr_quickack(sk);
+            tcp_enter_quickack_mode(sk);
+            inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+                                      TCP_DELACK_MAX, TCP_RTO_MAX);

 discard:
             __kfree_skb(skb);
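tcp_reset_xmit_timer() grows into inet_csk_reset_xmit_timer(), which takes the per-protocol ceiling explicitly (TCP_RTO_MAX here) instead of hard-coding TCP's. A condensed sketch of the delayed-ACK branch exercised above; the retransmit/probe branch is elided and the body is reconstructed from how the call sites use it, not quoted verbatim:

    static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
                                                 unsigned long when,
                                                 const unsigned long max_when)
    {
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (when > max_when)    /* clamp to the caller-supplied ceiling */
            when = max_when;

        if (what == ICSK_TIME_DACK) {
            icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
            icsk->icsk_ack.timeout = jiffies + when;
            sk_reset_timer(sk, &icsk->icsk_delack_timer,
                           icsk->icsk_ack.timeout);
        }
        /* ICSK_TIME_RETRANS / ICSK_TIME_PROBE0 arm
         * icsk_retransmit_timer analogously (elided). */
    }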
@@ -4111,7 +4172,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
     }

     if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
-        tcp_paws_discard(tp, skb)) {
+        tcp_paws_discard(sk, skb)) {
         if (!th->rst) {
             NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
             tcp_send_dupack(sk, skb);
@@ -4180,7 +4241,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
              */
             if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
                 !tp->srtt)
-                tcp_ack_saw_tstamp(tp, 0, 0);
+                tcp_ack_saw_tstamp(sk, NULL, 0);

             if (tp->rx_opt.tstamp_ok)
                 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4192,7 +4253,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,

             tcp_init_metrics(sk);

-            tcp_init_congestion_control(tp);
+            tcp_init_congestion_control(sk);

             /* Prevent spurious tcp_cwnd_restart() on
              * first data packet.
@@ -4227,9 +4288,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                 return 1;
             }

-            tmo = tcp_fin_time(tp);
+            tmo = tcp_fin_time(sk);
             if (tmo > TCP_TIMEWAIT_LEN) {
-                tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+                inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
             } else if (th->fin || sock_owned_by_user(sk)) {
                 /* Bad case. We could lose such FIN otherwise.
                  * It is not a big problem, but it looks confusing
@@ -4237,7 +4298,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                  * if it spins in bh_lock_sock(), but it is really
                  * marginal case.
                  */
-                tcp_reset_keepalive_timer(sk, tmo);
+                inet_csk_reset_keepalive_timer(sk, tmo);
             } else {
                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                 goto discard;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 62f62bb05c2a..13dfb391cdf1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -64,7 +64,9 @@
 #include <linux/times.h>

 #include <net/icmp.h>
+#include <net/inet_hashtables.h>
 #include <net/tcp.h>
+#include <net/transp_v6.h>
 #include <net/ipv6.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
@@ -75,7 +77,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>

-extern int sysctl_ip_dynaddr;
 int sysctl_tcp_tw_reuse;
 int sysctl_tcp_low_latency;

@@ -88,458 +89,29 @@ static struct socket *tcp_socket;
 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                        struct sk_buff *skb);

-struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
-    .__tcp_lhash_lock     = RW_LOCK_UNLOCKED,
-    .__tcp_lhash_users    = ATOMIC_INIT(0),
-    .__tcp_lhash_wait
-      = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
-    .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
-};
+struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
+    .lhash_lock     = RW_LOCK_UNLOCKED,
+    .lhash_users    = ATOMIC_INIT(0),
+    .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
+    .portalloc_lock = SPIN_LOCK_UNLOCKED,
+    .port_rover     = 1024 - 1,
+};

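The TCP-private struct tcp_hashinfo becomes the protocol-neutral struct inet_hashinfo, so another connection-oriented protocol (DCCP, added elsewhere in this tree) can own an identical set of bind/established/listen tables. An abbreviated sketch of the structure the initializer above fills in - field order and the listening-table size macro are as I recall them, not quoted from the header:

    struct inet_hashinfo {
        struct inet_ehash_bucket    *ehash;     /* established + timewait */
        struct inet_bind_hashbucket *bhash;     /* bound local ports */
        int                         ehash_size;
        int                         bhash_size;
        struct hlist_head           listening_hash[INET_LHTABLE_SIZE];
        rwlock_t                    lhash_lock; /* protects listening_hash */
        atomic_t                    lhash_users;
        wait_queue_head_t           lhash_wait;
        spinlock_t                  portalloc_lock;
        kmem_cache_t                *bind_bucket_cachep;
        int                         port_rover; /* was tcp_port_rover */
    };

Note that port_rover, previously the file-scope global tcp_port_rover removed below, now travels with the table it indexes.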
-/*
- * This array holds the first and last local port number.
- * For high-usage systems, use sysctl to change this to
- * 32768-61000
- */
-int sysctl_local_port_range[2] = { 1024, 4999 };
-int tcp_port_rover = 1024 - 1;
-
-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
-                                 __u32 faddr, __u16 fport)
-{
-    int h = (laddr ^ lport) ^ (faddr ^ fport);
-    h ^= h >> 16;
-    h ^= h >> 8;
-    return h & (tcp_ehash_size - 1);
-}
-
-static __inline__ int tcp_sk_hashfn(struct sock *sk)
-{
-    struct inet_sock *inet = inet_sk(sk);
-    __u32 laddr = inet->rcv_saddr;
-    __u16 lport = inet->num;
-    __u32 faddr = inet->daddr;
-    __u16 fport = inet->dport;
-
-    return tcp_hashfn(laddr, lport, faddr, fport);
-}
-
-/* Allocate and initialize a new TCP local port bind bucket.
- * The bindhash mutex for snum's hash chain must be held here.
- */
-struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
-                                          unsigned short snum)
-{
-    struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
-                                                  SLAB_ATOMIC);
-    if (tb) {
-        tb->port = snum;
-        tb->fastreuse = 0;
-        INIT_HLIST_HEAD(&tb->owners);
-        hlist_add_head(&tb->node, &head->chain);
-    }
-    return tb;
-}
-
-/* Caller must hold hashbucket lock for this tb with local BH disabled */
-void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
-{
-    if (hlist_empty(&tb->owners)) {
-        __hlist_del(&tb->node);
-        kmem_cache_free(tcp_bucket_cachep, tb);
-    }
-}
-
-/* Caller must disable local BH processing. */
-static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
-{
-    struct tcp_bind_hashbucket *head =
-                &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
-    struct tcp_bind_bucket *tb;
-
-    spin_lock(&head->lock);
-    tb = tcp_sk(sk)->bind_hash;
-    sk_add_bind_node(child, &tb->owners);
-    tcp_sk(child)->bind_hash = tb;
-    spin_unlock(&head->lock);
-}
-
-inline void tcp_inherit_port(struct sock *sk, struct sock *child)
-{
-    local_bh_disable();
-    __tcp_inherit_port(sk, child);
-    local_bh_enable();
-}
-
-void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
-                   unsigned short snum)
-{
-    inet_sk(sk)->num = snum;
-    sk_add_bind_node(sk, &tb->owners);
-    tcp_sk(sk)->bind_hash = tb;
-}
-
-static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
-{
-    const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
-    struct sock *sk2;
-    struct hlist_node *node;
-    int reuse = sk->sk_reuse;
-
-    sk_for_each_bound(sk2, node, &tb->owners) {
-        if (sk != sk2 &&
-            !tcp_v6_ipv6only(sk2) &&
-            (!sk->sk_bound_dev_if ||
-             !sk2->sk_bound_dev_if ||
-             sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
-            if (!reuse || !sk2->sk_reuse ||
-                sk2->sk_state == TCP_LISTEN) {
-                const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
-                if (!sk2_rcv_saddr || !sk_rcv_saddr ||
-                    sk2_rcv_saddr == sk_rcv_saddr)
-                    break;
-            }
-        }
-    }
-    return node != NULL;
-}
-
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- */
 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 {
-    struct tcp_bind_hashbucket *head;
-    struct hlist_node *node;
-    struct tcp_bind_bucket *tb;
-    int ret;
-
-    local_bh_disable();
-    if (!snum) {
-        int low = sysctl_local_port_range[0];
-        int high = sysctl_local_port_range[1];
-        int remaining = (high - low) + 1;
-        int rover;
-
-        spin_lock(&tcp_portalloc_lock);
-        if (tcp_port_rover < low)
-            rover = low;
-        else
-            rover = tcp_port_rover;
-        do {
-            rover++;
-            if (rover > high)
-                rover = low;
-            head = &tcp_bhash[tcp_bhashfn(rover)];
-            spin_lock(&head->lock);
-            tb_for_each(tb, node, &head->chain)
-                if (tb->port == rover)
-                    goto next;
-            break;
-        next:
-            spin_unlock(&head->lock);
-        } while (--remaining > 0);
-        tcp_port_rover = rover;
-        spin_unlock(&tcp_portalloc_lock);
-
-        /* Exhausted local port range during search? */
-        ret = 1;
-        if (remaining <= 0)
-            goto fail;
-
-        /* OK, here is the one we will use. HEAD is
-         * non-NULL and we hold it's mutex.
-         */
-        snum = rover;
-    } else {
-        head = &tcp_bhash[tcp_bhashfn(snum)];
-        spin_lock(&head->lock);
-        tb_for_each(tb, node, &head->chain)
-            if (tb->port == snum)
-                goto tb_found;
-    }
-    tb = NULL;
-    goto tb_not_found;
-tb_found:
-    if (!hlist_empty(&tb->owners)) {
-        if (sk->sk_reuse > 1)
-            goto success;
-        if (tb->fastreuse > 0 &&
-            sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
-            goto success;
-        } else {
-            ret = 1;
-            if (tcp_bind_conflict(sk, tb))
-                goto fail_unlock;
-        }
-    }
-tb_not_found:
-    ret = 1;
-    if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
-        goto fail_unlock;
-    if (hlist_empty(&tb->owners)) {
-        if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-            tb->fastreuse = 1;
-        else
-            tb->fastreuse = 0;
-    } else if (tb->fastreuse &&
-               (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-        tb->fastreuse = 0;
-success:
-    if (!tcp_sk(sk)->bind_hash)
-        tcp_bind_hash(sk, tb, snum);
-    BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
-    ret = 0;
-
-fail_unlock:
-    spin_unlock(&head->lock);
-fail:
-    local_bh_enable();
-    return ret;
+    return inet_csk_get_port(&tcp_hashinfo, sk, snum);
-}
-
-/* Get rid of any references to a local port held by the
- * given sock.
- */
-static void __tcp_put_port(struct sock *sk)
-{
-    struct inet_sock *inet = inet_sk(sk);
-    struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
-    struct tcp_bind_bucket *tb;
-
-    spin_lock(&head->lock);
-    tb = tcp_sk(sk)->bind_hash;
-    __sk_del_bind_node(sk);
-    tcp_sk(sk)->bind_hash = NULL;
-    inet->num = 0;
-    tcp_bucket_destroy(tb);
-    spin_unlock(&head->lock);
-}
-
-void tcp_put_port(struct sock *sk)
-{
-    local_bh_disable();
-    __tcp_put_port(sk);
-    local_bh_enable();
-}
-
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-
-void tcp_listen_wlock(void)
-{
-    write_lock(&tcp_lhash_lock);
-
-    if (atomic_read(&tcp_lhash_users)) {
-        DEFINE_WAIT(wait);
-
-        for (;;) {
-            prepare_to_wait_exclusive(&tcp_lhash_wait,
-                                      &wait, TASK_UNINTERRUPTIBLE);
-            if (!atomic_read(&tcp_lhash_users))
-                break;
-            write_unlock_bh(&tcp_lhash_lock);
-            schedule();
-            write_lock_bh(&tcp_lhash_lock);
-        }
-
-        finish_wait(&tcp_lhash_wait, &wait);
-    }
-}
-
-static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
-{
-    struct hlist_head *list;
-    rwlock_t *lock;
-
-    BUG_TRAP(sk_unhashed(sk));
-    if (listen_possible && sk->sk_state == TCP_LISTEN) {
-        list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
-        lock = &tcp_lhash_lock;
-        tcp_listen_wlock();
-    } else {
-        list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
-        lock = &tcp_ehash[sk->sk_hashent].lock;
-        write_lock(lock);
-    }
-    __sk_add_node(sk, list);
-    sock_prot_inc_use(sk->sk_prot);
-    write_unlock(lock);
-    if (listen_possible && sk->sk_state == TCP_LISTEN)
-        wake_up(&tcp_lhash_wait);
 }

 static void tcp_v4_hash(struct sock *sk)
 {
-    if (sk->sk_state != TCP_CLOSE) {
-        local_bh_disable();
-        __tcp_v4_hash(sk, 1);
-        local_bh_enable();
-    }
+    inet_hash(&tcp_hashinfo, sk);
 }

 void tcp_unhash(struct sock *sk)
 {
-    rwlock_t *lock;
-
-    if (sk_unhashed(sk))
-        goto ende;
-
-    if (sk->sk_state == TCP_LISTEN) {
-        local_bh_disable();
-        tcp_listen_wlock();
-        lock = &tcp_lhash_lock;
-    } else {
-        struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
-        lock = &head->lock;
-        write_lock_bh(&head->lock);
-    }
-
-    if (__sk_del_node_init(sk))
-        sock_prot_dec_use(sk->sk_prot);
-    write_unlock_bh(lock);
-
- ende:
-    if (sk->sk_state == TCP_LISTEN)
-        wake_up(&tcp_lhash_wait);
-}
+    inet_unhash(&tcp_hashinfo, sk);
-
-/* Don't inline this cruft. Here are some nice properties to
- * exploit here. The BSD API does not allow a listening TCP
- * to specify the remote port nor the remote address for the
- * connection. So always assume those are both wildcarded
- * during the search since they can never be otherwise.
- */
-static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
-                                             unsigned short hnum, int dif)
-{
-    struct sock *result = NULL, *sk;
-    struct hlist_node *node;
-    int score, hiscore;
-
-    hiscore=-1;
-    sk_for_each(sk, node, head) {
-        struct inet_sock *inet = inet_sk(sk);
-
-        if (inet->num == hnum && !ipv6_only_sock(sk)) {
-            __u32 rcv_saddr = inet->rcv_saddr;
-
-            score = (sk->sk_family == PF_INET ? 1 : 0);
-            if (rcv_saddr) {
-                if (rcv_saddr != daddr)
-                    continue;
-                score+=2;
-            }
-            if (sk->sk_bound_dev_if) {
-                if (sk->sk_bound_dev_if != dif)
-                    continue;
-                score+=2;
-            }
-            if (score == 5)
-                return sk;
-            if (score > hiscore) {
-                hiscore = score;
-                result = sk;
-            }
-        }
-    }
-    return result;
-}
-
-/* Optimize the common listener case. */
-static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
-                                                  unsigned short hnum, int dif)
-{
-    struct sock *sk = NULL;
-    struct hlist_head *head;
-
-    read_lock(&tcp_lhash_lock);
-    head = &tcp_listening_hash[tcp_lhashfn(hnum)];
-    if (!hlist_empty(head)) {
-        struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
-        if (inet->num == hnum && !sk->sk_node.next &&
-            (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-            (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-            !sk->sk_bound_dev_if)
-            goto sherry_cache;
-        sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
-    }
-    if (sk) {
-sherry_cache:
-        sock_hold(sk);
-    }
-    read_unlock(&tcp_lhash_lock);
-    return sk;
-}
-
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * Local BH must be disabled here.
- */
-
-static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
-                                                       u32 daddr, u16 hnum,
-                                                       int dif)
-{
-    struct tcp_ehash_bucket *head;
-    TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
-    __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
-    struct sock *sk;
-    struct hlist_node *node;
-    /* Optimize here for direct hit, only listening connections can
-     * have wildcards anyways.
-     */
-    int hash = tcp_hashfn(daddr, hnum, saddr, sport);
-    head = &tcp_ehash[hash];
-    read_lock(&head->lock);
-    sk_for_each(sk, node, &head->chain) {
-        if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
-            goto hit; /* You sunk my battleship! */
-    }
-
-    /* Must check for a TIME_WAIT'er before going to listener hash. */
-    sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
-        if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
-            goto hit;
-    }
-    sk = NULL;
-out:
-    read_unlock(&head->lock);
-    return sk;
-hit:
-    sock_hold(sk);
-    goto out;
-}
-
-static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
-                                           u32 daddr, u16 hnum, int dif)
-{
-    struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
-                                                  daddr, hnum, dif);
-
-    return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
-                                  u16 dport, int dif)
-{
-    struct sock *sk;
-
-    local_bh_disable();
-    sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
-    local_bh_enable();
-
-    return sk;
 }

-EXPORT_SYMBOL_GPL(tcp_v4_lookup);
-
 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 {
     return secure_tcp_sequence_number(skb->nh.iph->daddr,
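All three wrappers above now delegate to generic inet helpers parameterised by the hashinfo. A sketch of what the generalised inet_unhash() looks like - essentially the removed tcp_unhash() body with the tcp_* file-scope tables replaced by the hashinfo argument (reconstructed from the removed code above, not quoted from net/ipv4/inet_hashtables.c):

    void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
    {
        rwlock_t *lock;

        if (sk_unhashed(sk))
            goto out;

        if (sk->sk_state == TCP_LISTEN) {
            local_bh_disable();
            inet_listen_wlock(hashinfo);       /* was tcp_listen_wlock() */
            lock = &hashinfo->lhash_lock;
        } else {
            lock = &hashinfo->ehash[sk->sk_hashent].lock;
            write_lock_bh(lock);
        }

        if (__sk_del_node_init(sk))
            sock_prot_dec_use(sk->sk_prot);
        write_unlock_bh(lock);
    out:
        if (sk->sk_state == TCP_LISTEN)
            wake_up(&hashinfo->lhash_wait);
    }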
@@ -550,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)

 /* called with local bh disabled */
 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
-                                      struct tcp_tw_bucket **twp)
+                                      struct inet_timewait_sock **twp)
 {
     struct inet_sock *inet = inet_sk(sk);
     u32 daddr = inet->rcv_saddr;
     u32 saddr = inet->daddr;
     int dif = sk->sk_bound_dev_if;
-    TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
-    __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
-    int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
-    struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+    INET_ADDR_COOKIE(acookie, saddr, daddr)
+    const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+    const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
+    struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
     struct sock *sk2;
-    struct hlist_node *node;
-    struct tcp_tw_bucket *tw;
+    const struct hlist_node *node;
+    struct inet_timewait_sock *tw;

     write_lock(&head->lock);

     /* Check TIME-WAIT sockets first. */
-    sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
-        tw = (struct tcp_tw_bucket *)sk2;
+    sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+        tw = inet_twsk(sk2);

-        if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+        if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+            const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
             struct tcp_sock *tp = tcp_sk(sk);

             /* With PAWS, it is safe from the viewpoint
@@ -587,15 +160,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
                fall back to VJ's scheme and use initial
                timestamp retrieved from peer table.
              */
-            if (tw->tw_ts_recent_stamp &&
+            if (tcptw->tw_ts_recent_stamp &&
                 (!twp || (sysctl_tcp_tw_reuse &&
                           xtime.tv_sec -
-                          tw->tw_ts_recent_stamp > 1))) {
-                if ((tp->write_seq =
-                        tw->tw_snd_nxt + 65535 + 2) == 0)
+                          tcptw->tw_ts_recent_stamp > 1))) {
+                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+                if (tp->write_seq == 0)
                     tp->write_seq = 1;
-                tp->rx_opt.ts_recent = tw->tw_ts_recent;
-                tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+                tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                 sock_hold(sk2);
                 goto unique;
             } else
@@ -606,7 +179,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,

     /* And established part... */
     sk_for_each(sk2, node, &head->chain) {
-        if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+        if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
             goto not_unique;
     }

@@ -626,10 +199,10 @@ unique:
         NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
     } else if (tw) {
         /* Silly. Should hash-dance instead... */
-        tcp_tw_deschedule(tw);
+        inet_twsk_deschedule(tw, &tcp_death_row);
         NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

-        tcp_tw_put(tw);
+        inet_twsk_put(tw);
     }

     return 0;
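inet_ehashfn() is the old tcp_hashfn() (removed earlier in this file) with the table size passed in rather than read from the TCP-private global tcp_ehash_size. A sketch that simply parameterises that removed code:

    static inline int inet_ehashfn(const __u32 laddr, const __u16 lport,
                                   const __u32 faddr, const __u16 fport,
                                   const int ehash_size)
    {
        int h = (laddr ^ lport) ^ (faddr ^ fport);
        h ^= h >> 16;                 /* fold the high bits in, as before */
        h ^= h >> 8;
        return h & (ehash_size - 1);  /* ehash_size is a power of two */
    }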
@@ -652,9 +225,9 @@ static inline u32 connect_port_offset(const struct sock *sk)
  */
 static inline int tcp_v4_hash_connect(struct sock *sk)
 {
-    unsigned short snum = inet_sk(sk)->num;
-    struct tcp_bind_hashbucket *head;
-    struct tcp_bind_bucket *tb;
+    const unsigned short snum = inet_sk(sk)->num;
+    struct inet_bind_hashbucket *head;
+    struct inet_bind_bucket *tb;
     int ret;

     if (!snum) {
@@ -666,19 +239,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
         static u32 hint;
         u32 offset = hint + connect_port_offset(sk);
         struct hlist_node *node;
-        struct tcp_tw_bucket *tw = NULL;
+        struct inet_timewait_sock *tw = NULL;

         local_bh_disable();
         for (i = 1; i <= range; i++) {
             port = low + (i + offset) % range;
-            head = &tcp_bhash[tcp_bhashfn(port)];
+            head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
             spin_lock(&head->lock);

             /* Does not bother with rcv_saddr checks,
              * because the established check is already
              * unique enough.
              */
-            tb_for_each(tb, node, &head->chain) {
+            inet_bind_bucket_for_each(tb, node, &head->chain) {
                 if (tb->port == port) {
                     BUG_TRAP(!hlist_empty(&tb->owners));
                     if (tb->fastreuse >= 0)
@@ -691,7 +264,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
                 }
             }

-            tb = tcp_bucket_create(head, port);
+            tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
             if (!tb) {
                 spin_unlock(&head->lock);
                 break;
@@ -710,27 +283,27 @@ ok:
         hint += i;

         /* Head lock still held and bh's disabled */
-        tcp_bind_hash(sk, tb, port);
+        inet_bind_hash(sk, tb, port);
         if (sk_unhashed(sk)) {
             inet_sk(sk)->sport = htons(port);
-            __tcp_v4_hash(sk, 0);
+            __inet_hash(&tcp_hashinfo, sk, 0);
         }
         spin_unlock(&head->lock);

         if (tw) {
-            tcp_tw_deschedule(tw);
-            tcp_tw_put(tw);
+            inet_twsk_deschedule(tw, &tcp_death_row);;
+            inet_twsk_put(tw);
         }

         ret = 0;
         goto out;
     }

-    head = &tcp_bhash[tcp_bhashfn(snum)];
-    tb = tcp_sk(sk)->bind_hash;
+    head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+    tb = inet_csk(sk)->icsk_bind_hash;
     spin_lock_bh(&head->lock);
     if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-        __tcp_v4_hash(sk, 0);
+        __inet_hash(&tcp_hashinfo, sk, 0);
         spin_unlock_bh(&head->lock);
         return 0;
     } else {
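inet_bind_bucket_create() is likewise the old tcp_bucket_create() (removed earlier in this file) with the slab cache made a parameter, so that each hashinfo owns its own bucket cache - visible in the call above, which passes tcp_hashinfo.bind_bucket_cachep. A sketch following that removed code:

    struct inet_bind_bucket *
    inet_bind_bucket_create(kmem_cache_t *cachep,
                            struct inet_bind_hashbucket *head,
                            const unsigned short snum)
    {
        struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);

        if (tb) {
            tb->port      = snum;       /* the local port this bucket tracks */
            tb->fastreuse = 0;
            INIT_HLIST_HEAD(&tb->owners);
            hlist_add_head(&tb->node, &head->chain);
        }
        return tb;
    }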
@@ -793,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
     tp->write_seq = 0;
     }

-    if (sysctl_tcp_tw_recycle &&
+    if (tcp_death_row.sysctl_tw_recycle &&
         !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
         struct inet_peer *peer = rt_get_peer(rt);

@@ -832,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
         goto failure;

     /* OK, now commit destination to socket. */
-    __sk_dst_set(sk, &rt->u.dst);
-    tcp_v4_setup_caps(sk, &rt->u.dst);
+    sk_setup_caps(sk, &rt->u.dst);

     if (!tp->write_seq)
         tp->write_seq = secure_tcp_sequence_number(inet->saddr,
@@ -859,53 +431,6 @@ failure:
     return err;
 }

-static __inline__ int tcp_v4_iif(struct sk_buff *skb)
-{
-    return ((struct rtable *)skb->dst)->rt_iif;
-}
-
-static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
-{
-    return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
-}
-
-static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
-                                              struct request_sock ***prevp,
-                                              __u16 rport,
-                                              __u32 raddr, __u32 laddr)
-{
-    struct listen_sock *lopt = tp->accept_queue.listen_opt;
-    struct request_sock *req, **prev;
-
-    for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
-         (req = *prev) != NULL;
-         prev = &req->dl_next) {
-        const struct inet_request_sock *ireq = inet_rsk(req);
-
-        if (ireq->rmt_port == rport &&
-            ireq->rmt_addr == raddr &&
-            ireq->loc_addr == laddr &&
-            TCP_INET_FAMILY(req->rsk_ops->family)) {
-            BUG_TRAP(!req->sk);
-            *prevp = prev;
-            break;
-        }
-    }
-
-    return req;
-}
-
-static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
-{
-    struct tcp_sock *tp = tcp_sk(sk);
-    struct listen_sock *lopt = tp->accept_queue.listen_opt;
-    u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
-
-    reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
-    tcp_synq_added(sk);
-}
-
-
 /*
  * This routine does path mtu discovery as defined in RFC1191.
  */
@@ -988,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
         return;
     }

-    sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
-                       th->source, tcp_v4_iif(skb));
+    sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
+                     th->source, inet_iif(skb));
     if (!sk) {
         ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
         return;
     }
     if (sk->sk_state == TCP_TIME_WAIT) {
-        tcp_tw_put((struct tcp_tw_bucket *)sk);
+        inet_twsk_put((struct inet_timewait_sock *)sk);
         return;
     }

@@ -1049,8 +574,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
         if (sock_owned_by_user(sk))
             goto out;

-        req = tcp_v4_search_req(tp, &prev, th->dest,
-                                iph->daddr, iph->saddr);
+        req = inet_csk_search_req(sk, &prev, th->dest,
+                                  iph->daddr, iph->saddr);
         if (!req)
             goto out;

@@ -1070,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
          * created socket, and POSIX does not want network
          * errors returned from accept().
          */
-        tcp_synq_drop(sk, req, prev);
+        inet_csk_reqsk_queue_drop(sk, req, prev);
         goto out;

     case TCP_SYN_SENT:
@@ -1240,12 +765,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,

 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
-    struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+    struct inet_timewait_sock *tw = inet_twsk(sk);
+    const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

-    tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
-                    tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+    tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+                    tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);

-    tcp_tw_put(tw);
+    inet_twsk_put(tw);
 }

 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
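The split visible above - generic fields read through tw, TCP-only ones through tcptw - reflects the new two-level timewait object: a protocol-neutral inet_timewait_sock (which keeps, for example, tw_rcv_wscale) extended by a TCP part. A sketch of the layering, restricted to the fields this function touches:

    struct tcp_timewait_sock {
        struct inet_timewait_sock tw_sk;   /* generic part, must be first */
        u32  tw_rcv_nxt;
        u32  tw_snd_nxt;
        u32  tw_rcv_wnd;
        u32  tw_ts_recent;
        long tw_ts_recent_stamp;
    };

    static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
    {
        /* same cast trick as inet_csk(): the generic part comes first */
        return (struct tcp_timewait_sock *)sk;
    }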
@@ -1254,36 +780,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
                     req->ts_recent);
 }

-static struct dst_entry* tcp_v4_route_req(struct sock *sk,
-                                          struct request_sock *req)
-{
-    struct rtable *rt;
-    const struct inet_request_sock *ireq = inet_rsk(req);
-    struct ip_options *opt = inet_rsk(req)->opt;
-    struct flowi fl = { .oif = sk->sk_bound_dev_if,
-                        .nl_u = { .ip4_u =
-                                  { .daddr = ((opt && opt->srr) ?
-                                              opt->faddr :
-                                              ireq->rmt_addr),
-                                    .saddr = ireq->loc_addr,
-                                    .tos = RT_CONN_FLAGS(sk) } },
-                        .proto = IPPROTO_TCP,
-                        .uli_u = { .ports =
-                                   { .sport = inet_sk(sk)->sport,
-                                     .dport = ireq->rmt_port } } };
-
-    if (ip_route_output_flow(&rt, &fl, sk, 0)) {
-        IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
-        return NULL;
-    }
-    if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
-        ip_rt_put(rt);
-        IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
-        return NULL;
-    }
-    return &rt->u.dst;
-}
-
 /*
  * Send a SYN-ACK after having received an ACK.
  * This still operates on a request_sock only, not on a big
@@ -1297,7 +793,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
     struct sk_buff * skb;

     /* First, grab a route. */
-    if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+    if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
         goto out;

     skb = tcp_make_synack(sk, dst, req);
@@ -1399,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
      * limitations, they conserve resources and peer is
      * evidently real one.
      */
-    if (tcp_synq_is_full(sk) && !isn) {
+    if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
 #ifdef CONFIG_SYN_COOKIES
         if (sysctl_tcp_syncookies) {
             want_cookie = 1;
@@ -1413,7 +909,7 @@
      * clogging syn queue with openreqs with exponentially increasing
      * timeout.
      */
-    if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+    if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
         goto drop;

     req = reqsk_alloc(&tcp_request_sock_ops);
@@ -1469,8 +965,8 @@
          * are made in the function processing timewait state.
          */
         if (tmp_opt.saw_tstamp &&
-            sysctl_tcp_tw_recycle &&
-            (dst = tcp_v4_route_req(sk, req)) != NULL &&
+            tcp_death_row.sysctl_tw_recycle &&
+            (dst = inet_csk_route_req(sk, req)) != NULL &&
             (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
             peer->v4daddr == saddr) {
             if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
@@ -1483,7 +979,7 @@
         }
         /* Kill the following clause, if you dislike this way. */
         else if (!sysctl_tcp_syncookies &&
-                 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
+                 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                   (sysctl_max_syn_backlog >> 2)) &&
                  (!peer || !peer->tcp_ts_stamp) &&
                  (!dst || !dst_metric(dst, RTAX_RTT))) {
@@ -1494,12 +990,10 @@
              * to destinations, already remembered
              * to the moment of synflood.
              */
-            NETDEBUG(if (net_ratelimit()) \
-                    printk(KERN_DEBUG "TCP: drop open "
-                                      "request from %u.%u."
-                                      "%u.%u/%u\n", \
-                           NIPQUAD(saddr),
-                           ntohs(skb->h.th->source)));
+            LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
+                           "request from %u.%u.%u.%u/%u\n",
+                           NIPQUAD(saddr),
+                           ntohs(skb->h.th->source));
             dst_release(dst);
             goto drop_and_free;
         }
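LIMIT_NETDEBUG folds the open-coded NETDEBUG/net_ratelimit()/printk() combination into a single macro. Something along these lines (a sketch from memory, not quoted from the header):

    /* Rate-limited debug printk, replacing
     * NETDEBUG(if (net_ratelimit()) printk(...)) at both sites in
     * this file. */
    #define LIMIT_NETDEBUG(fmt, args...) \
        do { if (net_ratelimit()) printk(fmt, ##args); } while (0)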
@@ -1514,7 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
     if (want_cookie) {
         reqsk_free(req);
     } else {
-        tcp_v4_synq_add(sk, req);
+        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
     }
     return 0;

@@ -1542,15 +1036,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
     if (sk_acceptq_is_full(sk))
         goto exit_overflow;

-    if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+    if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
         goto exit;

     newsk = tcp_create_openreq_child(sk, req, skb);
     if (!newsk)
         goto exit;

-    newsk->sk_dst_cache = dst;
-    tcp_v4_setup_caps(newsk, dst);
+    sk_setup_caps(newsk, dst);

     newtp = tcp_sk(newsk);
     newinet = inet_sk(newsk);
@@ -1560,7 +1053,7 @@
     newinet->saddr = ireq->loc_addr;
     newinet->opt = ireq->opt;
     ireq->opt = NULL;
-    newinet->mc_index = tcp_v4_iif(skb);
+    newinet->mc_index = inet_iif(skb);
     newinet->mc_ttl = skb->nh.iph->ttl;
     newtp->ext_header_len = 0;
     if (newinet->opt)
@@ -1571,8 +1064,8 @@
     newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
     tcp_initialize_rcv_mss(newsk);

-    __tcp_v4_hash(newsk, 0);
-    __tcp_inherit_port(sk, newsk);
+    __inet_hash(&tcp_hashinfo, newsk, 0);
+    __inet_inherit_port(&tcp_hashinfo, sk, newsk);

     return newsk;

@@ -1588,27 +1081,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
     struct tcphdr *th = skb->h.th;
     struct iphdr *iph = skb->nh.iph;
-    struct tcp_sock *tp = tcp_sk(sk);
     struct sock *nsk;
     struct request_sock **prev;
     /* Find possible connection requests. */
-    struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
-                                                 iph->saddr, iph->daddr);
+    struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                                                    iph->saddr, iph->daddr);
     if (req)
         return tcp_check_req(sk, skb, req, prev);

-    nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
-                                      th->source,
-                                      skb->nh.iph->daddr,
-                                      ntohs(th->dest),
-                                      tcp_v4_iif(skb));
+    nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
+                                    th->source, skb->nh.iph->daddr,
+                                    ntohs(th->dest), inet_iif(skb));

     if (nsk) {
         if (nsk->sk_state != TCP_TIME_WAIT) {
             bh_lock_sock(nsk);
             return nsk;
         }
-        tcp_tw_put((struct tcp_tw_bucket *)nsk);
+        inet_twsk_put((struct inet_timewait_sock *)nsk);
         return NULL;
     }

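__inet_lookup_established() and inet_lookup_listener() are the removed __tcp_v4_lookup_established()/tcp_v4_lookup_listener() with the hash tables passed in. The combined helper keeps the shape of the removed __tcp_v4_lookup(); a sketch, assuming the argument order visible at the call sites in this file:

    static inline struct sock *__inet_lookup(struct inet_hashinfo *hashinfo,
                                             const u32 saddr, const u16 sport,
                                             const u32 daddr, const u16 hnum,
                                             const int dif)
    {
        struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport,
                                                    daddr, hnum, dif);

        /* Established (and timewait) entries first; only fall back to
         * the listening hash on a miss, exactly as before. */
        return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif);
    }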
@@ -1627,8 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
                               skb->nh.iph->daddr, skb->csum))
             return 0;

-        NETDEBUG(if (net_ratelimit())
-                printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+        LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
         skb->ip_summed = CHECKSUM_NONE;
     }
     if (skb->len <= 76) {
@@ -1744,9 +1233,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
     TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
     TCP_SKB_CB(skb)->sacked = 0;

-    sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
-                         skb->nh.iph->daddr, ntohs(th->dest),
-                         tcp_v4_iif(skb));
+    sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
+                       skb->nh.iph->daddr, ntohs(th->dest),
+                       inet_iif(skb));

     if (!sk)
         goto no_tcp_socket;
@@ -1798,24 +1287,26 @@ discard_and_relse:

 do_time_wait:
     if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-        tcp_tw_put((struct tcp_tw_bucket *) sk);
+        inet_twsk_put((struct inet_timewait_sock *) sk);
         goto discard_it;
     }

     if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
         TCP_INC_STATS_BH(TCP_MIB_INERRS);
-        tcp_tw_put((struct tcp_tw_bucket *) sk);
+        inet_twsk_put((struct inet_timewait_sock *) sk);
         goto discard_it;
     }
-    switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
-                                       skb, th, skb->len)) {
+    switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+                                       skb, th)) {
     case TCP_TW_SYN: {
-        struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
-                                                  ntohs(th->dest),
-                                                  tcp_v4_iif(skb));
+        struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
+                                                skb->nh.iph->daddr,
+                                                ntohs(th->dest),
+                                                inet_iif(skb));
         if (sk2) {
-            tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
-            tcp_tw_put((struct tcp_tw_bucket *)sk);
+            inet_twsk_deschedule((struct inet_timewait_sock *)sk,
+                                 &tcp_death_row);
+            inet_twsk_put((struct inet_timewait_sock *)sk);
             sk = sk2;
             goto process;
         }
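Timewait reaping state (the recycling timers and slots that used to be TCP file-scope globals) now travels in tcp_death_row, so inet_twsk_deschedule() has to be told which death row the socket sits on. A hypothetical caller (the function name is illustrative), mirroring the pattern used throughout this hunk:

    /* Take the timewait socket off its death row, then drop the
     * reference that the lookup took. */
    static void kill_tw(struct sock *sk)
    {
        struct inet_timewait_sock *tw = inet_twsk(sk);

        inet_twsk_deschedule(tw, &tcp_death_row);
        inet_twsk_put(tw);
    }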
@@ -1831,112 +1322,6 @@ do_time_wait:
     goto discard_it;
 }

-/* With per-bucket locks this operation is not-atomic, so that
- * this version is not worse.
- */
-static void __tcp_v4_rehash(struct sock *sk)
-{
-    sk->sk_prot->unhash(sk);
-    sk->sk_prot->hash(sk);
-}
-
-static int tcp_v4_reselect_saddr(struct sock *sk)
-{
-    struct inet_sock *inet = inet_sk(sk);
-    int err;
-    struct rtable *rt;
-    __u32 old_saddr = inet->saddr;
-    __u32 new_saddr;
-    __u32 daddr = inet->daddr;
-
-    if (inet->opt && inet->opt->srr)
-        daddr = inet->opt->faddr;
-
-    /* Query new route. */
-    err = ip_route_connect(&rt, daddr, 0,
-                           RT_CONN_FLAGS(sk),
-                           sk->sk_bound_dev_if,
-                           IPPROTO_TCP,
-                           inet->sport, inet->dport, sk);
-    if (err)
-        return err;
-
-    __sk_dst_set(sk, &rt->u.dst);
-    tcp_v4_setup_caps(sk, &rt->u.dst);
-
-    new_saddr = rt->rt_src;
-
-    if (new_saddr == old_saddr)
-        return 0;
-
-    if (sysctl_ip_dynaddr > 1) {
-        printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
-                         "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
-               NIPQUAD(old_saddr),
-               NIPQUAD(new_saddr));
-    }
-
-    inet->saddr = new_saddr;
-    inet->rcv_saddr = new_saddr;
-
-    /* XXX The only one ugly spot where we need to
-     * XXX really change the sockets identity after
-     * XXX it has entered the hashes. -DaveM
-     *
-     * Besides that, it does not check for connection
-     * uniqueness. Wait for troubles.
-     */
-    __tcp_v4_rehash(sk);
-    return 0;
-}
-
-int tcp_v4_rebuild_header(struct sock *sk)
-{
-    struct inet_sock *inet = inet_sk(sk);
-    struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
-    u32 daddr;
-    int err;
-
-    /* Route is OK, nothing to do. */
-    if (rt)
-        return 0;
-
-    /* Reroute. */
-    daddr = inet->daddr;
-    if (inet->opt && inet->opt->srr)
-        daddr = inet->opt->faddr;
-
-    {
-        struct flowi fl = { .oif = sk->sk_bound_dev_if,
-                            .nl_u = { .ip4_u =
-                                      { .daddr = daddr,
-                                        .saddr = inet->saddr,
-                                        .tos = RT_CONN_FLAGS(sk) } },
-                            .proto = IPPROTO_TCP,
-                            .uli_u = { .ports =
-                                       { .sport = inet->sport,
-                                         .dport = inet->dport } } };
-
-        err = ip_route_output_flow(&rt, &fl, sk, 0);
-    }
-    if (!err) {
-        __sk_dst_set(sk, &rt->u.dst);
-        tcp_v4_setup_caps(sk, &rt->u.dst);
-        return 0;
-    }
-
-    /* Routing failed... */
-    sk->sk_route_caps = 0;
-
-    if (!sysctl_ip_dynaddr ||
-        sk->sk_state != TCP_SYN_SENT ||
-        (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
-        (err = tcp_v4_reselect_saddr(sk)) != 0)
-        sk->sk_err_soft = -err;
-
-    return err;
-}
-
 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
 {
     struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
@@ -1985,18 +1370,18 @@ int tcp_v4_remember_stamp(struct sock *sk)
1985 return 0; 1370 return 0;
1986} 1371}
1987 1372
1988int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) 1373int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1989{ 1374{
1990 struct inet_peer *peer = NULL; 1375 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1991
1992 peer = inet_getpeer(tw->tw_daddr, 1);
1993 1376
1994 if (peer) { 1377 if (peer) {
1995 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || 1378 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1379
1380 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1996 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && 1381 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1997 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { 1382 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1998 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; 1383 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1999 peer->tcp_ts = tw->tw_ts_recent; 1384 peer->tcp_ts = tcptw->tw_ts_recent;
2000 } 1385 }
2001 inet_putpeer(peer); 1386 inet_putpeer(peer);
2002 return 1; 1387 return 1;
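
Aside: the (s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 test above is serial-number arithmetic, not a plain comparison; subtracting in 32 bits and testing the sign of the result stays correct when the timestamp counter wraps. A self-contained sketch of the difference (the values are made up):

#include <stdio.h>
#include <stdint.h>

/* "a is not newer than b", valid across 32-bit wraparound */
static int ts_before_eq(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) <= 0;
}

int main(void)
{
        uint32_t older = 0xfffffff0u;   /* just before the counter wraps */
        uint32_t newer = 0x00000010u;   /* just after it wraps */

        printf("naive older <= newer: %d\n", older <= newer);  /* 0, wrong */
        printf("ts_before_eq(older, newer): %d\n",
               ts_before_eq(older, newer));                    /* 1, right */
        return 0;
}

The same wraparound-safe idiom underlies the before()/after() sequence-number helpers used throughout the TCP code.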
@@ -2008,7 +1393,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2008struct tcp_func ipv4_specific = { 1393struct tcp_func ipv4_specific = {
2009 .queue_xmit = ip_queue_xmit, 1394 .queue_xmit = ip_queue_xmit,
2010 .send_check = tcp_v4_send_check, 1395 .send_check = tcp_v4_send_check,
2011 .rebuild_header = tcp_v4_rebuild_header, 1396 .rebuild_header = inet_sk_rebuild_header,
2012 .conn_request = tcp_v4_conn_request, 1397 .conn_request = tcp_v4_conn_request,
2013 .syn_recv_sock = tcp_v4_syn_recv_sock, 1398 .syn_recv_sock = tcp_v4_syn_recv_sock,
2014 .remember_stamp = tcp_v4_remember_stamp, 1399 .remember_stamp = tcp_v4_remember_stamp,
@@ -2024,13 +1409,14 @@ struct tcp_func ipv4_specific = {
2024 */ 1409 */
2025static int tcp_v4_init_sock(struct sock *sk) 1410static int tcp_v4_init_sock(struct sock *sk)
2026{ 1411{
1412 struct inet_connection_sock *icsk = inet_csk(sk);
2027 struct tcp_sock *tp = tcp_sk(sk); 1413 struct tcp_sock *tp = tcp_sk(sk);
2028 1414
2029 skb_queue_head_init(&tp->out_of_order_queue); 1415 skb_queue_head_init(&tp->out_of_order_queue);
2030 tcp_init_xmit_timers(sk); 1416 tcp_init_xmit_timers(sk);
2031 tcp_prequeue_init(tp); 1417 tcp_prequeue_init(tp);
2032 1418
2033 tp->rto = TCP_TIMEOUT_INIT; 1419 icsk->icsk_rto = TCP_TIMEOUT_INIT;
2034 tp->mdev = TCP_TIMEOUT_INIT; 1420 tp->mdev = TCP_TIMEOUT_INIT;
2035 1421
2036 /* So many TCP implementations out there (incorrectly) count the 1422 /* So many TCP implementations out there (incorrectly) count the
@@ -2048,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2048 tp->mss_cache = 536; 1434 tp->mss_cache = 536;
2049 1435
2050 tp->reordering = sysctl_tcp_reordering; 1436 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops; 1437 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
2052 1438
2053 sk->sk_state = TCP_CLOSE; 1439 sk->sk_state = TCP_CLOSE;
2054 1440
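
Aside: inet_csk(sk) can coexist with tcp_sk(sk) on the same pointer because struct tcp_sock begins with a struct inet_connection_sock, which itself begins with the common socket state, so one allocation is viewable at any layer. A toy model of the containment (the layouts are trimmed to two fields each for illustration; the real structs are much larger):

#include <stdio.h>

struct inet_connection_sock {
        unsigned long icsk_rto;                 /* retransmission timeout */
};

struct tcp_sock {
        struct inet_connection_sock icsk;       /* must be the first member */
        unsigned long mdev;                     /* RTT deviation estimate */
};

/* Safe: a struct and its first member share the same address in C. */
static struct inet_connection_sock *inet_csk(struct tcp_sock *tp)
{
        return (struct inet_connection_sock *)tp;
}

int main(void)
{
        struct tcp_sock tp = { .icsk.icsk_rto = 3, .mdev = 3 };

        inet_csk(&tp)->icsk_rto *= 2;
        printf("rto=%lu mdev=%lu\n", tp.icsk.icsk_rto, tp.mdev); /* rto=6 mdev=3 */
        return 0;
}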
@@ -2071,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
2071 1457
2072 tcp_clear_xmit_timers(sk); 1458 tcp_clear_xmit_timers(sk);
2073 1459
2074 tcp_cleanup_congestion_control(tp); 1460 tcp_cleanup_congestion_control(sk);
2075 1461
2076 /* Clean up the write buffer. */ 1462
2077 sk_stream_writequeue_purge(sk); 1463 sk_stream_writequeue_purge(sk);
@@ -2083,8 +1469,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2083 __skb_queue_purge(&tp->ucopy.prequeue); 1469 __skb_queue_purge(&tp->ucopy.prequeue);
2084 1470
2085 /* Clean up a referenced TCP bind bucket. */ 1471 /* Clean up a referenced TCP bind bucket. */
2086 if (tp->bind_hash) 1472 if (inet_csk(sk)->icsk_bind_hash)
2087 tcp_put_port(sk); 1473 inet_put_port(&tcp_hashinfo, sk);
2088 1474
2089 /* 1475 /*
2090 * If sendmsg cached page exists, toss it. 1476 * If sendmsg cached page exists, toss it.
@@ -2104,13 +1490,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
2104#ifdef CONFIG_PROC_FS 1490#ifdef CONFIG_PROC_FS
2105/* Proc filesystem TCP sock list dumping. */ 1491/* Proc filesystem TCP sock list dumping. */
2106 1492
2107static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) 1493static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
2108{ 1494{
2109 return hlist_empty(head) ? NULL : 1495 return hlist_empty(head) ? NULL :
2110 list_entry(head->first, struct tcp_tw_bucket, tw_node); 1496 list_entry(head->first, struct inet_timewait_sock, tw_node);
2111} 1497}
2112 1498
2113static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) 1499static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2114{ 1500{
2115 return tw->tw_node.next ? 1501 return tw->tw_node.next ?
2116 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; 1502 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
@@ -2118,14 +1504,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2118 1504
2119static void *listening_get_next(struct seq_file *seq, void *cur) 1505static void *listening_get_next(struct seq_file *seq, void *cur)
2120{ 1506{
2121 struct tcp_sock *tp; 1507 struct inet_connection_sock *icsk;
2122 struct hlist_node *node; 1508 struct hlist_node *node;
2123 struct sock *sk = cur; 1509 struct sock *sk = cur;
2124 struct tcp_iter_state* st = seq->private; 1510 struct tcp_iter_state* st = seq->private;
2125 1511
2126 if (!sk) { 1512 if (!sk) {
2127 st->bucket = 0; 1513 st->bucket = 0;
2128 sk = sk_head(&tcp_listening_hash[0]); 1514 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
2129 goto get_sk; 1515 goto get_sk;
2130 } 1516 }
2131 1517
@@ -2134,7 +1520,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2134 if (st->state == TCP_SEQ_STATE_OPENREQ) { 1520 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2135 struct request_sock *req = cur; 1521 struct request_sock *req = cur;
2136 1522
2137 tp = tcp_sk(st->syn_wait_sk); 1523 icsk = inet_csk(st->syn_wait_sk);
2138 req = req->dl_next; 1524 req = req->dl_next;
2139 while (1) { 1525 while (1) {
2140 while (req) { 1526 while (req) {
@@ -2147,17 +1533,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2147 if (++st->sbucket >= TCP_SYNQ_HSIZE) 1533 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2148 break; 1534 break;
2149get_req: 1535get_req:
2150 req = tp->accept_queue.listen_opt->syn_table[st->sbucket]; 1536 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2151 } 1537 }
2152 sk = sk_next(st->syn_wait_sk); 1538 sk = sk_next(st->syn_wait_sk);
2153 st->state = TCP_SEQ_STATE_LISTENING; 1539 st->state = TCP_SEQ_STATE_LISTENING;
2154 read_unlock_bh(&tp->accept_queue.syn_wait_lock); 1540 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2155 } else { 1541 } else {
2156 tp = tcp_sk(sk); 1542 icsk = inet_csk(sk);
2157 read_lock_bh(&tp->accept_queue.syn_wait_lock); 1543 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2158 if (reqsk_queue_len(&tp->accept_queue)) 1544 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2159 goto start_req; 1545 goto start_req;
2160 read_unlock_bh(&tp->accept_queue.syn_wait_lock); 1546 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2161 sk = sk_next(sk); 1547 sk = sk_next(sk);
2162 } 1548 }
2163get_sk: 1549get_sk:
@@ -2166,9 +1552,9 @@ get_sk:
2166 cur = sk; 1552 cur = sk;
2167 goto out; 1553 goto out;
2168 } 1554 }
2169 tp = tcp_sk(sk); 1555 icsk = inet_csk(sk);
2170 read_lock_bh(&tp->accept_queue.syn_wait_lock); 1556 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2171 if (reqsk_queue_len(&tp->accept_queue)) { 1557 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2172start_req: 1558start_req:
2173 st->uid = sock_i_uid(sk); 1559 st->uid = sock_i_uid(sk);
2174 st->syn_wait_sk = sk; 1560 st->syn_wait_sk = sk;
@@ -2176,10 +1562,10 @@ start_req:
2176 st->sbucket = 0; 1562 st->sbucket = 0;
2177 goto get_req; 1563 goto get_req;
2178 } 1564 }
2179 read_unlock_bh(&tp->accept_queue.syn_wait_lock); 1565 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2180 } 1566 }
2181 if (++st->bucket < TCP_LHTABLE_SIZE) { 1567 if (++st->bucket < INET_LHTABLE_SIZE) {
2182 sk = sk_head(&tcp_listening_hash[st->bucket]); 1568 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2183 goto get_sk; 1569 goto get_sk;
2184 } 1570 }
2185 cur = NULL; 1571 cur = NULL;
@@ -2203,16 +1589,16 @@ static void *established_get_first(struct seq_file *seq)
2203 struct tcp_iter_state* st = seq->private; 1589 struct tcp_iter_state* st = seq->private;
2204 void *rc = NULL; 1590 void *rc = NULL;
2205 1591
2206 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { 1592 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2207 struct sock *sk; 1593 struct sock *sk;
2208 struct hlist_node *node; 1594 struct hlist_node *node;
2209 struct tcp_tw_bucket *tw; 1595 struct inet_timewait_sock *tw;
2210 1596
2211 /* We can reschedule _before_ having picked the target: */ 1597 /* We can reschedule _before_ having picked the target: */
2212 cond_resched_softirq(); 1598 cond_resched_softirq();
2213 1599
2214 read_lock(&tcp_ehash[st->bucket].lock); 1600 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2215 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { 1601 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2216 if (sk->sk_family != st->family) { 1602 if (sk->sk_family != st->family) {
2217 continue; 1603 continue;
2218 } 1604 }
@@ -2220,15 +1606,15 @@ static void *established_get_first(struct seq_file *seq)
2220 goto out; 1606 goto out;
2221 } 1607 }
2222 st->state = TCP_SEQ_STATE_TIME_WAIT; 1608 st->state = TCP_SEQ_STATE_TIME_WAIT;
2223 tw_for_each(tw, node, 1609 inet_twsk_for_each(tw, node,
2224 &tcp_ehash[st->bucket + tcp_ehash_size].chain) { 1610 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
2225 if (tw->tw_family != st->family) { 1611 if (tw->tw_family != st->family) {
2226 continue; 1612 continue;
2227 } 1613 }
2228 rc = tw; 1614 rc = tw;
2229 goto out; 1615 goto out;
2230 } 1616 }
2231 read_unlock(&tcp_ehash[st->bucket].lock); 1617 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2232 st->state = TCP_SEQ_STATE_ESTABLISHED; 1618 st->state = TCP_SEQ_STATE_ESTABLISHED;
2233 } 1619 }
2234out: 1620out:
@@ -2238,7 +1624,7 @@ out:
2238static void *established_get_next(struct seq_file *seq, void *cur) 1624static void *established_get_next(struct seq_file *seq, void *cur)
2239{ 1625{
2240 struct sock *sk = cur; 1626 struct sock *sk = cur;
2241 struct tcp_tw_bucket *tw; 1627 struct inet_timewait_sock *tw;
2242 struct hlist_node *node; 1628 struct hlist_node *node;
2243 struct tcp_iter_state* st = seq->private; 1629 struct tcp_iter_state* st = seq->private;
2244 1630
@@ -2255,15 +1641,15 @@ get_tw:
2255 cur = tw; 1641 cur = tw;
2256 goto out; 1642 goto out;
2257 } 1643 }
2258 read_unlock(&tcp_ehash[st->bucket].lock); 1644 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2259 st->state = TCP_SEQ_STATE_ESTABLISHED; 1645 st->state = TCP_SEQ_STATE_ESTABLISHED;
2260 1646
2261 /* We can reschedule between buckets: */ 1647 /* We can reschedule between buckets: */
2262 cond_resched_softirq(); 1648 cond_resched_softirq();
2263 1649
2264 if (++st->bucket < tcp_ehash_size) { 1650 if (++st->bucket < tcp_hashinfo.ehash_size) {
2265 read_lock(&tcp_ehash[st->bucket].lock); 1651 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2266 sk = sk_head(&tcp_ehash[st->bucket].chain); 1652 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2267 } else { 1653 } else {
2268 cur = NULL; 1654 cur = NULL;
2269 goto out; 1655 goto out;
@@ -2277,7 +1663,7 @@ get_tw:
2277 } 1663 }
2278 1664
2279 st->state = TCP_SEQ_STATE_TIME_WAIT; 1665 st->state = TCP_SEQ_STATE_TIME_WAIT;
2280 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain); 1666 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2281 goto get_tw; 1667 goto get_tw;
2282found: 1668found:
2283 cur = sk; 1669 cur = sk;
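
Aside: the st->bucket + tcp_hashinfo.ehash_size indexing above works because the established hash table is allocated at twice its nominal size, with live connections in the first half and their TIME-WAIT shadows in the mirrored second half, one lock covering each chain pair. A sketch of the mirrored indexing (EHASH_SIZE is an assumed toy value):

#include <stdio.h>

#define EHASH_SIZE 4    /* nominal size; the real table holds 2 * EHASH_SIZE chains */

/* Chain holding the TIME-WAIT entries that mirror established chain b. */
static unsigned int tw_chain(unsigned int b)
{
        return b + EHASH_SIZE;
}

int main(void)
{
        for (unsigned int b = 0; b < EHASH_SIZE; b++)
                printf("established[%u] <-> timewait[%u]\n", b, tw_chain(b));
        return 0;
}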
@@ -2301,12 +1687,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2301 void *rc; 1687 void *rc;
2302 struct tcp_iter_state* st = seq->private; 1688 struct tcp_iter_state* st = seq->private;
2303 1689
2304 tcp_listen_lock(); 1690 inet_listen_lock(&tcp_hashinfo);
2305 st->state = TCP_SEQ_STATE_LISTENING; 1691 st->state = TCP_SEQ_STATE_LISTENING;
2306 rc = listening_get_idx(seq, &pos); 1692 rc = listening_get_idx(seq, &pos);
2307 1693
2308 if (!rc) { 1694 if (!rc) {
2309 tcp_listen_unlock(); 1695 inet_listen_unlock(&tcp_hashinfo);
2310 local_bh_disable(); 1696 local_bh_disable();
2311 st->state = TCP_SEQ_STATE_ESTABLISHED; 1697 st->state = TCP_SEQ_STATE_ESTABLISHED;
2312 rc = established_get_idx(seq, pos); 1698 rc = established_get_idx(seq, pos);
@@ -2339,7 +1725,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2339 case TCP_SEQ_STATE_LISTENING: 1725 case TCP_SEQ_STATE_LISTENING:
2340 rc = listening_get_next(seq, v); 1726 rc = listening_get_next(seq, v);
2341 if (!rc) { 1727 if (!rc) {
2342 tcp_listen_unlock(); 1728 inet_listen_unlock(&tcp_hashinfo);
2343 local_bh_disable(); 1729 local_bh_disable();
2344 st->state = TCP_SEQ_STATE_ESTABLISHED; 1730 st->state = TCP_SEQ_STATE_ESTABLISHED;
2345 rc = established_get_first(seq); 1731 rc = established_get_first(seq);
@@ -2362,17 +1748,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2362 switch (st->state) { 1748 switch (st->state) {
2363 case TCP_SEQ_STATE_OPENREQ: 1749 case TCP_SEQ_STATE_OPENREQ:
2364 if (v) { 1750 if (v) {
2365 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); 1751 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2366 read_unlock_bh(&tp->accept_queue.syn_wait_lock); 1752 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2367 } 1753 }
2368 case TCP_SEQ_STATE_LISTENING: 1754 case TCP_SEQ_STATE_LISTENING:
2369 if (v != SEQ_START_TOKEN) 1755 if (v != SEQ_START_TOKEN)
2370 tcp_listen_unlock(); 1756 inet_listen_unlock(&tcp_hashinfo);
2371 break; 1757 break;
2372 case TCP_SEQ_STATE_TIME_WAIT: 1758 case TCP_SEQ_STATE_TIME_WAIT:
2373 case TCP_SEQ_STATE_ESTABLISHED: 1759 case TCP_SEQ_STATE_ESTABLISHED:
2374 if (v) 1760 if (v)
2375 read_unlock(&tcp_ehash[st->bucket].lock); 1761 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2376 local_bh_enable(); 1762 local_bh_enable();
2377 break; 1763 break;
2378 } 1764 }
@@ -2469,18 +1855,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2469 int timer_active; 1855 int timer_active;
2470 unsigned long timer_expires; 1856 unsigned long timer_expires;
2471 struct tcp_sock *tp = tcp_sk(sp); 1857 struct tcp_sock *tp = tcp_sk(sp);
1858 const struct inet_connection_sock *icsk = inet_csk(sp);
2472 struct inet_sock *inet = inet_sk(sp); 1859 struct inet_sock *inet = inet_sk(sp);
2473 unsigned int dest = inet->daddr; 1860 unsigned int dest = inet->daddr;
2474 unsigned int src = inet->rcv_saddr; 1861 unsigned int src = inet->rcv_saddr;
2475 __u16 destp = ntohs(inet->dport); 1862 __u16 destp = ntohs(inet->dport);
2476 __u16 srcp = ntohs(inet->sport); 1863 __u16 srcp = ntohs(inet->sport);
2477 1864
2478 if (tp->pending == TCP_TIME_RETRANS) { 1865 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2479 timer_active = 1; 1866 timer_active = 1;
2480 timer_expires = tp->timeout; 1867 timer_expires = icsk->icsk_timeout;
2481 } else if (tp->pending == TCP_TIME_PROBE0) { 1868 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2482 timer_active = 4; 1869 timer_active = 4;
2483 timer_expires = tp->timeout; 1870 timer_expires = icsk->icsk_timeout;
2484 } else if (timer_pending(&sp->sk_timer)) { 1871 } else if (timer_pending(&sp->sk_timer)) {
2485 timer_active = 2; 1872 timer_active = 2;
2486 timer_expires = sp->sk_timer.expires; 1873 timer_expires = sp->sk_timer.expires;
@@ -2495,17 +1882,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2495 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, 1882 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2496 timer_active, 1883 timer_active,
2497 jiffies_to_clock_t(timer_expires - jiffies), 1884 jiffies_to_clock_t(timer_expires - jiffies),
2498 tp->retransmits, 1885 icsk->icsk_retransmits,
2499 sock_i_uid(sp), 1886 sock_i_uid(sp),
2500 tp->probes_out, 1887 icsk->icsk_probes_out,
2501 sock_i_ino(sp), 1888 sock_i_ino(sp),
2502 atomic_read(&sp->sk_refcnt), sp, 1889 atomic_read(&sp->sk_refcnt), sp,
2503 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, 1890 icsk->icsk_rto,
1891 icsk->icsk_ack.ato,
1892 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2504 tp->snd_cwnd, 1893 tp->snd_cwnd,
2505 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); 1894 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2506} 1895}
2507 1896
2508static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) 1897static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
2509{ 1898{
2510 unsigned int dest, src; 1899 unsigned int dest, src;
2511 __u16 destp, srcp; 1900 __u16 destp, srcp;
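
Aside: the timer_active codes assigned above end up in the timer field of each /proc/net/tcp line: 1 for a pending retransmit timer, 4 for a zero-window probe, 2 for the keepalive sk_timer, 0 when nothing is armed. A decoder sketch restricted to the values this function emits (other codes may appear elsewhere in the kernel):

#include <stdio.h>

static const char *timer_name(int timer_active)
{
        switch (timer_active) {
        case 1:  return "retransmit";           /* ICSK_TIME_RETRANS */
        case 4:  return "zero-window probe";    /* ICSK_TIME_PROBE0 */
        case 2:  return "keepalive";            /* sk_timer pending */
        default: return "none";
        }
}

int main(void)
{
        int codes[] = { 0, 1, 2, 4 };

        for (int i = 0; i < 4; i++)
                printf("%d -> %s\n", codes[i], timer_name(codes[i]));
        return 0;
}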
@@ -2585,7 +1974,7 @@ struct proto tcp_prot = {
2585 .close = tcp_close, 1974 .close = tcp_close,
2586 .connect = tcp_v4_connect, 1975 .connect = tcp_v4_connect,
2587 .disconnect = tcp_disconnect, 1976 .disconnect = tcp_disconnect,
2588 .accept = tcp_accept, 1977 .accept = inet_csk_accept,
2589 .ioctl = tcp_ioctl, 1978 .ioctl = tcp_ioctl,
2590 .init = tcp_v4_init_sock, 1979 .init = tcp_v4_init_sock,
2591 .destroy = tcp_v4_destroy_sock, 1980 .destroy = tcp_v4_destroy_sock,
@@ -2600,6 +1989,7 @@ struct proto tcp_prot = {
2600 .get_port = tcp_v4_get_port, 1989 .get_port = tcp_v4_get_port,
2601 .enter_memory_pressure = tcp_enter_memory_pressure, 1990 .enter_memory_pressure = tcp_enter_memory_pressure,
2602 .sockets_allocated = &tcp_sockets_allocated, 1991 .sockets_allocated = &tcp_sockets_allocated,
1992 .orphan_count = &tcp_orphan_count,
2603 .memory_allocated = &tcp_memory_allocated, 1993 .memory_allocated = &tcp_memory_allocated,
2604 .memory_pressure = &tcp_memory_pressure, 1994 .memory_pressure = &tcp_memory_pressure,
2605 .sysctl_mem = sysctl_tcp_mem, 1995 .sysctl_mem = sysctl_tcp_mem,
@@ -2607,6 +1997,7 @@ struct proto tcp_prot = {
2607 .sysctl_rmem = sysctl_tcp_rmem, 1997 .sysctl_rmem = sysctl_tcp_rmem,
2608 .max_header = MAX_TCP_HEADER, 1998 .max_header = MAX_TCP_HEADER,
2609 .obj_size = sizeof(struct tcp_sock), 1999 .obj_size = sizeof(struct tcp_sock),
2000 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2610 .rsk_prot = &tcp_request_sock_ops, 2001 .rsk_prot = &tcp_request_sock_ops,
2611}; 2002};
2612 2003
@@ -2628,19 +2019,13 @@ void __init tcp_v4_init(struct net_proto_family *ops)
2628} 2019}
2629 2020
2630EXPORT_SYMBOL(ipv4_specific); 2021EXPORT_SYMBOL(ipv4_specific);
2631EXPORT_SYMBOL(tcp_bind_hash); 2022EXPORT_SYMBOL(inet_bind_bucket_create);
2632EXPORT_SYMBOL(tcp_bucket_create);
2633EXPORT_SYMBOL(tcp_hashinfo); 2023EXPORT_SYMBOL(tcp_hashinfo);
2634EXPORT_SYMBOL(tcp_inherit_port);
2635EXPORT_SYMBOL(tcp_listen_wlock);
2636EXPORT_SYMBOL(tcp_port_rover);
2637EXPORT_SYMBOL(tcp_prot); 2024EXPORT_SYMBOL(tcp_prot);
2638EXPORT_SYMBOL(tcp_put_port);
2639EXPORT_SYMBOL(tcp_unhash); 2025EXPORT_SYMBOL(tcp_unhash);
2640EXPORT_SYMBOL(tcp_v4_conn_request); 2026EXPORT_SYMBOL(tcp_v4_conn_request);
2641EXPORT_SYMBOL(tcp_v4_connect); 2027EXPORT_SYMBOL(tcp_v4_connect);
2642EXPORT_SYMBOL(tcp_v4_do_rcv); 2028EXPORT_SYMBOL(tcp_v4_do_rcv);
2643EXPORT_SYMBOL(tcp_v4_rebuild_header);
2644EXPORT_SYMBOL(tcp_v4_remember_stamp); 2029EXPORT_SYMBOL(tcp_v4_remember_stamp);
2645EXPORT_SYMBOL(tcp_v4_send_check); 2030EXPORT_SYMBOL(tcp_v4_send_check);
2646EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 2031EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f42a284164b7..a88db28b0af7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,13 +35,27 @@
35#define SYNC_INIT 1 35#define SYNC_INIT 1
36#endif 36#endif
37 37
38int sysctl_tcp_tw_recycle;
39int sysctl_tcp_max_tw_buckets = NR_FILE*2;
40
41int sysctl_tcp_syncookies = SYNC_INIT; 38int sysctl_tcp_syncookies = SYNC_INIT;
42int sysctl_tcp_abort_on_overflow; 39int sysctl_tcp_abort_on_overflow;
43 40
44static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); 41struct inet_timewait_death_row tcp_death_row = {
42 .sysctl_max_tw_buckets = NR_FILE * 2,
43 .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
44 .death_lock = SPIN_LOCK_UNLOCKED,
45 .hashinfo = &tcp_hashinfo,
46 .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
47 (unsigned long)&tcp_death_row),
48 .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
49 inet_twdr_twkill_work,
50 &tcp_death_row),
51/* Short-time timewait calendar */
52
53 .twcal_hand = -1,
54 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
55 (unsigned long)&tcp_death_row),
56};
57
58EXPORT_SYMBOL_GPL(tcp_death_row);
45 59
46static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 60static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
47{ 61{
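
Aside: the .period initializer above fixes the slow-timer cadence at TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, so the kill wheel sweeps every slot exactly once per TIME-WAIT lifetime. Checking the arithmetic standalone (HZ and the slot count are assumed to be 1000 and 8 here):

#include <stdio.h>

#define HZ                      1000            /* assumed tick rate */
#define TCP_TIMEWAIT_LEN        (60 * HZ)       /* 60 seconds, in jiffies */
#define INET_TWDR_TWKILL_SLOTS  8

int main(void)
{
        unsigned long period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS;

        /* one slot fires every 7.5 s; 8 firings cover the 60 s lifetime */
        printf("slot period: %lu jiffies (%.1f s)\n",
               period, (double)period / HZ);
        return 0;
}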
@@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
52 return (seq == e_win && seq == end_seq); 66 return (seq == e_win && seq == end_seq);
53} 67}
54 68
55/* New-style handling of TIME_WAIT sockets. */
56
57int tcp_tw_count;
58
59
60/* Must be called with locally disabled BHs. */
61static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
62{
63 struct tcp_ehash_bucket *ehead;
64 struct tcp_bind_hashbucket *bhead;
65 struct tcp_bind_bucket *tb;
66
67 /* Unlink from established hashes. */
68 ehead = &tcp_ehash[tw->tw_hashent];
69 write_lock(&ehead->lock);
70 if (hlist_unhashed(&tw->tw_node)) {
71 write_unlock(&ehead->lock);
72 return;
73 }
74 __hlist_del(&tw->tw_node);
75 sk_node_init(&tw->tw_node);
76 write_unlock(&ehead->lock);
77
78 /* Disassociate with bind bucket. */
79 bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
80 spin_lock(&bhead->lock);
81 tb = tw->tw_tb;
82 __hlist_del(&tw->tw_bind_node);
83 tw->tw_tb = NULL;
84 tcp_bucket_destroy(tb);
85 spin_unlock(&bhead->lock);
86
87#ifdef INET_REFCNT_DEBUG
88 if (atomic_read(&tw->tw_refcnt) != 1) {
89 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
90 atomic_read(&tw->tw_refcnt));
91 }
92#endif
93 tcp_tw_put(tw);
94}
95
96/* 69/*
97 * * Main purpose of TIME-WAIT state is to close connection gracefully, 70 * * Main purpose of TIME-WAIT state is to close connection gracefully,
98 * when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN 71
@@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
122 * to avoid misread sequence numbers, states etc. --ANK 95 * to avoid misread sequence numbers, states etc. --ANK
123 */ 96 */
124enum tcp_tw_status 97enum tcp_tw_status
125tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, 98tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
126 struct tcphdr *th, unsigned len) 99 const struct tcphdr *th)
127{ 100{
101 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
128 struct tcp_options_received tmp_opt; 102 struct tcp_options_received tmp_opt;
129 int paws_reject = 0; 103 int paws_reject = 0;
130 104
131 tmp_opt.saw_tstamp = 0; 105 tmp_opt.saw_tstamp = 0;
132 if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { 106 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
133 tcp_parse_options(skb, &tmp_opt, 0); 107 tcp_parse_options(skb, &tmp_opt, 0);
134 108
135 if (tmp_opt.saw_tstamp) { 109 if (tmp_opt.saw_tstamp) {
136 tmp_opt.ts_recent = tw->tw_ts_recent; 110 tmp_opt.ts_recent = tcptw->tw_ts_recent;
137 tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; 111 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 112 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
139 } 113 }
140 } 114 }
@@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
145 /* Out of window, send ACK */ 119 /* Out of window, send ACK */
146 if (paws_reject || 120 if (paws_reject ||
147 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 121 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
148 tw->tw_rcv_nxt, 122 tcptw->tw_rcv_nxt,
149 tw->tw_rcv_nxt + tw->tw_rcv_wnd)) 123 tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
150 return TCP_TW_ACK; 124 return TCP_TW_ACK;
151 125
152 if (th->rst) 126 if (th->rst)
153 goto kill; 127 goto kill;
154 128
155 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) 129 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
156 goto kill_with_rst; 130 goto kill_with_rst;
157 131
158 /* Dup ACK? */ 132 /* Dup ACK? */
159 if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || 133 if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
160 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { 134 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
161 tcp_tw_put(tw); 135 inet_twsk_put(tw);
162 return TCP_TW_SUCCESS; 136 return TCP_TW_SUCCESS;
163 } 137 }
164 138
@@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
166 * reset. 140 * reset.
167 */ 141 */
168 if (!th->fin || 142 if (!th->fin ||
169 TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { 143 TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
170kill_with_rst: 144kill_with_rst:
171 tcp_tw_deschedule(tw); 145 inet_twsk_deschedule(tw, &tcp_death_row);
172 tcp_tw_put(tw); 146 inet_twsk_put(tw);
173 return TCP_TW_RST; 147 return TCP_TW_RST;
174 } 148 }
175 149
176 /* FIN arrived, enter true time-wait state. */ 150 /* FIN arrived, enter true time-wait state. */
177 tw->tw_substate = TCP_TIME_WAIT; 151 tw->tw_substate = TCP_TIME_WAIT;
178 tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; 152 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
179 if (tmp_opt.saw_tstamp) { 153 if (tmp_opt.saw_tstamp) {
180 tw->tw_ts_recent_stamp = xtime.tv_sec; 154 tcptw->tw_ts_recent_stamp = xtime.tv_sec;
181 tw->tw_ts_recent = tmp_opt.rcv_tsval; 155 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
182 } 156 }
183 157
184 /* I am shamed, but failed to make it more elegant. 158 /* I am shamed, but failed to make it more elegant.
@@ -187,11 +161,13 @@ kill_with_rst:
187 * do not understand recycling in any case, it is not 161
188 * a big problem in practice. --ANK */ 162 * a big problem in practice. --ANK */
189 if (tw->tw_family == AF_INET && 163 if (tw->tw_family == AF_INET &&
190 sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && 164 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
191 tcp_v4_tw_remember_stamp(tw)) 165 tcp_v4_tw_remember_stamp(tw))
192 tcp_tw_schedule(tw, tw->tw_timeout); 166 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
167 TCP_TIMEWAIT_LEN);
193 else 168 else
194 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 169 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
170 TCP_TIMEWAIT_LEN);
195 return TCP_TW_ACK; 171 return TCP_TW_ACK;
196 } 172 }
197 173
@@ -213,7 +189,7 @@ kill_with_rst:
213 */ 189 */
214 190
215 if (!paws_reject && 191 if (!paws_reject &&
216 (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && 192 (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
217 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { 193 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
218 /* In window segment, it may be only reset or bare ack. */ 194 /* In window segment, it may be only reset or bare ack. */
219 195
@@ -224,19 +200,20 @@ kill_with_rst:
224 */ 200 */
225 if (sysctl_tcp_rfc1337 == 0) { 201 if (sysctl_tcp_rfc1337 == 0) {
226kill: 202kill:
227 tcp_tw_deschedule(tw); 203 inet_twsk_deschedule(tw, &tcp_death_row);
228 tcp_tw_put(tw); 204 inet_twsk_put(tw);
229 return TCP_TW_SUCCESS; 205 return TCP_TW_SUCCESS;
230 } 206 }
231 } 207 }
232 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 208 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
209 TCP_TIMEWAIT_LEN);
233 210
234 if (tmp_opt.saw_tstamp) { 211 if (tmp_opt.saw_tstamp) {
235 tw->tw_ts_recent = tmp_opt.rcv_tsval; 212 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
236 tw->tw_ts_recent_stamp = xtime.tv_sec; 213 tcptw->tw_ts_recent_stamp = xtime.tv_sec;
237 } 214 }
238 215
239 tcp_tw_put(tw); 216 inet_twsk_put(tw);
240 return TCP_TW_SUCCESS; 217 return TCP_TW_SUCCESS;
241 } 218 }
242 219
@@ -258,9 +235,10 @@ kill:
258 */ 235 */
259 236
260 if (th->syn && !th->rst && !th->ack && !paws_reject && 237 if (th->syn && !th->rst && !th->ack && !paws_reject &&
261 (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || 238 (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
262 (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { 239 (tmp_opt.saw_tstamp &&
263 u32 isn = tw->tw_snd_nxt + 65535 + 2; 240 (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
241 u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
264 if (isn == 0) 242 if (isn == 0)
265 isn++; 243 isn++;
266 TCP_SKB_CB(skb)->when = isn; 244 TCP_SKB_CB(skb)->when = isn;
@@ -278,107 +256,57 @@ kill:
278 * Do not reschedule in the last case. 256 * Do not reschedule in the last case.
279 */ 257 */
280 if (paws_reject || th->ack) 258 if (paws_reject || th->ack)
281 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 259 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
260 TCP_TIMEWAIT_LEN);
282 261
283 /* Send ACK. Note, we do not put the bucket, 262 /* Send ACK. Note, we do not put the bucket,
284 * it will be released by caller. 263 * it will be released by caller.
285 */ 264 */
286 return TCP_TW_ACK; 265 return TCP_TW_ACK;
287 } 266 }
288 tcp_tw_put(tw); 267 inet_twsk_put(tw);
289 return TCP_TW_SUCCESS; 268 return TCP_TW_SUCCESS;
290} 269}
291 270
292/* Enter the time wait state. This is called with locally disabled BH.
293 * Essentially we whip up a timewait bucket, copy the
294 * relevant info into it from the SK, and mess with hash chains
295 * and list linkage.
296 */
297static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
298{
299 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
300 struct tcp_bind_hashbucket *bhead;
301
302 /* Step 1: Put TW into bind hash. Original socket stays there too.
303 Note that any socket with inet_sk(sk)->num != 0 MUST be bound in
304 binding cache, even if it is closed.
305 */
306 bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
307 spin_lock(&bhead->lock);
308 tw->tw_tb = tcp_sk(sk)->bind_hash;
309 BUG_TRAP(tcp_sk(sk)->bind_hash);
310 tw_add_bind_node(tw, &tw->tw_tb->owners);
311 spin_unlock(&bhead->lock);
312
313 write_lock(&ehead->lock);
314
315 /* Step 2: Remove SK from established hash. */
316 if (__sk_del_node_init(sk))
317 sock_prot_dec_use(sk->sk_prot);
318
319 /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
320 tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
321 atomic_inc(&tw->tw_refcnt);
322
323 write_unlock(&ehead->lock);
324}
325
326/* 271/*
327 * Move a socket to time-wait or dead fin-wait-2 state. 272 * Move a socket to time-wait or dead fin-wait-2 state.
328 */ 273 */
329void tcp_time_wait(struct sock *sk, int state, int timeo) 274void tcp_time_wait(struct sock *sk, int state, int timeo)
330{ 275{
331 struct tcp_tw_bucket *tw = NULL; 276 struct inet_timewait_sock *tw = NULL;
332 struct tcp_sock *tp = tcp_sk(sk); 277 const struct tcp_sock *tp = tcp_sk(sk);
333 int recycle_ok = 0; 278 int recycle_ok = 0;
334 279
335 if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) 280 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
336 recycle_ok = tp->af_specific->remember_stamp(sk); 281 recycle_ok = tp->af_specific->remember_stamp(sk);
337 282
338 if (tcp_tw_count < sysctl_tcp_max_tw_buckets) 283 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
339 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); 284 tw = inet_twsk_alloc(sk, state);
340
341 if(tw != NULL) {
342 struct inet_sock *inet = inet_sk(sk);
343 int rto = (tp->rto<<2) - (tp->rto>>1);
344
345 /* Give us an identity. */
346 tw->tw_daddr = inet->daddr;
347 tw->tw_rcv_saddr = inet->rcv_saddr;
348 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
349 tw->tw_num = inet->num;
350 tw->tw_state = TCP_TIME_WAIT;
351 tw->tw_substate = state;
352 tw->tw_sport = inet->sport;
353 tw->tw_dport = inet->dport;
354 tw->tw_family = sk->sk_family;
355 tw->tw_reuse = sk->sk_reuse;
356 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
357 atomic_set(&tw->tw_refcnt, 1);
358 285
359 tw->tw_hashent = sk->sk_hashent; 286 if (tw != NULL) {
360 tw->tw_rcv_nxt = tp->rcv_nxt; 287 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
361 tw->tw_snd_nxt = tp->snd_nxt; 288 const struct inet_connection_sock *icsk = inet_csk(sk);
362 tw->tw_rcv_wnd = tcp_receive_window(tp); 289 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
363 tw->tw_ts_recent = tp->rx_opt.ts_recent; 290
364 tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; 291 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
365 tw_dead_node_init(tw); 292 tcptw->tw_rcv_nxt = tp->rcv_nxt;
293 tcptw->tw_snd_nxt = tp->snd_nxt;
294 tcptw->tw_rcv_wnd = tcp_receive_window(tp);
295 tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
296 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
366 297
367#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 298#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
368 if (tw->tw_family == PF_INET6) { 299 if (tw->tw_family == PF_INET6) {
369 struct ipv6_pinfo *np = inet6_sk(sk); 300 struct ipv6_pinfo *np = inet6_sk(sk);
301 struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
370 302
371 ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); 303 ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
372 ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); 304 ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
373 tw->tw_v6_ipv6only = np->ipv6only; 305 tw->tw_ipv6only = np->ipv6only;
374 } else {
375 memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
376 memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
377 tw->tw_v6_ipv6only = 0;
378 } 306 }
379#endif 307#endif
380 /* Linkage updates. */ 308 /* Linkage updates. */
381 __tcp_tw_hashdance(sk, tw); 309 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
382 310
383 /* Get the TIME_WAIT timeout firing. */ 311 /* Get the TIME_WAIT timeout firing. */
384 if (timeo < rto) 312 if (timeo < rto)
@@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
392 timeo = TCP_TIMEWAIT_LEN; 320 timeo = TCP_TIMEWAIT_LEN;
393 } 321 }
394 322
395 tcp_tw_schedule(tw, timeo); 323 inet_twsk_schedule(tw, &tcp_death_row, timeo,
396 tcp_tw_put(tw); 324 TCP_TIMEWAIT_LEN);
325 inet_twsk_put(tw);
397 } else { 326 } else {
398 /* Sorry, if we're out of memory, just CLOSE this 327 /* Sorry, if we're out of memory, just CLOSE this
399 * socket up. We've got bigger problems than 328 * socket up. We've got bigger problems than
@@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
407 tcp_done(sk); 336 tcp_done(sk);
408} 337}
409 338
410/* Kill off TIME_WAIT sockets once their lifetime has expired. */
411static int tcp_tw_death_row_slot;
412
413static void tcp_twkill(unsigned long);
414
415/* TIME_WAIT reaping mechanism. */
416#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
417#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
418
419#define TCP_TWKILL_QUOTA 100
420
421static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
422static DEFINE_SPINLOCK(tw_death_lock);
423static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
424static void twkill_work(void *);
425static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
426static u32 twkill_thread_slots;
427
428/* Returns non-zero if quota exceeded. */
429static int tcp_do_twkill_work(int slot, unsigned int quota)
430{
431 struct tcp_tw_bucket *tw;
432 struct hlist_node *node;
433 unsigned int killed;
434 int ret;
435
436 /* NOTE: compare this to previous version where lock
437 * was released after detaching chain. It was racy,
438 * because tw buckets are scheduled in not serialized context
439 * because tw buckets are scheduled in a non-serialized context
440 * soft irqs are not sequenced.
441 */
442 killed = 0;
443 ret = 0;
444rescan:
445 tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
446 __tw_del_dead_node(tw);
447 spin_unlock(&tw_death_lock);
448 tcp_timewait_kill(tw);
449 tcp_tw_put(tw);
450 killed++;
451 spin_lock(&tw_death_lock);
452 if (killed > quota) {
453 ret = 1;
454 break;
455 }
456
457 /* While we dropped tw_death_lock, another cpu may have
458 * killed off the next TW bucket in the list, therefore
459 * do a fresh re-read of the hlist head node with the
460 * lock reacquired. We still use the hlist traversal
461 * macro in order to get the prefetches.
462 */
463 goto rescan;
464 }
465
466 tcp_tw_count -= killed;
467 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
468
469 return ret;
470}
471
472static void tcp_twkill(unsigned long dummy)
473{
474 int need_timer, ret;
475
476 spin_lock(&tw_death_lock);
477
478 if (tcp_tw_count == 0)
479 goto out;
480
481 need_timer = 0;
482 ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
483 if (ret) {
484 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
485 mb();
486 schedule_work(&tcp_twkill_work);
487 need_timer = 1;
488 } else {
489 /* We purged the entire slot, anything left? */
490 if (tcp_tw_count)
491 need_timer = 1;
492 }
493 tcp_tw_death_row_slot =
494 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
495 if (need_timer)
496 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
497out:
498 spin_unlock(&tw_death_lock);
499}
500
501extern void twkill_slots_invalid(void);
502
503static void twkill_work(void *dummy)
504{
505 int i;
506
507 if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
508 twkill_slots_invalid();
509
510 while (twkill_thread_slots) {
511 spin_lock_bh(&tw_death_lock);
512 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
513 if (!(twkill_thread_slots & (1 << i)))
514 continue;
515
516 while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
517 if (need_resched()) {
518 spin_unlock_bh(&tw_death_lock);
519 schedule();
520 spin_lock_bh(&tw_death_lock);
521 }
522 }
523
524 twkill_thread_slots &= ~(1 << i);
525 }
526 spin_unlock_bh(&tw_death_lock);
527 }
528}
529
530/* These are always called from BH context. See callers in
531 * tcp_input.c to verify this.
532 */
533
534/* This is for handling early-kills of TIME_WAIT sockets. */
535void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
536{
537 spin_lock(&tw_death_lock);
538 if (tw_del_dead_node(tw)) {
539 tcp_tw_put(tw);
540 if (--tcp_tw_count == 0)
541 del_timer(&tcp_tw_timer);
542 }
543 spin_unlock(&tw_death_lock);
544 tcp_timewait_kill(tw);
545}
546
547/* Short-time timewait calendar */
548
549static int tcp_twcal_hand = -1;
550static int tcp_twcal_jiffie;
551static void tcp_twcal_tick(unsigned long);
552static struct timer_list tcp_twcal_timer =
553 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
554static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
555
556static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
557{
558 struct hlist_head *list;
559 int slot;
560
561 /* timeout := RTO * 3.5
562 *
563 * 3.5 = 1+2+0.5 to wait for two retransmits.
564 *
565 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
566 * our ACK acking that FIN can be lost. If N subsequent retransmitted
567 * FINs (or previous segments) are lost (the probability of such an
568 * event is p^(N+1), where p is the probability of losing a single
569 * packet and the time to detect the loss is about RTO*(2^N - 1) with
570 * exponential backoff). The normal timewait length is calculated so
571 * that we wait at least for one retransmitted FIN (maximal RTO is 120sec).
572 * [ BTW Linux, following BSD, violates this requirement, waiting
573 * only for 60sec; we should wait at least 240 secs.
574 * Well, 240 consumes too many resources 8)
575 * ]
576 * This interval is not reduced to catch old duplicates and
577 * responses to our wandering segments living for two MSLs.
578 * However, if we use PAWS to detect
579 * old duplicates, we can reduce the interval to the bounds required
580 * by RTO, rather than MSL. So, if the peer understands PAWS, we
581 * kill the tw bucket after 3.5*RTO (it is important that this number
582 * is greater than the TS tick!) and detect old duplicates with the
583 * help of PAWS.
584 */
585 slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
586
587 spin_lock(&tw_death_lock);
588
589 /* Unlink it, if it was scheduled */
590 if (tw_del_dead_node(tw))
591 tcp_tw_count--;
592 else
593 atomic_inc(&tw->tw_refcnt);
594
595 if (slot >= TCP_TW_RECYCLE_SLOTS) {
596 /* Schedule to slow timer */
597 if (timeo >= TCP_TIMEWAIT_LEN) {
598 slot = TCP_TWKILL_SLOTS-1;
599 } else {
600 slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
601 if (slot >= TCP_TWKILL_SLOTS)
602 slot = TCP_TWKILL_SLOTS-1;
603 }
604 tw->tw_ttd = jiffies + timeo;
605 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
606 list = &tcp_tw_death_row[slot];
607 } else {
608 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
609
610 if (tcp_twcal_hand < 0) {
611 tcp_twcal_hand = 0;
612 tcp_twcal_jiffie = jiffies;
613 tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
614 add_timer(&tcp_twcal_timer);
615 } else {
616 if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
617 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
618 slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
619 }
620 list = &tcp_twcal_row[slot];
621 }
622
623 hlist_add_head(&tw->tw_death_node, list);
624
625 if (tcp_tw_count++ == 0)
626 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
627 spin_unlock(&tw_death_lock);
628}
629
630void tcp_twcal_tick(unsigned long dummy)
631{
632 int n, slot;
633 unsigned long j;
634 unsigned long now = jiffies;
635 int killed = 0;
636 int adv = 0;
637
638 spin_lock(&tw_death_lock);
639 if (tcp_twcal_hand < 0)
640 goto out;
641
642 slot = tcp_twcal_hand;
643 j = tcp_twcal_jiffie;
644
645 for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
646 if (time_before_eq(j, now)) {
647 struct hlist_node *node, *safe;
648 struct tcp_tw_bucket *tw;
649
650 tw_for_each_inmate_safe(tw, node, safe,
651 &tcp_twcal_row[slot]) {
652 __tw_del_dead_node(tw);
653 tcp_timewait_kill(tw);
654 tcp_tw_put(tw);
655 killed++;
656 }
657 } else {
658 if (!adv) {
659 adv = 1;
660 tcp_twcal_jiffie = j;
661 tcp_twcal_hand = slot;
662 }
663
664 if (!hlist_empty(&tcp_twcal_row[slot])) {
665 mod_timer(&tcp_twcal_timer, j);
666 goto out;
667 }
668 }
669 j += (1<<TCP_TW_RECYCLE_TICK);
670 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
671 }
672 tcp_twcal_hand = -1;
673
674out:
675 if ((tcp_tw_count -= killed) == 0)
676 del_timer(&tcp_tw_timer);
677 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
678 spin_unlock(&tw_death_lock);
679}
680
681/* This is not only more efficient than what we used to do, it eliminates 339/* This is not only more efficient than what we used to do, it eliminates
682 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM 340 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
683 * 341 *
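
Aside: the removed tcp_tw_schedule() lives on as inet_twsk_schedule(), and its slot computation is a ceiling division by the recycle tick, so a PAWS-capable peer's bucket is parked for roughly 3.5 * RTO rounded up to a whole calendar slot. A userspace re-derivation (TCP_TW_RECYCLE_TICK is HZ-dependent in the kernel; 6 is an assumed value):

#include <stdio.h>

#define TCP_TW_RECYCLE_TICK 6   /* assumed: slot granularity of 64 jiffies */

/* ceil(timeo / 2^TICK): round the timeout up to a whole calendar slot */
static int recycle_slot(int timeo)
{
        return (timeo + (1 << TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
}

int main(void)
{
        int rto = 200;                          /* jiffies */
        int timeo = (rto << 2) - (rto >> 1);    /* 3.5 * RTO = 700 */

        printf("timeo=%d -> slot=%d\n", timeo, recycle_slot(timeo)); /* slot=11 */
        return 0;
}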
@@ -686,75 +344,27 @@ out:
686 */ 344 */
687struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) 345struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
688{ 346{
689 /* allocate the newsk from the same slab of the master sock, 347 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
690 * if not, at sk_free time we'll try to free it from the wrong
691 * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
693 348
694 if(newsk != NULL) { 349 if (newsk != NULL) {
695 struct inet_request_sock *ireq = inet_rsk(req); 350 const struct inet_request_sock *ireq = inet_rsk(req);
696 struct tcp_request_sock *treq = tcp_rsk(req); 351 struct tcp_request_sock *treq = tcp_rsk(req);
352 struct inet_connection_sock *newicsk = inet_csk(sk);
697 struct tcp_sock *newtp; 353 struct tcp_sock *newtp;
698 struct sk_filter *filter;
699
700 memcpy(newsk, sk, sizeof(struct tcp_sock));
701 newsk->sk_state = TCP_SYN_RECV;
702
703 /* SANITY */
704 sk_node_init(&newsk->sk_node);
705 tcp_sk(newsk)->bind_hash = NULL;
706
707 /* Clone the TCP header template */
708 inet_sk(newsk)->dport = ireq->rmt_port;
709
710 sock_lock_init(newsk);
711 bh_lock_sock(newsk);
712
713 rwlock_init(&newsk->sk_dst_lock);
714 atomic_set(&newsk->sk_rmem_alloc, 0);
715 skb_queue_head_init(&newsk->sk_receive_queue);
716 atomic_set(&newsk->sk_wmem_alloc, 0);
717 skb_queue_head_init(&newsk->sk_write_queue);
718 atomic_set(&newsk->sk_omem_alloc, 0);
719 newsk->sk_wmem_queued = 0;
720 newsk->sk_forward_alloc = 0;
721
722 sock_reset_flag(newsk, SOCK_DONE);
723 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
724 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
725 newsk->sk_send_head = NULL;
726 rwlock_init(&newsk->sk_callback_lock);
727 skb_queue_head_init(&newsk->sk_error_queue);
728 newsk->sk_write_space = sk_stream_write_space;
729
730 if ((filter = newsk->sk_filter) != NULL)
731 sk_filter_charge(newsk, filter);
732
733 if (unlikely(xfrm_sk_clone_policy(newsk))) {
734 /* It is still raw copy of parent, so invalidate
735 * destructor and make plain sk_free() */
736 newsk->sk_destruct = NULL;
737 sk_free(newsk);
738 return NULL;
739 }
740 354
741 /* Now setup tcp_sock */ 355 /* Now setup tcp_sock */
742 newtp = tcp_sk(newsk); 356 newtp = tcp_sk(newsk);
743 newtp->pred_flags = 0; 357 newtp->pred_flags = 0;
744 newtp->rcv_nxt = treq->rcv_isn + 1; 358 newtp->rcv_nxt = treq->rcv_isn + 1;
745 newtp->snd_nxt = treq->snt_isn + 1; 359 newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
746 newtp->snd_una = treq->snt_isn + 1;
747 newtp->snd_sml = treq->snt_isn + 1;
748 360
749 tcp_prequeue_init(newtp); 361 tcp_prequeue_init(newtp);
750 362
751 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); 363 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
752 364
753 newtp->retransmits = 0;
754 newtp->backoff = 0;
755 newtp->srtt = 0; 365 newtp->srtt = 0;
756 newtp->mdev = TCP_TIMEOUT_INIT; 366 newtp->mdev = TCP_TIMEOUT_INIT;
757 newtp->rto = TCP_TIMEOUT_INIT; 367 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
758 368
759 newtp->packets_out = 0; 369 newtp->packets_out = 0;
760 newtp->left_out = 0; 370 newtp->left_out = 0;
@@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
774 newtp->frto_counter = 0; 384 newtp->frto_counter = 0;
775 newtp->frto_highmark = 0; 385 newtp->frto_highmark = 0;
776 386
777 newtp->ca_ops = &tcp_reno; 387 newicsk->icsk_ca_ops = &tcp_reno;
778 388
779 tcp_set_ca_state(newtp, TCP_CA_Open); 389 tcp_set_ca_state(newsk, TCP_CA_Open);
780 tcp_init_xmit_timers(newsk); 390 tcp_init_xmit_timers(newsk);
781 skb_queue_head_init(&newtp->out_of_order_queue); 391 skb_queue_head_init(&newtp->out_of_order_queue);
782 newtp->rcv_wup = treq->rcv_isn + 1; 392 newtp->rcv_wup = treq->rcv_isn + 1;
@@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
789 newtp->rx_opt.dsack = 0; 399 newtp->rx_opt.dsack = 0;
790 newtp->rx_opt.eff_sacks = 0; 400 newtp->rx_opt.eff_sacks = 0;
791 401
792 newtp->probes_out = 0;
793 newtp->rx_opt.num_sacks = 0; 402 newtp->rx_opt.num_sacks = 0;
794 newtp->urg_data = 0; 403 newtp->urg_data = 0;
795 /* Deinitialize accept_queue to trap illegal accesses. */
796 memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
797
798 /* Back to base struct sock members. */
799 newsk->sk_err = 0;
800 newsk->sk_priority = 0;
801 atomic_set(&newsk->sk_refcnt, 2);
802#ifdef INET_REFCNT_DEBUG
803 atomic_inc(&inet_sock_nr);
804#endif
805 atomic_inc(&tcp_sockets_allocated);
806 404
807 if (sock_flag(newsk, SOCK_KEEPOPEN)) 405 if (sock_flag(newsk, SOCK_KEEPOPEN))
808 tcp_reset_keepalive_timer(newsk, 406 inet_csk_reset_keepalive_timer(newsk,
809 keepalive_time_when(newtp)); 407 keepalive_time_when(newtp));
810 newsk->sk_socket = NULL;
811 newsk->sk_sleep = NULL;
812 408
813 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; 409 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
814 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { 410 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
@@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
838 newtp->tcp_header_len = sizeof(struct tcphdr); 434 newtp->tcp_header_len = sizeof(struct tcphdr);
839 } 435 }
840 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) 436 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
841 newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; 437 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
842 newtp->rx_opt.mss_clamp = req->mss; 438 newtp->rx_opt.mss_clamp = req->mss;
843 TCP_ECN_openreq_child(newtp, req); 439 TCP_ECN_openreq_child(newtp, req);
844 if (newtp->ecn_flags&TCP_ECN_OK) 440 if (newtp->ecn_flags&TCP_ECN_OK)
@@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
934 does a sequence test, SYN is truncated, and thus we consider 530
935 it a bare ACK. 531 it a bare ACK.
936 532
937 If tp->defer_accept, we silently drop this bare ACK. Otherwise, 533 If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
938 we create an established connection. Both ends (listening sockets) 534 bare ACK. Otherwise, we create an established connection. Both
939 accept the new incoming connection and try to talk to each other. 8-) 535 ends (listening sockets) accept the new incoming connection and try
536 to talk to each other. 8-)
940 537
941 Note: This case is both harmless and rare. The possibility is about the 538
942 same as us discovering intelligent life on another planet tomorrow. 539
@@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1003 return NULL; 600 return NULL;
1004 601
1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ 602 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 603 if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
604 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
1007 inet_rsk(req)->acked = 1; 605 inet_rsk(req)->acked = 1;
1008 return NULL; 606 return NULL;
1009 } 607 }
@@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1018 if (child == NULL) 616 if (child == NULL)
1019 goto listen_overflow; 617 goto listen_overflow;
1020 618
1021 tcp_synq_unlink(tp, req, prev); 619 inet_csk_reqsk_queue_unlink(sk, req, prev);
1022 tcp_synq_removed(sk, req); 620 inet_csk_reqsk_queue_removed(sk, req);
1023 621
1024 tcp_acceptq_queue(sk, req, child); 622 inet_csk_reqsk_queue_add(sk, req, child);
1025 return child; 623 return child;
1026 624
1027 listen_overflow: 625 listen_overflow:
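
Aside: the unlink/removed/add triple above is the final-ACK handoff, moving a request from the listener's SYN queue to the accept queue where accept() will find the child. A toy model of the two-queue move (the layout is simplified to a plain FIFO; the kernel's syn_table is hashed):

#include <stdio.h>
#include <stdlib.h>

struct req {
        int id;
        struct req *next;
};

struct queue {
        struct req *head, **tail;
        int len;
};

static void queue_init(struct queue *q)
{
        q->head = NULL;
        q->tail = &q->head;
        q->len = 0;
}

static void queue_add(struct queue *q, struct req *r)
{
        r->next = NULL;
        *q->tail = r;
        q->tail = &r->next;
        q->len++;
}

static struct req *queue_pop(struct queue *q)
{
        struct req *r = q->head;

        if (r != NULL) {
                q->head = r->next;
                if (q->head == NULL)
                        q->tail = &q->head;
                q->len--;
        }
        return r;
}

int main(void)
{
        struct queue synq, acceptq;
        struct req r = { .id = 7 };
        struct req *child;

        queue_init(&synq);
        queue_init(&acceptq);
        queue_add(&synq, &r);           /* SYN arrived, request queued */
        child = queue_pop(&synq);       /* final ACK: unlink from SYN queue */
        if (child != NULL)
                queue_add(&acceptq, child);
        printf("syn=%d accept=%d\n", synq.len, acceptq.len); /* syn=0 accept=1 */
        return 0;
}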
@@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1035 if (!(flg & TCP_FLAG_RST)) 633 if (!(flg & TCP_FLAG_RST))
1036 req->rsk_ops->send_reset(skb); 634 req->rsk_ops->send_reset(skb);
1037 635
1038 tcp_synq_drop(sk, req, prev); 636 inet_csk_reqsk_queue_drop(sk, req, prev);
1039 return NULL; 637 return NULL;
1040} 638}
1041 639
@@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req);
1074EXPORT_SYMBOL(tcp_child_process); 672EXPORT_SYMBOL(tcp_child_process);
1075EXPORT_SYMBOL(tcp_create_openreq_child); 673EXPORT_SYMBOL(tcp_create_openreq_child);
1076EXPORT_SYMBOL(tcp_timewait_state_process); 674EXPORT_SYMBOL(tcp_timewait_state_process);
1077EXPORT_SYMBOL(tcp_tw_deschedule);
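
Aside: the large deletion in tcp_create_openreq_child() is possible because inet_csk_clone() now centralizes the copy-the-listener-then-reinitialize dance that every protocol needs. The pattern, reduced to userspace (all names and field choices here are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sock {
        int state;
        int refcnt;
        unsigned int rcv_nxt;
};

/* Clone the listener, then reset what a child must never inherit. */
static struct sock *csk_clone(const struct sock *listener)
{
        struct sock *child = malloc(sizeof(*child));

        if (child != NULL) {
                memcpy(child, listener, sizeof(*child));
                child->state = 2;       /* e.g. SYN_RECV rather than LISTEN */
                child->refcnt = 2;      /* one for the hash, one for the caller */
        }
        return child;
}

int main(void)
{
        struct sock listener = { .state = 1, .refcnt = 1, .rcv_nxt = 0 };
        struct sock *child = csk_clone(&listener);

        if (child != NULL) {
                child->rcv_nxt = 12345 + 1;     /* protocol fixup, as newtp gets */
                printf("child state=%d refcnt=%d\n", child->state, child->refcnt);
                free(child);
        }
        return 0;
}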
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3f8ea1bfa9c..6094db5e11be 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk)
105 105
106/* RFC2861. Reset CWND after idle period longer RTO to "restart window". 106/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
107 * This is the first part of cwnd validation mechanism. */ 107 * This is the first part of cwnd validation mechanism. */
108static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) 108static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
109{ 109{
110 struct tcp_sock *tp = tcp_sk(sk);
110 s32 delta = tcp_time_stamp - tp->lsndtime; 111 s32 delta = tcp_time_stamp - tp->lsndtime;
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 112 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 113 u32 cwnd = tp->snd_cwnd;
113 114
114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART); 115 tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
115 116
116 tp->snd_ssthresh = tcp_current_ssthresh(tp); 117 tp->snd_ssthresh = tcp_current_ssthresh(sk);
117 restart_cwnd = min(restart_cwnd, cwnd); 118 restart_cwnd = min(restart_cwnd, cwnd);
118 119
119 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) 120 while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
120 cwnd >>= 1; 121 cwnd >>= 1;
121 tp->snd_cwnd = max(cwnd, restart_cwnd); 122 tp->snd_cwnd = max(cwnd, restart_cwnd);
122 tp->snd_cwnd_stamp = tcp_time_stamp; 123 tp->snd_cwnd_stamp = tcp_time_stamp;
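
Aside: only the RTO lookup changes in this hunk (tp->rto becomes inet_csk(sk)->icsk_rto); the RFC 2861 decay itself is untouched: halve cwnd once per RTO of idle time, never dropping below the restart window. The loop, re-derived standalone with made-up numbers:

#include <stdio.h>

/* Halve cwnd once per idle RTO, floored at restart_cwnd (RFC 2861). */
static unsigned int cwnd_after_idle(unsigned int cwnd,
                                    unsigned int restart_cwnd,
                                    long idle, long rto)
{
        while ((idle -= rto) > 0 && cwnd > restart_cwnd)
                cwnd >>= 1;
        return cwnd > restart_cwnd ? cwnd : restart_cwnd;
}

int main(void)
{
        /* 40-segment window, restart window of 4, idle for ~3.5 RTOs */
        printf("cwnd after idle: %u\n", cwnd_after_idle(40, 4, 700, 200)); /* 10 */
        return 0;
}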
@@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
 static inline void tcp_event_data_sent(struct tcp_sock *tp,
 				       struct sk_buff *skb, struct sock *sk)
 {
-	u32 now = tcp_time_stamp;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 now = tcp_time_stamp;
 
-	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
-		tcp_cwnd_restart(tp, __sk_dst_get(sk));
+	if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+		tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
 	tp->lsndtime = now;
 
 	/* If it is a reply for ato after last received
 	 * packet, enter pingpong mode.
 	 */
-	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
-		tp->ack.pingpong = 1;
+	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+		icsk->icsk_ack.pingpong = 1;
 }
 
 static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_dec_quickack_mode(tp, pkts);
-	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+	tcp_dec_quickack_mode(sk, pkts);
+	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
 /* Determine a window scaling and initial window to offer.
@@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	if (skb != NULL) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		struct inet_sock *inet = inet_sk(sk);
 		struct tcp_sock *tp = tcp_sk(sk);
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 #define SYSCTL_FLAG_SACK	0x4
 
 		/* If congestion control is doing timestamping */
-		if (tp->ca_ops->rtt_sample)
-			do_gettimeofday(&skb->stamp);
+		if (icsk->icsk_ca_ops->rtt_sample)
+			__net_timestamp(skb);
 
 		sysctl_flags = 0;
 		if (tcb->flags & TCPCB_FLAG_SYN) {
@@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		}
 
 		if (tcp_packets_in_flight(tp) == 0)
-			tcp_ca_event(tp, CA_EVENT_TX_START);
+			tcp_ca_event(sk, CA_EVENT_TX_START);
 
 		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 		skb->h.th = th;
@@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (err <= 0)
 			return err;
 
-		tcp_enter_cwr(tp);
+		tcp_enter_cwr(sk);
 
 		/* NET_XMIT_CN is special. It does not guarantee,
 		 * that this packet is lost. It tells that device
@@ -403,11 +404,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 	sk->sk_send_head = skb;
 }
 
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (skb->len <= tp->mss_cache ||
+	if (skb->len <= mss_now ||
 	    !(sk->sk_route_caps & NETIF_F_TSO)) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
@@ -417,10 +416,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 	} else {
 		unsigned int factor;
 
-		factor = skb->len + (tp->mss_cache - 1);
-		factor /= tp->mss_cache;
+		factor = skb->len + (mss_now - 1);
+		factor /= mss_now;
 		skb_shinfo(skb)->tso_segs = factor;
-		skb_shinfo(skb)->tso_size = tp->mss_cache;
+		skb_shinfo(skb)->tso_size = mss_now;
 	}
 }
 
@@ -429,11 +428,11 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
  * packet to the list. This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
-	int nsize;
+	int nsize, old_factor;
 	u16 flags;
 
 	nsize = skb_headlen(skb) - len;
@@ -484,30 +483,41 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	 * skbs, which it never sent before. --ANK
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
-	buff->stamp = skb->stamp;
+	buff->tstamp = skb->tstamp;
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out -= tcp_skb_pcount(skb);
 		tp->left_out -= tcp_skb_pcount(skb);
 	}
 
-	/* Fix up tso_factor for both original and new SKB. */
-	tcp_set_skb_tso_segs(sk, skb);
-	tcp_set_skb_tso_segs(sk, buff);
+	old_factor = tcp_skb_pcount(skb);
 
-	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
-		tp->lost_out += tcp_skb_pcount(skb);
-		tp->left_out += tcp_skb_pcount(skb);
-	}
+	/* Fix up tso_factor for both original and new SKB. */
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
 
-	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
-		tp->lost_out += tcp_skb_pcount(buff);
-		tp->left_out += tcp_skb_pcount(buff);
+	/* If this packet has been sent out already, we must
+	 * adjust the various packet counters.
+	 */
+	if (after(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+		int diff = old_factor - tcp_skb_pcount(skb) -
+			tcp_skb_pcount(buff);
+
+		tp->packets_out -= diff;
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+			tp->lost_out -= diff;
+			tp->left_out -= diff;
+		}
+		if (diff > 0) {
+			tp->fackets_out -= diff;
+			if ((int)tp->fackets_out < 0)
+				tp->fackets_out = 0;
+		}
 	}
 
 	/* Link BUFF into the send queue. */
 	skb_header_release(buff);
-	__skb_append(skb, buff);
+	__skb_append(skb, buff, &sk->sk_write_queue);
 
 	return 0;
 }
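
The new accounting block in tcp_fragment() exists because splitting an already-transmitted TSO skb changes how many MSS-sized segments are outstanding: each half is rounded up to whole segments separately. A toy calculation showing why diff can be negative (the numbers are illustrative only):

    /* Toy illustration of the accounting the new tcp_fragment() hunk
     * performs when an already-sent TSO skb is split.
     */
    #include <stdio.h>

    static unsigned int pcount(unsigned int len, unsigned int mss)
    {
    	return (len + mss - 1) / mss;	/* ceil(len / mss) */
    }

    int main(void)
    {
    	unsigned int mss = 1448, len = 4000, split = 1000;
    	int old_factor = pcount(len, mss);		/* 3 segments   */
    	int new_factor = pcount(split, mss)		/* 1 segment    */
    		       + pcount(len - split, mss);	/* + 3 segments */
    	int diff = old_factor - new_factor;		/* 3 - 4 = -1   */

    	/* packets_out -= diff  =>  one extra segment now in flight */
    	printf("old=%d new=%d diff=%d\n", old_factor, new_factor, diff);
    	return 0;
    }
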
@@ -569,7 +579,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	 * factor and mss.
 	 */
 	if (tcp_skb_pcount(skb) > 1)
-		tcp_set_skb_tso_segs(sk, skb);
+		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
 
 	return 0;
 }
@@ -698,7 +708,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 		if (tp->packets_out > tp->snd_cwnd_used)
 			tp->snd_cwnd_used = tp->packets_out;
 
-		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
 			tcp_cwnd_application_limited(sk);
 	}
 }
@@ -734,12 +744,14 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
 /* This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
 	int tso_segs = tcp_skb_pcount(skb);
 
-	if (!tso_segs) {
-		tcp_set_skb_tso_segs(sk, skb);
+	if (!tso_segs ||
+	    (tso_segs > 1 &&
+	     skb_shinfo(skb)->tso_size != mss_now)) {
+		tcp_set_skb_tso_segs(sk, skb, mss_now);
 		tso_segs = tcp_skb_pcount(skb);
 	}
 	return tso_segs;
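
tcp_init_tso_segs() now re-marks an skb not only when it has never been segmented, but also when a multi-segment skb was sized for a different MSS, e.g. after a path-MTU change. A minimal user-space model of that condition (fake_skb is an illustrative stand-in, not the kernel struct):

    /* Sketch of the re-segmentation condition added above: a cached
     * TSO factor is only reusable if it was computed for the current
     * MSS; otherwise the skb must be re-marked.
     */
    struct fake_skb { unsigned int len, tso_segs, tso_size; };

    static void set_tso_segs(struct fake_skb *skb, unsigned int mss_now)
    {
    	if (skb->len <= mss_now) {
    		skb->tso_segs = 1;
    		skb->tso_size = 0;
    	} else {
    		skb->tso_segs = (skb->len + mss_now - 1) / mss_now;
    		skb->tso_size = mss_now;
    	}
    }

    static unsigned int init_tso_segs(struct fake_skb *skb, unsigned int mss_now)
    {
    	/* Recompute when unset, or when a multi-segment skb was sized
    	 * for a different MSS (e.g. after a PMTU change).
    	 */
    	if (!skb->tso_segs ||
    	    (skb->tso_segs > 1 && skb->tso_size != mss_now))
    		set_tso_segs(skb, mss_now);
    	return skb->tso_segs;
    }
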
@@ -817,7 +829,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cwnd_quota;
 
-	tcp_init_tso_segs(sk, skb);
+	tcp_init_tso_segs(sk, skb, cur_mss);
 
 	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
 		return 0;
@@ -854,14 +866,15 @@ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
 	u16 flags;
 
 	/* All of a TSO frame must be composed of paged data.  */
-	BUG_ON(skb->len != skb->data_len);
+	if (skb->len != skb->data_len)
+		return tcp_fragment(sk, skb, len, mss_now);
 
 	buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
 	if (unlikely(buff == NULL))
@@ -887,12 +900,12 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
 	skb_split(skb, buff, len);
 
 	/* Fix up tso_factor for both original and new SKB. */
-	tcp_set_skb_tso_segs(sk, skb);
-	tcp_set_skb_tso_segs(sk, buff);
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
 
 	/* Link BUFF into the send queue. */
 	skb_header_release(buff);
-	__skb_append(skb, buff);
+	__skb_append(skb, buff, &sk->sk_write_queue);
 
 	return 0;
 }
@@ -904,12 +917,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
  */
 static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
 
 	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
 		return 0;
 
-	if (tp->ca_state != TCP_CA_Open)
+	if (icsk->icsk_ca_state != TCP_CA_Open)
 		return 0;
 
 	in_flight = tcp_packets_in_flight(tp);
@@ -924,10 +938,6 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
 
 	limit = min(send_win, cong_win);
 
-	/* If sk_send_head can be sent fully now, just do it.  */
-	if (skb->len <= limit)
-		return 0;
-
 	if (sysctl_tcp_tso_win_divisor) {
 		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
 
@@ -972,19 +982,20 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 	if (unlikely(sk->sk_state == TCP_CLOSE))
 		return 0;
 
-	skb = sk->sk_send_head;
-	if (unlikely(!skb))
-		return 0;
-
-	tso_segs = tcp_init_tso_segs(sk, skb);
-	cwnd_quota = tcp_cwnd_test(tp, skb);
-	if (unlikely(!cwnd_quota))
-		goto out;
-
 	sent_pkts = 0;
-	while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
+	while ((skb = sk->sk_send_head)) {
+		unsigned int limit;
+
+		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
+		cwnd_quota = tcp_cwnd_test(tp, skb);
+		if (!cwnd_quota)
+			break;
+
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+			break;
+
 		if (tso_segs == 1) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
 						     (tcp_skb_is_last(sk, skb) ?
@@ -995,9 +1006,10 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 				break;
 		}
 
+		limit = mss_now;
 		if (tso_segs > 1) {
-			u32 limit = tcp_window_allows(tp, skb,
+			limit = tcp_window_allows(tp, skb,
 						      mss_now, cwnd_quota);
 
 			if (skb->len < limit) {
 				unsigned int trim = skb->len % mss_now;
@@ -1005,15 +1017,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 				if (trim)
 					limit = skb->len - trim;
 			}
-			if (skb->len > limit) {
-				if (tso_fragment(sk, skb, limit))
-					break;
-			}
-		} else if (unlikely(skb->len > mss_now)) {
-			if (unlikely(tcp_fragment(sk, skb, mss_now)))
-				break;
 		}
 
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
+			break;
+
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
 		if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
@@ -1026,27 +1035,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 
 		tcp_minshall_update(tp, mss_now, skb);
 		sent_pkts++;
-
-		/* Do not optimize this to use tso_segs. If we chopped up
-		 * the packet above, tso_segs will no longer be valid.
-		 */
-		cwnd_quota -= tcp_skb_pcount(skb);
-
-		BUG_ON(cwnd_quota < 0);
-		if (!cwnd_quota)
-			break;
-
-		skb = sk->sk_send_head;
-		if (!skb)
-			break;
-		tso_segs = tcp_init_tso_segs(sk, skb);
 	}
 
 	if (likely(sent_pkts)) {
 		tcp_cwnd_validate(sk, tp);
 		return 0;
 	}
-out:
 	return !tp->packets_out && sk->sk_send_head;
 }
 
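
The restructured tcp_write_xmit() above re-reads the queue head and re-evaluates the congestion and receive-window quota on every iteration, because fragmenting an skb inside the loop invalidates any cached segment counts — exactly what the deleted "do not optimize this to use tso_segs" block was guarding against. A toy model of that control flow (all names are hypothetical stand-ins, not kernel APIs):

    /* Re-check the quota per queued buffer instead of hoisting the
     * checks out of the loop. Numbers are illustrative.
     */
    #include <stdio.h>

    #define QLEN 4

    static int queue[QLEN] = { 3000, 1448, 1448, 500 }; /* bytes per skb */
    static int head;
    static int cwnd_quota = 3;	/* segments we may still send */

    static int send_one(int len, int mss)
    {
    	int segs = (len + mss - 1) / mss;

    	if (segs > cwnd_quota)
    		return 0;		/* quota exhausted: stop */
    	cwnd_quota -= segs;
    	printf("sent %d bytes (%d segs), quota left %d\n",
    	       len, segs, cwnd_quota);
    	return 1;
    }

    int main(void)
    {
    	const int mss = 1448;

    	while (head < QLEN) {		/* re-read the head each pass */
    		if (!send_one(queue[head], mss))
    			break;		/* try again on the next ACK */
    		head++;
    	}
    	return 0;
    }
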
@@ -1076,15 +1070,18 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
 
 	BUG_ON(!skb || skb->len < mss_now);
 
-	tso_segs = tcp_init_tso_segs(sk, skb);
+	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
 
 	if (likely(cwnd_quota)) {
+		unsigned int limit;
+
 		BUG_ON(!tso_segs);
 
+		limit = mss_now;
 		if (tso_segs > 1) {
-			u32 limit = tcp_window_allows(tp, skb,
+			limit = tcp_window_allows(tp, skb,
 						      mss_now, cwnd_quota);
 
 			if (skb->len < limit) {
 				unsigned int trim = skb->len % mss_now;
@@ -1092,15 +1089,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
 				if (trim)
 					limit = skb->len - trim;
 			}
-			if (skb->len > limit) {
-				if (unlikely(tso_fragment(sk, skb, limit)))
-					return;
-			}
-		} else if (unlikely(skb->len > mss_now)) {
-			if (unlikely(tcp_fragment(sk, skb, mss_now)))
-				return;
 		}
 
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
+			return;
+
 		/* Send it out now. */
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
@@ -1166,6 +1160,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
  */
 u32 __tcp_select_window(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	/* MSS for the peer's data.  Previous verions used mss_clamp
 	 * here.  I don't know if the value based on our guesses
@@ -1173,7 +1168,7 @@ u32 __tcp_select_window(struct sock *sk)
 	 * but may be worse for the performance because of rcv_mss
 	 * fluctuations.  --SAW  1998/11/1
 	 */
-	int mss = tp->ack.rcv_mss;
+	int mss = icsk->icsk_ack.rcv_mss;
 	int free_space = tcp_space(sk);
 	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
 	int window;
@@ -1182,7 +1177,7 @@ u32 __tcp_select_window(struct sock *sk)
 		mss = full_space;
 
 	if (free_space < full_space/2) {
-		tp->ack.quick = 0;
+		icsk->icsk_ack.quick = 0;
 
 		if (tcp_memory_pressure)
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -1257,7 +1252,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 		       tcp_skb_pcount(next_skb) != 1);
 
 	/* Ok.	We will be able to collapse the packet. */
-	__skb_unlink(next_skb, next_skb->list);
+	__skb_unlink(next_skb, &sk->sk_write_queue);
 
 	memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
 
@@ -1305,6 +1300,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
  */
 void tcp_simple_retransmit(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	unsigned int mss = tcp_current_mss(sk, 0);
@@ -1335,12 +1331,12 @@ void tcp_simple_retransmit(struct sock *sk)
 	 * in network, but units changed and effective
 	 * cwnd/ssthresh really reduced now.
 	 */
-	if (tp->ca_state != TCP_CA_Loss) {
+	if (icsk->icsk_ca_state != TCP_CA_Loss) {
 		tp->high_seq = tp->snd_nxt;
-		tp->snd_ssthresh = tcp_current_ssthresh(tp);
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
 		tp->prior_ssthresh = 0;
 		tp->undo_marker = 0;
-		tcp_set_ca_state(tp, TCP_CA_Loss);
+		tcp_set_ca_state(sk, TCP_CA_Loss);
 	}
 	tcp_xmit_retransmit_queue(sk);
 }
@@ -1365,12 +1361,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
 		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
 			BUG();
-
-		if (sk->sk_route_caps & NETIF_F_TSO) {
-			sk->sk_route_caps &= ~NETIF_F_TSO;
-			sock_set_flag(sk, SOCK_NO_LARGESEND);
-		}
-
 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
 			return -ENOMEM;
 	}
@@ -1385,16 +1375,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		return -EAGAIN;
 
 	if (skb->len > cur_mss) {
-		int old_factor = tcp_skb_pcount(skb);
-		int new_factor;
-
-		if (tcp_fragment(sk, skb, cur_mss))
+		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
 			return -ENOMEM; /* We'll try again later. */
-
-		/* New SKB created, account for it. */
-		new_factor = tcp_skb_pcount(skb);
-		tp->packets_out -= old_factor - new_factor;
-		tp->packets_out += tcp_skb_pcount(skb->next);
 	}
 
 	/* Collapse two adjacent packets if worthwhile and we can. */
@@ -1474,6 +1456,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int packet_cnt = tp->lost_out;
@@ -1497,14 +1480,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 			if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
 				if (tcp_retransmit_skb(sk, skb))
 					return;
-				if (tp->ca_state != TCP_CA_Loss)
+				if (icsk->icsk_ca_state != TCP_CA_Loss)
 					NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
 				else
 					NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
 
 				if (skb ==
 				    skb_peek(&sk->sk_write_queue))
-					tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+					inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+								  inet_csk(sk)->icsk_rto,
+								  TCP_RTO_MAX);
 			}
 
 			packet_cnt -= tcp_skb_pcount(skb);
@@ -1517,7 +1502,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	/* OK, demanded retransmission is finished. */
 
 	/* Forward retransmissions are possible only during Recovery. */
-	if (tp->ca_state != TCP_CA_Recovery)
+	if (icsk->icsk_ca_state != TCP_CA_Recovery)
 		return;
 
 	/* No forward retransmissions in Reno are possible. */
@@ -1557,7 +1542,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 			break;
 
 		if (skb == skb_peek(&sk->sk_write_queue))
-			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  inet_csk(sk)->icsk_rto,
						  TCP_RTO_MAX);
 
 		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
 	}
@@ -1586,7 +1573,7 @@ void tcp_send_fin(struct sock *sk)
 	} else {
 		/* Socket is locked, keep trying until memory is available. */
 		for (;;) {
-			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+			skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
 			if (skb)
 				break;
 			yield();
@@ -1793,8 +1780,8 @@ static inline void tcp_connect_init(struct sock *sk)
 	tp->rcv_wup = 0;
 	tp->copied_seq = 0;
 
-	tp->rto = TCP_TIMEOUT_INIT;
-	tp->retransmits = 0;
+	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+	inet_csk(sk)->icsk_retransmits = 0;
 	tcp_clear_retrans(tp);
 }
 
@@ -1808,7 +1795,7 @@ int tcp_connect(struct sock *sk)
 
 	tcp_connect_init(sk);
 
-	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
 	if (unlikely(buff == NULL))
 		return -ENOBUFS;
 
@@ -1837,7 +1824,8 @@ int tcp_connect(struct sock *sk)
 	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the SYN until an answer. */
-	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	return 0;
 }
 
@@ -1847,20 +1835,21 @@ int tcp_connect(struct sock *sk)
  */
 void tcp_send_delayed_ack(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	int ato = tp->ack.ato;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
 	if (ato > TCP_DELACK_MIN) {
+		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ/2;
 
-		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+		if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
 			max_ato = TCP_DELACK_MAX;
 
 		/* Slow path, intersegment interval is "high". */
 
 		/* If some rtt estimate is known, use it to bound delayed ack.
-		 * Do not use tp->rto here, use results of rtt measurements
+		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
 		 * directly.
 		 */
 		if (tp->srtt) {
@@ -1877,21 +1866,22 @@ void tcp_send_delayed_ack(struct sock *sk)
 	timeout = jiffies + ato;
 
 	/* Use new timeout only if there wasn't a older one earlier. */
-	if (tp->ack.pending&TCP_ACK_TIMER) {
+	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
 		/* If delack timer was blocked or is about to expire,
 		 * send ACK now.
 		 */
-		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+		if (icsk->icsk_ack.blocked ||
+		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
 			tcp_send_ack(sk);
 			return;
 		}
 
-		if (!time_before(timeout, tp->ack.timeout))
-			timeout = tp->ack.timeout;
+		if (!time_before(timeout, icsk->icsk_ack.timeout))
+			timeout = icsk->icsk_ack.timeout;
 	}
-	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
-	tp->ack.timeout = timeout;
-	sk_reset_timer(sk, &tp->delack_timer, timeout);
+	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+	icsk->icsk_ack.timeout = timeout;
+	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
 }
 
 /* This routine sends an ack and also updates the window. */
@@ -1908,9 +1898,10 @@ void tcp_send_ack(struct sock *sk)
 	 */
 	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 	if (buff == NULL) {
-		tcp_schedule_ack(tp);
-		tp->ack.ato = TCP_ATO_MIN;
-		tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+		inet_csk_schedule_ack(sk);
+		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					  TCP_DELACK_MAX, TCP_RTO_MAX);
 		return;
 	}
 
@@ -1991,16 +1982,10 @@ int tcp_write_wakeup(struct sock *sk)
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-			if (tcp_fragment(sk, skb, seg_size))
+			if (tcp_fragment(sk, skb, seg_size, mss))
 				return -1;
-			/* SWS override triggered forced fragmentation.
-			 * Disable TSO, the connection is too sick. */
-			if (sk->sk_route_caps & NETIF_F_TSO) {
-				sock_set_flag(sk, SOCK_NO_LARGESEND);
-				sk->sk_route_caps &= ~NETIF_F_TSO;
-			}
 		} else if (!tcp_skb_pcount(skb))
-			tcp_set_skb_tso_segs(sk, skb);
+			tcp_set_skb_tso_segs(sk, skb, mss);
 
 		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2024,6 +2009,7 @@ int tcp_write_wakeup(struct sock *sk)
  */
 void tcp_send_probe0(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int err;
 
@@ -2031,28 +2017,31 @@ void tcp_send_probe0(struct sock *sk)
 
 	if (tp->packets_out || !sk->sk_send_head) {
 		/* Cancel probe timer, if it is not required. */
-		tp->probes_out = 0;
-		tp->backoff = 0;
+		icsk->icsk_probes_out = 0;
+		icsk->icsk_backoff = 0;
 		return;
 	}
 
 	if (err <= 0) {
-		if (tp->backoff < sysctl_tcp_retries2)
-			tp->backoff++;
-		tp->probes_out++;
-		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
-				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
+		if (icsk->icsk_backoff < sysctl_tcp_retries2)
+			icsk->icsk_backoff++;
+		icsk->icsk_probes_out++;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
 	} else {
 		/* If packet was not sent due to local congestion,
-		 * do not backoff and do not remember probes_out.
+		 * do not backoff and do not remember icsk_probes_out.
		 * Let local senders to fight for local resources.
 		 *
 		 * Use accumulated backoff yet.
 		 */
-		if (!tp->probes_out)
-			tp->probes_out=1;
-		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
-				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+		if (!icsk->icsk_probes_out)
+			icsk->icsk_probes_out = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  min(icsk->icsk_rto << icsk->icsk_backoff,
+					      TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
 	}
 }
 
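
tcp_send_probe0() keeps its exponential backoff; only the state moved into inet_connection_sock and the timer call gained an explicit maximum. The schedule it produces, sketched stand-alone (the time unit and clamp value here are illustrative assumptions, not the kernel's jiffies math):

    /* Sketch of the zero-window probe backoff used above: each failed
     * probe doubles the wait, clamped to the maximum RTO.
     */
    #include <stdio.h>

    #define RTO_MAX_MS (120 * 1000)	/* 120 s, illustrative */

    int main(void)
    {
    	unsigned int rto = 200;		/* 200 ms base RTO */
    	unsigned int backoff;

    	for (backoff = 0; backoff < 12; backoff++) {
    		unsigned int when = rto << backoff;

    		if (when > RTO_MAX_MS)
    			when = RTO_MAX_MS;
    		printf("backoff=%2u -> next probe in %u ms\n", backoff, when);
    	}
    	return 0;
    }
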
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 70e108e15c71..327770bf5522 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -16,9 +16,10 @@
 #define TCP_SCALABLE_AI_CNT	 50U
 #define TCP_SCALABLE_MD_SCALE	3
 
-static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 				    u32 in_flight, int flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	if (in_flight < tp->snd_cwnd)
 		return;
 
@@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
+static u32 tcp_scalable_ssthresh(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
 }
 
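
The Scalable TCP hunks change only the callback signatures to take struct sock; the algorithm is untouched. Its multiplicative decrease, restated on its own: back off by cwnd/8 (MD_SCALE = 3), floored at two segments:

    /* Standalone restatement of the Scalable TCP ssthresh rule above. */
    static unsigned int scalable_ssthresh(unsigned int snd_cwnd)
    {
    	unsigned int reduced = snd_cwnd - (snd_cwnd >> 3);	/* -12.5% */

    	return reduced > 2 ? reduced : 2;
    }

So a 100-segment window backs off to 88 rather than Reno's 50, which is what makes the algorithm "scalable" at large windows.
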
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0084227438c2..415ee47ac1c5 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long);
 static void tcp_delack_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
 
-#ifdef TCP_DEBUG
-const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
-EXPORT_SYMBOL(tcp_timer_bug_msg);
-#endif
-
-/*
- * Using different timers for retransmit, delayed acks and probes
- * We may wish use just one timer maintaining a list of expire jiffies
- * to optimize.
- */
-
 void tcp_init_xmit_timers(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	init_timer(&tp->retransmit_timer);
-	tp->retransmit_timer.function=&tcp_write_timer;
-	tp->retransmit_timer.data = (unsigned long) sk;
-	tp->pending = 0;
-
-	init_timer(&tp->delack_timer);
-	tp->delack_timer.function=&tcp_delack_timer;
-	tp->delack_timer.data = (unsigned long) sk;
-	tp->ack.pending = 0;
-
-	init_timer(&sk->sk_timer);
-	sk->sk_timer.function = &tcp_keepalive_timer;
-	sk->sk_timer.data = (unsigned long)sk;
+	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+				  &tcp_keepalive_timer);
 }
 
-void tcp_clear_xmit_timers(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->pending = 0;
-	sk_stop_timer(sk, &tp->retransmit_timer);
-
-	tp->ack.pending = 0;
-	tp->ack.blocked = 0;
-	sk_stop_timer(sk, &tp->delack_timer);
-
-	sk_stop_timer(sk, &sk->sk_timer);
-}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
 
 static void tcp_write_err(struct sock *sk)
 {
@@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
 /* A write timeout has occurred. Process the after effects. */
 static int tcp_write_timeout(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	int retry_until;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
-		if (tp->retransmits)
+		if (icsk->icsk_retransmits)
 			dst_negative_advice(&sk->sk_dst_cache);
-		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
-		if (tp->retransmits >= sysctl_tcp_retries1) {
+		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
 			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
 			   hole detection. :-(
 
@@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk)
 
 		retry_until = sysctl_tcp_retries2;
 		if (sock_flag(sk, SOCK_DEAD)) {
-			int alive = (tp->rto < TCP_RTO_MAX);
+			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
 
 			retry_until = tcp_orphan_retries(sk, alive);
 
-			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
 				return 1;
 		}
 	}
 
-	if (tp->retransmits >= retry_until) {
+	if (icsk->icsk_retransmits >= retry_until) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
@@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later. */
-		tp->ack.blocked = 1;
+		icsk->icsk_ack.blocked = 1;
 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
-		sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
 		goto out_unlock;
 	}
 
 	sk_stream_mem_reclaim(sk);
 
-	if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
+	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
 		goto out;
 
-	if (time_after(tp->ack.timeout, jiffies)) {
-		sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
+	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
 		goto out;
 	}
-	tp->ack.pending &= ~TCP_ACK_TIMER;
+	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
 	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 		struct sk_buff *skb;
@@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data)
 		tp->ucopy.memory = 0;
 	}
 
-	if (tcp_ack_scheduled(tp)) {
-		if (!tp->ack.pingpong) {
+	if (inet_csk_ack_scheduled(sk)) {
+		if (!icsk->icsk_ack.pingpong) {
 			/* Delayed ACK missed: inflate ATO. */
-			tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
+			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
 		} else {
 			/* Delayed ACK missed: leave pingpong mode and
 			 * deflate ATO.
 			 */
-			tp->ack.pingpong = 0;
-			tp->ack.ato = TCP_ATO_MIN;
+			icsk->icsk_ack.pingpong = 0;
+			icsk->icsk_ack.ato = TCP_ATO_MIN;
 		}
 		tcp_send_ack(sk);
 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
@@ -268,11 +233,12 @@ out_unlock:
 
 static void tcp_probe_timer(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
 
 	if (tp->packets_out || !sk->sk_send_head) {
-		tp->probes_out = 0;
+		icsk->icsk_probes_out = 0;
 		return;
 	}
 
@@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk)
 	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
 	 * this behaviour in Solaris down as a bug fix. [AC]
 	 *
-	 * Let me to explain. probes_out is zeroed by incoming ACKs
+	 * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
 	 * even if they advertise zero window. Hence, connection is killed only
 	 * if we received no ACKs for normal connection timeout. It is not killed
 	 * only because window stays zero for some time, window may be zero
@@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk)
 	max_probes = sysctl_tcp_retries2;
 
 	if (sock_flag(sk, SOCK_DEAD)) {
-		int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
 
 		max_probes = tcp_orphan_retries(sk, alive);
 
-		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+		if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
 			return;
 	}
 
-	if (tp->probes_out > max_probes) {
+	if (icsk->icsk_probes_out > max_probes) {
 		tcp_write_err(sk);
 	} else {
 		/* Only send another probe if we didn't close things up. */
@@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk)
 static void tcp_retransmit_timer(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (!tp->packets_out)
 		goto out;
@@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk)
 	if (tcp_write_timeout(sk))
 		goto out;
 
-	if (tp->retransmits == 0) {
-		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+	if (icsk->icsk_retransmits == 0) {
+		if (icsk->icsk_ca_state == TCP_CA_Disorder ||
+		    icsk->icsk_ca_state == TCP_CA_Recovery) {
 			if (tp->rx_opt.sack_ok) {
-				if (tp->ca_state == TCP_CA_Recovery)
+				if (icsk->icsk_ca_state == TCP_CA_Recovery)
 					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
 				else
 					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
 			} else {
-				if (tp->ca_state == TCP_CA_Recovery)
+				if (icsk->icsk_ca_state == TCP_CA_Recovery)
 					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
 				else
 					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
 			}
-		} else if (tp->ca_state == TCP_CA_Loss) {
+		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
 			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
 		} else {
 			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
@@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk)
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */
-		if (!tp->retransmits)
-			tp->retransmits=1;
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
-				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+		if (!icsk->icsk_retransmits)
+			icsk->icsk_retransmits = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
 		goto out;
 	}
 
@@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk)
 	 * implemented ftp to mars will work nicely. We will have to fix
 	 * the 120 second clamps though!
 	 */
-	tp->backoff++;
-	tp->retransmits++;
+	icsk->icsk_backoff++;
+	icsk->icsk_retransmits++;
 
 out_reset_timer:
-	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
-	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
-	if (tp->retransmits > sysctl_tcp_retries1)
+	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
 		__sk_dst_reset(sk);
 
 out:;
@@ -418,32 +387,32 @@ out:;
 static void tcp_write_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
-	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int event;
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later */
-		sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
 		goto out_unlock;
 	}
 
-	if (sk->sk_state == TCP_CLOSE || !tp->pending)
+	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
 		goto out;
 
-	if (time_after(tp->timeout, jiffies)) {
-		sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
+	if (time_after(icsk->icsk_timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
 		goto out;
 	}
 
-	event = tp->pending;
-	tp->pending = 0;
+	event = icsk->icsk_pending;
+	icsk->icsk_pending = 0;
 
 	switch (event) {
-	case TCP_TIME_RETRANS:
+	case ICSK_TIME_RETRANS:
 		tcp_retransmit_timer(sk);
 		break;
-	case TCP_TIME_PROBE0:
+	case ICSK_TIME_PROBE0:
 		tcp_probe_timer(sk);
 		break;
 	}
@@ -462,96 +431,8 @@ out_unlock:
 
 static void tcp_synack_timer(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
-	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
-	int thresh = max_retries;
-	unsigned long now = jiffies;
-	struct request_sock **reqp, *req;
-	int i, budget;
-
-	if (lopt == NULL || lopt->qlen == 0)
-		return;
-
-	/* Normally all the openreqs are young and become mature
-	 * (i.e. converted to established socket) for first timeout.
-	 * If synack was not acknowledged for 3 seconds, it means
-	 * one of the following things: synack was lost, ack was lost,
-	 * rtt is high or nobody planned to ack (i.e. synflood).
-	 * When server is a bit loaded, queue is populated with old
-	 * open requests, reducing effective size of queue.
-	 * When server is well loaded, queue size reduces to zero
-	 * after several minutes of work. It is not synflood,
-	 * it is normal operation. The solution is pruning
-	 * too old entries overriding normal timeout, when
-	 * situation becomes dangerous.
-	 *
-	 * Essentially, we reserve half of room for young
-	 * embrions; and abort old ones without pity, if old
-	 * ones are about to clog our table.
-	 */
-	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
-		int young = (lopt->qlen_young<<1);
-
-		while (thresh > 2) {
-			if (lopt->qlen < young)
-				break;
-			thresh--;
-			young <<= 1;
-		}
-	}
-
-	if (tp->defer_accept)
-		max_retries = tp->defer_accept;
-
-	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
-	i = lopt->clock_hand;
-
-	do {
-		reqp=&lopt->syn_table[i];
-		while ((req = *reqp) != NULL) {
-			if (time_after_eq(now, req->expires)) {
-				if ((req->retrans < thresh ||
-				     (inet_rsk(req)->acked && req->retrans < max_retries))
-				    && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
-					unsigned long timeo;
-
-					if (req->retrans++ == 0)
-						lopt->qlen_young--;
-					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
-						    TCP_RTO_MAX);
-					req->expires = now + timeo;
-					reqp = &req->dl_next;
-					continue;
-				}
-
-				/* Drop this request */
-				tcp_synq_unlink(tp, req, reqp);
-				reqsk_queue_removed(&tp->accept_queue, req);
-				reqsk_free(req);
-				continue;
-			}
-			reqp = &req->dl_next;
-		}
-
-		i = (i+1)&(TCP_SYNQ_HSIZE-1);
-
-	} while (--budget > 0);
-
-	lopt->clock_hand = i;
-
-	if (lopt->qlen)
-		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
-}
-
-void tcp_delete_keepalive_timer (struct sock *sk)
-{
-	sk_stop_timer(sk, &sk->sk_timer);
-}
-
-void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
-{
-	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
+				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 }
 
 void tcp_set_keepalive(struct sock *sk, int val)
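
The entire SYN-ACK pruning loop deleted above moved, essentially verbatim, into the generic inet_csk_reqsk_queue_prune(). Its core heuristic is worth restating: once the queue is over half full, lower the retransmit threshold until young (never-retransmitted) entries outnumber old ones, so half the table stays reserved for fresh connection attempts. A direct sketch of just that threshold computation, lifted out of the deleted code:

    /* Threshold heuristic from the removed tcp_synack_timer() body:
     * standalone restatement, not a kernel API.
     */
    static int prune_thresh(int qlen, int qlen_young, int max_qlen_log,
    			int max_retries)
    {
    	int thresh = max_retries;

    	if (qlen >> (max_qlen_log - 1)) {	/* over half full */
    		int young = qlen_young << 1;

    		while (thresh > 2) {
    			if (qlen < young)
    				break;
    			thresh--;	/* be harsher on old entries */
    			young <<= 1;
    		}
    	}
    	return thresh;
    }
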
@@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val)
 		return;
 
 	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
-		tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
 	else if (!val)
-		tcp_delete_keepalive_timer(sk);
+		inet_csk_delete_keepalive_timer(sk);
 }
 
 
 static void tcp_keepalive_timer (unsigned long data)
 {
 	struct sock *sk = (struct sock *) data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 elapsed;
 
@@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later. */
-		tcp_reset_keepalive_timer (sk, HZ/20);
+		inet_csk_reset_keepalive_timer (sk, HZ/20);
 		goto out;
 	}
 
@@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data)
 
 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 		if (tp->linger2 >= 0) {
-			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
 
 			if (tmo > 0) {
 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data)
 	elapsed = tcp_time_stamp - tp->rcv_tstamp;
 
 	if (elapsed >= keepalive_time_when(tp)) {
-		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
-		    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+		if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
+		    (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			tcp_write_err(sk);
 			goto out;
 		}
 		if (tcp_write_wakeup(sk) <= 0) {
-			tp->probes_out++;
+			icsk->icsk_probes_out++;
 			elapsed = keepalive_intvl_when(tp);
 		} else {
 			/* If keepalive was lost due to local congestion,
@@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	sk_stream_mem_reclaim(sk);
 
 resched:
-	tcp_reset_keepalive_timer (sk, elapsed);
+	inet_csk_reset_keepalive_timer (sk, elapsed);
 	goto out;
 
 death:
@@ -644,8 +526,3 @@ out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
-
-EXPORT_SYMBOL(tcp_clear_xmit_timers);
-EXPORT_SYMBOL(tcp_delete_keepalive_timer);
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-EXPORT_SYMBOL(tcp_reset_keepalive_timer);
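
With tcp_init_xmit_timers() reduced to a call into inet_csk_init_xmit_timers(), the retransmit and zero-window-probe paths share one timer and one handler that dispatches on a pending-event field, as tcp_write_timer() above shows. A compact model of that dispatch pattern (names are illustrative, not kernel symbols):

    /* One timer, two meanings: the pending field records which event
     * the timer was armed for, and the handler consumes it.
     */
    #include <stdio.h>

    enum { TIME_NONE, TIME_RETRANS, TIME_PROBE0 };

    static void on_write_timer(int *pending)
    {
    	int event = *pending;

    	*pending = TIME_NONE;		/* consume the event */
    	switch (event) {
    	case TIME_RETRANS:
    		printf("retransmit path\n");
    		break;
    	case TIME_PROBE0:
    		printf("zero-window probe path\n");
    		break;
    	}
    }

    int main(void)
    {
    	int pending = TIME_RETRANS;

    	on_write_timer(&pending);
    	pending = TIME_PROBE0;
    	on_write_timer(&pending);
    	return 0;
    }
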
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9bd443db5193..93c5f92070f9 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -35,7 +35,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
 
 #include <net/tcp.h>
 
@@ -82,9 +82,10 @@ struct vegas {
  * Instead we must wait until the completion of an RTT during
  * which we actually receive ACKs.
  */
-static inline void vegas_enable(struct tcp_sock *tp)
+static inline void vegas_enable(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	/* Begin taking Vegas samples next time we send something. */
 	vegas->doing_vegas_now = 1;
@@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp)
 }
 
 /* Stop taking Vegas samples for now. */
-static inline void vegas_disable(struct tcp_sock *tp)
+static inline void vegas_disable(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	vegas->doing_vegas_now = 0;
 }
 
-static void tcp_vegas_init(struct tcp_sock *tp)
+static void tcp_vegas_init(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	vegas->baseRTT = 0x7fffffff;
-	vegas_enable(tp);
+	vegas_enable(sk);
 }
 
 /* Do RTT sampling needed for Vegas.
@@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp)
  *   o min-filter RTT samples from a much longer window (forever for now)
  *     to find the propagation delay (baseRTT)
  */
-static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
+static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 	u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
 
 	/* Filter to find propagation delay: */
@@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
 	vegas->cntRTT++;
 }
 
-static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
+static void tcp_vegas_state(struct sock *sk, u8 ca_state)
 {
 
 	if (ca_state == TCP_CA_Open)
-		vegas_enable(tp);
+		vegas_enable(sk);
 	else
-		vegas_disable(tp);
+		vegas_disable(sk);
 }
 
 /*
@@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
154 * packets, _then_ we can make Vegas calculations 155 * packets, _then_ we can make Vegas calculations
155 * again. 156 * again.
156 */ 157 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) 158static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
158{ 159{
159 if (event == CA_EVENT_CWND_RESTART || 160 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START) 161 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp); 162 tcp_vegas_init(sk);
162} 163}
163 164
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, 165static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag) 166 u32 seq_rtt, u32 in_flight, int flag)
166{ 167{
167 struct vegas *vegas = tcp_ca(tp); 168 struct tcp_sock *tp = tcp_sk(sk);
169 struct vegas *vegas = inet_csk_ca(sk);
168 170
169 if (!vegas->doing_vegas_now) 171 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); 172 return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
171 173
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt. 174 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 * 175 *
@@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
219 * but that's not too awful, since we're taking the min, 221 * but that's not too awful, since we're taking the min,
220 * rather than averaging. 222 * rather than averaging.
221 */ 223 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000); 224 tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
223 225
224 /* We do the Vegas calculations only if we got enough RTT 226 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got 227 * samples that we can be reasonably sure that we got
@@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
359} 361}
360 362
361/* Extract info for Tcp socket info provided via netlink. */ 363/* Extract info for Tcp socket info provided via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, 364static void tcp_vegas_get_info(struct sock *sk, u32 ext,
363 struct sk_buff *skb) 365 struct sk_buff *skb)
364{ 366{
365 const struct vegas *ca = tcp_ca(tp); 367 const struct vegas *ca = inet_csk_ca(sk);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { 368 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
367 struct tcpvegas_info *info; 369 struct tcpvegas_info *info;
368 370
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, 371 info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
370 sizeof(*info))); 372 sizeof(*info)));
371 373
372 info->tcpv_enabled = ca->doing_vegas_now; 374 info->tcpv_enabled = ca->doing_vegas_now;
@@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = {
393 395
394static int __init tcp_vegas_register(void) 396static int __init tcp_vegas_register(void)
395{ 397{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); 398 BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas); 399 tcp_register_congestion_control(&tcp_vegas);
398 return 0; 400 return 0;
399} 401}
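
The hunks above switch every Vegas callback from taking a struct tcp_sock * to the generic struct sock *, with per-connection algorithm state now reached through inet_csk_ca() instead of tcp_ca() and bounded by ICSK_CA_PRIV_SIZE at registration time. A minimal user-space model of that convention follows; every name in it (struct conn, conn_ca, ca_init and so on) is a hypothetical stand-in for illustration, not kernel API:

    #include <stdio.h>

    /* Model of the conversion above: callbacks take the generic
     * connection object, and per-algorithm state lives in a fixed-size
     * private area inside it, as inet_csk_ca() provides. */
    #define CA_PRIV_SIZE 16

    struct conn {                        /* stands in for struct sock */
        unsigned int snd_una;            /* a "TCP view" field */
        unsigned char ca_priv[CA_PRIV_SIZE];
    };

    static void *conn_ca(struct conn *c) { return c->ca_priv; }

    struct vegas_like { int doing_now; unsigned int base_rtt; };

    static void ca_init(struct conn *c)  /* was: init(struct tcp_sock *) */
    {
        struct vegas_like *ca = conn_ca(c);

        ca->base_rtt = 0x7fffffff;
        ca->doing_now = 1;
    }

    int main(void)
    {
        /* The fixed private area bounds algorithm state, hence the
         * BUG_ON(sizeof(...) > ICSK_CA_PRIV_SIZE) checks at register
         * time in the hunks above. */
        _Static_assert(sizeof(struct vegas_like) <= CA_PRIV_SIZE, "fits");

        struct conn c = { .snd_una = 1 };
        ca_init(&c);
        printf("baseRTT=0x%x\n",
               ((struct vegas_like *)conn_ca(&c))->base_rtt);
        return 0;
    }

Keeping the private area inside the generic connection object is what allows one ops table to serve callbacks that never touch TCP-specific fields, while those that do (tcp_vegas_cong_avoid above) recover the TCP view locally with tcp_sk(sk).
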
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index ef827242c940..0c340c3756c2 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -8,7 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/skbuff.h> 10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h> 11#include <linux/inet_diag.h>
12#include <net/tcp.h> 12#include <net/tcp.h>
13 13
14/* TCP Westwood structure */ 14/* TCP Westwood structure */
@@ -40,9 +40,9 @@ struct westwood {
40 * way as soon as possible. It will reasonably happen within the first 40 * way as soon as possible. It will reasonably happen within the first
41 * RTT period of the connection lifetime. 41 * RTT period of the connection lifetime.
42 */ 42 */
43static void tcp_westwood_init(struct tcp_sock *tp) 43static void tcp_westwood_init(struct sock *sk)
44{ 44{
45 struct westwood *w = tcp_ca(tp); 45 struct westwood *w = inet_csk_ca(sk);
46 46
47 w->bk = 0; 47 w->bk = 0;
48 w->bw_ns_est = 0; 48 w->bw_ns_est = 0;
@@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp)
51 w->cumul_ack = 0; 51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; 52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp; 53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una; 54 w->snd_una = tcp_sk(sk)->snd_una;
55} 55}
56 56
57/* 57/*
@@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta)
74 * Called after processing group of packets. 74 * Called after processing group of packets.
75 * but all westwood needs is the last sample of srtt. 75 * but all westwood needs is the last sample of srtt.
76 */ 76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) 77static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
78{ 78{
79 struct westwood *w = tcp_ca(tp); 79 struct westwood *w = inet_csk_ca(sk);
80 if (cnt > 0) 80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3; 81 w->rtt = tcp_sk(sk)->srtt >> 3;
82} 82}
83 83
84/* 84/*
@@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
86 * It updates RTT evaluation window if it is the right moment to do 86 * It updates RTT evaluation window if it is the right moment to do
87 * it. If so it calls filter for evaluating bandwidth. 87 * it. If so it calls filter for evaluating bandwidth.
88 */ 88 */
89static void westwood_update_window(struct tcp_sock *tp) 89static void westwood_update_window(struct sock *sk)
90{ 90{
91 struct westwood *w = tcp_ca(tp); 91 struct westwood *w = inet_csk_ca(sk);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx; 92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93 93
94 /* 94 /*
@@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp)
114 * header prediction is successful. In such case in fact update is 114 * header prediction is successful. In such case in fact update is
115 * straight forward and doesn't need any particular care. 115 * straight forward and doesn't need any particular care.
116 */ 116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp) 117static inline void westwood_fast_bw(struct sock *sk)
118{ 118{
119 struct westwood *w = tcp_ca(tp); 119 const struct tcp_sock *tp = tcp_sk(sk);
120 struct westwood *w = inet_csk_ca(sk);
120 121
121 westwood_update_window(tp); 122 westwood_update_window(sk);
122 123
123 w->bk += tp->snd_una - w->snd_una; 124 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una; 125 w->snd_una = tp->snd_una;
@@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp)
130 * This function evaluates cumul_ack for evaluating bk in case of 131 * This function evaluates cumul_ack for evaluating bk in case of
131 * delayed or partial acks. 132 * delayed or partial acks.
132 */ 133 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp) 134static inline u32 westwood_acked_count(struct sock *sk)
134{ 135{
135 struct westwood *w = tcp_ca(tp); 136 const struct tcp_sock *tp = tcp_sk(sk);
137 struct westwood *w = inet_csk_ca(sk);
136 138
137 w->cumul_ack = tp->snd_una - w->snd_una; 139 w->cumul_ack = tp->snd_una - w->snd_una;
138 140
@@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp)
160 return w->cumul_ack; 162 return w->cumul_ack;
161} 163}
162 164
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) 165static inline u32 westwood_bw_rttmin(const struct sock *sk)
164{ 166{
165 struct westwood *w = tcp_ca(tp); 167 const struct tcp_sock *tp = tcp_sk(sk);
168 const struct westwood *w = inet_csk_ca(sk);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); 169 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167} 170}
168 171
@@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
172 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 175 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
173 * so avoids ever returning 0. 176 * so avoids ever returning 0.
174 */ 177 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) 178static u32 tcp_westwood_cwnd_min(struct sock *sk)
176{ 179{
177 return westwood_bw_rttmin(tp); 180 return westwood_bw_rttmin(sk);
178} 181}
179 182
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) 183static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
181{ 184{
182 struct westwood *w = tcp_ca(tp); 185 struct tcp_sock *tp = tcp_sk(sk);
186 struct westwood *w = inet_csk_ca(sk);
183 187
184 switch(event) { 188 switch(event) {
185 case CA_EVENT_FAST_ACK: 189 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp); 190 westwood_fast_bw(sk);
187 break; 191 break;
188 192
189 case CA_EVENT_COMPLETE_CWR: 193 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); 194 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
191 break; 195 break;
192 196
193 case CA_EVENT_FRTO: 197 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp); 198 tp->snd_ssthresh = westwood_bw_rttmin(sk);
195 break; 199 break;
196 200
197 case CA_EVENT_SLOW_ACK: 201 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp); 202 westwood_update_window(sk);
199 w->bk += westwood_acked_count(tp); 203 w->bk += westwood_acked_count(sk);
200 w->rtt_min = min(w->rtt, w->rtt_min); 204 w->rtt_min = min(w->rtt, w->rtt_min);
201 break; 205 break;
202 206
@@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
208 212
209 213
210/* Extract info for Tcp socket info provided via netlink. */ 214/* Extract info for Tcp socket info provided via netlink. */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, 215static void tcp_westwood_info(struct sock *sk, u32 ext,
212 struct sk_buff *skb) 216 struct sk_buff *skb)
213{ 217{
214 const struct westwood *ca = tcp_ca(tp); 218 const struct westwood *ca = inet_csk_ca(sk);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { 219 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
216 struct rtattr *rta; 220 struct rtattr *rta;
217 struct tcpvegas_info *info; 221 struct tcpvegas_info *info;
218 222
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); 223 rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta); 224 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1; 225 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0; 226 info->tcpv_rttcnt = 0;
@@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = {
242 246
243static int __init tcp_westwood_register(void) 247static int __init tcp_westwood_register(void)
244{ 248{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); 249 BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood); 250 return tcp_register_congestion_control(&tcp_westwood);
247} 251}
248 252
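
westwood_bw_rttmin() in the hunks above turns the estimated bandwidth-delay product into a congestion-window floor measured in packets: bw_est * rtt_min gives bytes in flight, dividing by mss_cache converts to segments, and the max_t(..., 2) clamp keeps the window from ever collapsing below two segments. A runnable user-space sketch of the same arithmetic, with illustrative units (bytes per tick, ticks) rather than the kernel's actual fixed-point scaling:

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative only: bandwidth in bytes/tick, RTT in ticks. */
    static uint32_t bw_rttmin(uint32_t bw_est, uint32_t rtt_min, uint32_t mss)
    {
        uint32_t cwnd = (bw_est * rtt_min) / mss;

        return cwnd > 2 ? cwnd : 2;   /* never shrink below two segments */
    }

    int main(void)
    {
        /* 1448-byte MSS, 100-tick min RTT, 30 kB/tick estimated rate. */
        printf("cwnd floor = %u packets\n", bw_rttmin(30000, 100, 1448));

        /* A degenerate estimate still yields the clamp value of 2. */
        printf("cwnd floor = %u packets\n", bw_rttmin(1, 1, 1448));
        return 0;
    }
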
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7c24e64b443f..e5beca7de86c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,7 +95,8 @@
95#include <linux/ipv6.h> 95#include <linux/ipv6.h>
96#include <linux/netdevice.h> 96#include <linux/netdevice.h>
97#include <net/snmp.h> 97#include <net/snmp.h>
98#include <net/tcp.h> 98#include <net/ip.h>
99#include <net/tcp_states.h>
99#include <net/protocol.h> 100#include <net/protocol.h>
100#include <linux/skbuff.h> 101#include <linux/skbuff.h>
101#include <linux/proc_fs.h> 102#include <linux/proc_fs.h>
@@ -112,7 +113,7 @@
112 * Snmp MIB for the UDP layer 113 * Snmp MIB for the UDP layer
113 */ 114 */
114 115
115DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); 116DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
116 117
117struct hlist_head udp_hash[UDP_HTABLE_SIZE]; 118struct hlist_head udp_hash[UDP_HTABLE_SIZE];
118DEFINE_RWLOCK(udp_hash_lock); 119DEFINE_RWLOCK(udp_hash_lock);
@@ -628,7 +629,7 @@ back_from_confirm:
628 /* ... which is an evident application bug. --ANK */ 629 /* ... which is an evident application bug. --ANK */
629 release_sock(sk); 630 release_sock(sk);
630 631
631 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n")); 632 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
632 err = -EINVAL; 633 err = -EINVAL;
633 goto out; 634 goto out;
634 } 635 }
@@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
693 if (unlikely(!up->pending)) { 694 if (unlikely(!up->pending)) {
694 release_sock(sk); 695 release_sock(sk);
695 696
696 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n")); 697 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
697 return -EINVAL; 698 return -EINVAL;
698 } 699 }
699 700
@@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1102 skb->ip_summed = CHECKSUM_UNNECESSARY; 1103 skb->ip_summed = CHECKSUM_UNNECESSARY;
1103 if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) 1104 if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
1104 return 0; 1105 return 0;
1105 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n")); 1106 LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
1106 skb->ip_summed = CHECKSUM_NONE; 1107 skb->ip_summed = CHECKSUM_NONE;
1107 } 1108 }
1108 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 1109 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1181,14 +1182,13 @@ int udp_rcv(struct sk_buff *skb)
1181 return(0); 1182 return(0);
1182 1183
1183short_packet: 1184short_packet:
1184 NETDEBUG(if (net_ratelimit()) 1185 LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
1185 printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", 1186 NIPQUAD(saddr),
1186 NIPQUAD(saddr), 1187 ntohs(uh->source),
1187 ntohs(uh->source), 1188 ulen,
1188 ulen, 1189 len,
1189 len, 1190 NIPQUAD(daddr),
1190 NIPQUAD(daddr), 1191 ntohs(uh->dest));
1191 ntohs(uh->dest)));
1192no_header: 1192no_header:
1193 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1193 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1194 kfree_skb(skb); 1194 kfree_skb(skb);
@@ -1199,13 +1199,12 @@ csum_error:
1199 * RFC1122: OK. Discards the bad packet silently (as far as 1199 * RFC1122: OK. Discards the bad packet silently (as far as
1200 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 1200 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1201 */ 1201 */
1202 NETDEBUG(if (net_ratelimit()) 1202 LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
1203 printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", 1203 NIPQUAD(saddr),
1204 NIPQUAD(saddr), 1204 ntohs(uh->source),
1205 ntohs(uh->source), 1205 NIPQUAD(daddr),
1206 NIPQUAD(daddr), 1206 ntohs(uh->dest),
1207 ntohs(uh->dest), 1207 ulen);
1208 ulen));
1209drop: 1208drop:
1210 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1209 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1211 kfree_skb(skb); 1210 kfree_skb(skb);
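
The udp.c hunks replace the open-coded NETDEBUG(if (net_ratelimit()) printk(...)) pattern with LIMIT_NETDEBUG(fmt, ...), folding the rate-limit test into the macro so call sites shrink to a plain format string and arguments. A self-contained user-space sketch of such a macro, assuming a toy once-per-second limiter in place of net_ratelimit() (names here are hypothetical):

    #include <stdio.h>
    #include <time.h>

    /* Hypothetical stand-in for net_ratelimit(): allow at most one
     * message per second of wall-clock time. */
    static int ratelimit(void)
    {
        static time_t last;
        time_t now = time(NULL);

        if (now == last)
            return 0;
        last = now;
        return 1;
    }

    /* Same shape as the call sites above: the rate check lives in one
     * place instead of being repeated at every debug message. */
    #define LIMIT_DEBUG(fmt, ...) \
        do { if (ratelimit()) fprintf(stderr, fmt, ##__VA_ARGS__); } while (0)

    int main(void)
    {
        for (int i = 0; i < 5; i++)
            LIMIT_DEBUG("udp cork app bug %d\n", i); /* prints once/sec */
        return 0;
    }
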
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
deleted file mode 100644
index 6aecd7a43534..000000000000
--- a/net/ipv4/utils.c
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Various kernel-resident INET utility functions; mainly
7 * for format conversion and debugging output.
8 *
9 * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
10 *
11 * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : verify_area check.
15 * Alan Cox : removed old debugging.
16 * Andi Kleen : add net_ratelimit()
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24#include <linux/module.h>
25#include <linux/types.h>
26#include <asm/byteorder.h>
27
28/*
29 * Convert an ASCII string to binary IP.
30 */
31
32__u32 in_aton(const char *str)
33{
34 unsigned long l;
35 unsigned int val;
36 int i;
37
38 l = 0;
39 for (i = 0; i < 4; i++)
40 {
41 l <<= 8;
42 if (*str != '\0')
43 {
44 val = 0;
45 while (*str != '\0' && *str != '.')
46 {
47 val *= 10;
48 val += *str - '0';
49 str++;
50 }
51 l |= val;
52 if (*str != '\0')
53 str++;
54 }
55 }
56 return(htonl(l));
57}
58
59EXPORT_SYMBOL(in_aton);
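
The deleted in_aton() above parses dotted-quad notation with a bare accumulate-and-shift loop: each octet is built up decimally, shifted into a 32-bit host-order word, and the whole result converted to network byte order at the end. Note that the loop performs no validation, so malformed input is the caller's problem. A self-contained, runnable user-space equivalent of the removed function:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>   /* htonl(), ntohl() */

    static uint32_t my_in_aton(const char *str)
    {
        uint32_t l = 0;

        for (int i = 0; i < 4; i++) {
            l <<= 8;
            if (*str != '\0') {
                uint32_t val = 0;

                while (*str != '\0' && *str != '.') {
                    val = val * 10 + (uint32_t)(*str - '0');
                    str++;
                }
                l |= val;
                if (*str != '\0')
                    str++;   /* skip the dot */
            }
        }
        return htonl(l);
    }

    int main(void)
    {
        uint32_t a = my_in_aton("192.168.0.1");

        printf("0x%08x\n", ntohl(a));   /* prints 0xc0a80001 */
        return 0;
    }
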
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 050611d7a967..d23e07fc81fa 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -128,8 +128,10 @@ void __init xfrm4_state_init(void)
128 xfrm_state_register_afinfo(&xfrm4_state_afinfo); 128 xfrm_state_register_afinfo(&xfrm4_state_afinfo);
129} 129}
130 130
131#if 0
131void __exit xfrm4_state_fini(void) 132void __exit xfrm4_state_fini(void)
132{ 133{
133 xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); 134 xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
134} 135}
136#endif /* 0 */
135 137
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index e1fe360ed27a..afbb0d4cc305 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -78,10 +78,9 @@ static int ipip_rcv(struct sk_buff *skb)
78static void ipip_err(struct sk_buff *skb, u32 info) 78static void ipip_err(struct sk_buff *skb, u32 info)
79{ 79{
80 struct xfrm_tunnel *handler = ipip_handler; 80 struct xfrm_tunnel *handler = ipip_handler;
81 u32 arg = info;
82 81
83 if (handler) 82 if (handler)
84 handler->err_handler(skb, &arg); 83 handler->err_handler(skb, info);
85} 84}
86 85
87static int ipip_init_state(struct xfrm_state *x) 86static int ipip_init_state(struct xfrm_state *x)
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index e66ca9381cfd..ab7a9124f985 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -1,6 +1,26 @@
1# 1#
2# IPv6 configuration 2# IPv6 configuration
3# 3#
4
5# IPv6 as module will cause a CRASH if you try to unload it
6config IPV6
7 tristate "The IPv6 protocol"
8 default m
9 select CRYPTO if IPV6_PRIVACY
10 select CRYPTO_MD5 if IPV6_PRIVACY
11 ---help---
12 This is complemental support for the IP version 6.
13 You will still be able to do traditional IPv4 networking as well.
14
15 For general information about IPv6, see
16 <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
17 For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
18 For specific information about IPv6 under Linux, read the HOWTO at
19 <http://www.bieringer.de/linux/IPv6/>.
20
21 To compile this protocol support as a module, choose M here: the
22 module will be called ipv6.
23
4config IPV6_PRIVACY 24config IPV6_PRIVACY
5 bool "IPv6: Privacy Extensions (RFC 3041) support" 25 bool "IPv6: Privacy Extensions (RFC 3041) support"
6 depends on IPV6 26 depends on IPV6
@@ -71,7 +91,6 @@ config INET6_TUNNEL
71config IPV6_TUNNEL 91config IPV6_TUNNEL
72 tristate "IPv6: IPv6-in-IPv6 tunnel" 92 tristate "IPv6: IPv6-in-IPv6 tunnel"
73 depends on IPV6 93 depends on IPV6
74 select INET6_TUNNEL
75 ---help--- 94 ---help---
76 Support for IPv6-in-IPv6 tunnels described in RFC 2473. 95 Support for IPv6-in-IPv6 tunnels described in RFC 2473.
77 96
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index b39e04940590..6460eec834b7 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
8 route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ 8 route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
9 protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ 9 protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
10 exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ 10 exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
11 ip6_flowlabel.o ipv6_syms.o 11 ip6_flowlabel.o ipv6_syms.o netfilter.o
12 12
13ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ 13ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
14 xfrm6_output.o 14 xfrm6_output.o
@@ -23,3 +23,5 @@ obj-$(CONFIG_NETFILTER) += netfilter/
23obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o 23obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
24 24
25obj-y += exthdrs_core.o 25obj-y += exthdrs_core.o
26
27obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 77004b9456c0..6d6fb74f3b52 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1041,9 +1041,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
1041 const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; 1041 const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
1042 const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); 1042 const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2);
1043 u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; 1043 u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
1044 u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); 1044 u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
1045 int sk_ipv6only = ipv6_only_sock(sk); 1045 int sk_ipv6only = ipv6_only_sock(sk);
1046 int sk2_ipv6only = tcp_v6_ipv6only(sk2); 1046 int sk2_ipv6only = inet_v6_ipv6only(sk2);
1047 int addr_type = ipv6_addr_type(sk_rcv_saddr6); 1047 int addr_type = ipv6_addr_type(sk_rcv_saddr6);
1048 int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; 1048 int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
1049 1049
@@ -1126,7 +1126,7 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr)
1126 __ipv6_dev_mc_dec(idev, &maddr); 1126 __ipv6_dev_mc_dec(idev, &maddr);
1127} 1127}
1128 1128
1129void addrconf_join_anycast(struct inet6_ifaddr *ifp) 1129static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
1130{ 1130{
1131 struct in6_addr addr; 1131 struct in6_addr addr;
1132 ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); 1132 ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -1135,7 +1135,7 @@ void addrconf_join_anycast(struct inet6_ifaddr *ifp)
1135 ipv6_dev_ac_inc(ifp->idev->dev, &addr); 1135 ipv6_dev_ac_inc(ifp->idev->dev, &addr);
1136} 1136}
1137 1137
1138void addrconf_leave_anycast(struct inet6_ifaddr *ifp) 1138static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
1139{ 1139{
1140 struct in6_addr addr; 1140 struct in6_addr addr;
1141 ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); 1141 ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -2858,16 +2858,16 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
2858 2858
2859 skb = alloc_skb(size, GFP_ATOMIC); 2859 skb = alloc_skb(size, GFP_ATOMIC);
2860 if (!skb) { 2860 if (!skb) {
2861 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS); 2861 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, ENOBUFS);
2862 return; 2862 return;
2863 } 2863 }
2864 if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { 2864 if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
2865 kfree_skb(skb); 2865 kfree_skb(skb);
2866 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL); 2866 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, EINVAL);
2867 return; 2867 return;
2868 } 2868 }
2869 NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR; 2869 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFADDR;
2870 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC); 2870 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFADDR, GFP_ATOMIC);
2871} 2871}
2872 2872
2873static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, 2873static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
@@ -2994,16 +2994,16 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
2994 2994
2995 skb = alloc_skb(size, GFP_ATOMIC); 2995 skb = alloc_skb(size, GFP_ATOMIC);
2996 if (!skb) { 2996 if (!skb) {
2997 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS); 2997 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, ENOBUFS);
2998 return; 2998 return;
2999 } 2999 }
3000 if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) { 3000 if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) {
3001 kfree_skb(skb); 3001 kfree_skb(skb);
3002 netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL); 3002 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, EINVAL);
3003 return; 3003 return;
3004 } 3004 }
3005 NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO; 3005 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFINFO;
3006 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC); 3006 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC);
3007} 3007}
3008 3008
3009static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, 3009static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
@@ -3054,16 +3054,16 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
3054 3054
3055 skb = alloc_skb(size, GFP_ATOMIC); 3055 skb = alloc_skb(size, GFP_ATOMIC);
3056 if (!skb) { 3056 if (!skb) {
3057 netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS); 3057 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, ENOBUFS);
3058 return; 3058 return;
3059 } 3059 }
3060 if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) { 3060 if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) {
3061 kfree_skb(skb); 3061 kfree_skb(skb);
3062 netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL); 3062 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, EINVAL);
3063 return; 3063 return;
3064 } 3064 }
3065 NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX; 3065 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_PREFIX;
3066 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC); 3066 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_PREFIX, GFP_ATOMIC);
3067} 3067}
3068 3068
3069static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = { 3069static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = {
@@ -3593,10 +3593,8 @@ void __exit addrconf_cleanup(void)
3593 rtnl_unlock(); 3593 rtnl_unlock();
3594 3594
3595#ifdef CONFIG_IPV6_PRIVACY 3595#ifdef CONFIG_IPV6_PRIVACY
3596 if (likely(md5_tfm != NULL)) { 3596 crypto_free_tfm(md5_tfm);
3597 crypto_free_tfm(md5_tfm); 3597 md5_tfm = NULL;
3598 md5_tfm = NULL;
3599 }
3600#endif 3598#endif
3601 3599
3602#ifdef CONFIG_PROC_FS 3600#ifdef CONFIG_PROC_FS
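
The addrconf.c hunks move the IPv6 notifiers from the old RTMGRP_* bitmask constants to numbered RTNLGRP_* groups, and NETLINK_CB's dst_groups mask becomes a single dst_group index. The practical difference is that a 32-bit bitmask caps rtnetlink at 32 multicast groups, while group numbers do not; legacy listeners that still think in masks convert with 1 << (group - 1). A small demonstration of the two representations (the enum values below are illustrative, not copied from the uapi headers):

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative group numbers in the numbered (RTNLGRP-style) scheme. */
    enum { GRP_IPV6_IFADDR = 9, GRP_IPV6_IFINFO = 12, GRP_IPV6_PREFIX = 18 };

    /* Legacy bitmask view, valid only while group <= 32. */
    static uint32_t group_to_mask(unsigned int group)
    {
        return group ? 1u << (group - 1) : 0;
    }

    int main(void)
    {
        printf("IFADDR: group %u -> mask 0x%08x\n",
               GRP_IPV6_IFADDR, group_to_mask(GRP_IPV6_IFADDR));
        printf("PREFIX: group %u -> mask 0x%08x\n",
               GRP_IPV6_PREFIX, group_to_mask(GRP_IPV6_PREFIX));
        return 0;
    }
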
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 28d9bcab0970..4f8795af2edb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -44,6 +44,7 @@
44#include <linux/netdevice.h> 44#include <linux/netdevice.h>
45#include <linux/icmpv6.h> 45#include <linux/icmpv6.h>
46#include <linux/smp_lock.h> 46#include <linux/smp_lock.h>
47#include <linux/netfilter_ipv6.h>
47 48
48#include <net/ip.h> 49#include <net/ip.h>
49#include <net/ipv6.h> 50#include <net/ipv6.h>
@@ -66,45 +67,14 @@ MODULE_AUTHOR("Cast of dozens");
66MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); 67MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
67MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
68 69
69/* IPv6 procfs goodies... */
70
71#ifdef CONFIG_PROC_FS
72extern int raw6_proc_init(void);
73extern void raw6_proc_exit(void);
74extern int tcp6_proc_init(void);
75extern void tcp6_proc_exit(void);
76extern int udp6_proc_init(void);
77extern void udp6_proc_exit(void);
78extern int ipv6_misc_proc_init(void);
79extern void ipv6_misc_proc_exit(void);
80extern int ac6_proc_init(void);
81extern void ac6_proc_exit(void);
82extern int if6_proc_init(void);
83extern void if6_proc_exit(void);
84#endif
85
86int sysctl_ipv6_bindv6only; 70int sysctl_ipv6_bindv6only;
87 71
88#ifdef INET_REFCNT_DEBUG
89atomic_t inet6_sock_nr;
90EXPORT_SYMBOL(inet6_sock_nr);
91#endif
92
93/* The inetsw table contains everything that inet_create needs to 72/* The inetsw table contains everything that inet_create needs to
94 * build a new socket. 73 * build a new socket.
95 */ 74 */
96static struct list_head inetsw6[SOCK_MAX]; 75static struct list_head inetsw6[SOCK_MAX];
97static DEFINE_SPINLOCK(inetsw6_lock); 76static DEFINE_SPINLOCK(inetsw6_lock);
98 77
99static void inet6_sock_destruct(struct sock *sk)
100{
101 inet_sock_destruct(sk);
102
103#ifdef INET_REFCNT_DEBUG
104 atomic_dec(&inet6_sock_nr);
105#endif
106}
107
108static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) 78static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
109{ 79{
110 const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); 80 const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -185,7 +155,7 @@ static int inet6_create(struct socket *sock, int protocol)
185 inet->hdrincl = 1; 155 inet->hdrincl = 1;
186 } 156 }
187 157
188 sk->sk_destruct = inet6_sock_destruct; 158 sk->sk_destruct = inet_sock_destruct;
189 sk->sk_family = PF_INET6; 159 sk->sk_family = PF_INET6;
190 sk->sk_protocol = protocol; 160 sk->sk_protocol = protocol;
191 161
@@ -212,12 +182,17 @@ static int inet6_create(struct socket *sock, int protocol)
212 inet->pmtudisc = IP_PMTUDISC_DONT; 182 inet->pmtudisc = IP_PMTUDISC_DONT;
213 else 183 else
214 inet->pmtudisc = IP_PMTUDISC_WANT; 184 inet->pmtudisc = IP_PMTUDISC_WANT;
185 /*
186 * Increment only the relevant sk_prot->socks debug field, this changes
187 * the previous behaviour of incrementing both the equivalent to
188 * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
189 *
190 * This allows better debug granularity as we'll know exactly how many
191 * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
192 * transport protocol socks. -acme
193 */
194 sk_refcnt_debug_inc(sk);
215 195
216
217#ifdef INET_REFCNT_DEBUG
218 atomic_inc(&inet6_sock_nr);
219 atomic_inc(&inet_sock_nr);
220#endif
221 if (inet->num) { 196 if (inet->num) {
222 /* It assumes that any protocol which allows 197 /* It assumes that any protocol which allows
223 * the user to assign a number at socket 198 * the user to assign a number at socket
@@ -513,11 +488,6 @@ static struct net_proto_family inet6_family_ops = {
513 .owner = THIS_MODULE, 488 .owner = THIS_MODULE,
514}; 489};
515 490
516#ifdef CONFIG_SYSCTL
517extern void ipv6_sysctl_register(void);
518extern void ipv6_sysctl_unregister(void);
519#endif
520
521/* Same as inet6_dgram_ops, sans udp_poll. */ 491/* Same as inet6_dgram_ops, sans udp_poll. */
522static struct proto_ops inet6_sockraw_ops = { 492static struct proto_ops inet6_sockraw_ops = {
523 .family = PF_INET6, 493 .family = PF_INET6,
@@ -684,8 +654,6 @@ static void cleanup_ipv6_mibs(void)
684 snmp6_mib_free((void **)udp_stats_in6); 654 snmp6_mib_free((void **)udp_stats_in6);
685} 655}
686 656
687extern int ipv6_misc_proc_init(void);
688
689static int __init inet6_init(void) 657static int __init inet6_init(void)
690{ 658{
691 struct sk_buff *dummy_skb; 659 struct sk_buff *dummy_skb;
@@ -757,6 +725,9 @@ static int __init inet6_init(void)
757 err = igmp6_init(&inet6_family_ops); 725 err = igmp6_init(&inet6_family_ops);
758 if (err) 726 if (err)
759 goto igmp_fail; 727 goto igmp_fail;
728 err = ipv6_netfilter_init();
729 if (err)
730 goto netfilter_fail;
760 /* Create /proc/foo6 entries. */ 731 /* Create /proc/foo6 entries. */
761#ifdef CONFIG_PROC_FS 732#ifdef CONFIG_PROC_FS
762 err = -ENOMEM; 733 err = -ENOMEM;
@@ -813,6 +784,8 @@ proc_tcp6_fail:
813 raw6_proc_exit(); 784 raw6_proc_exit();
814proc_raw6_fail: 785proc_raw6_fail:
815#endif 786#endif
787 ipv6_netfilter_fini();
788netfilter_fail:
816 igmp6_cleanup(); 789 igmp6_cleanup();
817igmp_fail: 790igmp_fail:
818 ndisc_cleanup(); 791 ndisc_cleanup();
@@ -852,6 +825,7 @@ static void __exit inet6_exit(void)
852 ip6_route_cleanup(); 825 ip6_route_cleanup();
853 ipv6_packet_cleanup(); 826 ipv6_packet_cleanup();
854 igmp6_cleanup(); 827 igmp6_cleanup();
828 ipv6_netfilter_fini();
855 ndisc_cleanup(); 829 ndisc_cleanup();
856 icmpv6_cleanup(); 830 icmpv6_cleanup();
857#ifdef CONFIG_SYSCTL 831#ifdef CONFIG_SYSCTL
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 986fdfdccbcd..f3629730eb15 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -131,10 +131,10 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len)
131 case NEXTHDR_HOP: 131 case NEXTHDR_HOP:
132 case NEXTHDR_DEST: 132 case NEXTHDR_DEST:
133 if (!zero_out_mutable_opts(exthdr.opth)) { 133 if (!zero_out_mutable_opts(exthdr.opth)) {
134 LIMIT_NETDEBUG(printk( 134 LIMIT_NETDEBUG(
135 KERN_WARNING "overrun %sopts\n", 135 KERN_WARNING "overrun %sopts\n",
136 nexthdr == NEXTHDR_HOP ? 136 nexthdr == NEXTHDR_HOP ?
137 "hop" : "dest")); 137 "hop" : "dest");
138 return -EINVAL; 138 return -EINVAL;
139 } 139 }
140 break; 140 break;
@@ -293,8 +293,7 @@ static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc
293 skb_push(skb, skb->data - skb->nh.raw); 293 skb_push(skb, skb->data - skb->nh.raw);
294 ahp->icv(ahp, skb, ah->auth_data); 294 ahp->icv(ahp, skb, ah->auth_data);
295 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { 295 if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
296 LIMIT_NETDEBUG( 296 LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n");
297 printk(KERN_WARNING "ipsec ah authentication error\n"));
298 x->stats.integrity_failed++; 297 x->stats.integrity_failed++;
299 goto free_out; 298 goto free_out;
300 } 299 }
@@ -332,9 +331,9 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
332 if (!x) 331 if (!x)
333 return; 332 return;
334 333
335 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/" 334 NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
336 "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", 335 "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
337 ntohl(ah->spi), NIP6(iph->daddr))); 336 ntohl(ah->spi), NIP6(iph->daddr));
338 337
339 xfrm_state_put(x); 338 xfrm_state_put(x);
340} 339}
@@ -402,10 +401,8 @@ static int ah6_init_state(struct xfrm_state *x)
402 401
403error: 402error:
404 if (ahp) { 403 if (ahp) {
405 if (ahp->work_icv) 404 kfree(ahp->work_icv);
406 kfree(ahp->work_icv); 405 crypto_free_tfm(ahp->tfm);
407 if (ahp->tfm)
408 crypto_free_tfm(ahp->tfm);
409 kfree(ahp); 406 kfree(ahp);
410 } 407 }
411 return -EINVAL; 408 return -EINVAL;
@@ -418,14 +415,10 @@ static void ah6_destroy(struct xfrm_state *x)
418 if (!ahp) 415 if (!ahp)
419 return; 416 return;
420 417
421 if (ahp->work_icv) { 418 kfree(ahp->work_icv);
422 kfree(ahp->work_icv); 419 ahp->work_icv = NULL;
423 ahp->work_icv = NULL; 420 crypto_free_tfm(ahp->tfm);
424 } 421 ahp->tfm = NULL;
425 if (ahp->tfm) {
426 crypto_free_tfm(ahp->tfm);
427 ahp->tfm = NULL;
428 }
429 kfree(ahp); 422 kfree(ahp);
430} 423}
431 424
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 5229365cd8b4..01468fab3d3d 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -29,6 +29,7 @@
29#include <net/addrconf.h> 29#include <net/addrconf.h>
30#include <net/transp_v6.h> 30#include <net/transp_v6.h>
31#include <net/ip6_route.h> 31#include <net/ip6_route.h>
32#include <net/tcp_states.h>
32 33
33#include <linux/errqueue.h> 34#include <linux/errqueue.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -588,8 +589,8 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
588 break; 589 break;
589 590
590 default: 591 default:
591 LIMIT_NETDEBUG( 592 LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n",
592 printk(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type)); 593 cmsg->cmsg_type);
593 err = -EINVAL; 594 err = -EINVAL;
594 break; 595 break;
595 }; 596 };
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 324db62515a2..9b27460f0cc7 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -212,8 +212,7 @@ static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, stru
212 212
213 padlen = nexthdr[0]; 213 padlen = nexthdr[0];
214 if (padlen+2 >= elen) { 214 if (padlen+2 >= elen) {
215 LIMIT_NETDEBUG( 215 LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen);
216 printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen));
217 ret = -EINVAL; 216 ret = -EINVAL;
218 goto out; 217 goto out;
219 } 218 }
@@ -277,22 +276,14 @@ static void esp6_destroy(struct xfrm_state *x)
277 if (!esp) 276 if (!esp)
278 return; 277 return;
279 278
280 if (esp->conf.tfm) { 279 crypto_free_tfm(esp->conf.tfm);
281 crypto_free_tfm(esp->conf.tfm); 280 esp->conf.tfm = NULL;
282 esp->conf.tfm = NULL; 281 kfree(esp->conf.ivec);
283 } 282 esp->conf.ivec = NULL;
284 if (esp->conf.ivec) { 283 crypto_free_tfm(esp->auth.tfm);
285 kfree(esp->conf.ivec); 284 esp->auth.tfm = NULL;
286 esp->conf.ivec = NULL; 285 kfree(esp->auth.work_icv);
287 } 286 esp->auth.work_icv = NULL;
288 if (esp->auth.tfm) {
289 crypto_free_tfm(esp->auth.tfm);
290 esp->auth.tfm = NULL;
291 }
292 if (esp->auth.work_icv) {
293 kfree(esp->auth.work_icv);
294 esp->auth.work_icv = NULL;
295 }
296 kfree(esp); 287 kfree(esp);
297} 288}
298 289
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index e0839eafc3a9..5be6da2584ee 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -424,8 +424,8 @@ static int ipv6_hop_ra(struct sk_buff *skb, int optoff)
424 IP6CB(skb)->ra = optoff; 424 IP6CB(skb)->ra = optoff;
425 return 1; 425 return 1;
426 } 426 }
427 LIMIT_NETDEBUG( 427 LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n",
428 printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1])); 428 skb->nh.raw[optoff+1]);
429 kfree_skb(skb); 429 kfree_skb(skb);
430 return 0; 430 return 0;
431} 431}
@@ -437,8 +437,8 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
437 u32 pkt_len; 437 u32 pkt_len;
438 438
439 if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) { 439 if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) {
440 LIMIT_NETDEBUG( 440 LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
441 printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1])); 441 skb->nh.raw[optoff+1]);
442 IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); 442 IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
443 goto drop; 443 goto drop;
444 } 444 }
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index ff3ec9822e36..fa8f1bb0aa52 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -67,7 +67,7 @@
67#include <asm/uaccess.h> 67#include <asm/uaccess.h>
68#include <asm/system.h> 68#include <asm/system.h>
69 69
70DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); 70DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
71 71
72/* 72/*
73 * The ICMP socket(s). This is the most convenient way to flow control 73 * The ICMP socket(s). This is the most convenient way to flow control
@@ -332,8 +332,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
332 * for now we don't know that. 332 * for now we don't know that.
333 */ 333 */
334 if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { 334 if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
335 LIMIT_NETDEBUG( 335 LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n");
336 printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"));
337 return; 336 return;
338 } 337 }
339 338
@@ -341,8 +340,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
341 * Never answer to a ICMP packet. 340 * Never answer to a ICMP packet.
342 */ 341 */
343 if (is_ineligible(skb)) { 342 if (is_ineligible(skb)) {
344 LIMIT_NETDEBUG( 343 LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n");
345 printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"));
346 return; 344 return;
347 } 345 }
348 346
@@ -393,8 +391,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
393 len = skb->len - msg.offset; 391 len = skb->len - msg.offset;
394 len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); 392 len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr));
395 if (len < 0) { 393 if (len < 0) {
396 LIMIT_NETDEBUG( 394 LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n");
397 printk(KERN_DEBUG "icmp: len problem\n"));
398 goto out_dst_release; 395 goto out_dst_release;
399 } 396 }
400 397
@@ -551,7 +548,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info)
551 548
552 read_lock(&raw_v6_lock); 549 read_lock(&raw_v6_lock);
553 if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { 550 if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) {
554 while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) { 551 while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr,
552 IP6CB(skb)->iif))) {
555 rawv6_err(sk, skb, NULL, type, code, inner_offset, info); 553 rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
556 sk = sk_next(sk); 554 sk = sk_next(sk);
557 } 555 }
@@ -583,17 +581,15 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
583 skb->ip_summed = CHECKSUM_UNNECESSARY; 581 skb->ip_summed = CHECKSUM_UNNECESSARY;
584 if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, 582 if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
585 skb->csum)) { 583 skb->csum)) {
586 LIMIT_NETDEBUG( 584 LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n");
587 printk(KERN_DEBUG "ICMPv6 hw checksum failed\n"));
588 skb->ip_summed = CHECKSUM_NONE; 585 skb->ip_summed = CHECKSUM_NONE;
589 } 586 }
590 } 587 }
591 if (skb->ip_summed == CHECKSUM_NONE) { 588 if (skb->ip_summed == CHECKSUM_NONE) {
592 if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, 589 if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
593 skb_checksum(skb, 0, skb->len, 0))) { 590 skb_checksum(skb, 0, skb->len, 0))) {
594 LIMIT_NETDEBUG( 591 LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
595 printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", 592 NIP6(*saddr), NIP6(*daddr));
596 NIP6(*saddr), NIP6(*daddr)));
597 goto discard_it; 593 goto discard_it;
598 } 594 }
599 } 595 }
@@ -669,8 +665,7 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
669 break; 665 break;
670 666
671 default: 667 default:
672 LIMIT_NETDEBUG( 668 LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n");
673 printk(KERN_DEBUG "icmpv6: msg of unknown type\n"));
674 669
675 /* informational */ 670 /* informational */
676 if (type & ICMPV6_INFOMSG_MASK) 671 if (type & ICMPV6_INFOMSG_MASK)
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
new file mode 100644
index 000000000000..01d5f46d4e40
--- /dev/null
+++ b/net/ipv6/inet6_hashtables.c
@@ -0,0 +1,81 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic INET6 transport hashtables
7 *
8 * Authors: Lotsa people, from code originally in tcp
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <linux/config.h>
17
18#include <linux/module.h>
19
20#include <net/inet_connection_sock.h>
21#include <net/inet_hashtables.h>
22#include <net/inet6_hashtables.h>
23
24struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
25 const struct in6_addr *daddr,
26 const unsigned short hnum, const int dif)
27{
28 struct sock *sk;
29 const struct hlist_node *node;
30 struct sock *result = NULL;
31 int score, hiscore = 0;
32
33 read_lock(&hashinfo->lhash_lock);
34 sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) {
35 if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
36 const struct ipv6_pinfo *np = inet6_sk(sk);
37
38 score = 1;
39 if (!ipv6_addr_any(&np->rcv_saddr)) {
40 if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
41 continue;
42 score++;
43 }
44 if (sk->sk_bound_dev_if) {
45 if (sk->sk_bound_dev_if != dif)
46 continue;
47 score++;
48 }
49 if (score == 3) {
50 result = sk;
51 break;
52 }
53 if (score > hiscore) {
54 hiscore = score;
55 result = sk;
56 }
57 }
58 }
59 if (result)
60 sock_hold(result);
61 read_unlock(&hashinfo->lhash_lock);
62 return result;
63}
64
65EXPORT_SYMBOL_GPL(inet6_lookup_listener);
66
67struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
68 const struct in6_addr *saddr, const u16 sport,
69 const struct in6_addr *daddr, const u16 dport,
70 const int dif)
71{
72 struct sock *sk;
73
74 local_bh_disable();
75 sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
76 local_bh_enable();
77
78 return sk;
79}
80
81EXPORT_SYMBOL_GPL(inet6_lookup);
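
The listener walk in inet6_lookup_listener() above scores each candidate socket: one point for the port/family match that got it into the loop, one more for an exactly matching bound address, and one more for a matching bound device. A perfect score of 3 short-circuits the walk; otherwise the best partial match wins, so the most specific listener always beats a wildcard. A runnable toy model of that scoring (types and addresses are illustrative):

    #include <stdio.h>
    #include <string.h>

    /* Toy model of the scoring walk: wildcard fields ("any", ifindex 0)
     * match everything but score lower than exact matches. */
    struct listener { const char *addr; int ifindex; };

    static int score(const struct listener *l, const char *daddr, int dif)
    {
        int s = 1;                       /* port+family already matched */

        if (strcmp(l->addr, "any") != 0) {
            if (strcmp(l->addr, daddr) != 0)
                return 0;                /* bound elsewhere: no match */
            s++;
        }
        if (l->ifindex) {
            if (l->ifindex != dif)
                return 0;
            s++;
        }
        return s;
    }

    int main(void)
    {
        struct listener ls[] = { { "any", 0 }, { "2001:db8::1", 0 },
                                 { "2001:db8::1", 2 } };
        int best = -1, hi = 0;

        for (int i = 0; i < 3; i++) {
            int s = score(&ls[i], "2001:db8::1", 2);

            if (s == 3) { best = i; break; }   /* cannot be beaten */
            if (s > hi) { hi = s; best = i; }
        }
        printf("winner: listener %d\n", best); /* -> 2, most specific */
        return 0;
    }
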
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1b354aa97934..16af874c9e8f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -49,7 +49,7 @@
49 49
50struct rt6_statistics rt6_stats; 50struct rt6_statistics rt6_stats;
51 51
52static kmem_cache_t * fib6_node_kmem; 52static kmem_cache_t * fib6_node_kmem __read_mostly;
53 53
54enum fib_walk_state_t 54enum fib_walk_state_t
55{ 55{
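
The __read_mostly annotation added to fib6_node_kmem (and to several statistics pointers elsewhere in this series) places a variable that is written once at boot but read on every lookup into a dedicated section, keeping it off cache lines that see frequent writes and so avoiding false sharing on SMP. A user-space imitation of the idea, assuming a GCC/Clang toolchain targeting ELF; the section name is merely the conventional one:

    #include <stdio.h>

    /* Group rarely-written, hot-read data into its own section so it
     * shares cache lines only with other read-mostly values. */
    #define __read_mostly __attribute__((__section__(".data.read_mostly")))

    static int lookup_cache_ready __read_mostly = 1;

    int main(void)
    {
        printf("ready=%d\n", lookup_cache_ready);
        return 0;
    }
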
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 866f10726c58..6e3480426939 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -56,7 +56,7 @@ static inline int ip6_rcv_finish( struct sk_buff *skb)
56 return dst_input(skb); 56 return dst_input(skb);
57} 57}
58 58
59int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 59int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
60{ 60{
61 struct ipv6hdr *hdr; 61 struct ipv6hdr *hdr;
62 u32 pkt_len; 62 u32 pkt_len;
@@ -166,8 +166,8 @@ resubmit:
166 nexthdr = skb->nh.raw[nhoff]; 166 nexthdr = skb->nh.raw[nhoff];
167 167
168 raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); 168 raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
169 if (raw_sk) 169 if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
170 ipv6_raw_deliver(skb, nexthdr); 170 raw_sk = NULL;
171 171
172 hash = nexthdr & (MAX_INET_PROTOS - 1); 172 hash = nexthdr & (MAX_INET_PROTOS - 1);
173 if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { 173 if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
@@ -198,12 +198,13 @@ resubmit:
198 if (!raw_sk) { 198 if (!raw_sk) {
199 if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { 199 if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
200 IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); 200 IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
201 icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff); 201 icmpv6_send(skb, ICMPV6_PARAMPROB,
202 ICMPV6_UNK_NEXTHDR, nhoff,
203 skb->dev);
202 } 204 }
203 } else { 205 } else
204 IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); 206 IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
205 kfree_skb(skb); 207 kfree_skb(skb);
206 }
207 } 208 }
208 rcu_read_unlock(); 209 rcu_read_unlock();
209 return 0; 210 return 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1f2c2f9e353f..01ef94f7c7f1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -153,51 +153,6 @@ int ip6_output(struct sk_buff *skb)
153 return ip6_output2(skb); 153 return ip6_output2(skb);
154} 154}
155 155
156#ifdef CONFIG_NETFILTER
157int ip6_route_me_harder(struct sk_buff *skb)
158{
159 struct ipv6hdr *iph = skb->nh.ipv6h;
160 struct dst_entry *dst;
161 struct flowi fl = {
162 .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
163 .nl_u =
164 { .ip6_u =
165 { .daddr = iph->daddr,
166 .saddr = iph->saddr, } },
167 .proto = iph->nexthdr,
168 };
169
170 dst = ip6_route_output(skb->sk, &fl);
171
172 if (dst->error) {
173 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
174 LIMIT_NETDEBUG(
175 printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
176 dst_release(dst);
177 return -EINVAL;
178 }
179
180 /* Drop old route. */
181 dst_release(skb->dst);
182
183 skb->dst = dst;
184 return 0;
185}
186#endif
187
188static inline int ip6_maybe_reroute(struct sk_buff *skb)
189{
190#ifdef CONFIG_NETFILTER
191 if (skb->nfcache & NFC_ALTERED){
192 if (ip6_route_me_harder(skb) != 0){
193 kfree_skb(skb);
194 return -EINVAL;
195 }
196 }
197#endif /* CONFIG_NETFILTER */
198 return dst_output(skb);
199}
200
201/* 156/*
202 * xmit an sk_buff (used by TCP) 157 * xmit an sk_buff (used by TCP)
203 */ 158 */
@@ -266,7 +221,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
266 mtu = dst_mtu(dst); 221 mtu = dst_mtu(dst);
267 if ((skb->len <= mtu) || ipfragok) { 222 if ((skb->len <= mtu) || ipfragok) {
268 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 223 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
269 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); 224 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
225 dst_output);
270 } 226 }
271 227
272 if (net_ratelimit()) 228 if (net_ratelimit())
@@ -321,7 +277,9 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
321 read_lock(&ip6_ra_lock); 277 read_lock(&ip6_ra_lock);
322 for (ra = ip6_ra_chain; ra; ra = ra->next) { 278 for (ra = ip6_ra_chain; ra; ra = ra->next) {
323 struct sock *sk = ra->sk; 279 struct sock *sk = ra->sk;
324 if (sk && ra->sel == sel) { 280 if (sk && ra->sel == sel &&
281 (!sk->sk_bound_dev_if ||
282 sk->sk_bound_dev_if == skb->dev->ifindex)) {
325 if (last) { 283 if (last) {
326 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 284 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
327 if (skb2) 285 if (skb2)
@@ -667,7 +625,7 @@ slow_path:
667 */ 625 */
668 626
669 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { 627 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
670 NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n")); 628 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
671 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); 629 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
672 err = -ENOMEM; 630 err = -ENOMEM;
673 goto fail; 631 goto fail;
@@ -792,13 +750,8 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
792 if (ipv6_addr_any(&fl->fl6_src)) { 750 if (ipv6_addr_any(&fl->fl6_src)) {
793 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); 751 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
794 752
795 if (err) { 753 if (err)
796#if IP6_DEBUG >= 2
797 printk(KERN_DEBUG "ip6_dst_lookup: "
798 "no available source address\n");
799#endif
800 goto out_err_release; 754 goto out_err_release;
801 }
802 } 755 }
803 756
804 return 0; 757 return 0;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index ba3b0c267f75..09613729404c 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1110,11 +1110,39 @@ ip6ip6_fb_tnl_dev_init(struct net_device *dev)
1110 return 0; 1110 return 0;
1111} 1111}
1112 1112
1113#ifdef CONFIG_INET6_TUNNEL
1113static struct xfrm6_tunnel ip6ip6_handler = { 1114static struct xfrm6_tunnel ip6ip6_handler = {
1114 .handler = ip6ip6_rcv, 1115 .handler = ip6ip6_rcv,
1115 .err_handler = ip6ip6_err, 1116 .err_handler = ip6ip6_err,
1116}; 1117};
1117 1118
1119static inline int ip6ip6_register(void)
1120{
1121 return xfrm6_tunnel_register(&ip6ip6_handler);
1122}
1123
1124static inline int ip6ip6_unregister(void)
1125{
1126 return xfrm6_tunnel_deregister(&ip6ip6_handler);
1127}
1128#else
1129static struct inet6_protocol xfrm6_tunnel_protocol = {
1130 .handler = ip6ip6_rcv,
1131 .err_handler = ip6ip6_err,
1132 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
1133};
1134
1135static inline int ip6ip6_register(void)
1136{
1137 return inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
1138}
1139
1140static inline int ip6ip6_unregister(void)
1141{
1142 return inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
1143}
1144#endif
1145
1118/** 1146/**
1119 * ip6_tunnel_init - register protocol and reserve needed resources 1147 * ip6_tunnel_init - register protocol and reserve needed resources
1120 * 1148 *
@@ -1125,7 +1153,7 @@ static int __init ip6_tunnel_init(void)
1125{ 1153{
1126 int err; 1154 int err;
1127 1155
1128 if (xfrm6_tunnel_register(&ip6ip6_handler) < 0) { 1156 if (ip6ip6_register() < 0) {
1129 printk(KERN_ERR "ip6ip6 init: can't register tunnel\n"); 1157 printk(KERN_ERR "ip6ip6 init: can't register tunnel\n");
1130 return -EAGAIN; 1158 return -EAGAIN;
1131 } 1159 }
@@ -1144,7 +1172,7 @@ static int __init ip6_tunnel_init(void)
1144 } 1172 }
1145 return 0; 1173 return 0;
1146fail: 1174fail:
1147 xfrm6_tunnel_deregister(&ip6ip6_handler); 1175 ip6ip6_unregister();
1148 return err; 1176 return err;
1149} 1177}
1150 1178
@@ -1154,7 +1182,7 @@ fail:
1154 1182
1155static void __exit ip6_tunnel_cleanup(void) 1183static void __exit ip6_tunnel_cleanup(void)
1156{ 1184{
1157 if (xfrm6_tunnel_deregister(&ip6ip6_handler) < 0) 1185 if (ip6ip6_unregister() < 0)
1158 printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n"); 1186 printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n");
1159 1187
1160 unregister_netdev(ip6ip6_fb_tnl_dev); 1188 unregister_netdev(ip6ip6_fb_tnl_dev);
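
Rather than sprinkle #ifdef CONFIG_INET6_TUNNEL through the init and exit paths, the ip6_tunnel.c hunks wrap the two registration strategies behind ip6ip6_register()/ip6ip6_unregister() inlines, so the callers stay identical whichever backend is compiled in. A compile-time-strategy sketch of the same pattern, with a hypothetical USE_XFRM_TUNNEL switch standing in for the kernel config symbol:

    #include <stdio.h>

    /* One call site, two compile-time strategies, selected once next
     * to the alternative definitions instead of at every caller. */
    #ifdef USE_XFRM_TUNNEL
    static inline int tunnel_register(void)
    {
        puts("registered via xfrm6_tunnel stub");
        return 0;
    }
    #else
    static inline int tunnel_register(void)
    {
        puts("registered via inet6 protocol stub");
        return 0;
    }
    #endif

    int main(void)
    {
        /* The caller never repeats the #ifdef. */
        return tunnel_register() < 0;
    }
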
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 423feb46ccc0..85bfbc69b2c3 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -341,8 +341,7 @@ static void ipcomp6_free_tfms(struct crypto_tfm **tfms)
341 341
342 for_each_cpu(cpu) { 342 for_each_cpu(cpu) {
343 struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu); 343 struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
344 if (tfm) 344 crypto_free_tfm(tfm);
345 crypto_free_tfm(tfm);
346 } 345 }
347 free_percpu(tfms); 346 free_percpu(tfms);
348} 347}
@@ -354,7 +353,7 @@ static struct crypto_tfm **ipcomp6_alloc_tfms(const char *alg_name)
354 int cpu; 353 int cpu;
355 354
356 /* This can be any valid CPU ID so we don't need locking. */ 355 /* This can be any valid CPU ID so we don't need locking. */
357 cpu = smp_processor_id(); 356 cpu = raw_smp_processor_id();
358 357
359 list_for_each_entry(pos, &ipcomp6_tfms_list, list) { 358 list_for_each_entry(pos, &ipcomp6_tfms_list, list) {
360 struct crypto_tfm *tfm; 359 struct crypto_tfm *tfm;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f3ef4c38d315..76466af8331e 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -55,7 +55,7 @@
55 55
56#include <asm/uaccess.h> 56#include <asm/uaccess.h>
57 57
58DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); 58DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly;
59 59
60static struct packet_type ipv6_packet_type = { 60static struct packet_type ipv6_packet_type = {
61 .type = __constant_htons(ETH_P_IPV6), 61 .type = __constant_htons(ETH_P_IPV6),
@@ -109,13 +109,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
109 return 0; 109 return 0;
110} 110}
111 111
112extern int ip6_mc_source(int add, int omode, struct sock *sk,
113 struct group_source_req *pgsr);
114extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf);
115extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
116 struct group_filter __user *optval, int __user *optlen);
117
118
119int ipv6_setsockopt(struct sock *sk, int level, int optname, 112int ipv6_setsockopt(struct sock *sk, int level, int optname,
120 char __user *optval, int optlen) 113 char __user *optval, int optlen)
121{ 114{
@@ -163,6 +156,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
163 fl6_free_socklist(sk); 156 fl6_free_socklist(sk);
164 ipv6_sock_mc_close(sk); 157 ipv6_sock_mc_close(sk);
165 158
159 /*
160 * Sock is moving from IPv6 to IPv4 (sk_prot), so
161 * remove it from the refcnt debug socks count in the
162 * original family...
163 */
164 sk_refcnt_debug_dec(sk);
165
166 if (sk->sk_protocol == IPPROTO_TCP) { 166 if (sk->sk_protocol == IPPROTO_TCP) {
167 struct tcp_sock *tp = tcp_sk(sk); 167 struct tcp_sock *tp = tcp_sk(sk);
168 168
@@ -192,9 +192,11 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
192 kfree_skb(pktopt); 192 kfree_skb(pktopt);
193 193
194 sk->sk_destruct = inet_sock_destruct; 194 sk->sk_destruct = inet_sock_destruct;
195#ifdef INET_REFCNT_DEBUG 195 /*
196 atomic_dec(&inet6_sock_nr); 196 * ... and add it to the refcnt debug socks count
197#endif 197 * in the new family. -acme
198 */
199 sk_refcnt_debug_inc(sk);
198 module_put(THIS_MODULE); 200 module_put(THIS_MODULE);
199 retv = 0; 201 retv = 0;
200 break; 202 break;
@@ -437,7 +439,6 @@ done:
437 } 439 }
438 case MCAST_MSFILTER: 440 case MCAST_MSFILTER:
439 { 441 {
440 extern int sysctl_optmem_max;
441 extern int sysctl_mld_max_msf; 442 extern int sysctl_mld_max_msf;
442 struct group_filter *gsf; 443 struct group_filter *gsf;
443 444
@@ -504,6 +505,9 @@ done:
504 break; 505 break;
505 case IPV6_IPSEC_POLICY: 506 case IPV6_IPSEC_POLICY:
506 case IPV6_XFRM_POLICY: 507 case IPV6_XFRM_POLICY:
508 retv = -EPERM;
509 if (!capable(CAP_NET_ADMIN))
510 break;
507 retv = xfrm_user_policy(sk, optname, optval, optlen); 511 retv = xfrm_user_policy(sk, optname, optval, optlen);
508 break; 512 break;
509 513
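
The two refcount-debug hunks above form a pair around the IPV6_ADDRFORM family switch; a minimal sketch of that pairing, with a hypothetical helper name (only sk_refcnt_debug_dec()/sk_refcnt_debug_inc() come from the patch):

	static void example_switch_family(struct sock *sk, struct proto *v4_prot)
	{
		sk_refcnt_debug_dec(sk);  /* leave the IPv6 debug socks count */
		sk->sk_prot = v4_prot;    /* socket now uses IPv4 proto ops   */
		sk_refcnt_debug_inc(sk);  /* join the IPv4 debug socks count  */
	}
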
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 5ade5a5d1990..37a4a99c9fe9 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -15,9 +15,6 @@ EXPORT_SYMBOL(ndisc_mc_map);
15EXPORT_SYMBOL(register_inet6addr_notifier); 15EXPORT_SYMBOL(register_inet6addr_notifier);
16EXPORT_SYMBOL(unregister_inet6addr_notifier); 16EXPORT_SYMBOL(unregister_inet6addr_notifier);
17EXPORT_SYMBOL(ip6_route_output); 17EXPORT_SYMBOL(ip6_route_output);
18#ifdef CONFIG_NETFILTER
19EXPORT_SYMBOL(ip6_route_me_harder);
20#endif
21EXPORT_SYMBOL(addrconf_lock); 18EXPORT_SYMBOL(addrconf_lock);
22EXPORT_SYMBOL(ipv6_setsockopt); 19EXPORT_SYMBOL(ipv6_setsockopt);
23EXPORT_SYMBOL(ipv6_getsockopt); 20EXPORT_SYMBOL(ipv6_getsockopt);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7ae72d4c9bd2..a7eae30f4554 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -812,7 +812,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
812 if (ipv6_chk_acast_addr(dev, &msg->target) || 812 if (ipv6_chk_acast_addr(dev, &msg->target) ||
813 (idev->cnf.forwarding && 813 (idev->cnf.forwarding &&
814 pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) { 814 pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) {
815 if (skb->stamp.tv_sec != LOCALLY_ENQUEUED && 815 if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
816 skb->pkt_type != PACKET_HOST && 816 skb->pkt_type != PACKET_HOST &&
817 inc != 0 && 817 inc != 0 &&
818 idev->nd_parms->proxy_delay != 0) { 818 idev->nd_parms->proxy_delay != 0) {
@@ -1487,6 +1487,8 @@ int ndisc_rcv(struct sk_buff *skb)
1487 return 0; 1487 return 0;
1488 } 1488 }
1489 1489
1490 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
1491
1490 switch (msg->icmph.icmp6_type) { 1492 switch (msg->icmph.icmp6_type) {
1491 case NDISC_NEIGHBOUR_SOLICITATION: 1493 case NDISC_NEIGHBOUR_SOLICITATION:
1492 ndisc_recv_ns(skb); 1494 ndisc_recv_ns(skb);
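
The ndisc change replaces the old trick of smuggling LOCALLY_ENQUEUED through skb->stamp with a proper per-packet control block that ndisc_rcv() zeroes on entry. A sketch of the producer side, for a caller that enqueues a locally generated solicitation (the helper is hypothetical; NEIGH_CB, struct neighbour_cb and LOCALLY_ENQUEUED are from the patch):

	static void example_mark_locally_enqueued(struct sk_buff *skb)
	{
		memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
		NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; /* skip proxy delay */
	}
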
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
new file mode 100644
index 000000000000..f8626ebf90fd
--- /dev/null
+++ b/net/ipv6/netfilter.c
@@ -0,0 +1,104 @@
1#include <linux/config.h>
2#include <linux/init.h>
3
4#ifdef CONFIG_NETFILTER
5
6#include <linux/kernel.h>
7#include <linux/ipv6.h>
8#include <linux/netfilter.h>
9#include <linux/netfilter_ipv6.h>
10#include <net/dst.h>
11#include <net/ipv6.h>
12#include <net/ip6_route.h>
13
14int ip6_route_me_harder(struct sk_buff *skb)
15{
16 struct ipv6hdr *iph = skb->nh.ipv6h;
17 struct dst_entry *dst;
18 struct flowi fl = {
19 .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
20 .nl_u =
21 { .ip6_u =
22 { .daddr = iph->daddr,
23 .saddr = iph->saddr, } },
24 .proto = iph->nexthdr,
25 };
26
27 dst = ip6_route_output(skb->sk, &fl);
28
29 if (dst->error) {
30 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
31 LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
32 dst_release(dst);
33 return -EINVAL;
34 }
35
36 /* Drop old route. */
37 dst_release(skb->dst);
38
39 skb->dst = dst;
40 return 0;
41}
42EXPORT_SYMBOL(ip6_route_me_harder);
43
44/*
45 * Extra routing may be needed on local out, as the QUEUE target never
46 * returns control to the table.
47 */
48
49struct ip6_rt_info {
50 struct in6_addr daddr;
51 struct in6_addr saddr;
52};
53
54static void save(const struct sk_buff *skb, struct nf_info *info)
55{
56 struct ip6_rt_info *rt_info = nf_info_reroute(info);
57
58 if (info->hook == NF_IP6_LOCAL_OUT) {
59 struct ipv6hdr *iph = skb->nh.ipv6h;
60
61 rt_info->daddr = iph->daddr;
62 rt_info->saddr = iph->saddr;
63 }
64}
65
66static int reroute(struct sk_buff **pskb, const struct nf_info *info)
67{
68 struct ip6_rt_info *rt_info = nf_info_reroute(info);
69
70 if (info->hook == NF_IP6_LOCAL_OUT) {
71 struct ipv6hdr *iph = (*pskb)->nh.ipv6h;
72 if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
73 !ipv6_addr_equal(&iph->saddr, &rt_info->saddr))
74 return ip6_route_me_harder(*pskb);
75 }
76 return 0;
77}
78
79static struct nf_queue_rerouter ip6_reroute = {
80 .rer_size = sizeof(struct ip6_rt_info),
81 .save = &save,
82 .reroute = &reroute,
83};
84
85int __init ipv6_netfilter_init(void)
86{
87 return nf_register_queue_rerouter(PF_INET6, &ip6_reroute);
88}
89
90void ipv6_netfilter_fini(void)
91{
92 nf_unregister_queue_rerouter(PF_INET6);
93}
94
95#else /* CONFIG_NETFILTER */
96int __init ipv6_netfilter_init(void)
97{
98 return 0;
99}
100
101void ipv6_netfilter_fini(void)
102{
103}
104#endif /* CONFIG_NETFILTER */
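
To make the intent of ip6_route_me_harder() concrete: any LOCAL_OUT path that rewrites IPv6 addresses must re-route, because skb->dst was chosen for the old header. A hypothetical caller (the rewrite helper is invented for illustration; only ip6_route_me_harder() is from this file):

	static int example_rewrite_daddr(struct sk_buff *skb,
					 const struct in6_addr *new_daddr)
	{
		ipv6_addr_copy(&skb->nh.ipv6h->daddr, new_daddr);
		return ip6_route_me_harder(skb); /* route for the new daddr */
	}
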
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 77ec704c9ee3..216fbe1ac65c 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -10,13 +10,16 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)"
10# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK 10# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK
11#fi 11#fi
12config IP6_NF_QUEUE 12config IP6_NF_QUEUE
13 tristate "Userspace queueing via NETLINK" 13 tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
14 ---help--- 14 ---help---
15 15
16 This option adds a queue handler to the kernel for IPv6 16 This option adds a queue handler to the kernel for IPv6
17 packets which lets us to receive the filtered packets 17 packets which enables users to receive the filtered packets
18 with QUEUE target using libiptc as we can do with 18 with QUEUE target using libipq.
19 the IPv4 now. 19
 20 This option enables the old IPv6-only "ip6_queue" implementation
21 which has been obsoleted by the new "nfnetlink_queue" code (see
22 CONFIG_NETFILTER_NETLINK_QUEUE).
20 23
21 (C) Fernando Anton 2001 24 (C) Fernando Anton 2001
22 IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. 25 IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
@@ -196,6 +199,16 @@ config IP6_NF_TARGET_LOG
196 199
197 To compile it as a module, choose M here. If unsure, say N. 200 To compile it as a module, choose M here. If unsure, say N.
198 201
202config IP6_NF_TARGET_REJECT
203 tristate "REJECT target support"
204 depends on IP6_NF_FILTER
205 help
206 The REJECT target allows a filtering rule to specify that an ICMPv6
207 error should be issued in response to an incoming packet, rather
208 than silently being dropped.
209
210 To compile it as a module, choose M here. If unsure, say N.
211
199# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then 212# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then
200# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER 213# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER
201# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then 214# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
@@ -226,6 +239,22 @@ config IP6_NF_TARGET_MARK
226 239
227 To compile it as a module, choose M here. If unsure, say N. 240 To compile it as a module, choose M here. If unsure, say N.
228 241
242config IP6_NF_TARGET_HL
243 tristate 'HL (hoplimit) target support'
244 depends on IP6_NF_MANGLE
245 help
246 This option adds a `HL' target, which enables the user to decrement
247 the hoplimit value of the IPv6 header or set it to a given (lower)
248 value.
249
250 While it is safe to decrement the hoplimit value, this option also
251 enables functionality to increment and set the hoplimit value of the
252 IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since
253 you can easily create immortal packets that loop forever on the
254 network.
255
256 To compile it as a module, choose M here. If unsure, say N.
257
229#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES 258#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES
230config IP6_NF_RAW 259config IP6_NF_RAW
231 tristate 'raw table support (required for TRACE)' 260 tristate 'raw table support (required for TRACE)'
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2e51714953b6..bd9a16a5cbba 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -20,7 +20,10 @@ obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o
20obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o 20obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
21obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o 21obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
22obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o 22obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o
23obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o
23obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o 24obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
24obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o 25obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
25obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o 26obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
26obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o 27obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o
28obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
29obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ip6t_NFQUEUE.o
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 750943e2d34e..aa11cf366efa 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -47,16 +47,10 @@
47#define NET_IPQ_QMAX 2088 47#define NET_IPQ_QMAX 2088
48#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" 48#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen"
49 49
50struct ipq_rt_info {
51 struct in6_addr daddr;
52 struct in6_addr saddr;
53};
54
55struct ipq_queue_entry { 50struct ipq_queue_entry {
56 struct list_head list; 51 struct list_head list;
57 struct nf_info *info; 52 struct nf_info *info;
58 struct sk_buff *skb; 53 struct sk_buff *skb;
59 struct ipq_rt_info rt_info;
60}; 54};
61 55
62typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); 56typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -76,7 +70,9 @@ static DECLARE_MUTEX(ipqnl_sem);
76static void 70static void
77ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) 71ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
78{ 72{
73 local_bh_disable();
79 nf_reinject(entry->skb, entry->info, verdict); 74 nf_reinject(entry->skb, entry->info, verdict);
75 local_bh_enable();
80 kfree(entry); 76 kfree(entry);
81} 77}
82 78
@@ -209,6 +205,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
209 break; 205 break;
210 206
211 case IPQ_COPY_PACKET: 207 case IPQ_COPY_PACKET:
208 if (entry->skb->ip_summed == CHECKSUM_HW &&
209 (*errp = skb_checksum_help(entry->skb,
210 entry->info->outdev == NULL))) {
211 read_unlock_bh(&queue_lock);
212 return NULL;
213 }
212 if (copy_range == 0 || copy_range > entry->skb->len) 214 if (copy_range == 0 || copy_range > entry->skb->len)
213 data_len = entry->skb->len; 215 data_len = entry->skb->len;
214 else 216 else
@@ -236,8 +238,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
236 238
237 pmsg->packet_id = (unsigned long )entry; 239 pmsg->packet_id = (unsigned long )entry;
238 pmsg->data_len = data_len; 240 pmsg->data_len = data_len;
239 pmsg->timestamp_sec = entry->skb->stamp.tv_sec; 241 pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
240 pmsg->timestamp_usec = entry->skb->stamp.tv_usec; 242 pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
241 pmsg->mark = entry->skb->nfmark; 243 pmsg->mark = entry->skb->nfmark;
242 pmsg->hook = entry->info->hook; 244 pmsg->hook = entry->info->hook;
243 pmsg->hw_protocol = entry->skb->protocol; 245 pmsg->hw_protocol = entry->skb->protocol;
@@ -276,7 +278,8 @@ nlmsg_failure:
276} 278}
277 279
278static int 280static int
279ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) 281ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
282 unsigned int queuenum, void *data)
280{ 283{
281 int status = -EINVAL; 284 int status = -EINVAL;
282 struct sk_buff *nskb; 285 struct sk_buff *nskb;
@@ -294,13 +297,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
294 entry->info = info; 297 entry->info = info;
295 entry->skb = skb; 298 entry->skb = skb;
296 299
297 if (entry->info->hook == NF_IP_LOCAL_OUT) {
298 struct ipv6hdr *iph = skb->nh.ipv6h;
299
300 entry->rt_info.daddr = iph->daddr;
301 entry->rt_info.saddr = iph->saddr;
302 }
303
304 nskb = ipq_build_packet_message(entry, &status); 300 nskb = ipq_build_packet_message(entry, &status);
305 if (nskb == NULL) 301 if (nskb == NULL)
306 goto err_out_free; 302 goto err_out_free;
@@ -376,22 +372,11 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
376 } 372 }
377 skb_put(e->skb, diff); 373 skb_put(e->skb, diff);
378 } 374 }
379 if (!skb_ip_make_writable(&e->skb, v->data_len)) 375 if (!skb_make_writable(&e->skb, v->data_len))
380 return -ENOMEM; 376 return -ENOMEM;
381 memcpy(e->skb->data, v->payload, v->data_len); 377 memcpy(e->skb->data, v->payload, v->data_len);
382 e->skb->nfcache |= NFC_ALTERED; 378 e->skb->ip_summed = CHECKSUM_NONE;
383 379
384 /*
385 * Extra routing may needed on local out, as the QUEUE target never
386 * returns control to the table.
387 * Not a nice way to cmp, but works
388 */
389 if (e->info->hook == NF_IP_LOCAL_OUT) {
390 struct ipv6hdr *iph = e->skb->nh.ipv6h;
391 if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) ||
392 !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr))
393 return ip6_route_me_harder(e->skb);
394 }
395 return 0; 380 return 0;
396} 381}
397 382
@@ -667,6 +652,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
667 return len; 652 return len;
668} 653}
669 654
655static struct nf_queue_handler nfqh = {
656 .name = "ip6_queue",
657 .outfn = &ipq_enqueue_packet,
658};
659
670static int 660static int
671init_or_cleanup(int init) 661init_or_cleanup(int init)
672{ 662{
@@ -677,7 +667,8 @@ init_or_cleanup(int init)
677 goto cleanup; 667 goto cleanup;
678 668
679 netlink_register_notifier(&ipq_nl_notifier); 669 netlink_register_notifier(&ipq_nl_notifier);
680 ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk); 670 ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk,
671 THIS_MODULE);
681 if (ipqnl == NULL) { 672 if (ipqnl == NULL) {
682 printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); 673 printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
683 goto cleanup_netlink_notifier; 674 goto cleanup_netlink_notifier;
@@ -694,7 +685,7 @@ init_or_cleanup(int init)
694 register_netdevice_notifier(&ipq_dev_notifier); 685 register_netdevice_notifier(&ipq_dev_notifier);
695 ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); 686 ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
696 687
697 status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL); 688 status = nf_register_queue_handler(PF_INET6, &nfqh);
698 if (status < 0) { 689 if (status < 0) {
699 printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); 690 printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
700 goto cleanup_sysctl; 691 goto cleanup_sysctl;
@@ -702,7 +693,7 @@ init_or_cleanup(int init)
702 return status; 693 return status;
703 694
704cleanup: 695cleanup:
705 nf_unregister_queue_handler(PF_INET6); 696 nf_unregister_queue_handlers(&nfqh);
706 synchronize_net(); 697 synchronize_net();
707 ipq_flush(NF_DROP); 698 ipq_flush(NF_DROP);
708 699
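
The registration shape ip6_queue migrates to, restated: the queue handler is now a named struct rather than a bare function pointer, and the output callback gained a queue number so one handler can serve several queues. A minimal sketch mirroring only the calls visible above:

	static struct nf_queue_handler example_qh = {
		.name	= "example",
		.outfn	= &ipq_enqueue_packet, /* (skb, info, queuenum, data) */
	};

	static int __init example_register(void)
	{
		return nf_register_queue_handler(PF_INET6, &example_qh);
	}
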
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 73034511c8db..1cb8adb2787f 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -401,7 +401,6 @@ ip6t_do_table(struct sk_buff **pskb,
401 do { 401 do {
402 IP_NF_ASSERT(e); 402 IP_NF_ASSERT(e);
403 IP_NF_ASSERT(back); 403 IP_NF_ASSERT(back);
404 (*pskb)->nfcache |= e->nfcache;
405 if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6, 404 if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6,
406 &protoff, &offset)) { 405 &protoff, &offset)) {
407 struct ip6t_entry_target *t; 406 struct ip6t_entry_target *t;
@@ -434,8 +433,8 @@ ip6t_do_table(struct sk_buff **pskb,
434 back->comefrom); 433 back->comefrom);
435 continue; 434 continue;
436 } 435 }
437 if (table_base + v 436 if (table_base + v != (void *)e + e->next_offset
438 != (void *)e + e->next_offset) { 437 && !(e->ipv6.flags & IP6T_F_GOTO)) {
439 /* Save old back ptr in next entry */ 438 /* Save old back ptr in next entry */
440 struct ip6t_entry *next 439 struct ip6t_entry *next
441 = (void *)e + e->next_offset; 440 = (void *)e + e->next_offset;
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
new file mode 100644
index 000000000000..8f5549b72720
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -0,0 +1,118 @@
1/*
2 * Hop Limit modification target for ip6tables
3 * Maciej Soltysiak <solt@dns.toxicfilms.tv>
4 * Based on HW's TTL module
5 *
6 * This software is distributed under the terms of GNU GPL
7 */
8
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/ip.h>
12
13#include <linux/netfilter_ipv6/ip6_tables.h>
14#include <linux/netfilter_ipv6/ip6t_HL.h>
15
16MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
17MODULE_DESCRIPTION("IP tables Hop Limit modification module");
18MODULE_LICENSE("GPL");
19
20static unsigned int ip6t_hl_target(struct sk_buff **pskb,
21 const struct net_device *in,
22 const struct net_device *out,
23 unsigned int hooknum,
24 const void *targinfo, void *userinfo)
25{
26 struct ipv6hdr *ip6h;
27 const struct ip6t_HL_info *info = targinfo;
28 u_int16_t diffs[2];
29 int new_hl;
30
31 if (!skb_make_writable(pskb, (*pskb)->len))
32 return NF_DROP;
33
34 ip6h = (*pskb)->nh.ipv6h;
35
36 switch (info->mode) {
37 case IP6T_HL_SET:
38 new_hl = info->hop_limit;
39 break;
40 case IP6T_HL_INC:
41 new_hl = ip6h->hop_limit + info->hop_limit;
42 if (new_hl > 255)
43 new_hl = 255;
44 break;
45 case IP6T_HL_DEC:
46 new_hl = ip6h->hop_limit - info->hop_limit;
47 if (new_hl < 0)
48 new_hl = 0;
49 break;
50 default:
51 new_hl = ip6h->hop_limit;
52 break;
53 }
54
55 if (new_hl != ip6h->hop_limit) {
56 diffs[0] = htons(((unsigned)ip6h->hop_limit) << 8) ^ 0xFFFF;
57 ip6h->hop_limit = new_hl;
58 diffs[1] = htons(((unsigned)ip6h->hop_limit) << 8);
59 }
60
61 return IP6T_CONTINUE;
62}
63
64static int ip6t_hl_checkentry(const char *tablename,
65 const struct ip6t_entry *e,
66 void *targinfo,
67 unsigned int targinfosize,
68 unsigned int hook_mask)
69{
70 struct ip6t_HL_info *info = targinfo;
71
72 if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_HL_info))) {
73 printk(KERN_WARNING "ip6t_HL: targinfosize %u != %Zu\n",
74 targinfosize,
75 IP6T_ALIGN(sizeof(struct ip6t_HL_info)));
76 return 0;
77 }
78
79 if (strcmp(tablename, "mangle")) {
80 printk(KERN_WARNING "ip6t_HL: can only be called from "
81 "\"mangle\" table, not \"%s\"\n", tablename);
82 return 0;
83 }
84
85 if (info->mode > IP6T_HL_MAXMODE) {
86 printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n",
87 info->mode);
88 return 0;
89 }
90
91 if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) {
92 printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't "
93 "make sense with value 0\n");
94 return 0;
95 }
96
97 return 1;
98}
99
100static struct ip6t_target ip6t_HL = {
101 .name = "HL",
102 .target = ip6t_hl_target,
103 .checkentry = ip6t_hl_checkentry,
104 .me = THIS_MODULE
105};
106
107static int __init init(void)
108{
109 return ip6t_register_target(&ip6t_HL);
110}
111
112static void __exit fini(void)
113{
114 ip6t_unregister_target(&ip6t_HL);
115}
116
117module_init(init);
118module_exit(fini);
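
Two notes on ip6t_HL.c: the diffs[] bookkeeping in ip6t_hl_target() looks vestigial, inherited from the IPv4 TTL module it is based on, since the IPv6 header carries no checksum to adjust; and the INC/DEC modes saturate rather than wrap. A standalone userspace check of the saturation arithmetic (mode constants renamed for self-containment):

	#include <stdio.h>

	enum { HL_SET, HL_INC, HL_DEC };

	static int clamp_hl(int mode, int cur, int arg)
	{
		switch (mode) {
		case HL_SET: return arg;
		case HL_INC: return cur + arg > 255 ? 255 : cur + arg;
		case HL_DEC: return cur - arg < 0 ? 0 : cur - arg;
		}
		return cur;
	}

	int main(void)
	{
		/* prints "255 0": both directions saturate */
		printf("%d %d\n", clamp_hl(HL_INC, 250, 10),
				  clamp_hl(HL_DEC, 3, 10));
		return 0;
	}
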
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index c44685e391b7..0cd1d1bd9033 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -26,10 +26,6 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
26MODULE_DESCRIPTION("IP6 tables LOG target module"); 26MODULE_DESCRIPTION("IP6 tables LOG target module");
27MODULE_LICENSE("GPL"); 27MODULE_LICENSE("GPL");
28 28
29static unsigned int nflog = 1;
30module_param(nflog, int, 0400);
31MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
32
33struct in_device; 29struct in_device;
34#include <net/route.h> 30#include <net/route.h>
35#include <linux/netfilter_ipv6/ip6t_LOG.h> 31#include <linux/netfilter_ipv6/ip6t_LOG.h>
@@ -44,7 +40,7 @@ struct in_device;
44static DEFINE_SPINLOCK(log_lock); 40static DEFINE_SPINLOCK(log_lock);
45 41
46/* One level of recursion won't kill us */ 42/* One level of recursion won't kill us */
47static void dump_packet(const struct ip6t_log_info *info, 43static void dump_packet(const struct nf_loginfo *info,
48 const struct sk_buff *skb, unsigned int ip6hoff, 44 const struct sk_buff *skb, unsigned int ip6hoff,
49 int recurse) 45 int recurse)
50{ 46{
@@ -53,6 +49,12 @@ static void dump_packet(const struct ip6t_log_info *info,
53 struct ipv6hdr _ip6h, *ih; 49 struct ipv6hdr _ip6h, *ih;
54 unsigned int ptr; 50 unsigned int ptr;
55 unsigned int hdrlen = 0; 51 unsigned int hdrlen = 0;
52 unsigned int logflags;
53
54 if (info->type == NF_LOG_TYPE_LOG)
55 logflags = info->u.log.logflags;
56 else
57 logflags = NF_LOG_MASK;
56 58
57 ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); 59 ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
58 if (ih == NULL) { 60 if (ih == NULL) {
@@ -84,7 +86,7 @@ static void dump_packet(const struct ip6t_log_info *info,
84 } 86 }
85 87
86 /* Max length: 48 "OPT (...) " */ 88 /* Max length: 48 "OPT (...) " */
87 if (info->logflags & IP6T_LOG_IPOPT) 89 if (logflags & IP6T_LOG_IPOPT)
88 printk("OPT ( "); 90 printk("OPT ( ");
89 91
90 switch (currenthdr) { 92 switch (currenthdr) {
@@ -119,7 +121,7 @@ static void dump_packet(const struct ip6t_log_info *info,
119 case IPPROTO_ROUTING: 121 case IPPROTO_ROUTING:
120 case IPPROTO_HOPOPTS: 122 case IPPROTO_HOPOPTS:
121 if (fragment) { 123 if (fragment) {
122 if (info->logflags & IP6T_LOG_IPOPT) 124 if (logflags & IP6T_LOG_IPOPT)
123 printk(")"); 125 printk(")");
124 return; 126 return;
125 } 127 }
@@ -127,7 +129,7 @@ static void dump_packet(const struct ip6t_log_info *info,
127 break; 129 break;
128 /* Max Length */ 130 /* Max Length */
129 case IPPROTO_AH: 131 case IPPROTO_AH:
130 if (info->logflags & IP6T_LOG_IPOPT) { 132 if (logflags & IP6T_LOG_IPOPT) {
131 struct ip_auth_hdr _ahdr, *ah; 133 struct ip_auth_hdr _ahdr, *ah;
132 134
133 /* Max length: 3 "AH " */ 135 /* Max length: 3 "AH " */
@@ -158,7 +160,7 @@ static void dump_packet(const struct ip6t_log_info *info,
158 hdrlen = (hp->hdrlen+2)<<2; 160 hdrlen = (hp->hdrlen+2)<<2;
159 break; 161 break;
160 case IPPROTO_ESP: 162 case IPPROTO_ESP:
161 if (info->logflags & IP6T_LOG_IPOPT) { 163 if (logflags & IP6T_LOG_IPOPT) {
162 struct ip_esp_hdr _esph, *eh; 164 struct ip_esp_hdr _esph, *eh;
163 165
164 /* Max length: 4 "ESP " */ 166 /* Max length: 4 "ESP " */
@@ -190,7 +192,7 @@ static void dump_packet(const struct ip6t_log_info *info,
190 printk("Unknown Ext Hdr %u", currenthdr); 192 printk("Unknown Ext Hdr %u", currenthdr);
191 return; 193 return;
192 } 194 }
193 if (info->logflags & IP6T_LOG_IPOPT) 195 if (logflags & IP6T_LOG_IPOPT)
194 printk(") "); 196 printk(") ");
195 197
196 currenthdr = hp->nexthdr; 198 currenthdr = hp->nexthdr;
@@ -218,7 +220,7 @@ static void dump_packet(const struct ip6t_log_info *info,
218 printk("SPT=%u DPT=%u ", 220 printk("SPT=%u DPT=%u ",
219 ntohs(th->source), ntohs(th->dest)); 221 ntohs(th->source), ntohs(th->dest));
220 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 222 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
221 if (info->logflags & IP6T_LOG_TCPSEQ) 223 if (logflags & IP6T_LOG_TCPSEQ)
222 printk("SEQ=%u ACK=%u ", 224 printk("SEQ=%u ACK=%u ",
223 ntohl(th->seq), ntohl(th->ack_seq)); 225 ntohl(th->seq), ntohl(th->ack_seq));
224 /* Max length: 13 "WINDOW=65535 " */ 226 /* Max length: 13 "WINDOW=65535 " */
@@ -245,7 +247,7 @@ static void dump_packet(const struct ip6t_log_info *info,
245 /* Max length: 11 "URGP=65535 " */ 247 /* Max length: 11 "URGP=65535 " */
246 printk("URGP=%u ", ntohs(th->urg_ptr)); 248 printk("URGP=%u ", ntohs(th->urg_ptr));
247 249
248 if ((info->logflags & IP6T_LOG_TCPOPT) 250 if ((logflags & IP6T_LOG_TCPOPT)
249 && th->doff * 4 > sizeof(struct tcphdr)) { 251 && th->doff * 4 > sizeof(struct tcphdr)) {
250 u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; 252 u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
251 unsigned int i; 253 unsigned int i;
@@ -349,7 +351,7 @@ static void dump_packet(const struct ip6t_log_info *info,
349 } 351 }
350 352
351 /* Max length: 15 "UID=4294967295 " */ 353 /* Max length: 15 "UID=4294967295 " */
352 if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) { 354 if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
353 read_lock_bh(&skb->sk->sk_callback_lock); 355 read_lock_bh(&skb->sk->sk_callback_lock);
354 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 356 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
355 printk("UID=%u ", skb->sk->sk_socket->file->f_uid); 357 printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -357,25 +359,38 @@ static void dump_packet(const struct ip6t_log_info *info,
357 } 359 }
358} 360}
359 361
362static struct nf_loginfo default_loginfo = {
363 .type = NF_LOG_TYPE_LOG,
364 .u = {
365 .log = {
366 .level = 0,
367 .logflags = NF_LOG_MASK,
368 },
369 },
370};
371
360static void 372static void
361ip6t_log_packet(unsigned int hooknum, 373ip6t_log_packet(unsigned int pf,
374 unsigned int hooknum,
362 const struct sk_buff *skb, 375 const struct sk_buff *skb,
363 const struct net_device *in, 376 const struct net_device *in,
364 const struct net_device *out, 377 const struct net_device *out,
365 const struct ip6t_log_info *loginfo, 378 const struct nf_loginfo *loginfo,
366 const char *level_string,
367 const char *prefix) 379 const char *prefix)
368{ 380{
381 if (!loginfo)
382 loginfo = &default_loginfo;
383
369 spin_lock_bh(&log_lock); 384 spin_lock_bh(&log_lock);
370 printk(level_string); 385 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
371 printk("%sIN=%s OUT=%s ", 386 prefix,
372 prefix == NULL ? loginfo->prefix : prefix,
373 in ? in->name : "", 387 in ? in->name : "",
374 out ? out->name : ""); 388 out ? out->name : "");
375 if (in && !out) { 389 if (in && !out) {
390 unsigned int len;
376 /* MAC logging for input chain only. */ 391 /* MAC logging for input chain only. */
377 printk("MAC="); 392 printk("MAC=");
378 if (skb->dev && skb->dev->hard_header_len && 393 if (skb->dev && (len = skb->dev->hard_header_len) &&
379 skb->mac.raw != skb->nh.raw) { 394 skb->mac.raw != skb->nh.raw) {
380 unsigned char *p = skb->mac.raw; 395 unsigned char *p = skb->mac.raw;
381 int i; 396 int i;
@@ -384,9 +399,11 @@ ip6t_log_packet(unsigned int hooknum,
384 (p -= ETH_HLEN) < skb->head) 399 (p -= ETH_HLEN) < skb->head)
385 p = NULL; 400 p = NULL;
386 401
387 if (p != NULL) 402 if (p != NULL) {
388 for (i = 0; i < skb->dev->hard_header_len; i++) 403 for (i = 0; i < len; i++)
389 printk("%02x", p[i]); 404 printk("%02x%s", p[i],
405 i == len - 1 ? "" : ":");
406 }
390 printk(" "); 407 printk(" ");
391 408
392 if (skb->dev->type == ARPHRD_SIT) { 409 if (skb->dev->type == ARPHRD_SIT) {
@@ -413,29 +430,17 @@ ip6t_log_target(struct sk_buff **pskb,
413 void *userinfo) 430 void *userinfo)
414{ 431{
415 const struct ip6t_log_info *loginfo = targinfo; 432 const struct ip6t_log_info *loginfo = targinfo;
416 char level_string[4] = "< >"; 433 struct nf_loginfo li;
434
435 li.type = NF_LOG_TYPE_LOG;
436 li.u.log.level = loginfo->level;
437 li.u.log.logflags = loginfo->logflags;
417 438
418 level_string[1] = '0' + (loginfo->level % 8); 439 nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix);
419 ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
420 440
421 return IP6T_CONTINUE; 441 return IP6T_CONTINUE;
422} 442}
423 443
424static void
425ip6t_logfn(unsigned int hooknum,
426 const struct sk_buff *skb,
427 const struct net_device *in,
428 const struct net_device *out,
429 const char *prefix)
430{
431 struct ip6t_log_info loginfo = {
432 .level = 0,
433 .logflags = IP6T_LOG_MASK,
434 .prefix = ""
435 };
436
437 ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
438}
439 444
440static int ip6t_log_checkentry(const char *tablename, 445static int ip6t_log_checkentry(const char *tablename,
441 const struct ip6t_entry *e, 446 const struct ip6t_entry *e,
@@ -472,20 +477,29 @@ static struct ip6t_target ip6t_log_reg = {
472 .me = THIS_MODULE, 477 .me = THIS_MODULE,
473}; 478};
474 479
480static struct nf_logger ip6t_logger = {
481 .name = "ip6t_LOG",
482 .logfn = &ip6t_log_packet,
483 .me = THIS_MODULE,
484};
485
475static int __init init(void) 486static int __init init(void)
476{ 487{
477 if (ip6t_register_target(&ip6t_log_reg)) 488 if (ip6t_register_target(&ip6t_log_reg))
478 return -EINVAL; 489 return -EINVAL;
479 if (nflog) 490 if (nf_log_register(PF_INET6, &ip6t_logger) < 0) {
480 nf_log_register(PF_INET6, &ip6t_logfn); 491 printk(KERN_WARNING "ip6t_LOG: not logging via system console "
492 "since somebody else already registered for PF_INET6\n");
493 /* we cannot make module load fail here, since otherwise
494 * ip6tables userspace would abort */
495 }
481 496
482 return 0; 497 return 0;
483} 498}
484 499
485static void __exit fini(void) 500static void __exit fini(void)
486{ 501{
487 if (nflog) 502 nf_log_unregister_logger(&ip6t_logger);
488 nf_log_unregister(PF_INET6, &ip6t_logfn);
489 ip6t_unregister_target(&ip6t_log_reg); 503 ip6t_unregister_target(&ip6t_log_reg);
490} 504}
491 505
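
The LOG rework above inverts the flow: the target no longer formats and prints on its own but fills an nf_loginfo and calls nf_log_packet(), which dispatches to whichever nf_logger is registered for the family (the console logger here, or nfnetlink_log if it won the registration). A sketch of a caller, mirroring ip6t_log_target() with made-up level and prefix:

	static void example_log(const struct sk_buff *skb, unsigned int hooknum,
				const struct net_device *in,
				const struct net_device *out)
	{
		struct nf_loginfo li = {
			.type = NF_LOG_TYPE_LOG,
			.u = { .log = { .level = 4, .logflags = NF_LOG_MASK } },
		};

		nf_log_packet(PF_INET6, hooknum, skb, in, out, &li, "example: ");
	}
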
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c
index d09ceb05013a..81924fcc5857 100644
--- a/net/ipv6/netfilter/ip6t_MARK.c
+++ b/net/ipv6/netfilter/ip6t_MARK.c
@@ -28,10 +28,9 @@ target(struct sk_buff **pskb,
28{ 28{
29 const struct ip6t_mark_target_info *markinfo = targinfo; 29 const struct ip6t_mark_target_info *markinfo = targinfo;
30 30
31 if((*pskb)->nfmark != markinfo->mark) { 31 if((*pskb)->nfmark != markinfo->mark)
32 (*pskb)->nfmark = markinfo->mark; 32 (*pskb)->nfmark = markinfo->mark;
33 (*pskb)->nfcache |= NFC_ALTERED; 33
34 }
35 return IP6T_CONTINUE; 34 return IP6T_CONTINUE;
36} 35}
37 36
diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c
new file mode 100644
index 000000000000..c6e3730e7409
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_NFQUEUE.c
@@ -0,0 +1,70 @@
1/* ip6tables module for using new netfilter netlink queue
2 *
3 * (C) 2005 by Harald Welte <laforge@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13
14#include <linux/netfilter.h>
15#include <linux/netfilter_ipv6/ip6_tables.h>
16#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("ip6tables NFQUEUE target");
20MODULE_LICENSE("GPL");
21
22static unsigned int
23target(struct sk_buff **pskb,
24 const struct net_device *in,
25 const struct net_device *out,
26 unsigned int hooknum,
27 const void *targinfo,
28 void *userinfo)
29{
30 const struct ipt_NFQ_info *tinfo = targinfo;
31
32 return NF_QUEUE_NR(tinfo->queuenum);
33}
34
35static int
36checkentry(const char *tablename,
37 const struct ip6t_entry *e,
38 void *targinfo,
39 unsigned int targinfosize,
40 unsigned int hook_mask)
41{
42 if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) {
43 printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
44 targinfosize,
45 IP6T_ALIGN(sizeof(struct ipt_NFQ_info)));
46 return 0;
47 }
48
49 return 1;
50}
51
52static struct ip6t_target ipt_NFQ_reg = {
53 .name = "NFQUEUE",
54 .target = target,
55 .checkentry = checkentry,
56 .me = THIS_MODULE,
57};
58
59static int __init init(void)
60{
61 return ip6t_register_target(&ipt_NFQ_reg);
62}
63
64static void __exit fini(void)
65{
66 ip6t_unregister_target(&ipt_NFQ_reg);
67}
68
69module_init(init);
70module_exit(fini);
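
What the NFQUEUE target actually returns, unpacked: NF_QUEUE_NR() encodes the queue number into the high bits of an NF_QUEUE verdict so the core can route the packet to the right nfnetlink queue. The bit layout below is an assumption based on the era's netfilter headers, not something this diff shows:

	#include <stdio.h>

	#define NF_QUEUE	 3
	#define VERDICT_BITS	16	/* assumed split: low 16 verdict, high 16 queue */
	#define QUEUE_NR(x)	(((x) << VERDICT_BITS) | NF_QUEUE)

	int main(void)
	{
		unsigned int v = QUEUE_NR(5);
		printf("verdict=%#x queue=%u\n", v, v >> VERDICT_BITS);
		return 0;
	}
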
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
new file mode 100644
index 000000000000..14316c3ebde4
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -0,0 +1,284 @@
1/*
2 * IP6 tables REJECT target module
3 * Linux INET6 implementation
4 *
5 * Copyright (C)2003 USAGI/WIDE Project
6 *
7 * Authors:
8 * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
9 *
10 * Based on net/ipv4/netfilter/ipt_REJECT.c
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/skbuff.h>
21#include <linux/icmpv6.h>
22#include <linux/netdevice.h>
23#include <net/ipv6.h>
24#include <net/tcp.h>
25#include <net/icmp.h>
26#include <net/ip6_checksum.h>
27#include <net/ip6_fib.h>
28#include <net/ip6_route.h>
29#include <net/flow.h>
30#include <linux/netfilter_ipv6/ip6_tables.h>
31#include <linux/netfilter_ipv6/ip6t_REJECT.h>
32
33MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
34MODULE_DESCRIPTION("IP6 tables REJECT target module");
35MODULE_LICENSE("GPL");
36
37#if 0
38#define DEBUGP printk
39#else
40#define DEBUGP(format, args...)
41#endif
42
43/* Send RST reply */
44static void send_reset(struct sk_buff *oldskb)
45{
46 struct sk_buff *nskb;
47 struct tcphdr otcph, *tcph;
48 unsigned int otcplen, hh_len;
49 int tcphoff, needs_ack;
50 struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h;
51 struct dst_entry *dst = NULL;
52 u8 proto;
53 struct flowi fl;
54
55 if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
56 (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
57 DEBUGP("ip6t_REJECT: addr is not unicast.\n");
58 return;
59 }
60
61 proto = oip6h->nexthdr;
62 tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto);
63
64 if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
65 DEBUGP("ip6t_REJECT: Can't get TCP header.\n");
66 return;
67 }
68
69 otcplen = oldskb->len - tcphoff;
70
71 /* IP header checks: fragment, too short. */
72 if ((proto != IPPROTO_TCP) || (otcplen < sizeof(struct tcphdr))) {
73 DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n",
74 proto, otcplen);
75 return;
76 }
77
78 if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr)))
79 BUG();
80
81 /* No RST for RST. */
82 if (otcph.rst) {
83 DEBUGP("ip6t_REJECT: RST is set\n");
84 return;
85 }
86
87 /* Check checksum. */
88 if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
89 skb_checksum(oldskb, tcphoff, otcplen, 0))) {
90 DEBUGP("ip6t_REJECT: TCP checksum is invalid\n");
91 return;
92 }
93
94 memset(&fl, 0, sizeof(fl));
95 fl.proto = IPPROTO_TCP;
96 ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr);
97 ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr);
98 fl.fl_ip_sport = otcph.dest;
99 fl.fl_ip_dport = otcph.source;
100 dst = ip6_route_output(NULL, &fl);
101 if (dst == NULL)
102 return;
103 if (dst->error ||
104 xfrm_lookup(&dst, &fl, NULL, 0)) {
105 dst_release(dst);
106 return;
107 }
108
109 hh_len = (dst->dev->hard_header_len + 15)&~15;
110 nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
111 + sizeof(struct tcphdr) + dst->trailer_len,
112 GFP_ATOMIC);
113
114 if (!nskb) {
115 if (net_ratelimit())
116 printk("ip6t_REJECT: Can't alloc skb\n");
117 dst_release(dst);
118 return;
119 }
120
121 nskb->dst = dst;
122
123 skb_reserve(nskb, hh_len + dst->header_len);
124
125 ip6h = nskb->nh.ipv6h = (struct ipv6hdr *)
126 skb_put(nskb, sizeof(struct ipv6hdr));
127 ip6h->version = 6;
128 ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT);
129 ip6h->nexthdr = IPPROTO_TCP;
130 ip6h->payload_len = htons(sizeof(struct tcphdr));
131 ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr);
132 ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr);
133
134 tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
135 /* Truncate to length (no data) */
136 tcph->doff = sizeof(struct tcphdr)/4;
137 tcph->source = otcph.dest;
138 tcph->dest = otcph.source;
139
140 if (otcph.ack) {
141 needs_ack = 0;
142 tcph->seq = otcph.ack_seq;
143 tcph->ack_seq = 0;
144 } else {
145 needs_ack = 1;
146 tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin
147 + otcplen - (otcph.doff<<2));
148 tcph->seq = 0;
149 }
150
151 /* Reset flags */
152 ((u_int8_t *)tcph)[13] = 0;
153 tcph->rst = 1;
154 tcph->ack = needs_ack;
155 tcph->window = 0;
156 tcph->urg_ptr = 0;
157 tcph->check = 0;
158
159 /* Adjust TCP checksum */
160 tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr,
161 &nskb->nh.ipv6h->daddr,
162 sizeof(struct tcphdr), IPPROTO_TCP,
163 csum_partial((char *)tcph,
164 sizeof(struct tcphdr), 0));
165
166 NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
167 dst_output);
168}
169
170static inline void
171send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum)
172{
173 if (hooknum == NF_IP6_LOCAL_OUT && skb_in->dev == NULL)
174 skb_in->dev = &loopback_dev;
175
176 icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL);
177}
178
179static unsigned int reject6_target(struct sk_buff **pskb,
180 const struct net_device *in,
181 const struct net_device *out,
182 unsigned int hooknum,
183 const void *targinfo,
184 void *userinfo)
185{
186 const struct ip6t_reject_info *reject = targinfo;
187
188 DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__);
189 /* WARNING: This code causes reentry within ip6tables.
190 This means that the ip6tables jump stack is now crap. We
191 must return an absolute verdict. --RR */
192 switch (reject->with) {
193 case IP6T_ICMP6_NO_ROUTE:
194 send_unreach(*pskb, ICMPV6_NOROUTE, hooknum);
195 break;
196 case IP6T_ICMP6_ADM_PROHIBITED:
197 send_unreach(*pskb, ICMPV6_ADM_PROHIBITED, hooknum);
198 break;
199 case IP6T_ICMP6_NOT_NEIGHBOUR:
200 send_unreach(*pskb, ICMPV6_NOT_NEIGHBOUR, hooknum);
201 break;
202 case IP6T_ICMP6_ADDR_UNREACH:
203 send_unreach(*pskb, ICMPV6_ADDR_UNREACH, hooknum);
204 break;
205 case IP6T_ICMP6_PORT_UNREACH:
206 send_unreach(*pskb, ICMPV6_PORT_UNREACH, hooknum);
207 break;
208 case IP6T_ICMP6_ECHOREPLY:
209 /* Do nothing */
210 break;
211 case IP6T_TCP_RESET:
212 send_reset(*pskb);
213 break;
214 default:
215 if (net_ratelimit())
216 printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with);
217 break;
218 }
219
220 return NF_DROP;
221}
222
223static int check(const char *tablename,
224 const struct ip6t_entry *e,
225 void *targinfo,
226 unsigned int targinfosize,
227 unsigned int hook_mask)
228{
229 const struct ip6t_reject_info *rejinfo = targinfo;
230
231 if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) {
232 DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize);
233 return 0;
234 }
235
236 /* Only allow these for packet filtering. */
237 if (strcmp(tablename, "filter") != 0) {
238 DEBUGP("ip6t_REJECT: bad table `%s'.\n", tablename);
239 return 0;
240 }
241
242 if ((hook_mask & ~((1 << NF_IP6_LOCAL_IN)
243 | (1 << NF_IP6_FORWARD)
244 | (1 << NF_IP6_LOCAL_OUT))) != 0) {
245 DEBUGP("ip6t_REJECT: bad hook mask %X\n", hook_mask);
246 return 0;
247 }
248
249 if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) {
250 printk("ip6t_REJECT: ECHOREPLY is not supported.\n");
251 return 0;
252 } else if (rejinfo->with == IP6T_TCP_RESET) {
253 /* Must specify that it's a TCP packet */
254 if (e->ipv6.proto != IPPROTO_TCP
255 || (e->ipv6.invflags & IP6T_INV_PROTO)) {
256 DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n");
257 return 0;
258 }
259 }
260
261 return 1;
262}
263
264static struct ip6t_target ip6t_reject_reg = {
265 .name = "REJECT",
266 .target = reject6_target,
267 .checkentry = check,
268 .me = THIS_MODULE
269};
270
271static int __init init(void)
272{
273 if (ip6t_register_target(&ip6t_reject_reg))
274 return -EINVAL;
275 return 0;
276}
277
278static void __exit fini(void)
279{
280 ip6t_unregister_target(&ip6t_reject_reg);
281}
282
283module_init(init);
284module_exit(fini);
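
A worked check of the RST sequence arithmetic in send_reset(): when the offending segment carried no ACK, the reset must acknowledge everything the peer sent, i.e. its seq plus one for SYN, one for FIN, plus the payload length (otcplen minus the header, doff<<2). Standalone, with made-up numbers:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t seq = 1000, payload = 0;
		int syn = 1, fin = 0;	/* a bare SYN, as from a blocked connect() */
		uint32_t ack_seq = seq + syn + fin + payload;

		printf("RST: seq=0 ack_seq=%u\n", ack_seq); /* 1001 */
		return 0;
	}
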
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index ab0e32d3de46..9b91decbfddb 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -20,71 +20,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
20MODULE_DESCRIPTION("IP6 tables owner matching module"); 20MODULE_DESCRIPTION("IP6 tables owner matching module");
21MODULE_LICENSE("GPL"); 21MODULE_LICENSE("GPL");
22 22
23static int
24match_pid(const struct sk_buff *skb, pid_t pid)
25{
26 struct task_struct *p;
27 struct files_struct *files;
28 int i;
29
30 read_lock(&tasklist_lock);
31 p = find_task_by_pid(pid);
32 if (!p)
33 goto out;
34 task_lock(p);
35 files = p->files;
36 if(files) {
37 spin_lock(&files->file_lock);
38 for (i=0; i < files->max_fds; i++) {
39 if (fcheck_files(files, i) == skb->sk->sk_socket->file) {
40 spin_unlock(&files->file_lock);
41 task_unlock(p);
42 read_unlock(&tasklist_lock);
43 return 1;
44 }
45 }
46 spin_unlock(&files->file_lock);
47 }
48 task_unlock(p);
49out:
50 read_unlock(&tasklist_lock);
51 return 0;
52}
53
54static int
55match_sid(const struct sk_buff *skb, pid_t sid)
56{
57 struct task_struct *g, *p;
58 struct file *file = skb->sk->sk_socket->file;
59 int i, found=0;
60
61 read_lock(&tasklist_lock);
62 do_each_thread(g, p) {
63 struct files_struct *files;
64 if (p->signal->session != sid)
65 continue;
66
67 task_lock(p);
68 files = p->files;
69 if (files) {
70 spin_lock(&files->file_lock);
71 for (i=0; i < files->max_fds; i++) {
72 if (fcheck_files(files, i) == file) {
73 found = 1;
74 break;
75 }
76 }
77 spin_unlock(&files->file_lock);
78 }
79 task_unlock(p);
80 if (found)
81 goto out;
82 } while_each_thread(g, p);
83out:
84 read_unlock(&tasklist_lock);
85
86 return found;
87}
88 23
89static int 24static int
90match(const struct sk_buff *skb, 25match(const struct sk_buff *skb,
@@ -112,18 +47,6 @@ match(const struct sk_buff *skb,
112 return 0; 47 return 0;
113 } 48 }
114 49
115 if(info->match & IP6T_OWNER_PID) {
116 if (!match_pid(skb, info->pid) ^
117 !!(info->invert & IP6T_OWNER_PID))
118 return 0;
119 }
120
121 if(info->match & IP6T_OWNER_SID) {
122 if (!match_sid(skb, info->sid) ^
123 !!(info->invert & IP6T_OWNER_SID))
124 return 0;
125 }
126
127 return 1; 50 return 1;
128} 51}
129 52
@@ -134,6 +57,8 @@ checkentry(const char *tablename,
134 unsigned int matchsize, 57 unsigned int matchsize,
135 unsigned int hook_mask) 58 unsigned int hook_mask)
136{ 59{
60 const struct ip6t_owner_info *info = matchinfo;
61
137 if (hook_mask 62 if (hook_mask
138 & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) { 63 & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) {
139 printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); 64 printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -142,14 +67,13 @@ checkentry(const char *tablename,
142 67
143 if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info))) 68 if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info)))
144 return 0; 69 return 0;
145#ifdef CONFIG_SMP 70
146 /* files->file_lock can not be used in a BH */ 71 if (info->match & (IP6T_OWNER_PID|IP6T_OWNER_SID)) {
 147 if (((struct ip6t_owner_info *)matchinfo)->match 72 printk("ip6t_owner: pid and sid matching "
148 & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { 73 "not supported anymore\n");
149 printk("ip6t_owner: pid and sid matching is broken on SMP.\n");
150 return 0; 74 return 0;
151 } 75 }
152#endif 76
153 return 1; 77 return 1;
154} 78}
155 79
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e2b848ec9851..ed3a76b30fd9 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -49,6 +49,7 @@
49#include <net/transp_v6.h> 49#include <net/transp_v6.h>
50#include <net/udp.h> 50#include <net/udp.h>
51#include <net/inet_common.h> 51#include <net/inet_common.h>
52#include <net/tcp_states.h>
52 53
53#include <net/rawv6.h> 54#include <net/rawv6.h>
54#include <net/xfrm.h> 55#include <net/xfrm.h>
@@ -81,7 +82,8 @@ static void raw_v6_unhash(struct sock *sk)
81 82
82/* Grumble... icmp and ip_input want to get at this... */ 83/* Grumble... icmp and ip_input want to get at this... */
83struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, 84struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
84 struct in6_addr *loc_addr, struct in6_addr *rmt_addr) 85 struct in6_addr *loc_addr, struct in6_addr *rmt_addr,
86 int dif)
85{ 87{
86 struct hlist_node *node; 88 struct hlist_node *node;
87 int is_multicast = ipv6_addr_is_multicast(loc_addr); 89 int is_multicast = ipv6_addr_is_multicast(loc_addr);
@@ -94,6 +96,9 @@ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
94 !ipv6_addr_equal(&np->daddr, rmt_addr)) 96 !ipv6_addr_equal(&np->daddr, rmt_addr))
95 continue; 97 continue;
96 98
99 if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
100 continue;
101
97 if (!ipv6_addr_any(&np->rcv_saddr)) { 102 if (!ipv6_addr_any(&np->rcv_saddr)) {
98 if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) 103 if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
99 goto found; 104 goto found;
@@ -137,11 +142,12 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
137 * 142 *
138 * Caller owns SKB so we must make clones. 143 * Caller owns SKB so we must make clones.
139 */ 144 */
140void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) 145int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
141{ 146{
142 struct in6_addr *saddr; 147 struct in6_addr *saddr;
143 struct in6_addr *daddr; 148 struct in6_addr *daddr;
144 struct sock *sk; 149 struct sock *sk;
150 int delivered = 0;
145 __u8 hash; 151 __u8 hash;
146 152
147 saddr = &skb->nh.ipv6h->saddr; 153 saddr = &skb->nh.ipv6h->saddr;
@@ -160,9 +166,10 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
160 if (sk == NULL) 166 if (sk == NULL)
161 goto out; 167 goto out;
162 168
163 sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr); 169 sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, IP6CB(skb)->iif);
164 170
165 while (sk) { 171 while (sk) {
172 delivered = 1;
166 if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { 173 if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) {
167 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); 174 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
168 175
@@ -170,10 +177,12 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
170 if (clone) 177 if (clone)
171 rawv6_rcv(sk, clone); 178 rawv6_rcv(sk, clone);
172 } 179 }
173 sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr); 180 sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr,
181 IP6CB(skb)->iif);
174 } 182 }
175out: 183out:
176 read_unlock(&raw_v6_lock); 184 read_unlock(&raw_v6_lock);
185 return delivered;
177} 186}
178 187
179/* This cleans up af_inet6 a bit. -DaveM */ 188/* This cleans up af_inet6 a bit. -DaveM */
@@ -328,12 +337,13 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
328 337
329 if (skb->ip_summed != CHECKSUM_UNNECESSARY) { 338 if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
330 if (skb->ip_summed == CHECKSUM_HW) { 339 if (skb->ip_summed == CHECKSUM_HW) {
340 skb_postpull_rcsum(skb, skb->nh.raw,
341 skb->h.raw - skb->nh.raw);
331 skb->ip_summed = CHECKSUM_UNNECESSARY; 342 skb->ip_summed = CHECKSUM_UNNECESSARY;
332 if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, 343 if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
333 &skb->nh.ipv6h->daddr, 344 &skb->nh.ipv6h->daddr,
334 skb->len, inet->num, skb->csum)) { 345 skb->len, inet->num, skb->csum)) {
335 LIMIT_NETDEBUG( 346 LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n");
336 printk(KERN_DEBUG "raw v6 hw csum failure.\n"));
337 skb->ip_summed = CHECKSUM_NONE; 347 skb->ip_summed = CHECKSUM_NONE;
338 } 348 }
339 } 349 }
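
The raw.c lookup change adds an interface qualifier: a socket bound to a device only matches packets that arrived on it. The predicate, restated standalone (trivial, but it is the whole semantic of the two added lines):

	static int example_dev_match(int sk_bound_dev_if, int dif)
	{
		/* an unbound socket matches any interface */
		return sk_bound_dev_if == 0 || sk_bound_dev_if == dif;
	}
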
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 59e7c6317872..9d9e04344c77 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -562,7 +562,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
562 if (skb->dev) 562 if (skb->dev)
563 fq->iif = skb->dev->ifindex; 563 fq->iif = skb->dev->ifindex;
564 skb->dev = NULL; 564 skb->dev = NULL;
565 fq->stamp = skb->stamp; 565 skb_get_timestamp(skb, &fq->stamp);
566 fq->meat += skb->len; 566 fq->meat += skb->len;
567 atomic_add(skb->truesize, &ip6_frag_mem); 567 atomic_add(skb->truesize, &ip6_frag_mem);
568 568
@@ -664,7 +664,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
664 664
665 head->next = NULL; 665 head->next = NULL;
666 head->dev = dev; 666 head->dev = dev;
667 head->stamp = fq->stamp; 667 skb_set_timestamp(head, &fq->stamp);
668 head->nh.ipv6h->payload_len = htons(payload_len); 668 head->nh.ipv6h->payload_len = htons(payload_len);
669 669
670 *skb_in = head; 670 *skb_in = head;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 878789b3122d..5d5bbb49ec78 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1372,7 +1372,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1372 * Drop the packet on the floor 1372 * Drop the packet on the floor
1373 */ 1373 */
1374 1374
1375int ip6_pkt_discard(struct sk_buff *skb) 1375static int ip6_pkt_discard(struct sk_buff *skb)
1376{ 1376{
1377 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); 1377 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1378 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev); 1378 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
@@ -1380,7 +1380,7 @@ int ip6_pkt_discard(struct sk_buff *skb)
1380 return 0; 1380 return 0;
1381} 1381}
1382 1382
1383int ip6_pkt_discard_out(struct sk_buff *skb) 1383static int ip6_pkt_discard_out(struct sk_buff *skb)
1384{ 1384{
1385 skb->dev = skb->dst->dev; 1385 skb->dev = skb->dst->dev;
1386 return ip6_pkt_discard(skb); 1386 return ip6_pkt_discard(skb);
@@ -1850,16 +1850,16 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1850 1850
1851 skb = alloc_skb(size, gfp_any()); 1851 skb = alloc_skb(size, gfp_any());
1852 if (!skb) { 1852 if (!skb) {
1853 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); 1853 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1854 return; 1854 return;
1855 } 1855 }
1856 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) { 1856 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1857 kfree_skb(skb); 1857 kfree_skb(skb);
1858 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); 1858 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
1859 return; 1859 return;
1860 } 1860 }
1861 NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE; 1861 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
1862 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any()); 1862 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
1863} 1863}
1864 1864
1865/* 1865/*
@@ -1960,8 +1960,6 @@ static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
1960 return arg.len; 1960 return arg.len;
1961} 1961}
1962 1962
1963extern struct rt6_statistics rt6_stats;
1964
1965static int rt6_stats_seq_show(struct seq_file *seq, void *v) 1963static int rt6_stats_seq_show(struct seq_file *seq, void *v)
1966{ 1964{
1967 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 1965 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index b788f55e139b..c3123c9e1a8e 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -195,7 +195,6 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int
195 dev_hold(dev); 195 dev_hold(dev);
196 196
197 ipip6_tunnel_link(nt); 197 ipip6_tunnel_link(nt);
198 /* Do not decrement MOD_USE_COUNT here. */
199 return nt; 198 return nt;
200 199
201failed: 200failed:
@@ -771,7 +770,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
771 return 0; 770 return 0;
772} 771}
773 772
774int __init ipip6_fb_tunnel_init(struct net_device *dev) 773static int __init ipip6_fb_tunnel_init(struct net_device *dev)
775{ 774{
776 struct ip_tunnel *tunnel = dev->priv; 775 struct ip_tunnel *tunnel = dev->priv;
777 struct iphdr *iph = &tunnel->parms.iph; 776 struct iphdr *iph = &tunnel->parms.iph;
@@ -794,10 +793,28 @@ static struct net_protocol sit_protocol = {
794 .err_handler = ipip6_err, 793 .err_handler = ipip6_err,
795}; 794};
796 795
796static void __exit sit_destroy_tunnels(void)
797{
798 int prio;
799
800 for (prio = 1; prio < 4; prio++) {
801 int h;
802 for (h = 0; h < HASH_SIZE; h++) {
803 struct ip_tunnel *t;
804 while ((t = tunnels[prio][h]) != NULL)
805 unregister_netdevice(t->dev);
806 }
807 }
808}
809
797void __exit sit_cleanup(void) 810void __exit sit_cleanup(void)
798{ 811{
799 inet_del_protocol(&sit_protocol, IPPROTO_IPV6); 812 inet_del_protocol(&sit_protocol, IPPROTO_IPV6);
800 unregister_netdev(ipip6_fb_tunnel_dev); 813
814 rtnl_lock();
815 sit_destroy_tunnels();
816 unregister_netdevice(ipip6_fb_tunnel_dev);
817 rtnl_unlock();
801} 818}
802 819
803int __init sit_init(void) 820int __init sit_init(void)
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 3a18e0e6ffed..8eff9fa1e983 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -14,9 +14,6 @@
14#include <net/ipv6.h> 14#include <net/ipv6.h>
15#include <net/addrconf.h> 15#include <net/addrconf.h>
16 16
17extern ctl_table ipv6_route_table[];
18extern ctl_table ipv6_icmp_table[];
19
20#ifdef CONFIG_SYSCTL 17#ifdef CONFIG_SYSCTL
21 18
22static ctl_table ipv6_table[] = { 19static ctl_table ipv6_table[] = {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f6e288dc116e..794734f1d230 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -47,6 +47,7 @@
47 47
48#include <net/tcp.h> 48#include <net/tcp.h>
49#include <net/ndisc.h> 49#include <net/ndisc.h>
50#include <net/inet6_hashtables.h>
50#include <net/ipv6.h> 51#include <net/ipv6.h>
51#include <net/transp_v6.h> 52#include <net/transp_v6.h>
52#include <net/addrconf.h> 53#include <net/addrconf.h>
@@ -75,34 +76,11 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
75static struct tcp_func ipv6_mapped; 76static struct tcp_func ipv6_mapped;
76static struct tcp_func ipv6_specific; 77static struct tcp_func ipv6_specific;
77 78
78/* I have no idea if this is a good hash for v6 or not. -DaveM */ 79static inline int tcp_v6_bind_conflict(const struct sock *sk,
79static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport, 80 const struct inet_bind_bucket *tb)
80 struct in6_addr *faddr, u16 fport)
81{ 81{
82 int hashent = (lport ^ fport); 82 const struct sock *sk2;
83 83 const struct hlist_node *node;
84 hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]);
85 hashent ^= hashent>>16;
86 hashent ^= hashent>>8;
87 return (hashent & (tcp_ehash_size - 1));
88}
89
90static __inline__ int tcp_v6_sk_hashfn(struct sock *sk)
91{
92 struct inet_sock *inet = inet_sk(sk);
93 struct ipv6_pinfo *np = inet6_sk(sk);
94 struct in6_addr *laddr = &np->rcv_saddr;
95 struct in6_addr *faddr = &np->daddr;
96 __u16 lport = inet->num;
97 __u16 fport = inet->dport;
98 return tcp_v6_hashfn(laddr, lport, faddr, fport);
99}
100
101static inline int tcp_v6_bind_conflict(struct sock *sk,
102 struct tcp_bind_bucket *tb)
103{
104 struct sock *sk2;
105 struct hlist_node *node;
106 84
107 /* We must walk the whole port owner list in this case. -DaveM */ 85 /* We must walk the whole port owner list in this case. -DaveM */
108 sk_for_each_bound(sk2, node, &tb->owners) { 86 sk_for_each_bound(sk2, node, &tb->owners) {
@@ -126,8 +104,8 @@ static inline int tcp_v6_bind_conflict(struct sock *sk,
126 */ 104 */
127static int tcp_v6_get_port(struct sock *sk, unsigned short snum) 105static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
128{ 106{
129 struct tcp_bind_hashbucket *head; 107 struct inet_bind_hashbucket *head;
130 struct tcp_bind_bucket *tb; 108 struct inet_bind_bucket *tb;
131 struct hlist_node *node; 109 struct hlist_node *node;
132 int ret; 110 int ret;
133 111
@@ -138,37 +116,42 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
138 int remaining = (high - low) + 1; 116 int remaining = (high - low) + 1;
139 int rover; 117 int rover;
140 118
141 spin_lock(&tcp_portalloc_lock); 119 spin_lock(&tcp_hashinfo.portalloc_lock);
142 if (tcp_port_rover < low) 120 if (tcp_hashinfo.port_rover < low)
143 rover = low; 121 rover = low;
144 else 122 else
145 rover = tcp_port_rover; 123 rover = tcp_hashinfo.port_rover;
146 do { rover++; 124 do { rover++;
147 if (rover > high) 125 if (rover > high)
148 rover = low; 126 rover = low;
149 head = &tcp_bhash[tcp_bhashfn(rover)]; 127 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
150 spin_lock(&head->lock); 128 spin_lock(&head->lock);
151 tb_for_each(tb, node, &head->chain) 129 inet_bind_bucket_for_each(tb, node, &head->chain)
152 if (tb->port == rover) 130 if (tb->port == rover)
153 goto next; 131 goto next;
154 break; 132 break;
155 next: 133 next:
156 spin_unlock(&head->lock); 134 spin_unlock(&head->lock);
157 } while (--remaining > 0); 135 } while (--remaining > 0);
158 tcp_port_rover = rover; 136 tcp_hashinfo.port_rover = rover;
159 spin_unlock(&tcp_portalloc_lock); 137 spin_unlock(&tcp_hashinfo.portalloc_lock);
160 138
161 /* Exhausted local port range during search? */ 139 /* Exhausted local port range during search? It is not
140 * possible for us to be holding one of the bind hash
141 * locks if this test triggers, because if 'remaining'
142 * drops to zero, we broke out of the do/while loop at
143 * the top level, not from the 'break;' statement.
144 */
162 ret = 1; 145 ret = 1;
163 if (remaining <= 0) 146 if (unlikely(remaining <= 0))
164 goto fail; 147 goto fail;
165 148
166 /* OK, here is the one we will use. */ 149 /* OK, here is the one we will use. */
167 snum = rover; 150 snum = rover;
168 } else { 151 } else {
169 head = &tcp_bhash[tcp_bhashfn(snum)]; 152 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
170 spin_lock(&head->lock); 153 spin_lock(&head->lock);
171 tb_for_each(tb, node, &head->chain) 154 inet_bind_bucket_for_each(tb, node, &head->chain)
172 if (tb->port == snum) 155 if (tb->port == snum)
173 goto tb_found; 156 goto tb_found;
174 } 157 }
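/*
 * Editor's note: inet_bhashfn() takes over from the old tcp_bhashfn(),
 * with the table size now passed explicitly instead of read from a
 * global.  The body below is my assumption of what the shared helper
 * looks like (a plain power-of-two mask); check
 * include/net/inet_hashtables.h in this tree for the authoritative
 * version.
 */
static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
{
	return lport & (bhash_size - 1);
}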
@@ -187,8 +170,11 @@ tb_found:
187 } 170 }
188tb_not_found: 171tb_not_found:
189 ret = 1; 172 ret = 1;
190 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) 173 if (tb == NULL) {
191 goto fail_unlock; 174 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
175 if (tb == NULL)
176 goto fail_unlock;
177 }
192 if (hlist_empty(&tb->owners)) { 178 if (hlist_empty(&tb->owners)) {
193 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) 179 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
194 tb->fastreuse = 1; 180 tb->fastreuse = 1;
@@ -199,9 +185,9 @@ tb_not_found:
199 tb->fastreuse = 0; 185 tb->fastreuse = 0;
200 186
201success: 187success:
202 if (!tcp_sk(sk)->bind_hash) 188 if (!inet_csk(sk)->icsk_bind_hash)
203 tcp_bind_hash(sk, tb, snum); 189 inet_bind_hash(sk, tb, snum);
204 BUG_TRAP(tcp_sk(sk)->bind_hash == tb); 190 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
205 ret = 0; 191 ret = 0;
206 192
207fail_unlock: 193fail_unlock:
@@ -219,13 +205,13 @@ static __inline__ void __tcp_v6_hash(struct sock *sk)
219 BUG_TRAP(sk_unhashed(sk)); 205 BUG_TRAP(sk_unhashed(sk));
220 206
221 if (sk->sk_state == TCP_LISTEN) { 207 if (sk->sk_state == TCP_LISTEN) {
222 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; 208 list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
223 lock = &tcp_lhash_lock; 209 lock = &tcp_hashinfo.lhash_lock;
224 tcp_listen_wlock(); 210 inet_listen_wlock(&tcp_hashinfo);
225 } else { 211 } else {
226 sk->sk_hashent = tcp_v6_sk_hashfn(sk); 212 sk->sk_hashent = inet6_sk_ehashfn(sk, tcp_hashinfo.ehash_size);
227 list = &tcp_ehash[sk->sk_hashent].chain; 213 list = &tcp_hashinfo.ehash[sk->sk_hashent].chain;
228 lock = &tcp_ehash[sk->sk_hashent].lock; 214 lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock;
229 write_lock(lock); 215 write_lock(lock);
230 } 216 }
231 217
@@ -250,131 +236,11 @@ static void tcp_v6_hash(struct sock *sk)
250 } 236 }
251} 237}
252 238
253static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif)
254{
255 struct sock *sk;
256 struct hlist_node *node;
257 struct sock *result = NULL;
258 int score, hiscore;
259
260 hiscore=0;
261 read_lock(&tcp_lhash_lock);
262 sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) {
263 if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
264 struct ipv6_pinfo *np = inet6_sk(sk);
265
266 score = 1;
267 if (!ipv6_addr_any(&np->rcv_saddr)) {
268 if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
269 continue;
270 score++;
271 }
272 if (sk->sk_bound_dev_if) {
273 if (sk->sk_bound_dev_if != dif)
274 continue;
275 score++;
276 }
277 if (score == 3) {
278 result = sk;
279 break;
280 }
281 if (score > hiscore) {
282 hiscore = score;
283 result = sk;
284 }
285 }
286 }
287 if (result)
288 sock_hold(result);
289 read_unlock(&tcp_lhash_lock);
290 return result;
291}
292
293/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
294 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
295 *
296 * The sockhash lock must be held as a reader here.
297 */
298
299static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport,
300 struct in6_addr *daddr, u16 hnum,
301 int dif)
302{
303 struct tcp_ehash_bucket *head;
304 struct sock *sk;
305 struct hlist_node *node;
306 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
307 int hash;
308
309 /* Optimize here for direct hit, only listening connections can
310 * have wildcards anyways.
311 */
312 hash = tcp_v6_hashfn(daddr, hnum, saddr, sport);
313 head = &tcp_ehash[hash];
314 read_lock(&head->lock);
315 sk_for_each(sk, node, &head->chain) {
316 /* For IPV6 do the cheaper port and family tests first. */
317 if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif))
318 goto hit; /* You sunk my battleship! */
319 }
320 /* Must check for a TIME_WAIT'er before going to listener hash. */
321 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
322 /* FIXME: acme: check this... */
323 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
324
325 if(*((__u32 *)&(tw->tw_dport)) == ports &&
326 sk->sk_family == PF_INET6) {
327 if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) &&
328 ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) &&
329 (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
330 goto hit;
331 }
332 }
333 read_unlock(&head->lock);
334 return NULL;
335
336hit:
337 sock_hold(sk);
338 read_unlock(&head->lock);
339 return sk;
340}
341
342
343static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
344 struct in6_addr *daddr, u16 hnum,
345 int dif)
346{
347 struct sock *sk;
348
349 sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif);
350
351 if (sk)
352 return sk;
353
354 return tcp_v6_lookup_listener(daddr, hnum, dif);
355}
356
357inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
358 struct in6_addr *daddr, u16 dport,
359 int dif)
360{
361 struct sock *sk;
362
363 local_bh_disable();
364 sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif);
365 local_bh_enable();
366
367 return sk;
368}
369
370EXPORT_SYMBOL_GPL(tcp_v6_lookup);
371
372
373/* 239/*
374 * Open request hash tables. 240 * Open request hash tables.
375 */ 241 */
376 242
377static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) 243static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
378{ 244{
379 u32 a, b, c; 245 u32 a, b, c;
380 246
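/*
 * Editor's sketch (not part of the patch): the scoring rule inside the
 * deleted tcp_v6_lookup_listener() above, which now lives in the shared
 * inet6_lookup_listener().  A wildcard listener scores 1; an exact
 * rcv_saddr match and a matching bound device each add a point, so a
 * score of 3 is a perfect match and the walk can stop early.
 */
static int listener_score(const struct sock *sk,
			  const struct in6_addr *daddr, const int dif)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	int score = 1;

	if (!ipv6_addr_any(&np->rcv_saddr)) {
		if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
			return -1;	/* bound to a different address */
		score++;
	}
	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;	/* bound to a different device */
		score++;
	}
	return score;
}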
@@ -394,14 +260,15 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
394 return c & (TCP_SYNQ_HSIZE - 1); 260 return c & (TCP_SYNQ_HSIZE - 1);
395} 261}
396 262
397static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp, 263static struct request_sock *tcp_v6_search_req(const struct sock *sk,
398 struct request_sock ***prevp, 264 struct request_sock ***prevp,
399 __u16 rport, 265 __u16 rport,
400 struct in6_addr *raddr, 266 struct in6_addr *raddr,
401 struct in6_addr *laddr, 267 struct in6_addr *laddr,
402 int iif) 268 int iif)
403{ 269{
404 struct listen_sock *lopt = tp->accept_queue.listen_opt; 270 const struct inet_connection_sock *icsk = inet_csk(sk);
271 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
405 struct request_sock *req, **prev; 272 struct request_sock *req, **prev;
406 273
407 for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; 274 for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
@@ -446,44 +313,48 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
446 } 313 }
447} 314}
448 315
449static int __tcp_v6_check_established(struct sock *sk, __u16 lport, 316static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
450 struct tcp_tw_bucket **twp) 317 struct inet_timewait_sock **twp)
451{ 318{
452 struct inet_sock *inet = inet_sk(sk); 319 struct inet_sock *inet = inet_sk(sk);
453 struct ipv6_pinfo *np = inet6_sk(sk); 320 const struct ipv6_pinfo *np = inet6_sk(sk);
454 struct in6_addr *daddr = &np->rcv_saddr; 321 const struct in6_addr *daddr = &np->rcv_saddr;
455 struct in6_addr *saddr = &np->daddr; 322 const struct in6_addr *saddr = &np->daddr;
456 int dif = sk->sk_bound_dev_if; 323 const int dif = sk->sk_bound_dev_if;
457 u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); 324 const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
458 int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); 325 const int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport,
459 struct tcp_ehash_bucket *head = &tcp_ehash[hash]; 326 tcp_hashinfo.ehash_size);
327 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
460 struct sock *sk2; 328 struct sock *sk2;
461 struct hlist_node *node; 329 const struct hlist_node *node;
462 struct tcp_tw_bucket *tw; 330 struct inet_timewait_sock *tw;
463 331
464 write_lock(&head->lock); 332 write_lock(&head->lock);
465 333
466 /* Check TIME-WAIT sockets first. */ 334 /* Check TIME-WAIT sockets first. */
467 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { 335 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
468 tw = (struct tcp_tw_bucket*)sk2; 336 const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
337
338 tw = inet_twsk(sk2);
469 339
470 if(*((__u32 *)&(tw->tw_dport)) == ports && 340 if(*((__u32 *)&(tw->tw_dport)) == ports &&
471 sk2->sk_family == PF_INET6 && 341 sk2->sk_family == PF_INET6 &&
472 ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && 342 ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) &&
473 ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && 343 ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) &&
474 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { 344 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
345 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
475 struct tcp_sock *tp = tcp_sk(sk); 346 struct tcp_sock *tp = tcp_sk(sk);
476 347
477 if (tw->tw_ts_recent_stamp && 348 if (tcptw->tw_ts_recent_stamp &&
478 (!twp || (sysctl_tcp_tw_reuse && 349 (!twp ||
479 xtime.tv_sec - 350 (sysctl_tcp_tw_reuse &&
480 tw->tw_ts_recent_stamp > 1))) { 351 xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
481 /* See comment in tcp_ipv4.c */ 352 /* See comment in tcp_ipv4.c */
482 tp->write_seq = tw->tw_snd_nxt + 65535 + 2; 353 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
483 if (!tp->write_seq) 354 if (!tp->write_seq)
484 tp->write_seq = 1; 355 tp->write_seq = 1;
485 tp->rx_opt.ts_recent = tw->tw_ts_recent; 356 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
486 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; 357 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
487 sock_hold(sk2); 358 sock_hold(sk2);
488 goto unique; 359 goto unique;
489 } else 360 } else
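/*
 * Editor's sketch (not part of the patch): the TIME-WAIT recycling test
 * above, pulled out as a predicate.  "now" stands for xtime.tv_sec and
 * "have_twp" for (twp != NULL); the one-second guard keeps us from
 * reusing a socket whose last timestamp might still be visible to the
 * old peer (see the comment in tcp_ipv4.c).
 */
static inline int tw_recyclable(const struct tcp_timewait_sock *tcptw,
				int have_twp, long now)
{
	return tcptw->tw_ts_recent_stamp &&
	       (!have_twp ||
		(sysctl_tcp_tw_reuse &&
		 now - tcptw->tw_ts_recent_stamp > 1));
}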
@@ -494,7 +365,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport,
494 365
495 /* And established part... */ 366 /* And established part... */
496 sk_for_each(sk2, node, &head->chain) { 367 sk_for_each(sk2, node, &head->chain) {
497 if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif)) 368 if (INET6_MATCH(sk2, saddr, daddr, ports, dif))
498 goto not_unique; 369 goto not_unique;
499 } 370 }
500 371
@@ -510,10 +381,10 @@ unique:
510 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 381 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
511 } else if (tw) { 382 } else if (tw) {
512 /* Silly. Should hash-dance instead... */ 383 /* Silly. Should hash-dance instead... */
513 tcp_tw_deschedule(tw); 384 inet_twsk_deschedule(tw, &tcp_death_row);
514 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 385 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
515 386
516 tcp_tw_put(tw); 387 inet_twsk_put(tw);
517 } 388 }
518 return 0; 389 return 0;
519 390
@@ -535,8 +406,8 @@ static inline u32 tcpv6_port_offset(const struct sock *sk)
535static int tcp_v6_hash_connect(struct sock *sk) 406static int tcp_v6_hash_connect(struct sock *sk)
536{ 407{
537 unsigned short snum = inet_sk(sk)->num; 408 unsigned short snum = inet_sk(sk)->num;
538 struct tcp_bind_hashbucket *head; 409 struct inet_bind_hashbucket *head;
539 struct tcp_bind_bucket *tb; 410 struct inet_bind_bucket *tb;
540 int ret; 411 int ret;
541 412
542 if (!snum) { 413 if (!snum) {
@@ -548,19 +419,19 @@ static int tcp_v6_hash_connect(struct sock *sk)
548 static u32 hint; 419 static u32 hint;
549 u32 offset = hint + tcpv6_port_offset(sk); 420 u32 offset = hint + tcpv6_port_offset(sk);
550 struct hlist_node *node; 421 struct hlist_node *node;
551 struct tcp_tw_bucket *tw = NULL; 422 struct inet_timewait_sock *tw = NULL;
552 423
553 local_bh_disable(); 424 local_bh_disable();
554 for (i = 1; i <= range; i++) { 425 for (i = 1; i <= range; i++) {
555 port = low + (i + offset) % range; 426 port = low + (i + offset) % range;
556 head = &tcp_bhash[tcp_bhashfn(port)]; 427 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
557 spin_lock(&head->lock); 428 spin_lock(&head->lock);
558 429
559 /* Does not bother with rcv_saddr checks, 430 /* Does not bother with rcv_saddr checks,
560 * because the established check is already 431 * because the established check is already
561 * unique enough. 432 * unique enough.
562 */ 433 */
563 tb_for_each(tb, node, &head->chain) { 434 inet_bind_bucket_for_each(tb, node, &head->chain) {
564 if (tb->port == port) { 435 if (tb->port == port) {
565 BUG_TRAP(!hlist_empty(&tb->owners)); 436 BUG_TRAP(!hlist_empty(&tb->owners));
566 if (tb->fastreuse >= 0) 437 if (tb->fastreuse >= 0)
@@ -573,7 +444,7 @@ static int tcp_v6_hash_connect(struct sock *sk)
573 } 444 }
574 } 445 }
575 446
576 tb = tcp_bucket_create(head, port); 447 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
577 if (!tb) { 448 if (!tb) {
578 spin_unlock(&head->lock); 449 spin_unlock(&head->lock);
579 break; 450 break;
@@ -592,7 +463,7 @@ ok:
592 hint += i; 463 hint += i;
593 464
594 /* Head lock still held and bh's disabled */ 465 /* Head lock still held and bh's disabled */
595 tcp_bind_hash(sk, tb, port); 466 inet_bind_hash(sk, tb, port);
596 if (sk_unhashed(sk)) { 467 if (sk_unhashed(sk)) {
597 inet_sk(sk)->sport = htons(port); 468 inet_sk(sk)->sport = htons(port);
598 __tcp_v6_hash(sk); 469 __tcp_v6_hash(sk);
@@ -600,16 +471,16 @@ ok:
600 spin_unlock(&head->lock); 471 spin_unlock(&head->lock);
601 472
602 if (tw) { 473 if (tw) {
603 tcp_tw_deschedule(tw); 474 inet_twsk_deschedule(tw, &tcp_death_row);
604 tcp_tw_put(tw); 475 inet_twsk_put(tw);
605 } 476 }
606 477
607 ret = 0; 478 ret = 0;
608 goto out; 479 goto out;
609 } 480 }
610 481
611 head = &tcp_bhash[tcp_bhashfn(snum)]; 482 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
612 tb = tcp_sk(sk)->bind_hash; 483 tb = inet_csk(sk)->icsk_bind_hash;
613 spin_lock_bh(&head->lock); 484 spin_lock_bh(&head->lock);
614 485
615 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 486 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
@@ -626,11 +497,6 @@ out:
626 } 497 }
627} 498}
628 499
629static __inline__ int tcp_v6_iif(struct sk_buff *skb)
630{
631 return IP6CB(skb)->iif;
632}
633
634static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 500static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
635 int addr_len) 501 int addr_len)
636{ 502{
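/*
 * Editor's note: the tcp_v6_iif() helper removed above is replaced by
 * inet6_iif() throughout this file.  Judging from the deleted body, the
 * shared helper is presumably the same one-liner (hedged -- the patch
 * does not show its new home):
 */
static inline int inet6_iif(const struct sk_buff *skb)
{
	return IP6CB(skb)->iif;
}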
@@ -822,14 +688,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
822 int type, int code, int offset, __u32 info) 688 int type, int code, int offset, __u32 info)
823{ 689{
824 struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; 690 struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
825 struct tcphdr *th = (struct tcphdr *)(skb->data+offset); 691 const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
826 struct ipv6_pinfo *np; 692 struct ipv6_pinfo *np;
827 struct sock *sk; 693 struct sock *sk;
828 int err; 694 int err;
829 struct tcp_sock *tp; 695 struct tcp_sock *tp;
830 __u32 seq; 696 __u32 seq;
831 697
832 sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); 698 sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr,
699 th->source, skb->dev->ifindex);
833 700
834 if (sk == NULL) { 701 if (sk == NULL) {
835 ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); 702 ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
@@ -837,7 +704,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
837 } 704 }
838 705
839 if (sk->sk_state == TCP_TIME_WAIT) { 706 if (sk->sk_state == TCP_TIME_WAIT) {
840 tcp_tw_put((struct tcp_tw_bucket*)sk); 707 inet_twsk_put((struct inet_timewait_sock *)sk);
841 return; 708 return;
842 } 709 }
843 710
@@ -915,8 +782,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
915 if (sock_owned_by_user(sk)) 782 if (sock_owned_by_user(sk))
916 goto out; 783 goto out;
917 784
918 req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr, 785 req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr,
919 &hdr->saddr, tcp_v6_iif(skb)); 786 &hdr->saddr, inet6_iif(skb));
920 if (!req) 787 if (!req)
921 goto out; 788 goto out;
922 789
@@ -930,7 +797,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
930 goto out; 797 goto out;
931 } 798 }
932 799
933 tcp_synq_drop(sk, req, prev); 800 inet_csk_reqsk_queue_drop(sk, req, prev);
934 goto out; 801 goto out;
935 802
936 case TCP_SYN_SENT: 803 case TCP_SYN_SENT:
@@ -1127,7 +994,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb)
1127 buff->csum); 994 buff->csum);
1128 995
1129 fl.proto = IPPROTO_TCP; 996 fl.proto = IPPROTO_TCP;
1130 fl.oif = tcp_v6_iif(skb); 997 fl.oif = inet6_iif(skb);
1131 fl.fl_ip_dport = t1->dest; 998 fl.fl_ip_dport = t1->dest;
1132 fl.fl_ip_sport = t1->source; 999 fl.fl_ip_sport = t1->source;
1133 1000
@@ -1196,7 +1063,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
1196 buff->csum); 1063 buff->csum);
1197 1064
1198 fl.proto = IPPROTO_TCP; 1065 fl.proto = IPPROTO_TCP;
1199 fl.oif = tcp_v6_iif(skb); 1066 fl.oif = inet6_iif(skb);
1200 fl.fl_ip_dport = t1->dest; 1067 fl.fl_ip_dport = t1->dest;
1201 fl.fl_ip_sport = t1->source; 1068 fl.fl_ip_sport = t1->source;
1202 1069
@@ -1215,12 +1082,14 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
1215 1082
1216static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) 1083static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
1217{ 1084{
1218 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; 1085 struct inet_timewait_sock *tw = inet_twsk(sk);
1086 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1219 1087
1220 tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, 1088 tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1221 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); 1089 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1090 tcptw->tw_ts_recent);
1222 1091
1223 tcp_tw_put(tw); 1092 inet_twsk_put(tw);
1224} 1093}
1225 1094
1226static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) 1095static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
@@ -1232,28 +1101,25 @@ static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1232static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) 1101static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
1233{ 1102{
1234 struct request_sock *req, **prev; 1103 struct request_sock *req, **prev;
1235 struct tcphdr *th = skb->h.th; 1104 const struct tcphdr *th = skb->h.th;
1236 struct tcp_sock *tp = tcp_sk(sk);
1237 struct sock *nsk; 1105 struct sock *nsk;
1238 1106
1239 /* Find possible connection requests. */ 1107 /* Find possible connection requests. */
1240 req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr, 1108 req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr,
1241 &skb->nh.ipv6h->daddr, tcp_v6_iif(skb)); 1109 &skb->nh.ipv6h->daddr, inet6_iif(skb));
1242 if (req) 1110 if (req)
1243 return tcp_check_req(sk, skb, req, prev); 1111 return tcp_check_req(sk, skb, req, prev);
1244 1112
1245 nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr, 1113 nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr,
1246 th->source, 1114 th->source, &skb->nh.ipv6h->daddr,
1247 &skb->nh.ipv6h->daddr, 1115 ntohs(th->dest), inet6_iif(skb));
1248 ntohs(th->dest),
1249 tcp_v6_iif(skb));
1250 1116
1251 if (nsk) { 1117 if (nsk) {
1252 if (nsk->sk_state != TCP_TIME_WAIT) { 1118 if (nsk->sk_state != TCP_TIME_WAIT) {
1253 bh_lock_sock(nsk); 1119 bh_lock_sock(nsk);
1254 return nsk; 1120 return nsk;
1255 } 1121 }
1256 tcp_tw_put((struct tcp_tw_bucket*)nsk); 1122 inet_twsk_put((struct inet_timewait_sock *)nsk);
1257 return NULL; 1123 return NULL;
1258 } 1124 }
1259 1125
@@ -1266,12 +1132,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
1266 1132
1267static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) 1133static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
1268{ 1134{
1269 struct tcp_sock *tp = tcp_sk(sk); 1135 struct inet_connection_sock *icsk = inet_csk(sk);
1270 struct listen_sock *lopt = tp->accept_queue.listen_opt; 1136 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
1271 u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); 1137 const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
1272 1138
1273 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); 1139 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
1274 tcp_synq_added(sk); 1140 inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
1275} 1141}
1276 1142
1277 1143
@@ -1296,13 +1162,13 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1296 /* 1162 /*
1297 * There are no SYN attacks on IPv6, yet... 1163 * There are no SYN attacks on IPv6, yet...
1298 */ 1164 */
1299 if (tcp_synq_is_full(sk) && !isn) { 1165 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1300 if (net_ratelimit()) 1166 if (net_ratelimit())
1301 printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); 1167 printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n");
1302 goto drop; 1168 goto drop;
1303 } 1169 }
1304 1170
1305 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) 1171 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1306 goto drop; 1172 goto drop;
1307 1173
1308 req = reqsk_alloc(&tcp6_request_sock_ops); 1174 req = reqsk_alloc(&tcp6_request_sock_ops);
@@ -1334,7 +1200,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1334 /* So that link locals have meaning */ 1200 /* So that link locals have meaning */
1335 if (!sk->sk_bound_dev_if && 1201 if (!sk->sk_bound_dev_if &&
1336 ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) 1202 ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
1337 treq->iif = tcp_v6_iif(skb); 1203 treq->iif = inet6_iif(skb);
1338 1204
1339 if (isn == 0) 1205 if (isn == 0)
1340 isn = tcp_v6_init_sequence(sk,skb); 1206 isn = tcp_v6_init_sequence(sk,skb);
@@ -1399,15 +1265,14 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1399 newsk->sk_backlog_rcv = tcp_v4_do_rcv; 1265 newsk->sk_backlog_rcv = tcp_v4_do_rcv;
1400 newnp->pktoptions = NULL; 1266 newnp->pktoptions = NULL;
1401 newnp->opt = NULL; 1267 newnp->opt = NULL;
1402 newnp->mcast_oif = tcp_v6_iif(skb); 1268 newnp->mcast_oif = inet6_iif(skb);
1403 newnp->mcast_hops = skb->nh.ipv6h->hop_limit; 1269 newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
1404 1270
1405 /* Charge newly allocated IPv6 socket. Though it is mapped, 1271 /*
1406 * it is IPv6 yet. 1272 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
1273 * here, tcp_create_openreq_child now does this for us, see the comment in
1274 * that function for the gory details. -acme
1407 */ 1275 */
1408#ifdef INET_REFCNT_DEBUG
1409 atomic_inc(&inet6_sock_nr);
1410#endif
1411 1276
1412 /* It is a tricky place. Until this moment IPv4 tcp 1277
1413 worked with IPv6 af_tcp.af_specific. 1278 worked with IPv6 af_tcp.af_specific.
@@ -1462,10 +1327,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1462 if (newsk == NULL) 1327 if (newsk == NULL)
1463 goto out; 1328 goto out;
1464 1329
1465 /* Charge newly allocated IPv6 socket */ 1330 /*
1466#ifdef INET_REFCNT_DEBUG 1331 * No need to charge this sock to the relevant IPv6 refcnt debug socks
1467 atomic_inc(&inet6_sock_nr); 1332 * count here, tcp_create_openreq_child now does this for us, see the
1468#endif 1333 * comment in that function for the gory details. -acme
1334 */
1469 1335
1470 ip6_dst_store(newsk, dst, NULL); 1336 ip6_dst_store(newsk, dst, NULL);
1471 newsk->sk_route_caps = dst->dev->features & 1337 newsk->sk_route_caps = dst->dev->features &
@@ -1504,7 +1370,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1504 skb_set_owner_r(newnp->pktoptions, newsk); 1370 skb_set_owner_r(newnp->pktoptions, newsk);
1505 } 1371 }
1506 newnp->opt = NULL; 1372 newnp->opt = NULL;
1507 newnp->mcast_oif = tcp_v6_iif(skb); 1373 newnp->mcast_oif = inet6_iif(skb);
1508 newnp->mcast_hops = skb->nh.ipv6h->hop_limit; 1374 newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
1509 1375
1510 /* Clone native IPv6 options from listening socket (if any) 1376 /* Clone native IPv6 options from listening socket (if any)
@@ -1531,7 +1397,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1531 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; 1397 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
1532 1398
1533 __tcp_v6_hash(newsk); 1399 __tcp_v6_hash(newsk);
1534 tcp_inherit_port(sk, newsk); 1400 inet_inherit_port(&tcp_hashinfo, sk, newsk);
1535 1401
1536 return newsk; 1402 return newsk;
1537 1403
@@ -1552,7 +1418,7 @@ static int tcp_v6_checksum_init(struct sk_buff *skb)
1552 if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, 1418 if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
1553 &skb->nh.ipv6h->daddr,skb->csum)) 1419 &skb->nh.ipv6h->daddr,skb->csum))
1554 return 0; 1420 return 0;
1555 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n")); 1421 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n");
1556 } 1422 }
1557 if (skb->len <= 76) { 1423 if (skb->len <= 76) {
1558 if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, 1424 if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
@@ -1679,7 +1545,7 @@ ipv6_pktoptions:
1679 if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && 1545 if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
1680 !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { 1546 !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
1681 if (np->rxopt.bits.rxinfo) 1547 if (np->rxopt.bits.rxinfo)
1682 np->mcast_oif = tcp_v6_iif(opt_skb); 1548 np->mcast_oif = inet6_iif(opt_skb);
1683 if (np->rxopt.bits.rxhlim) 1549 if (np->rxopt.bits.rxhlim)
1684 np->mcast_hops = opt_skb->nh.ipv6h->hop_limit; 1550 np->mcast_hops = opt_skb->nh.ipv6h->hop_limit;
1685 if (ipv6_opt_accepted(sk, opt_skb)) { 1551 if (ipv6_opt_accepted(sk, opt_skb)) {
@@ -1734,8 +1600,9 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
1734 TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h); 1600 TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h);
1735 TCP_SKB_CB(skb)->sacked = 0; 1601 TCP_SKB_CB(skb)->sacked = 0;
1736 1602
1737 sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source, 1603 sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source,
1738 &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); 1604 &skb->nh.ipv6h->daddr, ntohs(th->dest),
1605 inet6_iif(skb));
1739 1606
1740 if (!sk) 1607 if (!sk)
1741 goto no_tcp_socket; 1608 goto no_tcp_socket;
@@ -1790,26 +1657,29 @@ discard_and_relse:
1790 1657
1791do_time_wait: 1658do_time_wait:
1792 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { 1659 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1793 tcp_tw_put((struct tcp_tw_bucket *) sk); 1660 inet_twsk_put((struct inet_timewait_sock *)sk);
1794 goto discard_it; 1661 goto discard_it;
1795 } 1662 }
1796 1663
1797 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { 1664 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1798 TCP_INC_STATS_BH(TCP_MIB_INERRS); 1665 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1799 tcp_tw_put((struct tcp_tw_bucket *) sk); 1666 inet_twsk_put((struct inet_timewait_sock *)sk);
1800 goto discard_it; 1667 goto discard_it;
1801 } 1668 }
1802 1669
1803 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, 1670 switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1804 skb, th, skb->len)) { 1671 skb, th)) {
1805 case TCP_TW_SYN: 1672 case TCP_TW_SYN:
1806 { 1673 {
1807 struct sock *sk2; 1674 struct sock *sk2;
1808 1675
1809 sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); 1676 sk2 = inet6_lookup_listener(&tcp_hashinfo,
1677 &skb->nh.ipv6h->daddr,
1678 ntohs(th->dest), inet6_iif(skb));
1810 if (sk2 != NULL) { 1679 if (sk2 != NULL) {
1811 tcp_tw_deschedule((struct tcp_tw_bucket *)sk); 1680 struct inet_timewait_sock *tw = inet_twsk(sk);
1812 tcp_tw_put((struct tcp_tw_bucket *)sk); 1681 inet_twsk_deschedule(tw, &tcp_death_row);
1682 inet_twsk_put(tw);
1813 sk = sk2; 1683 sk = sk2;
1814 goto process; 1684 goto process;
1815 } 1685 }
@@ -1978,7 +1848,7 @@ static struct tcp_func ipv6_specific = {
1978static struct tcp_func ipv6_mapped = { 1848static struct tcp_func ipv6_mapped = {
1979 .queue_xmit = ip_queue_xmit, 1849 .queue_xmit = ip_queue_xmit,
1980 .send_check = tcp_v4_send_check, 1850 .send_check = tcp_v4_send_check,
1981 .rebuild_header = tcp_v4_rebuild_header, 1851 .rebuild_header = inet_sk_rebuild_header,
1982 .conn_request = tcp_v6_conn_request, 1852 .conn_request = tcp_v6_conn_request,
1983 .syn_recv_sock = tcp_v6_syn_recv_sock, 1853 .syn_recv_sock = tcp_v6_syn_recv_sock,
1984 .remember_stamp = tcp_v4_remember_stamp, 1854 .remember_stamp = tcp_v4_remember_stamp,
@@ -1997,13 +1867,14 @@ static struct tcp_func ipv6_mapped = {
1997 */ 1867 */
1998static int tcp_v6_init_sock(struct sock *sk) 1868static int tcp_v6_init_sock(struct sock *sk)
1999{ 1869{
1870 struct inet_connection_sock *icsk = inet_csk(sk);
2000 struct tcp_sock *tp = tcp_sk(sk); 1871 struct tcp_sock *tp = tcp_sk(sk);
2001 1872
2002 skb_queue_head_init(&tp->out_of_order_queue); 1873 skb_queue_head_init(&tp->out_of_order_queue);
2003 tcp_init_xmit_timers(sk); 1874 tcp_init_xmit_timers(sk);
2004 tcp_prequeue_init(tp); 1875 tcp_prequeue_init(tp);
2005 1876
2006 tp->rto = TCP_TIMEOUT_INIT; 1877 icsk->icsk_rto = TCP_TIMEOUT_INIT;
2007 tp->mdev = TCP_TIMEOUT_INIT; 1878 tp->mdev = TCP_TIMEOUT_INIT;
2008 1879
2009 /* So many TCP implementations out there (incorrectly) count the 1880 /* So many TCP implementations out there (incorrectly) count the
@@ -2025,7 +1896,7 @@ static int tcp_v6_init_sock(struct sock *sk)
2025 sk->sk_state = TCP_CLOSE; 1896 sk->sk_state = TCP_CLOSE;
2026 1897
2027 tp->af_specific = &ipv6_specific; 1898 tp->af_specific = &ipv6_specific;
2028 tp->ca_ops = &tcp_init_congestion_ops; 1899 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
2029 sk->sk_write_space = sk_stream_write_space; 1900 sk->sk_write_space = sk_stream_write_space;
2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 1901 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2031 1902
@@ -2039,8 +1910,6 @@ static int tcp_v6_init_sock(struct sock *sk)
2039 1910
2040static int tcp_v6_destroy_sock(struct sock *sk) 1911static int tcp_v6_destroy_sock(struct sock *sk)
2041{ 1912{
2042 extern int tcp_v4_destroy_sock(struct sock *sk);
2043
2044 tcp_v4_destroy_sock(sk); 1913 tcp_v4_destroy_sock(sk);
2045 return inet6_destroy_sock(sk); 1914 return inet6_destroy_sock(sk);
2046} 1915}
@@ -2086,18 +1955,20 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
2086 unsigned long timer_expires; 1955 unsigned long timer_expires;
2087 struct inet_sock *inet = inet_sk(sp); 1956 struct inet_sock *inet = inet_sk(sp);
2088 struct tcp_sock *tp = tcp_sk(sp); 1957 struct tcp_sock *tp = tcp_sk(sp);
1958 const struct inet_connection_sock *icsk = inet_csk(sp);
2089 struct ipv6_pinfo *np = inet6_sk(sp); 1959 struct ipv6_pinfo *np = inet6_sk(sp);
2090 1960
2091 dest = &np->daddr; 1961 dest = &np->daddr;
2092 src = &np->rcv_saddr; 1962 src = &np->rcv_saddr;
2093 destp = ntohs(inet->dport); 1963 destp = ntohs(inet->dport);
2094 srcp = ntohs(inet->sport); 1964 srcp = ntohs(inet->sport);
2095 if (tp->pending == TCP_TIME_RETRANS) { 1965
1966 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2096 timer_active = 1; 1967 timer_active = 1;
2097 timer_expires = tp->timeout; 1968 timer_expires = icsk->icsk_timeout;
2098 } else if (tp->pending == TCP_TIME_PROBE0) { 1969 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2099 timer_active = 4; 1970 timer_active = 4;
2100 timer_expires = tp->timeout; 1971 timer_expires = icsk->icsk_timeout;
2101 } else if (timer_pending(&sp->sk_timer)) { 1972 } else if (timer_pending(&sp->sk_timer)) {
2102 timer_active = 2; 1973 timer_active = 2;
2103 timer_expires = sp->sk_timer.expires; 1974 timer_expires = sp->sk_timer.expires;
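/*
 * Editor's sketch (not part of the patch): the timer_active codes that
 * get_tcp6_sock() emits for /proc/net/tcp6, restated as a helper.  The
 * three cases are exactly those visible in the hunk above.
 */
static int timer_active_code(const struct inet_connection_sock *icsk,
			     const struct sock *sp)
{
	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
		return 1;	/* retransmit timer armed */
	if (icsk->icsk_pending == ICSK_TIME_PROBE0)
		return 4;	/* zero-window probe timer armed */
	if (timer_pending(&sp->sk_timer))
		return 2;	/* sk_timer (keepalive etc.) armed */
	return 0;
}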
@@ -2118,28 +1989,31 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
2118 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, 1989 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2119 timer_active, 1990 timer_active,
2120 jiffies_to_clock_t(timer_expires - jiffies), 1991 jiffies_to_clock_t(timer_expires - jiffies),
2121 tp->retransmits, 1992 icsk->icsk_retransmits,
2122 sock_i_uid(sp), 1993 sock_i_uid(sp),
2123 tp->probes_out, 1994 icsk->icsk_probes_out,
2124 sock_i_ino(sp), 1995 sock_i_ino(sp),
2125 atomic_read(&sp->sk_refcnt), sp, 1996 atomic_read(&sp->sk_refcnt), sp,
2126 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, 1997 icsk->icsk_rto,
1998 icsk->icsk_ack.ato,
1999 (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong,
2127 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh 2000 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2128 ); 2001 );
2129} 2002}
2130 2003
2131static void get_timewait6_sock(struct seq_file *seq, 2004static void get_timewait6_sock(struct seq_file *seq,
2132 struct tcp_tw_bucket *tw, int i) 2005 struct inet_timewait_sock *tw, int i)
2133{ 2006{
2134 struct in6_addr *dest, *src; 2007 struct in6_addr *dest, *src;
2135 __u16 destp, srcp; 2008 __u16 destp, srcp;
2009 struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
2136 int ttd = tw->tw_ttd - jiffies; 2010 int ttd = tw->tw_ttd - jiffies;
2137 2011
2138 if (ttd < 0) 2012 if (ttd < 0)
2139 ttd = 0; 2013 ttd = 0;
2140 2014
2141 dest = &tw->tw_v6_daddr; 2015 dest = &tcp6tw->tw_v6_daddr;
2142 src = &tw->tw_v6_rcv_saddr; 2016 src = &tcp6tw->tw_v6_rcv_saddr;
2143 destp = ntohs(tw->tw_dport); 2017 destp = ntohs(tw->tw_dport);
2144 srcp = ntohs(tw->tw_sport); 2018 srcp = ntohs(tw->tw_sport);
2145 2019
@@ -2214,7 +2088,7 @@ struct proto tcpv6_prot = {
2214 .close = tcp_close, 2088 .close = tcp_close,
2215 .connect = tcp_v6_connect, 2089 .connect = tcp_v6_connect,
2216 .disconnect = tcp_disconnect, 2090 .disconnect = tcp_disconnect,
2217 .accept = tcp_accept, 2091 .accept = inet_csk_accept,
2218 .ioctl = tcp_ioctl, 2092 .ioctl = tcp_ioctl,
2219 .init = tcp_v6_init_sock, 2093 .init = tcp_v6_init_sock,
2220 .destroy = tcp_v6_destroy_sock, 2094 .destroy = tcp_v6_destroy_sock,
@@ -2231,11 +2105,13 @@ struct proto tcpv6_prot = {
2231 .sockets_allocated = &tcp_sockets_allocated, 2105 .sockets_allocated = &tcp_sockets_allocated,
2232 .memory_allocated = &tcp_memory_allocated, 2106 .memory_allocated = &tcp_memory_allocated,
2233 .memory_pressure = &tcp_memory_pressure, 2107 .memory_pressure = &tcp_memory_pressure,
2108 .orphan_count = &tcp_orphan_count,
2234 .sysctl_mem = sysctl_tcp_mem, 2109 .sysctl_mem = sysctl_tcp_mem,
2235 .sysctl_wmem = sysctl_tcp_wmem, 2110 .sysctl_wmem = sysctl_tcp_wmem,
2236 .sysctl_rmem = sysctl_tcp_rmem, 2111 .sysctl_rmem = sysctl_tcp_rmem,
2237 .max_header = MAX_TCP_HEADER, 2112 .max_header = MAX_TCP_HEADER,
2238 .obj_size = sizeof(struct tcp6_sock), 2113 .obj_size = sizeof(struct tcp6_sock),
2114 .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
2239 .rsk_prot = &tcp6_request_sock_ops, 2115 .rsk_prot = &tcp6_request_sock_ops,
2240}; 2116};
2241 2117
@@ -2245,8 +2121,6 @@ static struct inet6_protocol tcpv6_protocol = {
2245 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, 2121 .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
2246}; 2122};
2247 2123
2248extern struct proto_ops inet6_stream_ops;
2249
2250static struct inet_protosw tcpv6_protosw = { 2124static struct inet_protosw tcpv6_protosw = {
2251 .type = SOCK_STREAM, 2125 .type = SOCK_STREAM,
2252 .protocol = IPPROTO_TCP, 2126 .protocol = IPPROTO_TCP,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index eff050ac7049..390d750449ce 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -51,6 +51,7 @@
51#include <net/udp.h> 51#include <net/udp.h>
52#include <net/raw.h> 52#include <net/raw.h>
53#include <net/inet_common.h> 53#include <net/inet_common.h>
54#include <net/tcp_states.h>
54 55
55#include <net/ip6_checksum.h> 56#include <net/ip6_checksum.h>
56#include <net/xfrm.h> 57#include <net/xfrm.h>
@@ -58,7 +59,7 @@
58#include <linux/proc_fs.h> 59#include <linux/proc_fs.h>
59#include <linux/seq_file.h> 60#include <linux/seq_file.h>
60 61
61DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); 62DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
62 63
63/* Grrr, addr_type already calculated by caller, but I don't want 64/* Grrr, addr_type already calculated by caller, but I don't want
64 * to add some silly "cookie" argument to this method just for that. 65 * to add some silly "cookie" argument to this method just for that.
@@ -477,8 +478,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
477 /* RFC 2460 section 8.1 says that we SHOULD log 478 /* RFC 2460 section 8.1 says that we SHOULD log
478 this error. Well, it is reasonable. 479 this error. Well, it is reasonable.
479 */ 480 */
480 LIMIT_NETDEBUG( 481 LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n");
481 printk(KERN_INFO "IPv6: udp checksum is 0\n"));
482 goto discard; 482 goto discard;
483 } 483 }
484 484
@@ -493,7 +493,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
493 if (skb->ip_summed==CHECKSUM_HW) { 493 if (skb->ip_summed==CHECKSUM_HW) {
494 skb->ip_summed = CHECKSUM_UNNECESSARY; 494 skb->ip_summed = CHECKSUM_UNNECESSARY;
495 if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { 495 if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) {
496 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n")); 496 LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n");
497 skb->ip_summed = CHECKSUM_NONE; 497 skb->ip_summed = CHECKSUM_NONE;
498 } 498 }
499 } 499 }
@@ -825,7 +825,7 @@ back_from_confirm:
825 /* ... which is an evident application bug. --ANK */ 825 /* ... which is an evident application bug. --ANK */
826 release_sock(sk); 826 release_sock(sk);
827 827
828 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); 828 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
829 err = -EINVAL; 829 err = -EINVAL;
830 goto out; 830 goto out;
831 } 831 }
@@ -1054,8 +1054,6 @@ struct proto udpv6_prot = {
1054 .obj_size = sizeof(struct udp6_sock), 1054 .obj_size = sizeof(struct udp6_sock),
1055}; 1055};
1056 1056
1057extern struct proto_ops inet6_dgram_ops;
1058
1059static struct inet_protosw udpv6_protosw = { 1057static struct inet_protosw udpv6_protosw = {
1060 .type = SOCK_DGRAM, 1058 .type = SOCK_DGRAM,
1061 .protocol = IPPROTO_UDP, 1059 .protocol = IPPROTO_UDP,
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 60c26c87277e..fbef7826a74f 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -79,7 +79,7 @@ static u32 xfrm6_tunnel_spi;
79#define XFRM6_TUNNEL_SPI_MIN 1 79#define XFRM6_TUNNEL_SPI_MIN 1
80#define XFRM6_TUNNEL_SPI_MAX 0xffffffff 80#define XFRM6_TUNNEL_SPI_MAX 0xffffffff
81 81
82static kmem_cache_t *xfrm6_tunnel_spi_kmem; 82static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly;
83 83
84#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 84#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
85#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 85#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index a16237c0e783..980a826f5d02 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -1,6 +1,39 @@
1# 1#
2# IPX configuration 2# IPX configuration
3# 3#
4config IPX
5 tristate "The IPX protocol"
6 select LLC
7 ---help---
8 This is support for the Novell networking protocol, IPX, commonly
9 used for local networks of Windows machines. You need it if you
10 want to access Novell NetWare file or print servers using the Linux
11 Novell client ncpfs (available from
12 <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
13 within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>). In order
15 to do the former, you'll also have to say Y to "NCP file system
16 support", below.
17
18 IPX is similar in scope to IP, while SPX, which runs on top of IPX,
19 is similar to TCP. There is also experimental support for SPX in
20 Linux (see "SPX networking", below).
21
22 To turn your Linux box into a fully featured NetWare file server and
23 IPX router, say Y here and fetch either lwared from
24 <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
25 mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
26 information, read the IPX-HOWTO available from
27 <http://www.tldp.org/docs.html#howto>.
28
29 General information about how to connect Linux, Windows machines and
30 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
31
32 The IPX driver would enlarge your kernel by about 16 KB. To compile
33 this driver as a module, choose M here: the module will be called ipx.
34 Unless you want to integrate your Linux box with a local Novell
35 network, say N.
36
4config IPX_INTERN 37config IPX_INTERN
5 bool "IPX: Full internal IPX network" 38 bool "IPX: Full internal IPX network"
6 depends on IPX 39 depends on IPX
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 5a27e5df5886..34b3bb868409 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -44,7 +44,6 @@
44#include <linux/socket.h> 44#include <linux/socket.h>
45#include <linux/sockios.h> 45#include <linux/sockios.h>
46#include <linux/string.h> 46#include <linux/string.h>
47#include <linux/tcp.h>
48#include <linux/types.h> 47#include <linux/types.h>
49#include <linux/termios.h> 48#include <linux/termios.h>
50 49
@@ -52,6 +51,7 @@
52#include <net/p8022.h> 51#include <net/p8022.h>
53#include <net/psnap.h> 52#include <net/psnap.h>
54#include <net/sock.h> 53#include <net/sock.h>
54#include <net/tcp_states.h>
55 55
56#include <asm/uaccess.h> 56#include <asm/uaccess.h>
57 57
@@ -1627,7 +1627,7 @@ out:
1627 return rc; 1627 return rc;
1628} 1628}
1629 1629
1630static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 1630static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
1631{ 1631{
1632 /* NULL here for pt means the packet was looped back */ 1632 /* NULL here for pt means the packet was looped back */
1633 struct ipx_interface *intrfc; 1633 struct ipx_interface *intrfc;
@@ -1796,8 +1796,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
1796 copied); 1796 copied);
1797 if (rc) 1797 if (rc)
1798 goto out_free; 1798 goto out_free;
1799 if (skb->stamp.tv_sec) 1799 if (skb->tstamp.off_sec)
1800 sk->sk_stamp = skb->stamp; 1800 skb_get_timestamp(skb, &sk->sk_stamp);
1801 1801
1802 msg->msg_namelen = sizeof(*sipx); 1802 msg->msg_namelen = sizeof(*sipx);
1803 1803
@@ -1940,9 +1940,7 @@ static struct notifier_block ipx_dev_notifier = {
1940}; 1940};
1941 1941
1942extern struct datalink_proto *make_EII_client(void); 1942extern struct datalink_proto *make_EII_client(void);
1943extern struct datalink_proto *make_8023_client(void);
1944extern void destroy_EII_client(struct datalink_proto *); 1943extern void destroy_EII_client(struct datalink_proto *);
1945extern void destroy_8023_client(struct datalink_proto *);
1946 1944
1947static unsigned char ipx_8022_type = 0xE0; 1945static unsigned char ipx_8022_type = 0xE0;
1948static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; 1946static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c
index b6761913445a..1f73d9ea434d 100644
--- a/net/ipx/ipx_proc.c
+++ b/net/ipx/ipx_proc.c
@@ -10,7 +10,7 @@
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/tcp.h> 13#include <net/tcp_states.h>
14#include <net/ipx.h> 14#include <net/ipx.h>
15 15
16static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos) 16static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos)
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 92c6e8d4e731..6f92f9c62990 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -56,7 +56,7 @@
56#include <asm/uaccess.h> 56#include <asm/uaccess.h>
57 57
58#include <net/sock.h> 58#include <net/sock.h>
59#include <net/tcp.h> 59#include <net/tcp_states.h>
60 60
61#include <net/irda/af_irda.h> 61#include <net/irda/af_irda.h>
62 62
diff --git a/net/irda/irlan/irlan_filter.c b/net/irda/irlan/irlan_filter.c
index 343c5d4a1a1d..ca7d358dab52 100644
--- a/net/irda/irlan/irlan_filter.c
+++ b/net/irda/irlan/irlan_filter.c
@@ -27,6 +27,7 @@
27#include <linux/seq_file.h> 27#include <linux/seq_file.h>
28 28
29#include <net/irda/irlan_common.h> 29#include <net/irda/irlan_common.h>
30#include <net/irda/irlan_filter.h>
30 31
31/* 32/*
32 * Function irlan_filter_request (self, skb) 33 * Function irlan_filter_request (self, skb)
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 6dafbb43b529..3e9a06abbdd0 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -988,9 +988,6 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
988 IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); 988 IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__);
989 return; 989 return;
990 } 990 }
991 /* Unlink tx_skb from list */
992 tx_skb->next = tx_skb->prev = NULL;
993 tx_skb->list = NULL;
994 991
995 /* Clear old Nr field + poll bit */ 992 /* Clear old Nr field + poll bit */
996 tx_skb->data[1] &= 0x0f; 993 tx_skb->data[1] &= 0x0f;
@@ -1063,9 +1060,6 @@ void irlap_resend_rejected_frame(struct irlap_cb *self, int command)
1063 IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); 1060 IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__);
1064 return; 1061 return;
1065 } 1062 }
1066 /* Unlink tx_skb from list */
1067 tx_skb->next = tx_skb->prev = NULL;
1068 tx_skb->list = NULL;
1069 1063
1070 /* Clear old Nr field + poll bit */ 1064 /* Clear old Nr field + poll bit */
1071 tx_skb->data[1] &= 0x0f; 1065 tx_skb->data[1] &= 0x0f;
@@ -1309,7 +1303,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb,
1309 * Jean II 1303 * Jean II
1310 */ 1304 */
1311int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, 1305int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev,
1312 struct packet_type *ptype) 1306 struct packet_type *ptype, struct net_device *orig_dev)
1313{ 1307{
1314 struct irlap_info info; 1308 struct irlap_info info;
1315 struct irlap_cb *self; 1309 struct irlap_cb *self;
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 7a4a4d7fbe66..c19e9ce05a3a 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -53,7 +53,6 @@ struct irlmp_cb *irlmp = NULL;
53/* These can be altered by the sysctl interface */ 53/* These can be altered by the sysctl interface */
54int sysctl_discovery = 0; 54int sysctl_discovery = 0;
55int sysctl_discovery_timeout = 3; /* 3 seconds by default */ 55int sysctl_discovery_timeout = 3; /* 3 seconds by default */
56EXPORT_SYMBOL(sysctl_discovery_timeout);
57int sysctl_discovery_slots = 6; /* 6 slots by default */ 56int sysctl_discovery_slots = 6; /* 6 slots by default */
58int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ; 57int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ;
59char sysctl_devname[65]; 58char sysctl_devname[65];
@@ -67,7 +66,6 @@ const char *irlmp_reasons[] = {
67 "LM_INIT_DISCONNECT", 66 "LM_INIT_DISCONNECT",
68 "ERROR, NOT USED", 67 "ERROR, NOT USED",
69}; 68};
70EXPORT_SYMBOL(irlmp_reasons);
71 69
72/* 70/*
73 * Function irlmp_init (void) 71 * Function irlmp_init (void)
@@ -675,7 +673,6 @@ struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance)
675 673
676 return new; 674 return new;
677} 675}
678EXPORT_SYMBOL(irlmp_dup);
679 676
680/* 677/*
681 * Function irlmp_disconnect_request (handle, userdata) 678 * Function irlmp_disconnect_request (handle, userdata)
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
index 6ffaed4544e9..634901dd156f 100644
--- a/net/irda/irmod.c
+++ b/net/irda/irmod.c
@@ -54,7 +54,7 @@ extern int irsock_init(void);
54extern void irsock_cleanup(void); 54extern void irsock_cleanup(void);
55/* irlap_frame.c */ 55/* irlap_frame.c */
56extern int irlap_driver_rcv(struct sk_buff *, struct net_device *, 56extern int irlap_driver_rcv(struct sk_buff *, struct net_device *,
57 struct packet_type *); 57 struct packet_type *, struct net_device *);
58 58
59/* 59/*
60 * Module parameters 60 * Module parameters
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 9004f7349a76..b391cb3893d4 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -517,9 +517,6 @@ extern int
517 irda_irnet_init(void); /* Initialise IrDA part of IrNET */ 517 irda_irnet_init(void); /* Initialise IrDA part of IrNET */
518extern void 518extern void
519 irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */ 519 irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */
520/* ---------------------------- MODULE ---------------------------- */
521extern int
522 irnet_init(void); /* Initialise IrNET module */
523 520
524/**************************** VARIABLES ****************************/ 521/**************************** VARIABLES ****************************/
525 522
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index f8f984bb9922..e53bf9e0053e 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -1107,7 +1107,7 @@ ppp_irnet_cleanup(void)
1107/* 1107/*
1108 * Module main entry point 1108 * Module main entry point
1109 */ 1109 */
1110int __init 1110static int __init
1111irnet_init(void) 1111irnet_init(void)
1112{ 1112{
1113 int err; 1113 int err;
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index b0dd3ea35999..1ba8c7106639 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -822,7 +822,6 @@ void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name,
822 822
823 return entry; 823 return entry;
824} 824}
825EXPORT_SYMBOL(hashbin_find_next);
826 825
827/* 826/*
828 * Function hashbin_get_first (hashbin) 827 * Function hashbin_get_first (hashbin)
diff --git a/net/irda/qos.c b/net/irda/qos.c
index df732d56cc57..ddfb5c502a90 100644
--- a/net/irda/qos.c
+++ b/net/irda/qos.c
@@ -37,6 +37,7 @@
37#include <net/irda/parameters.h> 37#include <net/irda/parameters.h>
38#include <net/irda/qos.h> 38#include <net/irda/qos.h>
39#include <net/irda/irlap.h> 39#include <net/irda/irlap.h>
40#include <net/irda/irlap_frame.h>
40 41
41/* 42/*
42 * Maximum values of the baud rate we negociate with the other end. 43 * Maximum values of the baud rate we negociate with the other end.
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
new file mode 100644
index 000000000000..f0b5efb31a00
--- /dev/null
+++ b/net/lapb/Kconfig
@@ -0,0 +1,22 @@
1#
2# LAPB Data Link Driver
3#
4
5config LAPB
6 tristate "LAPB Data Link Driver (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
10 the lower) part of the X.25 protocol. It offers a reliable
11 connection service to exchange data frames with one other host, and
12 it is used to transport higher level protocols (mostly X.25 Packet
13 Layer, the higher part of X.25, but others are possible as well).
14 Usually, LAPB is used with specialized X.21 network cards, but Linux
15 currently supports LAPB only over Ethernet connections. If you want
16 to use LAPB connections over Ethernet, say Y here and to "LAPB over
17 Ethernet driver" below. Read
18 <file:Documentation/networking/lapb-module.txt> for technical
19 details.
20
21 To compile this driver as a module, choose M here: the
22 module will be called lapb. If unsure, say N.
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 5de05a0bc0ff..8b5eefd70f03 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -78,7 +78,7 @@ void lapb_requeue_frames(struct lapb_cb *lapb)
78 if (!skb_prev) 78 if (!skb_prev)
79 skb_queue_head(&lapb->write_queue, skb); 79 skb_queue_head(&lapb->write_queue, skb);
80 else 80 else
81 skb_append(skb_prev, skb); 81 skb_append(skb_prev, skb, &lapb->write_queue);
82 skb_prev = skb; 82 skb_prev = skb;
83 } 83 }
84} 84}
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 20b4cfebd74c..66f55e514b56 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -23,13 +23,13 @@
23#include <linux/config.h> 23#include <linux/config.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/tcp.h>
27#include <linux/rtnetlink.h> 26#include <linux/rtnetlink.h>
28#include <linux/init.h> 27#include <linux/init.h>
29#include <net/llc.h> 28#include <net/llc.h>
30#include <net/llc_sap.h> 29#include <net/llc_sap.h>
31#include <net/llc_pdu.h> 30#include <net/llc_pdu.h>
32#include <net/llc_conn.h> 31#include <net/llc_conn.h>
32#include <net/tcp_states.h>
33 33
34/* remember: uninitialized global data is zeroed because its in .bss */ 34/* remember: uninitialized global data is zeroed because its in .bss */
35static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; 35static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
@@ -714,7 +714,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
714 if (uaddr) 714 if (uaddr)
715 memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); 715 memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr));
716 msg->msg_namelen = sizeof(*uaddr); 716 msg->msg_namelen = sizeof(*uaddr);
717 if (!skb->list) { 717 if (!skb->next) {
718dgram_free: 718dgram_free:
719 kfree_skb(skb); 719 kfree_skb(skb);
720 } 720 }
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index eba812a9c69c..4c644bc70eae 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -16,7 +16,7 @@
16#include <net/llc_sap.h> 16#include <net/llc_sap.h>
17#include <net/llc_conn.h> 17#include <net/llc_conn.h>
18#include <net/sock.h> 18#include <net/sock.h>
19#include <linux/tcp.h> 19#include <net/tcp_states.h>
20#include <net/llc_c_ev.h> 20#include <net/llc_c_ev.h>
21#include <net/llc_c_ac.h> 21#include <net/llc_c_ac.h>
22#include <net/llc_c_st.h> 22#include <net/llc_c_st.h>
@@ -71,7 +71,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
71 71
72 if (!ev->ind_prim && !ev->cfm_prim) { 72 if (!ev->ind_prim && !ev->cfm_prim) {
73 /* indicate or confirm not required */ 73 /* indicate or confirm not required */
74 if (!skb->list) 74 /* XXX this is not very pretty, perhaps we should store
75 * XXX indicate/confirm-needed state in the llc_conn_state_ev
76 * XXX control block of the SKB instead? -DaveM
77 */
78 if (!skb->next)
75 goto out_kfree_skb; 79 goto out_kfree_skb;
76 goto out_skb_put; 80 goto out_skb_put;
77 } 81 }
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 5ff02c080a0b..9727455bf0e7 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -103,7 +103,8 @@ out:
103struct llc_sap *llc_sap_open(unsigned char lsap, 103struct llc_sap *llc_sap_open(unsigned char lsap,
104 int (*func)(struct sk_buff *skb, 104 int (*func)(struct sk_buff *skb,
105 struct net_device *dev, 105 struct net_device *dev,
106 struct packet_type *pt)) 106 struct packet_type *pt,
107 struct net_device *orig_dev))
107{ 108{
108 struct llc_sap *sap = llc_sap_find(lsap); 109 struct llc_sap *sap = llc_sap_find(lsap);
109 110
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 0f9fc48aeaf9..0f84f66018e4 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -15,7 +15,6 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/netdevice.h> 17#include <linux/netdevice.h>
18#include <linux/tcp.h>
19#include <asm/errno.h> 18#include <asm/errno.h>
20#include <net/llc_if.h> 19#include <net/llc_if.h>
21#include <net/llc_sap.h> 20#include <net/llc_sap.h>
@@ -25,6 +24,7 @@
25#include <net/llc_c_ev.h> 24#include <net/llc_c_ev.h>
26#include <net/llc_c_ac.h> 25#include <net/llc_c_ac.h>
27#include <net/llc_c_st.h> 26#include <net/llc_c_st.h>
27#include <net/tcp_states.h>
28 28
29u8 llc_mac_null_var[IFHWADDRLEN]; 29u8 llc_mac_null_var[IFHWADDRLEN];
30 30
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index 4da6976efc9c..13b46240b7a1 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -132,7 +132,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb)
132 * data now), it queues this frame in the connection's backlog. 132 * data now), it queues this frame in the connection's backlog.
133 */ 133 */
134int llc_rcv(struct sk_buff *skb, struct net_device *dev, 134int llc_rcv(struct sk_buff *skb, struct net_device *dev,
135 struct packet_type *pt) 135 struct packet_type *pt, struct net_device *orig_dev)
136{ 136{
137 struct llc_sap *sap; 137 struct llc_sap *sap;
138 struct llc_pdu_sn *pdu; 138 struct llc_pdu_sn *pdu;
@@ -165,7 +165,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
165 * LLC functionality 165 * LLC functionality
166 */ 166 */
167 if (sap->rcv_func) { 167 if (sap->rcv_func) {
168 sap->rcv_func(skb, dev, pt); 168 sap->rcv_func(skb, dev, pt, orig_dev);
169 goto out; 169 goto out;
170 } 170 }
171 dest = llc_pdu_type(skb); 171 dest = llc_pdu_type(skb);
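
Both llc hunks above (llc_sap_open() and llc_rcv()) follow the same tree-wide change: packet receive functions gain a fourth argument, orig_dev, the device the frame originally arrived on. A minimal, hypothetical tap showing the new signature; all names are illustrative:

	#include <linux/if_ether.h>
	#include <linux/netdevice.h>
	#include <linux/skbuff.h>

	static int tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
	{
		/* dev may already have been rewritten (e.g. by bonding);
		 * orig_dev preserves the real ingress device */
		kfree_skb(skb);
		return 0;
	}

	static struct packet_type tap_pt = {
		.type = __constant_htons(ETH_P_ALL),
		.func = tap_rcv,
	};

	/* dev_add_pack(&tap_pt); ... dev_remove_pack(&tap_pt); */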
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 965c94eb4bbc..34228ef14985 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -21,7 +21,7 @@
21#include <net/llc_s_ev.h> 21#include <net/llc_s_ev.h>
22#include <net/llc_s_st.h> 22#include <net/llc_s_st.h>
23#include <net/sock.h> 23#include <net/sock.h>
24#include <linux/tcp.h> 24#include <net/tcp_states.h>
25#include <linux/llc.h> 25#include <linux/llc.h>
26 26
27/** 27/**
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
new file mode 100644
index 000000000000..8296b38bf270
--- /dev/null
+++ b/net/netfilter/Kconfig
@@ -0,0 +1,24 @@
1config NETFILTER_NETLINK
2 tristate "Netfilter netlink interface"
3 help
4 If this option is enabled, the kernel will include support
5 for the new netfilter netlink interface.
6
7config NETFILTER_NETLINK_QUEUE
8 tristate "Netfilter NFQUEUE over NFNETLINK interface"
9 depends on NETFILTER_NETLINK
10 help
11 If this option is enabled, the kernel will include support
12 for queueing packets via NFNETLINK.
13
14config NETFILTER_NETLINK_LOG
15 tristate "Netfilter LOG over NFNETLINK interface"
16 depends on NETFILTER_NETLINK
17 help
18 If this option is enabled, the kernel will include support
19 for logging packets via NFNETLINK.
20
21 This obsoletes the existing ipt_ULOG and ebt_ulog mechanisms,
22 and is also scheduled to replace the old syslog-based ipt_LOG
23 and ip6t_LOG modules.
24
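
For reference, enabling all three new options as modules amounts to a .config fragment like the following (illustrative only; the usual route is make menuconfig):

	CONFIG_NETFILTER_NETLINK=m
	CONFIG_NETFILTER_NETLINK_QUEUE=m
	CONFIG_NETFILTER_NETLINK_LOG=m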
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
new file mode 100644
index 000000000000..b3b44f8b415a
--- /dev/null
+++ b/net/netfilter/Makefile
@@ -0,0 +1,7 @@
1netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
2
3obj-$(CONFIG_NETFILTER) = netfilter.o
4
5obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
6obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
7obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
new file mode 100644
index 000000000000..1ceb1a6c254b
--- /dev/null
+++ b/net/netfilter/core.c
@@ -0,0 +1,216 @@
1/* netfilter.c: look after the filters for various protocols.
2 * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
3 *
4 * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
5 * way.
6 *
7 * Rusty Russell (C)2000 -- This code is GPL.
8 *
9 * February 2000: Modified by James Morris to have 1 queue per protocol.
10 * 15-Mar-2000: Added NF_REPEAT --RR.
11 * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik.
12 */
13#include <linux/config.h>
14#include <linux/kernel.h>
15#include <linux/netfilter.h>
16#include <net/protocol.h>
17#include <linux/init.h>
18#include <linux/skbuff.h>
19#include <linux/wait.h>
20#include <linux/module.h>
21#include <linux/interrupt.h>
22#include <linux/if.h>
23#include <linux/netdevice.h>
24#include <linux/inetdevice.h>
25#include <linux/proc_fs.h>
26#include <net/sock.h>
27
28#include "nf_internals.h"
29
30/* In this code, we can be waiting indefinitely for userspace to
31 * service a packet if a hook returns NF_QUEUE. We could keep a count
32 * of skbuffs queued for userspace, and not deregister a hook unless
33 * this is zero, but that sucks. Now, we simply check when the
34 * packets come back: if the hook is gone, the packet is discarded. */
35struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
36EXPORT_SYMBOL(nf_hooks);
37static DEFINE_SPINLOCK(nf_hook_lock);
38
39int nf_register_hook(struct nf_hook_ops *reg)
40{
41 struct list_head *i;
42
43 spin_lock_bh(&nf_hook_lock);
44 list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
45 if (reg->priority < ((struct nf_hook_ops *)i)->priority)
46 break;
47 }
48 list_add_rcu(&reg->list, i->prev);
49 spin_unlock_bh(&nf_hook_lock);
50
51 synchronize_net();
52 return 0;
53}
54EXPORT_SYMBOL(nf_register_hook);
55
56void nf_unregister_hook(struct nf_hook_ops *reg)
57{
58 spin_lock_bh(&nf_hook_lock);
59 list_del_rcu(&reg->list);
60 spin_unlock_bh(&nf_hook_lock);
61
62 synchronize_net();
63}
64EXPORT_SYMBOL(nf_unregister_hook);
65
66unsigned int nf_iterate(struct list_head *head,
67 struct sk_buff **skb,
68 int hook,
69 const struct net_device *indev,
70 const struct net_device *outdev,
71 struct list_head **i,
72 int (*okfn)(struct sk_buff *),
73 int hook_thresh)
74{
75 unsigned int verdict;
76
77 /*
78 * The caller must not block between calls to this
79 * function because of risk of continuing from deleted element.
80 */
81 list_for_each_continue_rcu(*i, head) {
82 struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
83
84 if (hook_thresh > elem->priority)
85 continue;
86
87 /* Optimization: we don't need to hold module
88 reference here, since function can't sleep. --RR */
89 verdict = elem->hook(hook, skb, indev, outdev, okfn);
90 if (verdict != NF_ACCEPT) {
91#ifdef CONFIG_NETFILTER_DEBUG
92 if (unlikely((verdict & NF_VERDICT_MASK)
93 > NF_MAX_VERDICT)) {
94 NFDEBUG("Evil return from %p(%u).\n",
95 elem->hook, hook);
96 continue;
97 }
98#endif
99 if (verdict != NF_REPEAT)
100 return verdict;
101 *i = (*i)->prev;
102 }
103 }
104 return NF_ACCEPT;
105}
106
107
108/* Returns 1 if okfn() needs to be executed by the caller,
109 * -EPERM for NF_DROP, 0 otherwise. */
110int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
111 struct net_device *indev,
112 struct net_device *outdev,
113 int (*okfn)(struct sk_buff *),
114 int hook_thresh)
115{
116 struct list_head *elem;
117 unsigned int verdict;
118 int ret = 0;
119
120 /* We may already have this, but read-locks nest anyway */
121 rcu_read_lock();
122
123 elem = &nf_hooks[pf][hook];
124next_hook:
125 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
126 outdev, &elem, okfn, hook_thresh);
127 if (verdict == NF_ACCEPT || verdict == NF_STOP) {
128 ret = 1;
129 goto unlock;
130 } else if (verdict == NF_DROP) {
131 kfree_skb(*pskb);
132 ret = -EPERM;
133 } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
134 NFDEBUG("nf_hook: Verdict = QUEUE.\n");
135 if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn,
136 verdict >> NF_VERDICT_BITS))
137 goto next_hook;
138 }
139unlock:
140 rcu_read_unlock();
141 return ret;
142}
143EXPORT_SYMBOL(nf_hook_slow);
144
145
146int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len)
147{
148 struct sk_buff *nskb;
149
150 if (writable_len > (*pskb)->len)
151 return 0;
152
153 /* Not exclusive use of packet? Must copy. */
154 if (skb_shared(*pskb) || skb_cloned(*pskb))
155 goto copy_skb;
156
157 return pskb_may_pull(*pskb, writable_len);
158
159copy_skb:
160 nskb = skb_copy(*pskb, GFP_ATOMIC);
161 if (!nskb)
162 return 0;
163 BUG_ON(skb_is_nonlinear(nskb));
164
165 /* Rest of kernel will get very unhappy if we pass it a
166 suddenly-orphaned skbuff */
167 if ((*pskb)->sk)
168 skb_set_owner_w(nskb, (*pskb)->sk);
169 kfree_skb(*pskb);
170 *pskb = nskb;
171 return 1;
172}
173EXPORT_SYMBOL(skb_make_writable);
174
175
176/* This does not belong here, but locally generated errors need it if connection
177 tracking in use: without this, connection may not be in hash table, and hence
178 manufactured ICMP or RST packets will not be associated with it. */
179void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
180EXPORT_SYMBOL(ip_ct_attach);
181
182void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
183{
184 void (*attach)(struct sk_buff *, struct sk_buff *);
185
186 if (skb->nfct && (attach = ip_ct_attach) != NULL) {
187 mb(); /* Just to be sure: must be read before executing this */
188 attach(new, skb);
189 }
190}
191EXPORT_SYMBOL(nf_ct_attach);
192
193#ifdef CONFIG_PROC_FS
194struct proc_dir_entry *proc_net_netfilter;
195EXPORT_SYMBOL(proc_net_netfilter);
196#endif
197
198void __init netfilter_init(void)
199{
200 int i, h;
201 for (i = 0; i < NPROTO; i++) {
202 for (h = 0; h < NF_MAX_HOOKS; h++)
203 INIT_LIST_HEAD(&nf_hooks[i][h]);
204 }
205
206#ifdef CONFIG_PROC_FS
207 proc_net_netfilter = proc_mkdir("netfilter", proc_net);
208 if (!proc_net_netfilter)
209 panic("cannot create netfilter proc entry");
210#endif
211
212 if (netfilter_queue_init() < 0)
213 panic("cannot initialize nf_queue");
214 if (netfilter_log_init() < 0)
215 panic("cannot initialize nf_log");
216}
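
To illustrate the registration API above, here is a minimal, hypothetical observer module; the names are illustrative, and the struct fields match those core.c actually uses (hook, owner, pf, hooknum, priority):

	#include <linux/module.h>
	#include <linux/netfilter.h>
	#include <linux/netfilter_ipv4.h>

	static unsigned int watch_hook(unsigned int hooknum,
				       struct sk_buff **pskb,
				       const struct net_device *in,
				       const struct net_device *out,
				       int (*okfn)(struct sk_buff *))
	{
		/* observe only; NF_ACCEPT lets nf_iterate() continue */
		return NF_ACCEPT;
	}

	static struct nf_hook_ops watch_ops = {
		.hook     = watch_hook,
		.owner    = THIS_MODULE,
		.pf       = PF_INET,
		.hooknum  = NF_IP_PRE_ROUTING,
		.priority = NF_IP_PRI_FIRST,
	};

	static int __init watch_init(void)
	{
		/* inserted in ascending .priority order by core.c */
		return nf_register_hook(&watch_ops);
	}

	static void __exit watch_exit(void)
	{
		nf_unregister_hook(&watch_ops);
	}

	module_init(watch_init);
	module_exit(watch_exit);
	MODULE_LICENSE("GPL");

Callers normally reach nf_hook_slow() through the NF_HOOK() wrapper, which runs okfn() itself when the function returns 1.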
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
new file mode 100644
index 000000000000..6bdee2910617
--- /dev/null
+++ b/net/netfilter/nf_internals.h
@@ -0,0 +1,39 @@
1#ifndef _NF_INTERNALS_H
2#define _NF_INTERNALS_H
3
4#include <linux/config.h>
5#include <linux/list.h>
6#include <linux/skbuff.h>
7#include <linux/netdevice.h>
8
9#ifdef CONFIG_NETFILTER_DEBUG
10#define NFDEBUG(format, args...) printk(format , ## args)
11#else
12#define NFDEBUG(format, args...)
13#endif
14
15
16/* core.c */
17extern unsigned int nf_iterate(struct list_head *head,
18 struct sk_buff **skb,
19 int hook,
20 const struct net_device *indev,
21 const struct net_device *outdev,
22 struct list_head **i,
23 int (*okfn)(struct sk_buff *),
24 int hook_thresh);
25
26/* nf_queue.c */
27extern int nf_queue(struct sk_buff **skb,
28 struct list_head *elem,
29 int pf, unsigned int hook,
30 struct net_device *indev,
31 struct net_device *outdev,
32 int (*okfn)(struct sk_buff *),
33 unsigned int queuenum);
34extern int __init netfilter_queue_init(void);
35
36/* nf_log.c */
37extern int __init netfilter_log_init(void);
38
39#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
new file mode 100644
index 000000000000..3e76bd0824a2
--- /dev/null
+++ b/net/netfilter/nf_log.c
@@ -0,0 +1,178 @@
1#include <linux/config.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/proc_fs.h>
6#include <linux/skbuff.h>
7#include <linux/netfilter.h>
8#include <linux/seq_file.h>
9#include <net/protocol.h>
10
11#include "nf_internals.h"
12
13/* Internal logging interface, which relies on the real
14 LOG target modules */
15
16#define NF_LOG_PREFIXLEN 128
17
18static struct nf_logger *nf_logging[NPROTO]; /* = NULL */
19static DEFINE_SPINLOCK(nf_log_lock);
20
21/* return EBUSY if somebody else is registered, EEXIST if the same logger
22 * is registered, 0 on success. */
23int nf_log_register(int pf, struct nf_logger *logger)
24{
25 int ret = -EBUSY;
26
27 if (pf >= NPROTO)
28 return -EINVAL;
29
30 /* Any setup of logging members must be done before
31 * substituting pointer. */
32 spin_lock(&nf_log_lock);
33 if (!nf_logging[pf]) {
34 rcu_assign_pointer(nf_logging[pf], logger);
35 ret = 0;
36 } else if (nf_logging[pf] == logger)
37 ret = -EEXIST;
38
39 spin_unlock(&nf_log_lock);
40 return ret;
41}
42EXPORT_SYMBOL(nf_log_register);
43
44int nf_log_unregister_pf(int pf)
45{
46 if (pf >= NPROTO)
47 return -EINVAL;
48
49 spin_lock(&nf_log_lock);
50 nf_logging[pf] = NULL;
51 spin_unlock(&nf_log_lock);
52
53 /* Give time to concurrent readers. */
54 synchronize_net();
55
56 return 0;
57}
58EXPORT_SYMBOL(nf_log_unregister_pf);
59
60void nf_log_unregister_logger(struct nf_logger *logger)
61{
62 int i;
63
64 spin_lock(&nf_log_lock);
65 for (i = 0; i < NPROTO; i++) {
66 if (nf_logging[i] == logger)
67 nf_logging[i] = NULL;
68 }
69 spin_unlock(&nf_log_lock);
70
71 synchronize_net();
72}
73EXPORT_SYMBOL(nf_log_unregister_logger);
74
75void nf_log_packet(int pf,
76 unsigned int hooknum,
77 const struct sk_buff *skb,
78 const struct net_device *in,
79 const struct net_device *out,
80 struct nf_loginfo *loginfo,
81 const char *fmt, ...)
82{
83 va_list args;
84 char prefix[NF_LOG_PREFIXLEN];
85 struct nf_logger *logger;
86
87 rcu_read_lock();
88 logger = rcu_dereference(nf_logging[pf]);
89 if (logger) {
90 va_start(args, fmt);
91 vsnprintf(prefix, sizeof(prefix), fmt, args);
92 va_end(args);
93 /* We must read logging before nf_logfn[pf] */
94 logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
95 } else if (net_ratelimit()) {
96 printk(KERN_WARNING "nf_log_packet: can\'t log since "
97 "no backend logging module loaded in! Please either "
98 "load one, or disable logging explicitly\n");
99 }
100 rcu_read_unlock();
101}
102EXPORT_SYMBOL(nf_log_packet);
103
104#ifdef CONFIG_PROC_FS
105static void *seq_start(struct seq_file *seq, loff_t *pos)
106{
107 rcu_read_lock();
108
109 if (*pos >= NPROTO)
110 return NULL;
111
112 return pos;
113}
114
115static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
116{
117 (*pos)++;
118
119 if (*pos >= NPROTO)
120 return NULL;
121
122 return pos;
123}
124
125static void seq_stop(struct seq_file *s, void *v)
126{
127 rcu_read_unlock();
128}
129
130static int seq_show(struct seq_file *s, void *v)
131{
132 loff_t *pos = v;
133 const struct nf_logger *logger;
134
135 logger = rcu_dereference(nf_logging[*pos]);
136
137 if (!logger)
138 return seq_printf(s, "%2lld NONE\n", *pos);
139
140 return seq_printf(s, "%2lld %s\n", *pos, logger->name);
141}
142
143static struct seq_operations nflog_seq_ops = {
144 .start = seq_start,
145 .next = seq_next,
146 .stop = seq_stop,
147 .show = seq_show,
148};
149
150static int nflog_open(struct inode *inode, struct file *file)
151{
152 return seq_open(file, &nflog_seq_ops);
153}
154
155static struct file_operations nflog_file_ops = {
156 .owner = THIS_MODULE,
157 .open = nflog_open,
158 .read = seq_read,
159 .llseek = seq_lseek,
160 .release = seq_release,
161};
162
163#endif /* PROC_FS */
164
165
166int __init netfilter_log_init(void)
167{
168#ifdef CONFIG_PROC_FS
169 struct proc_dir_entry *pde;
170
171 pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter);
172 if (!pde)
173 return -1;
174
175 pde->proc_fops = &nflog_file_ops;
176#endif
177 return 0;
178}
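
nf_log.c is only a dispatcher; a backend such as nfnetlink_log (added below) supplies the actual logfn. A hypothetical printk-based backend, sketched against the struct nf_logger fields used above (name, logfn, me):

	#include <linux/module.h>
	#include <linux/netfilter.h>
	#include <linux/skbuff.h>

	static void printk_logfn(unsigned int pf, unsigned int hooknum,
				 const struct sk_buff *skb,
				 const struct net_device *in,
				 const struct net_device *out,
				 const struct nf_loginfo *li,
				 const char *prefix)
	{
		printk(KERN_INFO "%shook=%u len=%u\n",
		       prefix ? prefix : "", hooknum, skb->len);
	}

	static struct nf_logger printk_logger = {
		.name  = "printk_log",
		.logfn = &printk_logfn,
		.me    = THIS_MODULE,
	};

	/* nf_log_register(PF_INET, &printk_logger) returns -EBUSY if
	 * another backend already owns PF_INET, -EEXIST if we are the
	 * registered one; nf_log_unregister_logger() detaches us from
	 * every family on unload. */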
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
new file mode 100644
index 000000000000..d10d552d9c40
--- /dev/null
+++ b/net/netfilter/nf_queue.c
@@ -0,0 +1,343 @@
1#include <linux/config.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/proc_fs.h>
6#include <linux/skbuff.h>
7#include <linux/netfilter.h>
8#include <linux/seq_file.h>
9#include <net/protocol.h>
10
11#include "nf_internals.h"
12
13/*
14 * A queue handler may be registered for each protocol. Each is protected by
15 * a long term mutex. The handler must provide an outfn() to accept packets
16 * for queueing and must reinject all packets it receives, no matter what.
17 */
18static struct nf_queue_handler *queue_handler[NPROTO];
19static struct nf_queue_rerouter *queue_rerouter;
20
21static DEFINE_RWLOCK(queue_handler_lock);
22
23/* return EBUSY when somebody else is registered, return EEXIST if the
24 * same handler is registered, return 0 in case of success. */
25int nf_register_queue_handler(int pf, struct nf_queue_handler *qh)
26{
27 int ret;
28
29 if (pf >= NPROTO)
30 return -EINVAL;
31
32 write_lock_bh(&queue_handler_lock);
33 if (queue_handler[pf] == qh)
34 ret = -EEXIST;
35 else if (queue_handler[pf])
36 ret = -EBUSY;
37 else {
38 queue_handler[pf] = qh;
39 ret = 0;
40 }
41 write_unlock_bh(&queue_handler_lock);
42
43 return ret;
44}
45EXPORT_SYMBOL(nf_register_queue_handler);
46
47/* The caller must flush their queue before this */
48int nf_unregister_queue_handler(int pf)
49{
50 if (pf >= NPROTO)
51 return -EINVAL;
52
53 write_lock_bh(&queue_handler_lock);
54 queue_handler[pf] = NULL;
55 write_unlock_bh(&queue_handler_lock);
56
57 return 0;
58}
59EXPORT_SYMBOL(nf_unregister_queue_handler);
60
61int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer)
62{
63 if (pf >= NPROTO)
64 return -EINVAL;
65
66 write_lock_bh(&queue_handler_lock);
67 memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf]));
68 write_unlock_bh(&queue_handler_lock);
69
70 return 0;
71}
72EXPORT_SYMBOL_GPL(nf_register_queue_rerouter);
73
74int nf_unregister_queue_rerouter(int pf)
75{
76 if (pf >= NPROTO)
77 return -EINVAL;
78
79 write_lock_bh(&queue_handler_lock);
80 memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf]));
81 write_unlock_bh(&queue_handler_lock);
82 return 0;
83}
84EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter);
85
86void nf_unregister_queue_handlers(struct nf_queue_handler *qh)
87{
88 int pf;
89
90 write_lock_bh(&queue_handler_lock);
91 for (pf = 0; pf < NPROTO; pf++) {
92 if (queue_handler[pf] == qh)
93 queue_handler[pf] = NULL;
94 }
95 write_unlock_bh(&queue_handler_lock);
96}
97EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
98
99/*
100 * Any packet that leaves via this function must come back
101 * through nf_reinject().
102 */
103int nf_queue(struct sk_buff **skb,
104 struct list_head *elem,
105 int pf, unsigned int hook,
106 struct net_device *indev,
107 struct net_device *outdev,
108 int (*okfn)(struct sk_buff *),
109 unsigned int queuenum)
110{
111 int status;
112 struct nf_info *info;
113#ifdef CONFIG_BRIDGE_NETFILTER
114 struct net_device *physindev = NULL;
115 struct net_device *physoutdev = NULL;
116#endif
117
118 /* QUEUE == DROP if no one is waiting, to be safe. */
119 read_lock(&queue_handler_lock);
120 if (!queue_handler[pf]->outfn) {
121 read_unlock(&queue_handler_lock);
122 kfree_skb(*skb);
123 return 1;
124 }
125
126 info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC);
127 if (!info) {
128 if (net_ratelimit())
129 printk(KERN_ERR "OOM queueing packet %p\n",
130 *skb);
131 read_unlock(&queue_handler_lock);
132 kfree_skb(*skb);
133 return 1;
134 }
135
136 *info = (struct nf_info) {
137 (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
138
139 /* If it's going away, ignore hook. */
140 if (!try_module_get(info->elem->owner)) {
141 read_unlock(&queue_handler_lock);
142 kfree(info);
143 return 0;
144 }
145
146 /* Bump dev refs so they don't vanish while packet is out */
147 if (indev) dev_hold(indev);
148 if (outdev) dev_hold(outdev);
149
150#ifdef CONFIG_BRIDGE_NETFILTER
151 if ((*skb)->nf_bridge) {
152 physindev = (*skb)->nf_bridge->physindev;
153 if (physindev) dev_hold(physindev);
154 physoutdev = (*skb)->nf_bridge->physoutdev;
155 if (physoutdev) dev_hold(physoutdev);
156 }
157#endif
158 if (queue_rerouter[pf].save)
159 queue_rerouter[pf].save(*skb, info);
160
161 status = queue_handler[pf]->outfn(*skb, info, queuenum,
162 queue_handler[pf]->data);
163
164 if (status >= 0 && queue_rerouter[pf].reroute)
165 status = queue_rerouter[pf].reroute(skb, info);
166
167 read_unlock(&queue_handler_lock);
168
169 if (status < 0) {
170 /* James M doesn't say fuck enough. */
171 if (indev) dev_put(indev);
172 if (outdev) dev_put(outdev);
173#ifdef CONFIG_BRIDGE_NETFILTER
174 if (physindev) dev_put(physindev);
175 if (physoutdev) dev_put(physoutdev);
176#endif
177 module_put(info->elem->owner);
178 kfree(info);
179 kfree_skb(*skb);
180
181 return 1;
182 }
183
184 return 1;
185}
186
187void nf_reinject(struct sk_buff *skb, struct nf_info *info,
188 unsigned int verdict)
189{
190 struct list_head *elem = &info->elem->list;
191 struct list_head *i;
192
193 rcu_read_lock();
194
195 /* Release those devices we held, or Alexey will kill me. */
196 if (info->indev) dev_put(info->indev);
197 if (info->outdev) dev_put(info->outdev);
198#ifdef CONFIG_BRIDGE_NETFILTER
199 if (skb->nf_bridge) {
200 if (skb->nf_bridge->physindev)
201 dev_put(skb->nf_bridge->physindev);
202 if (skb->nf_bridge->physoutdev)
203 dev_put(skb->nf_bridge->physoutdev);
204 }
205#endif
206
207 /* Drop reference to owner of hook which queued us. */
208 module_put(info->elem->owner);
209
210 list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
211 if (i == elem)
212 break;
213 }
214
215 if (elem == &nf_hooks[info->pf][info->hook]) {
216 /* The module which sent it to userspace is gone. */
217 NFDEBUG("%s: module disappeared, dropping packet.\n",
218 __FUNCTION__);
219 verdict = NF_DROP;
220 }
221
222 /* Continue traversal iff userspace said ok... */
223 if (verdict == NF_REPEAT) {
224 elem = elem->prev;
225 verdict = NF_ACCEPT;
226 }
227
228 if (verdict == NF_ACCEPT) {
229 next_hook:
230 verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
231 &skb, info->hook,
232 info->indev, info->outdev, &elem,
233 info->okfn, INT_MIN);
234 }
235
236 switch (verdict & NF_VERDICT_MASK) {
237 case NF_ACCEPT:
238 info->okfn(skb);
239 break;
240
241 case NF_QUEUE:
242 if (!nf_queue(&skb, elem, info->pf, info->hook,
243 info->indev, info->outdev, info->okfn,
244 verdict >> NF_VERDICT_BITS))
245 goto next_hook;
246 break;
247 }
248 rcu_read_unlock();
249
250 if (verdict == NF_DROP)
251 kfree_skb(skb);
252
253 kfree(info);
254 return;
255}
256EXPORT_SYMBOL(nf_reinject);
257
258#ifdef CONFIG_PROC_FS
259static void *seq_start(struct seq_file *seq, loff_t *pos)
260{
261 if (*pos >= NPROTO)
262 return NULL;
263
264 return pos;
265}
266
267static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
268{
269 (*pos)++;
270
271 if (*pos >= NPROTO)
272 return NULL;
273
274 return pos;
275}
276
277static void seq_stop(struct seq_file *s, void *v)
278{
279
280}
281
282static int seq_show(struct seq_file *s, void *v)
283{
284 int ret;
285 loff_t *pos = v;
286 struct nf_queue_handler *qh;
287
288 read_lock_bh(&queue_handler_lock);
289 qh = queue_handler[*pos];
290 if (!qh)
291 ret = seq_printf(s, "%2lld NONE\n", *pos);
292 else
293 ret = seq_printf(s, "%2lld %s\n", *pos, qh->name);
294 read_unlock_bh(&queue_handler_lock);
295
296 return ret;
297}
298
299static struct seq_operations nfqueue_seq_ops = {
300 .start = seq_start,
301 .next = seq_next,
302 .stop = seq_stop,
303 .show = seq_show,
304};
305
306static int nfqueue_open(struct inode *inode, struct file *file)
307{
308 return seq_open(file, &nfqueue_seq_ops);
309}
310
311static struct file_operations nfqueue_file_ops = {
312 .owner = THIS_MODULE,
313 .open = nfqueue_open,
314 .read = seq_read,
315 .llseek = seq_lseek,
316 .release = seq_release,
317};
318#endif /* PROC_FS */
319
320
321int __init netfilter_queue_init(void)
322{
323#ifdef CONFIG_PROC_FS
324 struct proc_dir_entry *pde;
325#endif
326 queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter),
327 GFP_KERNEL);
328 if (!queue_rerouter)
329 return -ENOMEM;
330
331#ifdef CONFIG_PROC_FS
332 pde = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter);
333 if (!pde) {
334 kfree(queue_rerouter);
335 return -1;
336 }
337 pde->proc_fops = &nfqueue_file_ops;
338#endif
339 memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter));
340
341 return 0;
342}
343
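Every packet a handler accepts through outfn() must eventually come back via nf_reinject(). A deliberately trivial handler honouring that contract (hypothetical names; a real handler defers reinjection to process context, and the synchronous call here is only safe while no rerouter is registered for the family):

	#include <linux/netfilter.h>
	#include <linux/skbuff.h>

	static int echo_outfn(struct sk_buff *skb, struct nf_info *info,
			      unsigned int queuenum, void *data)
	{
		/* a real handler would queue skb to userspace; we hand
		 * it straight back with an ACCEPT verdict */
		nf_reinject(skb, info, NF_ACCEPT);
		return 0;
	}

	static struct nf_queue_handler echo_qh = {
		.outfn = echo_outfn,
		.data  = NULL,
		.name  = "echo_queue",
	};

	/* nf_register_queue_handler(PF_INET, &echo_qh);
	 * ... flush the queue, then:
	 * nf_unregister_queue_handler(PF_INET); */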
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
new file mode 100644
index 000000000000..61a833a9caa6
--- /dev/null
+++ b/net/netfilter/nf_sockopt.c
@@ -0,0 +1,132 @@
1#include <linux/config.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/skbuff.h>
6#include <linux/netfilter.h>
7#include <net/sock.h>
8
9#include "nf_internals.h"
10
11/* Sockopts only registered and called from user context, so
12 net locking would be overkill. Also, [gs]etsockopt calls may
13 sleep. */
14static DECLARE_MUTEX(nf_sockopt_mutex);
15static LIST_HEAD(nf_sockopts);
16
17/* Do exclusive ranges overlap? */
18static inline int overlap(int min1, int max1, int min2, int max2)
19{
20 return max1 > min2 && min1 < max2;
21}
22
23/* Functions to register sockopt ranges (exclusive). */
24int nf_register_sockopt(struct nf_sockopt_ops *reg)
25{
26 struct list_head *i;
27 int ret = 0;
28
29 if (down_interruptible(&nf_sockopt_mutex) != 0)
30 return -EINTR;
31
32 list_for_each(i, &nf_sockopts) {
33 struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
34 if (ops->pf == reg->pf
35 && (overlap(ops->set_optmin, ops->set_optmax,
36 reg->set_optmin, reg->set_optmax)
37 || overlap(ops->get_optmin, ops->get_optmax,
38 reg->get_optmin, reg->get_optmax))) {
39 NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
40 ops->set_optmin, ops->set_optmax,
41 ops->get_optmin, ops->get_optmax,
42 reg->set_optmin, reg->set_optmax,
43 reg->get_optmin, reg->get_optmax);
44 ret = -EBUSY;
45 goto out;
46 }
47 }
48
49 list_add(&reg->list, &nf_sockopts);
50out:
51 up(&nf_sockopt_mutex);
52 return ret;
53}
54EXPORT_SYMBOL(nf_register_sockopt);
55
56void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
57{
58 /* No point being interruptible: we're probably in cleanup_module() */
59 restart:
60 down(&nf_sockopt_mutex);
61 if (reg->use != 0) {
62 /* To be woken by nf_sockopt call... */
63 /* FIXME: Stuart Young's name appears gratuitously. */
64 set_current_state(TASK_UNINTERRUPTIBLE);
65 reg->cleanup_task = current;
66 up(&nf_sockopt_mutex);
67 schedule();
68 goto restart;
69 }
70 list_del(&reg->list);
71 up(&nf_sockopt_mutex);
72}
73EXPORT_SYMBOL(nf_unregister_sockopt);
74
75/* Call get/setsockopt() */
76static int nf_sockopt(struct sock *sk, int pf, int val,
77 char __user *opt, int *len, int get)
78{
79 struct list_head *i;
80 struct nf_sockopt_ops *ops;
81 int ret;
82
83 if (down_interruptible(&nf_sockopt_mutex) != 0)
84 return -EINTR;
85
86 list_for_each(i, &nf_sockopts) {
87 ops = (struct nf_sockopt_ops *)i;
88 if (ops->pf == pf) {
89 if (get) {
90 if (val >= ops->get_optmin
91 && val < ops->get_optmax) {
92 ops->use++;
93 up(&nf_sockopt_mutex);
94 ret = ops->get(sk, val, opt, len);
95 goto out;
96 }
97 } else {
98 if (val >= ops->set_optmin
99 && val < ops->set_optmax) {
100 ops->use++;
101 up(&nf_sockopt_mutex);
102 ret = ops->set(sk, val, opt, *len);
103 goto out;
104 }
105 }
106 }
107 }
108 up(&nf_sockopt_mutex);
109 return -ENOPROTOOPT;
110
111 out:
112 down(&nf_sockopt_mutex);
113 ops->use--;
114 if (ops->cleanup_task)
115 wake_up_process(ops->cleanup_task);
116 up(&nf_sockopt_mutex);
117 return ret;
118}
119
120int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
121 int len)
122{
123 return nf_sockopt(sk, pf, val, opt, &len, 0);
124}
125EXPORT_SYMBOL(nf_setsockopt);
126
127int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
128{
129 return nf_sockopt(sk, pf, val, opt, len, 1);
130}
131EXPORT_SYMBOL(nf_getsockopt);
132
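Registration claims an exclusive half-open range [optmin, optmax) per protocol family. A hypothetical single-option user, with the callback signatures inferred from the call sites above:

	#include <linux/netfilter.h>
	#include <net/sock.h>

	#define EXAMPLE_SO_BASE 9999	/* hypothetical option number */

	static int example_set(struct sock *sk, int optval,
			       void __user *user, unsigned int len)
	{
		return 0;	/* parse len bytes of user data here */
	}

	static int example_get(struct sock *sk, int optval,
			       void __user *user, int *len)
	{
		return 0;	/* copy up to *len bytes back here */
	}

	static struct nf_sockopt_ops example_ops = {
		.pf         = PF_INET,
		.set_optmin = EXAMPLE_SO_BASE,
		.set_optmax = EXAMPLE_SO_BASE + 1,	/* exclusive bound */
		.set        = example_set,
		.get_optmin = EXAMPLE_SO_BASE,
		.get_optmax = EXAMPLE_SO_BASE + 1,
		.get        = example_get,
	};

	/* nf_register_sockopt(&example_ops) returns -EBUSY when the
	 * range overlaps an existing registration for the same pf. */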
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
new file mode 100644
index 000000000000..49a3900e3d32
--- /dev/null
+++ b/net/netfilter/nfnetlink.c
@@ -0,0 +1,376 @@
1/* Netfilter messages via netlink socket. Allows for user space
2 * protocol helpers and general trouble making from userspace.
3 *
4 * (C) 2001 by Jay Schulist <jschlst@samba.org>,
5 * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
6 * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
7 *
8 * Initial netfilter messages via netlink development funded and
9 * generally made possible by Network Robots, Inc. (www.networkrobots.com)
10 *
11 * Further development of this code funded by Astaro AG (http://www.astaro.com)
12 *
13 * This software may be used and distributed according to the terms
14 * of the GNU General Public License, incorporated herein by reference.
15 */
16
17#include <linux/config.h>
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/socket.h>
21#include <linux/kernel.h>
22#include <linux/major.h>
23#include <linux/sched.h>
24#include <linux/timer.h>
25#include <linux/string.h>
26#include <linux/sockios.h>
27#include <linux/net.h>
28#include <linux/fcntl.h>
29#include <linux/skbuff.h>
30#include <asm/uaccess.h>
31#include <asm/system.h>
32#include <net/sock.h>
33#include <linux/init.h>
34#include <linux/spinlock.h>
35
36#include <linux/netfilter.h>
37#include <linux/netlink.h>
38#include <linux/netfilter/nfnetlink.h>
39
40MODULE_LICENSE("GPL");
41MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
42MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
43
44static char __initdata nfversion[] = "0.30";
45
46#if 0
47#define DEBUGP(format, args...) \
48 printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \
49 __LINE__, __FUNCTION__, ## args)
50#else
51#define DEBUGP(format, args...)
52#endif
53
54static struct sock *nfnl = NULL;
55static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
56DECLARE_MUTEX(nfnl_sem);
57
58void nfnl_lock(void)
59{
60 nfnl_shlock();
61}
62
63void nfnl_unlock(void)
64{
65 nfnl_shunlock();
66}
67
68int nfnetlink_subsys_register(struct nfnetlink_subsystem *n)
69{
70 DEBUGP("registering subsystem ID %u\n", n->subsys_id);
71
72 nfnl_lock();
73 if (subsys_table[n->subsys_id]) {
74 nfnl_unlock();
75 return -EBUSY;
76 }
77 subsys_table[n->subsys_id] = n;
78 nfnl_unlock();
79
80 return 0;
81}
82
83int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n)
84{
85 DEBUGP("unregistering subsystem ID %u\n", n->subsys_id);
86
87 nfnl_lock();
88 subsys_table[n->subsys_id] = NULL;
89 nfnl_unlock();
90
91 return 0;
92}
93
94static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
95{
96 u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
97
98 if (subsys_id >= NFNL_SUBSYS_COUNT
99 || subsys_table[subsys_id] == NULL)
100 return NULL;
101
102 return subsys_table[subsys_id];
103}
104
105static inline struct nfnl_callback *
106nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss)
107{
108 u_int8_t cb_id = NFNL_MSG_TYPE(type);
109
110 if (cb_id >= ss->cb_count) {
111 DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count);
112 return NULL;
113 }
114
115 return &ss->cb[cb_id];
116}
117
118void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen,
119 const void *data)
120{
121 struct nfattr *nfa;
122 int size = NFA_LENGTH(attrlen);
123
124 nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size));
125 nfa->nfa_type = attrtype;
126 nfa->nfa_len = size;
127 memcpy(NFA_DATA(nfa), data, attrlen);
128 memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size);
129}
130
131int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
132{
133 memset(tb, 0, sizeof(struct nfattr *) * maxattr);
134
135 while (NFA_OK(nfa, len)) {
136 unsigned flavor = nfa->nfa_type;
137 if (flavor && flavor <= maxattr)
138 tb[flavor-1] = nfa;
139 nfa = NFA_NEXT(nfa, len);
140 }
141
142 return 0;
143}
144
145/**
146 * nfnetlink_check_attributes - check and parse nfnetlink attributes
147 *
148 * subsys: nfnl subsystem for which this message is to be parsed
149 * nlh: netlink message to be checked/parsed
150 * cda: array of pointers, needs to be at least subsys->attr_count big
151 *
152 */
153static int
154nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
155 struct nlmsghdr *nlh, struct nfattr *cda[])
156{
157 int min_len;
158 u_int16_t attr_count;
159 u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
160
161 if (unlikely(cb_id >= subsys->cb_count)) {
162 DEBUGP("msgtype %u >= %u, returning\n",
163 cb_id, subsys->cb_count);
164 return -EINVAL;
165 }
166
167 min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg));
168 if (unlikely(nlh->nlmsg_len < min_len))
169 return -EINVAL;
170
171 attr_count = subsys->cb[cb_id].attr_count;
172 memset(cda, 0, sizeof(struct nfattr *) * attr_count);
173
174 /* check attribute lengths. */
175 if (likely(nlh->nlmsg_len > min_len)) {
176 struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh));
177 int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
178
179 while (NFA_OK(attr, attrlen)) {
180 unsigned flavor = attr->nfa_type;
181 if (flavor) {
182 if (flavor > attr_count)
183 return -EINVAL;
184 cda[flavor - 1] = attr;
185 }
186 attr = NFA_NEXT(attr, attrlen);
187 }
188 }
189
190 /* implicit: if nlmsg_len == min_len, we return 0, and an empty
191 * (zeroed) cda[] array. The message is valid, but empty. */
192
193 return 0;
194}
195
196int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
197{
198 int allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
199 int err = 0;
200
201 NETLINK_CB(skb).dst_group = group;
202 if (echo)
203 atomic_inc(&skb->users);
204 netlink_broadcast(nfnl, skb, pid, group, allocation);
205 if (echo)
206 err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT);
207
208 return err;
209}
210
211int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags)
212{
213 return netlink_unicast(nfnl, skb, pid, flags);
214}
215
216/* Process one complete nfnetlink message. */
217static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
218 struct nlmsghdr *nlh, int *errp)
219{
220 struct nfnl_callback *nc;
221 struct nfnetlink_subsystem *ss;
222 int type, err = 0;
223
224 DEBUGP("entered; subsys=%u, msgtype=%u\n",
225 NFNL_SUBSYS_ID(nlh->nlmsg_type),
226 NFNL_MSG_TYPE(nlh->nlmsg_type));
227
228 /* Only requests are handled by kernel now. */
229 if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
230 DEBUGP("received non-request message\n");
231 return 0;
232 }
233
234 /* All the messages must at least contain nfgenmsg */
235 if (nlh->nlmsg_len <
236 NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) {
237 DEBUGP("received message was too short\n");
238 return 0;
239 }
240
241 type = nlh->nlmsg_type;
242 ss = nfnetlink_get_subsys(type);
243 if (!ss) {
244#ifdef CONFIG_KMOD
245 /* don't call nfnl_shunlock, since it would reenter
246 * with further packet processing */
247 up(&nfnl_sem);
248 request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
249 nfnl_shlock();
250 ss = nfnetlink_get_subsys(type);
251 if (!ss)
252#endif
253 goto err_inval;
254 }
255
256 nc = nfnetlink_find_client(type, ss);
257 if (!nc) {
258 DEBUGP("unable to find client for type %d\n", type);
259 goto err_inval;
260 }
261
262 if (nc->cap_required &&
263 !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) {
264 DEBUGP("permission denied for type %d\n", type);
265 *errp = -EPERM;
266 return -1;
267 }
268
269 {
270 u_int16_t attr_count =
271 ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count;
272 struct nfattr *cda[attr_count];
273
274 memset(cda, 0, sizeof(struct nfattr *) * attr_count);
275
276 err = nfnetlink_check_attributes(ss, nlh, cda);
277 if (err < 0)
278 goto err_inval;
279
280 DEBUGP("calling handler\n");
281 err = nc->call(nfnl, skb, nlh, cda, errp);
282 *errp = err;
283 return err;
284 }
285
286err_inval:
287 DEBUGP("returning -EINVAL\n");
288 *errp = -EINVAL;
289 return -1;
290}
291
292/* Process one packet of messages. */
293static inline int nfnetlink_rcv_skb(struct sk_buff *skb)
294{
295 int err;
296 struct nlmsghdr *nlh;
297
298 while (skb->len >= NLMSG_SPACE(0)) {
299 u32 rlen;
300
301 nlh = (struct nlmsghdr *)skb->data;
302 if (nlh->nlmsg_len < sizeof(struct nlmsghdr)
303 || skb->len < nlh->nlmsg_len)
304 return 0;
305 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
306 if (rlen > skb->len)
307 rlen = skb->len;
308 if (nfnetlink_rcv_msg(skb, nlh, &err)) {
309 if (!err)
310 return -1;
311 netlink_ack(skb, nlh, err);
312 } else
313 if (nlh->nlmsg_flags & NLM_F_ACK)
314 netlink_ack(skb, nlh, 0);
315 skb_pull(skb, rlen);
316 }
317
318 return 0;
319}
320
321static void nfnetlink_rcv(struct sock *sk, int len)
322{
323 do {
324 struct sk_buff *skb;
325
326 if (nfnl_shlock_nowait())
327 return;
328
329 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
330 if (nfnetlink_rcv_skb(skb)) {
331 if (skb->len)
332 skb_queue_head(&sk->sk_receive_queue,
333 skb);
334 else
335 kfree_skb(skb);
336 break;
337 }
338 kfree_skb(skb);
339 }
340
341 /* don't call nfnl_shunlock, since it would reenter
342 * with further packet processing */
343 up(&nfnl_sem);
344 } while(nfnl && nfnl->sk_receive_queue.qlen);
345}
346
347static void __exit nfnetlink_exit(void)
348{
349 printk("Removing netfilter NETLINK layer.\n");
350 sock_release(nfnl->sk_socket);
351 return;
352}
353
354static int __init nfnetlink_init(void)
355{
356 printk("Netfilter messages via NETLINK v%s.\n", nfversion);
357
358 nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX,
359 nfnetlink_rcv, THIS_MODULE);
360 if (!nfnl) {
361 printk(KERN_ERR "cannot initialize nfnetlink!\n");
362 return -1;
363 }
364
365 return 0;
366}
367
368module_init(nfnetlink_init);
369module_exit(nfnetlink_exit);
370
371EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
372EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
373EXPORT_SYMBOL_GPL(nfnetlink_send);
374EXPORT_SYMBOL_GPL(nfnetlink_unicast);
375EXPORT_SYMBOL_GPL(nfattr_parse);
376EXPORT_SYMBOL_GPL(__nfa_fill);
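A subsystem hands nfnetlink a table of per-message callbacks indexed by NFNL_MSG_TYPE(). A skeletal registration, with the subsystem ID and all names purely hypothetical; the fields mirror the ones dereferenced above (subsys_id, cb_count, cb, call, attr_count, cap_required):

	#include <linux/capability.h>
	#include <linux/netlink.h>
	#include <linux/netfilter/nfnetlink.h>

	static int example_msg(struct sock *nl, struct sk_buff *skb,
			       struct nlmsghdr *nlh, struct nfattr *cda[],
			       int *errp)
	{
		/* cda[] arrives parsed and length-checked according to
		 * .attr_count below */
		return 0;
	}

	static struct nfnl_callback example_cb[] = {
		[0] = {
			.call         = example_msg,
			.attr_count   = 1,
			.cap_required = CAP_NET_ADMIN,	/* checked first */
		},
	};

	static struct nfnetlink_subsystem example_subsys = {
		.subsys_id = 2,	/* hypothetical NFNL_SUBSYS_* slot */
		.cb_count  = 1,
		.cb        = example_cb,
	};

	/* nfnetlink_subsys_register(&example_subsys) returns -EBUSY if
	 * the slot is taken; unregister on module exit. */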
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
new file mode 100644
index 000000000000..ff5601ceedcb
--- /dev/null
+++ b/net/netfilter/nfnetlink_log.c
@@ -0,0 +1,1055 @@
1/*
2 * This is a module which is used for logging packets to userspace via
3 * nfnetlink.
4 *
5 * (C) 2005 by Harald Welte <laforge@netfilter.org>
6 *
7 * Based on the old ipv4-only ipt_ULOG.c:
8 * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 *
14 */
15#include <linux/module.h>
16#include <linux/skbuff.h>
17#include <linux/init.h>
18#include <linux/ip.h>
19#include <linux/ipv6.h>
20#include <linux/netdevice.h>
21#include <linux/netfilter.h>
22#include <linux/netlink.h>
23#include <linux/netfilter/nfnetlink.h>
24#include <linux/netfilter/nfnetlink_log.h>
25#include <linux/spinlock.h>
26#include <linux/sysctl.h>
27#include <linux/proc_fs.h>
28#include <linux/security.h>
29#include <linux/list.h>
30#include <linux/jhash.h>
31#include <linux/random.h>
32#include <net/sock.h>
33
34#include <asm/atomic.h>
35
36#ifdef CONFIG_BRIDGE_NETFILTER
37#include "../bridge/br_private.h"
38#endif
39
40#define NFULNL_NLBUFSIZ_DEFAULT 4096
41#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
42#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
43
44#define PRINTR(x, args...) do { if (net_ratelimit()) \
45 printk(x, ## args); } while (0);
46
47#if 0
48#define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \
49 __FILE__, __LINE__, __FUNCTION__, \
50 ## args)
51#else
52#define UDEBUG(x, ...)
53#endif
54
55struct nfulnl_instance {
56 struct hlist_node hlist; /* global list of instances */
57 spinlock_t lock;
58 atomic_t use; /* use count */
59
60 unsigned int qlen; /* number of nlmsgs in skb */
61 struct sk_buff *skb; /* pre-allocated skb */
62 struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
63 struct timer_list timer;
64 int peer_pid; /* PID of the peer process */
65
66 /* configurable parameters */
67 unsigned int flushtimeout; /* timeout until queue flush */
68 unsigned int nlbufsiz; /* netlink buffer allocation size */
69 unsigned int qthreshold; /* threshold of the queue */
70 u_int32_t copy_range;
71 u_int16_t group_num; /* number of this queue */
72 u_int8_t copy_mode;
73};
74
75static DEFINE_RWLOCK(instances_lock);
76
77#define INSTANCE_BUCKETS 16
78static struct hlist_head instance_table[INSTANCE_BUCKETS];
79static unsigned int hash_init;
80
81static inline u_int8_t instance_hashfn(u_int16_t group_num)
82{
83 return ((group_num & 0xff) % INSTANCE_BUCKETS);
84}
85
86static struct nfulnl_instance *
87__instance_lookup(u_int16_t group_num)
88{
89 struct hlist_head *head;
90 struct hlist_node *pos;
91 struct nfulnl_instance *inst;
92
93 UDEBUG("entering (group_num=%u)\n", group_num);
94
95 head = &instance_table[instance_hashfn(group_num)];
96 hlist_for_each_entry(inst, pos, head, hlist) {
97 if (inst->group_num == group_num)
98 return inst;
99 }
100 return NULL;
101}
102
103static inline void
104instance_get(struct nfulnl_instance *inst)
105{
106 atomic_inc(&inst->use);
107}
108
109static struct nfulnl_instance *
110instance_lookup_get(u_int16_t group_num)
111{
112 struct nfulnl_instance *inst;
113
114 read_lock_bh(&instances_lock);
115 inst = __instance_lookup(group_num);
116 if (inst)
117 instance_get(inst);
118 read_unlock_bh(&instances_lock);
119
120 return inst;
121}
122
123static void
124instance_put(struct nfulnl_instance *inst)
125{
126 if (inst && atomic_dec_and_test(&inst->use)) {
127 UDEBUG("kfree(inst=%p)\n", inst);
128 kfree(inst);
129 }
130}
131
132static void nfulnl_timer(unsigned long data);
133
134static struct nfulnl_instance *
135instance_create(u_int16_t group_num, int pid)
136{
137 struct nfulnl_instance *inst;
138
139 UDEBUG("entering (group_num=%u, pid=%d)\n", group_num,
140 pid);
141
142 write_lock_bh(&instances_lock);
143 if (__instance_lookup(group_num)) {
144 inst = NULL;
145 UDEBUG("aborting, instance already exists\n");
146 goto out_unlock;
147 }
148
149 inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
150 if (!inst)
151 goto out_unlock;
152
153 memset(inst, 0, sizeof(*inst));
154 INIT_HLIST_NODE(&inst->hlist);
155 inst->lock = SPIN_LOCK_UNLOCKED;
156 /* needs to be two, since we _put() after creation */
157 atomic_set(&inst->use, 2);
158
159 init_timer(&inst->timer);
160 inst->timer.function = nfulnl_timer;
161 inst->timer.data = (unsigned long)inst;
162 /* don't start timer yet. (re)start it with every packet */
163
164 inst->peer_pid = pid;
165 inst->group_num = group_num;
166
167 inst->qthreshold = NFULNL_QTHRESH_DEFAULT;
168 inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT;
169 inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT;
170 inst->copy_mode = NFULNL_COPY_PACKET;
171 inst->copy_range = 0xffff;
172
173 if (!try_module_get(THIS_MODULE))
174 goto out_free;
175
176 hlist_add_head(&inst->hlist,
177 &instance_table[instance_hashfn(group_num)]);
178
179 UDEBUG("newly added node: %p, next=%p\n", &inst->hlist,
180 inst->hlist.next);
181
182 write_unlock_bh(&instances_lock);
183
184 return inst;
185
186out_free:
187 instance_put(inst);
188out_unlock:
189 write_unlock_bh(&instances_lock);
190 return NULL;
191}
192
193static int __nfulnl_send(struct nfulnl_instance *inst);
194
195static void
196_instance_destroy2(struct nfulnl_instance *inst, int lock)
197{
198 /* first pull it out of the global list */
199 if (lock)
200 write_lock_bh(&instances_lock);
201
202 UDEBUG("removing instance %p (queuenum=%u) from hash\n",
203 inst, inst->group_num);
204
205 hlist_del(&inst->hlist);
206
207 if (lock)
208 write_unlock_bh(&instances_lock);
209
210 /* then flush all pending packets from skb */
211
212 spin_lock_bh(&inst->lock);
213 if (inst->skb) {
214 if (inst->qlen)
215 __nfulnl_send(inst);
216 if (inst->skb) {
217 kfree_skb(inst->skb);
218 inst->skb = NULL;
219 }
220 }
221 spin_unlock_bh(&inst->lock);
222
223 /* and finally put the refcount */
224 instance_put(inst);
225
226 module_put(THIS_MODULE);
227}
228
229static inline void
230__instance_destroy(struct nfulnl_instance *inst)
231{
232 _instance_destroy2(inst, 0);
233}
234
235static inline void
236instance_destroy(struct nfulnl_instance *inst)
237{
238 _instance_destroy2(inst, 1);
239}
240
241static int
242nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode,
243 unsigned int range)
244{
245 int status = 0;
246
247 spin_lock_bh(&inst->lock);
248
249 switch (mode) {
250 case NFULNL_COPY_NONE:
251 case NFULNL_COPY_META:
252 inst->copy_mode = mode;
253 inst->copy_range = 0;
254 break;
255
256 case NFULNL_COPY_PACKET:
257 inst->copy_mode = mode;
258 /* we're using struct nfattr which has 16bit nfa_len */
259 if (range > 0xffff)
260 inst->copy_range = 0xffff;
261 else
262 inst->copy_range = range;
263 break;
264
265 default:
266 status = -EINVAL;
267 break;
268 }
269
270 spin_unlock_bh(&inst->lock);
271
272 return status;
273}
274
275static int
276nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz)
277{
278 int status;
279
280 spin_lock_bh(&inst->lock);
281 if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT)
282 status = -ERANGE;
283 else if (nlbufsiz > 131072)
284 status = -ERANGE;
285 else {
286 inst->nlbufsiz = nlbufsiz;
287 status = 0;
288 }
289 spin_unlock_bh(&inst->lock);
290
291 return status;
292}
293
294static int
295nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
296{
297 spin_lock_bh(&inst->lock);
298 inst->flushtimeout = timeout;
299 spin_unlock_bh(&inst->lock);
300
301 return 0;
302}
303
304static int
305nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
306{
307 spin_lock_bh(&inst->lock);
308 inst->qthreshold = qthresh;
309 spin_unlock_bh(&inst->lock);
310
311 return 0;
312}
313
314static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size,
315 unsigned int pkt_size)
316{
317 struct sk_buff *skb;
318
319 UDEBUG("entered (%u, %u)\n", inst_size, pkt_size);
320
321 /* alloc skb which should be big enough for a whole multipart
322 * message. WARNING: has to be <= 128k due to slab restrictions */
323
324 skb = alloc_skb(inst_size, GFP_ATOMIC);
325 if (!skb) {
326 PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n",
327 inst_size);
328
329 /* try to allocate only as much as we need for current
330 * packet */
331
332 skb = alloc_skb(pkt_size, GFP_ATOMIC);
333 if (!skb)
334 PRINTR("nfnetlink_log: can't even alloc %u bytes\n",
335 pkt_size);
336 }
337
338 return skb;
339}
340
341static int
342__nfulnl_send(struct nfulnl_instance *inst)
343{
344 int status;
345
346 if (timer_pending(&inst->timer))
347 del_timer(&inst->timer);
348
349 if (inst->qlen > 1)
350 inst->lastnlh->nlmsg_type = NLMSG_DONE;
351
352 status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT);
353 if (status < 0) {
354 UDEBUG("netlink_unicast() failed\n");
355 /* FIXME: statistics */
356 }
357
358 inst->qlen = 0;
359 inst->skb = NULL;
360 inst->lastnlh = NULL;
361
362 return status;
363}
364
365static void nfulnl_timer(unsigned long data)
366{
367 struct nfulnl_instance *inst = (struct nfulnl_instance *)data;
368
369 UDEBUG("timer function called, flushing buffer\n");
370
371 spin_lock_bh(&inst->lock);
372 __nfulnl_send(inst);
373 instance_put(inst);
374 spin_unlock_bh(&inst->lock);
375}
376
377static inline int
378__build_packet_message(struct nfulnl_instance *inst,
379 const struct sk_buff *skb,
380 unsigned int data_len,
381 unsigned int pf,
382 unsigned int hooknum,
383 const struct net_device *indev,
384 const struct net_device *outdev,
385 const struct nf_loginfo *li,
386 const char *prefix)
387{
388 unsigned char *old_tail;
389 struct nfulnl_msg_packet_hdr pmsg;
390 struct nlmsghdr *nlh;
391 struct nfgenmsg *nfmsg;
392 u_int32_t tmp_uint;
393
394 UDEBUG("entered\n");
395
396 old_tail = inst->skb->tail;
397 nlh = NLMSG_PUT(inst->skb, 0, 0,
398 NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
399 sizeof(struct nfgenmsg));
400 nfmsg = NLMSG_DATA(nlh);
401 nfmsg->nfgen_family = pf;
402 nfmsg->version = NFNETLINK_V0;
403 nfmsg->res_id = htons(inst->group_num);
404
405 pmsg.hw_protocol = htons(skb->protocol);
406 pmsg.hook = hooknum;
407
408 NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg);
409
410 if (prefix) {
411 int slen = strlen(prefix);
412 if (slen > NFULNL_PREFIXLEN)
413 slen = NFULNL_PREFIXLEN;
414 NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix);
415 }
416
417 if (indev) {
418 tmp_uint = htonl(indev->ifindex);
419#ifndef CONFIG_BRIDGE_NETFILTER
420 NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint),
421 &tmp_uint);
422#else
423 if (pf == PF_BRIDGE) {
424 * Case 1: indev is physical input device, we need to
425 * look for bridge group (when called from
426 * netfilter_bridge) */
427 NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
428 sizeof(tmp_uint), &tmp_uint);
429 /* this is the bridge group "brX" */
430 tmp_uint = htonl(indev->br_port->br->dev->ifindex);
431 NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
432 sizeof(tmp_uint), &tmp_uint);
433 } else {
434 /* Case 2: indev is bridge group, we need to look for
435 * physical device (when called from ipv4) */
436 NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
437 sizeof(tmp_uint), &tmp_uint);
438 if (skb->nf_bridge && skb->nf_bridge->physindev) {
439 tmp_uint =
440 htonl(skb->nf_bridge->physindev->ifindex);
441 NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
442 sizeof(tmp_uint), &tmp_uint);
443 }
444 }
445#endif
446 }
447
448 if (outdev) {
449 tmp_uint = htonl(outdev->ifindex);
450#ifndef CONFIG_BRIDGE_NETFILTER
451 NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint),
452 &tmp_uint);
453#else
454 if (pf == PF_BRIDGE) {
455 /* Case 1: outdev is physical output device, we need to
456 * look for bridge group (when called from
457 * netfilter_bridge) */
458 NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
459 sizeof(tmp_uint), &tmp_uint);
460 /* this is the bridge group "brX" */
461 tmp_uint = htonl(outdev->br_port->br->dev->ifindex);
462 NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
463 sizeof(tmp_uint), &tmp_uint);
464 } else {
465 * Case 2: outdev is a bridge group, we need to look
466 * for physical device (when called from ipv4) */
467 NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
468 sizeof(tmp_uint), &tmp_uint);
469 if (skb->nf_bridge && skb->nf_bridge->physoutdev) {
470 tmp_uint =
471 htonl(skb->nf_bridge->physoutdev->ifindex);
472 NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
473 sizeof(tmp_uint), &tmp_uint);
474 }
475 }
476#endif
477 }
478
479 if (skb->nfmark) {
480 tmp_uint = htonl(skb->nfmark);
481 NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint);
482 }
483
484 if (indev && skb->dev && skb->dev->hard_header_parse) {
485 struct nfulnl_msg_packet_hw phw;
486
487 phw.hw_addrlen =
488 skb->dev->hard_header_parse((struct sk_buff *)skb,
489 phw.hw_addr);
490 phw.hw_addrlen = htons(phw.hw_addrlen);
491 NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw);
492 }
493
494 if (skb->tstamp.off_sec) {
495 struct nfulnl_msg_packet_timestamp ts;
496
497 ts.sec = cpu_to_be64(skb_tv_base.tv_sec + skb->tstamp.off_sec);
498 ts.usec = cpu_to_be64(skb_tv_base.tv_usec + skb->tstamp.off_usec);
499
500 NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts);
501 }
502
503 /* UID */
504 if (skb->sk) {
505 read_lock_bh(&skb->sk->sk_callback_lock);
506 if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
507 u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid);
508 /* need to unlock here since NFA_PUT may goto */
509 read_unlock_bh(&skb->sk->sk_callback_lock);
510 NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid);
511 } else
512 read_unlock_bh(&skb->sk->sk_callback_lock);
513 }
514
515 if (data_len) {
516 struct nfattr *nfa;
517 int size = NFA_LENGTH(data_len);
518
519 if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) {
520 printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
521 goto nlmsg_failure;
522 }
523
524 nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size));
525 nfa->nfa_type = NFULA_PAYLOAD;
526 nfa->nfa_len = size;
527
528 if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len))
529 BUG();
530 }
531
532 nlh->nlmsg_len = inst->skb->tail - old_tail;
533 return 0;
534
535nlmsg_failure:
536 UDEBUG("nlmsg_failure\n");
537nfattr_failure:
538 PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
539 return -1;
540}
541
542#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
543
544static struct nf_loginfo default_loginfo = {
545 .type = NF_LOG_TYPE_ULOG,
546 .u = {
547 .ulog = {
548 .copy_len = 0xffff,
549 .group = 0,
550 .qthreshold = 1,
551 },
552 },
553};
554
555/* log handler for internal netfilter logging api */
556static void
557nfulnl_log_packet(unsigned int pf,
558 unsigned int hooknum,
559 const struct sk_buff *skb,
560 const struct net_device *in,
561 const struct net_device *out,
562 const struct nf_loginfo *li_user,
563 const char *prefix)
564{
565 unsigned int size, data_len;
566 struct nfulnl_instance *inst;
567 const struct nf_loginfo *li;
568 unsigned int qthreshold;
569 unsigned int nlbufsiz;
570
571 if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
572 li = li_user;
573 else
574 li = &default_loginfo;
575
576 inst = instance_lookup_get(li->u.ulog.group);
577 if (!inst)
578 inst = instance_lookup_get(0);
579 if (!inst) {
580 PRINTR("nfnetlink_log: trying to log packet, "
581 "but no instance for group %u\n", li->u.ulog.group);
582 return;
583 }
584
585 /* all macros expand to constant values at compile time */
586 /* FIXME: do we want to make the size calculation conditional based on
587 * what is actually present? way more branches and checks, but more
588 * memory efficient... */
589 size = NLMSG_SPACE(sizeof(struct nfgenmsg))
590 + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr))
591 + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
592 + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
593#ifdef CONFIG_BRIDGE_NETFILTER
594 + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
595 + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
596#endif
597 + NFA_SPACE(sizeof(u_int32_t)) /* mark */
598 + NFA_SPACE(sizeof(u_int32_t)) /* uid */
599 + NFA_SPACE(NFULNL_PREFIXLEN) /* prefix */
600 + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw))
601 + NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp));
602
603 UDEBUG("initial size=%u\n", size);
604
605 spin_lock_bh(&inst->lock);
606
607 qthreshold = inst->qthreshold;
608 /* per-rule qthreshold overrides per-instance */
609 if (qthreshold > li->u.ulog.qthreshold)
610 qthreshold = li->u.ulog.qthreshold;
611
612 switch (inst->copy_mode) {
613 case NFULNL_COPY_META:
614 case NFULNL_COPY_NONE:
615 data_len = 0;
616 break;
617
618 case NFULNL_COPY_PACKET:
619 if (inst->copy_range == 0
620 || inst->copy_range > skb->len)
621 data_len = skb->len;
622 else
623 data_len = inst->copy_range;
624
625 size += NFA_SPACE(data_len);
626 UDEBUG("copy_packet, therefore size now %u\n", size);
627 break;
628
629 default:
630 spin_unlock_bh(&inst->lock);
631 instance_put(inst);
632 return;
633 }
634
635 if (size > inst->nlbufsiz)
636 nlbufsiz = size;
637 else
638 nlbufsiz = inst->nlbufsiz;
639
640 if (!inst->skb) {
641 if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
642 UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
643 inst->nlbufsiz, size);
644 goto alloc_failure;
645 }
646 } else if (inst->qlen >= qthreshold ||
647 size > skb_tailroom(inst->skb)) {
648 /* either the queue len is too high or we don't have
649 * enough room in the skb left. flush to userspace. */
650 UDEBUG("flushing old skb\n");
651
652 __nfulnl_send(inst);
653
654 if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
655 UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
656 inst->nlbufsiz, size);
657 goto alloc_failure;
658 }
659 }
660
661 UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold);
662 inst->qlen++;
663
664 __build_packet_message(inst, skb, data_len, pf,
665 hooknum, in, out, li, prefix);
666
667 /* timer_pending always called within inst->lock, so there
668 * is no chance of a race here */
669 if (!timer_pending(&inst->timer)) {
670 instance_get(inst);
671 inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
672 add_timer(&inst->timer);
673 }
674 spin_unlock_bh(&inst->lock);
675
676 return;
677
678alloc_failure:
679 spin_unlock_bh(&inst->lock);
680 instance_put(inst);
681 UDEBUG("error allocating skb\n");
682 /* FIXME: statistics */
683}
684
685static int
686nfulnl_rcv_nl_event(struct notifier_block *this,
687 unsigned long event, void *ptr)
688{
689 struct netlink_notify *n = ptr;
690
691 if (event == NETLINK_URELEASE &&
692 n->protocol == NETLINK_NETFILTER && n->pid) {
693 int i;
694
695 /* destroy all instances for this pid */
696 write_lock_bh(&instances_lock);
697 for (i = 0; i < INSTANCE_BUCKETS; i++) {
698 struct hlist_node *tmp, *t2;
699 struct nfulnl_instance *inst;
700 struct hlist_head *head = &instance_table[i];
701
702 hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
703 UDEBUG("node = %p\n", inst);
704 if (n->pid == inst->peer_pid)
705 __instance_destroy(inst);
706 }
707 }
708 write_unlock_bh(&instances_lock);
709 }
710 return NOTIFY_DONE;
711}
712
713static struct notifier_block nfulnl_rtnl_notifier = {
714 .notifier_call = nfulnl_rcv_nl_event,
715};
716
717static int
718nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
719 struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
720{
721 return -ENOTSUPP;
722}
723
724static struct nf_logger nfulnl_logger = {
725 .name = "nfnetlink_log",
726 .logfn = &nfulnl_log_packet,
727 .me = THIS_MODULE,
728};
729
730static const int nfula_min[NFULA_MAX] = {
731 [NFULA_PACKET_HDR-1] = sizeof(struct nfulnl_msg_packet_hdr),
732 [NFULA_MARK-1] = sizeof(u_int32_t),
733 [NFULA_TIMESTAMP-1] = sizeof(struct nfulnl_msg_packet_timestamp),
734 [NFULA_IFINDEX_INDEV-1] = sizeof(u_int32_t),
735 [NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t),
736 [NFULA_HWADDR-1] = sizeof(struct nfulnl_msg_packet_hw),
737 [NFULA_PAYLOAD-1] = 0,
738 [NFULA_PREFIX-1] = 0,
739 [NFULA_UID-1] = sizeof(u_int32_t),
740};
741
742static const int nfula_cfg_min[NFULA_CFG_MAX] = {
743 [NFULA_CFG_CMD-1] = sizeof(struct nfulnl_msg_config_cmd),
744 [NFULA_CFG_MODE-1] = sizeof(struct nfulnl_msg_config_mode),
745 [NFULA_CFG_TIMEOUT-1] = sizeof(u_int32_t),
746 [NFULA_CFG_QTHRESH-1] = sizeof(u_int32_t),
747 [NFULA_CFG_NLBUFSIZ-1] = sizeof(u_int32_t),
748};
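
Both tables above feed nfattr_bad_size(), which rejects a message when any attribute that is present carries a payload shorter than its declared minimum. A simplified restatement of that check, not the kernel's exact code; attr_len and present are stand-ins for the parsed attribute table:

	#include <stddef.h>

	int attrs_bad_size(const size_t *attr_len, const int *present,
			   int maxtype, const int *min)
	{
		int i;

		for (i = 0; i < maxtype; i++) {
			if (present[i] && attr_len[i] < (size_t)min[i])
				return 1;	/* reject the whole message */
		}
		return 0;
	}
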
749
750static int
751nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
752 struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp)
753{
754 struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
755 u_int16_t group_num = ntohs(nfmsg->res_id);
756 struct nfulnl_instance *inst;
757 int ret = 0;
758
759 UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
760
761 if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) {
762 UDEBUG("bad attribute size\n");
763 return -EINVAL;
764 }
765
766 inst = instance_lookup_get(group_num);
767 if (nfula[NFULA_CFG_CMD-1]) {
768 u_int8_t pf = nfmsg->nfgen_family;
769 struct nfulnl_msg_config_cmd *cmd;
770 cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]);
771		UDEBUG("found CFG_CMD\n");
772
773 switch (cmd->command) {
774 case NFULNL_CFG_CMD_BIND:
775 if (inst) {
776 ret = -EBUSY;
777 goto out_put;
778 }
779
780 inst = instance_create(group_num,
781 NETLINK_CB(skb).pid);
782 if (!inst) {
783 ret = -EINVAL;
784 goto out_put;
785 }
786 break;
787 case NFULNL_CFG_CMD_UNBIND:
788 if (!inst) {
789 ret = -ENODEV;
790 goto out_put;
791 }
792
793 if (inst->peer_pid != NETLINK_CB(skb).pid) {
794 ret = -EPERM;
795 goto out_put;
796 }
797
798 instance_destroy(inst);
799 break;
800 case NFULNL_CFG_CMD_PF_BIND:
801 UDEBUG("registering log handler for pf=%u\n", pf);
802 ret = nf_log_register(pf, &nfulnl_logger);
803 break;
804 case NFULNL_CFG_CMD_PF_UNBIND:
805 UDEBUG("unregistering log handler for pf=%u\n", pf);
806 /* This is a bug and a feature. We cannot unregister
807 * other handlers, like nfnetlink_inst can */
808 nf_log_unregister_pf(pf);
809 break;
810 default:
811 ret = -EINVAL;
812 break;
813 }
814 } else {
815 if (!inst) {
816 UDEBUG("no config command, and no instance for "
817			       "group=%u pid=%u => ENOENT\n",
818 group_num, NETLINK_CB(skb).pid);
819 ret = -ENOENT;
820 goto out_put;
821 }
822
823 if (inst->peer_pid != NETLINK_CB(skb).pid) {
824 UDEBUG("no config command, and wrong pid\n");
825 ret = -EPERM;
826 goto out_put;
827 }
828 }
829
830 if (nfula[NFULA_CFG_MODE-1]) {
831 struct nfulnl_msg_config_mode *params;
832 params = NFA_DATA(nfula[NFULA_CFG_MODE-1]);
833
834 nfulnl_set_mode(inst, params->copy_mode,
835 ntohs(params->copy_range));
836 }
837
838 if (nfula[NFULA_CFG_TIMEOUT-1]) {
839 u_int32_t timeout =
840 *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]);
841
842 nfulnl_set_timeout(inst, ntohl(timeout));
843 }
844
845 if (nfula[NFULA_CFG_NLBUFSIZ-1]) {
846 u_int32_t nlbufsiz =
847 *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]);
848
849 nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
850 }
851
852 if (nfula[NFULA_CFG_QTHRESH-1]) {
853 u_int32_t qthresh =
854			*(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]);
855
856 nfulnl_set_qthresh(inst, ntohl(qthresh));
857 }
858
859out_put:
860 instance_put(inst);
861 return ret;
862}
863
864static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
865 [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
866 .attr_count = NFULA_MAX,
867 .cap_required = CAP_NET_ADMIN, },
868 [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
869 .attr_count = NFULA_CFG_MAX,
870 .cap_required = CAP_NET_ADMIN },
871};
872
873static struct nfnetlink_subsystem nfulnl_subsys = {
874 .name = "log",
875 .subsys_id = NFNL_SUBSYS_ULOG,
876 .cb_count = NFULNL_MSG_MAX,
877 .cb = nfulnl_cb,
878};
879
880#ifdef CONFIG_PROC_FS
881struct iter_state {
882 unsigned int bucket;
883};
884
885static struct hlist_node *get_first(struct seq_file *seq)
886{
887 struct iter_state *st = seq->private;
888
889 if (!st)
890 return NULL;
891
892 for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
893 if (!hlist_empty(&instance_table[st->bucket]))
894 return instance_table[st->bucket].first;
895 }
896 return NULL;
897}
898
899static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
900{
901 struct iter_state *st = seq->private;
902
903 h = h->next;
904 while (!h) {
905 if (++st->bucket >= INSTANCE_BUCKETS)
906 return NULL;
907
908 h = instance_table[st->bucket].first;
909 }
910 return h;
911}
912
913static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
914{
915 struct hlist_node *head;
916 head = get_first(seq);
917
918 if (head)
919 while (pos && (head = get_next(seq, head)))
920 pos--;
921 return pos ? NULL : head;
922}
923
924static void *seq_start(struct seq_file *seq, loff_t *pos)
925{
926 read_lock_bh(&instances_lock);
927 return get_idx(seq, *pos);
928}
929
930static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
931{
932 (*pos)++;
933 return get_next(s, v);
934}
935
936static void seq_stop(struct seq_file *s, void *v)
937{
938 read_unlock_bh(&instances_lock);
939}
940
941static int seq_show(struct seq_file *s, void *v)
942{
943 const struct nfulnl_instance *inst = v;
944
945 return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",
946 inst->group_num,
947 inst->peer_pid, inst->qlen,
948 inst->copy_mode, inst->copy_range,
949 inst->flushtimeout, atomic_read(&inst->use));
950}
951
952static struct seq_operations nful_seq_ops = {
953 .start = seq_start,
954 .next = seq_next,
955 .stop = seq_stop,
956 .show = seq_show,
957};
958
959static int nful_open(struct inode *inode, struct file *file)
960{
961 struct seq_file *seq;
962 struct iter_state *is;
963 int ret;
964
965 is = kmalloc(sizeof(*is), GFP_KERNEL);
966 if (!is)
967 return -ENOMEM;
968 memset(is, 0, sizeof(*is));
969 ret = seq_open(file, &nful_seq_ops);
970 if (ret < 0)
971 goto out_free;
972 seq = file->private_data;
973 seq->private = is;
974 return ret;
975out_free:
976 kfree(is);
977 return ret;
978}
979
980static struct file_operations nful_file_ops = {
981 .owner = THIS_MODULE,
982 .open = nful_open,
983 .read = seq_read,
984 .llseek = seq_lseek,
985 .release = seq_release_private,
986};
987
988#endif /* PROC_FS */
989
990static int
991init_or_cleanup(int init)
992{
993 int i, status = -ENOMEM;
994#ifdef CONFIG_PROC_FS
995 struct proc_dir_entry *proc_nful;
996#endif
997
998 if (!init)
999 goto cleanup;
1000
1001 for (i = 0; i < INSTANCE_BUCKETS; i++)
1002 INIT_HLIST_HEAD(&instance_table[i]);
1003
1004 /* it's not really all that important to have a random value, so
1005 * we can do this from the init function, even if there hasn't
1006 * been that much entropy yet */
1007 get_random_bytes(&hash_init, sizeof(hash_init));
1008
1009 netlink_register_notifier(&nfulnl_rtnl_notifier);
1010 status = nfnetlink_subsys_register(&nfulnl_subsys);
1011 if (status < 0) {
1012 printk(KERN_ERR "log: failed to create netlink socket\n");
1013 goto cleanup_netlink_notifier;
1014 }
1015
1016#ifdef CONFIG_PROC_FS
1017 proc_nful = create_proc_entry("nfnetlink_log", 0440,
1018 proc_net_netfilter);
1019 if (!proc_nful)
1020 goto cleanup_subsys;
1021 proc_nful->proc_fops = &nful_file_ops;
1022#endif
1023
1024 return status;
1025
1026cleanup:
1027 nf_log_unregister_logger(&nfulnl_logger);
1028#ifdef CONFIG_PROC_FS
1029 remove_proc_entry("nfnetlink_log", proc_net_netfilter);
1030cleanup_subsys:
1031#endif
1032 nfnetlink_subsys_unregister(&nfulnl_subsys);
1033cleanup_netlink_notifier:
1034 netlink_unregister_notifier(&nfulnl_rtnl_notifier);
1035 return status;
1036}
1037
1038static int __init init(void)
1039{
1040
1041 return init_or_cleanup(1);
1042}
1043
1044static void __exit fini(void)
1045{
1046 init_or_cleanup(0);
1047}
1048
1049MODULE_DESCRIPTION("netfilter userspace logging");
1050MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
1051MODULE_LICENSE("GPL");
1052MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
1053
1054module_init(init);
1055module_exit(fini);
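
For context, the intended userspace peer of this module is the separate libnetfilter_log library (an assumption here, not part of this patch). A minimal reader that binds group 0, exercising the NFULNL_CFG_CMD_PF_BIND and NFULNL_CFG_CMD_BIND paths above, and prints each logged packet's prefix might look like this; build with gcc log.c -lnetfilter_log:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/socket.h>
	#include <libnetfilter_log/libnetfilter_log.h>

	static int cb(struct nflog_g_handle *gh, struct nfgenmsg *nfmsg,
		      struct nflog_data *nfa, void *data)
	{
		char *prefix = nflog_get_prefix(nfa);
		char *payload;
		int len = nflog_get_payload(nfa, &payload);

		printf("prefix '%s', %d payload bytes\n",
		       prefix ? prefix : "", len);
		return 0;
	}

	int main(void)
	{
		struct nflog_handle *h = nflog_open();
		struct nflog_g_handle *gh;
		char buf[4096];
		int rv;

		if (!h)
			exit(1);
		nflog_bind_pf(h, AF_INET);	/* NFULNL_CFG_CMD_PF_BIND */
		gh = nflog_bind_group(h, 0);	/* NFULNL_CFG_CMD_BIND */
		nflog_set_mode(gh, NFULNL_COPY_PACKET, 0xffff);
		nflog_callback_register(gh, &cb, NULL);

		while ((rv = recv(nflog_fd(h), buf, sizeof(buf), 0)) > 0)
			nflog_handle_packet(h, buf, rv);	/* dispatches to cb */

		nflog_unbind_group(gh);
		nflog_close(h);
		return 0;
	}
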
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
new file mode 100644
index 000000000000..249bddb28acd
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue.c
@@ -0,0 +1,1121 @@
1/*
2 * This is a module which is used for queueing packets and communicating with
3 * userspace via nfnetlink.
4 *
5 * (C) 2005 by Harald Welte <laforge@netfilter.org>
6 *
7 * Based on the old ipv4-only ip_queue.c:
8 * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
9 * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 *
15 */
16#include <linux/module.h>
17#include <linux/skbuff.h>
18#include <linux/init.h>
19#include <linux/spinlock.h>
20#include <linux/notifier.h>
21#include <linux/netdevice.h>
22#include <linux/netfilter.h>
23#include <linux/proc_fs.h>
24#include <linux/netfilter_ipv4.h>
25#include <linux/netfilter_ipv6.h>
26#include <linux/netfilter/nfnetlink.h>
27#include <linux/netfilter/nfnetlink_queue.h>
28#include <linux/list.h>
29#include <net/sock.h>
30
31#include <asm/atomic.h>
32
33#ifdef CONFIG_BRIDGE_NETFILTER
34#include "../bridge/br_private.h"
35#endif
36
37#define NFQNL_QMAX_DEFAULT 1024
38
39#if 0
40#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \
41 __FILE__, __LINE__, __FUNCTION__, \
42 ## args)
43#else
44#define QDEBUG(x, ...)
45#endif
46
47struct nfqnl_queue_entry {
48 struct list_head list;
49 struct nf_info *info;
50 struct sk_buff *skb;
51 unsigned int id;
52};
53
54struct nfqnl_instance {
55 struct hlist_node hlist; /* global list of queues */
56 atomic_t use;
57
58 int peer_pid;
59 unsigned int queue_maxlen;
60 unsigned int copy_range;
61 unsigned int queue_total;
62 unsigned int queue_dropped;
63 unsigned int queue_user_dropped;
64
65 atomic_t id_sequence; /* 'sequence' of pkt ids */
66
67 u_int16_t queue_num; /* number of this queue */
68 u_int8_t copy_mode;
69
70 spinlock_t lock;
71
72 struct list_head queue_list; /* packets in queue */
73};
74
75typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long);
76
77static DEFINE_RWLOCK(instances_lock);
78
79#define INSTANCE_BUCKETS 16
80static struct hlist_head instance_table[INSTANCE_BUCKETS];
81
82static inline u_int8_t instance_hashfn(u_int16_t queue_num)
83{
84 return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS;
85}
86
87static struct nfqnl_instance *
88__instance_lookup(u_int16_t queue_num)
89{
90 struct hlist_head *head;
91 struct hlist_node *pos;
92 struct nfqnl_instance *inst;
93
94 head = &instance_table[instance_hashfn(queue_num)];
95 hlist_for_each_entry(inst, pos, head, hlist) {
96 if (inst->queue_num == queue_num)
97 return inst;
98 }
99 return NULL;
100}
101
102static struct nfqnl_instance *
103instance_lookup_get(u_int16_t queue_num)
104{
105 struct nfqnl_instance *inst;
106
107 read_lock_bh(&instances_lock);
108 inst = __instance_lookup(queue_num);
109 if (inst)
110 atomic_inc(&inst->use);
111 read_unlock_bh(&instances_lock);
112
113 return inst;
114}
115
116static void
117instance_put(struct nfqnl_instance *inst)
118{
119 if (inst && atomic_dec_and_test(&inst->use)) {
120 QDEBUG("kfree(inst=%p)\n", inst);
121 kfree(inst);
122 }
123}
124
125static struct nfqnl_instance *
126instance_create(u_int16_t queue_num, int pid)
127{
128 struct nfqnl_instance *inst;
129
130 QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid);
131
132 write_lock_bh(&instances_lock);
133 if (__instance_lookup(queue_num)) {
134 inst = NULL;
135 QDEBUG("aborting, instance already exists\n");
136 goto out_unlock;
137 }
138
139 inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
140 if (!inst)
141 goto out_unlock;
142
143 memset(inst, 0, sizeof(*inst));
144 inst->queue_num = queue_num;
145 inst->peer_pid = pid;
146 inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
147	inst->copy_range = 0xffff;
148 inst->copy_mode = NFQNL_COPY_NONE;
149 atomic_set(&inst->id_sequence, 0);
150 /* needs to be two, since we _put() after creation */
151 atomic_set(&inst->use, 2);
152 inst->lock = SPIN_LOCK_UNLOCKED;
153 INIT_LIST_HEAD(&inst->queue_list);
154
155 if (!try_module_get(THIS_MODULE))
156 goto out_free;
157
158 hlist_add_head(&inst->hlist,
159 &instance_table[instance_hashfn(queue_num)]);
160
161 write_unlock_bh(&instances_lock);
162
163 QDEBUG("successfully created new instance\n");
164
165 return inst;
166
167out_free:
168 kfree(inst);
169out_unlock:
170 write_unlock_bh(&instances_lock);
171 return NULL;
172}
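
The use count of two in instance_create() encodes a common convention: one reference is owned by the global hash table, the other by the creating caller, who drops it via instance_put() once done with the pointer. The same convention restated as a self-contained userspace sketch (names illustrative, not the kernel's):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct obj {
		atomic_int use;
	};

	struct obj *obj_create(void)
	{
		struct obj *o = calloc(1, sizeof(*o));

		if (o)
			atomic_store(&o->use, 2);	/* table ref + caller ref */
		return o;
	}

	void obj_put(struct obj *o)
	{
		/* free on the 1 -> 0 transition, like atomic_dec_and_test() */
		if (o && atomic_fetch_sub(&o->use, 1) == 1)
			free(o);
	}
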
173
174static void nfqnl_flush(struct nfqnl_instance *queue, int verdict);
175
176static void
177_instance_destroy2(struct nfqnl_instance *inst, int lock)
178{
179 /* first pull it out of the global list */
180 if (lock)
181 write_lock_bh(&instances_lock);
182
183 QDEBUG("removing instance %p (queuenum=%u) from hash\n",
184 inst, inst->queue_num);
185 hlist_del(&inst->hlist);
186
187 if (lock)
188 write_unlock_bh(&instances_lock);
189
190 /* then flush all pending skbs from the queue */
191 nfqnl_flush(inst, NF_DROP);
192
193 /* and finally put the refcount */
194 instance_put(inst);
195
196 module_put(THIS_MODULE);
197}
198
199static inline void
200__instance_destroy(struct nfqnl_instance *inst)
201{
202 _instance_destroy2(inst, 0);
203}
204
205static inline void
206instance_destroy(struct nfqnl_instance *inst)
207{
208 _instance_destroy2(inst, 1);
209}
210
211
212
213static void
214issue_verdict(struct nfqnl_queue_entry *entry, int verdict)
215{
216 QDEBUG("entering for entry %p, verdict %u\n", entry, verdict);
217
218	/* The TCP input path (and probably other bits) assumes it is
219	 * called from softirq context, not from a syscall the way
220	 * issue_verdict is called. It can deadlock on locks taken from
221	 * the timer softirq, e.g. We emulate this via local_bh_disable() */
222
223 local_bh_disable();
224 nf_reinject(entry->skb, entry->info, verdict);
225 local_bh_enable();
226
227 kfree(entry);
228}
229
230static inline void
231__enqueue_entry(struct nfqnl_instance *queue,
232 struct nfqnl_queue_entry *entry)
233{
234 list_add(&entry->list, &queue->queue_list);
235 queue->queue_total++;
236}
237
238/*
239 * Find and return a queued entry matched by cmpfn, or return the last
240 * entry if cmpfn is NULL.
241 */
242static inline struct nfqnl_queue_entry *
243__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
244 unsigned long data)
245{
246 struct list_head *p;
247
248 list_for_each_prev(p, &queue->queue_list) {
249 struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p;
250
251 if (!cmpfn || cmpfn(entry, data))
252 return entry;
253 }
254 return NULL;
255}
256
257static inline void
258__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry)
259{
260 list_del(&entry->list);
261 q->queue_total--;
262}
263
264static inline struct nfqnl_queue_entry *
265__find_dequeue_entry(struct nfqnl_instance *queue,
266 nfqnl_cmpfn cmpfn, unsigned long data)
267{
268 struct nfqnl_queue_entry *entry;
269
270 entry = __find_entry(queue, cmpfn, data);
271 if (entry == NULL)
272 return NULL;
273
274 __dequeue_entry(queue, entry);
275 return entry;
276}
277
278
279static inline void
280__nfqnl_flush(struct nfqnl_instance *queue, int verdict)
281{
282 struct nfqnl_queue_entry *entry;
283
284 while ((entry = __find_dequeue_entry(queue, NULL, 0)))
285 issue_verdict(entry, verdict);
286}
287
288static inline int
289__nfqnl_set_mode(struct nfqnl_instance *queue,
290 unsigned char mode, unsigned int range)
291{
292 int status = 0;
293
294 switch (mode) {
295 case NFQNL_COPY_NONE:
296 case NFQNL_COPY_META:
297 queue->copy_mode = mode;
298 queue->copy_range = 0;
299 break;
300
301 case NFQNL_COPY_PACKET:
302 queue->copy_mode = mode;
303 /* we're using struct nfattr which has 16bit nfa_len */
304 if (range > 0xffff)
305 queue->copy_range = 0xffff;
306 else
307 queue->copy_range = range;
308 break;
309
310 default:
311 status = -EINVAL;
312
313 }
314 return status;
315}
316
317static struct nfqnl_queue_entry *
318find_dequeue_entry(struct nfqnl_instance *queue,
319 nfqnl_cmpfn cmpfn, unsigned long data)
320{
321 struct nfqnl_queue_entry *entry;
322
323 spin_lock_bh(&queue->lock);
324 entry = __find_dequeue_entry(queue, cmpfn, data);
325 spin_unlock_bh(&queue->lock);
326
327 return entry;
328}
329
330static void
331nfqnl_flush(struct nfqnl_instance *queue, int verdict)
332{
333 spin_lock_bh(&queue->lock);
334 __nfqnl_flush(queue, verdict);
335 spin_unlock_bh(&queue->lock);
336}
337
338static struct sk_buff *
339nfqnl_build_packet_message(struct nfqnl_instance *queue,
340 struct nfqnl_queue_entry *entry, int *errp)
341{
342 unsigned char *old_tail;
343 size_t size;
344 size_t data_len = 0;
345 struct sk_buff *skb;
346 struct nfqnl_msg_packet_hdr pmsg;
347 struct nlmsghdr *nlh;
348 struct nfgenmsg *nfmsg;
349 unsigned int tmp_uint;
350
351 QDEBUG("entered\n");
352
353 /* all macros expand to constant values at compile time */
354 size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr))
355		+ NLMSG_SPACE(sizeof(u_int32_t))	/* indev ifindex */
356		+ NLMSG_SPACE(sizeof(u_int32_t))	/* outdev ifindex */
357#ifdef CONFIG_BRIDGE_NETFILTER
358		+ NLMSG_SPACE(sizeof(u_int32_t))	/* physindev ifindex */
359		+ NLMSG_SPACE(sizeof(u_int32_t))	/* physoutdev ifindex */
360#endif
361 + NLMSG_SPACE(sizeof(u_int32_t)) /* mark */
362 + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw))
363 + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp));
364
365 spin_lock_bh(&queue->lock);
366
367 switch (queue->copy_mode) {
368 case NFQNL_COPY_META:
369 case NFQNL_COPY_NONE:
370 data_len = 0;
371 break;
372
373 case NFQNL_COPY_PACKET:
374 if (queue->copy_range == 0
375 || queue->copy_range > entry->skb->len)
376 data_len = entry->skb->len;
377 else
378 data_len = queue->copy_range;
379
380 size += NLMSG_SPACE(data_len);
381 break;
382
383 default:
384 *errp = -EINVAL;
385 spin_unlock_bh(&queue->lock);
386 return NULL;
387 }
388
389 spin_unlock_bh(&queue->lock);
390
391 skb = alloc_skb(size, GFP_ATOMIC);
392 if (!skb)
393 goto nlmsg_failure;
394
395	old_tail = skb->tail;
396 nlh = NLMSG_PUT(skb, 0, 0,
397 NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
398 sizeof(struct nfgenmsg));
399 nfmsg = NLMSG_DATA(nlh);
400 nfmsg->nfgen_family = entry->info->pf;
401 nfmsg->version = NFNETLINK_V0;
402 nfmsg->res_id = htons(queue->queue_num);
403
404 pmsg.packet_id = htonl(entry->id);
405	pmsg.hw_protocol	= entry->skb->protocol;
406 pmsg.hook = entry->info->hook;
407
408 NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg);
409
410 if (entry->info->indev) {
411 tmp_uint = htonl(entry->info->indev->ifindex);
412#ifndef CONFIG_BRIDGE_NETFILTER
413 NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint);
414#else
415 if (entry->info->pf == PF_BRIDGE) {
416 /* Case 1: indev is physical input device, we need to
417 * look for bridge group (when called from
418 * netfilter_bridge) */
419 NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint),
420 &tmp_uint);
421 /* this is the bridge group "brX" */
422 tmp_uint = htonl(entry->info->indev->br_port->br->dev->ifindex);
423 NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
424 &tmp_uint);
425 } else {
426 /* Case 2: indev is bridge group, we need to look for
427 * physical device (when called from ipv4) */
428 NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
429 &tmp_uint);
430 if (entry->skb->nf_bridge
431 && entry->skb->nf_bridge->physindev) {
432 tmp_uint = htonl(entry->skb->nf_bridge->physindev->ifindex);
433 NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV,
434 sizeof(tmp_uint), &tmp_uint);
435 }
436 }
437#endif
438 }
439
440 if (entry->info->outdev) {
441 tmp_uint = htonl(entry->info->outdev->ifindex);
442#ifndef CONFIG_BRIDGE_NETFILTER
443 NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint);
444#else
445 if (entry->info->pf == PF_BRIDGE) {
446 /* Case 1: outdev is physical output device, we need to
447 * look for bridge group (when called from
448 * netfilter_bridge) */
449 NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint),
450 &tmp_uint);
451 /* this is the bridge group "brX" */
452 tmp_uint = htonl(entry->info->outdev->br_port->br->dev->ifindex);
453 NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
454 &tmp_uint);
455 } else {
456 /* Case 2: outdev is bridge group, we need to look for
457 * physical output device (when called from ipv4) */
458 NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
459 &tmp_uint);
460 if (entry->skb->nf_bridge
461 && entry->skb->nf_bridge->physoutdev) {
462 tmp_uint = htonl(entry->skb->nf_bridge->physoutdev->ifindex);
463 NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV,
464 sizeof(tmp_uint), &tmp_uint);
465 }
466 }
467#endif
468 }
469
470 if (entry->skb->nfmark) {
471 tmp_uint = htonl(entry->skb->nfmark);
472 NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint);
473 }
474
475 if (entry->info->indev && entry->skb->dev
476 && entry->skb->dev->hard_header_parse) {
477 struct nfqnl_msg_packet_hw phw;
478
479 phw.hw_addrlen =
480 entry->skb->dev->hard_header_parse(entry->skb,
481 phw.hw_addr);
482 phw.hw_addrlen = htons(phw.hw_addrlen);
483 NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw);
484 }
485
486 if (entry->skb->tstamp.off_sec) {
487 struct nfqnl_msg_packet_timestamp ts;
488
489 ts.sec = cpu_to_be64(skb_tv_base.tv_sec + entry->skb->tstamp.off_sec);
490 ts.usec = cpu_to_be64(skb_tv_base.tv_usec + entry->skb->tstamp.off_usec);
491
492 NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts);
493 }
494
495 if (data_len) {
496 struct nfattr *nfa;
497 int size = NFA_LENGTH(data_len);
498
499 if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) {
500 printk(KERN_WARNING "nf_queue: no tailroom!\n");
501 goto nlmsg_failure;
502 }
503
504 nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size));
505 nfa->nfa_type = NFQA_PAYLOAD;
506 nfa->nfa_len = size;
507
508 if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len))
509 BUG();
510 }
511
512 nlh->nlmsg_len = skb->tail - old_tail;
513 return skb;
514
515nlmsg_failure:
516nfattr_failure:
517 if (skb)
518 kfree_skb(skb);
519 *errp = -EINVAL;
520 if (net_ratelimit())
521 printk(KERN_ERR "nf_queue: error creating packet message\n");
522 return NULL;
523}
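
All of the NFA_PUT() calls above append attributes in the usual netlink TLV layout: a 4-byte header (16-bit length, 16-bit type) followed by the payload, with everything padded to 4-byte alignment. A self-contained restatement of the size arithmetic; the macro bodies here are assumed to mirror the kernel's, following the familiar rtnetlink RTA_* convention:

	#include <stdint.h>
	#include <stdio.h>

	struct nfattr {
		uint16_t nfa_len;	/* header + payload, unpadded */
		uint16_t nfa_type;
	};

	#define NFA_ALIGNTO	4
	#define NFA_ALIGN(len)	(((len) + NFA_ALIGNTO - 1) & ~(NFA_ALIGNTO - 1))
	#define NFA_LENGTH(len)	(NFA_ALIGN(sizeof(struct nfattr)) + (len))
	#define NFA_SPACE(len)	NFA_ALIGN(NFA_LENGTH(len))

	int main(void)
	{
		/* 6-byte payload: 4-byte header + 6 bytes = 10, padded to 12 */
		printf("NFA_LENGTH(6) = %zu, NFA_SPACE(6) = %zu\n",
		       (size_t)NFA_LENGTH(6), (size_t)NFA_SPACE(6));
		return 0;
	}
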
524
525static int
526nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
527 unsigned int queuenum, void *data)
528{
529 int status = -EINVAL;
530 struct sk_buff *nskb;
531 struct nfqnl_instance *queue;
532 struct nfqnl_queue_entry *entry;
533
534 QDEBUG("entered\n");
535
536 queue = instance_lookup_get(queuenum);
537 if (!queue) {
538 QDEBUG("no queue instance matching\n");
539 return -EINVAL;
540 }
541
542 if (queue->copy_mode == NFQNL_COPY_NONE) {
543 QDEBUG("mode COPY_NONE, aborting\n");
544 status = -EAGAIN;
545 goto err_out_put;
546 }
547
548 entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
549 if (entry == NULL) {
550 if (net_ratelimit())
551 printk(KERN_ERR
552 "nf_queue: OOM in nfqnl_enqueue_packet()\n");
553 status = -ENOMEM;
554 goto err_out_put;
555 }
556
557 entry->info = info;
558 entry->skb = skb;
559 entry->id = atomic_inc_return(&queue->id_sequence);
560
561 nskb = nfqnl_build_packet_message(queue, entry, &status);
562 if (nskb == NULL)
563 goto err_out_free;
564
565 spin_lock_bh(&queue->lock);
566
567 if (!queue->peer_pid)
568 goto err_out_free_nskb;
569
570 if (queue->queue_total >= queue->queue_maxlen) {
571 queue->queue_dropped++;
572 status = -ENOSPC;
573 if (net_ratelimit())
574			printk(KERN_WARNING "nf_queue: full at %d entries, "
575			       "dropping packet(s). Dropped: %d\n",
576 queue->queue_total, queue->queue_dropped);
577 goto err_out_free_nskb;
578 }
579
580 /* nfnetlink_unicast will either free the nskb or add it to a socket */
581 status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT);
582 if (status < 0) {
583 queue->queue_user_dropped++;
584 goto err_out_unlock;
585 }
586
587 __enqueue_entry(queue, entry);
588
589 spin_unlock_bh(&queue->lock);
590 instance_put(queue);
591 return status;
592
593err_out_free_nskb:
594 kfree_skb(nskb);
595
596err_out_unlock:
597 spin_unlock_bh(&queue->lock);
598
599err_out_free:
600 kfree(entry);
601err_out_put:
602 instance_put(queue);
603 return status;
604}
605
606static int
607nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e)
608{
609 int diff;
610
611 diff = data_len - e->skb->len;
612 if (diff < 0)
613 skb_trim(e->skb, data_len);
614 else if (diff > 0) {
615 if (data_len > 0xFFFF)
616 return -EINVAL;
617 if (diff > skb_tailroom(e->skb)) {
618 struct sk_buff *newskb;
619
620 newskb = skb_copy_expand(e->skb,
621 skb_headroom(e->skb),
622 diff,
623 GFP_ATOMIC);
624 if (newskb == NULL) {
625				printk(KERN_WARNING "nf_queue: OOM "
626 "in mangle, dropping packet\n");
627 return -ENOMEM;
628 }
629 if (e->skb->sk)
630 skb_set_owner_w(newskb, e->skb->sk);
631 kfree_skb(e->skb);
632 e->skb = newskb;
633 }
634 skb_put(e->skb, diff);
635 }
636 if (!skb_make_writable(&e->skb, data_len))
637 return -ENOMEM;
638 memcpy(e->skb->data, data, data_len);
639
640 return 0;
641}
642
643static inline int
644id_cmp(struct nfqnl_queue_entry *e, unsigned long id)
645{
646 return (id == e->id);
647}
648
649static int
650nfqnl_set_mode(struct nfqnl_instance *queue,
651 unsigned char mode, unsigned int range)
652{
653 int status;
654
655 spin_lock_bh(&queue->lock);
656 status = __nfqnl_set_mode(queue, mode, range);
657 spin_unlock_bh(&queue->lock);
658
659 return status;
660}
661
662static int
663dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex)
664{
665 if (entry->info->indev)
666 if (entry->info->indev->ifindex == ifindex)
667 return 1;
668
669 if (entry->info->outdev)
670 if (entry->info->outdev->ifindex == ifindex)
671 return 1;
672
673 return 0;
674}
675
676/* drop all packets with either indev or outdev == ifindex from all queue
677 * instances */
678static void
679nfqnl_dev_drop(int ifindex)
680{
681 int i;
682
683 QDEBUG("entering for ifindex %u\n", ifindex);
684
685	/* it only looks as if we had to hold the read lock for way too
686	 * long: issue_verdict(), nf_reinject(), ... - but we only ever
687	 * issue NF_DROP, which is processed directly in nf_reinject() */
688 read_lock_bh(&instances_lock);
689
690 for (i = 0; i < INSTANCE_BUCKETS; i++) {
691 struct hlist_node *tmp;
692 struct nfqnl_instance *inst;
693 struct hlist_head *head = &instance_table[i];
694
695 hlist_for_each_entry(inst, tmp, head, hlist) {
696 struct nfqnl_queue_entry *entry;
697 while ((entry = find_dequeue_entry(inst, dev_cmp,
698 ifindex)) != NULL)
699 issue_verdict(entry, NF_DROP);
700 }
701 }
702
703 read_unlock_bh(&instances_lock);
704}
705
706#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
707
708static int
709nfqnl_rcv_dev_event(struct notifier_block *this,
710 unsigned long event, void *ptr)
711{
712 struct net_device *dev = ptr;
713
714 /* Drop any packets associated with the downed device */
715 if (event == NETDEV_DOWN)
716 nfqnl_dev_drop(dev->ifindex);
717 return NOTIFY_DONE;
718}
719
720static struct notifier_block nfqnl_dev_notifier = {
721 .notifier_call = nfqnl_rcv_dev_event,
722};
723
724static int
725nfqnl_rcv_nl_event(struct notifier_block *this,
726 unsigned long event, void *ptr)
727{
728 struct netlink_notify *n = ptr;
729
730 if (event == NETLINK_URELEASE &&
731 n->protocol == NETLINK_NETFILTER && n->pid) {
732 int i;
733
734 /* destroy all instances for this pid */
735 write_lock_bh(&instances_lock);
736 for (i = 0; i < INSTANCE_BUCKETS; i++) {
737 struct hlist_node *tmp, *t2;
738 struct nfqnl_instance *inst;
739 struct hlist_head *head = &instance_table[i];
740
741 hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
742 if (n->pid == inst->peer_pid)
743 __instance_destroy(inst);
744 }
745 }
746 write_unlock_bh(&instances_lock);
747 }
748 return NOTIFY_DONE;
749}
750
751static struct notifier_block nfqnl_rtnl_notifier = {
752 .notifier_call = nfqnl_rcv_nl_event,
753};
754
755static const int nfqa_verdict_min[NFQA_MAX] = {
756 [NFQA_VERDICT_HDR-1] = sizeof(struct nfqnl_msg_verdict_hdr),
757 [NFQA_MARK-1] = sizeof(u_int32_t),
758 [NFQA_PAYLOAD-1] = 0,
759};
760
761static int
762nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
763 struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
764{
765 struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
766 u_int16_t queue_num = ntohs(nfmsg->res_id);
767
768 struct nfqnl_msg_verdict_hdr *vhdr;
769 struct nfqnl_instance *queue;
770 unsigned int verdict;
771 struct nfqnl_queue_entry *entry;
772 int err;
773
774 if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) {
775 QDEBUG("bad attribute size\n");
776 return -EINVAL;
777 }
778
779 queue = instance_lookup_get(queue_num);
780 if (!queue)
781 return -ENODEV;
782
783 if (queue->peer_pid != NETLINK_CB(skb).pid) {
784 err = -EPERM;
785 goto err_out_put;
786 }
787
788 if (!nfqa[NFQA_VERDICT_HDR-1]) {
789 err = -EINVAL;
790 goto err_out_put;
791 }
792
793 vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]);
794 verdict = ntohl(vhdr->verdict);
795
796 if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) {
797 err = -EINVAL;
798 goto err_out_put;
799 }
800
801 entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id));
802 if (entry == NULL) {
803 err = -ENOENT;
804 goto err_out_put;
805 }
806
807 if (nfqa[NFQA_PAYLOAD-1]) {
808 if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]),
809 NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0)
810 verdict = NF_DROP;
811 }
812
813 if (nfqa[NFQA_MARK-1])
814		entry->skb->nfmark = ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1]));
815
816 issue_verdict(entry, verdict);
817 instance_put(queue);
818 return 0;
819
820err_out_put:
821 instance_put(queue);
822 return err;
823}
824
825static int
826nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
827 struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
828{
829 return -ENOTSUPP;
830}
831
832static const int nfqa_cfg_min[NFQA_CFG_MAX] = {
833 [NFQA_CFG_CMD-1] = sizeof(struct nfqnl_msg_config_cmd),
834 [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params),
835};
836
837static struct nf_queue_handler nfqh = {
838 .name = "nf_queue",
839 .outfn = &nfqnl_enqueue_packet,
840};
841
842static int
843nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
844 struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
845{
846 struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
847 u_int16_t queue_num = ntohs(nfmsg->res_id);
848 struct nfqnl_instance *queue;
849 int ret = 0;
850
851 QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
852
853 if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) {
854 QDEBUG("bad attribute size\n");
855 return -EINVAL;
856 }
857
858 queue = instance_lookup_get(queue_num);
859 if (nfqa[NFQA_CFG_CMD-1]) {
860 struct nfqnl_msg_config_cmd *cmd;
861 cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]);
862 QDEBUG("found CFG_CMD\n");
863
864 switch (cmd->command) {
865 case NFQNL_CFG_CMD_BIND:
866			if (queue) {
867				ret = -EBUSY; goto out_put; }
868
869 queue = instance_create(queue_num, NETLINK_CB(skb).pid);
870 if (!queue)
871 return -EINVAL;
872 break;
873 case NFQNL_CFG_CMD_UNBIND:
874 if (!queue)
875 return -ENODEV;
876
877 if (queue->peer_pid != NETLINK_CB(skb).pid) {
878 ret = -EPERM;
879 goto out_put;
880 }
881
882 instance_destroy(queue);
883 break;
884 case NFQNL_CFG_CMD_PF_BIND:
885 QDEBUG("registering queue handler for pf=%u\n",
886 ntohs(cmd->pf));
887 ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh);
888 break;
889 case NFQNL_CFG_CMD_PF_UNBIND:
890 QDEBUG("unregistering queue handler for pf=%u\n",
891 ntohs(cmd->pf));
892 /* This is a bug and a feature. We can unregister
893 * other handlers(!) */
894 ret = nf_unregister_queue_handler(ntohs(cmd->pf));
895 break;
896 default:
897 ret = -EINVAL;
898 break;
899 }
900 } else {
901 if (!queue) {
902 QDEBUG("no config command, and no instance ENOENT\n");
903 ret = -ENOENT;
904 goto out_put;
905 }
906
907 if (queue->peer_pid != NETLINK_CB(skb).pid) {
908 QDEBUG("no config command, and wrong pid\n");
909 ret = -EPERM;
910 goto out_put;
911 }
912 }
913
914 if (nfqa[NFQA_CFG_PARAMS-1]) {
915 struct nfqnl_msg_config_params *params;
916 params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]);
917
918 nfqnl_set_mode(queue, params->copy_mode,
919 ntohl(params->copy_range));
920 }
921
922out_put:
923 instance_put(queue);
924 return ret;
925}
926
927static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
928 [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp,
929 .attr_count = NFQA_MAX,
930 .cap_required = CAP_NET_ADMIN },
931 [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict,
932 .attr_count = NFQA_MAX,
933 .cap_required = CAP_NET_ADMIN },
934 [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
935 .attr_count = NFQA_CFG_MAX,
936 .cap_required = CAP_NET_ADMIN },
937};
938
939static struct nfnetlink_subsystem nfqnl_subsys = {
940 .name = "nf_queue",
941 .subsys_id = NFNL_SUBSYS_QUEUE,
942 .cb_count = NFQNL_MSG_MAX,
943 .cb = nfqnl_cb,
944};
945
946#ifdef CONFIG_PROC_FS
947struct iter_state {
948 unsigned int bucket;
949};
950
951static struct hlist_node *get_first(struct seq_file *seq)
952{
953 struct iter_state *st = seq->private;
954
955 if (!st)
956 return NULL;
957
958 for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
959 if (!hlist_empty(&instance_table[st->bucket]))
960 return instance_table[st->bucket].first;
961 }
962 return NULL;
963}
964
965static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
966{
967 struct iter_state *st = seq->private;
968
969 h = h->next;
970 while (!h) {
971 if (++st->bucket >= INSTANCE_BUCKETS)
972 return NULL;
973
974 h = instance_table[st->bucket].first;
975 }
976 return h;
977}
978
979static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
980{
981 struct hlist_node *head;
982 head = get_first(seq);
983
984 if (head)
985 while (pos && (head = get_next(seq, head)))
986 pos--;
987 return pos ? NULL : head;
988}
989
990static void *seq_start(struct seq_file *seq, loff_t *pos)
991{
992 read_lock_bh(&instances_lock);
993 return get_idx(seq, *pos);
994}
995
996static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
997{
998 (*pos)++;
999 return get_next(s, v);
1000}
1001
1002static void seq_stop(struct seq_file *s, void *v)
1003{
1004 read_unlock_bh(&instances_lock);
1005}
1006
1007static int seq_show(struct seq_file *s, void *v)
1008{
1009 const struct nfqnl_instance *inst = v;
1010
1011 return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
1012 inst->queue_num,
1013 inst->peer_pid, inst->queue_total,
1014 inst->copy_mode, inst->copy_range,
1015 inst->queue_dropped, inst->queue_user_dropped,
1016 atomic_read(&inst->id_sequence),
1017 atomic_read(&inst->use));
1018}
1019
1020static struct seq_operations nfqnl_seq_ops = {
1021 .start = seq_start,
1022 .next = seq_next,
1023 .stop = seq_stop,
1024 .show = seq_show,
1025};
1026
1027static int nfqnl_open(struct inode *inode, struct file *file)
1028{
1029 struct seq_file *seq;
1030 struct iter_state *is;
1031 int ret;
1032
1033 is = kmalloc(sizeof(*is), GFP_KERNEL);
1034 if (!is)
1035 return -ENOMEM;
1036 memset(is, 0, sizeof(*is));
1037 ret = seq_open(file, &nfqnl_seq_ops);
1038 if (ret < 0)
1039 goto out_free;
1040 seq = file->private_data;
1041 seq->private = is;
1042 return ret;
1043out_free:
1044 kfree(is);
1045 return ret;
1046}
1047
1048static struct file_operations nfqnl_file_ops = {
1049 .owner = THIS_MODULE,
1050 .open = nfqnl_open,
1051 .read = seq_read,
1052 .llseek = seq_lseek,
1053 .release = seq_release_private,
1054};
1055
1056#endif /* PROC_FS */
1057
1058static int
1059init_or_cleanup(int init)
1060{
1061 int i, status = -ENOMEM;
1062#ifdef CONFIG_PROC_FS
1063 struct proc_dir_entry *proc_nfqueue;
1064#endif
1065
1066 if (!init)
1067 goto cleanup;
1068
1069 for (i = 0; i < INSTANCE_BUCKETS; i++)
1070 INIT_HLIST_HEAD(&instance_table[i]);
1071
1072 netlink_register_notifier(&nfqnl_rtnl_notifier);
1073 status = nfnetlink_subsys_register(&nfqnl_subsys);
1074 if (status < 0) {
1075 printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
1076 goto cleanup_netlink_notifier;
1077 }
1078
1079#ifdef CONFIG_PROC_FS
1080 proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440,
1081 proc_net_netfilter);
1082 if (!proc_nfqueue)
1083 goto cleanup_subsys;
1084 proc_nfqueue->proc_fops = &nfqnl_file_ops;
1085#endif
1086
1087 register_netdevice_notifier(&nfqnl_dev_notifier);
1088
1089 return status;
1090
1091cleanup:
1092 nf_unregister_queue_handlers(&nfqh);
1093 unregister_netdevice_notifier(&nfqnl_dev_notifier);
1094#ifdef CONFIG_PROC_FS
1095 remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
1096cleanup_subsys:
1097#endif
1098 nfnetlink_subsys_unregister(&nfqnl_subsys);
1099cleanup_netlink_notifier:
1100 netlink_unregister_notifier(&nfqnl_rtnl_notifier);
1101 return status;
1102}
1103
1104static int __init init(void)
1105{
1106
1107 return init_or_cleanup(1);
1108}
1109
1110static void __exit fini(void)
1111{
1112 init_or_cleanup(0);
1113}
1114
1115MODULE_DESCRIPTION("netfilter packet queue handler");
1116MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
1117MODULE_LICENSE("GPL");
1118MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
1119
1120module_init(init);
1121module_exit(fini);
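
For context, userspace is expected to drive this interface through the separate libnetfilter_queue library (an assumption here, not part of this patch). A minimal peer that binds queue 0 and accepts every packet, exercising the NFQNL_CFG_CMD_BIND, NFQNL_MSG_PACKET and NFQNL_MSG_VERDICT paths above, might look like this; build with gcc queue.c -lnetfilter_queue:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <arpa/inet.h>
	#include <sys/socket.h>
	#include <linux/netfilter.h>		/* NF_ACCEPT */
	#include <libnetfilter_queue/libnetfilter_queue.h>

	static int cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
		      struct nfq_data *nfa, void *data)
	{
		struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);
		uint32_t id = ph ? ntohl(ph->packet_id) : 0;

		/* sends NFQNL_MSG_VERDICT, handled by nfqnl_recv_verdict() */
		return nfq_set_verdict(qh, id, NF_ACCEPT, 0, NULL);
	}

	int main(void)
	{
		struct nfq_handle *h = nfq_open();
		struct nfq_q_handle *qh;
		char buf[4096];
		int rv;

		if (!h)
			exit(1);
		nfq_bind_pf(h, AF_INET);		/* NFQNL_CFG_CMD_PF_BIND */
		qh = nfq_create_queue(h, 0, &cb, NULL);	/* NFQNL_CFG_CMD_BIND */
		nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);

		while ((rv = recv(nfq_fd(h), buf, sizeof(buf), 0)) > 0)
			nfq_handle_packet(h, buf, rv);	/* dispatches to cb */

		nfq_destroy_queue(qh);
		nfq_close(h);
		return 0;
	}
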
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 3405fdf41b93..62435ffc6184 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -13,7 +13,12 @@
13 * added netlink_proto_exit 13 * added netlink_proto_exit
14 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> 14 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
15 * use nlk_sk, as sk->protinfo is on a diet 8) 15 * use nlk_sk, as sk->protinfo is on a diet 8)
16 * 16 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
17 * - inc module use count of module that owns
18 * the kernel socket in case userspace opens
19 * socket of same protocol
20 * - remove all module support, since netlink is
21 * mandatory if CONFIG_NET=y these days
17 */ 22 */
18 23
19#include <linux/config.h> 24#include <linux/config.h>
@@ -55,21 +60,29 @@
55#include <net/scm.h> 60#include <net/scm.h>
56 61
57#define Nprintk(a...) 62#define Nprintk(a...)
63#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
58 64
59struct netlink_sock { 65struct netlink_sock {
60 /* struct sock has to be the first member of netlink_sock */ 66 /* struct sock has to be the first member of netlink_sock */
61 struct sock sk; 67 struct sock sk;
62 u32 pid; 68 u32 pid;
63 unsigned int groups;
64 u32 dst_pid; 69 u32 dst_pid;
65 unsigned int dst_groups; 70 u32 dst_group;
71 u32 flags;
72 u32 subscriptions;
73 u32 ngroups;
74 unsigned long *groups;
66 unsigned long state; 75 unsigned long state;
67 wait_queue_head_t wait; 76 wait_queue_head_t wait;
68 struct netlink_callback *cb; 77 struct netlink_callback *cb;
69 spinlock_t cb_lock; 78 spinlock_t cb_lock;
70 void (*data_ready)(struct sock *sk, int bytes); 79 void (*data_ready)(struct sock *sk, int bytes);
80 struct module *module;
71}; 81};
72 82
83#define NETLINK_KERNEL_SOCKET 0x1
84#define NETLINK_RECV_PKTINFO 0x2
85
73static inline struct netlink_sock *nlk_sk(struct sock *sk) 86static inline struct netlink_sock *nlk_sk(struct sock *sk)
74{ 87{
75 return (struct netlink_sock *)sk; 88 return (struct netlink_sock *)sk;
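
The change above replaces the old 32-bit groups mask with a dynamically sized bitmap, so a protocol can offer more than 32 multicast groups. NLGRPSZ() rounds the group count up to a whole number of unsigned longs and returns the size in bytes; a self-contained restatement of the arithmetic:

	#include <stdio.h>

	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
	#define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)

	int main(void)
	{
		/* on a 64-bit machine: 32 groups -> 8 bytes, 65 -> 16 bytes */
		printf("NLGRPSZ(32) = %zu, NLGRPSZ(65) = %zu\n",
		       NLGRPSZ(32), NLGRPSZ(65));
		return 0;
	}
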
@@ -92,6 +105,9 @@ struct netlink_table {
92 struct nl_pid_hash hash; 105 struct nl_pid_hash hash;
93 struct hlist_head mc_list; 106 struct hlist_head mc_list;
94 unsigned int nl_nonroot; 107 unsigned int nl_nonroot;
108 unsigned int groups;
109 struct module *module;
110 int registered;
95}; 111};
96 112
97static struct netlink_table *nl_table; 113static struct netlink_table *nl_table;
@@ -106,6 +122,11 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
106 122
107static struct notifier_block *netlink_chain; 123static struct notifier_block *netlink_chain;
108 124
125static u32 netlink_group_mask(u32 group)
126{
127 return group ? 1 << (group - 1) : 0;
128}
129
109static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) 130static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid)
110{ 131{
111 return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; 132 return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask];
@@ -122,6 +143,7 @@ static void netlink_sock_destruct(struct sock *sk)
122 BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); 143 BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
123 BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); 144 BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
124 BUG_TRAP(!nlk_sk(sk)->cb); 145 BUG_TRAP(!nlk_sk(sk)->cb);
146 BUG_TRAP(!nlk_sk(sk)->groups);
125} 147}
126 148
127/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP. 149/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP.
@@ -317,7 +339,7 @@ static void netlink_remove(struct sock *sk)
317 netlink_table_grab(); 339 netlink_table_grab();
318 if (sk_del_node_init(sk)) 340 if (sk_del_node_init(sk))
319 nl_table[sk->sk_protocol].hash.entries--; 341 nl_table[sk->sk_protocol].hash.entries--;
320 if (nlk_sk(sk)->groups) 342 if (nlk_sk(sk)->subscriptions)
321 __sk_del_bind_node(sk); 343 __sk_del_bind_node(sk);
322 netlink_table_ungrab(); 344 netlink_table_ungrab();
323} 345}
@@ -328,19 +350,11 @@ static struct proto netlink_proto = {
328 .obj_size = sizeof(struct netlink_sock), 350 .obj_size = sizeof(struct netlink_sock),
329}; 351};
330 352
331static int netlink_create(struct socket *sock, int protocol) 353static int __netlink_create(struct socket *sock, int protocol)
332{ 354{
333 struct sock *sk; 355 struct sock *sk;
334 struct netlink_sock *nlk; 356 struct netlink_sock *nlk;
335 357
336 sock->state = SS_UNCONNECTED;
337
338 if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
339 return -ESOCKTNOSUPPORT;
340
341 if (protocol<0 || protocol >= MAX_LINKS)
342 return -EPROTONOSUPPORT;
343
344 sock->ops = &netlink_ops; 358 sock->ops = &netlink_ops;
345 359
346 sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); 360 sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1);
@@ -350,15 +364,67 @@ static int netlink_create(struct socket *sock, int protocol)
350 sock_init_data(sock, sk); 364 sock_init_data(sock, sk);
351 365
352 nlk = nlk_sk(sk); 366 nlk = nlk_sk(sk);
353
354 spin_lock_init(&nlk->cb_lock); 367 spin_lock_init(&nlk->cb_lock);
355 init_waitqueue_head(&nlk->wait); 368 init_waitqueue_head(&nlk->wait);
356 sk->sk_destruct = netlink_sock_destruct;
357 369
370 sk->sk_destruct = netlink_sock_destruct;
358 sk->sk_protocol = protocol; 371 sk->sk_protocol = protocol;
359 return 0; 372 return 0;
360} 373}
361 374
375static int netlink_create(struct socket *sock, int protocol)
376{
377 struct module *module = NULL;
378 struct netlink_sock *nlk;
379 unsigned int groups;
380 int err = 0;
381
382 sock->state = SS_UNCONNECTED;
383
384 if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
385 return -ESOCKTNOSUPPORT;
386
387 if (protocol<0 || protocol >= MAX_LINKS)
388 return -EPROTONOSUPPORT;
389
390 netlink_lock_table();
391#ifdef CONFIG_KMOD
392 if (!nl_table[protocol].registered) {
393 netlink_unlock_table();
394 request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
395 netlink_lock_table();
396 }
397#endif
398 if (nl_table[protocol].registered &&
399 try_module_get(nl_table[protocol].module))
400 module = nl_table[protocol].module;
401 else
402 err = -EPROTONOSUPPORT;
403 groups = nl_table[protocol].groups;
404 netlink_unlock_table();
405
 406	if (err || (err = __netlink_create(sock, protocol)) < 0)
407 goto out_module;
408
409 nlk = nlk_sk(sock->sk);
410
411 nlk->groups = kmalloc(NLGRPSZ(groups), GFP_KERNEL);
412 if (nlk->groups == NULL) {
413 err = -ENOMEM;
414 goto out_module;
415 }
416 memset(nlk->groups, 0, NLGRPSZ(groups));
417 nlk->ngroups = groups;
418
419 nlk->module = module;
420out:
421 return err;
422
423out_module:
424 module_put(module);
425 goto out;
426}
427
362static int netlink_release(struct socket *sock) 428static int netlink_release(struct socket *sock)
363{ 429{
364 struct sock *sk = sock->sk; 430 struct sock *sk = sock->sk;
@@ -387,14 +453,27 @@ static int netlink_release(struct socket *sock)
387 453
388 skb_queue_purge(&sk->sk_write_queue); 454 skb_queue_purge(&sk->sk_write_queue);
389 455
390 if (nlk->pid && !nlk->groups) { 456 if (nlk->pid && !nlk->subscriptions) {
391 struct netlink_notify n = { 457 struct netlink_notify n = {
392 .protocol = sk->sk_protocol, 458 .protocol = sk->sk_protocol,
393 .pid = nlk->pid, 459 .pid = nlk->pid,
394 }; 460 };
395 notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); 461 notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n);
396 } 462 }
397 463
464 if (nlk->module)
465 module_put(nlk->module);
466
467 if (nlk->flags & NETLINK_KERNEL_SOCKET) {
468 netlink_table_grab();
469 nl_table[sk->sk_protocol].module = NULL;
470 nl_table[sk->sk_protocol].registered = 0;
471 netlink_table_ungrab();
472 }
473
474 kfree(nlk->groups);
475 nlk->groups = NULL;
476
398 sock_put(sk); 477 sock_put(sk);
399 return 0; 478 return 0;
400} 479}
@@ -443,6 +522,18 @@ static inline int netlink_capable(struct socket *sock, unsigned int flag)
443 capable(CAP_NET_ADMIN); 522 capable(CAP_NET_ADMIN);
444} 523}
445 524
525static void
526netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
527{
528 struct netlink_sock *nlk = nlk_sk(sk);
529
530 if (nlk->subscriptions && !subscriptions)
531 __sk_del_bind_node(sk);
532 else if (!nlk->subscriptions && subscriptions)
533 sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
534 nlk->subscriptions = subscriptions;
535}
536
446static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) 537static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
447{ 538{
448 struct sock *sk = sock->sk; 539 struct sock *sk = sock->sk;
@@ -468,15 +559,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
468 return err; 559 return err;
469 } 560 }
470 561
471 if (!nladdr->nl_groups && !nlk->groups) 562 if (!nladdr->nl_groups && !(u32)nlk->groups[0])
472 return 0; 563 return 0;
473 564
474 netlink_table_grab(); 565 netlink_table_grab();
475 if (nlk->groups && !nladdr->nl_groups) 566 netlink_update_subscriptions(sk, nlk->subscriptions +
476 __sk_del_bind_node(sk); 567 hweight32(nladdr->nl_groups) -
477 else if (!nlk->groups && nladdr->nl_groups) 568 hweight32(nlk->groups[0]));
478 sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); 569 nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
479 nlk->groups = nladdr->nl_groups;
480 netlink_table_ungrab(); 570 netlink_table_ungrab();
481 571
482 return 0; 572 return 0;
@@ -493,7 +583,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
493 if (addr->sa_family == AF_UNSPEC) { 583 if (addr->sa_family == AF_UNSPEC) {
494 sk->sk_state = NETLINK_UNCONNECTED; 584 sk->sk_state = NETLINK_UNCONNECTED;
495 nlk->dst_pid = 0; 585 nlk->dst_pid = 0;
496 nlk->dst_groups = 0; 586 nlk->dst_group = 0;
497 return 0; 587 return 0;
498 } 588 }
499 if (addr->sa_family != AF_NETLINK) 589 if (addr->sa_family != AF_NETLINK)
@@ -509,7 +599,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
509 if (err == 0) { 599 if (err == 0) {
510 sk->sk_state = NETLINK_CONNECTED; 600 sk->sk_state = NETLINK_CONNECTED;
511 nlk->dst_pid = nladdr->nl_pid; 601 nlk->dst_pid = nladdr->nl_pid;
512 nlk->dst_groups = nladdr->nl_groups; 602 nlk->dst_group = ffs(nladdr->nl_groups);
513 } 603 }
514 604
515 return err; 605 return err;
@@ -527,10 +617,10 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr
527 617
528 if (peer) { 618 if (peer) {
529 nladdr->nl_pid = nlk->dst_pid; 619 nladdr->nl_pid = nlk->dst_pid;
530 nladdr->nl_groups = nlk->dst_groups; 620 nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
531 } else { 621 } else {
532 nladdr->nl_pid = nlk->pid; 622 nladdr->nl_pid = nlk->pid;
533 nladdr->nl_groups = nlk->groups; 623 nladdr->nl_groups = nlk->groups[0];
534 } 624 }
535 return 0; 625 return 0;
536} 626}
@@ -648,7 +738,8 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
648 sock_put(sk); 738 sock_put(sk);
649} 739}
650 740
651static inline struct sk_buff *netlink_trim(struct sk_buff *skb, int allocation) 741static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
742 unsigned int __nocast allocation)
652{ 743{
653 int delta; 744 int delta;
654 745
@@ -717,7 +808,7 @@ struct netlink_broadcast_data {
717 int failure; 808 int failure;
718 int congested; 809 int congested;
719 int delivered; 810 int delivered;
720 int allocation; 811 unsigned int allocation;
721 struct sk_buff *skb, *skb2; 812 struct sk_buff *skb, *skb2;
722}; 813};
723 814
@@ -730,7 +821,8 @@ static inline int do_one_broadcast(struct sock *sk,
730 if (p->exclude_sk == sk) 821 if (p->exclude_sk == sk)
731 goto out; 822 goto out;
732 823
733 if (nlk->pid == p->pid || !(nlk->groups & p->group)) 824 if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
825 !test_bit(p->group - 1, nlk->groups))
734 goto out; 826 goto out;
735 827
736 if (p->failure) { 828 if (p->failure) {
@@ -769,7 +861,7 @@ out:
769} 861}
770 862
771int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, 863int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
772 u32 group, int allocation) 864 u32 group, unsigned int __nocast allocation)
773{ 865{
774 struct netlink_broadcast_data info; 866 struct netlink_broadcast_data info;
775 struct hlist_node *node; 867 struct hlist_node *node;
@@ -826,7 +918,8 @@ static inline int do_one_set_err(struct sock *sk,
826 if (sk == p->exclude_sk) 918 if (sk == p->exclude_sk)
827 goto out; 919 goto out;
828 920
829 if (nlk->pid == p->pid || !(nlk->groups & p->group)) 921 if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
922 !test_bit(p->group - 1, nlk->groups))
830 goto out; 923 goto out;
831 924
832 sk->sk_err = p->code; 925 sk->sk_err = p->code;
@@ -854,6 +947,94 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
854 read_unlock(&nl_table_lock); 947 read_unlock(&nl_table_lock);
855} 948}
856 949
950static int netlink_setsockopt(struct socket *sock, int level, int optname,
951 char __user *optval, int optlen)
952{
953 struct sock *sk = sock->sk;
954 struct netlink_sock *nlk = nlk_sk(sk);
955 int val = 0, err;
956
957 if (level != SOL_NETLINK)
958 return -ENOPROTOOPT;
959
960 if (optlen >= sizeof(int) &&
961 get_user(val, (int __user *)optval))
962 return -EFAULT;
963
964 switch (optname) {
965 case NETLINK_PKTINFO:
966 if (val)
967 nlk->flags |= NETLINK_RECV_PKTINFO;
968 else
969 nlk->flags &= ~NETLINK_RECV_PKTINFO;
970 err = 0;
971 break;
972 case NETLINK_ADD_MEMBERSHIP:
973 case NETLINK_DROP_MEMBERSHIP: {
974 unsigned int subscriptions;
975 int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0;
976
977 if (!netlink_capable(sock, NL_NONROOT_RECV))
978 return -EPERM;
979 if (!val || val - 1 >= nlk->ngroups)
980 return -EINVAL;
981 netlink_table_grab();
982 old = test_bit(val - 1, nlk->groups);
983 subscriptions = nlk->subscriptions - old + new;
984 if (new)
985 __set_bit(val - 1, nlk->groups);
986 else
987 __clear_bit(val - 1, nlk->groups);
988 netlink_update_subscriptions(sk, subscriptions);
989 netlink_table_ungrab();
990 err = 0;
991 break;
992 }
993 default:
994 err = -ENOPROTOOPT;
995 }
996 return err;
997}
998
999static int netlink_getsockopt(struct socket *sock, int level, int optname,
1000 char __user *optval, int __user *optlen)
1001{
1002 struct sock *sk = sock->sk;
1003 struct netlink_sock *nlk = nlk_sk(sk);
1004 int len, val, err;
1005
1006 if (level != SOL_NETLINK)
1007 return -ENOPROTOOPT;
1008
1009 if (get_user(len, optlen))
1010 return -EFAULT;
1011 if (len < 0)
1012 return -EINVAL;
1013
1014 switch (optname) {
1015 case NETLINK_PKTINFO:
1016 if (len < sizeof(int))
1017 return -EINVAL;
1018 len = sizeof(int);
1019 val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
1020 put_user(len, optlen);
1021 put_user(val, optval);
1022 err = 0;
1023 break;
1024 default:
1025 err = -ENOPROTOOPT;
1026 }
1027 return err;
1028}
1029
1030static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
1031{
1032 struct nl_pktinfo info;
1033
1034 info.group = NETLINK_CB(skb).dst_group;
1035 put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
1036}
1037
857static inline void netlink_rcv_wake(struct sock *sk) 1038static inline void netlink_rcv_wake(struct sock *sk)
858{ 1039{
859 struct netlink_sock *nlk = nlk_sk(sk); 1040 struct netlink_sock *nlk = nlk_sk(sk);
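
With per-socket bitmaps in place, groups beyond the 32 that fit in sockaddr_nl.nl_groups are joined one at a time through the new NETLINK_ADD_MEMBERSHIP option added above. A minimal userspace sketch; the protocol and group number are arbitrary examples:

	#include <stdio.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>

	#ifndef SOL_NETLINK
	#define SOL_NETLINK 270		/* from linux/socket.h */
	#endif

	int main(void)
	{
		int group = 33;		/* anything > 32 needs this interface */
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

		if (fd < 0)
			return 1;
		if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
			       &group, sizeof(group)) < 0)
			perror("NETLINK_ADD_MEMBERSHIP");
		return 0;
	}
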
@@ -872,7 +1053,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
872 struct netlink_sock *nlk = nlk_sk(sk); 1053 struct netlink_sock *nlk = nlk_sk(sk);
873 struct sockaddr_nl *addr=msg->msg_name; 1054 struct sockaddr_nl *addr=msg->msg_name;
874 u32 dst_pid; 1055 u32 dst_pid;
875 u32 dst_groups; 1056 u32 dst_group;
876 struct sk_buff *skb; 1057 struct sk_buff *skb;
877 int err; 1058 int err;
878 struct scm_cookie scm; 1059 struct scm_cookie scm;
@@ -890,12 +1071,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
890 if (addr->nl_family != AF_NETLINK) 1071 if (addr->nl_family != AF_NETLINK)
891 return -EINVAL; 1072 return -EINVAL;
892 dst_pid = addr->nl_pid; 1073 dst_pid = addr->nl_pid;
893 dst_groups = addr->nl_groups; 1074 dst_group = ffs(addr->nl_groups);
894 if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND)) 1075 if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
895 return -EPERM; 1076 return -EPERM;
896 } else { 1077 } else {
897 dst_pid = nlk->dst_pid; 1078 dst_pid = nlk->dst_pid;
898 dst_groups = nlk->dst_groups; 1079 dst_group = nlk->dst_group;
899 } 1080 }
900 1081
901 if (!nlk->pid) { 1082 if (!nlk->pid) {
@@ -913,9 +1094,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
913 goto out; 1094 goto out;
914 1095
915 NETLINK_CB(skb).pid = nlk->pid; 1096 NETLINK_CB(skb).pid = nlk->pid;
916 NETLINK_CB(skb).groups = nlk->groups;
917 NETLINK_CB(skb).dst_pid = dst_pid; 1097 NETLINK_CB(skb).dst_pid = dst_pid;
918 NETLINK_CB(skb).dst_groups = dst_groups; 1098 NETLINK_CB(skb).dst_group = dst_group;
919 NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); 1099 NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
920 memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); 1100 memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
921 1101
@@ -937,9 +1117,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
937 goto out; 1117 goto out;
938 } 1118 }
939 1119
940 if (dst_groups) { 1120 if (dst_group) {
941 atomic_inc(&skb->users); 1121 atomic_inc(&skb->users);
942 netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL); 1122 netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
943 } 1123 }
944 err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); 1124 err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
945 1125
@@ -985,7 +1165,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
985 addr->nl_family = AF_NETLINK; 1165 addr->nl_family = AF_NETLINK;
986 addr->nl_pad = 0; 1166 addr->nl_pad = 0;
987 addr->nl_pid = NETLINK_CB(skb).pid; 1167 addr->nl_pid = NETLINK_CB(skb).pid;
988 addr->nl_groups = NETLINK_CB(skb).dst_groups; 1168 addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
989 msg->msg_namelen = sizeof(*addr); 1169 msg->msg_namelen = sizeof(*addr);
990 } 1170 }
991 1171
@@ -1000,6 +1180,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
1000 netlink_dump(sk); 1180 netlink_dump(sk);
1001 1181
1002 scm_recv(sock, msg, siocb->scm, flags); 1182 scm_recv(sock, msg, siocb->scm, flags);
1183 if (nlk->flags & NETLINK_RECV_PKTINFO)
1184 netlink_cmsg_recv_pktinfo(msg, skb);
1003 1185
1004out: 1186out:
1005 netlink_rcv_wake(sk); 1187 netlink_rcv_wake(sk);
@@ -1022,10 +1204,13 @@ static void netlink_data_ready(struct sock *sk, int len)
1022 */ 1204 */
1023 1205
1024struct sock * 1206struct sock *
1025netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) 1207netlink_kernel_create(int unit, unsigned int groups,
1208 void (*input)(struct sock *sk, int len),
1209 struct module *module)
1026{ 1210{
1027 struct socket *sock; 1211 struct socket *sock;
1028 struct sock *sk; 1212 struct sock *sk;
1213 struct netlink_sock *nlk;
1029 1214
1030 if (!nl_table) 1215 if (!nl_table)
1031 return NULL; 1216 return NULL;
@@ -1036,20 +1221,31 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len))
1036 if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) 1221 if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
1037 return NULL; 1222 return NULL;
1038 1223
1039 if (netlink_create(sock, unit) < 0) { 1224 if (__netlink_create(sock, unit) < 0)
1040 sock_release(sock); 1225 goto out_sock_release;
1041 return NULL; 1226
1042 }
1043 sk = sock->sk; 1227 sk = sock->sk;
1044 sk->sk_data_ready = netlink_data_ready; 1228 sk->sk_data_ready = netlink_data_ready;
1045 if (input) 1229 if (input)
1046 nlk_sk(sk)->data_ready = input; 1230 nlk_sk(sk)->data_ready = input;
1047 1231
1048 if (netlink_insert(sk, 0)) { 1232 if (netlink_insert(sk, 0))
1049 sock_release(sock); 1233 goto out_sock_release;
1050 return NULL; 1234
1051 } 1235 nlk = nlk_sk(sk);
1236 nlk->flags |= NETLINK_KERNEL_SOCKET;
1237
1238 netlink_table_grab();
1239 nl_table[unit].groups = groups < 32 ? 32 : groups;
1240 nl_table[unit].module = module;
1241 nl_table[unit].registered = 1;
1242 netlink_table_ungrab();
1243
1052 return sk; 1244 return sk;
1245
1246out_sock_release:
1247 sock_release(sock);
1248 return NULL;
1053} 1249}
1054 1250
1055void netlink_set_nonroot(int protocol, unsigned int flags) 1251void netlink_set_nonroot(int protocol, unsigned int flags)
@@ -1287,7 +1483,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
1287 s, 1483 s,
1288 s->sk_protocol, 1484 s->sk_protocol,
1289 nlk->pid, 1485 nlk->pid,
1290 nlk->groups, 1486 nlk->flags & NETLINK_KERNEL_SOCKET ?
1487 0 : (unsigned int)nlk->groups[0],
1291 atomic_read(&s->sk_rmem_alloc), 1488 atomic_read(&s->sk_rmem_alloc),
1292 atomic_read(&s->sk_wmem_alloc), 1489 atomic_read(&s->sk_wmem_alloc),
1293 nlk->cb, 1490 nlk->cb,
@@ -1361,8 +1558,8 @@ static struct proto_ops netlink_ops = {
1361 .ioctl = sock_no_ioctl, 1558 .ioctl = sock_no_ioctl,
1362 .listen = sock_no_listen, 1559 .listen = sock_no_listen,
1363 .shutdown = sock_no_shutdown, 1560 .shutdown = sock_no_shutdown,
1364 .setsockopt = sock_no_setsockopt, 1561 .setsockopt = netlink_setsockopt,
1365 .getsockopt = sock_no_getsockopt, 1562 .getsockopt = netlink_getsockopt,
1366 .sendmsg = netlink_sendmsg, 1563 .sendmsg = netlink_sendmsg,
1367 .recvmsg = netlink_recvmsg, 1564 .recvmsg = netlink_recvmsg,
1368 .mmap = sock_no_mmap, 1565 .mmap = sock_no_mmap,
@@ -1437,21 +1634,7 @@ out:
1437 return err; 1634 return err;
1438} 1635}
1439 1636
1440static void __exit netlink_proto_exit(void)
1441{
1442 sock_unregister(PF_NETLINK);
1443 proc_net_remove("netlink");
1444 kfree(nl_table);
1445 nl_table = NULL;
1446 proto_unregister(&netlink_proto);
1447}
1448
1449core_initcall(netlink_proto_init); 1637core_initcall(netlink_proto_init);
1450module_exit(netlink_proto_exit);
1451
1452MODULE_LICENSE("GPL");
1453
1454MODULE_ALIAS_NETPROTO(PF_NETLINK);
1455 1638
1456EXPORT_SYMBOL(netlink_ack); 1639EXPORT_SYMBOL(netlink_ack);
1457EXPORT_SYMBOL(netlink_broadcast); 1640EXPORT_SYMBOL(netlink_broadcast);
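
Taken together, the hunks above change the in-kernel netlink API in two ways: netlink_kernel_create() now takes the number of multicast groups and the owning module, and NETLINK_CB(skb) carries a single destination group number (dst_group) rather than a 32-bit bitmask; ffs() converts the bitmask supplied by userspace on sendmsg, and netlink_group_mask() converts back for recvmsg. A minimal sketch of a kernel-side caller updated for the new signature follows; the protocol constant, group count, and input handler are hypothetical, not part of this patch:

    #include <linux/module.h>
    #include <linux/netlink.h>
    #include <net/sock.h>

    static struct sock *my_nl_sk;

    static void my_input(struct sock *sk, int len)
    {
            /* drain sk->sk_receive_queue here */
    }

    static int __init my_init(void)
    {
            /* group counts below 32 are rounded up to 32 internally */
            my_nl_sk = netlink_kernel_create(NETLINK_USERSOCK, 4,
                                             my_input, THIS_MODULE);
            return my_nl_sk ? 0 : -ENOMEM;
    }
    module_init(my_init);

Note that netlink_broadcast() now receives the group number itself, not a mask, as the sendmsg hunk above shows.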
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 31ed4a9a1d06..4b53de982114 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -39,7 +39,7 @@
39#include <linux/proc_fs.h> 39#include <linux/proc_fs.h>
40#include <linux/seq_file.h> 40#include <linux/seq_file.h>
41#include <net/ip.h> 41#include <net/ip.h>
42#include <net/tcp.h> 42#include <net/tcp_states.h>
43#include <net/arp.h> 43#include <net/arp.h>
44#include <linux/init.h> 44#include <linux/init.h>
45 45
@@ -459,12 +459,7 @@ static struct sock *nr_make_new(struct sock *osk)
459 sk->sk_sndbuf = osk->sk_sndbuf; 459 sk->sk_sndbuf = osk->sk_sndbuf;
460 sk->sk_state = TCP_ESTABLISHED; 460 sk->sk_state = TCP_ESTABLISHED;
461 sk->sk_sleep = osk->sk_sleep; 461 sk->sk_sleep = osk->sk_sleep;
462 462 sock_copy_flags(sk, osk);
463 if (sock_flag(osk, SOCK_ZAPPED))
464 sock_set_flag(sk, SOCK_ZAPPED);
465
466 if (sock_flag(osk, SOCK_DBG))
467 sock_set_flag(sk, SOCK_DBG);
468 463
469 skb_queue_head_init(&nr->ack_queue); 464 skb_queue_head_init(&nr->ack_queue);
470 skb_queue_head_init(&nr->reseq_queue); 465 skb_queue_head_init(&nr->reseq_queue);
@@ -541,7 +536,8 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
541 struct nr_sock *nr = nr_sk(sk); 536 struct nr_sock *nr = nr_sk(sk);
542 struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; 537 struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
543 struct net_device *dev; 538 struct net_device *dev;
544 ax25_address *user, *source; 539 ax25_uid_assoc *user;
540 ax25_address *source;
545 541
546 lock_sock(sk); 542 lock_sock(sk);
547 if (!sock_flag(sk, SOCK_ZAPPED)) { 543 if (!sock_flag(sk, SOCK_ZAPPED)) {
@@ -580,16 +576,19 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
580 } else { 576 } else {
581 source = &addr->fsa_ax25.sax25_call; 577 source = &addr->fsa_ax25.sax25_call;
582 578
583 if ((user = ax25_findbyuid(current->euid)) == NULL) { 579 user = ax25_findbyuid(current->euid);
580 if (user) {
581 nr->user_addr = user->call;
582 ax25_uid_put(user);
583 } else {
584 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { 584 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
585 release_sock(sk); 585 release_sock(sk);
586 dev_put(dev); 586 dev_put(dev);
587 return -EPERM; 587 return -EPERM;
588 } 588 }
589 user = source; 589 nr->user_addr = *source;
590 } 590 }
591 591
592 nr->user_addr = *user;
593 nr->source_addr = *source; 592 nr->source_addr = *source;
594 } 593 }
595 594
@@ -609,7 +608,8 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
609 struct sock *sk = sock->sk; 608 struct sock *sk = sock->sk;
610 struct nr_sock *nr = nr_sk(sk); 609 struct nr_sock *nr = nr_sk(sk);
611 struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; 610 struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr;
612 ax25_address *user, *source = NULL; 611 ax25_address *source = NULL;
612 ax25_uid_assoc *user;
613 struct net_device *dev; 613 struct net_device *dev;
614 614
615 lock_sock(sk); 615 lock_sock(sk);
@@ -650,16 +650,19 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
650 } 650 }
651 source = (ax25_address *)dev->dev_addr; 651 source = (ax25_address *)dev->dev_addr;
652 652
653 if ((user = ax25_findbyuid(current->euid)) == NULL) { 653 user = ax25_findbyuid(current->euid);
654 if (user) {
655 nr->user_addr = user->call;
656 ax25_uid_put(user);
657 } else {
654 if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { 658 if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) {
655 dev_put(dev); 659 dev_put(dev);
656 release_sock(sk); 660 release_sock(sk);
657 return -EPERM; 661 return -EPERM;
658 } 662 }
659 user = source; 663 nr->user_addr = *source;
660 } 664 }
661 665
662 nr->user_addr = *user;
663 nr->source_addr = *source; 666 nr->source_addr = *source;
664 nr->device = dev; 667 nr->device = dev;
665 668
@@ -855,17 +858,16 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
855 frametype = skb->data[19] & 0x0F; 858 frametype = skb->data[19] & 0x0F;
856 flags = skb->data[19] & 0xF0; 859 flags = skb->data[19] & 0xF0;
857 860
858#ifdef CONFIG_INET
859 /* 861 /*
860 * Check for an incoming IP over NET/ROM frame. 862 * Check for an incoming IP over NET/ROM frame.
861 */ 863 */
862 if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { 864 if (frametype == NR_PROTOEXT &&
865 circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
863 skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); 866 skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
864 skb->h.raw = skb->data; 867 skb->h.raw = skb->data;
865 868
866 return nr_rx_ip(skb, dev); 869 return nr_rx_ip(skb, dev);
867 } 870 }
868#endif
869 871
870 /* 872 /*
871 * Find an existing socket connection, based on circuit ID, if it's 873 * Find an existing socket connection, based on circuit ID, if it's
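
ax25_findbyuid() now returns a refcounted ax25_uid_assoc instead of a bare ax25_address pointer, so every caller copies the call field out and drops the reference with ax25_uid_put(); the fallback to the device callsign is unchanged. The pattern the bind and connect hunks above apply, shown in isolation:

    ax25_uid_assoc *user;
    ax25_address call;

    user = ax25_findbyuid(current->euid);
    if (user) {
            call = user->call;      /* copy while the reference is held */
            ax25_uid_put(user);     /* balance the lookup's reference */
    } else {
            /* no uid mapping: fall back to the device/source address,
             * subject to ax25_uid_policy and capable() as above */
    }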
diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c
index 220bf7494f71..263da4c26494 100644
--- a/net/netrom/nr_dev.c
+++ b/net/netrom/nr_dev.c
@@ -38,8 +38,6 @@
38#include <net/ax25.h> 38#include <net/ax25.h>
39#include <net/netrom.h> 39#include <net/netrom.h>
40 40
41#ifdef CONFIG_INET
42
43/* 41/*
44 * Only allow IP over NET/ROM frames through if the netrom device is up. 42 * Only allow IP over NET/ROM frames through if the netrom device is up.
45 */ 43 */
@@ -64,11 +62,12 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev)
64 skb->nh.raw = skb->data; 62 skb->nh.raw = skb->data;
65 skb->pkt_type = PACKET_HOST; 63 skb->pkt_type = PACKET_HOST;
66 64
67 ip_rcv(skb, skb->dev, NULL); 65 netif_rx(skb);
68 66
69 return 1; 67 return 1;
70} 68}
71 69
70#ifdef CONFIG_INET
72 71
73static int nr_rebuild_header(struct sk_buff *skb) 72static int nr_rebuild_header(struct sk_buff *skb)
74{ 73{
diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c
index 9c44b3794126..64b81a796907 100644
--- a/net/netrom/nr_in.c
+++ b/net/netrom/nr_in.c
@@ -22,8 +22,7 @@
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/skbuff.h> 23#include <linux/skbuff.h>
24#include <net/sock.h> 24#include <net/sock.h>
25#include <net/tcp.h> 25#include <net/tcp_states.h>
26#include <net/ip.h> /* For ip_rcv */
27#include <asm/uaccess.h> 26#include <asm/uaccess.h>
28#include <asm/system.h> 27#include <asm/system.h>
29#include <linux/fcntl.h> 28#include <linux/fcntl.h>
diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c
index 0627347b14b8..587bed2674bf 100644
--- a/net/netrom/nr_subr.c
+++ b/net/netrom/nr_subr.c
@@ -21,7 +21,7 @@
21#include <linux/netdevice.h> 21#include <linux/netdevice.h>
22#include <linux/skbuff.h> 22#include <linux/skbuff.h>
23#include <net/sock.h> 23#include <net/sock.h>
24#include <net/tcp.h> 24#include <net/tcp_states.h>
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <linux/fcntl.h> 27#include <linux/fcntl.h>
@@ -77,7 +77,7 @@ void nr_requeue_frames(struct sock *sk)
77 if (skb_prev == NULL) 77 if (skb_prev == NULL)
78 skb_queue_head(&sk->sk_write_queue, skb); 78 skb_queue_head(&sk->sk_write_queue, skb);
79 else 79 else
80 skb_append(skb_prev, skb); 80 skb_append(skb_prev, skb, &sk->sk_write_queue);
81 skb_prev = skb; 81 skb_prev = skb;
82 } 82 }
83} 83}
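
skb_append() grows a third argument naming the queue the buffer is linked into, so the helper can maintain the queue's length and locking itself rather than deriving the list from the skb (part of removing the list backpointer from struct sk_buff). The requeue loop's call pattern after this change:

    if (skb_prev == NULL)
            skb_queue_head(&sk->sk_write_queue, skb);
    else
            skb_append(skb_prev, skb, &sk->sk_write_queue);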
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index faabda8088be..75b72d389ba9 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -22,7 +22,7 @@
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/skbuff.h> 23#include <linux/skbuff.h>
24#include <net/sock.h> 24#include <net/sock.h>
25#include <net/tcp.h> 25#include <net/tcp_states.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27#include <asm/system.h> 27#include <asm/system.h>
28#include <linux/fcntl.h> 28#include <linux/fcntl.h>
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
new file mode 100644
index 000000000000..34ff93ff894d
--- /dev/null
+++ b/net/packet/Kconfig
@@ -0,0 +1,26 @@
1#
2# Packet configuration
3#
4
5config PACKET
6 tristate "Packet socket"
7 ---help---
8 The Packet protocol is used by applications which communicate
9 directly with network devices without an intermediate network
10 protocol implemented in the kernel, e.g. tcpdump. If you want them
11 to work, choose Y.
12
13 To compile this driver as a module, choose M here: the module will
14 be called af_packet.
15
16 If unsure, say Y.
17
18config PACKET_MMAP
19 bool "Packet socket: mmapped IO"
20 depends on PACKET
21 help
22 If you say Y here, the Packet protocol driver will use a ring
23 buffer mapped into user space, avoiding a copy per packet.
24
25 If unsure, say N.
26
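
For reference, a .config fragment enabling both of the options this new Kconfig file introduces; PACKET_MMAP only matters to capture tools built against the mmap packet ring:

    CONFIG_PACKET=y
    CONFIG_PACKET_MMAP=y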
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0269616e75a1..ba997095f08f 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -241,7 +241,7 @@ static struct proto_ops packet_ops;
241#ifdef CONFIG_SOCK_PACKET 241#ifdef CONFIG_SOCK_PACKET
242static struct proto_ops packet_ops_spkt; 242static struct proto_ops packet_ops_spkt;
243 243
244static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 244static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
245{ 245{
246 struct sock *sk; 246 struct sock *sk;
247 struct sockaddr_pkt *spkt; 247 struct sockaddr_pkt *spkt;
@@ -274,6 +274,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct
274 dst_release(skb->dst); 274 dst_release(skb->dst);
275 skb->dst = NULL; 275 skb->dst = NULL;
276 276
277 /* drop conntrack reference */
278 nf_reset(skb);
279
277 spkt = (struct sockaddr_pkt*)skb->cb; 280 spkt = (struct sockaddr_pkt*)skb->cb;
278 281
279 skb_push(skb, skb->data-skb->mac.raw); 282 skb_push(skb, skb->data-skb->mac.raw);
@@ -438,7 +441,7 @@ static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned
438 we will not harm anyone. 441 we will not harm anyone.
439 */ 442 */
440 443
441static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 444static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
442{ 445{
443 struct sock *sk; 446 struct sock *sk;
444 struct sockaddr_ll *sll; 447 struct sockaddr_ll *sll;
@@ -517,6 +520,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
517 dst_release(skb->dst); 520 dst_release(skb->dst);
518 skb->dst = NULL; 521 skb->dst = NULL;
519 522
523 /* drop conntrack reference */
524 nf_reset(skb);
525
520 spin_lock(&sk->sk_receive_queue.lock); 526 spin_lock(&sk->sk_receive_queue.lock);
521 po->stats.tp_packets++; 527 po->stats.tp_packets++;
522 __skb_queue_tail(&sk->sk_receive_queue, skb); 528 __skb_queue_tail(&sk->sk_receive_queue, skb);
@@ -540,7 +546,7 @@ drop:
540} 546}
541 547
542#ifdef CONFIG_PACKET_MMAP 548#ifdef CONFIG_PACKET_MMAP
543static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 549static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
544{ 550{
545 struct sock *sk; 551 struct sock *sk;
546 struct packet_sock *po; 552 struct packet_sock *po;
@@ -629,12 +635,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
629 h->tp_snaplen = snaplen; 635 h->tp_snaplen = snaplen;
630 h->tp_mac = macoff; 636 h->tp_mac = macoff;
631 h->tp_net = netoff; 637 h->tp_net = netoff;
632 if (skb->stamp.tv_sec == 0) { 638 if (skb->tstamp.off_sec == 0) {
633 do_gettimeofday(&skb->stamp); 639 __net_timestamp(skb);
634 sock_enable_timestamp(sk); 640 sock_enable_timestamp(sk);
635 } 641 }
636 h->tp_sec = skb->stamp.tv_sec; 642 h->tp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
637 h->tp_usec = skb->stamp.tv_usec; 643 h->tp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
638 644
639 sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); 645 sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
640 sll->sll_halen = 0; 646 sll->sll_halen = 0;
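
All three receive hooks above pick up the new packet_type handler signature: a fourth argument, orig_dev, names the device the frame originally arrived on, before anything (bonding, VLAN, and similar) rewrote skb->dev. A minimal sketch of an out-of-tree handler updated for the new prototype; the protocol value and body are illustrative only:

    #include <linux/if_ether.h>
    #include <linux/netdevice.h>
    #include <linux/skbuff.h>

    static int my_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
    {
            /* dev: current device; orig_dev: where the frame first landed */
            kfree_skb(skb);
            return 0;
    }

    static struct packet_type my_pt = {
            .type = __constant_htons(ETH_P_ALL),
            .func = my_rcv,
    };
    /* dev_add_pack(&my_pt) at init, dev_remove_pack(&my_pt) at exit */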
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 7eb6a5bf93ea..c6e59f84c3ae 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -41,7 +41,7 @@
41#include <net/rose.h> 41#include <net/rose.h>
42#include <linux/proc_fs.h> 42#include <linux/proc_fs.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <net/tcp.h> 44#include <net/tcp_states.h>
45#include <net/ip.h> 45#include <net/ip.h>
46#include <net/arp.h> 46#include <net/arp.h>
47 47
@@ -556,12 +556,7 @@ static struct sock *rose_make_new(struct sock *osk)
556 sk->sk_sndbuf = osk->sk_sndbuf; 556 sk->sk_sndbuf = osk->sk_sndbuf;
557 sk->sk_state = TCP_ESTABLISHED; 557 sk->sk_state = TCP_ESTABLISHED;
558 sk->sk_sleep = osk->sk_sleep; 558 sk->sk_sleep = osk->sk_sleep;
559 559 sock_copy_flags(sk, osk);
560 if (sock_flag(osk, SOCK_ZAPPED))
561 sock_set_flag(sk, SOCK_ZAPPED);
562
563 if (sock_flag(osk, SOCK_DBG))
564 sock_set_flag(sk, SOCK_DBG);
565 560
566 init_timer(&rose->timer); 561 init_timer(&rose->timer);
567 init_timer(&rose->idletimer); 562 init_timer(&rose->idletimer);
@@ -631,7 +626,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
631 struct rose_sock *rose = rose_sk(sk); 626 struct rose_sock *rose = rose_sk(sk);
632 struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; 627 struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
633 struct net_device *dev; 628 struct net_device *dev;
634 ax25_address *user, *source; 629 ax25_address *source;
630 ax25_uid_assoc *user;
635 int n; 631 int n;
636 632
637 if (!sock_flag(sk, SOCK_ZAPPED)) 633 if (!sock_flag(sk, SOCK_ZAPPED))
@@ -656,14 +652,17 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
656 652
657 source = &addr->srose_call; 653 source = &addr->srose_call;
658 654
659 if ((user = ax25_findbyuid(current->euid)) == NULL) { 655 user = ax25_findbyuid(current->euid);
656 if (user) {
657 rose->source_call = user->call;
658 ax25_uid_put(user);
659 } else {
660 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) 660 if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE))
661 return -EACCES; 661 return -EACCES;
662 user = source; 662 rose->source_call = *source;
663 } 663 }
664 664
665 rose->source_addr = addr->srose_addr; 665 rose->source_addr = addr->srose_addr;
666 rose->source_call = *user;
667 rose->device = dev; 666 rose->device = dev;
668 rose->source_ndigis = addr->srose_ndigis; 667 rose->source_ndigis = addr->srose_ndigis;
669 668
@@ -690,8 +689,8 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
690 struct rose_sock *rose = rose_sk(sk); 689 struct rose_sock *rose = rose_sk(sk);
691 struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; 690 struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
692 unsigned char cause, diagnostic; 691 unsigned char cause, diagnostic;
693 ax25_address *user;
694 struct net_device *dev; 692 struct net_device *dev;
693 ax25_uid_assoc *user;
695 int n; 694 int n;
696 695
697 if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { 696 if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
@@ -741,12 +740,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
741 if ((dev = rose_dev_first()) == NULL) 740 if ((dev = rose_dev_first()) == NULL)
742 return -ENETUNREACH; 741 return -ENETUNREACH;
743 742
744 if ((user = ax25_findbyuid(current->euid)) == NULL) 743 user = ax25_findbyuid(current->euid);
744 if (!user)
745 return -EINVAL; 745 return -EINVAL;
746 746
747 memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN); 747 memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN);
748 rose->source_call = *user; 748 rose->source_call = user->call;
749 rose->device = dev; 749 rose->device = dev;
750 ax25_uid_put(user);
750 751
751 rose_insert_socket(sk); /* Finish the bind */ 752 rose_insert_socket(sk); /* Finish the bind */
752 } 753 }
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index ef475a1bb1ba..8348d33f1efe 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -26,8 +26,7 @@
26#include <linux/netdevice.h> 26#include <linux/netdevice.h>
27#include <linux/skbuff.h> 27#include <linux/skbuff.h>
28#include <net/sock.h> 28#include <net/sock.h>
29#include <net/ip.h> /* For ip_rcv */ 29#include <net/tcp_states.h>
30#include <net/tcp.h>
31#include <asm/system.h> 30#include <asm/system.h>
32#include <linux/fcntl.h> 31#include <linux/fcntl.h>
33#include <linux/mm.h> 32#include <linux/mm.h>
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index ff73ebb912b8..4510cd7613ec 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -24,7 +24,7 @@
24#include <linux/if_arp.h> 24#include <linux/if_arp.h>
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <net/sock.h> 26#include <net/sock.h>
27#include <net/tcp.h> 27#include <net/tcp_states.h>
28#include <asm/system.h> 28#include <asm/system.h>
29#include <asm/uaccess.h> 29#include <asm/uaccess.h>
30#include <linux/fcntl.h> 30#include <linux/fcntl.h>
@@ -994,8 +994,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
994 * 1. The frame isn't for us, 994 * 1. The frame isn't for us,
995 * 2. It isn't "owned" by any existing route. 995 * 2. It isn't "owned" by any existing route.
996 */ 996 */
997 if (frametype != ROSE_CALL_REQUEST) /* XXX */ 997 if (frametype != ROSE_CALL_REQUEST) { /* XXX */
998 return 0; 998 res = 0;
999 goto out;
1000 }
999 1001
1000 len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; 1002 len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2;
1001 len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; 1003 len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2;
diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c
index 7db7e1cedc3a..a29a3a960fd6 100644
--- a/net/rose/rose_subr.c
+++ b/net/rose/rose_subr.c
@@ -21,7 +21,7 @@
21#include <linux/netdevice.h> 21#include <linux/netdevice.h>
22#include <linux/skbuff.h> 22#include <linux/skbuff.h>
23#include <net/sock.h> 23#include <net/sock.h>
24#include <net/tcp.h> 24#include <net/tcp_states.h>
25#include <asm/system.h> 25#include <asm/system.h>
26#include <linux/fcntl.h> 26#include <linux/fcntl.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
@@ -74,7 +74,7 @@ void rose_requeue_frames(struct sock *sk)
74 if (skb_prev == NULL) 74 if (skb_prev == NULL)
75 skb_queue_head(&sk->sk_write_queue, skb); 75 skb_queue_head(&sk->sk_write_queue, skb);
76 else 76 else
77 skb_append(skb_prev, skb); 77 skb_append(skb_prev, skb, &sk->sk_write_queue);
78 skb_prev = skb; 78 skb_prev = skb;
79 } 79 }
80} 80}
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index 84dd4403f792..50ae0371dab8 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -22,7 +22,7 @@
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/skbuff.h> 23#include <linux/skbuff.h>
24#include <net/sock.h> 24#include <net/sock.h>
25#include <net/tcp.h> 25#include <net/tcp_states.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <linux/fcntl.h> 27#include <linux/fcntl.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 9bce7794130a..122c086ee2db 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -330,7 +330,7 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans,
330 330
331 msg->trans = trans; 331 msg->trans = trans;
332 msg->state = RXRPC_MSG_RECEIVED; 332 msg->state = RXRPC_MSG_RECEIVED;
333 msg->stamp = pkt->stamp; 333 skb_get_timestamp(pkt, &msg->stamp);
334 if (msg->stamp.tv_sec == 0) { 334 if (msg->stamp.tv_sec == 0) {
335 do_gettimeofday(&msg->stamp); 335 do_gettimeofday(&msg->stamp);
336 if (pkt->sk) 336 if (pkt->sk)
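
skb->stamp, a struct timeval, becomes skb->tstamp, stored as offsets from a global base (skb_tv_base); code that needs a timeval goes through skb_get_timestamp(), and code that must stamp an skb itself uses __net_timestamp(), as the af_packet and rxrpc hunks do. The read-side pattern used above:

    struct timeval tv;

    skb_get_timestamp(skb, &tv);    /* expands skb->tstamp to a timeval */
    if (tv.tv_sec == 0)
            do_gettimeofday(&tv);   /* the skb was never stamped */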
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 7bac249258e3..45d3bc0812c8 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,43 @@
1# 1#
2# Traffic control configuration. 2# Traffic control configuration.
3# 3#
4
5menuconfig NET_SCHED
6 bool "QoS and/or fair queueing"
7 ---help---
8 When the kernel has several packets to send out over a network
9 device, it has to decide which ones to send first, which ones to
10 delay, and which ones to drop. This is the job of the packet
11 scheduler, and several different algorithms for how to do this
12 "fairly" have been proposed.
13
14 If you say N here, you will get the standard packet scheduler, which
15 is a FIFO (first come, first served). If you say Y here, you will be
16 able to choose from among several alternative algorithms which can
17 then be attached to different network devices. This is useful for
18 example if some of your network devices are real time devices that
19 need a certain minimum data flow rate, or if you need to limit the
20 maximum data flow rate for traffic which matches specified criteria.
21 This code is considered to be experimental.
22
23 To administer these schedulers, you'll need the user-level utilities
24 from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
25 That package also contains some documentation; for more, check out
26 <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
27
28 This Quality of Service (QoS) support will enable you to use
29 Differentiated Services (diffserv) and Resource Reservation Protocol
30 (RSVP) on your Linux router if you also say Y to "QoS support",
31 "Packet classifier API" and to some classifiers below. Documentation
32 and software is at <http://diffserv.sourceforge.net/>.
33
34 If you say Y here and to "/proc file system" below, you will be able
35 to read status information about packet schedulers from the file
36 /proc/net/psched.
37
38 The available schedulers are listed in the following questions; you
39 can say Y to as many as you like. If unsure, say N now.
40
4choice 41choice
5 prompt "Packet scheduler clock source" 42 prompt "Packet scheduler clock source"
6 depends on NET_SCHED 43 depends on NET_SCHED
@@ -454,6 +491,7 @@ config NET_EMATCH_TEXT
454 depends on NET_EMATCH 491 depends on NET_EMATCH
455 select TEXTSEARCH 492 select TEXTSEARCH
456 select TEXTSEARCH_KMP 493 select TEXTSEARCH_KMP
494 select TEXTSEARCH_BM
457 select TEXTSEARCH_FSM 495 select TEXTSEARCH_FSM
458 ---help--- 496 ---help---
459 Say Y here if you want to be able to classify packets based on 497 Say Y here if you want to be able to classify packets based on

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 249c61936ea0..8aebe8f6d271 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
165 while ((a = act) != NULL) { 165 while ((a = act) != NULL) {
166repeat: 166repeat:
167 if (a->ops && a->ops->act) { 167 if (a->ops && a->ops->act) {
168 ret = a->ops->act(&skb, a); 168 ret = a->ops->act(&skb, a, res);
169 if (TC_MUNGED & skb->tc_verd) { 169 if (TC_MUNGED & skb->tc_verd) {
170 /* copied already, allow trampling */ 170 /* copied already, allow trampling */
171 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); 171 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
@@ -179,11 +179,6 @@ repeat:
179 act = a->next; 179 act = a->next;
180 } 180 }
181exec_done: 181exec_done:
182 if (skb->tc_classid > 0) {
183 res->classid = skb->tc_classid;
184 res->class = 0;
185 skb->tc_classid = 0;
186 }
187 return ret; 182 return ret;
188} 183}
189 184
@@ -598,7 +593,7 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
598 nlh->nlmsg_flags |= NLM_F_ROOT; 593 nlh->nlmsg_flags |= NLM_F_ROOT;
599 module_put(a->ops->owner); 594 module_put(a->ops->owner);
600 kfree(a); 595 kfree(a);
601 err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 596 err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
602 if (err > 0) 597 if (err > 0)
603 return 0; 598 return 0;
604 599
@@ -661,7 +656,7 @@ tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event)
661 656
662 /* now do the delete */ 657 /* now do the delete */
663 tcf_action_destroy(head, 0); 658 tcf_action_destroy(head, 0);
664 ret = rtnetlink_send(skb, pid, RTMGRP_TC, 659 ret = rtnetlink_send(skb, pid, RTNLGRP_TC,
665 n->nlmsg_flags&NLM_F_ECHO); 660 n->nlmsg_flags&NLM_F_ECHO);
666 if (ret > 0) 661 if (ret > 0)
667 return 0; 662 return 0;
@@ -703,9 +698,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
703 x->rta_len = skb->tail - (u8*)x; 698 x->rta_len = skb->tail - (u8*)x;
704 699
705 nlh->nlmsg_len = skb->tail - b; 700 nlh->nlmsg_len = skb->tail - b;
706 NETLINK_CB(skb).dst_groups = RTMGRP_TC; 701 NETLINK_CB(skb).dst_group = RTNLGRP_TC;
707 702
708 err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO); 703 err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
709 if (err > 0) 704 if (err > 0)
710 err = 0; 705 err = 0;
711 return err; 706 return err;
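
tcf_action_exec() stops smuggling the classification result through skb->tc_classid; instead the tcf_result is threaded straight into every action via the extra res argument to the ops->act() method (the per-action diffs below update gact, ipt, mirred, pedit, police, and simple accordingly). A hypothetical action that sets a class under the new signature:

    static int my_act(struct sk_buff **pskb, struct tc_action *a,
                      struct tcf_result *res)
    {
            if (res) {
                    res->classid = TC_H_MAKE(1, 1); /* illustrative classid */
                    res->class = 0;
            }
            return TC_ACT_OK;
    }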
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3b5714ef4d1a..b4d89fbb3782 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -367,7 +367,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
367 return -EINVAL; 367 return -EINVAL;
368 } 368 }
369 369
370 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 370 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
371} 371}
372 372
373struct tcf_dump_args 373struct tcf_dump_args
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 53d98f8d3d80..00eae5f9a01a 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -27,17 +27,17 @@
27 * lvalue rvalue 27 * lvalue rvalue
28 * +-----------+ +-----------+ 28 * +-----------+ +-----------+
29 * | type: INT | | type: INT | 29 * | type: INT | | type: INT |
30 * def | id: INDEV | | id: VALUE | 30 * def | id: DEV | | id: VALUE |
31 * | data: | | data: 3 | 31 * | data: | | data: 3 |
32 * +-----------+ +-----------+ 32 * +-----------+ +-----------+
33 * | | 33 * | |
34 * ---> meta_ops[INT][INDEV](...) | 34 * ---> meta_ops[INT][DEV](...) |
35 * | | 35 * | |
36 * ----------- | 36 * ----------- |
37 * V V 37 * V V
38 * +-----------+ +-----------+ 38 * +-----------+ +-----------+
39 * | type: INT | | type: INT | 39 * | type: INT | | type: INT |
40 * obj | id: INDEV | | id: VALUE | 40 * obj | id: DEV | | id: VALUE |
41 * | data: 2 |<--data got filled out | data: 3 | 41 * | data: 2 |<--data got filled out | data: 3 |
42 * +-----------+ +-----------+ 42 * +-----------+ +-----------+
43 * | | 43 * | |
@@ -170,26 +170,6 @@ META_COLLECTOR(var_dev)
170 *err = var_dev(skb->dev, dst); 170 *err = var_dev(skb->dev, dst);
171} 171}
172 172
173META_COLLECTOR(int_indev)
174{
175 *err = int_dev(skb->input_dev, dst);
176}
177
178META_COLLECTOR(var_indev)
179{
180 *err = var_dev(skb->input_dev, dst);
181}
182
183META_COLLECTOR(int_realdev)
184{
185 *err = int_dev(skb->real_dev, dst);
186}
187
188META_COLLECTOR(var_realdev)
189{
190 *err = var_dev(skb->real_dev, dst);
191}
192
193/************************************************************************** 173/**************************************************************************
194 * skb attributes 174 * skb attributes
195 **************************************************************************/ 175 **************************************************************************/
@@ -229,12 +209,14 @@ META_COLLECTOR(int_maclen)
229 * Netfilter 209 * Netfilter
230 **************************************************************************/ 210 **************************************************************************/
231 211
232#ifdef CONFIG_NETFILTER
233META_COLLECTOR(int_nfmark) 212META_COLLECTOR(int_nfmark)
234{ 213{
214#ifdef CONFIG_NETFILTER
235 dst->value = skb->nfmark; 215 dst->value = skb->nfmark;
236} 216#else
217 dst->value = 0;
237#endif 218#endif
219}
238 220
239/************************************************************************** 221/**************************************************************************
240 * Traffic Control 222 * Traffic Control
@@ -245,31 +227,21 @@ META_COLLECTOR(int_tcindex)
245 dst->value = skb->tc_index; 227 dst->value = skb->tc_index;
246} 228}
247 229
248#ifdef CONFIG_NET_CLS_ACT
249META_COLLECTOR(int_tcverd)
250{
251 dst->value = skb->tc_verd;
252}
253
254META_COLLECTOR(int_tcclassid)
255{
256 dst->value = skb->tc_classid;
257}
258#endif
259
260/************************************************************************** 230/**************************************************************************
261 * Routing 231 * Routing
262 **************************************************************************/ 232 **************************************************************************/
263 233
264#ifdef CONFIG_NET_CLS_ROUTE
265META_COLLECTOR(int_rtclassid) 234META_COLLECTOR(int_rtclassid)
266{ 235{
267 if (unlikely(skb->dst == NULL)) 236 if (unlikely(skb->dst == NULL))
268 *err = -1; 237 *err = -1;
269 else 238 else
239#ifdef CONFIG_NET_CLS_ROUTE
270 dst->value = skb->dst->tclassid; 240 dst->value = skb->dst->tclassid;
271} 241#else
242 dst->value = 0;
272#endif 243#endif
244}
273 245
274META_COLLECTOR(int_rtiif) 246META_COLLECTOR(int_rtiif)
275{ 247{
@@ -505,8 +477,6 @@ struct meta_ops
505static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { 477static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
506 [TCF_META_TYPE_VAR] = { 478 [TCF_META_TYPE_VAR] = {
507 [META_ID(DEV)] = META_FUNC(var_dev), 479 [META_ID(DEV)] = META_FUNC(var_dev),
508 [META_ID(INDEV)] = META_FUNC(var_indev),
509 [META_ID(REALDEV)] = META_FUNC(var_realdev),
510 [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), 480 [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
511 }, 481 },
512 [TCF_META_TYPE_INT] = { 482 [TCF_META_TYPE_INT] = {
@@ -515,25 +485,15 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
515 [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1), 485 [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
516 [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2), 486 [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
517 [META_ID(DEV)] = META_FUNC(int_dev), 487 [META_ID(DEV)] = META_FUNC(int_dev),
518 [META_ID(INDEV)] = META_FUNC(int_indev),
519 [META_ID(REALDEV)] = META_FUNC(int_realdev),
520 [META_ID(PRIORITY)] = META_FUNC(int_priority), 488 [META_ID(PRIORITY)] = META_FUNC(int_priority),
521 [META_ID(PROTOCOL)] = META_FUNC(int_protocol), 489 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
522 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), 490 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
523 [META_ID(PKTLEN)] = META_FUNC(int_pktlen), 491 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
524 [META_ID(DATALEN)] = META_FUNC(int_datalen), 492 [META_ID(DATALEN)] = META_FUNC(int_datalen),
525 [META_ID(MACLEN)] = META_FUNC(int_maclen), 493 [META_ID(MACLEN)] = META_FUNC(int_maclen),
526#ifdef CONFIG_NETFILTER
527 [META_ID(NFMARK)] = META_FUNC(int_nfmark), 494 [META_ID(NFMARK)] = META_FUNC(int_nfmark),
528#endif
529 [META_ID(TCINDEX)] = META_FUNC(int_tcindex), 495 [META_ID(TCINDEX)] = META_FUNC(int_tcindex),
530#ifdef CONFIG_NET_CLS_ACT
531 [META_ID(TCVERDICT)] = META_FUNC(int_tcverd),
532 [META_ID(TCCLASSID)] = META_FUNC(int_tcclassid),
533#endif
534#ifdef CONFIG_NET_CLS_ROUTE
535 [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), 496 [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
536#endif
537 [META_ID(RTIIF)] = META_FUNC(int_rtiif), 497 [META_ID(RTIIF)] = META_FUNC(int_rtiif),
538 [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family), 498 [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
539 [META_ID(SK_STATE)] = META_FUNC(int_sk_state), 499 [META_ID(SK_STATE)] = META_FUNC(int_sk_state),
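
The collectors and the ops table above trade #ifdef pairs around whole functions (and their table slots) for a single #ifdef inside the function body: the collector is always compiled in, the table layout no longer varies with the kernel config, and a disabled feature simply yields 0. The resulting idiom, using the nfmark collector exactly as it reads after this patch:

    META_COLLECTOR(int_nfmark)
    {
    #ifdef CONFIG_NETFILTER
            dst->value = skb->nfmark;   /* real mark when netfilter is built in */
    #else
            dst->value = 0;             /* stable default otherwise */
    #endif
    }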
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index 873840d8d072..77beabc91fa3 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -55,9 +55,6 @@ static int em_text_change(struct tcf_proto *tp, void *data, int len,
55 struct ts_config *ts_conf; 55 struct ts_config *ts_conf;
56 int flags = 0; 56 int flags = 0;
57 57
58 printk("Configuring text: %s from %d:%d to %d:%d len %d\n", conf->algo, conf->from_offset,
59 conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len);
60
61 if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len)) 58 if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
62 return -EINVAL; 59 return -EINVAL;
63 60
diff --git a/net/sched/gact.c b/net/sched/gact.c
index a811c89fef7f..d1c6d542912a 100644
--- a/net/sched/gact.c
+++ b/net/sched/gact.c
@@ -135,7 +135,7 @@ tcf_gact_cleanup(struct tc_action *a, int bind)
135} 135}
136 136
137static int 137static int
138tcf_gact(struct sk_buff **pskb, struct tc_action *a) 138tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
139{ 139{
140 struct tcf_gact *p = PRIV(a, gact); 140 struct tcf_gact *p = PRIV(a, gact);
141 struct sk_buff *skb = *pskb; 141 struct sk_buff *skb = *pskb;
diff --git a/net/sched/ipt.c b/net/sched/ipt.c
index b114d994d523..f50136eed211 100644
--- a/net/sched/ipt.c
+++ b/net/sched/ipt.c
@@ -201,7 +201,7 @@ tcf_ipt_cleanup(struct tc_action *a, int bind)
201} 201}
202 202
203static int 203static int
204tcf_ipt(struct sk_buff **pskb, struct tc_action *a) 204tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
205{ 205{
206 int ret = 0, result = 0; 206 int ret = 0, result = 0;
207 struct tcf_ipt *p = PRIV(a, ipt); 207 struct tcf_ipt *p = PRIV(a, ipt);
diff --git a/net/sched/mirred.c b/net/sched/mirred.c
index f309ce336803..20d06916dc0b 100644
--- a/net/sched/mirred.c
+++ b/net/sched/mirred.c
@@ -158,7 +158,7 @@ tcf_mirred_cleanup(struct tc_action *a, int bind)
158} 158}
159 159
160static int 160static int
161tcf_mirred(struct sk_buff **pskb, struct tc_action *a) 161tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
162{ 162{
163 struct tcf_mirred *p = PRIV(a, mirred); 163 struct tcf_mirred *p = PRIV(a, mirred);
164 struct net_device *dev; 164 struct net_device *dev;
diff --git a/net/sched/pedit.c b/net/sched/pedit.c
index 678be6a645fb..767d24f4610e 100644
--- a/net/sched/pedit.c
+++ b/net/sched/pedit.c
@@ -130,7 +130,7 @@ tcf_pedit_cleanup(struct tc_action *a, int bind)
130} 130}
131 131
132static int 132static int
133tcf_pedit(struct sk_buff **pskb, struct tc_action *a) 133tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
134{ 134{
135 struct tcf_pedit *p = PRIV(a, pedit); 135 struct tcf_pedit *p = PRIV(a, pedit);
136 struct sk_buff *skb = *pskb; 136 struct sk_buff *skb = *pskb;
diff --git a/net/sched/police.c b/net/sched/police.c
index c03545faf523..eb39fb2f39b6 100644
--- a/net/sched/police.c
+++ b/net/sched/police.c
@@ -284,7 +284,8 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind)
284 return 0; 284 return 0;
285} 285}
286 286
287static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a) 287static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a,
288 struct tcf_result *res)
288{ 289{
289 psched_time_t now; 290 psched_time_t now;
290 struct sk_buff *skb = *pskb; 291 struct sk_buff *skb = *pskb;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b9a069af4a02..737681cb9a92 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -816,7 +816,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
816 } 816 }
817 817
818 if (skb->len) 818 if (skb->len)
819 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 819 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
820 820
821err_out: 821err_out:
822 kfree_skb(skb); 822 kfree_skb(skb);
@@ -1040,7 +1040,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1040 return -EINVAL; 1040 return -EINVAL;
1041 } 1041 }
1042 1042
1043 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 1043 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1044} 1044}
1045 1045
1046struct qdisc_dump_args 1046struct qdisc_dump_args
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 73e218e646ac..99ceb91f0150 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -238,6 +238,20 @@ static void dev_watchdog_down(struct net_device *dev)
238 spin_unlock_bh(&dev->xmit_lock); 238 spin_unlock_bh(&dev->xmit_lock);
239} 239}
240 240
241void netif_carrier_on(struct net_device *dev)
242{
243 if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
244 linkwatch_fire_event(dev);
245 if (netif_running(dev))
246 __netdev_watchdog_up(dev);
247}
248
249void netif_carrier_off(struct net_device *dev)
250{
251 if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
252 linkwatch_fire_event(dev);
253}
254
241/* "NOOP" scheduler: the best scheduler, recommended for all interfaces 255/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
242 under all circumstances. It is difficult to invent anything faster or 256 under all circumstances. It is difficult to invent anything faster or
243 cheaper. 257 cheaper.
@@ -331,11 +345,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
331 int prio; 345 int prio;
332 struct sk_buff_head *list = qdisc_priv(qdisc); 346 struct sk_buff_head *list = qdisc_priv(qdisc);
333 347
334 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) { 348 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
335 struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); 349 if (!skb_queue_empty(list + prio)) {
336 if (skb) {
337 qdisc->q.qlen--; 350 qdisc->q.qlen--;
338 return skb; 351 return __qdisc_dequeue_head(qdisc, list + prio);
339 } 352 }
340 } 353 }
341 354
@@ -439,6 +452,7 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
439 if (!ops->init || ops->init(sch, NULL) == 0) 452 if (!ops->init || ops->init(sch, NULL) == 0)
440 return sch; 453 return sch;
441 454
455 qdisc_destroy(sch);
442errout: 456errout:
443 return NULL; 457 return NULL;
444} 458}
@@ -600,6 +614,8 @@ void dev_shutdown(struct net_device *dev)
600} 614}
601 615
602EXPORT_SYMBOL(__netdev_watchdog_up); 616EXPORT_SYMBOL(__netdev_watchdog_up);
617EXPORT_SYMBOL(netif_carrier_on);
618EXPORT_SYMBOL(netif_carrier_off);
603EXPORT_SYMBOL(noop_qdisc); 619EXPORT_SYMBOL(noop_qdisc);
604EXPORT_SYMBOL(noop_qdisc_ops); 620EXPORT_SYMBOL(noop_qdisc_ops);
605EXPORT_SYMBOL(qdisc_create_dflt); 621EXPORT_SYMBOL(qdisc_create_dflt);
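
netif_carrier_on() and netif_carrier_off() move out of line into sch_generic.c; the test_and_{clear,set}_bit() on __LINK_STATE_NOCARRIER means a linkwatch event fires only on a real state change, and carrier-on additionally rearms the transmit watchdog on a running device. A driver's link-change handler would call them like this (the PHY status query is hypothetical):

    static void my_link_change(struct net_device *dev)
    {
            if (my_phy_link_up(dev))        /* hypothetical PHY status read */
                    netif_carrier_on(dev);  /* idempotent: event on change only */
            else
                    netif_carrier_off(dev);
    }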
diff --git a/net/sched/simple.c b/net/sched/simple.c
index 3ab4c675ab5d..8a6ae4f491e8 100644
--- a/net/sched/simple.c
+++ b/net/sched/simple.c
@@ -44,7 +44,7 @@ static DEFINE_RWLOCK(simp_lock);
44#include <net/pkt_act.h> 44#include <net/pkt_act.h>
45#include <net/act_generic.h> 45#include <net/act_generic.h>
46 46
47static int tcf_simp(struct sk_buff **pskb, struct tc_action *a) 47static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
48{ 48{
49 struct sk_buff *skb = *pskb; 49 struct sk_buff *skb = *pskb;
50 struct tcf_defact *p = PRIV(a, defact); 50 struct tcf_defact *p = PRIV(a, defact);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 4b47dd6f2485..5b24ae0650d3 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
71 const struct sctp_endpoint *ep, 71 const struct sctp_endpoint *ep,
72 const struct sock *sk, 72 const struct sock *sk,
73 sctp_scope_t scope, 73 sctp_scope_t scope,
74 int gfp) 74 unsigned int __nocast gfp)
75{ 75{
76 struct sctp_sock *sp; 76 struct sctp_sock *sp;
77 int i; 77 int i;
@@ -272,7 +272,8 @@ fail_init:
272/* Allocate and initialize a new association */ 272/* Allocate and initialize a new association */
273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, 273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
274 const struct sock *sk, 274 const struct sock *sk,
275 sctp_scope_t scope, int gfp) 275 sctp_scope_t scope,
276 unsigned int __nocast gfp)
276{ 277{
277 struct sctp_association *asoc; 278 struct sctp_association *asoc;
278 279
@@ -478,7 +479,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
478/* Add a transport address to an association. */ 479/* Add a transport address to an association. */
479struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, 480struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
480 const union sctp_addr *addr, 481 const union sctp_addr *addr,
481 const int gfp, 482 const unsigned int __nocast gfp,
482 const int peer_state) 483 const int peer_state)
483{ 484{
484 struct sctp_transport *peer; 485 struct sctp_transport *peer;
@@ -1229,7 +1230,8 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
1229/* Build the bind address list for the association based on info from the 1230/* Build the bind address list for the association based on info from the
1230 * local endpoint and the remote peer. 1231 * local endpoint and the remote peer.
1231 */ 1232 */
1232int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp) 1233int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
1234 unsigned int __nocast gfp)
1233{ 1235{
1234 sctp_scope_t scope; 1236 sctp_scope_t scope;
1235 int flags; 1237 int flags;
@@ -1251,7 +1253,8 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp)
1251 1253
1252/* Build the association's bind address list from the cookie. */ 1254/* Build the association's bind address list from the cookie. */
1253int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, 1255int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
1254 struct sctp_cookie *cookie, int gfp) 1256 struct sctp_cookie *cookie,
1257 unsigned int __nocast gfp)
1255{ 1258{
1256 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length); 1259 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
1257 int var_size3 = cookie->raw_addr_list_len; 1260 int var_size3 = cookie->raw_addr_list_len;
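
Throughout SCTP the gfp parameters change from plain int to unsigned int __nocast, a sparse annotation (a no-op for gcc) that makes the checker warn when some other integer type is implicitly converted to the parameter, catching callers that pass a bare 0 or swap arguments. The pattern, applied to a hypothetical helper:

    #include <linux/compiler.h>     /* __nocast; empty unless sparse runs */
    #include <linux/slab.h>

    static void *my_alloc(size_t len, unsigned int __nocast gfp)
    {
            return kmalloc(len, gfp);   /* callers must pass real GFP_* flags */
    }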
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f90eadfb60a2..f71549710f2e 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -53,7 +53,8 @@
53 53
54/* Forward declarations for internal helpers. */ 54/* Forward declarations for internal helpers. */
55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *, 55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
56 sctp_scope_t scope, int gfp, int flags); 56 sctp_scope_t scope, unsigned int __nocast gfp,
57 int flags);
57static void sctp_bind_addr_clean(struct sctp_bind_addr *); 58static void sctp_bind_addr_clean(struct sctp_bind_addr *);
58 59
59/* First Level Abstractions. */ 60/* First Level Abstractions. */
@@ -63,7 +64,8 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *);
63 */ 64 */
64int sctp_bind_addr_copy(struct sctp_bind_addr *dest, 65int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
65 const struct sctp_bind_addr *src, 66 const struct sctp_bind_addr *src,
66 sctp_scope_t scope, int gfp, int flags) 67 sctp_scope_t scope, unsigned int __nocast gfp,
68 int flags)
67{ 69{
68 struct sctp_sockaddr_entry *addr; 70 struct sctp_sockaddr_entry *addr;
69 struct list_head *pos; 71 struct list_head *pos;
@@ -144,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
144 146
145/* Add an address to the bind address list in the SCTP_bind_addr structure. */ 147/* Add an address to the bind address list in the SCTP_bind_addr structure. */
146int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, 148int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
147 int gfp) 149 unsigned int __nocast gfp)
148{ 150{
149 struct sctp_sockaddr_entry *addr; 151 struct sctp_sockaddr_entry *addr;
150 152
@@ -197,7 +199,8 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
197 * The second argument is the return value for the length. 199 * The second argument is the return value for the length.
198 */ 200 */
199union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, 201union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
200 int *addrs_len, int gfp) 202 int *addrs_len,
203 unsigned int __nocast gfp)
201{ 204{
202 union sctp_params addrparms; 205 union sctp_params addrparms;
203 union sctp_params retval; 206 union sctp_params retval;
@@ -249,7 +252,7 @@ end_raw:
249 * address parameters). 252 * address parameters).
250 */ 253 */
251int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, 254int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
252 int addrs_len, __u16 port, int gfp) 255 int addrs_len, __u16 port, unsigned int __nocast gfp)
253{ 256{
254 union sctp_addr_param *rawaddr; 257 union sctp_addr_param *rawaddr;
255 struct sctp_paramhdr *param; 258 struct sctp_paramhdr *param;
@@ -347,7 +350,8 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
347/* Copy out addresses from the global local address list. */ 350/* Copy out addresses from the global local address list. */
348static int sctp_copy_one_addr(struct sctp_bind_addr *dest, 351static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
349 union sctp_addr *addr, 352 union sctp_addr *addr,
350 sctp_scope_t scope, int gfp, int flags) 353 sctp_scope_t scope, unsigned int __nocast gfp,
354 int flags)
351{ 355{
352 int error = 0; 356 int error = 0;
353 357
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 0c2ab7885058..61da2937e641 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
62} 62}
63 63
64/* Allocate and initialize datamsg. */ 64/* Allocate and initialize datamsg. */
65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(int gfp) 65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp)
66{ 66{
67 struct sctp_datamsg *msg; 67 struct sctp_datamsg *msg;
68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp); 68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index c44bf4165c6e..e22ccd655965 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -67,7 +67,8 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep);
67 * Initialize the base fields of the endpoint structure. 67 * Initialize the base fields of the endpoint structure.
68 */ 68 */
69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, 69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
70 struct sock *sk, int gfp) 70 struct sock *sk,
71 unsigned int __nocast gfp)
71{ 72{
72 struct sctp_sock *sp = sctp_sk(sk); 73 struct sctp_sock *sp = sctp_sk(sk);
73 memset(ep, 0, sizeof(struct sctp_endpoint)); 74 memset(ep, 0, sizeof(struct sctp_endpoint));
@@ -137,7 +138,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
137/* Create a sctp_endpoint with all that boring stuff initialized. 138/* Create a sctp_endpoint with all that boring stuff initialized.
138 * Returns NULL if there isn't enough memory. 139 * Returns NULL if there isn't enough memory.
139 */ 140 */
140struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, int gfp) 141struct sctp_endpoint *sctp_endpoint_new(struct sock *sk,
142 unsigned int __nocast gfp)
141{ 143{
142 struct sctp_endpoint *ep; 144 struct sctp_endpoint *ep;
143 145
@@ -191,8 +193,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
191 sctp_unhash_endpoint(ep); 193 sctp_unhash_endpoint(ep);
192 194
193 /* Free up the HMAC transform. */ 195 /* Free up the HMAC transform. */
194 if (sctp_sk(ep->base.sk)->hmac) 196 sctp_crypto_free_tfm(sctp_sk(ep->base.sk)->hmac);
195 sctp_crypto_free_tfm(sctp_sk(ep->base.sk)->hmac);
196 197
197 /* Cleanup. */ 198 /* Cleanup. */
198 sctp_inq_free(&ep->base.inqueue); 199 sctp_inq_free(&ep->base.inqueue);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5e085e041a6e..28f32243397f 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -236,8 +236,8 @@ int sctp_rcv(struct sk_buff *skb)
236 } 236 }
237 237
238 /* SCTP seems to always need a timestamp right now (FIXME) */ 238 /* SCTP seems to always need a timestamp right now (FIXME) */
239 if (skb->stamp.tv_sec == 0) { 239 if (skb->tstamp.off_sec == 0) {
240 do_gettimeofday(&skb->stamp); 240 __net_timestamp(skb);
241 sock_enable_timestamp(sk); 241 sock_enable_timestamp(sk);
242 } 242 }
243 243
@@ -351,7 +351,6 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
351 * 351 *
352 */ 352 */
353void sctp_icmp_proto_unreachable(struct sock *sk, 353void sctp_icmp_proto_unreachable(struct sock *sk,
354 struct sctp_endpoint *ep,
355 struct sctp_association *asoc, 354 struct sctp_association *asoc,
356 struct sctp_transport *t) 355 struct sctp_transport *t)
357{ 356{
@@ -367,7 +366,6 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
367/* Common lookup code for icmp/icmpv6 error handler. */ 366/* Common lookup code for icmp/icmpv6 error handler. */
368struct sock *sctp_err_lookup(int family, struct sk_buff *skb, 367struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
369 struct sctphdr *sctphdr, 368 struct sctphdr *sctphdr,
370 struct sctp_endpoint **epp,
371 struct sctp_association **app, 369 struct sctp_association **app,
372 struct sctp_transport **tpp) 370 struct sctp_transport **tpp)
373{ 371{
@@ -375,11 +373,10 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
375 union sctp_addr daddr; 373 union sctp_addr daddr;
376 struct sctp_af *af; 374 struct sctp_af *af;
377 struct sock *sk = NULL; 375 struct sock *sk = NULL;
378 struct sctp_endpoint *ep = NULL;
379 struct sctp_association *asoc = NULL; 376 struct sctp_association *asoc = NULL;
380 struct sctp_transport *transport = NULL; 377 struct sctp_transport *transport = NULL;
381 378
382 *app = NULL; *epp = NULL; *tpp = NULL; 379 *app = NULL; *tpp = NULL;
383 380
384 af = sctp_get_af_specific(family); 381 af = sctp_get_af_specific(family);
385 if (unlikely(!af)) { 382 if (unlikely(!af)) {
@@ -394,26 +391,15 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
394 * packet. 391 * packet.
395 */ 392 */
396 asoc = __sctp_lookup_association(&saddr, &daddr, &transport); 393 asoc = __sctp_lookup_association(&saddr, &daddr, &transport);
397 if (!asoc) { 394 if (!asoc)
398 /* If there is no matching association, see if it matches any 395 return NULL;
399 * endpoint. This may happen for an ICMP error generated in
400 * response to an INIT_ACK.
401 */
402 ep = __sctp_rcv_lookup_endpoint(&daddr);
403 if (!ep) {
404 return NULL;
405 }
406 }
407 396
408 if (asoc) { 397 sk = asoc->base.sk;
409 sk = asoc->base.sk;
410 398
411 if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) { 399 if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) {
412 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); 400 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
413 goto out; 401 goto out;
414 } 402 }
415 } else
416 sk = ep->base.sk;
417 403
418 sctp_bh_lock_sock(sk); 404 sctp_bh_lock_sock(sk);
419 405
@@ -423,7 +409,6 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
423 if (sock_owned_by_user(sk)) 409 if (sock_owned_by_user(sk))
424 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); 410 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
425 411
426 *epp = ep;
427 *app = asoc; 412 *app = asoc;
428 *tpp = transport; 413 *tpp = transport;
429 return sk; 414 return sk;
@@ -432,21 +417,16 @@ out:
432 sock_put(sk); 417 sock_put(sk);
433 if (asoc) 418 if (asoc)
434 sctp_association_put(asoc); 419 sctp_association_put(asoc);
435 if (ep)
436 sctp_endpoint_put(ep);
437 return NULL; 420 return NULL;
438} 421}
439 422
440/* Common cleanup code for icmp/icmpv6 error handler. */ 423/* Common cleanup code for icmp/icmpv6 error handler. */
441void sctp_err_finish(struct sock *sk, struct sctp_endpoint *ep, 424void sctp_err_finish(struct sock *sk, struct sctp_association *asoc)
442 struct sctp_association *asoc)
443{ 425{
444 sctp_bh_unlock_sock(sk); 426 sctp_bh_unlock_sock(sk);
445 sock_put(sk); 427 sock_put(sk);
446 if (asoc) 428 if (asoc)
447 sctp_association_put(asoc); 429 sctp_association_put(asoc);
448 if (ep)
449 sctp_endpoint_put(ep);
450} 430}
451 431
452/* 432/*
@@ -471,7 +451,6 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
471 int type = skb->h.icmph->type; 451 int type = skb->h.icmph->type;
472 int code = skb->h.icmph->code; 452 int code = skb->h.icmph->code;
473 struct sock *sk; 453 struct sock *sk;
474 struct sctp_endpoint *ep;
475 struct sctp_association *asoc; 454 struct sctp_association *asoc;
476 struct sctp_transport *transport; 455 struct sctp_transport *transport;
477 struct inet_sock *inet; 456 struct inet_sock *inet;
@@ -488,7 +467,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
488 savesctp = skb->h.raw; 467 savesctp = skb->h.raw;
489 skb->nh.iph = iph; 468 skb->nh.iph = iph;
490 skb->h.raw = (char *)sh; 469 skb->h.raw = (char *)sh;
491 sk = sctp_err_lookup(AF_INET, skb, sh, &ep, &asoc, &transport); 470 sk = sctp_err_lookup(AF_INET, skb, sh, &asoc, &transport);
492 /* Put back, the original pointers. */ 471 /* Put back, the original pointers. */
493 skb->nh.raw = saveip; 472 skb->nh.raw = saveip;
494 skb->h.raw = savesctp; 473 skb->h.raw = savesctp;
@@ -515,7 +494,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
515 } 494 }
516 else { 495 else {
517 if (ICMP_PROT_UNREACH == code) { 496 if (ICMP_PROT_UNREACH == code) {
518 sctp_icmp_proto_unreachable(sk, ep, asoc, 497 sctp_icmp_proto_unreachable(sk, asoc,
519 transport); 498 transport);
520 goto out_unlock; 499 goto out_unlock;
521 } 500 }
@@ -544,7 +523,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
544 } 523 }
545 524
546out_unlock: 525out_unlock:
547 sctp_err_finish(sk, ep, asoc); 526 sctp_err_finish(sk, asoc);
548} 527}
549 528
550/* 529/*
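The input.c hunks above collapse the ICMP error path onto associations only: the INIT_ACK endpoint fallback is gone, so sctp_err_lookup() either pins an association (returning its bh-locked socket) or fails. A minimal sketch of the resulting caller contract, using the names from sctp_v4_err() in the hunks above:

	static void icmp_err_sketch(struct sk_buff *skb, struct sctphdr *sh)
	{
		struct sctp_association *asoc;
		struct sctp_transport *transport;
		struct sock *sk;

		sk = sctp_err_lookup(AF_INET, skb, sh, &asoc, &transport);
		if (!sk)
			return;		/* no matching association */

		/* ... handle the ICMP type/code with sk locked for bh ... */

		sctp_err_finish(sk, asoc);	/* unlock, drop sock+asoc refs */
	}
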
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c7e42d125b9c..fa3be2b8fb5f 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -66,8 +66,8 @@
66#include <linux/seq_file.h> 66#include <linux/seq_file.h>
67 67
68#include <net/protocol.h> 68#include <net/protocol.h>
69#include <net/tcp.h>
70#include <net/ndisc.h> 69#include <net/ndisc.h>
70#include <net/ip.h>
71#include <net/ipv6.h> 71#include <net/ipv6.h>
72#include <net/transp_v6.h> 72#include <net/transp_v6.h>
73#include <net/addrconf.h> 73#include <net/addrconf.h>
@@ -91,7 +91,6 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
91 struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; 91 struct ipv6hdr *iph = (struct ipv6hdr *)skb->data;
92 struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); 92 struct sctphdr *sh = (struct sctphdr *)(skb->data + offset);
93 struct sock *sk; 93 struct sock *sk;
94 struct sctp_endpoint *ep;
95 struct sctp_association *asoc; 94 struct sctp_association *asoc;
96 struct sctp_transport *transport; 95 struct sctp_transport *transport;
97 struct ipv6_pinfo *np; 96 struct ipv6_pinfo *np;
@@ -105,7 +104,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
105 savesctp = skb->h.raw; 104 savesctp = skb->h.raw;
106 skb->nh.ipv6h = iph; 105 skb->nh.ipv6h = iph;
107 skb->h.raw = (char *)sh; 106 skb->h.raw = (char *)sh;
108 sk = sctp_err_lookup(AF_INET6, skb, sh, &ep, &asoc, &transport); 107 sk = sctp_err_lookup(AF_INET6, skb, sh, &asoc, &transport);
109 /* Put back the original pointers. */ 108 /* Put back the original pointers. */
110 skb->nh.raw = saveip; 109 skb->nh.raw = saveip;
111 skb->h.raw = savesctp; 110 skb->h.raw = savesctp;
@@ -124,7 +123,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
124 goto out_unlock; 123 goto out_unlock;
125 case ICMPV6_PARAMPROB: 124 case ICMPV6_PARAMPROB:
126 if (ICMPV6_UNK_NEXTHDR == code) { 125 if (ICMPV6_UNK_NEXTHDR == code) {
127 sctp_icmp_proto_unreachable(sk, ep, asoc, transport); 126 sctp_icmp_proto_unreachable(sk, asoc, transport);
128 goto out_unlock; 127 goto out_unlock;
129 } 128 }
130 break; 129 break;
@@ -142,7 +141,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
142 } 141 }
143 142
144out_unlock: 143out_unlock:
145 sctp_err_finish(sk, ep, asoc); 144 sctp_err_finish(sk, asoc);
146out: 145out:
147 if (likely(idev != NULL)) 146 if (likely(idev != NULL))
148 in6_dev_put(idev); 147 in6_dev_put(idev);
@@ -642,10 +641,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
642 else 641 else
643 newinet->pmtudisc = IP_PMTUDISC_WANT; 642 newinet->pmtudisc = IP_PMTUDISC_WANT;
644 643
645#ifdef INET_REFCNT_DEBUG 644 sk_refcnt_debug_inc(newsk);
646 atomic_inc(&inet6_sock_nr);
647 atomic_inc(&inet_sock_nr);
648#endif
649 645
650 if (newsk->sk_prot->init(newsk)) { 646 if (newsk->sk_prot->init(newsk)) {
651 sk_common_release(newsk); 647 sk_common_release(newsk);
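Both create_accept_sk hunks (this one and the IPv4 twin in protocol.c below) swap the hand-rolled INET_REFCNT_DEBUG counters for sk_refcnt_debug_inc(). A rough sketch of that helper as defined in include/net/sock.h of this era, from memory, so treat the details as an approximation:

	#ifdef SOCK_REFCNT_DEBUG
	static inline void sk_refcnt_debug_inc(struct sock *sk)
	{
		/* one per-protocol counter instead of global inet counters */
		atomic_inc(&sk->sk_prot->socks);
	}
	#else
	#define sk_refcnt_debug_inc(sk) do { } while (0)
	#endif
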
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 0781e5d509fd..8ff588f0d76a 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -127,8 +127,12 @@ done:
127/* Initialize the objcount in the proc filesystem. */ 127/* Initialize the objcount in the proc filesystem. */
128void sctp_dbg_objcnt_init(void) 128void sctp_dbg_objcnt_init(void)
129{ 129{
130 create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp, 130 struct proc_dir_entry *ent;
131 ent = create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp,
131 sctp_dbg_objcnt_read, NULL); 132 sctp_dbg_objcnt_read, NULL);
133 if (!ent)
134 printk(KERN_WARNING
135 "sctp_dbg_objcnt: Unable to create /proc entry.\n");
132} 136}
133 137
134/* Cleanup the objcount entry in the proc filesystem. */ 138/* Cleanup the objcount entry in the proc filesystem. */
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 98d49ec9b74b..b74f7772b576 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -57,6 +57,7 @@ static struct snmp_mib sctp_snmp_list[] = {
57 SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS), 57 SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS),
58 SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS), 58 SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS),
59 SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS), 59 SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS),
60 SNMP_MIB_SENTINEL
60}; 61};
61 62
62/* Return the current value of a particular entry in the mib by adding its 63/* Return the current value of a particular entry in the mib by adding its
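Appending SNMP_MIB_SENTINEL lets the /proc dump walk sctp_snmp_list without a hard-coded length. A sketch of the pattern, assuming the usual include/net/snmp.h shape of the terminator (fold_field() stands in for this file's existing per-CPU summing helper):

	/* assumed: #define SNMP_MIB_SENTINEL { .name = NULL, .entry = 0 } */
	int i;

	for (i = 0; sctp_snmp_list[i].name != NULL; i++)
		seq_printf(seq, "%-32s\t%ld\n",
			   sctp_snmp_list[i].name,
			   fold_field((void **)sctp_statistics,
				      sctp_snmp_list[i].entry));
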
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index e7f37faba7c0..e7025be77691 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -62,7 +62,7 @@
62/* Global data structures. */ 62/* Global data structures. */
63struct sctp_globals sctp_globals; 63struct sctp_globals sctp_globals;
64struct proc_dir_entry *proc_net_sctp; 64struct proc_dir_entry *proc_net_sctp;
65DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics); 65DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly;
66 66
67struct idr sctp_assocs_id; 67struct idr sctp_assocs_id;
68DEFINE_SPINLOCK(sctp_assocs_id_lock); 68DEFINE_SPINLOCK(sctp_assocs_id_lock);
@@ -78,8 +78,8 @@ static struct sctp_pf *sctp_pf_inet_specific;
78static struct sctp_af *sctp_af_v4_specific; 78static struct sctp_af *sctp_af_v4_specific;
79static struct sctp_af *sctp_af_v6_specific; 79static struct sctp_af *sctp_af_v6_specific;
80 80
81kmem_cache_t *sctp_chunk_cachep; 81kmem_cache_t *sctp_chunk_cachep __read_mostly;
82kmem_cache_t *sctp_bucket_cachep; 82kmem_cache_t *sctp_bucket_cachep __read_mostly;
83 83
84extern int sctp_snmp_proc_init(void); 84extern int sctp_snmp_proc_init(void);
85extern int sctp_snmp_proc_exit(void); 85extern int sctp_snmp_proc_exit(void);
@@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void)
219 219
220/* Copy the local addresses which are valid for 'scope' into 'bp'. */ 220/* Copy the local addresses which are valid for 'scope' into 'bp'. */
221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, 221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
222 int gfp, int copy_flags) 222 unsigned int __nocast gfp, int copy_flags)
223{ 223{
224 struct sctp_sockaddr_entry *addr; 224 struct sctp_sockaddr_entry *addr;
225 int error = 0; 225 int error = 0;
@@ -593,9 +593,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
593 newinet->mc_index = 0; 593 newinet->mc_index = 0;
594 newinet->mc_list = NULL; 594 newinet->mc_list = NULL;
595 595
596#ifdef INET_REFCNT_DEBUG 596 sk_refcnt_debug_inc(newsk);
597 atomic_inc(&inet_sock_nr);
598#endif
599 597
600 if (newsk->sk_prot->init(newsk)) { 598 if (newsk->sk_prot->init(newsk)) {
601 sk_common_release(newsk); 599 sk_common_release(newsk);
@@ -1244,6 +1242,10 @@ SCTP_STATIC __exit void sctp_exit(void)
1244module_init(sctp_init); 1242module_init(sctp_init);
1245module_exit(sctp_exit); 1243module_exit(sctp_exit);
1246 1244
1245/*
1246 * __stringify doesn't like enums, so use the IPPROTO_SCTP value (132) directly.
1247 */
1248MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
1247MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>"); 1249MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>");
1248MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); 1250MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
1249MODULE_LICENSE("GPL"); 1251MODULE_LICENSE("GPL");
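The new MODULE_ALIAS is what makes socket(PF_INET, SOCK_SEQPACKET, IPPROTO_SCTP) auto-load sctp.ko. A sketch of the demand-load side; the exact request_module() format string is an assumption, but it has to expand to the literal alias string "net-pf-2-proto-132":

	/* in the PF_INET socket() path, when no protocol handler matched: */
	request_module("net-pf-%d-proto-%d", PF_INET, IPPROTO_SCTP);
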
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 773cd93fa3d0..3868a8d70cc0 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
78static int sctp_process_param(struct sctp_association *asoc, 78static int sctp_process_param(struct sctp_association *asoc,
79 union sctp_params param, 79 union sctp_params param,
80 const union sctp_addr *peer_addr, 80 const union sctp_addr *peer_addr,
81 int gfp); 81 unsigned int __nocast gfp);
82 82
83/* What was the inbound interface for this chunk? */ 83/* What was the inbound interface for this chunk? */
84int sctp_chunk_iif(const struct sctp_chunk *chunk) 84int sctp_chunk_iif(const struct sctp_chunk *chunk)
@@ -174,7 +174,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code,
174 */ 174 */
175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, 175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
176 const struct sctp_bind_addr *bp, 176 const struct sctp_bind_addr *bp,
177 int gfp, int vparam_len) 177 unsigned int __nocast gfp, int vparam_len)
178{ 178{
179 sctp_inithdr_t init; 179 sctp_inithdr_t init;
180 union sctp_params addrs; 180 union sctp_params addrs;
@@ -261,7 +261,7 @@ nodata:
261 261
262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, 262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
263 const struct sctp_chunk *chunk, 263 const struct sctp_chunk *chunk,
264 int gfp, int unkparam_len) 264 unsigned int __nocast gfp, int unkparam_len)
265{ 265{
266 sctp_inithdr_t initack; 266 sctp_inithdr_t initack;
267 struct sctp_chunk *retval; 267 struct sctp_chunk *retval;
@@ -1233,7 +1233,8 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
1233 1233
1234/* Create a CLOSED association to use with an incoming packet. */ 1234/* Create a CLOSED association to use with an incoming packet. */
1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, 1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
1236 struct sctp_chunk *chunk, int gfp) 1236 struct sctp_chunk *chunk,
1237 unsigned int __nocast gfp)
1237{ 1238{
1238 struct sctp_association *asoc; 1239 struct sctp_association *asoc;
1239 struct sk_buff *skb; 1240 struct sk_buff *skb;
@@ -1348,7 +1349,7 @@ nodata:
1348struct sctp_association *sctp_unpack_cookie( 1349struct sctp_association *sctp_unpack_cookie(
1349 const struct sctp_endpoint *ep, 1350 const struct sctp_endpoint *ep,
1350 const struct sctp_association *asoc, 1351 const struct sctp_association *asoc,
1351 struct sctp_chunk *chunk, int gfp, 1352 struct sctp_chunk *chunk, unsigned int __nocast gfp,
1352 int *error, struct sctp_chunk **errp) 1353 int *error, struct sctp_chunk **errp)
1353{ 1354{
1354 struct sctp_association *retval = NULL; 1355 struct sctp_association *retval = NULL;
@@ -1361,6 +1362,7 @@ struct sctp_association *sctp_unpack_cookie(
1361 char *key; 1362 char *key;
1362 sctp_scope_t scope; 1363 sctp_scope_t scope;
1363 struct sk_buff *skb = chunk->skb; 1364 struct sk_buff *skb = chunk->skb;
1365 struct timeval tv;
1364 1366
1365 headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE; 1367 headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE;
1366 bodysize = ntohs(chunk->chunk_hdr->length) - headersize; 1368 bodysize = ntohs(chunk->chunk_hdr->length) - headersize;
@@ -1433,7 +1435,8 @@ no_hmac:
1433 * an association, there is no need to check cookie's expiration 1435 * an association, there is no need to check cookie's expiration
1434 * for init collision case of lost COOKIE ACK. 1436 * for init collision case of lost COOKIE ACK.
1435 */ 1437 */
1436 if (!asoc && tv_lt(bear_cookie->expiration, skb->stamp)) { 1438 skb_get_timestamp(skb, &tv);
1439 if (!asoc && tv_lt(bear_cookie->expiration, tv)) {
1437 __u16 len; 1440 __u16 len;
1438 /* 1441 /*
1439 * Section 3.3.10.3 Stale Cookie Error (3) 1442 * Section 3.3.10.3 Stale Cookie Error (3)
@@ -1446,10 +1449,9 @@ no_hmac:
1446 len = ntohs(chunk->chunk_hdr->length); 1449 len = ntohs(chunk->chunk_hdr->length);
1447 *errp = sctp_make_op_error_space(asoc, chunk, len); 1450 *errp = sctp_make_op_error_space(asoc, chunk, len);
1448 if (*errp) { 1451 if (*errp) {
1449 suseconds_t usecs = (skb->stamp.tv_sec - 1452 suseconds_t usecs = (tv.tv_sec -
1450 bear_cookie->expiration.tv_sec) * 1000000L + 1453 bear_cookie->expiration.tv_sec) * 1000000L +
1451 skb->stamp.tv_usec - 1454 tv.tv_usec - bear_cookie->expiration.tv_usec;
1452 bear_cookie->expiration.tv_usec;
1453 1455
1454 usecs = htonl(usecs); 1456 usecs = htonl(usecs);
1455 sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, 1457 sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
@@ -1812,7 +1814,7 @@ int sctp_verify_init(const struct sctp_association *asoc,
1812 */ 1814 */
1813int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, 1815int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1814 const union sctp_addr *peer_addr, 1816 const union sctp_addr *peer_addr,
1815 sctp_init_chunk_t *peer_init, int gfp) 1817 sctp_init_chunk_t *peer_init, unsigned int __nocast gfp)
1816{ 1818{
1817 union sctp_params param; 1819 union sctp_params param;
1818 struct sctp_transport *transport; 1820 struct sctp_transport *transport;
@@ -1983,7 +1985,7 @@ nomem:
1983static int sctp_process_param(struct sctp_association *asoc, 1985static int sctp_process_param(struct sctp_association *asoc,
1984 union sctp_params param, 1986 union sctp_params param,
1985 const union sctp_addr *peer_addr, 1987 const union sctp_addr *peer_addr,
1986 int gfp) 1988 unsigned int __nocast gfp)
1987{ 1989{
1988 union sctp_addr addr; 1990 union sctp_addr addr;
1989 int i; 1991 int i;
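The stale-cookie hunk stops poking skb->stamp directly and goes through skb_get_timestamp() instead. A sketch of the accessor, assuming the off_sec/off_usec layout that the svcsock.c hunk further down also uses; any base-offset conversion in the real helper is elided here:

	static inline void skb_get_timestamp(const struct sk_buff *skb,
					     struct timeval *stamp)
	{
		stamp->tv_sec  = skb->tstamp.off_sec;
		stamp->tv_usec = skb->tstamp.off_usec;
	}
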
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 778639db125a..39c970b5b198 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
63 void *event_arg, 63 void *event_arg,
64 sctp_disposition_t status, 64 sctp_disposition_t status,
65 sctp_cmd_seq_t *commands, 65 sctp_cmd_seq_t *commands,
66 int gfp); 66 unsigned int __nocast gfp);
67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, 67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
68 sctp_state_t state, 68 sctp_state_t state,
69 struct sctp_endpoint *ep, 69 struct sctp_endpoint *ep,
@@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
71 void *event_arg, 71 void *event_arg,
72 sctp_disposition_t status, 72 sctp_disposition_t status,
73 sctp_cmd_seq_t *commands, 73 sctp_cmd_seq_t *commands,
74 int gfp); 74 unsigned int __nocast gfp);
75 75
76/******************************************************************** 76/********************************************************************
77 * Helper functions 77 * Helper functions
@@ -497,7 +497,8 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands,
497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, 497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
498 struct sctp_association *asoc, 498 struct sctp_association *asoc,
499 struct sctp_chunk *chunk, 499 struct sctp_chunk *chunk,
500 sctp_init_chunk_t *peer_init, int gfp) 500 sctp_init_chunk_t *peer_init,
501 unsigned int __nocast gfp)
501{ 502{
502 int error; 503 int error;
503 504
@@ -852,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
852 struct sctp_endpoint *ep, 853 struct sctp_endpoint *ep,
853 struct sctp_association *asoc, 854 struct sctp_association *asoc,
854 void *event_arg, 855 void *event_arg,
855 int gfp) 856 unsigned int __nocast gfp)
856{ 857{
857 sctp_cmd_seq_t commands; 858 sctp_cmd_seq_t commands;
858 const sctp_sm_table_entry_t *state_fn; 859 const sctp_sm_table_entry_t *state_fn;
@@ -897,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
897 void *event_arg, 898 void *event_arg,
898 sctp_disposition_t status, 899 sctp_disposition_t status,
899 sctp_cmd_seq_t *commands, 900 sctp_cmd_seq_t *commands,
900 int gfp) 901 unsigned int __nocast gfp)
901{ 902{
902 int error; 903 int error;
903 904
@@ -985,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
985 void *event_arg, 986 void *event_arg,
986 sctp_disposition_t status, 987 sctp_disposition_t status,
987 sctp_cmd_seq_t *commands, 988 sctp_cmd_seq_t *commands,
988 int gfp) 989 unsigned int __nocast gfp)
989{ 990{
990 int error = 0; 991 int error = 0;
991 int force; 992 int force;
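Every `int gfp` to `unsigned int __nocast gfp` change in this series is a sparse annotation with no effect on generated code. A sketch of the definition, assuming the linux/compiler.h convention of the time:

	#ifdef __CHECKER__
	# define __nocast __attribute__((nocast))	/* sparse only */
	#else
	# define __nocast				/* no-op for gcc */
	#endif

	/* sparse now warns when a plain int reaches a __nocast gfp
	 * parameter -- the stepping stone to the dedicated gfp_t type. */
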
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 091a66f06a35..91ec8c936913 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4194,8 +4194,7 @@ out:
4194 sctp_release_sock(sk); 4194 sctp_release_sock(sk);
4195 return err; 4195 return err;
4196cleanup: 4196cleanup:
4197 if (tfm) 4197 sctp_crypto_free_tfm(tfm);
4198 sctp_crypto_free_tfm(tfm);
4199 goto out; 4198 goto out;
4200} 4199}
4201 4200
@@ -4892,7 +4891,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
4892 sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { 4891 sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
4893 event = sctp_skb2event(skb); 4892 event = sctp_skb2event(skb);
4894 if (event->asoc == assoc) { 4893 if (event->asoc == assoc) {
4895 __skb_unlink(skb, skb->list); 4894 __skb_unlink(skb, &oldsk->sk_receive_queue);
4896 __skb_queue_tail(&newsk->sk_receive_queue, skb); 4895 __skb_queue_tail(&newsk->sk_receive_queue, skb);
4897 } 4896 }
4898 } 4897 }
@@ -4921,7 +4920,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
4921 sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { 4920 sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
4922 event = sctp_skb2event(skb); 4921 event = sctp_skb2event(skb);
4923 if (event->asoc == assoc) { 4922 if (event->asoc == assoc) {
4924 __skb_unlink(skb, skb->list); 4923 __skb_unlink(skb, &oldsp->pd_lobby);
4925 __skb_queue_tail(queue, skb); 4924 __skb_queue_tail(queue, skb);
4926 } 4925 }
4927 } 4926 }
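The two sctp_sock_migrate() hunks are part of the tree-wide removal of skb->list: __skb_unlink() now takes the owning queue explicitly rather than trusting a back-pointer inside the skb. A sketch of the helper after that change (shape per include/linux/skbuff.h, from memory):

	static inline void __skb_unlink(struct sk_buff *skb,
					struct sk_buff_head *list)
	{
		struct sk_buff *next, *prev;

		list->qlen--;
		next = skb->next;
		prev = skb->prev;
		skb->next = skb->prev = NULL;	/* detached */
		next->prev = prev;
		prev->next = next;
	}
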
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index e627d2b451b6..25037daf3fa0 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -57,7 +57,8 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
57/* Create a new sctp_ssnmap. 57/* Create a new sctp_ssnmap.
58 * Allocate room to store at least 'len' contiguous TSNs. 58 * Allocate room to store at least 'len' contiguous TSNs.
59 */ 59 */
60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int gfp) 60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
61 unsigned int __nocast gfp)
61{ 62{
62 struct sctp_ssnmap *retval; 63 struct sctp_ssnmap *retval;
63 int size; 64 int size;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index dc4893474f18..75b28dd634fe 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -42,6 +42,7 @@
42 */ 42 */
43 43
44#include <net/sctp/structs.h> 44#include <net/sctp/structs.h>
45#include <net/sctp/sctp.h>
45#include <linux/sysctl.h> 46#include <linux/sysctl.h>
46 47
47static ctl_handler sctp_sysctl_jiffies_ms; 48static ctl_handler sctp_sysctl_jiffies_ms;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index a63b69179607..d2f04ebe5081 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -57,7 +57,7 @@
57/* Initialize a new transport from provided memory. */ 57/* Initialize a new transport from provided memory. */
58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, 58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
59 const union sctp_addr *addr, 59 const union sctp_addr *addr,
60 int gfp) 60 unsigned int __nocast gfp)
61{ 61{
62 /* Copy in the address. */ 62 /* Copy in the address. */
63 peer->ipaddr = *addr; 63 peer->ipaddr = *addr;
@@ -121,7 +121,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
121} 121}
122 122
123/* Allocate and initialize a new transport. */ 123/* Allocate and initialize a new transport. */
124struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, int gfp) 124struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
125 unsigned int __nocast gfp)
125{ 126{
126 struct sctp_transport *transport; 127 struct sctp_transport *transport;
127 128
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 17d0ff534735..0abd5101107c 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
74 74
75/* Create a new sctp_ulpevent. */ 75/* Create a new sctp_ulpevent. */
76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, 76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
77 int gfp) 77 unsigned int __nocast gfp)
78{ 78{
79 struct sctp_ulpevent *event; 79 struct sctp_ulpevent *event;
80 struct sk_buff *skb; 80 struct sk_buff *skb;
@@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( 136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
137 const struct sctp_association *asoc, 137 const struct sctp_association *asoc,
138 __u16 flags, __u16 state, __u16 error, __u16 outbound, 138 __u16 flags, __u16 state, __u16 error, __u16 outbound,
139 __u16 inbound, int gfp) 139 __u16 inbound, unsigned int __nocast gfp)
140{ 140{
141 struct sctp_ulpevent *event; 141 struct sctp_ulpevent *event;
142 struct sctp_assoc_change *sac; 142 struct sctp_assoc_change *sac;
@@ -237,7 +237,7 @@ fail:
237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( 237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
238 const struct sctp_association *asoc, 238 const struct sctp_association *asoc,
239 const struct sockaddr_storage *aaddr, 239 const struct sockaddr_storage *aaddr,
240 int flags, int state, int error, int gfp) 240 int flags, int state, int error, unsigned int __nocast gfp)
241{ 241{
242 struct sctp_ulpevent *event; 242 struct sctp_ulpevent *event;
243 struct sctp_paddr_change *spc; 243 struct sctp_paddr_change *spc;
@@ -350,7 +350,7 @@ fail:
350 */ 350 */
351struct sctp_ulpevent *sctp_ulpevent_make_remote_error( 351struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
352 const struct sctp_association *asoc, struct sctp_chunk *chunk, 352 const struct sctp_association *asoc, struct sctp_chunk *chunk,
353 __u16 flags, int gfp) 353 __u16 flags, unsigned int __nocast gfp)
354{ 354{
355 struct sctp_ulpevent *event; 355 struct sctp_ulpevent *event;
356 struct sctp_remote_error *sre; 356 struct sctp_remote_error *sre;
@@ -448,7 +448,7 @@ fail:
448 */ 448 */
449struct sctp_ulpevent *sctp_ulpevent_make_send_failed( 449struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
450 const struct sctp_association *asoc, struct sctp_chunk *chunk, 450 const struct sctp_association *asoc, struct sctp_chunk *chunk,
451 __u16 flags, __u32 error, int gfp) 451 __u16 flags, __u32 error, unsigned int __nocast gfp)
452{ 452{
453 struct sctp_ulpevent *event; 453 struct sctp_ulpevent *event;
454 struct sctp_send_failed *ssf; 454 struct sctp_send_failed *ssf;
@@ -557,7 +557,7 @@ fail:
557 */ 557 */
558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( 558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
559 const struct sctp_association *asoc, 559 const struct sctp_association *asoc,
560 __u16 flags, int gfp) 560 __u16 flags, unsigned int __nocast gfp)
561{ 561{
562 struct sctp_ulpevent *event; 562 struct sctp_ulpevent *event;
563 struct sctp_shutdown_event *sse; 563 struct sctp_shutdown_event *sse;
@@ -620,7 +620,7 @@ fail:
620 * 5.3.1.6 SCTP_ADAPTION_INDICATION 620 * 5.3.1.6 SCTP_ADAPTION_INDICATION
621 */ 621 */
622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication( 622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
623 const struct sctp_association *asoc, int gfp) 623 const struct sctp_association *asoc, unsigned int __nocast gfp)
624{ 624{
625 struct sctp_ulpevent *event; 625 struct sctp_ulpevent *event;
626 struct sctp_adaption_event *sai; 626 struct sctp_adaption_event *sai;
@@ -657,7 +657,7 @@ fail:
657 */ 657 */
658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, 658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
659 struct sctp_chunk *chunk, 659 struct sctp_chunk *chunk,
660 int gfp) 660 unsigned int __nocast gfp)
661{ 661{
662 struct sctp_ulpevent *event = NULL; 662 struct sctp_ulpevent *event = NULL;
663 struct sk_buff *skb; 663 struct sk_buff *skb;
@@ -718,7 +718,8 @@ fail:
718 * various events. 718 * various events.
719 */ 719 */
720struct sctp_ulpevent *sctp_ulpevent_make_pdapi( 720struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
721 const struct sctp_association *asoc, __u32 indication, int gfp) 721 const struct sctp_association *asoc, __u32 indication,
722 unsigned int __nocast gfp)
722{ 723{
723 struct sctp_ulpevent *event; 724 struct sctp_ulpevent *event;
724 struct sctp_pdapi_event *pd; 725 struct sctp_pdapi_event *pd;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index d5dd2cf7ac4a..ec2c857eae7f 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -50,9 +50,9 @@
50 50
51/* Forward declarations for internal helpers. */ 51/* Forward declarations for internal helpers. */
52static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, 52static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
53 struct sctp_ulpevent *); 53 struct sctp_ulpevent *);
54static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, 54static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *,
55 struct sctp_ulpevent *); 55 struct sctp_ulpevent *);
56 56
57/* 1st Level Abstractions */ 57/* 1st Level Abstractions */
58 58
@@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq)
100 100
101/* Process an incoming DATA chunk. */ 101/* Process an incoming DATA chunk. */
102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
103 int gfp) 103 unsigned int __nocast gfp)
104{ 104{
105 struct sk_buff_head temp; 105 struct sk_buff_head temp;
106 sctp_data_chunk_t *hdr; 106 sctp_data_chunk_t *hdr;
@@ -125,7 +125,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
125 event = sctp_ulpq_order(ulpq, event); 125 event = sctp_ulpq_order(ulpq, event);
126 } 126 }
127 127
128 /* Send event to the ULP. */ 128 /* Send event to the ULP. 'event' is the sctp_ulpevent for
129 * the very first SKB on the 'temp' list.
130 */
129 if (event) 131 if (event)
130 sctp_ulpq_tail_event(ulpq, event); 132 sctp_ulpq_tail_event(ulpq, event);
131 133
@@ -158,14 +160,18 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
158 return sctp_clear_pd(ulpq->asoc->base.sk); 160 return sctp_clear_pd(ulpq->asoc->base.sk);
159} 161}
160 162
161 163/* If the SKB of 'event' is on a list, it is the first such member
162 164 * of that list.
165 */
163int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) 166int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
164{ 167{
165 struct sock *sk = ulpq->asoc->base.sk; 168 struct sock *sk = ulpq->asoc->base.sk;
166 struct sk_buff_head *queue; 169 struct sk_buff_head *queue, *skb_list;
170 struct sk_buff *skb = sctp_event2skb(event);
167 int clear_pd = 0; 171 int clear_pd = 0;
168 172
173 skb_list = (struct sk_buff_head *) skb->prev;
174
169 /* If the socket is just going to throw this away, do not 175 /* If the socket is just going to throw this away, do not
170 * even try to deliver it. 176 * even try to deliver it.
171 */ 177 */
@@ -197,10 +203,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
197 /* If we are harvesting multiple skbs they will be 203 /* If we are harvesting multiple skbs they will be
198 * collected on a list. 204 * collected on a list.
199 */ 205 */
200 if (sctp_event2skb(event)->list) 206 if (skb_list)
201 sctp_skb_list_tail(sctp_event2skb(event)->list, queue); 207 sctp_skb_list_tail(skb_list, queue);
202 else 208 else
203 __skb_queue_tail(queue, sctp_event2skb(event)); 209 __skb_queue_tail(queue, skb);
204 210
205 /* Did we just complete partial delivery and need to get 211 /* Did we just complete partial delivery and need to get
206 * rolling again? Move pending data to the receive 212 * rolling again? Move pending data to the receive
@@ -214,10 +220,11 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
214 return 1; 220 return 1;
215 221
216out_free: 222out_free:
217 if (sctp_event2skb(event)->list) 223 if (skb_list)
218 sctp_queue_purge_ulpevents(sctp_event2skb(event)->list); 224 sctp_queue_purge_ulpevents(skb_list);
219 else 225 else
220 sctp_ulpevent_free(event); 226 sctp_ulpevent_free(event);
227
221 return 0; 228 return 0;
222} 229}
223 230
@@ -269,7 +276,7 @@ static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
269 * payload was fragmented on the way and ip had to reassemble them. 276 * payload was fragmented on the way and ip had to reassemble them.
270 * We add the rest of skb's to the first skb's fraglist. 277 * We add the rest of skb's to the first skb's fraglist.
271 */ 278 */
272static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag) 279static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag)
273{ 280{
274 struct sk_buff *pos; 281 struct sk_buff *pos;
275 struct sctp_ulpevent *event; 282 struct sctp_ulpevent *event;
@@ -294,7 +301,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag,
294 skb_shinfo(f_frag)->frag_list = pos; 301 skb_shinfo(f_frag)->frag_list = pos;
295 302
296 /* Remove the first fragment from the reassembly queue. */ 303 /* Remove the first fragment from the reassembly queue. */
297 __skb_unlink(f_frag, f_frag->list); 304 __skb_unlink(f_frag, queue);
298 while (pos) { 305 while (pos) {
299 306
300 pnext = pos->next; 307 pnext = pos->next;
@@ -304,7 +311,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag,
304 f_frag->data_len += pos->len; 311 f_frag->data_len += pos->len;
305 312
306 /* Remove the fragment from the reassembly queue. */ 313 /* Remove the fragment from the reassembly queue. */
307 __skb_unlink(pos, pos->list); 314 __skb_unlink(pos, queue);
308 315
309 /* Break if we have reached the last fragment. */ 316 /* Break if we have reached the last fragment. */
310 if (pos == l_frag) 317 if (pos == l_frag)
@@ -375,7 +382,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u
375done: 382done:
376 return retval; 383 return retval;
377found: 384found:
378 retval = sctp_make_reassembled_event(first_frag, pos); 385 retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos);
379 if (retval) 386 if (retval)
380 retval->msg_flags |= MSG_EOR; 387 retval->msg_flags |= MSG_EOR;
381 goto done; 388 goto done;
@@ -435,7 +442,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq
435 * further. 442 * further.
436 */ 443 */
437done: 444done:
438 retval = sctp_make_reassembled_event(first_frag, last_frag); 445 retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
439 if (retval && is_last) 446 if (retval && is_last)
440 retval->msg_flags |= MSG_EOR; 447 retval->msg_flags |= MSG_EOR;
441 448
@@ -527,7 +534,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *u
527 * further. 534 * further.
528 */ 535 */
529done: 536done:
530 retval = sctp_make_reassembled_event(first_frag, last_frag); 537 retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
531 return retval; 538 return retval;
532} 539}
533 540
@@ -537,6 +544,7 @@ done:
537static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, 544static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
538 struct sctp_ulpevent *event) 545 struct sctp_ulpevent *event)
539{ 546{
547 struct sk_buff_head *event_list;
540 struct sk_buff *pos, *tmp; 548 struct sk_buff *pos, *tmp;
541 struct sctp_ulpevent *cevent; 549 struct sctp_ulpevent *cevent;
542 struct sctp_stream *in; 550 struct sctp_stream *in;
@@ -547,6 +555,8 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
547 ssn = event->ssn; 555 ssn = event->ssn;
548 in = &ulpq->asoc->ssnmap->in; 556 in = &ulpq->asoc->ssnmap->in;
549 557
558 event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
559
550 /* We are holding the chunks by stream, by SSN. */ 560 /* We are holding the chunks by stream, by SSN. */
551 sctp_skb_for_each(pos, &ulpq->lobby, tmp) { 561 sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
552 cevent = (struct sctp_ulpevent *) pos->cb; 562 cevent = (struct sctp_ulpevent *) pos->cb;
@@ -567,10 +577,10 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
567 /* Found it, so mark in the ssnmap. */ 577 /* Found it, so mark in the ssnmap. */
568 sctp_ssn_next(in, sid); 578 sctp_ssn_next(in, sid);
569 579
570 __skb_unlink(pos, pos->list); 580 __skb_unlink(pos, &ulpq->lobby);
571 581
572 /* Attach all gathered skbs to the event. */ 582 /* Attach all gathered skbs to the event. */
573 __skb_queue_tail(sctp_event2skb(event)->list, pos); 583 __skb_queue_tail(event_list, pos);
574 } 584 }
575} 585}
576 586
@@ -626,7 +636,7 @@ static inline void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq,
626} 636}
627 637
628static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, 638static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
629 struct sctp_ulpevent *event) 639 struct sctp_ulpevent *event)
630{ 640{
631 __u16 sid, ssn; 641 __u16 sid, ssn;
632 struct sctp_stream *in; 642 struct sctp_stream *in;
@@ -667,7 +677,7 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
667{ 677{
668 struct sk_buff *pos, *tmp; 678 struct sk_buff *pos, *tmp;
669 struct sctp_ulpevent *cevent; 679 struct sctp_ulpevent *cevent;
670 struct sctp_ulpevent *event = NULL; 680 struct sctp_ulpevent *event;
671 struct sctp_stream *in; 681 struct sctp_stream *in;
672 struct sk_buff_head temp; 682 struct sk_buff_head temp;
673 __u16 csid, cssn; 683 __u16 csid, cssn;
@@ -675,6 +685,8 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
675 in = &ulpq->asoc->ssnmap->in; 685 in = &ulpq->asoc->ssnmap->in;
676 686
677 /* We are holding the chunks by stream, by SSN. */ 687 /* We are holding the chunks by stream, by SSN. */
688 skb_queue_head_init(&temp);
689 event = NULL;
678 sctp_skb_for_each(pos, &ulpq->lobby, tmp) { 690 sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
679 cevent = (struct sctp_ulpevent *) pos->cb; 691 cevent = (struct sctp_ulpevent *) pos->cb;
680 csid = cevent->stream; 692 csid = cevent->stream;
@@ -686,19 +698,20 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
686 /* Found it, so mark in the ssnmap. */ 698 /* Found it, so mark in the ssnmap. */
687 sctp_ssn_next(in, csid); 699 sctp_ssn_next(in, csid);
688 700
689 __skb_unlink(pos, pos->list); 701 __skb_unlink(pos, &ulpq->lobby);
690 if (!event) { 702 if (!event) {
691 /* Create a temporary list to collect chunks on. */ 703 /* Create a temporary list to collect chunks on. */
692 event = sctp_skb2event(pos); 704 event = sctp_skb2event(pos);
693 skb_queue_head_init(&temp);
694 __skb_queue_tail(&temp, sctp_event2skb(event)); 705 __skb_queue_tail(&temp, sctp_event2skb(event));
695 } else { 706 } else {
696 /* Attach all gathered skbs to the event. */ 707 /* Attach all gathered skbs to the event. */
697 __skb_queue_tail(sctp_event2skb(event)->list, pos); 708 __skb_queue_tail(&temp, pos);
698 } 709 }
699 } 710 }
700 711
701 /* Send event to the ULP. */ 712 /* Send event to the ULP. 'event' is the sctp_ulpevent for
713 * the very first SKB on the 'temp' list.
714 */
702 if (event) 715 if (event)
703 sctp_ulpq_tail_event(ulpq, event); 716 sctp_ulpq_tail_event(ulpq, event);
704} 717}
@@ -778,7 +791,8 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
778 791
779/* Partial deliver the first message as there is pressure on rwnd. */ 792/* Partial deliver the first message as there is pressure on rwnd. */
780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, 793void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
781 struct sctp_chunk *chunk, int gfp) 794 struct sctp_chunk *chunk,
795 unsigned int __nocast gfp)
782{ 796{
783 struct sctp_ulpevent *event; 797 struct sctp_ulpevent *event;
784 struct sctp_association *asoc; 798 struct sctp_association *asoc;
@@ -802,7 +816,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
802 816
803/* Renege some packets to make room for an incoming chunk. */ 817/* Renege some packets to make room for an incoming chunk. */
804void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 818void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
805 int gfp) 819 unsigned int __nocast gfp)
806{ 820{
807 struct sctp_association *asoc; 821 struct sctp_association *asoc;
808 __u16 needed, freed; 822 __u16 needed, freed;
@@ -841,7 +855,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
841/* Notify the application if an association is aborted and in 855/* Notify the application if an association is aborted and in
842 * partial delivery mode. Send up any pending received messages. 856 * partial delivery mode. Send up any pending received messages.
843 */ 857 */
844void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, int gfp) 858void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp)
845{ 859{
846 struct sctp_ulpevent *ev = NULL; 860 struct sctp_ulpevent *ev = NULL;
847 struct sock *sk; 861 struct sock *sk;
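With skb->list gone, ulpqueue.c recovers the owning queue from skb->prev. That works because an sk_buff queue is circular and struct sk_buff_head begins with the same next/prev pair as struct sk_buff, so for the first element of a queue the prev pointer aliases the head itself. A sketch of the invariant the casts above rely on:

	struct sk_buff_head temp;
	struct sk_buff *skb = sctp_event2skb(event);
	struct sk_buff_head *owner;

	skb_queue_head_init(&temp);
	__skb_queue_tail(&temp, skb);

	/* circular list: head <-> skb <-> head, so for the first
	 * element skb->prev points back at the head */
	owner = (struct sk_buff_head *)skb->prev;	/* owner == &temp */
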
diff --git a/net/socket.c b/net/socket.c
index 6f2a17881972..94fe638b4d72 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -70,6 +70,8 @@
70#include <linux/seq_file.h> 70#include <linux/seq_file.h>
71#include <linux/wanrouter.h> 71#include <linux/wanrouter.h>
72#include <linux/if_bridge.h> 72#include <linux/if_bridge.h>
73#include <linux/if_frad.h>
74#include <linux/if_vlan.h>
73#include <linux/init.h> 75#include <linux/init.h>
74#include <linux/poll.h> 76#include <linux/poll.h>
75#include <linux/cache.h> 77#include <linux/cache.h>
@@ -272,7 +274,7 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ule
272 274
273#define SOCKFS_MAGIC 0x534F434B 275#define SOCKFS_MAGIC 0x534F434B
274 276
275static kmem_cache_t * sock_inode_cachep; 277static kmem_cache_t * sock_inode_cachep __read_mostly;
276 278
277static struct inode *sock_alloc_inode(struct super_block *sb) 279static struct inode *sock_alloc_inode(struct super_block *sb)
278{ 280{
@@ -331,7 +333,7 @@ static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
331 return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); 333 return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
332} 334}
333 335
334static struct vfsmount *sock_mnt; 336static struct vfsmount *sock_mnt __read_mostly;
335 337
336static struct file_system_type sock_fs_type = { 338static struct file_system_type sock_fs_type = {
337 .name = "sockfs", 339 .name = "sockfs",
@@ -404,6 +406,7 @@ int sock_map_fd(struct socket *sock)
404 file->f_mode = FMODE_READ | FMODE_WRITE; 406 file->f_mode = FMODE_READ | FMODE_WRITE;
405 file->f_flags = O_RDWR; 407 file->f_flags = O_RDWR;
406 file->f_pos = 0; 408 file->f_pos = 0;
409 file->private_data = sock;
407 fd_install(fd, file); 410 fd_install(fd, file);
408 } 411 }
409 412
@@ -436,6 +439,9 @@ struct socket *sockfd_lookup(int fd, int *err)
436 return NULL; 439 return NULL;
437 } 440 }
438 441
442 if (file->f_op == &socket_file_ops)
443 return file->private_data; /* set in sock_map_fd */
444
439 inode = file->f_dentry->d_inode; 445 inode = file->f_dentry->d_inode;
440 if (!S_ISSOCK(inode->i_mode)) { 446 if (!S_ISSOCK(inode->i_mode)) {
441 *err = -ENOTSOCK; 447 *err = -ENOTSOCK;
@@ -720,8 +726,8 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
720 return __sock_sendmsg(iocb, sock, &x->async_msg, size); 726 return __sock_sendmsg(iocb, sock, &x->async_msg, size);
721} 727}
722 728
723ssize_t sock_sendpage(struct file *file, struct page *page, 729static ssize_t sock_sendpage(struct file *file, struct page *page,
724 int offset, size_t size, loff_t *ppos, int more) 730 int offset, size_t size, loff_t *ppos, int more)
725{ 731{
726 struct socket *sock; 732 struct socket *sock;
727 int flags; 733 int flags;
@@ -944,7 +950,7 @@ static int sock_mmap(struct file * file, struct vm_area_struct * vma)
944 return sock->ops->mmap(file, sock, vma); 950 return sock->ops->mmap(file, sock, vma);
945} 951}
946 952
947int sock_close(struct inode *inode, struct file *filp) 953static int sock_close(struct inode *inode, struct file *filp)
948{ 954{
949 /* 955 /*
950 * It was possible the inode is NULL we were 956 * It was possible the inode is NULL we were
@@ -2023,9 +2029,6 @@ int sock_unregister(int family)
2023 return 0; 2029 return 0;
2024} 2030}
2025 2031
2026
2027extern void sk_init(void);
2028
2029void __init sock_init(void) 2032void __init sock_init(void)
2030{ 2033{
2031 /* 2034 /*
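The __read_mostly tags here (sock_inode_cachep, sock_mnt) and in the sunrpc files below move rarely written globals into their own section so they stop sharing cache lines with write-hot data. A sketch of the i386-era definition; the section name is from memory:

	#define __read_mostly \
		__attribute__((__section__(".data.read_mostly")))

The sockfd_lookup() change is a separate fast path: since sock_map_fd() now stores the socket in file->private_data, a matching file->f_op proves the file is a socket and the dentry/inode checks can be skipped entirely.
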
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 24c21f2a33a7..ee6ae74cd1b2 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -160,7 +160,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
160 " unsupported checksum %d", cksumtype); 160 " unsupported checksum %d", cksumtype);
161 goto out; 161 goto out;
162 } 162 }
163 if (!(tfm = crypto_alloc_tfm(cksumname, 0))) 163 if (!(tfm = crypto_alloc_tfm(cksumname, CRYPTO_TFM_REQ_MAY_SLEEP)))
164 goto out; 164 goto out;
165 cksum->len = crypto_tfm_alg_digestsize(tfm); 165 cksum->len = crypto_tfm_alg_digestsize(tfm);
166 if ((cksum->data = kmalloc(cksum->len, GFP_KERNEL)) == NULL) 166 if ((cksum->data = kmalloc(cksum->len, GFP_KERNEL)) == NULL)
@@ -185,9 +185,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
185 sg->page = body->pages[i]; 185 sg->page = body->pages[i];
186 sg->offset = offset; 186 sg->offset = offset;
187 sg->length = thislen; 187 sg->length = thislen;
188 kmap(sg->page); /* XXX kmap_atomic? */
189 crypto_digest_update(tfm, sg, 1); 188 crypto_digest_update(tfm, sg, 1);
190 kunmap(sg->page);
191 len -= thislen; 189 len -= thislen;
192 i++; 190 i++;
193 offset = 0; 191 offset = 0;
@@ -201,8 +199,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
201 crypto_digest_final(tfm, cksum->data); 199 crypto_digest_final(tfm, cksum->data);
202 code = 0; 200 code = 0;
203out: 201out:
204 if (tfm) 202 crypto_free_tfm(tfm);
205 crypto_free_tfm(tfm);
206 return code; 203 return code;
207} 204}
208 205
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index cf726510df8e..606a8a82cafb 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -185,12 +185,9 @@ static void
185gss_delete_sec_context_kerberos(void *internal_ctx) { 185gss_delete_sec_context_kerberos(void *internal_ctx) {
186 struct krb5_ctx *kctx = internal_ctx; 186 struct krb5_ctx *kctx = internal_ctx;
187 187
188 if (kctx->seq) 188 crypto_free_tfm(kctx->seq);
189 crypto_free_tfm(kctx->seq); 189 crypto_free_tfm(kctx->enc);
190 if (kctx->enc) 190 kfree(kctx->mech_used.data);
191 crypto_free_tfm(kctx->enc);
192 if (kctx->mech_used.data)
193 kfree(kctx->mech_used.data);
194 kfree(kctx); 191 kfree(kctx);
195} 192}
196 193
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
index dad05994c3eb..6c97d61baa9b 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c
@@ -214,14 +214,10 @@ static void
214gss_delete_sec_context_spkm3(void *internal_ctx) { 214gss_delete_sec_context_spkm3(void *internal_ctx) {
215 struct spkm3_ctx *sctx = internal_ctx; 215 struct spkm3_ctx *sctx = internal_ctx;
216 216
217 if(sctx->derived_integ_key) 217 crypto_free_tfm(sctx->derived_integ_key);
218 crypto_free_tfm(sctx->derived_integ_key); 218 crypto_free_tfm(sctx->derived_conf_key);
219 if(sctx->derived_conf_key) 219 kfree(sctx->share_key.data);
220 crypto_free_tfm(sctx->derived_conf_key); 220 kfree(sctx->mech_used.data);
221 if(sctx->share_key.data)
222 kfree(sctx->share_key.data);
223 if(sctx->mech_used.data)
224 kfree(sctx->mech_used.data);
225 kfree(sctx); 221 kfree(sctx);
226} 222}
227 223
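The guards dropped in the three files above (and in sctp/socket.c earlier) assume kfree()-style NULL tolerance from the free routines. A sketch of the check this depends on; the real guard sits inside crypto_free_tfm() in crypto/api.c, shape assumed:

	void crypto_free_tfm(struct crypto_tfm *tfm)
	{
		if (unlikely(tfm == NULL))
			return;		/* accept NULL, like kfree() */
		/* ... tear down algorithm state and free the tfm ... */
	}
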
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 554f224c0445..ded6c63f11ec 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Userland/kernel interface for rpcauth_gss. 4 * Userland/kernel interface for rpcauth_gss.
5 * Code shamelessly plagiarized from fs/nfsd/nfsctl.c 5 * Code shamelessly plagiarized from fs/nfsd/nfsctl.c
6 * and fs/driverfs/inode.c 6 * and fs/sysfs/inode.c
7 * 7 *
8 * Copyright (c) 2002, Trond Myklebust <trond.myklebust@fys.uio.no> 8 * Copyright (c) 2002, Trond Myklebust <trond.myklebust@fys.uio.no>
9 * 9 *
@@ -28,13 +28,13 @@
28#include <linux/workqueue.h> 28#include <linux/workqueue.h>
29#include <linux/sunrpc/rpc_pipe_fs.h> 29#include <linux/sunrpc/rpc_pipe_fs.h>
30 30
31static struct vfsmount *rpc_mount; 31static struct vfsmount *rpc_mount __read_mostly;
32static int rpc_mount_count; 32static int rpc_mount_count;
33 33
34static struct file_system_type rpc_pipe_fs_type; 34static struct file_system_type rpc_pipe_fs_type;
35 35
36 36
37static kmem_cache_t *rpc_inode_cachep; 37static kmem_cache_t *rpc_inode_cachep __read_mostly;
38 38
39#define RPC_UPCALL_TIMEOUT (30*HZ) 39#define RPC_UPCALL_TIMEOUT (30*HZ)
40 40
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 2d9eb7fbd521..f3104035e35d 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -34,10 +34,10 @@ static int rpc_task_id;
34#define RPC_BUFFER_MAXSIZE (2048) 34#define RPC_BUFFER_MAXSIZE (2048)
35#define RPC_BUFFER_POOLSIZE (8) 35#define RPC_BUFFER_POOLSIZE (8)
36#define RPC_TASK_POOLSIZE (8) 36#define RPC_TASK_POOLSIZE (8)
37static kmem_cache_t *rpc_task_slabp; 37static kmem_cache_t *rpc_task_slabp __read_mostly;
38static kmem_cache_t *rpc_buffer_slabp; 38static kmem_cache_t *rpc_buffer_slabp __read_mostly;
39static mempool_t *rpc_task_mempool; 39static mempool_t *rpc_task_mempool __read_mostly;
40static mempool_t *rpc_buffer_mempool; 40static mempool_t *rpc_buffer_mempool __read_mostly;
41 41
42static void __rpc_default_timer(struct rpc_task *task); 42static void __rpc_default_timer(struct rpc_task *task);
43static void rpciod_killall(void); 43static void rpciod_killall(void);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 56db8f13e6cb..05fe2e735538 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -34,7 +34,7 @@
34#include <net/sock.h> 34#include <net/sock.h>
35#include <net/checksum.h> 35#include <net/checksum.h>
36#include <net/ip.h> 36#include <net/ip.h>
37#include <net/tcp.h> 37#include <net/tcp_states.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39#include <asm/ioctls.h> 39#include <asm/ioctls.h>
40 40
@@ -584,13 +584,16 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
584 /* possibly an icmp error */ 584 /* possibly an icmp error */
585 dprintk("svc: recvfrom returned error %d\n", -err); 585 dprintk("svc: recvfrom returned error %d\n", -err);
586 } 586 }
587 if (skb->stamp.tv_sec == 0) { 587 if (skb->tstamp.off_sec == 0) {
588 skb->stamp.tv_sec = xtime.tv_sec; 588 struct timeval tv;
589 skb->stamp.tv_usec = xtime.tv_nsec * 1000; 589
590 tv.tv_sec = xtime.tv_sec;
591 tv.tv_usec = xtime.tv_nsec * 1000;
592 skb_set_timestamp(skb, &tv);
590 /* Don't enable netstamp, sunrpc doesn't 593 /* Don't enable netstamp, sunrpc doesn't
591 need that much accuracy */ 594 need that much accuracy */
592 } 595 }
593 svsk->sk_sk->sk_stamp = skb->stamp; 596 skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp);
594 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ 597 set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
595 598
596 /* 599 /*
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 8a4d9c106af1..fde16f40a581 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -993,6 +993,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
993 return -EINVAL; 993 return -EINVAL;
994 } else { 994 } else {
995 if (xdr_decode_word(buf, base, &desc->array_len) != 0 || 995 if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
996 desc->array_len > desc->array_maxlen ||
996 (unsigned long) base + 4 + desc->array_len * 997 (unsigned long) base + 4 + desc->array_len *
997 desc->elem_size > buf->len) 998 desc->elem_size > buf->len)
998 return -EINVAL; 999 return -EINVAL;
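The added array_maxlen test is not redundant with the existing length check: it also defuses an integer overflow. Worked example with hypothetical hostile input: desc->elem_size = 4 and desc->array_len = 0x40000000 make the 32-bit product array_len * elem_size wrap to 0, so "base + 4 + 0 > buf->len" never triggers even though the claimed array is 4 GiB. Capping array_len at the decoder-supplied array_maxlen first keeps the product far below the wrap point.
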
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 3f6e31069c54..c5241fcbb966 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -17,17 +17,15 @@
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18 18
19#ifdef CONFIG_INET 19#ifdef CONFIG_INET
20extern struct ctl_table ipv4_table[]; 20#include <net/ip.h>
21#endif 21#endif
22 22
23extern struct ctl_table core_table[];
24
25#ifdef CONFIG_NET 23#ifdef CONFIG_NET
26extern struct ctl_table ether_table[]; 24#include <linux/if_ether.h>
27#endif 25#endif
28 26
29#ifdef CONFIG_TR 27#ifdef CONFIG_TR
30extern struct ctl_table tr_table[]; 28#include <linux/if_tr.h>
31#endif 29#endif
32 30
33struct ctl_table net_table[] = { 31struct ctl_table net_table[] = {
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
new file mode 100644
index 000000000000..5a69733bcdad
--- /dev/null
+++ b/net/unix/Kconfig
@@ -0,0 +1,21 @@
1#
2# Unix Domain Sockets
3#
4
5config UNIX
6 tristate "Unix domain sockets"
7 ---help---
8 If you say Y here, you will include support for Unix domain sockets;
9 sockets are the standard Unix mechanism for establishing and
10 accessing network connections. Many commonly used programs such as
11 the X Window system and syslog use these sockets even if your
12 machine is not connected to any network. Unless you are working on
13 an embedded system or something similar, you therefore definitely
14 want to say Y here.
15
16 To compile this driver as a module, choose M here: the module will be
17 called unix. Note that several important services won't work
18 correctly if you say M here and then neglect to load the module.
19
20 Say Y unless you know what you are doing.
21
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d403e34088ad..41feca3bef86 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -105,7 +105,7 @@
105#include <linux/skbuff.h> 105#include <linux/skbuff.h>
106#include <linux/netdevice.h> 106#include <linux/netdevice.h>
107#include <net/sock.h> 107#include <net/sock.h>
108#include <linux/tcp.h> 108#include <net/tcp_states.h>
109#include <net/af_unix.h> 109#include <net/af_unix.h>
110#include <linux/proc_fs.h> 110#include <linux/proc_fs.h>
111#include <linux/seq_file.h> 111#include <linux/seq_file.h>
@@ -2026,14 +2026,6 @@ static struct net_proto_family unix_family_ops = {
2026 .owner = THIS_MODULE, 2026 .owner = THIS_MODULE,
2027}; 2027};
2028 2028
2029#ifdef CONFIG_SYSCTL
2030extern void unix_sysctl_register(void);
2031extern void unix_sysctl_unregister(void);
2032#else
2033static inline void unix_sysctl_register(void) {}
2034static inline void unix_sysctl_unregister(void) {}
2035#endif
2036
2037static int __init af_unix_init(void) 2029static int __init af_unix_init(void)
2038{ 2030{
2039 int rc = -1; 2031 int rc = -1;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 4bd95c8f5934..6ffc64e1712d 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -76,11 +76,11 @@
76#include <linux/netdevice.h> 76#include <linux/netdevice.h>
77#include <linux/file.h> 77#include <linux/file.h>
78#include <linux/proc_fs.h> 78#include <linux/proc_fs.h>
79#include <linux/tcp.h>
80 79
81#include <net/sock.h> 80#include <net/sock.h>
82#include <net/af_unix.h> 81#include <net/af_unix.h>
83#include <net/scm.h> 82#include <net/scm.h>
83#include <net/tcp_states.h>
84 84
85/* Internal data structures and random procedures: */ 85/* Internal data structures and random procedures: */
86 86
@@ -286,16 +286,16 @@ void unix_gc(void)
286 skb = skb_peek(&s->sk_receive_queue); 286 skb = skb_peek(&s->sk_receive_queue);
287 while (skb && 287 while (skb &&
288 skb != (struct sk_buff *)&s->sk_receive_queue) { 288 skb != (struct sk_buff *)&s->sk_receive_queue) {
289 nextsk=skb->next; 289 nextsk = skb->next;
290 /* 290 /*
291 * Do we have file descriptors ? 291 * Do we have file descriptors ?
292 */ 292 */
293 if(UNIXCB(skb).fp) 293 if (UNIXCB(skb).fp) {
294 { 294 __skb_unlink(skb,
295 __skb_unlink(skb, skb->list); 295 &s->sk_receive_queue);
296 __skb_queue_tail(&hitlist,skb); 296 __skb_queue_tail(&hitlist, skb);
297 } 297 }
298 skb=nextsk; 298 skb = nextsk;
299 } 299 }
300 spin_unlock(&s->sk_receive_queue.lock); 300 spin_unlock(&s->sk_receive_queue.lock);
301 } 301 }
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
index c974dac4580a..690ffa5d5bfb 100644
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -12,7 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14 14
15extern int sysctl_unix_max_dgram_qlen; 15#include <net/af_unix.h>
16 16
17static ctl_table unix_table[] = { 17static ctl_table unix_table[] = {
18 { 18 {
diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig
new file mode 100644
index 000000000000..1debe1cb054e
--- /dev/null
+++ b/net/wanrouter/Kconfig
@@ -0,0 +1,29 @@
1#
2# Configuration for WAN router
3#
4
5config WAN_ROUTER
6 tristate "WAN router"
7 depends on EXPERIMENTAL
8 ---help---
9 Wide Area Networks (WANs), such as X.25, frame relay and leased
10 lines, are used to interconnect Local Area Networks (LANs) over vast
11 distances with data transfer rates significantly higher than those
12 achievable with commonly used asynchronous modem connections.
13 Usually, a quite expensive external device called a `WAN router' is
14 needed to connect to a WAN.
15
16 As an alternative, WAN routing can be built into the Linux kernel.
17 With relatively inexpensive WAN interface cards available on the
18 market, a perfectly usable router can be built for less than half
19 the price of an external router. If you have one of those cards and
20 wish to use your Linux box as a WAN router, say Y here and also to
21 the WAN driver for your card, below. You will then need the
22 wan-tools package which is available from <ftp://ftp.sangoma.com/>.
23 Read <file:Documentation/networking/wan-router.txt> for more
24 information.
25
26 To compile WAN routing support as a module, choose M here: the
27 module will be called wanrouter.
28
29 If unsure, say N.
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index d93b19faaab7..596cb96e5f47 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -57,7 +57,7 @@
57#include <linux/wanpipe.h> 57#include <linux/wanpipe.h>
58#include <linux/if_wanpipe.h> 58#include <linux/if_wanpipe.h>
59#include <linux/pkt_sched.h> 59#include <linux/pkt_sched.h>
60#include <linux/tcp.h> 60#include <net/tcp_states.h>
61#include <linux/if_wanpipe_common.h> 61#include <linux/if_wanpipe_common.h>
62#include <linux/sdla_x25.h> 62#include <linux/sdla_x25.h>
63 63
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index d6844ac226f5..13b650ad22e2 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -358,10 +358,10 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
358 */ 358 */
359 359
360 360
361unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) 361__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
362{ 362{
363 int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */ 363 int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */
364 unsigned short ethertype; 364 __be16 ethertype;
365 365
366 switch (skb->data[cnt]) { 366 switch (skb->data[cnt]) {
367 case NLPID_IP: /* IP datagram */ 367 case NLPID_IP: /* IP datagram */
@@ -379,7 +379,7 @@ unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
379 skb->data[cnt+3], dev->name); 379 skb->data[cnt+3], dev->name);
380 return 0; 380 return 0;
381 } 381 }
382 ethertype = *((unsigned short*)&skb->data[cnt+4]); 382 ethertype = *((__be16*)&skb->data[cnt+4]);
383 cnt += 6; 383 cnt += 6;
384 break; 384 break;
385 385
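Switching wanrouter_type_trans() from unsigned short to __be16 is another sparse change: __be16 is a "bitwise" type that makes byte-order mistakes checkable. A sketch, assuming the linux/types.h definitions of the time:

	#ifdef __CHECKER__
	#define __bitwise __attribute__((bitwise))
	#else
	#define __bitwise
	#endif
	typedef __u16 __bitwise __be16;

	/* sparse now flags a missing byte-order conversion, e.g. */
	if (wanrouter_type_trans(skb, dev) == htons(ETH_P_IP))
		/* ... correct: compare big-endian to big-endian ... */;
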
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
new file mode 100644
index 000000000000..e6759c9660bb
--- /dev/null
+++ b/net/x25/Kconfig
@@ -0,0 +1,36 @@
+#
+# CCITT X.25 Packet Layer
+#
+
+config X25
+	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  X.25 is a set of standardized network protocols, similar in scope to
+	  frame relay; the one physical line from your box to the X.25 network
+	  entry point can carry several logical point-to-point connections
+	  (called "virtual circuits") to other computers connected to the X.25
+	  network. Governments, banks, and other organizations tend to use it
+	  to connect to each other or to form Wide Area Networks (WANs). Many
+	  countries have public X.25 networks. X.25 consists of two
+	  protocols: the higher level Packet Layer Protocol (PLP) (say Y here
+	  if you want that) and the lower level data link layer protocol LAPB
+	  (say Y to "LAPB Data Link Driver" below if you want that).
+
+	  You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
+	  <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
+	  Information about X.25 for Linux is contained in the files
+	  <file:Documentation/networking/x25.txt> and
+	  <file:Documentation/networking/x25-iface.txt>.
+
+	  One connects to an X.25 network either with a dedicated network card
+	  using the X.21 protocol (not yet supported by Linux) or one can do
+	  X.25 over a standard telephone line using an ordinary modem (say Y
+	  to "X.25 async driver" below) or over Ethernet using an ordinary
+	  Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
+	  Driver" and "LAPB over Ethernet driver" below).
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called x25. If unsure, say N.
+
+
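
As a worked example of the layering the help text above describes, X.25 over Ethernet would be selected with a .config fragment along these lines; the LAPB symbol names are taken from the "LAPB Data Link Driver" and "LAPB over Ethernet driver" options it references:

CONFIG_X25=m
CONFIG_LAPB=m
CONFIG_LAPBETHER=m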
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 04bec047fa9a..020d73cc8414 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -47,7 +47,7 @@
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <linux/fcntl.h>
 #include <linux/termios.h>	/* For TIOCINQ/OUTQ */
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index 36fc3bf6d882..adfe7b8df355 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -81,7 +81,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb)
 }
 
 int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,
-			   struct packet_type *ptype)
+			   struct packet_type *ptype, struct net_device *orig_dev)
 {
 	struct sk_buff *nskb;
 	struct x25_neigh *nb;
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index b0197c70a9fc..26146874b839 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -28,7 +28,7 @@
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/x25.h>
 
 static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 7fd872ad0c20..8be9b8fbc24d 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -27,7 +27,7 @@
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/x25.h>
 
 /*
@@ -80,7 +80,7 @@ void x25_requeue_frames(struct sock *sk)
 		if (!skb_prev)
 			skb_queue_head(&sk->sk_write_queue, skb);
 		else
-			skb_append(skb_prev, skb);
+			skb_append(skb_prev, skb, &sk->sk_write_queue);
 		skb_prev = skb;
 	}
 }
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
index d6a21a3ad80e..0a92e1da3922 100644
--- a/net/x25/x25_timer.c
+++ b/net/x25/x25_timer.c
@@ -23,7 +23,7 @@
 #include <linux/jiffies.h>
 #include <linux/timer.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/x25.h>
 
 static void x25_heartbeat_expiry(unsigned long);
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 58ca6a972c48..0c1c04322baf 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -1,6 +1,10 @@
 #
 # XFRM configuration
 #
+config XFRM
+	bool
+	depends on NET
+
 config XFRM_USER
 	tristate "IPsec user configuration interface"
 	depends on INET && XFRM
@@ -10,3 +14,14 @@ config XFRM_USER
 
 	  If unsure, say Y.
 
+config NET_KEY
+	tristate "PF_KEY sockets"
+	select XFRM
+	---help---
+	  PF_KEYv2 socket family, compatible to KAME ones.
+	  They are required if you are going to use IPsec tools ported
+	  from KAME.
+
+	  Say Y unless you know what you are doing.
+
+
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index c58a6f05a0b6..2407a7072327 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -12,7 +12,7 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
-static kmem_cache_t *secpath_cachep;
+static kmem_cache_t *secpath_cachep __read_mostly;
 
 void __secpath_destroy(struct sec_path *sp)
 {
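
Note: the __read_mostly annotation added here and in xfrm_policy.c below moves rarely-written globals into a dedicated linker section so they do not share cache lines with write-hot data. A rough sketch of the mechanism, assuming the GCC section attribute the kernel's per-arch definition is built on (the example variable is hypothetical):

/* Sketch of what __read_mostly expands to (see the kernel's per-arch
 * cache.h for the real definition): a section attribute groups
 * annotated objects together, away from frequently-written data. */
#define __read_mostly __attribute__((__section__(".data.read_mostly")))

static int lookup_table_size __read_mostly = 1024;	/* written once at init */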
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d65ed8684fc1..83c8135e1764 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -37,7 +37,7 @@ EXPORT_SYMBOL(xfrm_policy_list);
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 
-static kmem_cache_t *xfrm_dst_cache;
+static kmem_cache_t *xfrm_dst_cache __read_mostly;
 
 static struct work_struct xfrm_policy_gc_work;
 static struct list_head xfrm_policy_gc_list =
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ecade4893a13..c35336a0f71b 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1125,9 +1125,8 @@ static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
 	if (build_expire(skb, x, c->data.hard) < 0)
 		BUG();
 
-	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
-
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
 }
 
 static int xfrm_notify_sa_flush(struct km_event *c)
@@ -1152,7 +1151,8 @@ static int xfrm_notify_sa_flush(struct km_event *c)
 
 	nlh->nlmsg_len = skb->tail - b;
 
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_SA;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
 
 nlmsg_failure:
 	kfree_skb(skb);
@@ -1226,7 +1226,8 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
 
 	nlh->nlmsg_len = skb->tail - b;
 
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_SA;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
 
 nlmsg_failure:
 rtattr_failure:
@@ -1304,9 +1305,8 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
 	if (build_acquire(skb, x, xt, xp, dir) < 0)
 		BUG();
 
-	NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE;
-
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_ACQUIRE;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC);
 }
 
 /* User gives us xfrm_user_policy_info followed by an array of 0
@@ -1350,6 +1350,9 @@ static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
 	if (nr > XFRM_MAX_DEPTH)
 		return NULL;
 
+	if (p->dir > XFRM_POLICY_OUT)
+		return NULL;
+
 	xp = xfrm_policy_alloc(GFP_KERNEL);
 	if (xp == NULL) {
 		*dir = -ENOBUFS;
@@ -1402,9 +1405,8 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
 	if (build_polexpire(skb, xp, dir, c->data.hard) < 0)
 		BUG();
 
-	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
-
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
 }
 
 static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
@@ -1452,7 +1454,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *
 
 	nlh->nlmsg_len = skb->tail - b;
 
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
 
 nlmsg_failure:
 rtattr_failure:
@@ -1477,7 +1480,8 @@ static int xfrm_notify_policy_flush(struct km_event *c)
 
 	nlh->nlmsg_len = skb->tail - b;
 
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
 
 nlmsg_failure:
 	kfree_skb(skb);
@@ -1516,7 +1520,8 @@ static int __init xfrm_user_init(void)
 {
 	printk(KERN_INFO "Initializing IPsec netlink socket\n");
 
-	xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv);
+	xfrm_nl = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX,
+	                                xfrm_netlink_rcv, THIS_MODULE);
 	if (xfrm_nl == NULL)
 		return -ENOMEM;
 
@@ -1534,3 +1539,4 @@ static void __exit xfrm_user_exit(void)
 module_init(xfrm_user_init);
 module_exit(xfrm_user_exit);
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
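
Note: the XFRMGRP_* to XFRMNLGRP_* conversion throughout this file is part of the move from a 32-bit multicast-group bitmask to numbered netlink groups, which is also why netlink_kernel_create() now takes XFRMNLGRP_MAX and THIS_MODULE. Userspace joins groups one at a time via NETLINK_ADD_MEMBERSHIP instead of OR-ing bits into nl_groups. A hedged sketch of a listener for SA expire events, for illustration only:

/* Sketch: subscribe to XFRM expire notifications using numbered
 * netlink groups, as required once groups are no longer limited to
 * a 32-bit mask.  Most error handling trimmed for brevity. */
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/xfrm.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct sockaddr_nl snl;
	int grp = XFRMNLGRP_EXPIRE;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);

	if (fd < 0)
		return 1;

	memset(&snl, 0, sizeof(snl));
	snl.nl_family = AF_NETLINK;
	if (bind(fd, (struct sockaddr *)&snl, sizeof(snl)) < 0)
		return 1;

	/* One setsockopt() call per group, instead of OR-ing XFRMGRP_*
	 * bits into snl.nl_groups before bind(). */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
		       &grp, sizeof(grp)) < 0)
		return 1;

	printf("listening for XFRM expire events on fd %d\n", fd);
	/* A recvmsg() loop would follow here. */
	return 0;
}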