aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@g5.osdl.org>2006-01-04 19:27:41 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-04 19:27:41 -0500
commitd347da0deffa1d8f88f0d270eab040e4707c9916 (patch)
treee0911f2ef4d36a7b44f7a5379feabebbd37dcfc4
parentc6c88bbde4d8b2ffe9886b7130b2e23781d424e5 (diff)
parent74cb8798222bb7d1aecb0acb91e6eeedf5feb948 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
-rw-r--r--Documentation/networking/ip-sysctl.txt23
-rw-r--r--drivers/char/random.c10
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c2
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c2
-rw-r--r--drivers/net/ns83820.c1
-rw-r--r--drivers/net/pppoe.c31
-rw-r--r--drivers/net/pppox.c10
-rw-r--r--drivers/net/sk98lin/skge.c1
-rw-r--r--drivers/net/skge.c1
-rw-r--r--drivers/net/tg3.c3
-rw-r--r--drivers/net/wireless/ipw2200.c15
-rw-r--r--fs/9p/trans_sock.c1
-rw-r--r--fs/nfs/callback.c3
-rw-r--r--include/asm-alpha/bitops.h1
-rw-r--r--include/asm-arm/bitops.h2
-rw-r--r--include/asm-arm26/bitops.h1
-rw-r--r--include/asm-cris/bitops.h1
-rw-r--r--include/asm-frv/bitops.h1
-rw-r--r--include/asm-generic/bitops.h1
-rw-r--r--include/asm-h8300/bitops.h1
-rw-r--r--include/asm-i386/bitops.h1
-rw-r--r--include/asm-ia64/bitops.h1
-rw-r--r--include/asm-m32r/bitops.h1
-rw-r--r--include/asm-m68k/bitops.h1
-rw-r--r--include/asm-m68knommu/bitops.h1
-rw-r--r--include/asm-mips/bitops.h2
-rw-r--r--include/asm-parisc/bitops.h1
-rw-r--r--include/asm-powerpc/bitops.h1
-rw-r--r--include/asm-s390/bitops.h1
-rw-r--r--include/asm-sh/bitops.h1
-rw-r--r--include/asm-sh64/bitops.h1
-rw-r--r--include/asm-sparc/bitops.h1
-rw-r--r--include/asm-sparc64/bitops.h1
-rw-r--r--include/asm-v850/bitops.h1
-rw-r--r--include/asm-x86_64/bitops.h27
-rw-r--r--include/asm-xtensa/bitops.h1
-rw-r--r--include/linux/bitops.h9
-rw-r--r--include/linux/dccp.h7
-rw-r--r--include/linux/etherdevice.h3
-rw-r--r--include/linux/if_pppox.h3
-rw-r--r--include/linux/ip.h121
-rw-r--r--include/linux/ipv6.h79
-rw-r--r--include/linux/net.h4
-rw-r--r--include/linux/pfkeyv2.h13
-rw-r--r--include/linux/pkt_sched.h7
-rw-r--r--include/linux/random.h6
-rw-r--r--include/linux/security.h132
-rw-r--r--include/linux/skbuff.h5
-rw-r--r--include/linux/socket.h1
-rw-r--r--include/linux/sysctl.h1
-rw-r--r--include/linux/tcp.h21
-rw-r--r--include/linux/udp.h6
-rw-r--r--include/linux/xfrm.h29
-rw-r--r--include/net/af_unix.h12
-rw-r--r--include/net/atmclip.h2
-rw-r--r--include/net/dst.h1
-rw-r--r--include/net/flow.h7
-rw-r--r--include/net/genetlink.h2
-rw-r--r--include/net/icmp.h9
-rw-r--r--include/net/ieee80211_crypt.h9
-rw-r--r--include/net/inet6_connection_sock.h42
-rw-r--r--include/net/inet6_hashtables.h32
-rw-r--r--include/net/inet_common.h4
-rw-r--r--include/net/inet_connection_sock.h45
-rw-r--r--include/net/inet_ecn.h2
-rw-r--r--include/net/inet_hashtables.h24
-rw-r--r--include/net/inet_sock.h193
-rw-r--r--include/net/inet_timewait_sock.h8
-rw-r--r--include/net/inetpeer.h1
-rw-r--r--include/net/ip.h19
-rw-r--r--include/net/ip_fib.h2
-rw-r--r--include/net/ip_vs.h12
-rw-r--r--include/net/ipv6.h12
-rw-r--r--include/net/ndisc.h17
-rw-r--r--include/net/neighbour.h2
-rw-r--r--include/net/pkt_act.h1
-rw-r--r--include/net/protocol.h3
-rw-r--r--include/net/raw.h2
-rw-r--r--include/net/request_sock.h2
-rw-r--r--include/net/sctp/structs.h76
-rw-r--r--include/net/sctp/user.h30
-rw-r--r--include/net/sock.h32
-rw-r--r--include/net/tcp.h246
-rw-r--r--include/net/tcp_states.h16
-rw-r--r--include/net/timewait_sock.h31
-rw-r--r--include/net/transp_v6.h2
-rw-r--r--include/net/udp.h4
-rw-r--r--include/net/xfrm.h30
-rw-r--r--init/main.c4
-rw-r--r--net/appletalk/ddp.c23
-rw-r--r--net/atm/pvc.c2
-rw-r--r--net/atm/svc.c2
-rw-r--r--net/ax25/af_ax25.c6
-rw-r--r--net/bluetooth/af_bluetooth.c5
-rw-r--r--net/bluetooth/bnep/sock.c2
-rw-r--r--net/bluetooth/cmtp/sock.c2
-rw-r--r--net/bluetooth/hci_sock.c2
-rw-r--r--net/bluetooth/hidp/sock.c2
-rw-r--r--net/bluetooth/l2cap.c9
-rw-r--r--net/bluetooth/rfcomm/sock.c4
-rw-r--r--net/bluetooth/sco.c9
-rw-r--r--net/bridge/br.c1
-rw-r--r--net/bridge/br_device.c91
-rw-r--r--net/bridge/br_if.c55
-rw-r--r--net/bridge/br_input.c11
-rw-r--r--net/bridge/br_netfilter.c4
-rw-r--r--net/bridge/br_notify.c14
-rw-r--r--net/bridge/br_private.h7
-rw-r--r--net/bridge/br_stp_if.c5
-rw-r--r--net/bridge/netfilter/Kconfig6
-rw-r--r--net/bridge/netfilter/ebt_log.c73
-rw-r--r--net/bridge/netfilter/ebt_ulog.c53
-rw-r--r--net/core/datagram.c36
-rw-r--r--net/core/dev.c1
-rw-r--r--net/core/filter.c112
-rw-r--r--net/core/flow.c8
-rw-r--r--net/core/netpoll.c1
-rw-r--r--net/core/pktgen.c6
-rw-r--r--net/core/skbuff.c27
-rw-r--r--net/core/sock.c21
-rw-r--r--net/core/stream.c10
-rw-r--r--net/dccp/Makefile4
-rw-r--r--net/dccp/ackvec.c33
-rw-r--r--net/dccp/ackvec.h12
-rw-r--r--net/dccp/ccid.h2
-rw-r--r--net/dccp/dccp.h24
-rw-r--r--net/dccp/diag.c2
-rw-r--r--net/dccp/input.c79
-rw-r--r--net/dccp/ipv4.c305
-rw-r--r--net/dccp/ipv6.c1261
-rw-r--r--net/dccp/ipv6.h37
-rw-r--r--net/dccp/minisocks.c23
-rw-r--r--net/dccp/output.c47
-rw-r--r--net/dccp/proto.c56
-rw-r--r--net/decnet/af_decnet.c6
-rw-r--r--net/decnet/dn_neigh.c13
-rw-r--r--net/decnet/dn_nsp_in.c17
-rw-r--r--net/econet/af_econet.c9
-rw-r--r--net/ieee80211/ieee80211_rx.c5
-rw-r--r--net/ipv4/Kconfig8
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c19
-rw-r--r--net/ipv4/ah4.c1
-rw-r--r--net/ipv4/arp.c1
-rw-r--r--net/ipv4/devinet.c1
-rw-r--r--net/ipv4/esp4.c1
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/fib_hash.c1
-rw-r--r--net/ipv4/fib_rules.c1
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fib_trie.c8
-rw-r--r--net/ipv4/icmp.c1
-rw-r--r--net/ipv4/igmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c25
-rw-r--r--net/ipv4/inet_diag.c14
-rw-r--r--net/ipv4/inet_hashtables.c178
-rw-r--r--net/ipv4/inet_timewait_sock.c5
-rw-r--r--net/ipv4/inetpeer.c1
-rw-r--r--net/ipv4/ip_fragment.c68
-rw-r--r--net/ipv4/ip_input.c1
-rw-r--r--net/ipv4/ip_options.c1
-rw-r--r--net/ipv4/ip_output.c1
-rw-r--r--net/ipv4/ip_sockglue.c14
-rw-r--r--net/ipv4/ipcomp.c1
-rw-r--r--net/ipv4/ipconfig.c2
-rw-r--r--net/ipv4/ipmr.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c28
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c21
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c10
-rw-r--r--net/ipv4/ipvs/ip_vs_dh.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_est.c3
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c29
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c29
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_ah.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_esp.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c24
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_udp.c3
-rw-r--r--net/ipv4/ipvs/ip_vs_sh.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c2
-rw-r--r--net/ipv4/netfilter/arp_tables.c175
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c2
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_gre.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c1
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c199
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c2
-rw-r--r--net/ipv4/netfilter/ipt_physdev.c1
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c10
-rw-r--r--net/ipv4/tcp_bic.c85
-rw-r--r--net/ipv4/tcp_cong.c28
-rw-r--r--net/ipv4/tcp_cubic.c411
-rw-r--r--net/ipv4/tcp_input.c99
-rw-r--r--net/ipv4/tcp_ipv4.c269
-rw-r--r--net/ipv4/tcp_minisocks.c16
-rw-r--r--net/ipv4/tcp_output.c118
-rw-r--r--net/ipv4/tcp_vegas.c4
-rw-r--r--net/ipv4/udp.c22
-rw-r--r--net/ipv6/Makefile3
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--net/ipv6/af_inet6.c90
-rw-r--r--net/ipv6/ah6.c1
-rw-r--r--net/ipv6/esp6.c1
-rw-r--r--net/ipv6/exthdrs.c4
-rw-r--r--net/ipv6/inet6_connection_sock.c199
-rw-r--r--net/ipv6/inet6_hashtables.c183
-rw-r--r--net/ipv6/ip6_flowlabel.c2
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/ipcomp6.c1
-rw-r--r--net/ipv6/ipv6_sockglue.c24
-rw-r--r--net/ipv6/mcast.c2
-rw-r--r--net/ipv6/netfilter/ip6_tables.c191
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c1
-rw-r--r--net/ipv6/netfilter/ip6t_ah.c1
-rw-r--r--net/ipv6/netfilter/ip6t_esp.c1
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c4
-rw-r--r--net/ipv6/raw.c16
-rw-r--r--net/ipv6/tcp_ipv6.c639
-rw-r--r--net/ipv6/udp.c16
-rw-r--r--net/ipx/af_ipx.c6
-rw-r--r--net/irda/af_irda.c23
-rw-r--r--net/key/af_key.c201
-rw-r--r--net/llc/af_llc.c11
-rw-r--r--net/netfilter/nfnetlink_log.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c2
-rw-r--r--net/netlink/af_netlink.c4
-rw-r--r--net/netlink/genetlink.c2
-rw-r--r--net/netrom/af_netrom.c16
-rw-r--r--net/nonet.c5
-rw-r--r--net/packet/af_packet.c10
-rw-r--r--net/rose/af_rose.c2
-rw-r--r--net/sched/sch_netem.c49
-rw-r--r--net/sched/sch_teql.c1
-rw-r--r--net/sctp/associola.c81
-rw-r--r--net/sctp/input.c36
-rw-r--r--net/sctp/ipv6.c2
-rw-r--r--net/sctp/output.c17
-rw-r--r--net/sctp/protocol.c3
-rw-r--r--net/sctp/sm_sideeffect.c26
-rw-r--r--net/sctp/sm_statefuns.c20
-rw-r--r--net/sctp/socket.c691
-rw-r--r--net/sctp/transport.c32
-rw-r--r--net/socket.c243
-rw-r--r--net/sunrpc/svcsock.c2
-rw-r--r--net/unix/af_unix.c55
-rw-r--r--net/unix/garbage.c4
-rw-r--r--net/wanrouter/af_wanpipe.c6
-rw-r--r--net/x25/af_x25.c6
-rw-r--r--net/xfrm/xfrm_policy.c88
-rw-r--r--net/xfrm/xfrm_state.c9
-rw-r--r--net/xfrm/xfrm_user.c148
-rw-r--r--security/Kconfig13
-rw-r--r--security/dummy.c45
-rw-r--r--security/selinux/Makefile2
-rw-r--r--security/selinux/hooks.c39
-rw-r--r--security/selinux/include/av_perm_to_string.h2
-rw-r--r--security/selinux/include/av_permissions.h2
-rw-r--r--security/selinux/include/xfrm.h54
-rw-r--r--security/selinux/xfrm.c311
263 files changed, 6860 insertions, 2857 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebc09a159f62..2b7cf19a06ad 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER
46 for the hash secret) for IP fragments. 46 for the hash secret) for IP fragments.
47 Default: 600 47 Default: 600
48 48
49ipfrag_max_dist - INTEGER
50 ipfrag_max_dist is a non-negative integer value which defines the
51 maximum "disorder" which is allowed among fragments which share a
52 common IP source address. Note that reordering of packets is
53 not unusual, but if a large number of fragments arrive from a source
54 IP address while a particular fragment queue remains incomplete, it
55 probably indicates that one or more fragments belonging to that queue
56 have been lost. When ipfrag_max_dist is positive, an additional check
57 is done on fragments before they are added to a reassembly queue - if
58 ipfrag_max_dist (or more) fragments have arrived from a particular IP
59 address between additions to any IP fragment queue using that source
60 address, it's presumed that one or more fragments in the queue are
61 lost. The existing fragment queue will be dropped, and a new one
62 started. An ipfrag_max_dist value of zero disables this check.
63
64 Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
65 result in unnecessarily dropping fragment queues when normal
66 reordering of packets occurs, which could lead to poor application
67 performance. Using a very large value, e.g. 50000, increases the
68 likelihood of incorrectly reassembling IP fragments that originate
69 from different IP datagrams, which could result in data corruption.
70 Default: 64
71
49INET peer storage: 72INET peer storage:
50 73
51inet_peer_threshold - INTEGER 74inet_peer_threshold - INTEGER
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 7999da25fe40..bdfdfd28594d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1554,10 +1554,8 @@ __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
1554 1554
1555EXPORT_SYMBOL(secure_tcp_sequence_number); 1555EXPORT_SYMBOL(secure_tcp_sequence_number);
1556 1556
1557 1557/* Generate secure starting point for ephemeral IPV4 transport port search */
1558 1558u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
1559/* Generate secure starting point for ephemeral TCP port search */
1560u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
1561{ 1559{
1562 struct keydata *keyptr = get_keyptr(); 1560 struct keydata *keyptr = get_keyptr();
1563 u32 hash[4]; 1561 u32 hash[4];
@@ -1575,7 +1573,7 @@ u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
1575} 1573}
1576 1574
1577#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1575#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1578u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport) 1576u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport)
1579{ 1577{
1580 struct keydata *keyptr = get_keyptr(); 1578 struct keydata *keyptr = get_keyptr();
1581 u32 hash[12]; 1579 u32 hash[12];
@@ -1586,7 +1584,7 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp
1586 1584
1587 return twothirdsMD4Transform(daddr, hash); 1585 return twothirdsMD4Transform(daddr, hash);
1588} 1586}
1589EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); 1587EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
1590#endif 1588#endif
1591 1589
1592#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) 1590#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 475d98fa9e26..780009c7eaa6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -47,6 +47,8 @@
47#include <linux/ip.h> 47#include <linux/ip.h>
48#include <linux/in.h> 48#include <linux/in.h>
49 49
50#include <net/dst.h>
51
50MODULE_AUTHOR("Roland Dreier"); 52MODULE_AUTHOR("Roland Dreier");
51MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); 53MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
52MODULE_LICENSE("Dual BSD/GPL"); 54MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index ef3ee035bbc8..ed0c2ead8bc1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -43,6 +43,8 @@
43#include <linux/delay.h> 43#include <linux/delay.h>
44#include <linux/completion.h> 44#include <linux/completion.h>
45 45
46#include <net/dst.h>
47
46#include "ipoib.h" 48#include "ipoib.h"
47 49
48#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 50#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index f857ae94d261..b0c3b6ab6263 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -115,6 +115,7 @@
115#include <linux/ethtool.h> 115#include <linux/ethtool.h>
116#include <linux/timer.h> 116#include <linux/timer.h>
117#include <linux/if_vlan.h> 117#include <linux/if_vlan.h>
118#include <linux/rtnetlink.h>
118 119
119#include <asm/io.h> 120#include <asm/io.h>
120#include <asm/uaccess.h> 121#include <asm/uaccess.h>
diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index a842ecc60a34..9369f811075d 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -85,7 +85,7 @@ static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
85static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb); 85static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb);
86static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb); 86static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb);
87 87
88static struct proto_ops pppoe_ops; 88static const struct proto_ops pppoe_ops;
89static DEFINE_RWLOCK(pppoe_hash_lock); 89static DEFINE_RWLOCK(pppoe_hash_lock);
90 90
91static struct ppp_channel_ops pppoe_chan_ops; 91static struct ppp_channel_ops pppoe_chan_ops;
@@ -383,8 +383,6 @@ static int pppoe_rcv(struct sk_buff *skb,
383{ 383{
384 struct pppoe_hdr *ph; 384 struct pppoe_hdr *ph;
385 struct pppox_sock *po; 385 struct pppox_sock *po;
386 struct sock *sk;
387 int ret;
388 386
389 if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) 387 if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr)))
390 goto drop; 388 goto drop;
@@ -395,24 +393,8 @@ static int pppoe_rcv(struct sk_buff *skb,
395 ph = (struct pppoe_hdr *) skb->nh.raw; 393 ph = (struct pppoe_hdr *) skb->nh.raw;
396 394
397 po = get_item((unsigned long) ph->sid, eth_hdr(skb)->h_source); 395 po = get_item((unsigned long) ph->sid, eth_hdr(skb)->h_source);
398 if (!po) 396 if (po != NULL)
399 goto drop; 397 return sk_receive_skb(sk_pppox(po), skb);
400
401 sk = sk_pppox(po);
402 bh_lock_sock(sk);
403
404 /* Socket state is unknown, must put skb into backlog. */
405 if (sock_owned_by_user(sk) != 0) {
406 sk_add_backlog(sk, skb);
407 ret = NET_RX_SUCCESS;
408 } else {
409 ret = pppoe_rcv_core(sk, skb);
410 }
411
412 bh_unlock_sock(sk);
413 sock_put(sk);
414
415 return ret;
416drop: 398drop:
417 kfree_skb(skb); 399 kfree_skb(skb);
418out: 400out:
@@ -1081,9 +1063,7 @@ static int __init pppoe_proc_init(void)
1081static inline int pppoe_proc_init(void) { return 0; } 1063static inline int pppoe_proc_init(void) { return 0; }
1082#endif /* CONFIG_PROC_FS */ 1064#endif /* CONFIG_PROC_FS */
1083 1065
1084/* ->ioctl are set at pppox_create */ 1066static const struct proto_ops pppoe_ops = {
1085
1086static struct proto_ops pppoe_ops = {
1087 .family = AF_PPPOX, 1067 .family = AF_PPPOX,
1088 .owner = THIS_MODULE, 1068 .owner = THIS_MODULE,
1089 .release = pppoe_release, 1069 .release = pppoe_release,
@@ -1099,7 +1079,8 @@ static struct proto_ops pppoe_ops = {
1099 .getsockopt = sock_no_getsockopt, 1079 .getsockopt = sock_no_getsockopt,
1100 .sendmsg = pppoe_sendmsg, 1080 .sendmsg = pppoe_sendmsg,
1101 .recvmsg = pppoe_recvmsg, 1081 .recvmsg = pppoe_recvmsg,
1102 .mmap = sock_no_mmap 1082 .mmap = sock_no_mmap,
1083 .ioctl = pppox_ioctl,
1103}; 1084};
1104 1085
1105static struct pppox_proto pppoe_proto = { 1086static struct pppox_proto pppoe_proto = {
diff --git a/drivers/net/pppox.c b/drivers/net/pppox.c
index 0c1e114527fb..9315046b3f55 100644
--- a/drivers/net/pppox.c
+++ b/drivers/net/pppox.c
@@ -68,8 +68,7 @@ EXPORT_SYMBOL(register_pppox_proto);
68EXPORT_SYMBOL(unregister_pppox_proto); 68EXPORT_SYMBOL(unregister_pppox_proto);
69EXPORT_SYMBOL(pppox_unbind_sock); 69EXPORT_SYMBOL(pppox_unbind_sock);
70 70
71static int pppox_ioctl(struct socket* sock, unsigned int cmd, 71int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
72 unsigned long arg)
73{ 72{
74 struct sock *sk = sock->sk; 73 struct sock *sk = sock->sk;
75 struct pppox_sock *po = pppox_sk(sk); 74 struct pppox_sock *po = pppox_sk(sk);
@@ -105,6 +104,7 @@ static int pppox_ioctl(struct socket* sock, unsigned int cmd,
105 return rc; 104 return rc;
106} 105}
107 106
107EXPORT_SYMBOL(pppox_ioctl);
108 108
109static int pppox_create(struct socket *sock, int protocol) 109static int pppox_create(struct socket *sock, int protocol)
110{ 110{
@@ -119,11 +119,7 @@ static int pppox_create(struct socket *sock, int protocol)
119 goto out; 119 goto out;
120 120
121 rc = pppox_protos[protocol]->create(sock); 121 rc = pppox_protos[protocol]->create(sock);
122 if (!rc) { 122
123 /* We get to set the ioctl handler. */
124 /* For everything else, pppox is just a shell. */
125 sock->ops->ioctl = pppox_ioctl;
126 }
127 module_put(pppox_protos[protocol]->owner); 123 module_put(pppox_protos[protocol]->owner);
128out: 124out:
129 return rc; 125 return rc;
diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c
index ae7343934758..e1a2d52cc1fe 100644
--- a/drivers/net/sk98lin/skge.c
+++ b/drivers/net/sk98lin/skge.c
@@ -107,6 +107,7 @@
107 107
108#include "h/skversion.h" 108#include "h/skversion.h"
109 109
110#include <linux/in.h>
110#include <linux/module.h> 111#include <linux/module.h>
111#include <linux/moduleparam.h> 112#include <linux/moduleparam.h>
112#include <linux/init.h> 113#include <linux/init.h>
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 00d683063c01..d8cc3aea032a 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/config.h> 27#include <linux/config.h>
28#include <linux/in.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/moduleparam.h> 31#include <linux/moduleparam.h>
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 2fc9893d69e1..eb86b059809b 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -24,6 +24,7 @@
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/delay.h> 26#include <linux/delay.h>
27#include <linux/in.h>
27#include <linux/init.h> 28#include <linux/init.h>
28#include <linux/ioport.h> 29#include <linux/ioport.h>
29#include <linux/pci.h> 30#include <linux/pci.h>
@@ -3650,7 +3651,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
3650 TXD_FLAG_CPU_POST_DMA); 3651 TXD_FLAG_CPU_POST_DMA);
3651 3652
3652 skb->nh.iph->check = 0; 3653 skb->nh.iph->check = 0;
3653 skb->nh.iph->tot_len = ntohs(mss + ip_tcp_len + tcp_opt_len); 3654 skb->nh.iph->tot_len = htons(mss + ip_tcp_len + tcp_opt_len);
3654 if (tp->tg3_flags2 & TG3_FLG2_HW_TSO) { 3655 if (tp->tg3_flags2 & TG3_FLG2_HW_TSO) {
3655 skb->h.th->check = 0; 3656 skb->h.th->check = 0;
3656 base_flags &= ~TXD_FLAG_TCPUDP_CSUM; 3657 base_flags &= ~TXD_FLAG_TCPUDP_CSUM;
diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index 5e7c7e944c9d..64f6d1f25753 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -7456,8 +7456,7 @@ static void ipw_handle_data_packet(struct ipw_priv *priv,
7456 /* HW decrypt will not clear the WEP bit, MIC, PN, etc. */ 7456 /* HW decrypt will not clear the WEP bit, MIC, PN, etc. */
7457 hdr = (struct ieee80211_hdr_4addr *)rxb->skb->data; 7457 hdr = (struct ieee80211_hdr_4addr *)rxb->skb->data;
7458 if (priv->ieee->iw_mode != IW_MODE_MONITOR && 7458 if (priv->ieee->iw_mode != IW_MODE_MONITOR &&
7459 ((is_multicast_ether_addr(hdr->addr1) || 7459 (is_multicast_ether_addr(hdr->addr1) ?
7460 is_broadcast_ether_addr(hdr->addr1)) ?
7461 !priv->ieee->host_mc_decrypt : !priv->ieee->host_decrypt)) 7460 !priv->ieee->host_mc_decrypt : !priv->ieee->host_decrypt))
7462 ipw_rebuild_decrypted_skb(priv, rxb->skb); 7461 ipw_rebuild_decrypted_skb(priv, rxb->skb);
7463 7462
@@ -7648,8 +7647,7 @@ static inline int is_network_packet(struct ipw_priv *priv,
7648 return 0; 7647 return 0;
7649 7648
7650 /* {broad,multi}cast packets to our BSSID go through */ 7649 /* {broad,multi}cast packets to our BSSID go through */
7651 if (is_multicast_ether_addr(header->addr1) || 7650 if (is_multicast_ether_addr(header->addr1))
7652 is_broadcast_ether_addr(header->addr1))
7653 return !memcmp(header->addr3, priv->bssid, ETH_ALEN); 7651 return !memcmp(header->addr3, priv->bssid, ETH_ALEN);
7654 7652
7655 /* packets to our adapter go through */ 7653 /* packets to our adapter go through */
@@ -7662,8 +7660,7 @@ static inline int is_network_packet(struct ipw_priv *priv,
7662 return 0; 7660 return 0;
7663 7661
7664 /* {broad,multi}cast packets to our BSS go through */ 7662 /* {broad,multi}cast packets to our BSS go through */
7665 if (is_multicast_ether_addr(header->addr1) || 7663 if (is_multicast_ether_addr(header->addr1))
7666 is_broadcast_ether_addr(header->addr1))
7667 return !memcmp(header->addr2, priv->bssid, ETH_ALEN); 7664 return !memcmp(header->addr2, priv->bssid, ETH_ALEN);
7668 7665
7669 /* packets to our adapter go through */ 7666 /* packets to our adapter go through */
@@ -9657,8 +9654,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb,
9657 switch (priv->ieee->iw_mode) { 9654 switch (priv->ieee->iw_mode) {
9658 case IW_MODE_ADHOC: 9655 case IW_MODE_ADHOC:
9659 hdr_len = IEEE80211_3ADDR_LEN; 9656 hdr_len = IEEE80211_3ADDR_LEN;
9660 unicast = !(is_multicast_ether_addr(hdr->addr1) || 9657 unicast = !is_multicast_ether_addr(hdr->addr1);
9661 is_broadcast_ether_addr(hdr->addr1));
9662 id = ipw_find_station(priv, hdr->addr1); 9658 id = ipw_find_station(priv, hdr->addr1);
9663 if (id == IPW_INVALID_STATION) { 9659 if (id == IPW_INVALID_STATION) {
9664 id = ipw_add_station(priv, hdr->addr1); 9660 id = ipw_add_station(priv, hdr->addr1);
@@ -9673,8 +9669,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb,
9673 9669
9674 case IW_MODE_INFRA: 9670 case IW_MODE_INFRA:
9675 default: 9671 default:
9676 unicast = !(is_multicast_ether_addr(hdr->addr3) || 9672 unicast = !is_multicast_ether_addr(hdr->addr3);
9677 is_broadcast_ether_addr(hdr->addr3));
9678 hdr_len = IEEE80211_3ADDR_LEN; 9673 hdr_len = IEEE80211_3ADDR_LEN;
9679 id = 0; 9674 id = 0;
9680 break; 9675 break;
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
index a93c2bf94c33..6a9a75d40f73 100644
--- a/fs/9p/trans_sock.c
+++ b/fs/9p/trans_sock.c
@@ -26,6 +26,7 @@
26 */ 26 */
27 27
28#include <linux/config.h> 28#include <linux/config.h>
29#include <linux/in.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/net.h> 31#include <linux/net.h>
31#include <linux/ipv6.h> 32#include <linux/ipv6.h>
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f2ca782aba33..30cae3602867 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,9 @@
14#include <linux/sunrpc/svc.h> 14#include <linux/sunrpc/svc.h>
15#include <linux/sunrpc/svcsock.h> 15#include <linux/sunrpc/svcsock.h>
16#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
17
18#include <net/inet_sock.h>
19
17#include "nfs4_fs.h" 20#include "nfs4_fs.h"
18#include "callback.h" 21#include "callback.h"
19 22
diff --git a/include/asm-alpha/bitops.h b/include/asm-alpha/bitops.h
index 578ed3f1a607..302201f1a097 100644
--- a/include/asm-alpha/bitops.h
+++ b/include/asm-alpha/bitops.h
@@ -321,6 +321,7 @@ static inline int fls(int word)
321#else 321#else
322#define fls generic_fls 322#define fls generic_fls
323#endif 323#endif
324#define fls64 generic_fls64
324 325
325/* Compute powers of two for the given integer. */ 326/* Compute powers of two for the given integer. */
326static inline long floor_log2(unsigned long word) 327static inline long floor_log2(unsigned long word)
diff --git a/include/asm-arm/bitops.h b/include/asm-arm/bitops.h
index 7399d431edfe..d02de721ecc1 100644
--- a/include/asm-arm/bitops.h
+++ b/include/asm-arm/bitops.h
@@ -332,6 +332,7 @@ static inline unsigned long __ffs(unsigned long word)
332 */ 332 */
333 333
334#define fls(x) generic_fls(x) 334#define fls(x) generic_fls(x)
335#define fls64(x) generic_fls64(x)
335 336
336/* 337/*
337 * ffs: find first bit set. This is defined the same way as 338 * ffs: find first bit set. This is defined the same way as
@@ -351,6 +352,7 @@ static inline unsigned long __ffs(unsigned long word)
351#define fls(x) \ 352#define fls(x) \
352 ( __builtin_constant_p(x) ? generic_fls(x) : \ 353 ( __builtin_constant_p(x) ? generic_fls(x) : \
353 ({ int __r; asm("clz\t%0, %1" : "=r"(__r) : "r"(x) : "cc"); 32-__r; }) ) 354 ({ int __r; asm("clz\t%0, %1" : "=r"(__r) : "r"(x) : "cc"); 32-__r; }) )
355#define fls64(x) generic_fls64(x)
354#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); }) 356#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); })
355#define __ffs(x) (ffs(x) - 1) 357#define __ffs(x) (ffs(x) - 1)
356#define ffz(x) __ffs( ~(x) ) 358#define ffz(x) __ffs( ~(x) )
diff --git a/include/asm-arm26/bitops.h b/include/asm-arm26/bitops.h
index 7d062fb2e343..15cc6f2da792 100644
--- a/include/asm-arm26/bitops.h
+++ b/include/asm-arm26/bitops.h
@@ -259,6 +259,7 @@ static inline unsigned long __ffs(unsigned long word)
259 */ 259 */
260 260
261#define fls(x) generic_fls(x) 261#define fls(x) generic_fls(x)
262#define fls64(x) generic_fls64(x)
262 263
263/* 264/*
264 * ffs: find first bit set. This is defined the same way as 265 * ffs: find first bit set. This is defined the same way as
diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h
index 1bddb3f3a289..d3eb0f1e4208 100644
--- a/include/asm-cris/bitops.h
+++ b/include/asm-cris/bitops.h
@@ -240,6 +240,7 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
240 */ 240 */
241 241
242#define fls(x) generic_fls(x) 242#define fls(x) generic_fls(x)
243#define fls64(x) generic_fls64(x)
243 244
244/* 245/*
245 * hweightN - returns the hamming weight of a N-bit word 246 * hweightN - returns the hamming weight of a N-bit word
diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h
index b664bd5b6663..02be7b3a8a83 100644
--- a/include/asm-frv/bitops.h
+++ b/include/asm-frv/bitops.h
@@ -228,6 +228,7 @@ found_middle:
228 \ 228 \
229 bit ? 33 - bit : bit; \ 229 bit ? 33 - bit : bit; \
230}) 230})
231#define fls64(x) generic_fls64(x)
231 232
232/* 233/*
233 * Every architecture must define this function. It's the fastest 234 * Every architecture must define this function. It's the fastest
diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h
index ce31b739fd80..0e6d9852008c 100644
--- a/include/asm-generic/bitops.h
+++ b/include/asm-generic/bitops.h
@@ -56,6 +56,7 @@ extern __inline__ int test_bit(int nr, const unsigned long * addr)
56 */ 56 */
57 57
58#define fls(x) generic_fls(x) 58#define fls(x) generic_fls(x)
59#define fls64(x) generic_fls64(x)
59 60
60#ifdef __KERNEL__ 61#ifdef __KERNEL__
61 62
diff --git a/include/asm-h8300/bitops.h b/include/asm-h8300/bitops.h
index 5036f595f8c9..c0411ec9d651 100644
--- a/include/asm-h8300/bitops.h
+++ b/include/asm-h8300/bitops.h
@@ -406,5 +406,6 @@ found_middle:
406#endif /* __KERNEL__ */ 406#endif /* __KERNEL__ */
407 407
408#define fls(x) generic_fls(x) 408#define fls(x) generic_fls(x)
409#define fls64(x) generic_fls64(x)
409 410
410#endif /* _H8300_BITOPS_H */ 411#endif /* _H8300_BITOPS_H */
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index ddf1739dc7fd..4807aa1d2e3d 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -372,6 +372,7 @@ static inline unsigned long ffz(unsigned long word)
372 */ 372 */
373 373
374#define fls(x) generic_fls(x) 374#define fls(x) generic_fls(x)
375#define fls64(x) generic_fls64(x)
375 376
376#ifdef __KERNEL__ 377#ifdef __KERNEL__
377 378
diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h
index 7232528e2d0c..36d0fb95ea89 100644
--- a/include/asm-ia64/bitops.h
+++ b/include/asm-ia64/bitops.h
@@ -345,6 +345,7 @@ fls (int t)
345 x |= x >> 16; 345 x |= x >> 16;
346 return ia64_popcnt(x); 346 return ia64_popcnt(x);
347} 347}
348#define fls64(x) generic_fls64(x)
348 349
349/* 350/*
350 * ffs: find first bit set. This is defined the same way as the libc and compiler builtin 351 * ffs: find first bit set. This is defined the same way as the libc and compiler builtin
diff --git a/include/asm-m32r/bitops.h b/include/asm-m32r/bitops.h
index e78443981349..abea2fdd8689 100644
--- a/include/asm-m32r/bitops.h
+++ b/include/asm-m32r/bitops.h
@@ -465,6 +465,7 @@ static __inline__ unsigned long __ffs(unsigned long word)
465 * fls: find last bit set. 465 * fls: find last bit set.
466 */ 466 */
467#define fls(x) generic_fls(x) 467#define fls(x) generic_fls(x)
468#define fls64(x) generic_fls64(x)
468 469
469#ifdef __KERNEL__ 470#ifdef __KERNEL__
470 471
diff --git a/include/asm-m68k/bitops.h b/include/asm-m68k/bitops.h
index b1bcf7c66516..13f4c0048463 100644
--- a/include/asm-m68k/bitops.h
+++ b/include/asm-m68k/bitops.h
@@ -310,6 +310,7 @@ static inline int fls(int x)
310 310
311 return 32 - cnt; 311 return 32 - cnt;
312} 312}
313#define fls64(x) generic_fls64(x)
313 314
314/* 315/*
315 * Every architecture must define this function. It's the fastest 316 * Every architecture must define this function. It's the fastest
diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h
index c42f88a9b9f9..4058dd086a02 100644
--- a/include/asm-m68knommu/bitops.h
+++ b/include/asm-m68knommu/bitops.h
@@ -499,5 +499,6 @@ found_middle:
499 * fls: find last bit set. 499 * fls: find last bit set.
500 */ 500 */
501#define fls(x) generic_fls(x) 501#define fls(x) generic_fls(x)
502#define fls64(x) generic_fls64(x)
502 503
503#endif /* _M68KNOMMU_BITOPS_H */ 504#endif /* _M68KNOMMU_BITOPS_H */
diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h
index 5496f9064a6a..3b0c8aaf6e8b 100644
--- a/include/asm-mips/bitops.h
+++ b/include/asm-mips/bitops.h
@@ -695,7 +695,7 @@ static inline unsigned long fls(unsigned long word)
695 695
696 return flz(~word) + 1; 696 return flz(~word) + 1;
697} 697}
698 698#define fls64(x) generic_fls64(x)
699 699
700/* 700/*
701 * find_next_zero_bit - find the first zero bit in a memory region 701 * find_next_zero_bit - find the first zero bit in a memory region
diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h
index 55b98c67fd82..15d8c2b51584 100644
--- a/include/asm-parisc/bitops.h
+++ b/include/asm-parisc/bitops.h
@@ -263,6 +263,7 @@ static __inline__ int fls(int x)
263 263
264 return ret; 264 return ret;
265} 265}
266#define fls64(x) generic_fls64(x)
266 267
267/* 268/*
268 * hweightN: returns the hamming weight (i.e. the number 269 * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-powerpc/bitops.h b/include/asm-powerpc/bitops.h
index 5727229b0444..1996eaa8aeae 100644
--- a/include/asm-powerpc/bitops.h
+++ b/include/asm-powerpc/bitops.h
@@ -310,6 +310,7 @@ static __inline__ int fls(unsigned int x)
310 asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); 310 asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x));
311 return 32 - lz; 311 return 32 - lz;
312} 312}
313#define fls64(x) generic_fls64(x)
313 314
314/* 315/*
315 * hweightN: returns the hamming weight (i.e. the number 316 * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h
index b07c578b22ea..61232760cc3b 100644
--- a/include/asm-s390/bitops.h
+++ b/include/asm-s390/bitops.h
@@ -839,6 +839,7 @@ static inline int sched_find_first_bit(unsigned long *b)
839 * fls: find last bit set. 839 * fls: find last bit set.
840 */ 840 */
841#define fls(x) generic_fls(x) 841#define fls(x) generic_fls(x)
842#define fls64(x) generic_fls64(x)
842 843
843/* 844/*
844 * hweightN: returns the hamming weight (i.e. the number 845 * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-sh/bitops.h b/include/asm-sh/bitops.h
index 5163d1ff2f1b..1c5260860045 100644
--- a/include/asm-sh/bitops.h
+++ b/include/asm-sh/bitops.h
@@ -470,6 +470,7 @@ found_middle:
470 */ 470 */
471 471
472#define fls(x) generic_fls(x) 472#define fls(x) generic_fls(x)
473#define fls64(x) generic_fls64(x)
473 474
474#endif /* __KERNEL__ */ 475#endif /* __KERNEL__ */
475 476
diff --git a/include/asm-sh64/bitops.h b/include/asm-sh64/bitops.h
index e1ff63e09227..ce9c3ad45fe0 100644
--- a/include/asm-sh64/bitops.h
+++ b/include/asm-sh64/bitops.h
@@ -510,6 +510,7 @@ found_middle:
510 510
511#define ffs(x) generic_ffs(x) 511#define ffs(x) generic_ffs(x)
512#define fls(x) generic_fls(x) 512#define fls(x) generic_fls(x)
513#define fls64(x) generic_fls64(x)
513 514
514#endif /* __KERNEL__ */ 515#endif /* __KERNEL__ */
515 516
diff --git a/include/asm-sparc/bitops.h b/include/asm-sparc/bitops.h
index bfbd795a0a80..41722b5e45ef 100644
--- a/include/asm-sparc/bitops.h
+++ b/include/asm-sparc/bitops.h
@@ -298,6 +298,7 @@ static inline int ffs(int x)
298 * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 298 * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
299 */ 299 */
300#define fls(x) generic_fls(x) 300#define fls(x) generic_fls(x)
301#define fls64(x) generic_fls64(x)
301 302
302/* 303/*
303 * hweightN: returns the hamming weight (i.e. the number 304 * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h
index 6388b8376c50..6efc0162fb09 100644
--- a/include/asm-sparc64/bitops.h
+++ b/include/asm-sparc64/bitops.h
@@ -119,6 +119,7 @@ static inline unsigned long __ffs(unsigned long word)
119 */ 119 */
120 120
121#define fls(x) generic_fls(x) 121#define fls(x) generic_fls(x)
122#define fls64(x) generic_fls64(x)
122 123
123#ifdef __KERNEL__ 124#ifdef __KERNEL__
124 125
diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h
index b91e799763fd..8955d2376ac8 100644
--- a/include/asm-v850/bitops.h
+++ b/include/asm-v850/bitops.h
@@ -276,6 +276,7 @@ found_middle:
276 276
277#define ffs(x) generic_ffs (x) 277#define ffs(x) generic_ffs (x)
278#define fls(x) generic_fls (x) 278#define fls(x) generic_fls (x)
279#define fls64(x) generic_fls64(x)
279#define __ffs(x) ffs(x) 280#define __ffs(x) ffs(x)
280 281
281 282
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 05a0d374404b..a4d5d0909453 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -340,6 +340,20 @@ static __inline__ unsigned long __ffs(unsigned long word)
340 return word; 340 return word;
341} 341}
342 342
343/*
344 * __fls: find last bit set.
345 * @word: The word to search
346 *
347 * Undefined if no set bit exists, so code should check against 0 first.
348 */
349static __inline__ unsigned long __fls(unsigned long word)
350{
351 __asm__("bsrq %1,%0"
352 :"=r" (word)
353 :"rm" (word));
354 return word;
355}
356
343#ifdef __KERNEL__ 357#ifdef __KERNEL__
344 358
345static inline int sched_find_first_bit(const unsigned long *b) 359static inline int sched_find_first_bit(const unsigned long *b)
@@ -370,6 +384,19 @@ static __inline__ int ffs(int x)
370} 384}
371 385
372/** 386/**
387 * fls64 - find last bit set in 64 bit word
388 * @x: the word to search
389 *
390 * This is defined the same way as fls.
391 */
392static __inline__ int fls64(__u64 x)
393{
394 if (x == 0)
395 return 0;
396 return __fls(x) + 1;
397}
398
399/**
373 * hweightN - returns the hamming weight of a N-bit word 400 * hweightN - returns the hamming weight of a N-bit word
374 * @x: the word to weigh 401 * @x: the word to weigh
375 * 402 *
diff --git a/include/asm-xtensa/bitops.h b/include/asm-xtensa/bitops.h
index e76ee889e21d..0a2065f1a372 100644
--- a/include/asm-xtensa/bitops.h
+++ b/include/asm-xtensa/bitops.h
@@ -245,6 +245,7 @@ static __inline__ int fls (unsigned int x)
245{ 245{
246 return __cntlz(x); 246 return __cntlz(x);
247} 247}
248#define fls64(x) generic_fls64(x)
248 249
249static __inline__ int 250static __inline__ int
250find_next_bit(const unsigned long *addr, int size, int offset) 251find_next_bit(const unsigned long *addr, int size, int offset)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 38c2fb7ebe09..6a2a19f14bb2 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -76,6 +76,15 @@ static __inline__ int generic_fls(int x)
76 */ 76 */
77#include <asm/bitops.h> 77#include <asm/bitops.h>
78 78
79/* generic_fls64 - find last (most significant) set bit of a 64-bit word, numbered like fls(): bit 0 -> 1, bit 63 -> 64. */
80static inline int generic_fls64(__u64 x)
81{
82 __u32 h = x >> 32; /* upper 32 bits of x */
83 if (h)
84 return fls(h) + 32; /* top bit lives in the upper half: search h, since fls() only looks at 32 bits */
85 return fls(x); /* upper half is zero, so the low 32 bits hold the answer */
86}
87
79static __inline__ int get_bitmask_order(unsigned int count) 88static __inline__ int get_bitmask_order(unsigned int count)
80{ 89{
81 int order; 90 int order;
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 71fab4311e92..088529f54965 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -192,10 +192,9 @@ enum {
192#include <linux/workqueue.h> 192#include <linux/workqueue.h>
193 193
194#include <net/inet_connection_sock.h> 194#include <net/inet_connection_sock.h>
195#include <net/inet_sock.h>
195#include <net/inet_timewait_sock.h> 196#include <net/inet_timewait_sock.h>
196#include <net/sock.h>
197#include <net/tcp_states.h> 197#include <net/tcp_states.h>
198#include <net/tcp.h>
199 198
200enum dccp_state { 199enum dccp_state {
201 DCCP_OPEN = TCP_ESTABLISHED, 200 DCCP_OPEN = TCP_ESTABLISHED,
@@ -408,8 +407,6 @@ struct dccp_ackvec;
408 * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss 407 * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss
409 * @dccps_timestamp_time - time of latest TIMESTAMP option 408 * @dccps_timestamp_time - time of latest TIMESTAMP option
410 * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option 409 * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option
411 * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options)
412 * @dccps_pmtu_cookie - Last pmtu seen by socket
413 * @dccps_packet_size - Set thru setsockopt 410 * @dccps_packet_size - Set thru setsockopt
414 * @dccps_role - Role of this sock, one of %dccp_role 411 * @dccps_role - Role of this sock, one of %dccp_role
415 * @dccps_ndp_count - number of Non Data Packets since last data packet 412 * @dccps_ndp_count - number of Non Data Packets since last data packet
@@ -434,8 +431,6 @@ struct dccp_sock {
434 __u32 dccps_timestamp_echo; 431 __u32 dccps_timestamp_echo;
435 __u32 dccps_packet_size; 432 __u32 dccps_packet_size;
436 unsigned long dccps_ndp_count; 433 unsigned long dccps_ndp_count;
437 __u16 dccps_ext_header_len;
438 __u32 dccps_pmtu_cookie;
439 __u32 dccps_mss_cache; 434 __u32 dccps_mss_cache;
440 struct dccp_options dccps_options; 435 struct dccp_options dccps_options;
441 struct dccp_ackvec *dccps_hc_rx_ackvec; 436 struct dccp_ackvec *dccps_hc_rx_ackvec;
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 5f49a30eb6f2..745c988359c0 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -63,10 +63,11 @@ static inline int is_zero_ether_addr(const u8 *addr)
63 * @addr: Pointer to a six-byte array containing the Ethernet address 63 * @addr: Pointer to a six-byte array containing the Ethernet address
64 * 64 *
65 * Return true if the address is a multicast address. 65 * Return true if the address is a multicast address.
66 * By definition the broadcast address is also a multicast address.
66 */ 67 */
67static inline int is_multicast_ether_addr(const u8 *addr) 68static inline int is_multicast_ether_addr(const u8 *addr)
68{ 69{
69 return ((addr[0] != 0xff) && (0x01 & addr[0])); 70 return (0x01 & addr[0]);
70} 71}
71 72
72/** 73/**
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index e677f73f13dd..4fab3d0a4bce 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -157,8 +157,7 @@ struct pppox_proto {
157extern int register_pppox_proto(int proto_num, struct pppox_proto *pp); 157extern int register_pppox_proto(int proto_num, struct pppox_proto *pp);
158extern void unregister_pppox_proto(int proto_num); 158extern void unregister_pppox_proto(int proto_num);
159extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ 159extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */
160extern int pppox_channel_ioctl(struct ppp_channel *pc, unsigned int cmd, 160extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
161 unsigned long arg);
162 161
163/* PPPoX socket states */ 162/* PPPoX socket states */
164enum { 163enum {
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 33e8a19a1a0f..9e2eb9a602eb 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -16,6 +16,7 @@
16 */ 16 */
17#ifndef _LINUX_IP_H 17#ifndef _LINUX_IP_H
18#define _LINUX_IP_H 18#define _LINUX_IP_H
19#include <linux/types.h>
19#include <asm/byteorder.h> 20#include <asm/byteorder.h>
20 21
21#define IPTOS_TOS_MASK 0x1E 22#define IPTOS_TOS_MASK 0x1E
@@ -78,126 +79,6 @@
78#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ 79#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */
79#define IPOPT_TS_PRESPEC 3 /* specified modules only */ 80#define IPOPT_TS_PRESPEC 3 /* specified modules only */
80 81
81#ifdef __KERNEL__
82#include <linux/config.h>
83#include <linux/types.h>
84#include <net/request_sock.h>
85#include <net/sock.h>
86#include <linux/igmp.h>
87#include <net/flow.h>
88
89struct ip_options {
90 __u32 faddr; /* Saved first hop address */
91 unsigned char optlen;
92 unsigned char srr;
93 unsigned char rr;
94 unsigned char ts;
95 unsigned char is_setbyuser:1, /* Set by setsockopt? */
96 is_data:1, /* Options in __data, rather than skb */
97 is_strictroute:1, /* Strict source route */
98 srr_is_hit:1, /* Packet destination addr was our one */
99 is_changed:1, /* IP checksum more not valid */
100 rr_needaddr:1, /* Need to record addr of outgoing dev */
101 ts_needtime:1, /* Need to record timestamp */
102 ts_needaddr:1; /* Need to record addr of outgoing dev */
103 unsigned char router_alert;
104 unsigned char __pad1;
105 unsigned char __pad2;
106 unsigned char __data[0];
107};
108
109#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
110
111struct inet_request_sock {
112 struct request_sock req;
113 u32 loc_addr;
114 u32 rmt_addr;
115 u16 rmt_port;
116 u16 snd_wscale : 4,
117 rcv_wscale : 4,
118 tstamp_ok : 1,
119 sack_ok : 1,
120 wscale_ok : 1,
121 ecn_ok : 1,
122 acked : 1;
123 struct ip_options *opt;
124};
125
126static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
127{
128 return (struct inet_request_sock *)sk;
129}
130
131struct ipv6_pinfo;
132
133struct inet_sock {
134 /* sk and pinet6 has to be the first two members of inet_sock */
135 struct sock sk;
136#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
137 struct ipv6_pinfo *pinet6;
138#endif
139 /* Socket demultiplex comparisons on incoming packets. */
140 __u32 daddr; /* Foreign IPv4 addr */
141 __u32 rcv_saddr; /* Bound local IPv4 addr */
142 __u16 dport; /* Destination port */
143 __u16 num; /* Local port */
144 __u32 saddr; /* Sending source */
145 __s16 uc_ttl; /* Unicast TTL */
146 __u16 cmsg_flags;
147 struct ip_options *opt;
148 __u16 sport; /* Source port */
149 __u16 id; /* ID counter for DF pkts */
150 __u8 tos; /* TOS */
151 __u8 mc_ttl; /* Multicasting TTL */
152 __u8 pmtudisc;
153 unsigned recverr : 1,
154 freebind : 1,
155 hdrincl : 1,
156 mc_loop : 1;
157 int mc_index; /* Multicast device index */
158 __u32 mc_addr;
159 struct ip_mc_socklist *mc_list; /* Group array */
160 /*
161 * Following members are used to retain the infomation to build
162 * an ip header on each ip fragmentation while the socket is corked.
163 */
164 struct {
165 unsigned int flags;
166 unsigned int fragsize;
167 struct ip_options *opt;
168 struct rtable *rt;
169 int length; /* Total length of all frames */
170 u32 addr;
171 struct flowi fl;
172 } cork;
173};
174
175#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
176#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */
177
178static inline struct inet_sock *inet_sk(const struct sock *sk)
179{
180 return (struct inet_sock *)sk;
181}
182
183static inline void __inet_sk_copy_descendant(struct sock *sk_to,
184 const struct sock *sk_from,
185 const int ancestor_size)
186{
187 memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
188 sk_from->sk_prot->obj_size - ancestor_size);
189}
190#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE))
191static inline void inet_sk_copy_descendant(struct sock *sk_to,
192 const struct sock *sk_from)
193{
194 __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
195}
196#endif
197#endif
198
199extern int inet_sk_rebuild_header(struct sock *sk);
200
201struct iphdr { 82struct iphdr {
202#if defined(__LITTLE_ENDIAN_BITFIELD) 83#if defined(__LITTLE_ENDIAN_BITFIELD)
203 __u8 ihl:4, 84 __u8 ihl:4,
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e0b922785d98..93bbed5c6cf4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -171,12 +171,13 @@ enum {
171}; 171};
172 172
173#ifdef __KERNEL__ 173#ifdef __KERNEL__
174#include <linux/in6.h> /* struct sockaddr_in6 */
175#include <linux/icmpv6.h> 174#include <linux/icmpv6.h>
176#include <net/if_inet6.h> /* struct ipv6_mc_socklist */
177#include <linux/tcp.h> 175#include <linux/tcp.h>
178#include <linux/udp.h> 176#include <linux/udp.h>
179 177
178#include <net/if_inet6.h> /* struct ipv6_mc_socklist */
179#include <net/inet_sock.h>
180
180/* 181/*
181 This structure contains results of exthdrs parsing 182 This structure contains results of exthdrs parsing
182 as offsets from skb->nh. 183 as offsets from skb->nh.
@@ -199,18 +200,17 @@ static inline int inet6_iif(const struct sk_buff *skb)
199 return IP6CB(skb)->iif; 200 return IP6CB(skb)->iif;
200} 201}
201 202
202struct tcp6_request_sock { 203struct inet6_request_sock {
203 struct tcp_request_sock req;
204 struct in6_addr loc_addr; 204 struct in6_addr loc_addr;
205 struct in6_addr rmt_addr; 205 struct in6_addr rmt_addr;
206 struct sk_buff *pktopts; 206 struct sk_buff *pktopts;
207 int iif; 207 int iif;
208}; 208};
209 209
210static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *sk) 210struct tcp6_request_sock {
211{ 211 struct tcp_request_sock tcp6rsk_tcp;
212 return (struct tcp6_request_sock *)sk; 212 struct inet6_request_sock tcp6rsk_inet6;
213} 213};
214 214
215/** 215/**
216 * struct ipv6_pinfo - ipv6 private area 216 * struct ipv6_pinfo - ipv6 private area
@@ -298,12 +298,36 @@ struct tcp6_sock {
298 struct ipv6_pinfo inet6; 298 struct ipv6_pinfo inet6;
299}; 299};
300 300
301extern int inet6_sk_rebuild_header(struct sock *sk);
302
301#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 303#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
302static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) 304static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
303{ 305{
304 return inet_sk(__sk)->pinet6; 306 return inet_sk(__sk)->pinet6;
305} 307}
306 308
309static inline struct inet6_request_sock *
310 inet6_rsk(const struct request_sock *rsk)
311{
312 return (struct inet6_request_sock *)(((u8 *)rsk) +
313 inet_rsk(rsk)->inet6_rsk_offset);
314}
315
316static inline u32 inet6_rsk_offset(struct request_sock *rsk)
317{
318 return rsk->rsk_ops->obj_size - sizeof(struct inet6_request_sock);
319}
320
321static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
322{
323 struct request_sock *req = reqsk_alloc(ops);
324
325 if (req != NULL)
326 inet_rsk(req)->inet6_rsk_offset = inet6_rsk_offset(req);
327
328 return req;
329}
330
307static inline struct raw6_sock *raw6_sk(const struct sock *sk) 331static inline struct raw6_sock *raw6_sk(const struct sock *sk)
308{ 332{
309 return (struct raw6_sock *)sk; 333 return (struct raw6_sock *)sk;
@@ -323,28 +347,37 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
323#define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) 347#define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only)
324#define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) 348#define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
325 349
326#include <linux/tcp.h> 350struct inet6_timewait_sock {
351 struct in6_addr tw_v6_daddr;
352 struct in6_addr tw_v6_rcv_saddr;
353};
327 354
328struct tcp6_timewait_sock { 355struct tcp6_timewait_sock {
329 struct tcp_timewait_sock tw_v6_sk; 356 struct tcp_timewait_sock tcp6tw_tcp;
330 struct in6_addr tw_v6_daddr; 357 struct inet6_timewait_sock tcp6tw_inet6;
331 struct in6_addr tw_v6_rcv_saddr;
332}; 358};
333 359
334static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk) 360static inline u16 inet6_tw_offset(const struct proto *prot)
335{ 361{
336 return (struct tcp6_timewait_sock *)sk; 362 return prot->twsk_prot->twsk_obj_size -
363 sizeof(struct inet6_timewait_sock);
337} 364}
338 365
339static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) 366static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk)
367{
368 return (struct inet6_timewait_sock *)(((u8 *)sk) +
369 inet_twsk(sk)->tw_ipv6_offset);
370}
371
372static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk)
340{ 373{
341 return likely(sk->sk_state != TCP_TIME_WAIT) ? 374 return likely(sk->sk_state != TCP_TIME_WAIT) ?
342 &inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr; 375 &inet6_sk(sk)->rcv_saddr : &inet6_twsk(sk)->tw_v6_rcv_saddr;
343} 376}
344 377
345static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) 378static inline struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
346{ 379{
347 return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; 380 return sk->sk_family == AF_INET6 ? __inet6_rcv_saddr(sk) : NULL;
348} 381}
349 382
350static inline int inet_v6_ipv6only(const struct sock *sk) 383static inline int inet_v6_ipv6only(const struct sock *sk)
@@ -361,13 +394,19 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
361 return NULL; 394 return NULL;
362} 395}
363 396
397static inline struct inet6_request_sock *
398 inet6_rsk(const struct request_sock *rsk)
399{
400 return NULL;
401}
402
364static inline struct raw6_sock *raw6_sk(const struct sock *sk) 403static inline struct raw6_sock *raw6_sk(const struct sock *sk)
365{ 404{
366 return NULL; 405 return NULL;
367} 406}
368 407
369#define __tcp_v6_rcv_saddr(__sk) NULL 408#define __inet6_rcv_saddr(__sk) NULL
370#define tcp_v6_rcv_saddr(__sk) NULL 409#define inet6_rcv_saddr(__sk) NULL
371#define tcp_twsk_ipv6only(__sk) 0 410#define tcp_twsk_ipv6only(__sk) 0
372#define inet_v6_ipv6only(__sk) 0 411#define inet_v6_ipv6only(__sk) 0
373#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ 412#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
diff --git a/include/linux/net.h b/include/linux/net.h
index d6a41e6577f6..28195a2d8ff0 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -107,7 +107,7 @@ enum sock_type {
107struct socket { 107struct socket {
108 socket_state state; 108 socket_state state;
109 unsigned long flags; 109 unsigned long flags;
110 struct proto_ops *ops; 110 const struct proto_ops *ops;
111 struct fasync_struct *fasync_list; 111 struct fasync_struct *fasync_list;
112 struct file *file; 112 struct file *file;
113 struct sock *sk; 113 struct sock *sk;
@@ -260,7 +260,7 @@ SOCKCALL_WRAP(name, recvmsg, (struct kiocb *iocb, struct socket *sock, struct ms
260SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \ 260SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \
261 (file, sock, vma)) \ 261 (file, sock, vma)) \
262 \ 262 \
263static struct proto_ops name##_ops = { \ 263static const struct proto_ops name##_ops = { \
264 .family = fam, \ 264 .family = fam, \
265 .owner = THIS_MODULE, \ 265 .owner = THIS_MODULE, \
266 .release = __lock_##name##_release, \ 266 .release = __lock_##name##_release, \
diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
index 724066778aff..6351c4055ace 100644
--- a/include/linux/pfkeyv2.h
+++ b/include/linux/pfkeyv2.h
@@ -216,6 +216,16 @@ struct sadb_x_nat_t_port {
216} __attribute__((packed)); 216} __attribute__((packed));
217/* sizeof(struct sadb_x_nat_t_port) == 8 */ 217/* sizeof(struct sadb_x_nat_t_port) == 8 */
218 218
219/* Generic LSM security context */
220struct sadb_x_sec_ctx {
221 uint16_t sadb_x_sec_len;
222 uint16_t sadb_x_sec_exttype;
223 uint8_t sadb_x_ctx_alg; /* LSMs: e.g., selinux == 1 */
224 uint8_t sadb_x_ctx_doi;
225 uint16_t sadb_x_ctx_len;
226} __attribute__((packed));
227/* sizeof(struct sadb_x_sec_ctx) == 8 */
228
219/* Message types */ 229/* Message types */
220#define SADB_RESERVED 0 230#define SADB_RESERVED 0
221#define SADB_GETSPI 1 231#define SADB_GETSPI 1
@@ -325,7 +335,8 @@ struct sadb_x_nat_t_port {
325#define SADB_X_EXT_NAT_T_SPORT 21 335#define SADB_X_EXT_NAT_T_SPORT 21
326#define SADB_X_EXT_NAT_T_DPORT 22 336#define SADB_X_EXT_NAT_T_DPORT 22
327#define SADB_X_EXT_NAT_T_OA 23 337#define SADB_X_EXT_NAT_T_OA 23
328#define SADB_EXT_MAX 23 338#define SADB_X_EXT_SEC_CTX 24
339#define SADB_EXT_MAX 24
329 340
330/* Identity Extension values */ 341/* Identity Extension values */
331#define SADB_IDENTTYPE_RESERVED 0 342#define SADB_IDENTTYPE_RESERVED 0
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index e87b233615b3..d10f35338507 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -429,6 +429,7 @@ enum
429 TCA_NETEM_CORR, 429 TCA_NETEM_CORR,
430 TCA_NETEM_DELAY_DIST, 430 TCA_NETEM_DELAY_DIST,
431 TCA_NETEM_REORDER, 431 TCA_NETEM_REORDER,
432 TCA_NETEM_CORRUPT,
432 __TCA_NETEM_MAX, 433 __TCA_NETEM_MAX,
433}; 434};
434 435
@@ -457,6 +458,12 @@ struct tc_netem_reorder
457 __u32 correlation; 458 __u32 correlation;
458}; 459};
459 460
461struct tc_netem_corrupt
462{
463 __u32 probability;
464 __u32 correlation;
465};
466
460#define NETEM_DIST_SCALE 8192 467#define NETEM_DIST_SCALE 8192
461 468
462#endif 469#endif
diff --git a/include/linux/random.h b/include/linux/random.h
index 7b2adb3322d5..5d6456bcdeba 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -52,9 +52,9 @@ extern void get_random_bytes(void *buf, int nbytes);
52void generate_random_uuid(unsigned char uuid_out[16]); 52void generate_random_uuid(unsigned char uuid_out[16]);
53 53
54extern __u32 secure_ip_id(__u32 daddr); 54extern __u32 secure_ip_id(__u32 daddr);
55extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); 55extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
56extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, 56extern u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr,
57 __u16 dport); 57 __u16 dport);
58extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, 58extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
59 __u16 sport, __u16 dport); 59 __u16 sport, __u16 dport);
60extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr, 60extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr,
diff --git a/include/linux/security.h b/include/linux/security.h
index f7e0ae018712..ef753654daa5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -59,6 +59,12 @@ struct sk_buff;
59struct sock; 59struct sock;
60struct sockaddr; 60struct sockaddr;
61struct socket; 61struct socket;
62struct flowi;
63struct dst_entry;
64struct xfrm_selector;
65struct xfrm_policy;
66struct xfrm_state;
67struct xfrm_user_sec_ctx;
62 68
63extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); 69extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
64extern int cap_netlink_recv(struct sk_buff *skb); 70extern int cap_netlink_recv(struct sk_buff *skb);
@@ -788,6 +794,52 @@ struct swap_info_struct;
788 * which is used to copy security attributes between local stream sockets. 794 * which is used to copy security attributes between local stream sockets.
789 * @sk_free_security: 795 * @sk_free_security:
790 * Deallocate security structure. 796 * Deallocate security structure.
797 * @sk_getsid:
798 * Retrieve the LSM-specific sid for the sock to enable caching of network
799 * authorizations.
800 *
801 * Security hooks for XFRM operations.
802 *
803 * @xfrm_policy_alloc_security:
804 * @xp contains the xfrm_policy being added to Security Policy Database
805 * used by the XFRM system.
806 * @sec_ctx contains the security context information being provided by
807 * the user-level policy update program (e.g., setkey).
808 * Allocate a security structure to the xp->selector.security field.
809 * The security field is initialized to NULL when the xfrm_policy is
810 * allocated.
811 * Return 0 if operation was successful (memory to allocate, legal context)
812 * @xfrm_policy_clone_security:
813 * @old contains an existing xfrm_policy in the SPD.
814 * @new contains a new xfrm_policy being cloned from old.
815 * Allocate a security structure to the new->selector.security field
816 * that contains the information from the old->selector.security field.
817 * Return 0 if operation was successful (memory to allocate).
818 * @xfrm_policy_free_security:
819 * @xp contains the xfrm_policy
820 * Deallocate xp->selector.security.
821 * @xfrm_state_alloc_security:
822 * @x contains the xfrm_state being added to the Security Association
823 * Database by the XFRM system.
824 * @sec_ctx contains the security context information being provided by
825 * the user-level SA generation program (e.g., setkey or racoon).
826 * Allocate a security structure to the x->sel.security field. The
827 * security field is initialized to NULL when the xfrm_state is
828 * allocated.
829 * Return 0 if operation was successful (memory to allocate, legal context).
830 * @xfrm_state_free_security:
831 * @x contains the xfrm_state.
832 * Deallocate x->sel.security.
833 * @xfrm_policy_lookup:
834 * @xp contains the xfrm_policy for which the access control is being
835 * checked.
836 * @sk_sid contains the sock security label that is used to authorize
837 * access to the policy xp.
838 * @dir contains the direction of the flow (input or output).
839 * Check permission when a sock selects a xfrm_policy for processing
840 * XFRMs on a packet. The hook is called when selecting either a
841 * per-socket policy or a generic xfrm policy.
842 * Return 0 if permission is granted.
791 * 843 *
792 * Security hooks affecting all Key Management operations 844 * Security hooks affecting all Key Management operations
793 * 845 *
@@ -1237,8 +1289,18 @@ struct security_operations {
1237 int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len); 1289 int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len);
1238 int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority); 1290 int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority);
1239 void (*sk_free_security) (struct sock *sk); 1291 void (*sk_free_security) (struct sock *sk);
1292 unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir);
1240#endif /* CONFIG_SECURITY_NETWORK */ 1293#endif /* CONFIG_SECURITY_NETWORK */
1241 1294
1295#ifdef CONFIG_SECURITY_NETWORK_XFRM
1296 int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
1297 int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new);
1298 void (*xfrm_policy_free_security) (struct xfrm_policy *xp);
1299 int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
1300 void (*xfrm_state_free_security) (struct xfrm_state *x);
1301 int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
1302#endif /* CONFIG_SECURITY_NETWORK_XFRM */
1303
1242 /* key management security hooks */ 1304 /* key management security hooks */
1243#ifdef CONFIG_KEYS 1305#ifdef CONFIG_KEYS
1244 int (*key_alloc)(struct key *key); 1306 int (*key_alloc)(struct key *key);
@@ -2679,6 +2741,11 @@ static inline void security_sk_free(struct sock *sk)
2679{ 2741{
2680 return security_ops->sk_free_security(sk); 2742 return security_ops->sk_free_security(sk);
2681} 2743}
2744
2745static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
2746{
2747 return security_ops->sk_getsid(sk, fl, dir);
2748}
2682#else /* CONFIG_SECURITY_NETWORK */ 2749#else /* CONFIG_SECURITY_NETWORK */
2683static inline int security_unix_stream_connect(struct socket * sock, 2750static inline int security_unix_stream_connect(struct socket * sock,
2684 struct socket * other, 2751 struct socket * other,
@@ -2795,8 +2862,73 @@ static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
2795static inline void security_sk_free(struct sock *sk) 2862static inline void security_sk_free(struct sock *sk)
2796{ 2863{
2797} 2864}
2865
2866static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
2867{
2868 return 0;
2869}
2798#endif /* CONFIG_SECURITY_NETWORK */ 2870#endif /* CONFIG_SECURITY_NETWORK */
2799 2871
2872#ifdef CONFIG_SECURITY_NETWORK_XFRM
2873static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
2874{
2875 return security_ops->xfrm_policy_alloc_security(xp, sec_ctx);
2876}
2877
2878static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
2879{
2880 return security_ops->xfrm_policy_clone_security(old, new);
2881}
2882
2883static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
2884{
2885 security_ops->xfrm_policy_free_security(xp);
2886}
2887
2888static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
2889{
2890 return security_ops->xfrm_state_alloc_security(x, sec_ctx);
2891}
2892
2893static inline void security_xfrm_state_free(struct xfrm_state *x)
2894{
2895 security_ops->xfrm_state_free_security(x);
2896}
2897
2898static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
2899{
2900 return security_ops->xfrm_policy_lookup(xp, sk_sid, dir);
2901}
2902#else /* CONFIG_SECURITY_NETWORK_XFRM */
2903static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
2904{
2905 return 0;
2906}
2907
2908static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
2909{
2910 return 0;
2911}
2912
2913static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
2914{
2915}
2916
2917static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
2918{
2919 return 0;
2920}
2921
2922static inline void security_xfrm_state_free(struct xfrm_state *x)
2923{
2924}
2925
2926static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
2927{
2928 return 0;
2929}
2930#endif /* CONFIG_SECURITY_NETWORK_XFRM */
2931
2800#ifdef CONFIG_KEYS 2932#ifdef CONFIG_KEYS
2801#ifdef CONFIG_SECURITY 2933#ifdef CONFIG_SECURITY
2802static inline int security_key_alloc(struct key *key) 2934static inline int security_key_alloc(struct key *key)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c5d6001a923..483cfc47ec34 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -32,7 +32,6 @@
32 32
33#define HAVE_ALLOC_SKB /* For the drivers to know */ 33#define HAVE_ALLOC_SKB /* For the drivers to know */
34#define HAVE_ALIGNABLE_SKB /* Ditto 8) */ 34#define HAVE_ALIGNABLE_SKB /* Ditto 8) */
35#define SLAB_SKB /* Slabified skbuffs */
36 35
37#define CHECKSUM_NONE 0 36#define CHECKSUM_NONE 0
38#define CHECKSUM_HW 1 37#define CHECKSUM_HW 1
@@ -134,7 +133,7 @@ struct skb_frag_struct {
134 */ 133 */
135struct skb_shared_info { 134struct skb_shared_info {
136 atomic_t dataref; 135 atomic_t dataref;
137 unsigned int nr_frags; 136 unsigned short nr_frags;
138 unsigned short tso_size; 137 unsigned short tso_size;
139 unsigned short tso_segs; 138 unsigned short tso_segs;
140 unsigned short ufo_size; 139 unsigned short ufo_size;
@@ -1239,6 +1238,8 @@ extern int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
1239 int hlen, 1238 int hlen,
1240 struct iovec *iov); 1239 struct iovec *iov);
1241extern void skb_free_datagram(struct sock *sk, struct sk_buff *skb); 1240extern void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
1241extern void skb_kill_datagram(struct sock *sk, struct sk_buff *skb,
1242 unsigned int flags);
1242extern unsigned int skb_checksum(const struct sk_buff *skb, int offset, 1243extern unsigned int skb_checksum(const struct sk_buff *skb, int offset,
1243 int len, unsigned int csum); 1244 int len, unsigned int csum);
1244extern int skb_copy_bits(const struct sk_buff *skb, int offset, 1245extern int skb_copy_bits(const struct sk_buff *skb, int offset,
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 1739c2d5b95b..9f4019156fd8 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -27,7 +27,6 @@ struct __kernel_sockaddr_storage {
27#include <linux/compiler.h> /* __user */ 27#include <linux/compiler.h> /* __user */
28 28
29extern int sysctl_somaxconn; 29extern int sysctl_somaxconn;
30extern void sock_init(void);
31#ifdef CONFIG_PROC_FS 30#ifdef CONFIG_PROC_FS
32struct seq_file; 31struct seq_file;
33extern void socket_seq_show(struct seq_file *seq); 32extern void socket_seq_show(struct seq_file *seq);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4be34ef8c2f7..93fa765e47d3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -390,6 +390,7 @@ enum
390 NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, 390 NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
391 NET_TCP_CONG_CONTROL=110, 391 NET_TCP_CONG_CONTROL=110,
392 NET_TCP_ABC=111, 392 NET_TCP_ABC=111,
393 NET_IPV4_IPFRAG_MAX_DIST=112,
393}; 394};
394 395
395enum { 396enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0e1da6602e05..f2bb2396853f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -55,22 +55,6 @@ struct tcphdr {
55 __u16 urg_ptr; 55 __u16 urg_ptr;
56}; 56};
57 57
58#define TCP_ACTION_FIN (1 << 7)
59
60enum {
61 TCPF_ESTABLISHED = (1 << 1),
62 TCPF_SYN_SENT = (1 << 2),
63 TCPF_SYN_RECV = (1 << 3),
64 TCPF_FIN_WAIT1 = (1 << 4),
65 TCPF_FIN_WAIT2 = (1 << 5),
66 TCPF_TIME_WAIT = (1 << 6),
67 TCPF_CLOSE = (1 << 7),
68 TCPF_CLOSE_WAIT = (1 << 8),
69 TCPF_LAST_ACK = (1 << 9),
70 TCPF_LISTEN = (1 << 10),
71 TCPF_CLOSING = (1 << 11)
72};
73
74/* 58/*
75 * The union cast uses a gcc extension to avoid aliasing problems 59 * The union cast uses a gcc extension to avoid aliasing problems
76 * (union is compatible to any of its members) 60 * (union is compatible to any of its members)
@@ -254,10 +238,9 @@ struct tcp_sock {
254 __u32 snd_wl1; /* Sequence for window update */ 238 __u32 snd_wl1; /* Sequence for window update */
255 __u32 snd_wnd; /* The window we expect to receive */ 239 __u32 snd_wnd; /* The window we expect to receive */
256 __u32 max_window; /* Maximal window ever seen from peer */ 240 __u32 max_window; /* Maximal window ever seen from peer */
257 __u32 pmtu_cookie; /* Last pmtu seen by socket */
258 __u32 mss_cache; /* Cached effective mss, not including SACKS */ 241 __u32 mss_cache; /* Cached effective mss, not including SACKS */
259 __u16 xmit_size_goal; /* Goal for segmenting output packets */ 242 __u16 xmit_size_goal; /* Goal for segmenting output packets */
260 __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ 243 /* XXX Two bytes hole, try to pack */
261 244
262 __u32 window_clamp; /* Maximal window to advertise */ 245 __u32 window_clamp; /* Maximal window to advertise */
263 __u32 rcv_ssthresh; /* Current window clamp */ 246 __u32 rcv_ssthresh; /* Current window clamp */
@@ -295,8 +278,6 @@ struct tcp_sock {
295 278
296 struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ 279 struct sk_buff_head out_of_order_queue; /* Out of order segments go here */
297 280
298 struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */
299
300 __u32 rcv_wnd; /* Current receiver window */ 281 __u32 rcv_wnd; /* Current receiver window */
301 __u32 rcv_wup; /* rcv_nxt on last window update sent */ 282 __u32 rcv_wup; /* rcv_nxt on last window update sent */
302 __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ 283 __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
diff --git a/include/linux/udp.h b/include/linux/udp.h
index b60e0b4a25c4..85a55658831c 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -35,10 +35,10 @@ struct udphdr {
35#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */ 35#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */
36 36
37#ifdef __KERNEL__ 37#ifdef __KERNEL__
38
39#include <linux/config.h> 38#include <linux/config.h>
40#include <net/sock.h> 39#include <linux/types.h>
41#include <linux/ip.h> 40
41#include <net/inet_sock.h>
42 42
43struct udp_sock { 43struct udp_sock {
44 /* inet_sock has to be the first member */ 44 /* inet_sock has to be the first member */
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 0fb077d68441..82fbb758e28f 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -27,6 +27,22 @@ struct xfrm_id
27 __u8 proto; 27 __u8 proto;
28}; 28};
29 29
30struct xfrm_sec_ctx {
31 __u8 ctx_doi;
32 __u8 ctx_alg;
33 __u16 ctx_len;
34 __u32 ctx_sid;
35 char ctx_str[0];
36};
37
38/* Security Context Domains of Interpretation */
39#define XFRM_SC_DOI_RESERVED 0
40#define XFRM_SC_DOI_LSM 1
41
42/* Security Context Algorithms */
43#define XFRM_SC_ALG_RESERVED 0
44#define XFRM_SC_ALG_SELINUX 1
45
30/* Selector, used as selector both on policy rules (SPD) and SAs. */ 46/* Selector, used as selector both on policy rules (SPD) and SAs. */
31 47
32struct xfrm_selector 48struct xfrm_selector
@@ -146,6 +162,18 @@ enum {
146 162
147#define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE) 163#define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)
148 164
165/*
166 * Generic LSM security context for comunicating to user space
167 * NOTE: Same format as sadb_x_sec_ctx
168 */
169struct xfrm_user_sec_ctx {
170 __u16 len;
171 __u16 exttype;
172 __u8 ctx_alg; /* LSMs: e.g., selinux == 1 */
173 __u8 ctx_doi;
174 __u16 ctx_len;
175};
176
149struct xfrm_user_tmpl { 177struct xfrm_user_tmpl {
150 struct xfrm_id id; 178 struct xfrm_id id;
151 __u16 family; 179 __u16 family;
@@ -176,6 +204,7 @@ enum xfrm_attr_type_t {
176 XFRMA_TMPL, /* 1 or more struct xfrm_user_tmpl */ 204 XFRMA_TMPL, /* 1 or more struct xfrm_user_tmpl */
177 XFRMA_SA, 205 XFRMA_SA,
178 XFRMA_POLICY, 206 XFRMA_POLICY,
207 XFRMA_SEC_CTX, /* struct xfrm_sec_ctx */
179 __XFRMA_MAX 208 __XFRMA_MAX
180 209
181#define XFRMA_MAX (__XFRMA_MAX - 1) 210#define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index b5d785ab4a0e..bfc1779fc753 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -13,7 +13,7 @@ extern void unix_gc(void);
13#define UNIX_HASH_SIZE 256 13#define UNIX_HASH_SIZE 256
14 14
15extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; 15extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
16extern rwlock_t unix_table_lock; 16extern spinlock_t unix_table_lock;
17 17
18extern atomic_t unix_tot_inflight; 18extern atomic_t unix_tot_inflight;
19 19
@@ -58,10 +58,10 @@ struct unix_skb_parms {
58#define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb)) 58#define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb))
59#define UNIXCREDS(skb) (&UNIXCB((skb)).creds) 59#define UNIXCREDS(skb) (&UNIXCB((skb)).creds)
60 60
61#define unix_state_rlock(s) read_lock(&unix_sk(s)->lock) 61#define unix_state_rlock(s) spin_lock(&unix_sk(s)->lock)
62#define unix_state_runlock(s) read_unlock(&unix_sk(s)->lock) 62#define unix_state_runlock(s) spin_unlock(&unix_sk(s)->lock)
63#define unix_state_wlock(s) write_lock(&unix_sk(s)->lock) 63#define unix_state_wlock(s) spin_lock(&unix_sk(s)->lock)
64#define unix_state_wunlock(s) write_unlock(&unix_sk(s)->lock) 64#define unix_state_wunlock(s) spin_unlock(&unix_sk(s)->lock)
65 65
66#ifdef __KERNEL__ 66#ifdef __KERNEL__
67/* The AF_UNIX socket */ 67/* The AF_UNIX socket */
@@ -76,7 +76,7 @@ struct unix_sock {
76 struct sock *other; 76 struct sock *other;
77 struct sock *gc_tree; 77 struct sock *gc_tree;
78 atomic_t inflight; 78 atomic_t inflight;
79 rwlock_t lock; 79 spinlock_t lock;
80 wait_queue_head_t peer_wait; 80 wait_queue_head_t peer_wait;
81}; 81};
82#define unix_sk(__sk) ((struct unix_sock *)__sk) 82#define unix_sk(__sk) ((struct unix_sock *)__sk)
diff --git a/include/net/atmclip.h b/include/net/atmclip.h
index 47048b1d179a..90fcc98e676f 100644
--- a/include/net/atmclip.h
+++ b/include/net/atmclip.h
@@ -7,7 +7,6 @@
7#define _ATMCLIP_H 7#define _ATMCLIP_H
8 8
9#include <linux/netdevice.h> 9#include <linux/netdevice.h>
10#include <linux/skbuff.h>
11#include <linux/atm.h> 10#include <linux/atm.h>
12#include <linux/atmdev.h> 11#include <linux/atmdev.h>
13#include <linux/atmarp.h> 12#include <linux/atmarp.h>
@@ -18,6 +17,7 @@
18#define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back)) 17#define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back))
19#define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key) 18#define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key)
20 19
20struct sk_buff;
21 21
22struct clip_vcc { 22struct clip_vcc {
23 struct atm_vcc *vcc; /* VCC descriptor */ 23 struct atm_vcc *vcc; /* VCC descriptor */
diff --git a/include/net/dst.h b/include/net/dst.h
index 6c196a5baf24..bee8b84d329d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -9,6 +9,7 @@
9#define _NET_DST_H 9#define _NET_DST_H
10 10
11#include <linux/config.h> 11#include <linux/config.h>
12#include <linux/netdevice.h>
12#include <linux/rtnetlink.h> 13#include <linux/rtnetlink.h>
13#include <linux/rcupdate.h> 14#include <linux/rcupdate.h>
14#include <linux/jiffies.h> 15#include <linux/jiffies.h>
diff --git a/include/net/flow.h b/include/net/flow.h
index 9a5c94b1a0ec..ec7eb86eb203 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -84,11 +84,12 @@ struct flowi {
84#define FLOW_DIR_OUT 1 84#define FLOW_DIR_OUT 1
85#define FLOW_DIR_FWD 2 85#define FLOW_DIR_FWD 2
86 86
87typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir, 87struct sock;
88typedef void (*flow_resolve_t)(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
88 void **objp, atomic_t **obj_refp); 89 void **objp, atomic_t **obj_refp);
89 90
90extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, 91extern void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
91 flow_resolve_t resolver); 92 flow_resolve_t resolver);
92extern void flow_cache_flush(void); 93extern void flow_cache_flush(void);
93extern atomic_t flow_cache_genid; 94extern atomic_t flow_cache_genid;
94 95
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 52d8b1a73d52..c5b96b2b8155 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -60,7 +60,7 @@ struct genl_info
60 */ 60 */
61struct genl_ops 61struct genl_ops
62{ 62{
63 unsigned int cmd; 63 u8 cmd;
64 unsigned int flags; 64 unsigned int flags;
65 struct nla_policy *policy; 65 struct nla_policy *policy;
66 int (*doit)(struct sk_buff *skb, 66 int (*doit)(struct sk_buff *skb,
diff --git a/include/net/icmp.h b/include/net/icmp.h
index 6cdebeee5f96..e7c3f20fbafc 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -20,12 +20,9 @@
20 20
21#include <linux/config.h> 21#include <linux/config.h>
22#include <linux/icmp.h> 22#include <linux/icmp.h>
23#include <linux/skbuff.h>
24 23
25#include <net/sock.h> 24#include <net/inet_sock.h>
26#include <net/protocol.h>
27#include <net/snmp.h> 25#include <net/snmp.h>
28#include <linux/ip.h>
29 26
30struct icmp_err { 27struct icmp_err {
31 int errno; 28 int errno;
@@ -38,6 +35,10 @@ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics);
38#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) 35#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field)
39#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) 36#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field)
40 37
38struct dst_entry;
39struct net_proto_family;
40struct sk_buff;
41
41extern void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info); 42extern void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info);
42extern int icmp_rcv(struct sk_buff *skb); 43extern int icmp_rcv(struct sk_buff *skb);
43extern int icmp_ioctl(struct sock *sk, int cmd, unsigned long arg); 44extern int icmp_ioctl(struct sock *sk, int cmd, unsigned long arg);
diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h
index 225fc751d464..03b766afdc39 100644
--- a/include/net/ieee80211_crypt.h
+++ b/include/net/ieee80211_crypt.h
@@ -23,12 +23,17 @@
23#ifndef IEEE80211_CRYPT_H 23#ifndef IEEE80211_CRYPT_H
24#define IEEE80211_CRYPT_H 24#define IEEE80211_CRYPT_H
25 25
26#include <linux/skbuff.h> 26#include <linux/types.h>
27#include <linux/list.h>
28#include <asm/atomic.h>
27 29
28enum { 30enum {
29 IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), 31 IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0),
30}; 32};
31 33
34struct sk_buff;
35struct module;
36
32struct ieee80211_crypto_ops { 37struct ieee80211_crypto_ops {
33 const char *name; 38 const char *name;
34 struct list_head list; 39 struct list_head list;
@@ -87,6 +92,8 @@ struct ieee80211_crypt_data {
87 atomic_t refcnt; 92 atomic_t refcnt;
88}; 93};
89 94
95struct ieee80211_device;
96
90int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops); 97int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops);
91int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops); 98int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops);
92struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name); 99struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name);
diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
new file mode 100644
index 000000000000..b33b438bffcc
--- /dev/null
+++ b/include/net/inet6_connection_sock.h
@@ -0,0 +1,42 @@
1/*
2 * NET Generic infrastructure for INET6 connection oriented protocols.
3 *
4 * Authors: Many people, see the TCPv6 sources
5 *
6 * From code originally in TCPv6
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13#ifndef _INET6_CONNECTION_SOCK_H
14#define _INET6_CONNECTION_SOCK_H
15
16#include <linux/types.h>
17
18struct in6_addr;
19struct inet_bind_bucket;
20struct request_sock;
21struct sk_buff;
22struct sock;
23struct sockaddr;
24
25extern int inet6_csk_bind_conflict(const struct sock *sk,
26 const struct inet_bind_bucket *tb);
27
28extern struct request_sock *inet6_csk_search_req(const struct sock *sk,
29 struct request_sock ***prevp,
30 const __u16 rport,
31 const struct in6_addr *raddr,
32 const struct in6_addr *laddr,
33 const int iif);
34
35extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
36 struct request_sock *req,
37 const unsigned long timeout);
38
39extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
40
41extern int inet6_csk_xmit(struct sk_buff *skb, int ipfragok);
42#endif /* _INET6_CONNECTION_SOCK_H */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 5a2beed5a770..25f708ff020e 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -48,6 +48,32 @@ static inline int inet6_sk_ehashfn(const struct sock *sk)
48 return inet6_ehashfn(laddr, lport, faddr, fport); 48 return inet6_ehashfn(laddr, lport, faddr, fport);
49} 49}
50 50
51static inline void __inet6_hash(struct inet_hashinfo *hashinfo,
52 struct sock *sk)
53{
54 struct hlist_head *list;
55 rwlock_t *lock;
56
57 BUG_TRAP(sk_unhashed(sk));
58
59 if (sk->sk_state == TCP_LISTEN) {
60 list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
61 lock = &hashinfo->lhash_lock;
62 inet_listen_wlock(hashinfo);
63 } else {
64 unsigned int hash;
65 sk->sk_hash = hash = inet6_sk_ehashfn(sk);
66 hash &= (hashinfo->ehash_size - 1);
67 list = &hashinfo->ehash[hash].chain;
68 lock = &hashinfo->ehash[hash].lock;
69 write_lock(lock);
70 }
71
72 __sk_add_node(sk, list);
73 sock_prot_inc_use(sk->sk_prot);
74 write_unlock(lock);
75}
76
51/* 77/*
52 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so 78 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
53 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM 79 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
@@ -84,10 +110,10 @@ static inline struct sock *
84 110
85 if(*((__u32 *)&(tw->tw_dport)) == ports && 111 if(*((__u32 *)&(tw->tw_dport)) == ports &&
86 sk->sk_family == PF_INET6) { 112 sk->sk_family == PF_INET6) {
87 const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); 113 const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
88 114
89 if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && 115 if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
90 ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && 116 ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
91 (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) 117 (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
92 goto hit; 118 goto hit;
93 } 119 }
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index f943306ce5ff..227adcbdfec8 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -1,8 +1,8 @@
1#ifndef _INET_COMMON_H 1#ifndef _INET_COMMON_H
2#define _INET_COMMON_H 2#define _INET_COMMON_H
3 3
4extern struct proto_ops inet_stream_ops; 4extern const struct proto_ops inet_stream_ops;
5extern struct proto_ops inet_dgram_ops; 5extern const struct proto_ops inet_dgram_ops;
6 6
7/* 7/*
8 * INET4 prototypes used by INET6 8 * INET4 prototypes used by INET6
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b0c99060b78d..50234fa56a68 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -15,9 +15,11 @@
15#ifndef _INET_CONNECTION_SOCK_H 15#ifndef _INET_CONNECTION_SOCK_H
16#define _INET_CONNECTION_SOCK_H 16#define _INET_CONNECTION_SOCK_H
17 17
18#include <linux/ip.h> 18#include <linux/compiler.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/timer.h> 20#include <linux/timer.h>
21
22#include <net/inet_sock.h>
21#include <net/request_sock.h> 23#include <net/request_sock.h>
22 24
23#define INET_CSK_DEBUG 1 25#define INET_CSK_DEBUG 1
@@ -29,6 +31,29 @@ struct inet_bind_bucket;
29struct inet_hashinfo; 31struct inet_hashinfo;
30struct tcp_congestion_ops; 32struct tcp_congestion_ops;
31 33
34/*
35 * Pointers to address related TCP functions
36 * (i.e. things that depend on the address family)
37 */
38struct inet_connection_sock_af_ops {
39 int (*queue_xmit)(struct sk_buff *skb, int ipfragok);
40 void (*send_check)(struct sock *sk, int len,
41 struct sk_buff *skb);
42 int (*rebuild_header)(struct sock *sk);
43 int (*conn_request)(struct sock *sk, struct sk_buff *skb);
44 struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
45 struct request_sock *req,
46 struct dst_entry *dst);
47 int (*remember_stamp)(struct sock *sk);
48 __u16 net_header_len;
49 int (*setsockopt)(struct sock *sk, int level, int optname,
50 char __user *optval, int optlen);
51 int (*getsockopt)(struct sock *sk, int level, int optname,
52 char __user *optval, int __user *optlen);
53 void (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
54 int sockaddr_len;
55};
56
32/** inet_connection_sock - INET connection oriented sock 57/** inet_connection_sock - INET connection oriented sock
33 * 58 *
34 * @icsk_accept_queue: FIFO of established children 59 * @icsk_accept_queue: FIFO of established children
@@ -36,13 +61,16 @@ struct tcp_congestion_ops;
36 * @icsk_timeout: Timeout 61 * @icsk_timeout: Timeout
37 * @icsk_retransmit_timer: Resend (no ack) 62 * @icsk_retransmit_timer: Resend (no ack)
38 * @icsk_rto: Retransmit timeout 63 * @icsk_rto: Retransmit timeout
64 * @icsk_pmtu_cookie Last pmtu seen by socket
39 * @icsk_ca_ops Pluggable congestion control hook 65 * @icsk_ca_ops Pluggable congestion control hook
66 * @icsk_af_ops Operations which are AF_INET{4,6} specific
40 * @icsk_ca_state: Congestion control state 67 * @icsk_ca_state: Congestion control state
41 * @icsk_retransmits: Number of unrecovered [RTO] timeouts 68 * @icsk_retransmits: Number of unrecovered [RTO] timeouts
42 * @icsk_pending: Scheduled timer event 69 * @icsk_pending: Scheduled timer event
43 * @icsk_backoff: Backoff 70 * @icsk_backoff: Backoff
44 * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries 71 * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries
45 * @icsk_probes_out: unanswered 0 window probes 72 * @icsk_probes_out: unanswered 0 window probes
73 * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options)
46 * @icsk_ack: Delayed ACK control data 74 * @icsk_ack: Delayed ACK control data
47 */ 75 */
48struct inet_connection_sock { 76struct inet_connection_sock {
@@ -54,14 +82,17 @@ struct inet_connection_sock {
54 struct timer_list icsk_retransmit_timer; 82 struct timer_list icsk_retransmit_timer;
55 struct timer_list icsk_delack_timer; 83 struct timer_list icsk_delack_timer;
56 __u32 icsk_rto; 84 __u32 icsk_rto;
85 __u32 icsk_pmtu_cookie;
57 struct tcp_congestion_ops *icsk_ca_ops; 86 struct tcp_congestion_ops *icsk_ca_ops;
87 struct inet_connection_sock_af_ops *icsk_af_ops;
88 unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
58 __u8 icsk_ca_state; 89 __u8 icsk_ca_state;
59 __u8 icsk_retransmits; 90 __u8 icsk_retransmits;
60 __u8 icsk_pending; 91 __u8 icsk_pending;
61 __u8 icsk_backoff; 92 __u8 icsk_backoff;
62 __u8 icsk_syn_retries; 93 __u8 icsk_syn_retries;
63 __u8 icsk_probes_out; 94 __u8 icsk_probes_out;
64 /* 2 BYTES HOLE, TRY TO PACK! */ 95 __u16 icsk_ext_hdr_len;
65 struct { 96 struct {
66 __u8 pending; /* ACK is pending */ 97 __u8 pending; /* ACK is pending */
67 __u8 quick; /* Scheduled number of quick acks */ 98 __u8 quick; /* Scheduled number of quick acks */
@@ -192,8 +223,12 @@ extern struct request_sock *inet_csk_search_req(const struct sock *sk,
192 const __u16 rport, 223 const __u16 rport,
193 const __u32 raddr, 224 const __u32 raddr,
194 const __u32 laddr); 225 const __u32 laddr);
226extern int inet_csk_bind_conflict(const struct sock *sk,
227 const struct inet_bind_bucket *tb);
195extern int inet_csk_get_port(struct inet_hashinfo *hashinfo, 228extern int inet_csk_get_port(struct inet_hashinfo *hashinfo,
196 struct sock *sk, unsigned short snum); 229 struct sock *sk, unsigned short snum,
230 int (*bind_conflict)(const struct sock *sk,
231 const struct inet_bind_bucket *tb));
197 232
198extern struct dst_entry* inet_csk_route_req(struct sock *sk, 233extern struct dst_entry* inet_csk_route_req(struct sock *sk,
199 const struct request_sock *req); 234 const struct request_sock *req);
@@ -207,7 +242,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk,
207 242
208extern void inet_csk_reqsk_queue_hash_add(struct sock *sk, 243extern void inet_csk_reqsk_queue_hash_add(struct sock *sk,
209 struct request_sock *req, 244 struct request_sock *req,
210 const unsigned timeout); 245 unsigned long timeout);
211 246
212static inline void inet_csk_reqsk_queue_removed(struct sock *sk, 247static inline void inet_csk_reqsk_queue_removed(struct sock *sk,
213 struct request_sock *req) 248 struct request_sock *req)
@@ -273,4 +308,6 @@ static inline unsigned int inet_csk_listen_poll(const struct sock *sk)
273extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries); 308extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries);
274extern void inet_csk_listen_stop(struct sock *sk); 309extern void inet_csk_listen_stop(struct sock *sk);
275 310
311extern void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
312
276#endif /* _INET_CONNECTION_SOCK_H */ 313#endif /* _INET_CONNECTION_SOCK_H */
diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index b0c47e2eccf1..d599c6bfbb86 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -3,6 +3,8 @@
3 3
4#include <linux/ip.h> 4#include <linux/ip.h>
5#include <linux/skbuff.h> 5#include <linux/skbuff.h>
6
7#include <net/inet_sock.h>
6#include <net/dsfield.h> 8#include <net/dsfield.h>
7 9
8enum { 10enum {
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 07840baa9341..135d80fd658e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -26,6 +26,7 @@
26#include <linux/wait.h> 26#include <linux/wait.h>
27 27
28#include <net/inet_connection_sock.h> 28#include <net/inet_connection_sock.h>
29#include <net/inet_sock.h>
29#include <net/route.h> 30#include <net/route.h>
30#include <net/sock.h> 31#include <net/sock.h>
31#include <net/tcp_states.h> 32#include <net/tcp_states.h>
@@ -128,26 +129,6 @@ struct inet_hashinfo {
128 kmem_cache_t *bind_bucket_cachep; 129 kmem_cache_t *bind_bucket_cachep;
129}; 130};
130 131
131static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
132 const __u32 faddr, const __u16 fport)
133{
134 unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
135 h ^= h >> 16;
136 h ^= h >> 8;
137 return h;
138}
139
140static inline int inet_sk_ehashfn(const struct sock *sk)
141{
142 const struct inet_sock *inet = inet_sk(sk);
143 const __u32 laddr = inet->rcv_saddr;
144 const __u16 lport = inet->num;
145 const __u32 faddr = inet->daddr;
146 const __u16 fport = inet->dport;
147
148 return inet_ehashfn(laddr, lport, faddr, fport);
149}
150
151static inline struct inet_ehash_bucket *inet_ehash_bucket( 132static inline struct inet_ehash_bucket *inet_ehash_bucket(
152 struct inet_hashinfo *hashinfo, 133 struct inet_hashinfo *hashinfo,
153 unsigned int hash) 134 unsigned int hash)
@@ -434,4 +415,7 @@ static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
434 415
435 return sk; 416 return sk;
436} 417}
418
419extern int inet_hash_connect(struct inet_timewait_death_row *death_row,
420 struct sock *sk);
437#endif /* _INET_HASHTABLES_H */ 421#endif /* _INET_HASHTABLES_H */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
new file mode 100644
index 000000000000..883eb529ef8e
--- /dev/null
+++ b/include/net/inet_sock.h
@@ -0,0 +1,193 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Definitions for inet_sock
7 *
8 * Authors: Many, reorganised here by
9 * Arnaldo Carvalho de Melo <acme@mandriva.com>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 */
16#ifndef _INET_SOCK_H
17#define _INET_SOCK_H
18
19#include <linux/config.h>
20
21#include <linux/string.h>
22#include <linux/types.h>
23
24#include <net/flow.h>
25#include <net/sock.h>
26#include <net/request_sock.h>
27
28/** struct ip_options - IP Options
29 *
30 * @faddr - Saved first hop address
31 * @is_setbyuser - Set by setsockopt?
32 * @is_data - Options in __data, rather than skb
33 * @is_strictroute - Strict source route
34 * @srr_is_hit - Packet destination addr was our one
35 * @is_changed - IP checksum no longer valid
36 * @rr_needaddr - Need to record addr of outgoing dev
37 * @ts_needtime - Need to record timestamp
38 * @ts_needaddr - Need to record addr of outgoing dev
39 */
40struct ip_options {
41 __u32 faddr;
42 unsigned char optlen;
43 unsigned char srr;
44 unsigned char rr;
45 unsigned char ts;
46 unsigned char is_setbyuser:1,
47 is_data:1,
48 is_strictroute:1,
49 srr_is_hit:1,
50 is_changed:1,
51 rr_needaddr:1,
52 ts_needtime:1,
53 ts_needaddr:1;
54 unsigned char router_alert;
55 unsigned char __pad1;
56 unsigned char __pad2;
57 unsigned char __data[0];
58};
59
60#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
61
62struct inet_request_sock {
63 struct request_sock req;
64#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
65 u16 inet6_rsk_offset;
66 /* 2 bytes hole, try to pack */
67#endif
68 u32 loc_addr;
69 u32 rmt_addr;
70 u16 rmt_port;
71 u16 snd_wscale : 4,
72 rcv_wscale : 4,
73 tstamp_ok : 1,
74 sack_ok : 1,
75 wscale_ok : 1,
76 ecn_ok : 1,
77 acked : 1;
78 struct ip_options *opt;
79};
80
81static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
82{
83 return (struct inet_request_sock *)sk;
84}
85
86struct ip_mc_socklist;
87struct ipv6_pinfo;
88struct rtable;
89
90/** struct inet_sock - representation of INET sockets
91 *
92 * @sk - ancestor class
93 * @pinet6 - pointer to IPv6 control block
94 * @daddr - Foreign IPv4 addr
95 * @rcv_saddr - Bound local IPv4 addr
96 * @dport - Destination port
97 * @num - Local port
98 * @saddr - Sending source
99 * @uc_ttl - Unicast TTL
100 * @sport - Source port
101 * @id - ID counter for DF pkts
102 * @tos - TOS
103 * @mc_ttl - Multicasting TTL
104 * @is_icsk - is this an inet_connection_sock?
105 * @mc_index - Multicast device index
106 * @mc_list - Group array
107 * @cork - info to build ip hdr on each ip frag while socket is corked
108 */
109struct inet_sock {
110 /* sk and pinet6 have to be the first two members of inet_sock */
111 struct sock sk;
112#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
113 struct ipv6_pinfo *pinet6;
114#endif
115 /* Socket demultiplex comparisons on incoming packets. */
116 __u32 daddr;
117 __u32 rcv_saddr;
118 __u16 dport;
119 __u16 num;
120 __u32 saddr;
121 __s16 uc_ttl;
122 __u16 cmsg_flags;
123 struct ip_options *opt;
124 __u16 sport;
125 __u16 id;
126 __u8 tos;
127 __u8 mc_ttl;
128 __u8 pmtudisc;
129 __u8 recverr:1,
130 is_icsk:1,
131 freebind:1,
132 hdrincl:1,
133 mc_loop:1;
134 int mc_index;
135 __u32 mc_addr;
136 struct ip_mc_socklist *mc_list;
137 struct {
138 unsigned int flags;
139 unsigned int fragsize;
140 struct ip_options *opt;
141 struct rtable *rt;
142 int length; /* Total length of all frames */
143 u32 addr;
144 struct flowi fl;
145 } cork;
146};
147
148#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
149#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */
150
151static inline struct inet_sock *inet_sk(const struct sock *sk)
152{
153 return (struct inet_sock *)sk;
154}
155
156static inline void __inet_sk_copy_descendant(struct sock *sk_to,
157 const struct sock *sk_from,
158 const int ancestor_size)
159{
160 memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
161 sk_from->sk_prot->obj_size - ancestor_size);
162}
163#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE))
164static inline void inet_sk_copy_descendant(struct sock *sk_to,
165 const struct sock *sk_from)
166{
167 __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
168}
169#endif
170
171extern int inet_sk_rebuild_header(struct sock *sk);
172
173static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
174 const __u32 faddr, const __u16 fport)
175{
176 unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
177 h ^= h >> 16;
178 h ^= h >> 8;
179 return h;
180}
181
182static inline int inet_sk_ehashfn(const struct sock *sk)
183{
184 const struct inet_sock *inet = inet_sk(sk);
185 const __u32 laddr = inet->rcv_saddr;
186 const __u16 lport = inet->num;
187 const __u32 faddr = inet->daddr;
188 const __u16 fport = inet->dport;
189
190 return inet_ehashfn(laddr, lport, faddr, fport);
191}
192
193#endif /* _INET_SOCK_H */
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 28f7b2103505..1da294c47522 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -17,15 +17,16 @@
17 17
18#include <linux/config.h> 18#include <linux/config.h>
19 19
20#include <linux/ip.h>
21#include <linux/list.h> 20#include <linux/list.h>
22#include <linux/module.h> 21#include <linux/module.h>
23#include <linux/timer.h> 22#include <linux/timer.h>
24#include <linux/types.h> 23#include <linux/types.h>
25#include <linux/workqueue.h> 24#include <linux/workqueue.h>
26 25
26#include <net/inet_sock.h>
27#include <net/sock.h> 27#include <net/sock.h>
28#include <net/tcp_states.h> 28#include <net/tcp_states.h>
29#include <net/timewait_sock.h>
29 30
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31 32
@@ -127,7 +128,8 @@ struct inet_timewait_sock {
127 __u16 tw_num; 128 __u16 tw_num;
128 /* And these are ours. */ 129 /* And these are ours. */
129 __u8 tw_ipv6only:1; 130 __u8 tw_ipv6only:1;
130 /* 31 bits hole, try to pack */ 131 /* 15 bits hole, try to pack */
132 __u16 tw_ipv6_offset;
131 int tw_timeout; 133 int tw_timeout;
132 unsigned long tw_ttd; 134 unsigned long tw_ttd;
133 struct inet_bind_bucket *tw_tb; 135 struct inet_bind_bucket *tw_tb;
@@ -199,7 +201,7 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw)
199 printk(KERN_DEBUG "%s timewait_sock %p released\n", 201 printk(KERN_DEBUG "%s timewait_sock %p released\n",
200 tw->tw_prot->name, tw); 202 tw->tw_prot->name, tw);
201#endif 203#endif
202 kmem_cache_free(tw->tw_prot->twsk_slab, tw); 204 kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
203 module_put(owner); 205 module_put(owner);
204 } 206 }
205} 207}
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 7fda471002b6..0965515f40cf 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -25,6 +25,7 @@ struct inet_peer
25 __u32 v4daddr; /* peer's address */ 25 __u32 v4daddr; /* peer's address */
26 __u16 avl_height; 26 __u16 avl_height;
27 __u16 ip_id_count; /* IP ID for the next packet */ 27 __u16 ip_id_count; /* IP ID for the next packet */
28 atomic_t rid; /* Frag reception counter */
28 __u32 tcp_ts; 29 __u32 tcp_ts;
29 unsigned long tcp_ts_stamp; 30 unsigned long tcp_ts_stamp;
30}; 31};
diff --git a/include/net/ip.h b/include/net/ip.h
index e4563bbee6ea..f7e7fd728b67 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -24,14 +24,10 @@
24 24
25#include <linux/config.h> 25#include <linux/config.h>
26#include <linux/types.h> 26#include <linux/types.h>
27#include <linux/socket.h>
28#include <linux/ip.h> 27#include <linux/ip.h>
29#include <linux/in.h> 28#include <linux/in.h>
30#include <linux/netdevice.h> 29
31#include <linux/inetdevice.h> 30#include <net/inet_sock.h>
32#include <linux/in_route.h>
33#include <net/route.h>
34#include <net/arp.h>
35#include <net/snmp.h> 31#include <net/snmp.h>
36 32
37struct sock; 33struct sock;
@@ -45,6 +41,7 @@ struct inet_skb_parm
45#define IPSKB_TRANSLATED 2 41#define IPSKB_TRANSLATED 2
46#define IPSKB_FORWARDED 4 42#define IPSKB_FORWARDED 4
47#define IPSKB_XFRM_TUNNEL_SIZE 8 43#define IPSKB_XFRM_TUNNEL_SIZE 8
44#define IPSKB_FRAG_COMPLETE 16
48}; 45};
49 46
50struct ipcm_cookie 47struct ipcm_cookie
@@ -74,6 +71,13 @@ extern rwlock_t ip_ra_lock;
74 71
75#define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ 72#define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */
76 73
74struct msghdr;
75struct net_device;
76struct packet_type;
77struct rtable;
78struct sk_buff;
79struct sockaddr;
80
77extern void ip_mc_dropsocket(struct sock *); 81extern void ip_mc_dropsocket(struct sock *);
78extern void ip_mc_dropdevice(struct net_device *dev); 82extern void ip_mc_dropdevice(struct net_device *dev);
79extern int igmp_mc_proc_init(void); 83extern int igmp_mc_proc_init(void);
@@ -168,6 +172,7 @@ extern int sysctl_ipfrag_high_thresh;
168extern int sysctl_ipfrag_low_thresh; 172extern int sysctl_ipfrag_low_thresh;
169extern int sysctl_ipfrag_time; 173extern int sysctl_ipfrag_time;
170extern int sysctl_ipfrag_secret_interval; 174extern int sysctl_ipfrag_secret_interval;
175extern int sysctl_ipfrag_max_dist;
171 176
172/* From inetpeer.c */ 177/* From inetpeer.c */
173extern int inet_peer_threshold; 178extern int inet_peer_threshold;
@@ -182,6 +187,8 @@ extern int sysctl_ip_dynaddr;
182extern void ipfrag_init(void); 187extern void ipfrag_init(void);
183 188
184#ifdef CONFIG_INET 189#ifdef CONFIG_INET
190#include <net/dst.h>
191
185/* The function in 2.2 was invalid, producing wrong result for 192/* The function in 2.2 was invalid, producing wrong result for
186 * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ 193 * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
187static inline 194static inline
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 14de4ebd1211..e000fa2cd5f6 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -238,6 +238,8 @@ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
238 struct net_device *dev, u32 *spec_dst, u32 *itag); 238 struct net_device *dev, u32 *spec_dst, u32 *itag);
239extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res); 239extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res);
240 240
241struct rtentry;
242
241/* Exported by fib_semantics.c */ 243/* Exported by fib_semantics.c */
242extern int ip_fib_check_default(u32 gw, struct net_device *dev); 244extern int ip_fib_check_default(u32 gw, struct net_device *dev);
243extern int fib_sync_down(u32 local, struct net_device *dev, int force); 245extern int fib_sync_down(u32 local, struct net_device *dev, int force);
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 3b5559a023a4..7d2674fde19a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -251,16 +251,15 @@ struct ip_vs_daemon_user {
251#include <linux/config.h> 251#include <linux/config.h>
252#include <linux/list.h> /* for struct list_head */ 252#include <linux/list.h> /* for struct list_head */
253#include <linux/spinlock.h> /* for struct rwlock_t */ 253#include <linux/spinlock.h> /* for struct rwlock_t */
254#include <linux/skbuff.h> /* for struct sk_buff */
255#include <linux/ip.h> /* for struct iphdr */
256#include <asm/atomic.h> /* for struct atomic_t */ 254#include <asm/atomic.h> /* for struct atomic_t */
257#include <linux/netdevice.h> /* for struct neighbour */
258#include <net/dst.h> /* for struct dst_entry */
259#include <net/udp.h>
260#include <linux/compiler.h> 255#include <linux/compiler.h>
256#include <linux/timer.h>
261 257
258#include <net/checksum.h>
262 259
263#ifdef CONFIG_IP_VS_DEBUG 260#ifdef CONFIG_IP_VS_DEBUG
261#include <linux/net.h>
262
264extern int ip_vs_get_debug_level(void); 263extern int ip_vs_get_debug_level(void);
265#define IP_VS_DBG(level, msg...) \ 264#define IP_VS_DBG(level, msg...) \
266 do { \ 265 do { \
@@ -429,8 +428,11 @@ struct ip_vs_stats
429 spinlock_t lock; /* spin lock */ 428 spinlock_t lock; /* spin lock */
430}; 429};
431 430
431struct dst_entry;
432struct iphdr;
432struct ip_vs_conn; 433struct ip_vs_conn;
433struct ip_vs_app; 434struct ip_vs_app;
435struct sk_buff;
434 436
435struct ip_vs_protocol { 437struct ip_vs_protocol {
436 struct ip_vs_protocol *next; 438 struct ip_vs_protocol *next;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0a2ad51cff82..860bbac4c4ee 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -240,6 +240,8 @@ extern struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_t
240struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, 240struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
241 struct ipv6_txoptions *opt); 241 struct ipv6_txoptions *opt);
242 242
243extern int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb);
244
243extern int ip6_frag_nqueues; 245extern int ip6_frag_nqueues;
244extern atomic_t ip6_frag_mem; 246extern atomic_t ip6_frag_mem;
245 247
@@ -525,6 +527,9 @@ extern int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
525extern int inet6_ioctl(struct socket *sock, unsigned int cmd, 527extern int inet6_ioctl(struct socket *sock, unsigned int cmd,
526 unsigned long arg); 528 unsigned long arg);
527 529
530extern int inet6_hash_connect(struct inet_timewait_death_row *death_row,
531 struct sock *sk);
532
528/* 533/*
529 * reassembly.c 534 * reassembly.c
530 */ 535 */
@@ -533,8 +538,11 @@ extern int sysctl_ip6frag_low_thresh;
533extern int sysctl_ip6frag_time; 538extern int sysctl_ip6frag_time;
534extern int sysctl_ip6frag_secret_interval; 539extern int sysctl_ip6frag_secret_interval;
535 540
536extern struct proto_ops inet6_stream_ops; 541extern const struct proto_ops inet6_stream_ops;
537extern struct proto_ops inet6_dgram_ops; 542extern const struct proto_ops inet6_dgram_ops;
543
544struct group_source_req;
545struct group_filter;
538 546
539extern int ip6_mc_source(int add, int omode, struct sock *sk, 547extern int ip6_mc_source(int add, int omode, struct sock *sk,
540 struct group_source_req *pgsr); 548 struct group_source_req *pgsr);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index f85d6e4b7442..bbac87eeb422 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -35,11 +35,20 @@ enum {
35 35
36#ifdef __KERNEL__ 36#ifdef __KERNEL__
37 37
38#include <linux/skbuff.h> 38#include <linux/config.h>
39#include <linux/netdevice.h> 39#include <linux/compiler.h>
40#include <linux/icmpv6.h> 40#include <linux/icmpv6.h>
41#include <linux/in6.h>
42#include <linux/types.h>
43
41#include <net/neighbour.h> 44#include <net/neighbour.h>
42#include <asm/atomic.h> 45
46struct ctl_table;
47struct file;
48struct inet6_dev;
49struct net_device;
50struct net_proto_family;
51struct sk_buff;
43 52
44extern struct neigh_table nd_tbl; 53extern struct neigh_table nd_tbl;
45 54
@@ -108,7 +117,7 @@ extern int igmp6_event_report(struct sk_buff *skb);
108extern void igmp6_cleanup(void); 117extern void igmp6_cleanup(void);
109 118
110#ifdef CONFIG_SYSCTL 119#ifdef CONFIG_SYSCTL
111extern int ndisc_ifinfo_sysctl_change(ctl_table *ctl, 120extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl,
112 int write, 121 int write,
113 struct file * filp, 122 struct file * filp,
114 void __user *buffer, 123 void __user *buffer,
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 34c07731933d..6fa9ae190741 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -49,8 +49,8 @@
49#ifdef __KERNEL__ 49#ifdef __KERNEL__
50 50
51#include <asm/atomic.h> 51#include <asm/atomic.h>
52#include <linux/skbuff.h>
53#include <linux/netdevice.h> 52#include <linux/netdevice.h>
53#include <linux/skbuff.h>
54#include <linux/rcupdate.h> 54#include <linux/rcupdate.h>
55#include <linux/seq_file.h> 55#include <linux/seq_file.h>
56 56
diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h
index bd08964b72c0..b225d8472b7e 100644
--- a/include/net/pkt_act.h
+++ b/include/net/pkt_act.h
@@ -15,7 +15,6 @@
15#include <linux/in.h> 15#include <linux/in.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/interrupt.h> 17#include <linux/interrupt.h>
18#include <linux/netdevice.h>
19#include <linux/skbuff.h> 18#include <linux/skbuff.h>
20#include <linux/rtnetlink.h> 19#include <linux/rtnetlink.h>
21#include <linux/module.h> 20#include <linux/module.h>
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 357691f6a45f..63f7db99c2a6 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -65,7 +65,7 @@ struct inet_protosw {
65 int protocol; /* This is the L4 protocol number. */ 65 int protocol; /* This is the L4 protocol number. */
66 66
67 struct proto *prot; 67 struct proto *prot;
68 struct proto_ops *ops; 68 const struct proto_ops *ops;
69 69
70 int capability; /* Which (if any) capability do 70 int capability; /* Which (if any) capability do
71 * we need to use this socket 71 * we need to use this socket
@@ -76,6 +76,7 @@ struct inet_protosw {
76}; 76};
77#define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */ 77#define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */
78#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ 78#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
79#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */
79 80
80extern struct net_protocol *inet_protocol_base; 81extern struct net_protocol *inet_protocol_base;
81extern struct net_protocol *inet_protos[MAX_INET_PROTOS]; 82extern struct net_protocol *inet_protos[MAX_INET_PROTOS];
diff --git a/include/net/raw.h b/include/net/raw.h
index f47917469b12..e67b28a0248c 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -19,6 +19,8 @@
19 19
20#include <linux/config.h> 20#include <linux/config.h>
21 21
22#include <net/protocol.h>
23
22extern struct proto raw_prot; 24extern struct proto raw_prot;
23 25
24extern void raw_err(struct sock *, struct sk_buff *, u32 info); 26extern void raw_err(struct sock *, struct sk_buff *, u32 info);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index b52cc52ffe39..11641c9384f7 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -244,7 +244,7 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
244 244
245static inline void reqsk_queue_hash_req(struct request_sock_queue *queue, 245static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
246 u32 hash, struct request_sock *req, 246 u32 hash, struct request_sock *req,
247 unsigned timeout) 247 unsigned long timeout)
248{ 248{
249 struct listen_sock *lopt = queue->listen_opt; 249 struct listen_sock *lopt = queue->listen_opt;
250 250
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 8e7794ee27ff..f5c22d77feab 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -277,6 +277,24 @@ struct sctp_sock {
277 __u32 default_context; 277 __u32 default_context;
278 __u32 default_timetolive; 278 __u32 default_timetolive;
279 279
280 /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
281 * the destination address every heartbeat interval. This value
282 * will be inherited by all new associations.
283 */
284 __u32 hbinterval;
285
286 /* This is the max_retrans value for new associations. */
287 __u16 pathmaxrxt;
288
289 /* The initial Path MTU to use for new associations. */
290 __u32 pathmtu;
291
292 /* The default SACK delay timeout for new associations. */
293 __u32 sackdelay;
294
295 /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */
296 __u32 param_flags;
297
280 struct sctp_initmsg initmsg; 298 struct sctp_initmsg initmsg;
281 struct sctp_rtoinfo rtoinfo; 299 struct sctp_rtoinfo rtoinfo;
282 struct sctp_paddrparams paddrparam; 300 struct sctp_paddrparams paddrparam;
@@ -845,9 +863,6 @@ struct sctp_transport {
845 /* Data that has been sent, but not acknowledged. */ 863 /* Data that has been sent, but not acknowledged. */
846 __u32 flight_size; 864 __u32 flight_size;
847 865
848 /* PMTU : The current known path MTU. */
849 __u32 pmtu;
850
851 /* Destination */ 866 /* Destination */
852 struct dst_entry *dst; 867 struct dst_entry *dst;
853 /* Source address. */ 868 /* Source address. */
@@ -862,7 +877,22 @@ struct sctp_transport {
862 /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to 877 /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
863 * the destination address every heartbeat interval. 878 * the destination address every heartbeat interval.
864 */ 879 */
865 int hb_interval; 880 __u32 hbinterval;
881
882 /* This is the max_retrans value for the transport and will
883 * be initialized from the assocs value. This can be changed
884 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
885 */
886 __u16 pathmaxrxt;
887
888 /* PMTU : The current known path MTU. */
889 __u32 pathmtu;
890
891 /* SACK delay timeout */
892 __u32 sackdelay;
893
894 /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */
895 __u32 param_flags;
866 896
867 /* When was the last time (in jiffies) that we heard from this 897 /* When was the last time (in jiffies) that we heard from this
868 * transport? We use this to pick new active and retran paths. 898 * transport? We use this to pick new active and retran paths.
@@ -882,22 +912,11 @@ struct sctp_transport {
882 */ 912 */
883 int state; 913 int state;
884 914
885 /* hb_allowed : The current heartbeat state of this destination,
886 * : i.e. ALLOW-HB, NO-HEARTBEAT, etc.
887 */
888 int hb_allowed;
889
890 /* These are the error stats for this destination. */ 915 /* These are the error stats for this destination. */
891 916
892 /* Error count : The current error count for this destination. */ 917 /* Error count : The current error count for this destination. */
893 unsigned short error_count; 918 unsigned short error_count;
894 919
895 /* This is the max_retrans value for the transport and will
896 * be initialized to proto.max_retrans.path. This can be changed
897 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
898 */
899 int max_retrans;
900
901 /* Per : A timer used by each destination. 920 /* Per : A timer used by each destination.
902 * Destination : 921 * Destination :
903 * Timer : 922 * Timer :
@@ -1502,6 +1521,28 @@ struct sctp_association {
1502 /* The largest timeout or RTO value to use in attempting an INIT */ 1521 /* The largest timeout or RTO value to use in attempting an INIT */
1503 __u16 max_init_timeo; 1522 __u16 max_init_timeo;
1504 1523
1524 /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
1525 * the destination address every heartbeat interval. This value
1526 * will be inherited by all new transports.
1527 */
1528 __u32 hbinterval;
1529
1530 /* This is the max_retrans value for new transports in the
1531 * association.
1532 */
1533 __u16 pathmaxrxt;
1534
1535 /* Association : The smallest PMTU discovered for all of the
1536 * PMTU : peer's transport addresses.
1537 */
1538 __u32 pathmtu;
1539
1540 /* SACK delay timeout */
1541 __u32 sackdelay;
1542
1543 /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */
1544 __u32 param_flags;
1545
1505 int timeouts[SCTP_NUM_TIMEOUT_TYPES]; 1546 int timeouts[SCTP_NUM_TIMEOUT_TYPES];
1506 struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES]; 1547 struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES];
1507 1548
@@ -1571,11 +1612,6 @@ struct sctp_association {
1571 */ 1612 */
1572 wait_queue_head_t wait; 1613 wait_queue_head_t wait;
1573 1614
1574 /* Association : The smallest PMTU discovered for all of the
1575 * PMTU : peer's transport addresses.
1576 */
1577 __u32 pmtu;
1578
1579 /* The message size at which SCTP fragmentation will occur. */ 1615 /* The message size at which SCTP fragmentation will occur. */
1580 __u32 frag_point; 1616 __u32 frag_point;
1581 1617
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index f1c3bc54526a..8a6bef6f91eb 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,8 @@ enum sctp_optname {
93#define SCTP_STATUS SCTP_STATUS 93#define SCTP_STATUS SCTP_STATUS
94 SCTP_GET_PEER_ADDR_INFO, 94 SCTP_GET_PEER_ADDR_INFO,
95#define SCTP_GET_PEER_ADDR_INFO SCTP_GET_PEER_ADDR_INFO 95#define SCTP_GET_PEER_ADDR_INFO SCTP_GET_PEER_ADDR_INFO
96 SCTP_DELAYED_ACK_TIME,
97#define SCTP_DELAYED_ACK_TIME SCTP_DELAYED_ACK_TIME
96 98
97 /* Internal Socket Options. Some of the sctp library functions are 99 /* Internal Socket Options. Some of the sctp library functions are
98 * implemented using these socket options. 100 * implemented using these socket options.
@@ -503,13 +505,41 @@ struct sctp_setadaption {
503 * unreachable. The following structure is used to access and modify an 505 * unreachable. The following structure is used to access and modify an
504 * address's parameters: 506 * address's parameters:
505 */ 507 */
508enum sctp_spp_flags {
509 SPP_HB_ENABLE = 1, /*Enable heartbeats*/
510 SPP_HB_DISABLE = 2, /*Disable heartbeats*/
511 SPP_HB = SPP_HB_ENABLE | SPP_HB_DISABLE,
512 SPP_HB_DEMAND = 4, /*Send heartbeat immediately*/
513 SPP_PMTUD_ENABLE = 8, /*Enable PMTU discovery*/
514 SPP_PMTUD_DISABLE = 16, /*Disable PMTU discovery*/
515 SPP_PMTUD = SPP_PMTUD_ENABLE | SPP_PMTUD_DISABLE,
516 SPP_SACKDELAY_ENABLE = 32, /*Enable SACK*/
517 SPP_SACKDELAY_DISABLE = 64, /*Disable SACK*/
518 SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE,
519};
520
506struct sctp_paddrparams { 521struct sctp_paddrparams {
507 sctp_assoc_t spp_assoc_id; 522 sctp_assoc_t spp_assoc_id;
508 struct sockaddr_storage spp_address; 523 struct sockaddr_storage spp_address;
509 __u32 spp_hbinterval; 524 __u32 spp_hbinterval;
510 __u16 spp_pathmaxrxt; 525 __u16 spp_pathmaxrxt;
526 __u32 spp_pathmtu;
527 __u32 spp_sackdelay;
528 __u32 spp_flags;
511} __attribute__((packed, aligned(4))); 529} __attribute__((packed, aligned(4)));
512 530
531/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
532 *
533 * This option will get or set the delayed ack timer. The time is set
534 * in milliseconds. If the assoc_id is 0, then this sets or gets the
535 * endpoint's default delayed ack timer value. If the assoc_id field is
536 * non-zero, then the set or get affects the specified association.
537 */
538struct sctp_assoc_value {
539 sctp_assoc_t assoc_id;
540 uint32_t assoc_value;
541};
542
513/* 543/*
514 * 7.2.2 Peer Address Information 544 * 7.2.2 Peer Address Information
515 * 545 *
diff --git a/include/net/sock.h b/include/net/sock.h
index 982b4ecd187b..6961700ff3a0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -493,6 +493,7 @@ extern void sk_stream_kill_queues(struct sock *sk);
493extern int sk_wait_data(struct sock *sk, long *timeo); 493extern int sk_wait_data(struct sock *sk, long *timeo);
494 494
495struct request_sock_ops; 495struct request_sock_ops;
496struct timewait_sock_ops;
496 497
497/* Networking protocol blocks we attach to sockets. 498/* Networking protocol blocks we attach to sockets.
498 * socket layer -> transport layer interface 499 * socket layer -> transport layer interface
@@ -557,11 +558,10 @@ struct proto {
557 kmem_cache_t *slab; 558 kmem_cache_t *slab;
558 unsigned int obj_size; 559 unsigned int obj_size;
559 560
560 kmem_cache_t *twsk_slab;
561 unsigned int twsk_obj_size;
562 atomic_t *orphan_count; 561 atomic_t *orphan_count;
563 562
564 struct request_sock_ops *rsk_prot; 563 struct request_sock_ops *rsk_prot;
564 struct timewait_sock_ops *twsk_prot;
565 565
566 struct module *owner; 566 struct module *owner;
567 567
@@ -926,6 +926,29 @@ static inline void sock_put(struct sock *sk)
926 sk_free(sk); 926 sk_free(sk);
927} 927}
928 928
929static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
930{
931 int rc = NET_RX_SUCCESS;
932
933 if (sk_filter(sk, skb, 0))
934 goto discard_and_relse;
935
936 skb->dev = NULL;
937
938 bh_lock_sock(sk);
939 if (!sock_owned_by_user(sk))
940 rc = sk->sk_backlog_rcv(sk, skb);
941 else
942 sk_add_backlog(sk, skb);
943 bh_unlock_sock(sk);
944out:
945 sock_put(sk);
946 return rc;
947discard_and_relse:
948 kfree_skb(skb);
949 goto out;
950}
951
929/* Detach socket from process context. 952/* Detach socket from process context.
930 * Announce socket dead, detach it from wait queue and inode. 953 * Announce socket dead, detach it from wait queue and inode.
931 * Note that parent inode held reference count on this struct sock, 954 * Note that parent inode held reference count on this struct sock,
@@ -1166,7 +1189,10 @@ static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
1166 1189
1167static inline int sock_error(struct sock *sk) 1190static inline int sock_error(struct sock *sk)
1168{ 1191{
1169 int err = xchg(&sk->sk_err, 0); 1192 int err;
1193 if (likely(!sk->sk_err))
1194 return 0;
1195 err = xchg(&sk->sk_err, 0);
1170 return -err; 1196 return -err;
1171} 1197}
1172 1198
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d78025f9fbea..77f21c65bbca 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -225,53 +225,6 @@ extern atomic_t tcp_sockets_allocated;
225extern int tcp_memory_pressure; 225extern int tcp_memory_pressure;
226 226
227/* 227/*
228 * Pointers to address related TCP functions
229 * (i.e. things that depend on the address family)
230 */
231
232struct tcp_func {
233 int (*queue_xmit) (struct sk_buff *skb,
234 int ipfragok);
235
236 void (*send_check) (struct sock *sk,
237 struct tcphdr *th,
238 int len,
239 struct sk_buff *skb);
240
241 int (*rebuild_header) (struct sock *sk);
242
243 int (*conn_request) (struct sock *sk,
244 struct sk_buff *skb);
245
246 struct sock * (*syn_recv_sock) (struct sock *sk,
247 struct sk_buff *skb,
248 struct request_sock *req,
249 struct dst_entry *dst);
250
251 int (*remember_stamp) (struct sock *sk);
252
253 __u16 net_header_len;
254
255 int (*setsockopt) (struct sock *sk,
256 int level,
257 int optname,
258 char __user *optval,
259 int optlen);
260
261 int (*getsockopt) (struct sock *sk,
262 int level,
263 int optname,
264 char __user *optval,
265 int __user *optlen);
266
267
268 void (*addr2sockaddr) (struct sock *sk,
269 struct sockaddr *);
270
271 int sockaddr_len;
272};
273
274/*
275 * The next routines deal with comparing 32 bit unsigned ints 228 * The next routines deal with comparing 32 bit unsigned ints
276 * and worry about wraparound (automatic with unsigned arithmetic). 229 * and worry about wraparound (automatic with unsigned arithmetic).
277 */ 230 */
@@ -334,6 +287,9 @@ extern int tcp_rcv_established(struct sock *sk,
334 287
335extern void tcp_rcv_space_adjust(struct sock *sk); 288extern void tcp_rcv_space_adjust(struct sock *sk);
336 289
290extern int tcp_twsk_unique(struct sock *sk,
291 struct sock *sktw, void *twp);
292
337static inline void tcp_dec_quickack_mode(struct sock *sk, 293static inline void tcp_dec_quickack_mode(struct sock *sk,
338 const unsigned int pkts) 294 const unsigned int pkts)
339{ 295{
@@ -405,8 +361,7 @@ extern void tcp_parse_options(struct sk_buff *skb,
405 * TCP v4 functions exported for the inet6 API 361 * TCP v4 functions exported for the inet6 API
406 */ 362 */
407 363
408extern void tcp_v4_send_check(struct sock *sk, 364extern void tcp_v4_send_check(struct sock *sk, int len,
409 struct tcphdr *th, int len,
410 struct sk_buff *skb); 365 struct sk_buff *skb);
411 366
412extern int tcp_v4_conn_request(struct sock *sk, 367extern int tcp_v4_conn_request(struct sock *sk,
@@ -490,34 +445,16 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
490extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, 445extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
491 sk_read_actor_t recv_actor); 446 sk_read_actor_t recv_actor);
492 447
493/* Initialize RCV_MSS value. 448extern void tcp_initialize_rcv_mss(struct sock *sk);
494 * RCV_MSS is an our guess about MSS used by the peer.
495 * We haven't any direct information about the MSS.
496 * It's better to underestimate the RCV_MSS rather than overestimate.
497 * Overestimations make us ACKing less frequently than needed.
498 * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
499 */
500 449
501static inline void tcp_initialize_rcv_mss(struct sock *sk) 450static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
502{
503 struct tcp_sock *tp = tcp_sk(sk);
504 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
505
506 hint = min(hint, tp->rcv_wnd/2);
507 hint = min(hint, TCP_MIN_RCVMSS);
508 hint = max(hint, TCP_MIN_MSS);
509
510 inet_csk(sk)->icsk_ack.rcv_mss = hint;
511}
512
513static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
514{ 451{
515 tp->pred_flags = htonl((tp->tcp_header_len << 26) | 452 tp->pred_flags = htonl((tp->tcp_header_len << 26) |
516 ntohl(TCP_FLAG_ACK) | 453 ntohl(TCP_FLAG_ACK) |
517 snd_wnd); 454 snd_wnd);
518} 455}
519 456
520static __inline__ void tcp_fast_path_on(struct tcp_sock *tp) 457static inline void tcp_fast_path_on(struct tcp_sock *tp)
521{ 458{
522 __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); 459 __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
523} 460}
@@ -535,7 +472,7 @@ static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp)
535 * Rcv_nxt can be after the window if our peer push more data 472 * Rcv_nxt can be after the window if our peer push more data
536 * than the offered window. 473 * than the offered window.
537 */ 474 */
538static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp) 475static inline u32 tcp_receive_window(const struct tcp_sock *tp)
539{ 476{
540 s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; 477 s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
541 478
@@ -707,6 +644,7 @@ extern void tcp_cleanup_congestion_control(struct sock *sk);
707extern int tcp_set_default_congestion_control(const char *name); 644extern int tcp_set_default_congestion_control(const char *name);
708extern void tcp_get_default_congestion_control(char *name); 645extern void tcp_get_default_congestion_control(char *name);
709extern int tcp_set_congestion_control(struct sock *sk, const char *name); 646extern int tcp_set_congestion_control(struct sock *sk, const char *name);
647extern void tcp_slow_start(struct tcp_sock *tp);
710 648
711extern struct tcp_congestion_ops tcp_init_congestion_ops; 649extern struct tcp_congestion_ops tcp_init_congestion_ops;
712extern u32 tcp_reno_ssthresh(struct sock *sk); 650extern u32 tcp_reno_ssthresh(struct sock *sk);
@@ -746,7 +684,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
746 * "Packets left network, but not honestly ACKed yet" PLUS 684 * "Packets left network, but not honestly ACKed yet" PLUS
747 * "Packets fast retransmitted" 685 * "Packets fast retransmitted"
748 */ 686 */
749static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) 687static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
750{ 688{
751 return (tp->packets_out - tp->left_out + tp->retrans_out); 689 return (tp->packets_out - tp->left_out + tp->retrans_out);
752} 690}
@@ -766,33 +704,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
766 (tp->snd_cwnd >> 2))); 704 (tp->snd_cwnd >> 2)));
767} 705}
768 706
769/*
770 * Linear increase during slow start
771 */
772static inline void tcp_slow_start(struct tcp_sock *tp)
773{
774 if (sysctl_tcp_abc) {
775 /* RFC3465: Slow Start
776 * TCP sender SHOULD increase cwnd by the number of
777 * previously unacknowledged bytes ACKed by each incoming
778 * acknowledgment, provided the increase is not more than L
779 */
780 if (tp->bytes_acked < tp->mss_cache)
781 return;
782
783 /* We MAY increase by 2 if discovered delayed ack */
784 if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
785 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
786 tp->snd_cwnd++;
787 }
788 }
789 tp->bytes_acked = 0;
790
791 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
792 tp->snd_cwnd++;
793}
794
795
796static inline void tcp_sync_left_out(struct tcp_sock *tp) 707static inline void tcp_sync_left_out(struct tcp_sock *tp)
797{ 708{
798 if (tp->rx_opt.sack_ok && 709 if (tp->rx_opt.sack_ok &&
@@ -801,34 +712,7 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp)
801 tp->left_out = tp->sacked_out + tp->lost_out; 712 tp->left_out = tp->sacked_out + tp->lost_out;
802} 713}
803 714
804/* Set slow start threshold and cwnd not falling to slow start */ 715extern void tcp_enter_cwr(struct sock *sk);
805static inline void __tcp_enter_cwr(struct sock *sk)
806{
807 const struct inet_connection_sock *icsk = inet_csk(sk);
808 struct tcp_sock *tp = tcp_sk(sk);
809
810 tp->undo_marker = 0;
811 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
812 tp->snd_cwnd = min(tp->snd_cwnd,
813 tcp_packets_in_flight(tp) + 1U);
814 tp->snd_cwnd_cnt = 0;
815 tp->high_seq = tp->snd_nxt;
816 tp->snd_cwnd_stamp = tcp_time_stamp;
817 TCP_ECN_queue_cwr(tp);
818}
819
820static inline void tcp_enter_cwr(struct sock *sk)
821{
822 struct tcp_sock *tp = tcp_sk(sk);
823
824 tp->prior_ssthresh = 0;
825 tp->bytes_acked = 0;
826 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
827 __tcp_enter_cwr(sk);
828 tcp_set_ca_state(sk, TCP_CA_CWR);
829 }
830}
831
832extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); 716extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
833 717
834/* Slow start with delack produces 3 packets of burst, so that 718/* Slow start with delack produces 3 packets of burst, so that
@@ -860,14 +744,14 @@ static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
860 return left <= tcp_max_burst(tp); 744 return left <= tcp_max_burst(tp);
861} 745}
862 746
863static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, 747static inline void tcp_minshall_update(struct tcp_sock *tp, int mss,
864 const struct sk_buff *skb) 748 const struct sk_buff *skb)
865{ 749{
866 if (skb->len < mss) 750 if (skb->len < mss)
867 tp->snd_sml = TCP_SKB_CB(skb)->end_seq; 751 tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
868} 752}
869 753
870static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) 754static inline void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
871{ 755{
872 const struct inet_connection_sock *icsk = inet_csk(sk); 756 const struct inet_connection_sock *icsk = inet_csk(sk);
873 if (!tp->packets_out && !icsk->icsk_pending) 757 if (!tp->packets_out && !icsk->icsk_pending)
@@ -875,18 +759,18 @@ static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *t
875 icsk->icsk_rto, TCP_RTO_MAX); 759 icsk->icsk_rto, TCP_RTO_MAX);
876} 760}
877 761
878static __inline__ void tcp_push_pending_frames(struct sock *sk, 762static inline void tcp_push_pending_frames(struct sock *sk,
879 struct tcp_sock *tp) 763 struct tcp_sock *tp)
880{ 764{
881 __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); 765 __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
882} 766}
883 767
884static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) 768static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
885{ 769{
886 tp->snd_wl1 = seq; 770 tp->snd_wl1 = seq;
887} 771}
888 772
889static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) 773static inline void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
890{ 774{
891 tp->snd_wl1 = seq; 775 tp->snd_wl1 = seq;
892} 776}
@@ -894,19 +778,19 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
894/* 778/*
895 * Calculate(/check) TCP checksum 779 * Calculate(/check) TCP checksum
896 */ 780 */
897static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len, 781static inline u16 tcp_v4_check(struct tcphdr *th, int len,
898 unsigned long saddr, unsigned long daddr, 782 unsigned long saddr, unsigned long daddr,
899 unsigned long base) 783 unsigned long base)
900{ 784{
901 return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); 785 return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
902} 786}
903 787
904static __inline__ int __tcp_checksum_complete(struct sk_buff *skb) 788static inline int __tcp_checksum_complete(struct sk_buff *skb)
905{ 789{
906 return __skb_checksum_complete(skb); 790 return __skb_checksum_complete(skb);
907} 791}
908 792
909static __inline__ int tcp_checksum_complete(struct sk_buff *skb) 793static inline int tcp_checksum_complete(struct sk_buff *skb)
910{ 794{
911 return skb->ip_summed != CHECKSUM_UNNECESSARY && 795 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
912 __tcp_checksum_complete(skb); 796 __tcp_checksum_complete(skb);
@@ -914,7 +798,7 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
914 798
915/* Prequeue for VJ style copy to user, combined with checksumming. */ 799/* Prequeue for VJ style copy to user, combined with checksumming. */
916 800
917static __inline__ void tcp_prequeue_init(struct tcp_sock *tp) 801static inline void tcp_prequeue_init(struct tcp_sock *tp)
918{ 802{
919 tp->ucopy.task = NULL; 803 tp->ucopy.task = NULL;
920 tp->ucopy.len = 0; 804 tp->ucopy.len = 0;
@@ -930,7 +814,7 @@ static __inline__ void tcp_prequeue_init(struct tcp_sock *tp)
930 * 814 *
931 * NOTE: is this not too big to inline? 815 * NOTE: is this not too big to inline?
932 */ 816 */
933static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) 817static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
934{ 818{
935 struct tcp_sock *tp = tcp_sk(sk); 819 struct tcp_sock *tp = tcp_sk(sk);
936 820
@@ -971,7 +855,7 @@ static const char *statename[]={
971}; 855};
972#endif 856#endif
973 857
974static __inline__ void tcp_set_state(struct sock *sk, int state) 858static inline void tcp_set_state(struct sock *sk, int state)
975{ 859{
976 int oldstate = sk->sk_state; 860 int oldstate = sk->sk_state;
977 861
@@ -1005,7 +889,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state)
1005#endif 889#endif
1006} 890}
1007 891
1008static __inline__ void tcp_done(struct sock *sk) 892static inline void tcp_done(struct sock *sk)
1009{ 893{
1010 tcp_set_state(sk, TCP_CLOSE); 894 tcp_set_state(sk, TCP_CLOSE);
1011 tcp_clear_xmit_timers(sk); 895 tcp_clear_xmit_timers(sk);
@@ -1018,81 +902,13 @@ static __inline__ void tcp_done(struct sock *sk)
1018 inet_csk_destroy_sock(sk); 902 inet_csk_destroy_sock(sk);
1019} 903}
1020 904
1021static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt) 905static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
1022{ 906{
1023 rx_opt->dsack = 0; 907 rx_opt->dsack = 0;
1024 rx_opt->eff_sacks = 0; 908 rx_opt->eff_sacks = 0;
1025 rx_opt->num_sacks = 0; 909 rx_opt->num_sacks = 0;
1026} 910}
1027 911
1028static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, __u32 tstamp)
1029{
1030 if (tp->rx_opt.tstamp_ok) {
1031 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
1032 (TCPOPT_NOP << 16) |
1033 (TCPOPT_TIMESTAMP << 8) |
1034 TCPOLEN_TIMESTAMP);
1035 *ptr++ = htonl(tstamp);
1036 *ptr++ = htonl(tp->rx_opt.ts_recent);
1037 }
1038 if (tp->rx_opt.eff_sacks) {
1039 struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
1040 int this_sack;
1041
1042 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
1043 (TCPOPT_NOP << 16) |
1044 (TCPOPT_SACK << 8) |
1045 (TCPOLEN_SACK_BASE +
1046 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)));
1047 for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
1048 *ptr++ = htonl(sp[this_sack].start_seq);
1049 *ptr++ = htonl(sp[this_sack].end_seq);
1050 }
1051 if (tp->rx_opt.dsack) {
1052 tp->rx_opt.dsack = 0;
1053 tp->rx_opt.eff_sacks--;
1054 }
1055 }
1056}
1057
1058/* Construct a tcp options header for a SYN or SYN_ACK packet.
1059 * If this is every changed make sure to change the definition of
1060 * MAX_SYN_SIZE to match the new maximum number of options that you
1061 * can generate.
1062 */
1063static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
1064 int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent)
1065{
1066 /* We always get an MSS option.
1067 * The option bytes which will be seen in normal data
1068 * packets should timestamps be used, must be in the MSS
1069 * advertised. But we subtract them from tp->mss_cache so
1070 * that calculations in tcp_sendmsg are simpler etc.
1071 * So account for this fact here if necessary. If we
1072 * don't do this correctly, as a receiver we won't
1073 * recognize data packets as being full sized when we
1074 * should, and thus we won't abide by the delayed ACK
1075 * rules correctly.
1076 * SACKs don't matter, we never delay an ACK when we
1077 * have any of those going out.
1078 */
1079 *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
1080 if (ts) {
1081 if(sack)
1082 *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
1083 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
1084 else
1085 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1086 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
1087 *ptr++ = htonl(tstamp); /* TSVAL */
1088 *ptr++ = htonl(ts_recent); /* TSECR */
1089 } else if(sack)
1090 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1091 (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
1092 if (offer_wscale)
1093 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
1094}
1095
1096/* Determine a window scaling and initial window to offer. */ 912/* Determine a window scaling and initial window to offer. */
1097extern void tcp_select_initial_window(int __space, __u32 mss, 913extern void tcp_select_initial_window(int __space, __u32 mss,
1098 __u32 *rcv_wnd, __u32 *window_clamp, 914 __u32 *rcv_wnd, __u32 *window_clamp,
@@ -1117,9 +933,9 @@ static inline int tcp_full_space(const struct sock *sk)
1117 return tcp_win_from_space(sk->sk_rcvbuf); 933 return tcp_win_from_space(sk->sk_rcvbuf);
1118} 934}
1119 935
1120static __inline__ void tcp_openreq_init(struct request_sock *req, 936static inline void tcp_openreq_init(struct request_sock *req,
1121 struct tcp_options_received *rx_opt, 937 struct tcp_options_received *rx_opt,
1122 struct sk_buff *skb) 938 struct sk_buff *skb)
1123{ 939{
1124 struct inet_request_sock *ireq = inet_rsk(req); 940 struct inet_request_sock *ireq = inet_rsk(req);
1125 941
diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
index b9d4176b2d15..b0b645988bd8 100644
--- a/include/net/tcp_states.h
+++ b/include/net/tcp_states.h
@@ -31,4 +31,20 @@ enum {
31 31
32#define TCP_STATE_MASK 0xF 32#define TCP_STATE_MASK 0xF
33 33
34#define TCP_ACTION_FIN (1 << 7)
35
36enum {
37 TCPF_ESTABLISHED = (1 << 1),
38 TCPF_SYN_SENT = (1 << 2),
39 TCPF_SYN_RECV = (1 << 3),
40 TCPF_FIN_WAIT1 = (1 << 4),
41 TCPF_FIN_WAIT2 = (1 << 5),
42 TCPF_TIME_WAIT = (1 << 6),
43 TCPF_CLOSE = (1 << 7),
44 TCPF_CLOSE_WAIT = (1 << 8),
45 TCPF_LAST_ACK = (1 << 9),
46 TCPF_LISTEN = (1 << 10),
47 TCPF_CLOSING = (1 << 11)
48};
49
34#endif /* _LINUX_TCP_STATES_H */ 50#endif /* _LINUX_TCP_STATES_H */
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
new file mode 100644
index 000000000000..2544281e1d5e
--- /dev/null
+++ b/include/net/timewait_sock.h
@@ -0,0 +1,31 @@
1/*
2 * NET Generic infrastructure for Network protocols.
3 *
4 * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#ifndef _TIMEWAIT_SOCK_H
12#define _TIMEWAIT_SOCK_H
13
14#include <linux/slab.h>
15#include <net/sock.h>
16
17struct timewait_sock_ops {
18 kmem_cache_t *twsk_slab;
19 unsigned int twsk_obj_size;
20 int (*twsk_unique)(struct sock *sk,
21 struct sock *sktw, void *twp);
22};
23
24static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
25{
26 if (sk->sk_prot->twsk_prot->twsk_unique != NULL)
27 return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
28 return 0;
29}
30
31#endif /* _TIMEWAIT_SOCK_H */
diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index 4e86f2de6638..61f724c1036f 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -44,7 +44,7 @@ extern int datagram_send_ctl(struct msghdr *msg,
44/* 44/*
45 * address family specific functions 45 * address family specific functions
46 */ 46 */
47extern struct tcp_func ipv4_specific; 47extern struct inet_connection_sock_af_ops ipv4_specific;
48 48
49extern int inet6_destroy_sock(struct sock *sk); 49extern int inet6_destroy_sock(struct sock *sk);
50 50
diff --git a/include/net/udp.h b/include/net/udp.h
index 107b9d791a1f..766fba1369ce 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -22,9 +22,8 @@
22#ifndef _UDP_H 22#ifndef _UDP_H
23#define _UDP_H 23#define _UDP_H
24 24
25#include <linux/udp.h>
26#include <linux/ip.h>
27#include <linux/list.h> 25#include <linux/list.h>
26#include <net/inet_sock.h>
28#include <net/sock.h> 27#include <net/sock.h>
29#include <net/snmp.h> 28#include <net/snmp.h>
30#include <linux/seq_file.h> 29#include <linux/seq_file.h>
@@ -62,6 +61,7 @@ static inline int udp_lport_inuse(u16 num)
62 61
63extern struct proto udp_prot; 62extern struct proto udp_prot;
64 63
64struct sk_buff;
65 65
66extern void udp_err(struct sk_buff *, u32); 66extern void udp_err(struct sk_buff *, u32);
67 67
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 1cdb87912137..07d7b50cdd76 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2,11 +2,12 @@
2#define _NET_XFRM_H 2#define _NET_XFRM_H
3 3
4#include <linux/compiler.h> 4#include <linux/compiler.h>
5#include <linux/in.h>
5#include <linux/xfrm.h> 6#include <linux/xfrm.h>
6#include <linux/spinlock.h> 7#include <linux/spinlock.h>
7#include <linux/list.h> 8#include <linux/list.h>
8#include <linux/skbuff.h> 9#include <linux/skbuff.h>
9#include <linux/netdevice.h> 10#include <linux/socket.h>
10#include <linux/crypto.h> 11#include <linux/crypto.h>
11#include <linux/pfkeyv2.h> 12#include <linux/pfkeyv2.h>
12#include <linux/in6.h> 13#include <linux/in6.h>
@@ -144,6 +145,9 @@ struct xfrm_state
144 * transformer. */ 145 * transformer. */
145 struct xfrm_type *type; 146 struct xfrm_type *type;
146 147
148 /* Security context */
149 struct xfrm_sec_ctx *security;
150
147 /* Private data of this transformer, format is opaque, 151 /* Private data of this transformer, format is opaque,
148 * interpreted by xfrm_type methods. */ 152 * interpreted by xfrm_type methods. */
149 void *data; 153 void *data;
@@ -298,6 +302,7 @@ struct xfrm_policy
298 __u8 flags; 302 __u8 flags;
299 __u8 dead; 303 __u8 dead;
300 __u8 xfrm_nr; 304 __u8 xfrm_nr;
305 struct xfrm_sec_ctx *security;
301 struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; 306 struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH];
302}; 307};
303 308
@@ -510,6 +515,25 @@ xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
510 return 0; 515 return 0;
511} 516}
512 517
518#ifdef CONFIG_SECURITY_NETWORK_XFRM
519/* If neither has a context --> match
520 * Otherwise, both must have a context and the sids, doi, alg must match
521 */
522static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
523{
524 return ((!s1 && !s2) ||
525 (s1 && s2 &&
526 (s1->ctx_sid == s2->ctx_sid) &&
527 (s1->ctx_doi == s2->ctx_doi) &&
528 (s1->ctx_alg == s2->ctx_alg)));
529}
530#else
531static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
532{
533 return 1;
534}
535#endif
536
513/* A struct encoding bundle of transformations to apply to some set of flow. 537/* A struct encoding bundle of transformations to apply to some set of flow.
514 * 538 *
515 * dst->child points to the next element of bundle. 539 * dst->child points to the next element of bundle.
@@ -878,8 +902,8 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig
878struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp); 902struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
879extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *); 903extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *);
880int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); 904int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
881struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, 905struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
882 int delete); 906 struct xfrm_sec_ctx *ctx, int delete);
883struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete); 907struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete);
884void xfrm_policy_flush(void); 908void xfrm_policy_flush(void);
885u32 xfrm_get_acqseq(void); 909u32 xfrm_get_acqseq(void);
diff --git a/init/main.c b/init/main.c
index 27f97f9b4636..54aaf561cf66 100644
--- a/init/main.c
+++ b/init/main.c
@@ -47,7 +47,6 @@
47#include <linux/rmap.h> 47#include <linux/rmap.h>
48#include <linux/mempolicy.h> 48#include <linux/mempolicy.h>
49#include <linux/key.h> 49#include <linux/key.h>
50#include <net/sock.h>
51 50
52#include <asm/io.h> 51#include <asm/io.h>
53#include <asm/bugs.h> 52#include <asm/bugs.h>
@@ -614,9 +613,6 @@ static void __init do_basic_setup(void)
614 sysctl_init(); 613 sysctl_init();
615#endif 614#endif
616 615
617 /* Networking initialization needs a process context */
618 sock_init();
619
620 do_initcalls(); 616 do_initcalls();
621} 617}
622 618
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 7982656b9c83..a5144e43aae1 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -63,7 +63,7 @@
63#include <linux/atalk.h> 63#include <linux/atalk.h>
64 64
65struct datalink_proto *ddp_dl, *aarp_dl; 65struct datalink_proto *ddp_dl, *aarp_dl;
66static struct proto_ops atalk_dgram_ops; 66static const struct proto_ops atalk_dgram_ops;
67 67
68/**************************************************************************\ 68/**************************************************************************\
69* * 69* *
@@ -1763,7 +1763,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
1763 */ 1763 */
1764static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 1764static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1765{ 1765{
1766 int rc = -EINVAL; 1766 int rc = -ENOIOCTLCMD;
1767 struct sock *sk = sock->sk; 1767 struct sock *sk = sock->sk;
1768 void __user *argp = (void __user *)arg; 1768 void __user *argp = (void __user *)arg;
1769 1769
@@ -1813,23 +1813,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1813 rc = atif_ioctl(cmd, argp); 1813 rc = atif_ioctl(cmd, argp);
1814 rtnl_unlock(); 1814 rtnl_unlock();
1815 break; 1815 break;
1816 /* Physical layer ioctl calls */
1817 case SIOCSIFLINK:
1818 case SIOCGIFHWADDR:
1819 case SIOCSIFHWADDR:
1820 case SIOCGIFFLAGS:
1821 case SIOCSIFFLAGS:
1822 case SIOCGIFTXQLEN:
1823 case SIOCSIFTXQLEN:
1824 case SIOCGIFMTU:
1825 case SIOCGIFCONF:
1826 case SIOCADDMULTI:
1827 case SIOCDELMULTI:
1828 case SIOCGIFCOUNT:
1829 case SIOCGIFINDEX:
1830 case SIOCGIFNAME:
1831 rc = dev_ioctl(cmd, argp);
1832 break;
1833 } 1816 }
1834 1817
1835 return rc; 1818 return rc;
@@ -1841,7 +1824,7 @@ static struct net_proto_family atalk_family_ops = {
1841 .owner = THIS_MODULE, 1824 .owner = THIS_MODULE,
1842}; 1825};
1843 1826
1844static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { 1827static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
1845 .family = PF_APPLETALK, 1828 .family = PF_APPLETALK,
1846 .owner = THIS_MODULE, 1829 .owner = THIS_MODULE,
1847 .release = atalk_release, 1830 .release = atalk_release,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2684a92da22b..f2c541774dcd 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr,
102} 102}
103 103
104 104
105static struct proto_ops pvc_proto_ops = { 105static const struct proto_ops pvc_proto_ops = {
106 .family = PF_ATMPVC, 106 .family = PF_ATMPVC,
107 .owner = THIS_MODULE, 107 .owner = THIS_MODULE,
108 108
diff --git a/net/atm/svc.c b/net/atm/svc.c
index d7b266136bf6..3a180cfd7b48 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
613 return error; 613 return error;
614} 614}
615 615
616static struct proto_ops svc_proto_ops = { 616static const struct proto_ops svc_proto_ops = {
617 .family = PF_ATMSVC, 617 .family = PF_ATMSVC,
618 .owner = THIS_MODULE, 618 .owner = THIS_MODULE,
619 619
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 1b683f302657..e8753c7fcad1 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -54,7 +54,7 @@
54HLIST_HEAD(ax25_list); 54HLIST_HEAD(ax25_list);
55DEFINE_SPINLOCK(ax25_list_lock); 55DEFINE_SPINLOCK(ax25_list_lock);
56 56
57static struct proto_ops ax25_proto_ops; 57static const struct proto_ops ax25_proto_ops;
58 58
59static void ax25_free_sock(struct sock *sk) 59static void ax25_free_sock(struct sock *sk)
60{ 60{
@@ -1827,7 +1827,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1827 break; 1827 break;
1828 1828
1829 default: 1829 default:
1830 res = dev_ioctl(cmd, argp); 1830 res = -ENOIOCTLCMD;
1831 break; 1831 break;
1832 } 1832 }
1833 release_sock(sk); 1833 release_sock(sk);
@@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = {
1944 .owner = THIS_MODULE, 1944 .owner = THIS_MODULE,
1945}; 1945};
1946 1946
1947static struct proto_ops ax25_proto_ops = { 1947static const struct proto_ops ax25_proto_ops = {
1948 .family = PF_AX25, 1948 .family = PF_AX25,
1949 .owner = THIS_MODULE, 1949 .owner = THIS_MODULE,
1950 .release = ax25_release, 1950 .release = ax25_release,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index ea616e3fc98e..fb031fe9be9e 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
287 timeo = schedule_timeout(timeo); 287 timeo = schedule_timeout(timeo);
288 lock_sock(sk); 288 lock_sock(sk);
289 289
290 if (sk->sk_err) { 290 err = sock_error(sk);
291 err = sock_error(sk); 291 if (err)
292 break; 292 break;
293 }
294 } 293 }
295 set_current_state(TASK_RUNNING); 294 set_current_state(TASK_RUNNING);
296 remove_wait_queue(sk->sk_sleep, &wait); 295 remove_wait_queue(sk->sk_sleep, &wait);
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 9778c6acd53b..ccbaf69afc5b 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
146 return 0; 146 return 0;
147} 147}
148 148
149static struct proto_ops bnep_sock_ops = { 149static const struct proto_ops bnep_sock_ops = {
150 .family = PF_BLUETOOTH, 150 .family = PF_BLUETOOTH,
151 .owner = THIS_MODULE, 151 .owner = THIS_MODULE,
152 .release = bnep_sock_release, 152 .release = bnep_sock_release,
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index beb045bf5714..5e22343b6090 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
137 return -EINVAL; 137 return -EINVAL;
138} 138}
139 139
140static struct proto_ops cmtp_sock_ops = { 140static const struct proto_ops cmtp_sock_ops = {
141 .family = PF_BLUETOOTH, 141 .family = PF_BLUETOOTH,
142 .owner = THIS_MODULE, 142 .owner = THIS_MODULE,
143 .release = cmtp_sock_release, 143 .release = cmtp_sock_release,
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1d6d0a15c099..84e6c93a044a 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char
575 return 0; 575 return 0;
576} 576}
577 577
578static struct proto_ops hci_sock_ops = { 578static const struct proto_ops hci_sock_ops = {
579 .family = PF_BLUETOOTH, 579 .family = PF_BLUETOOTH,
580 .owner = THIS_MODULE, 580 .owner = THIS_MODULE,
581 .release = hci_sock_release, 581 .release = hci_sock_release,
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index f8986f881431..8f8dd931b294 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
143 return -EINVAL; 143 return -EINVAL;
144} 144}
145 145
146static struct proto_ops hidp_sock_ops = { 146static const struct proto_ops hidp_sock_ops = {
147 .family = PF_BLUETOOTH, 147 .family = PF_BLUETOOTH,
148 .owner = THIS_MODULE, 148 .owner = THIS_MODULE,
149 .release = hidp_sock_release, 149 .release = hidp_sock_release,
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index e3bb11ca4235..7f0781e4326f 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -57,7 +57,7 @@
57 57
58#define VERSION "2.8" 58#define VERSION "2.8"
59 59
60static struct proto_ops l2cap_sock_ops; 60static const struct proto_ops l2cap_sock_ops;
61 61
62static struct bt_sock_list l2cap_sk_list = { 62static struct bt_sock_list l2cap_sk_list = {
63 .lock = RW_LOCK_UNLOCKED 63 .lock = RW_LOCK_UNLOCKED
@@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
767 767
768 BT_DBG("sock %p, sk %p", sock, sk); 768 BT_DBG("sock %p, sk %p", sock, sk);
769 769
770 if (sk->sk_err) 770 err = sock_error(sk);
771 return sock_error(sk); 771 if (err)
772 return err;
772 773
773 if (msg->msg_flags & MSG_OOB) 774 if (msg->msg_flags & MSG_OOB)
774 return -EOPNOTSUPP; 775 return -EOPNOTSUPP;
@@ -2160,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
2160 2161
2161static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); 2162static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
2162 2163
2163static struct proto_ops l2cap_sock_ops = { 2164static const struct proto_ops l2cap_sock_ops = {
2164 .family = PF_BLUETOOTH, 2165 .family = PF_BLUETOOTH,
2165 .owner = THIS_MODULE, 2166 .owner = THIS_MODULE,
2166 .release = l2cap_sock_release, 2167 .release = l2cap_sock_release,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 6c34261b232e..757d2dd3b02f 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -58,7 +58,7 @@
58#define BT_DBG(D...) 58#define BT_DBG(D...)
59#endif 59#endif
60 60
61static struct proto_ops rfcomm_sock_ops; 61static const struct proto_ops rfcomm_sock_ops;
62 62
63static struct bt_sock_list rfcomm_sk_list = { 63static struct bt_sock_list rfcomm_sk_list = {
64 .lock = RW_LOCK_UNLOCKED 64 .lock = RW_LOCK_UNLOCKED
@@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf)
907 907
908static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); 908static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
909 909
910static struct proto_ops rfcomm_sock_ops = { 910static const struct proto_ops rfcomm_sock_ops = {
911 .family = PF_BLUETOOTH, 911 .family = PF_BLUETOOTH,
912 .owner = THIS_MODULE, 912 .owner = THIS_MODULE,
913 .release = rfcomm_sock_release, 913 .release = rfcomm_sock_release,
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 9cb00dc6c08c..6b61323ce23c 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -56,7 +56,7 @@
56 56
57#define VERSION "0.5" 57#define VERSION "0.5"
58 58
59static struct proto_ops sco_sock_ops; 59static const struct proto_ops sco_sock_ops;
60 60
61static struct bt_sock_list sco_sk_list = { 61static struct bt_sock_list sco_sk_list = {
62 .lock = RW_LOCK_UNLOCKED 62 .lock = RW_LOCK_UNLOCKED
@@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
637 637
638 BT_DBG("sock %p, sk %p", sock, sk); 638 BT_DBG("sock %p, sk %p", sock, sk);
639 639
640 if (sk->sk_err) 640 err = sock_error(sk);
641 return sock_error(sk); 641 if (err)
642 return err;
642 643
643 if (msg->msg_flags & MSG_OOB) 644 if (msg->msg_flags & MSG_OOB)
644 return -EOPNOTSUPP; 645 return -EOPNOTSUPP;
@@ -913,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf)
913 914
914static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); 915static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
915 916
916static struct proto_ops sco_sock_ops = { 917static const struct proto_ops sco_sock_ops = {
917 .family = PF_BLUETOOTH, 918 .family = PF_BLUETOOTH,
918 .owner = THIS_MODULE, 919 .owner = THIS_MODULE,
919 .release = sco_sock_release, 920 .release = sco_sock_release,
diff --git a/net/bridge/br.c b/net/bridge/br.c
index f8f184942aaf..188cc1ac49eb 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook);
67module_init(br_init) 67module_init(br_init)
68module_exit(br_deinit) 68module_exit(br_deinit)
69MODULE_LICENSE("GPL"); 69MODULE_LICENSE("GPL");
70MODULE_VERSION(BR_VERSION);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f564ee99782d..0b33a7b3a00c 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -15,7 +15,9 @@
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/netdevice.h> 17#include <linux/netdevice.h>
18#include <linux/module.h> 18#include <linux/etherdevice.h>
19#include <linux/ethtool.h>
20
19#include <asm/uaccess.h> 21#include <asm/uaccess.h>
20#include "br_private.h" 22#include "br_private.h"
21 23
@@ -82,6 +84,87 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
82 return 0; 84 return 0;
83} 85}
84 86
87/* Allow setting mac address of pseudo-bridge to be same as
88 * any of the bound interfaces
89 */
90static int br_set_mac_address(struct net_device *dev, void *p)
91{
92 struct net_bridge *br = netdev_priv(dev);
93 struct sockaddr *addr = p;
94 struct net_bridge_port *port;
95 int err = -EADDRNOTAVAIL;
96
97 spin_lock_bh(&br->lock);
98 list_for_each_entry(port, &br->port_list, list) {
99 if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) {
100 br_stp_change_bridge_id(br, addr->sa_data);
101 err = 0;
102 break;
103 }
104 }
105 spin_unlock_bh(&br->lock);
106
107 return err;
108}
109
110static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
111{
112 strcpy(info->driver, "bridge");
113 strcpy(info->version, BR_VERSION);
114 strcpy(info->fw_version, "N/A");
115 strcpy(info->bus_info, "N/A");
116}
117
118static int br_set_sg(struct net_device *dev, u32 data)
119{
120 struct net_bridge *br = netdev_priv(dev);
121
122 if (data)
123 br->feature_mask |= NETIF_F_SG;
124 else
125 br->feature_mask &= ~NETIF_F_SG;
126
127 br_features_recompute(br);
128 return 0;
129}
130
131static int br_set_tso(struct net_device *dev, u32 data)
132{
133 struct net_bridge *br = netdev_priv(dev);
134
135 if (data)
136 br->feature_mask |= NETIF_F_TSO;
137 else
138 br->feature_mask &= ~NETIF_F_TSO;
139
140 br_features_recompute(br);
141 return 0;
142}
143
144static int br_set_tx_csum(struct net_device *dev, u32 data)
145{
146 struct net_bridge *br = netdev_priv(dev);
147
148 if (data)
149 br->feature_mask |= NETIF_F_IP_CSUM;
150 else
151 br->feature_mask &= ~NETIF_F_IP_CSUM;
152
153 br_features_recompute(br);
154 return 0;
155}
156
157static struct ethtool_ops br_ethtool_ops = {
158 .get_drvinfo = br_getinfo,
159 .get_link = ethtool_op_get_link,
160 .get_sg = ethtool_op_get_sg,
161 .set_sg = br_set_sg,
162 .get_tx_csum = ethtool_op_get_tx_csum,
163 .set_tx_csum = br_set_tx_csum,
164 .get_tso = ethtool_op_get_tso,
165 .set_tso = br_set_tso,
166};
167
85void br_dev_setup(struct net_device *dev) 168void br_dev_setup(struct net_device *dev)
86{ 169{
87 memset(dev->dev_addr, 0, ETH_ALEN); 170 memset(dev->dev_addr, 0, ETH_ALEN);
@@ -96,8 +179,12 @@ void br_dev_setup(struct net_device *dev)
96 dev->change_mtu = br_change_mtu; 179 dev->change_mtu = br_change_mtu;
97 dev->destructor = free_netdev; 180 dev->destructor = free_netdev;
98 SET_MODULE_OWNER(dev); 181 SET_MODULE_OWNER(dev);
182 SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
99 dev->stop = br_dev_stop; 183 dev->stop = br_dev_stop;
100 dev->tx_queue_len = 0; 184 dev->tx_queue_len = 0;
101 dev->set_mac_address = NULL; 185 dev->set_mac_address = br_set_mac_address;
102 dev->priv_flags = IFF_EBRIDGE; 186 dev->priv_flags = IFF_EBRIDGE;
187
188 dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
189 | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
103} 190}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 975abe254b7a..11321197338e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -32,9 +32,8 @@
32 * ethtool, use ethtool_ops. Also, since driver might sleep need to 32 * ethtool, use ethtool_ops. Also, since driver might sleep need to
33 * not be holding any locks. 33 * not be holding any locks.
34 */ 34 */
35static int br_initial_port_cost(struct net_device *dev) 35static int port_cost(struct net_device *dev)
36{ 36{
37
38 struct ethtool_cmd ecmd = { ETHTOOL_GSET }; 37 struct ethtool_cmd ecmd = { ETHTOOL_GSET };
39 struct ifreq ifr; 38 struct ifreq ifr;
40 mm_segment_t old_fs; 39 mm_segment_t old_fs;
@@ -58,10 +57,6 @@ static int br_initial_port_cost(struct net_device *dev)
58 return 2; 57 return 2;
59 case SPEED_10: 58 case SPEED_10:
60 return 100; 59 return 100;
61 default:
62 pr_info("bridge: can't decode speed from %s: %d\n",
63 dev->name, ecmd.speed);
64 return 100;
65 } 60 }
66 } 61 }
67 62
@@ -75,6 +70,35 @@ static int br_initial_port_cost(struct net_device *dev)
75 return 100; /* assume old 10Mbps */ 70 return 100; /* assume old 10Mbps */
76} 71}
77 72
73
74/*
75 * Check for port carrier transistions.
76 * Called from work queue to allow for calling functions that
77 * might sleep (such as speed check), and to debounce.
78 */
79static void port_carrier_check(void *arg)
80{
81 struct net_bridge_port *p = arg;
82
83 rtnl_lock();
84 if (netif_carrier_ok(p->dev)) {
85 u32 cost = port_cost(p->dev);
86
87 spin_lock_bh(&p->br->lock);
88 if (p->state == BR_STATE_DISABLED) {
89 p->path_cost = cost;
90 br_stp_enable_port(p);
91 }
92 spin_unlock_bh(&p->br->lock);
93 } else {
94 spin_lock_bh(&p->br->lock);
95 if (p->state != BR_STATE_DISABLED)
96 br_stp_disable_port(p);
97 spin_unlock_bh(&p->br->lock);
98 }
99 rtnl_unlock();
100}
101
78static void destroy_nbp(struct net_bridge_port *p) 102static void destroy_nbp(struct net_bridge_port *p)
79{ 103{
80 struct net_device *dev = p->dev; 104 struct net_device *dev = p->dev;
@@ -102,6 +126,9 @@ static void del_nbp(struct net_bridge_port *p)
102 dev->br_port = NULL; 126 dev->br_port = NULL;
103 dev_set_promiscuity(dev, -1); 127 dev_set_promiscuity(dev, -1);
104 128
129 cancel_delayed_work(&p->carrier_check);
130 flush_scheduled_work();
131
105 spin_lock_bh(&br->lock); 132 spin_lock_bh(&br->lock);
106 br_stp_disable_port(p); 133 br_stp_disable_port(p);
107 spin_unlock_bh(&br->lock); 134 spin_unlock_bh(&br->lock);
@@ -155,6 +182,7 @@ static struct net_device *new_bridge_dev(const char *name)
155 br->bridge_id.prio[1] = 0x00; 182 br->bridge_id.prio[1] = 0x00;
156 memset(br->bridge_id.addr, 0, ETH_ALEN); 183 memset(br->bridge_id.addr, 0, ETH_ALEN);
157 184
185 br->feature_mask = dev->features;
158 br->stp_enabled = 0; 186 br->stp_enabled = 0;
159 br->designated_root = br->bridge_id; 187 br->designated_root = br->bridge_id;
160 br->root_path_cost = 0; 188 br->root_path_cost = 0;
@@ -195,10 +223,9 @@ static int find_portno(struct net_bridge *br)
195 return (index >= BR_MAX_PORTS) ? -EXFULL : index; 223 return (index >= BR_MAX_PORTS) ? -EXFULL : index;
196} 224}
197 225
198/* called with RTNL */ 226/* called with RTNL but without bridge lock */
199static struct net_bridge_port *new_nbp(struct net_bridge *br, 227static struct net_bridge_port *new_nbp(struct net_bridge *br,
200 struct net_device *dev, 228 struct net_device *dev)
201 unsigned long cost)
202{ 229{
203 int index; 230 int index;
204 struct net_bridge_port *p; 231 struct net_bridge_port *p;
@@ -215,12 +242,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
215 p->br = br; 242 p->br = br;
216 dev_hold(dev); 243 dev_hold(dev);
217 p->dev = dev; 244 p->dev = dev;
218 p->path_cost = cost; 245 p->path_cost = port_cost(dev);
219 p->priority = 0x8000 >> BR_PORT_BITS; 246 p->priority = 0x8000 >> BR_PORT_BITS;
220 dev->br_port = p; 247 dev->br_port = p;
221 p->port_no = index; 248 p->port_no = index;
222 br_init_port(p); 249 br_init_port(p);
223 p->state = BR_STATE_DISABLED; 250 p->state = BR_STATE_DISABLED;
251 INIT_WORK(&p->carrier_check, port_carrier_check, p);
224 kobject_init(&p->kobj); 252 kobject_init(&p->kobj);
225 253
226 return p; 254 return p;
@@ -322,9 +350,8 @@ void br_features_recompute(struct net_bridge *br)
322 struct net_bridge_port *p; 350 struct net_bridge_port *p;
323 unsigned long features, checksum; 351 unsigned long features, checksum;
324 352
325 features = NETIF_F_SG | NETIF_F_FRAGLIST 353 features = br->feature_mask &~ NETIF_F_IP_CSUM;
326 | NETIF_F_HIGHDMA | NETIF_F_TSO; 354 checksum = br->feature_mask & NETIF_F_IP_CSUM;
327 checksum = NETIF_F_IP_CSUM; /* least commmon subset */
328 355
329 list_for_each_entry(p, &br->port_list, list) { 356 list_for_each_entry(p, &br->port_list, list) {
330 if (!(p->dev->features 357 if (!(p->dev->features
@@ -351,7 +378,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
351 if (dev->br_port != NULL) 378 if (dev->br_port != NULL)
352 return -EBUSY; 379 return -EBUSY;
353 380
354 if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev)))) 381 if (IS_ERR(p = new_nbp(br, dev)))
355 return PTR_ERR(p); 382 return PTR_ERR(p);
356 383
357 if ((err = br_fdb_insert(br, p, dev->dev_addr))) 384 if ((err = br_fdb_insert(br, p, dev->dev_addr)))
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index b88220a64cd8..c387852f753a 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb)
53 /* insert into forwarding database after filtering to avoid spoofing */ 53 /* insert into forwarding database after filtering to avoid spoofing */
54 br_fdb_update(p->br, p, eth_hdr(skb)->h_source); 54 br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
55 55
56 if (p->state == BR_STATE_LEARNING) {
57 kfree_skb(skb);
58 goto out;
59 }
60
56 if (br->dev->flags & IFF_PROMISC) { 61 if (br->dev->flags & IFF_PROMISC) {
57 struct sk_buff *skb2; 62 struct sk_buff *skb2;
58 63
@@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
107 if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) 112 if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
108 goto err; 113 goto err;
109 114
110 if (p->state == BR_STATE_LEARNING)
111 br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
112
113 if (p->br->stp_enabled && 115 if (p->br->stp_enabled &&
114 !memcmp(dest, bridge_ula, 5) && 116 !memcmp(dest, bridge_ula, 5) &&
115 !(dest[5] & 0xF0)) { 117 !(dest[5] & 0xF0)) {
@@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
118 NULL, br_stp_handle_bpdu); 120 NULL, br_stp_handle_bpdu);
119 return 1; 121 return 1;
120 } 122 }
123 goto err;
121 } 124 }
122 125
123 else if (p->state == BR_STATE_FORWARDING) { 126 if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) {
124 if (br_should_route_hook) { 127 if (br_should_route_hook) {
125 if (br_should_route_hook(pskb)) 128 if (br_should_route_hook(pskb))
126 return 0; 129 return 0;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 23422bd53a5e..223f8270daee 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -26,6 +26,7 @@
26#include <linux/ip.h> 26#include <linux/ip.h>
27#include <linux/netdevice.h> 27#include <linux/netdevice.h>
28#include <linux/skbuff.h> 28#include <linux/skbuff.h>
29#include <linux/if_arp.h>
29#include <linux/if_ether.h> 30#include <linux/if_ether.h>
30#include <linux/if_vlan.h> 31#include <linux/if_vlan.h>
31#include <linux/netfilter_bridge.h> 32#include <linux/netfilter_bridge.h>
@@ -33,8 +34,11 @@
33#include <linux/netfilter_ipv6.h> 34#include <linux/netfilter_ipv6.h>
34#include <linux/netfilter_arp.h> 35#include <linux/netfilter_arp.h>
35#include <linux/in_route.h> 36#include <linux/in_route.h>
37
36#include <net/ip.h> 38#include <net/ip.h>
37#include <net/ipv6.h> 39#include <net/ipv6.h>
40#include <net/route.h>
41
38#include <asm/uaccess.h> 42#include <asm/uaccess.h>
39#include <asm/checksum.h> 43#include <asm/checksum.h>
40#include "br_private.h" 44#include "br_private.h"
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 917311c6828b..a43a9c1d50d7 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
52 br_stp_recalculate_bridge_id(br); 52 br_stp_recalculate_bridge_id(br);
53 break; 53 break;
54 54
55 case NETDEV_CHANGE: /* device is up but carrier changed */ 55 case NETDEV_CHANGE:
56 if (!(br->dev->flags & IFF_UP)) 56 if (br->dev->flags & IFF_UP)
57 break; 57 schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE);
58
59 if (netif_carrier_ok(dev)) {
60 if (p->state == BR_STATE_DISABLED)
61 br_stp_enable_port(p);
62 } else {
63 if (p->state != BR_STATE_DISABLED)
64 br_stp_disable_port(p);
65 }
66 break; 58 break;
67 59
68 case NETDEV_FEAT_CHANGE: 60 case NETDEV_FEAT_CHANGE:
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index bdf95a74d8cd..c5bd631ffcd5 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -27,6 +27,10 @@
27#define BR_PORT_BITS 10 27#define BR_PORT_BITS 10
28#define BR_MAX_PORTS (1<<BR_PORT_BITS) 28#define BR_MAX_PORTS (1<<BR_PORT_BITS)
29 29
30#define BR_PORT_DEBOUNCE (HZ/10)
31
32#define BR_VERSION "2.1"
33
30typedef struct bridge_id bridge_id; 34typedef struct bridge_id bridge_id;
31typedef struct mac_addr mac_addr; 35typedef struct mac_addr mac_addr;
32typedef __u16 port_id; 36typedef __u16 port_id;
@@ -78,6 +82,7 @@ struct net_bridge_port
78 struct timer_list hold_timer; 82 struct timer_list hold_timer;
79 struct timer_list message_age_timer; 83 struct timer_list message_age_timer;
80 struct kobject kobj; 84 struct kobject kobj;
85 struct work_struct carrier_check;
81 struct rcu_head rcu; 86 struct rcu_head rcu;
82}; 87};
83 88
@@ -90,6 +95,7 @@ struct net_bridge
90 spinlock_t hash_lock; 95 spinlock_t hash_lock;
91 struct hlist_head hash[BR_HASH_SIZE]; 96 struct hlist_head hash[BR_HASH_SIZE];
92 struct list_head age_list; 97 struct list_head age_list;
98 unsigned long feature_mask;
93 99
94 /* STP */ 100 /* STP */
95 bridge_id designated_root; 101 bridge_id designated_root;
@@ -201,6 +207,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br);
201extern void br_stp_enable_port(struct net_bridge_port *p); 207extern void br_stp_enable_port(struct net_bridge_port *p);
202extern void br_stp_disable_port(struct net_bridge_port *p); 208extern void br_stp_disable_port(struct net_bridge_port *p);
203extern void br_stp_recalculate_bridge_id(struct net_bridge *br); 209extern void br_stp_recalculate_bridge_id(struct net_bridge *br);
210extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
204extern void br_stp_set_bridge_priority(struct net_bridge *br, 211extern void br_stp_set_bridge_priority(struct net_bridge *br,
205 u16 newprio); 212 u16 newprio);
206extern void br_stp_set_port_priority(struct net_bridge_port *p, 213extern void br_stp_set_port_priority(struct net_bridge_port *p,
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index ac09b6a23523..cc047f7fb6ef 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p)
120} 120}
121 121
122/* called under bridge lock */ 122/* called under bridge lock */
123static void br_stp_change_bridge_id(struct net_bridge *br, 123void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
124 const unsigned char *addr)
125{ 124{
126 unsigned char oldaddr[6]; 125 unsigned char oldaddr[6];
127 struct net_bridge_port *p; 126 struct net_bridge_port *p;
@@ -158,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br)
158 157
159 list_for_each_entry(p, &br->port_list, list) { 158 list_for_each_entry(p, &br->port_list, list) {
160 if (addr == br_mac_zero || 159 if (addr == br_mac_zero ||
161 compare_ether_addr(p->dev->dev_addr, addr) < 0) 160 memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
162 addr = p->dev->dev_addr; 161 addr = p->dev->dev_addr;
163 162
164 } 163 }
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index c70b3be23026..b84fc6075fe1 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG
196 To compile it as a module, choose M here. If unsure, say N. 196 To compile it as a module, choose M here. If unsure, say N.
197 197
198config BRIDGE_EBT_ULOG 198config BRIDGE_EBT_ULOG
199 tristate "ebt: ulog support" 199 tristate "ebt: ulog support (OBSOLETE)"
200 depends on BRIDGE_NF_EBTABLES 200 depends on BRIDGE_NF_EBTABLES
201 help 201 help
202 This option enables the old bridge-specific "ebt_ulog" implementation
203 which has been obsoleted by the new "nfnetlink_log" code (see
204 CONFIG_NETFILTER_NETLINK_LOG).
205
202 This option adds the ulog watcher, that you can use in any rule 206 This option adds the ulog watcher, that you can use in any rule
203 in any ebtables table. The packet is passed to a userspace 207 in any ebtables table. The packet is passed to a userspace
204 logging daemon using netlink multicast sockets. This differs 208 logging daemon using netlink multicast sockets. This differs
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 662975be3d1d..9f6e0193ae10 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -3,13 +3,16 @@
3 * 3 *
4 * Authors: 4 * Authors:
5 * Bart De Schuymer <bdschuym@pandora.be> 5 * Bart De Schuymer <bdschuym@pandora.be>
6 * Harald Welte <laforge@netfilter.org>
6 * 7 *
7 * April, 2002 8 * April, 2002
8 * 9 *
9 */ 10 */
10 11
12#include <linux/in.h>
11#include <linux/netfilter_bridge/ebtables.h> 13#include <linux/netfilter_bridge/ebtables.h>
12#include <linux/netfilter_bridge/ebt_log.h> 14#include <linux/netfilter_bridge/ebt_log.h>
15#include <linux/netfilter.h>
13#include <linux/module.h> 16#include <linux/module.h>
14#include <linux/ip.h> 17#include <linux/ip.h>
15#include <linux/if_arp.h> 18#include <linux/if_arp.h>
@@ -55,27 +58,30 @@ static void print_MAC(unsigned char *p)
55} 58}
56 59
57#define myNIPQUAD(a) a[0], a[1], a[2], a[3] 60#define myNIPQUAD(a) a[0], a[1], a[2], a[3]
58static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, 61static void
59 const struct net_device *in, const struct net_device *out, 62ebt_log_packet(unsigned int pf, unsigned int hooknum,
60 const void *data, unsigned int datalen) 63 const struct sk_buff *skb, const struct net_device *in,
64 const struct net_device *out, const struct nf_loginfo *loginfo,
65 const char *prefix)
61{ 66{
62 struct ebt_log_info *info = (struct ebt_log_info *)data; 67 unsigned int bitmask;
63 char level_string[4] = "< >";
64 68
65 level_string[1] = '0' + info->loglevel;
66 spin_lock_bh(&ebt_log_lock); 69 spin_lock_bh(&ebt_log_lock);
67 printk(level_string); 70 printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level,
68 printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "", 71 prefix, in ? in->name : "", out ? out->name : "");
69 out ? out->name : "");
70 72
71 printk("MAC source = ");
72 print_MAC(eth_hdr(skb)->h_source); 73 print_MAC(eth_hdr(skb)->h_source);
73 printk("MAC dest = "); 74 printk("MAC dest = ");
74 print_MAC(eth_hdr(skb)->h_dest); 75 print_MAC(eth_hdr(skb)->h_dest);
75 76
76 printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto)); 77 printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto));
77 78
78 if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == 79 if (loginfo->type == NF_LOG_TYPE_LOG)
80 bitmask = loginfo->u.log.logflags;
81 else
82 bitmask = NF_LOG_MASK;
83
84 if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
79 htons(ETH_P_IP)){ 85 htons(ETH_P_IP)){
80 struct iphdr _iph, *ih; 86 struct iphdr _iph, *ih;
81 87
@@ -84,10 +90,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
84 printk(" INCOMPLETE IP header"); 90 printk(" INCOMPLETE IP header");
85 goto out; 91 goto out;
86 } 92 }
87 printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", 93 printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP "
88 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); 94 "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr),
89 printk(" IP tos=0x%02X, IP proto=%d", ih->tos, 95 NIPQUAD(ih->daddr), ih->tos, ih->protocol);
90 ih->protocol);
91 if (ih->protocol == IPPROTO_TCP || 96 if (ih->protocol == IPPROTO_TCP ||
92 ih->protocol == IPPROTO_UDP) { 97 ih->protocol == IPPROTO_UDP) {
93 struct tcpudphdr _ports, *pptr; 98 struct tcpudphdr _ports, *pptr;
@@ -104,7 +109,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
104 goto out; 109 goto out;
105 } 110 }
106 111
107 if ((info->bitmask & EBT_LOG_ARP) && 112 if ((bitmask & EBT_LOG_ARP) &&
108 ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) || 113 ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) ||
109 (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) { 114 (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) {
110 struct arphdr _arph, *ah; 115 struct arphdr _arph, *ah;
@@ -144,6 +149,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
144out: 149out:
145 printk("\n"); 150 printk("\n");
146 spin_unlock_bh(&ebt_log_lock); 151 spin_unlock_bh(&ebt_log_lock);
152
153}
154
155static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
156 const struct net_device *in, const struct net_device *out,
157 const void *data, unsigned int datalen)
158{
159 struct ebt_log_info *info = (struct ebt_log_info *)data;
160 struct nf_loginfo li;
161
162 li.type = NF_LOG_TYPE_LOG;
163 li.u.log.level = info->loglevel;
164 li.u.log.logflags = info->bitmask;
165
166 nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix);
147} 167}
148 168
149static struct ebt_watcher log = 169static struct ebt_watcher log =
@@ -154,13 +174,32 @@ static struct ebt_watcher log =
154 .me = THIS_MODULE, 174 .me = THIS_MODULE,
155}; 175};
156 176
177static struct nf_logger ebt_log_logger = {
178 .name = "ebt_log",
179 .logfn = &ebt_log_packet,
180 .me = THIS_MODULE,
181};
182
157static int __init init(void) 183static int __init init(void)
158{ 184{
159 return ebt_register_watcher(&log); 185 int ret;
186
187 ret = ebt_register_watcher(&log);
188 if (ret < 0)
189 return ret;
190 if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) {
191 printk(KERN_WARNING "ebt_log: not logging via system console "
192 "since somebody else already registered for PF_INET\n");
193 /* we cannot make module load fail here, since otherwise
194 * ebtables userspace would abort */
195 }
196
197 return 0;
160} 198}
161 199
162static void __exit fini(void) 200static void __exit fini(void)
163{ 201{
202 nf_log_unregister_logger(&ebt_log_logger);
164 ebt_unregister_watcher(&log); 203 ebt_unregister_watcher(&log);
165} 204}
166 205
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index aae26ae2e61f..ce617b3dbbb8 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Authors: 4 * Authors:
5 * Bart De Schuymer <bdschuym@pandora.be> 5 * Bart De Schuymer <bdschuym@pandora.be>
6 * Harald Welte <laforge@netfilter.org>
6 * 7 *
7 * November, 2004 8 * November, 2004
8 * 9 *
@@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
115 return skb; 116 return skb;
116} 117}
117 118
118static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, 119static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
119 const struct net_device *in, const struct net_device *out, 120 const struct net_device *in, const struct net_device *out,
120 const void *data, unsigned int datalen) 121 const struct ebt_ulog_info *uloginfo, const char *prefix)
121{ 122{
122 ebt_ulog_packet_msg_t *pm; 123 ebt_ulog_packet_msg_t *pm;
123 size_t size, copy_len; 124 size_t size, copy_len;
124 struct nlmsghdr *nlh; 125 struct nlmsghdr *nlh;
125 struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
126 unsigned int group = uloginfo->nlgroup; 126 unsigned int group = uloginfo->nlgroup;
127 ebt_ulog_buff_t *ub = &ulog_buffers[group]; 127 ebt_ulog_buff_t *ub = &ulog_buffers[group];
128 spinlock_t *lock = &ub->lock; 128 spinlock_t *lock = &ub->lock;
@@ -216,6 +216,39 @@ alloc_failure:
216 goto unlock; 216 goto unlock;
217} 217}
218 218
219/* this function is registered with the netfilter core */
220static void ebt_log_packet(unsigned int pf, unsigned int hooknum,
221 const struct sk_buff *skb, const struct net_device *in,
222 const struct net_device *out, const struct nf_loginfo *li,
223 const char *prefix)
224{
225 struct ebt_ulog_info loginfo;
226
227 if (!li || li->type != NF_LOG_TYPE_ULOG) {
228 loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP;
229 loginfo.cprange = 0;
230 loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD;
231 loginfo.prefix[0] = '\0';
232 } else {
233 loginfo.nlgroup = li->u.ulog.group;
234 loginfo.cprange = li->u.ulog.copy_len;
235 loginfo.qthreshold = li->u.ulog.qthreshold;
236 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
237 }
238
239 ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
240}
241
242static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
243 const struct net_device *in, const struct net_device *out,
244 const void *data, unsigned int datalen)
245{
246 struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
247
248 ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
249}
250
251
219static int ebt_ulog_check(const char *tablename, unsigned int hookmask, 252static int ebt_ulog_check(const char *tablename, unsigned int hookmask,
220 const struct ebt_entry *e, void *data, unsigned int datalen) 253 const struct ebt_entry *e, void *data, unsigned int datalen)
221{ 254{
@@ -240,6 +273,12 @@ static struct ebt_watcher ulog = {
240 .me = THIS_MODULE, 273 .me = THIS_MODULE,
241}; 274};
242 275
276static struct nf_logger ebt_ulog_logger = {
277 .name = EBT_ULOG_WATCHER,
278 .logfn = &ebt_log_packet,
279 .me = THIS_MODULE,
280};
281
243static int __init init(void) 282static int __init init(void)
244{ 283{
245 int i, ret = 0; 284 int i, ret = 0;
@@ -265,6 +304,13 @@ static int __init init(void)
265 else if ((ret = ebt_register_watcher(&ulog))) 304 else if ((ret = ebt_register_watcher(&ulog)))
266 sock_release(ebtulognl->sk_socket); 305 sock_release(ebtulognl->sk_socket);
267 306
307 if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) {
308 printk(KERN_WARNING "ebt_ulog: not logging via ulog "
309 "since somebody else already registered for PF_BRIDGE\n");
310 /* we cannot make module load fail here, since otherwise
311 * ebtables userspace would abort */
312 }
313
268 return ret; 314 return ret;
269} 315}
270 316
@@ -273,6 +319,7 @@ static void __exit fini(void)
273 ebt_ulog_buff_t *ub; 319 ebt_ulog_buff_t *ub;
274 int i; 320 int i;
275 321
322 nf_log_unregister_logger(&ebt_ulog_logger);
276 ebt_unregister_watcher(&ulog); 323 ebt_unregister_watcher(&ulog);
277 for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { 324 for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
278 ub = &ulog_buffers[i]; 325 ub = &ulog_buffers[i];
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1bcfef51ac58..f8d322e1ea92 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -47,6 +47,7 @@
47#include <linux/rtnetlink.h> 47#include <linux/rtnetlink.h>
48#include <linux/poll.h> 48#include <linux/poll.h>
49#include <linux/highmem.h> 49#include <linux/highmem.h>
50#include <linux/spinlock.h>
50 51
51#include <net/protocol.h> 52#include <net/protocol.h>
52#include <linux/skbuff.h> 53#include <linux/skbuff.h>
@@ -200,6 +201,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
200} 201}
201 202
202/** 203/**
204 * skb_kill_datagram - Free a datagram skbuff forcibly
205 * @sk: socket
206 * @skb: datagram skbuff
207 * @flags: MSG_ flags
208 *
209 * This function frees a datagram skbuff that was received by
210 * skb_recv_datagram. The flags argument must match the one
211 * used for skb_recv_datagram.
212 *
213 * If the MSG_PEEK flag is set, and the packet is still on the
214 * receive queue of the socket, it will be taken off the queue
215 * before it is freed.
216 *
217 * This function currently only disables BH when acquiring the
218 * sk_receive_queue lock. Therefore it must not be used in a
219 * context where that lock is acquired in an IRQ context.
220 */
221
222void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
223{
224 if (flags & MSG_PEEK) {
225 spin_lock_bh(&sk->sk_receive_queue.lock);
226 if (skb == skb_peek(&sk->sk_receive_queue)) {
227 __skb_unlink(skb, &sk->sk_receive_queue);
228 atomic_dec(&skb->users);
229 }
230 spin_unlock_bh(&sk->sk_receive_queue.lock);
231 }
232
233 kfree_skb(skb);
234}
235
236EXPORT_SYMBOL(skb_kill_datagram);
237
238/**
203 * skb_copy_datagram_iovec - Copy a datagram to an iovec. 239 * skb_copy_datagram_iovec - Copy a datagram to an iovec.
204 * @skb: buffer to copy 240 * @skb: buffer to copy
205 * @offset: offset in the buffer to start copying from 241 * @offset: offset in the buffer to start copying from
diff --git a/net/core/dev.c b/net/core/dev.c
index a5efc9ae010b..29ba109d3e54 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3276,7 +3276,6 @@ EXPORT_SYMBOL(dev_close);
3276EXPORT_SYMBOL(dev_get_by_flags); 3276EXPORT_SYMBOL(dev_get_by_flags);
3277EXPORT_SYMBOL(dev_get_by_index); 3277EXPORT_SYMBOL(dev_get_by_index);
3278EXPORT_SYMBOL(dev_get_by_name); 3278EXPORT_SYMBOL(dev_get_by_name);
3279EXPORT_SYMBOL(dev_ioctl);
3280EXPORT_SYMBOL(dev_open); 3279EXPORT_SYMBOL(dev_open);
3281EXPORT_SYMBOL(dev_queue_xmit); 3280EXPORT_SYMBOL(dev_queue_xmit);
3282EXPORT_SYMBOL(dev_remove_pack); 3281EXPORT_SYMBOL(dev_remove_pack);
diff --git a/net/core/filter.c b/net/core/filter.c
index 3a10e0bc90e8..8964d3445588 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -13,6 +13,7 @@
13 * 2 of the License, or (at your option) any later version. 13 * 2 of the License, or (at your option) any later version.
14 * 14 *
15 * Andi Kleen - Fix a few bad bugs and races. 15 * Andi Kleen - Fix a few bad bugs and races.
16 * Kris Katterjohn - Added many additional checks in sk_chk_filter()
16 */ 17 */
17 18
18#include <linux/module.h> 19#include <linux/module.h>
@@ -250,7 +251,7 @@ load_b:
250 mem[fentry->k] = X; 251 mem[fentry->k] = X;
251 continue; 252 continue;
252 default: 253 default:
253 /* Invalid instruction counts as RET */ 254 WARN_ON(1);
254 return 0; 255 return 0;
255 } 256 }
256 257
@@ -283,8 +284,8 @@ load_b:
283 * 284 *
284 * Check the user's filter code. If we let some ugly 285 * Check the user's filter code. If we let some ugly
285 * filter code slip through kaboom! The filter must contain 286 * filter code slip through kaboom! The filter must contain
286 * no references or jumps that are out of range, no illegal instructions 287 * no references or jumps that are out of range, no illegal
287 * and no backward jumps. It must end with a RET instruction 288 * instructions, and must end with a RET instruction.
288 * 289 *
289 * Returns 0 if the rule set is legal or a negative errno code if not. 290 * Returns 0 if the rule set is legal or a negative errno code if not.
290 */ 291 */
@@ -300,38 +301,85 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
300 for (pc = 0; pc < flen; pc++) { 301 for (pc = 0; pc < flen; pc++) {
301 /* all jumps are forward as they are not signed */ 302 /* all jumps are forward as they are not signed */
302 ftest = &filter[pc]; 303 ftest = &filter[pc];
303 if (BPF_CLASS(ftest->code) == BPF_JMP) {
304 /* but they mustn't jump off the end */
305 if (BPF_OP(ftest->code) == BPF_JA) {
306 /*
307 * Note, the large ftest->k might cause loops.
308 * Compare this with conditional jumps below,
309 * where offsets are limited. --ANK (981016)
310 */
311 if (ftest->k >= (unsigned)(flen-pc-1))
312 return -EINVAL;
313 } else {
314 /* for conditionals both must be safe */
315 if (pc + ftest->jt +1 >= flen ||
316 pc + ftest->jf +1 >= flen)
317 return -EINVAL;
318 }
319 }
320 304
321 /* check for division by zero -Kris Katterjohn 2005-10-30 */ 305 /* Only allow valid instructions */
322 if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0) 306 switch (ftest->code) {
323 return -EINVAL; 307 case BPF_ALU|BPF_ADD|BPF_K:
308 case BPF_ALU|BPF_ADD|BPF_X:
309 case BPF_ALU|BPF_SUB|BPF_K:
310 case BPF_ALU|BPF_SUB|BPF_X:
311 case BPF_ALU|BPF_MUL|BPF_K:
312 case BPF_ALU|BPF_MUL|BPF_X:
313 case BPF_ALU|BPF_DIV|BPF_X:
314 case BPF_ALU|BPF_AND|BPF_K:
315 case BPF_ALU|BPF_AND|BPF_X:
316 case BPF_ALU|BPF_OR|BPF_K:
317 case BPF_ALU|BPF_OR|BPF_X:
318 case BPF_ALU|BPF_LSH|BPF_K:
319 case BPF_ALU|BPF_LSH|BPF_X:
320 case BPF_ALU|BPF_RSH|BPF_K:
321 case BPF_ALU|BPF_RSH|BPF_X:
322 case BPF_ALU|BPF_NEG:
323 case BPF_LD|BPF_W|BPF_ABS:
324 case BPF_LD|BPF_H|BPF_ABS:
325 case BPF_LD|BPF_B|BPF_ABS:
326 case BPF_LD|BPF_W|BPF_LEN:
327 case BPF_LD|BPF_W|BPF_IND:
328 case BPF_LD|BPF_H|BPF_IND:
329 case BPF_LD|BPF_B|BPF_IND:
330 case BPF_LD|BPF_IMM:
331 case BPF_LDX|BPF_W|BPF_LEN:
332 case BPF_LDX|BPF_B|BPF_MSH:
333 case BPF_LDX|BPF_IMM:
334 case BPF_MISC|BPF_TAX:
335 case BPF_MISC|BPF_TXA:
336 case BPF_RET|BPF_K:
337 case BPF_RET|BPF_A:
338 break;
339
340 /* Some instructions need special checks */
324 341
325 /* check that memory operations use valid addresses. */ 342 case BPF_ALU|BPF_DIV|BPF_K:
326 if (ftest->k >= BPF_MEMWORDS) { 343 /* check for division by zero */
327 /* but it might not be a memory operation... */ 344 if (ftest->k == 0)
328 switch (ftest->code) {
329 case BPF_ST:
330 case BPF_STX:
331 case BPF_LD|BPF_MEM:
332 case BPF_LDX|BPF_MEM:
333 return -EINVAL; 345 return -EINVAL;
334 } 346 break;
347
348 case BPF_LD|BPF_MEM:
349 case BPF_LDX|BPF_MEM:
350 case BPF_ST:
351 case BPF_STX:
352 /* check for invalid memory addresses */
353 if (ftest->k >= BPF_MEMWORDS)
354 return -EINVAL;
355 break;
356
357 case BPF_JMP|BPF_JA:
358 /*
359 * Note, the large ftest->k might cause loops.
360 * Compare this with conditional jumps below,
361 * where offsets are limited. --ANK (981016)
362 */
363 if (ftest->k >= (unsigned)(flen-pc-1))
364 return -EINVAL;
365 break;
366
367 case BPF_JMP|BPF_JEQ|BPF_K:
368 case BPF_JMP|BPF_JEQ|BPF_X:
369 case BPF_JMP|BPF_JGE|BPF_K:
370 case BPF_JMP|BPF_JGE|BPF_X:
371 case BPF_JMP|BPF_JGT|BPF_K:
372 case BPF_JMP|BPF_JGT|BPF_X:
373 case BPF_JMP|BPF_JSET|BPF_K:
374 case BPF_JMP|BPF_JSET|BPF_X:
375 /* for conditionals both must be safe */
376 if (pc + ftest->jt + 1 >= flen ||
377 pc + ftest->jf + 1 >= flen)
378 return -EINVAL;
379 break;
380
381 default:
382 return -EINVAL;
335 } 383 }
336 } 384 }
337 385
diff --git a/net/core/flow.c b/net/core/flow.c
index 7e95b39de9fd..c4f25385029f 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -23,6 +23,7 @@
23#include <net/flow.h> 23#include <net/flow.h>
24#include <asm/atomic.h> 24#include <asm/atomic.h>
25#include <asm/semaphore.h> 25#include <asm/semaphore.h>
26#include <linux/security.h>
26 27
27struct flow_cache_entry { 28struct flow_cache_entry {
28 struct flow_cache_entry *next; 29 struct flow_cache_entry *next;
@@ -30,6 +31,7 @@ struct flow_cache_entry {
30 u8 dir; 31 u8 dir;
31 struct flowi key; 32 struct flowi key;
32 u32 genid; 33 u32 genid;
34 u32 sk_sid;
33 void *object; 35 void *object;
34 atomic_t *object_ref; 36 atomic_t *object_ref;
35}; 37};
@@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
162 return 0; 164 return 0;
163} 165}
164 166
165void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, 167void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
166 flow_resolve_t resolver) 168 flow_resolve_t resolver)
167{ 169{
168 struct flow_cache_entry *fle, **head; 170 struct flow_cache_entry *fle, **head;
@@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
186 for (fle = *head; fle; fle = fle->next) { 188 for (fle = *head; fle; fle = fle->next) {
187 if (fle->family == family && 189 if (fle->family == family &&
188 fle->dir == dir && 190 fle->dir == dir &&
191 fle->sk_sid == sk_sid &&
189 flow_key_compare(key, &fle->key) == 0) { 192 flow_key_compare(key, &fle->key) == 0) {
190 if (fle->genid == atomic_read(&flow_cache_genid)) { 193 if (fle->genid == atomic_read(&flow_cache_genid)) {
191 void *ret = fle->object; 194 void *ret = fle->object;
@@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
210 *head = fle; 213 *head = fle;
211 fle->family = family; 214 fle->family = family;
212 fle->dir = dir; 215 fle->dir = dir;
216 fle->sk_sid = sk_sid;
213 memcpy(&fle->key, key, sizeof(*key)); 217 memcpy(&fle->key, key, sizeof(*key));
214 fle->object = NULL; 218 fle->object = NULL;
215 flow_count(cpu)++; 219 flow_count(cpu)++;
@@ -221,7 +225,7 @@ nocache:
221 void *obj; 225 void *obj;
222 atomic_t *obj_ref; 226 atomic_t *obj_ref;
223 227
224 resolver(key, family, dir, &obj, &obj_ref); 228 resolver(key, sk_sid, family, dir, &obj, &obj_ref);
225 229
226 if (fle) { 230 if (fle) {
227 fle->genid = atomic_read(&flow_cache_genid); 231 fle->genid = atomic_read(&flow_cache_genid);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 49424a42a2c0..281a632fa6a6 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -13,6 +13,7 @@
13#include <linux/netdevice.h> 13#include <linux/netdevice.h>
14#include <linux/etherdevice.h> 14#include <linux/etherdevice.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/if_arp.h>
16#include <linux/inetdevice.h> 17#include <linux/inetdevice.h>
17#include <linux/inet.h> 18#include <linux/inet.h>
18#include <linux/interrupt.h> 19#include <linux/interrupt.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 7fc3e9e28c34..06cad2d63e8a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -487,9 +487,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]);
487 487
488/* Module parameters, defaults. */ 488/* Module parameters, defaults. */
489static int pg_count_d = 1000; /* 1000 pkts by default */ 489static int pg_count_d = 1000; /* 1000 pkts by default */
490static int pg_delay_d = 0; 490static int pg_delay_d;
491static int pg_clone_skb_d = 0; 491static int pg_clone_skb_d;
492static int debug = 0; 492static int debug;
493 493
494static DECLARE_MUTEX(pktgen_sem); 494static DECLARE_MUTEX(pktgen_sem);
495static struct pktgen_thread *pktgen_threads = NULL; 495static struct pktgen_thread *pktgen_threads = NULL;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83fee37de38e..070f91cfde59 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
135struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, 135struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
136 int fclone) 136 int fclone)
137{ 137{
138 struct skb_shared_info *shinfo;
138 struct sk_buff *skb; 139 struct sk_buff *skb;
139 u8 *data; 140 u8 *data;
140 141
141 /* Get the HEAD */ 142 /* Get the HEAD */
142 if (fclone) 143 skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache,
143 skb = kmem_cache_alloc(skbuff_fclone_cache, 144 gfp_mask & ~__GFP_DMA);
144 gfp_mask & ~__GFP_DMA);
145 else
146 skb = kmem_cache_alloc(skbuff_head_cache,
147 gfp_mask & ~__GFP_DMA);
148
149 if (!skb) 145 if (!skb)
150 goto out; 146 goto out;
151 147
@@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
162 skb->data = data; 158 skb->data = data;
163 skb->tail = data; 159 skb->tail = data;
164 skb->end = data + size; 160 skb->end = data + size;
161 /* make sure we initialize shinfo sequentially */
162 shinfo = skb_shinfo(skb);
163 atomic_set(&shinfo->dataref, 1);
164 shinfo->nr_frags = 0;
165 shinfo->tso_size = 0;
166 shinfo->tso_segs = 0;
167 shinfo->ufo_size = 0;
168 shinfo->ip6_frag_id = 0;
169 shinfo->frag_list = NULL;
170
165 if (fclone) { 171 if (fclone) {
166 struct sk_buff *child = skb + 1; 172 struct sk_buff *child = skb + 1;
167 atomic_t *fclone_ref = (atomic_t *) (child + 1); 173 atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
171 177
172 child->fclone = SKB_FCLONE_UNAVAILABLE; 178 child->fclone = SKB_FCLONE_UNAVAILABLE;
173 } 179 }
174 atomic_set(&(skb_shinfo(skb)->dataref), 1);
175 skb_shinfo(skb)->nr_frags = 0;
176 skb_shinfo(skb)->tso_size = 0;
177 skb_shinfo(skb)->tso_segs = 0;
178 skb_shinfo(skb)->frag_list = NULL;
179 skb_shinfo(skb)->ufo_size = 0;
180 skb_shinfo(skb)->ip6_frag_id = 0;
181out: 180out:
182 return skb; 181 return skb;
183nodata: 182nodata:
diff --git a/net/core/sock.c b/net/core/sock.c
index 13cc3be4f056..6465b0e4c8cb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab)
1488 } 1488 }
1489 } 1489 }
1490 1490
1491 if (prot->twsk_obj_size) { 1491 if (prot->twsk_prot != NULL) {
1492 static const char mask[] = "tw_sock_%s"; 1492 static const char mask[] = "tw_sock_%s";
1493 1493
1494 timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); 1494 timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
@@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab)
1497 goto out_free_request_sock_slab; 1497 goto out_free_request_sock_slab;
1498 1498
1499 sprintf(timewait_sock_slab_name, mask, prot->name); 1499 sprintf(timewait_sock_slab_name, mask, prot->name);
1500 prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, 1500 prot->twsk_prot->twsk_slab =
1501 prot->twsk_obj_size, 1501 kmem_cache_create(timewait_sock_slab_name,
1502 0, SLAB_HWCACHE_ALIGN, 1502 prot->twsk_prot->twsk_obj_size,
1503 NULL, NULL); 1503 0, SLAB_HWCACHE_ALIGN,
1504 if (prot->twsk_slab == NULL) 1504 NULL, NULL);
1505 if (prot->twsk_prot->twsk_slab == NULL)
1505 goto out_free_timewait_sock_slab_name; 1506 goto out_free_timewait_sock_slab_name;
1506 } 1507 }
1507 } 1508 }
@@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot)
1548 prot->rsk_prot->slab = NULL; 1549 prot->rsk_prot->slab = NULL;
1549 } 1550 }
1550 1551
1551 if (prot->twsk_slab != NULL) { 1552 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1552 const char *name = kmem_cache_name(prot->twsk_slab); 1553 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1553 1554
1554 kmem_cache_destroy(prot->twsk_slab); 1555 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1555 kfree(name); 1556 kfree(name);
1556 prot->twsk_slab = NULL; 1557 prot->twsk_prot->twsk_slab = NULL;
1557 } 1558 }
1558} 1559}
1559 1560
diff --git a/net/core/stream.c b/net/core/stream.c
index 15bfd03e8024..35e25259fd95 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
55 int done; 55 int done;
56 56
57 do { 57 do {
58 if (sk->sk_err) 58 int err = sock_error(sk);
59 return sock_error(sk); 59 if (err)
60 return err;
60 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) 61 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
61 return -EPIPE; 62 return -EPIPE;
62 if (!*timeo_p) 63 if (!*timeo_p)
@@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
67 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 68 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
68 sk->sk_write_pending++; 69 sk->sk_write_pending++;
69 done = sk_wait_event(sk, timeo_p, 70 done = sk_wait_event(sk, timeo_p,
71 !sk->sk_err &&
70 !((1 << sk->sk_state) & 72 !((1 << sk->sk_state) &
71 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); 73 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
72 finish_wait(sk->sk_sleep, &wait); 74 finish_wait(sk->sk_sleep, &wait);
@@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
137 139
138 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 140 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
139 sk->sk_write_pending++; 141 sk->sk_write_pending++;
140 sk_wait_event(sk, &current_timeo, sk_stream_memory_free(sk) && 142 sk_wait_event(sk, &current_timeo, !sk->sk_err &&
143 !(sk->sk_shutdown & SEND_SHUTDOWN) &&
144 sk_stream_memory_free(sk) &&
141 vm_wait); 145 vm_wait);
142 sk->sk_write_pending--; 146 sk->sk_write_pending--;
143 147
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 344a8da153fc..87b27fff6e3b 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,3 +1,7 @@
1obj-$(CONFIG_IPV6) += dccp_ipv6.o
2
3dccp_ipv6-y := ipv6.o
4
1obj-$(CONFIG_IP_DCCP) += dccp.o 5obj-$(CONFIG_IP_DCCP) += dccp.o
2 6
3dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ 7dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index c9a62cca22fc..ce9cb77c5c29 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
55 from = av->dccpav_buf + av->dccpav_buf_head; 55 from = av->dccpav_buf + av->dccpav_buf_head;
56 56
57 /* Check if buf_head wraps */ 57 /* Check if buf_head wraps */
58 if (av->dccpav_buf_head + len > av->dccpav_vec_len) { 58 if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) {
59 const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head); 59 const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head;
60 60
61 memcpy(to, from, tailsize); 61 memcpy(to, from, tailsize);
62 to += tailsize; 62 to += tailsize;
@@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
93struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len, 93struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len,
94 const gfp_t priority) 94 const gfp_t priority)
95{ 95{
96 struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority); 96 struct dccp_ackvec *av;
97 97
98 BUG_ON(len == 0);
99
100 if (len > DCCP_MAX_ACKVEC_LEN)
101 return NULL;
102
103 av = kmalloc(sizeof(*av) + len, priority);
98 if (av != NULL) { 104 if (av != NULL) {
99 av->dccpav_buf_len = len; 105 av->dccpav_buf_len = len;
100 av->dccpav_buf_head = 106 av->dccpav_buf_head =
@@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
117} 123}
118 124
119static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, 125static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
120 const unsigned int index) 126 const u8 index)
121{ 127{
122 return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK; 128 return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK;
123} 129}
124 130
125static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, 131static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
126 const unsigned int index) 132 const u8 index)
127{ 133{
128 return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK; 134 return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK;
129} 135}
@@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
135 */ 141 */
136static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, 142static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
137 const unsigned int packets, 143 const unsigned int packets,
138 const unsigned char state) 144 const unsigned char state)
139{ 145{
140 unsigned int gap; 146 unsigned int gap;
141 signed long new_head; 147 signed long new_head;
@@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
223 * could reduce the complexity of this scan.) 229 * could reduce the complexity of this scan.)
224 */ 230 */
225 u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno); 231 u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno);
226 unsigned int index = av->dccpav_buf_head; 232 u8 index = av->dccpav_buf_head;
227 233
228 while (1) { 234 while (1) {
229 const u8 len = dccp_ackvec_len(av, index); 235 const u8 len = dccp_ackvec_len(av, index);
@@ -291,7 +297,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av)
291} 297}
292#endif 298#endif
293 299
294static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) 300static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av)
295{ 301{
296 /* 302 /*
297 * As we're keeping track of the ack vector size (dccpav_vec_len) and 303 * As we're keeping track of the ack vector size (dccpav_vec_len) and
@@ -301,9 +307,10 @@ static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av)
301 * draft-ietf-dccp-spec-11.txt Appendix A. -acme 307 * draft-ietf-dccp-spec-11.txt Appendix A. -acme
302 */ 308 */
303#if 0 309#if 0
304 av->dccpav_buf_tail = av->dccpav_ack_ptr + 1; 310 u32 new_buf_tail = av->dccpav_ack_ptr + 1;
305 if (av->dccpav_buf_tail >= av->dccpav_vec_len) 311 if (new_buf_tail >= av->dccpav_vec_len)
306 av->dccpav_buf_tail -= av->dccpav_vec_len; 312 new_buf_tail -= av->dccpav_vec_len;
313 av->dccpav_buf_tail = new_buf_tail;
307#endif 314#endif
308 av->dccpav_vec_len -= av->dccpav_sent_len; 315 av->dccpav_vec_len -= av->dccpav_sent_len;
309} 316}
@@ -326,7 +333,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
326 debug_prefix, 1, 333 debug_prefix, 1,
327 (unsigned long long)av->dccpav_ack_seqno, 334 (unsigned long long)av->dccpav_ack_seqno,
328 (unsigned long long)av->dccpav_ack_ackno); 335 (unsigned long long)av->dccpav_ack_ackno);
329 dccp_ackvec_trow_away_ack_record(av); 336 dccp_ackvec_throw_away_ack_record(av);
330 av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1; 337 av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
331 } 338 }
332} 339}
@@ -389,7 +396,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
389 av->dccpav_ack_seqno, 396 av->dccpav_ack_seqno,
390 (unsigned long long) 397 (unsigned long long)
391 av->dccpav_ack_ackno); 398 av->dccpav_ack_ackno);
392 dccp_ackvec_trow_away_ack_record(av); 399 dccp_ackvec_throw_away_ack_record(av);
393 } 400 }
394 /* 401 /*
395 * If dccpav_ack_seqno was not received, no problem 402 * If dccpav_ack_seqno was not received, no problem
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index d0fd6c60c574..f7dfb5f67b87 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -54,16 +54,16 @@
54 * @dccpav_buf - circular buffer of acknowledgeable packets 54 * @dccpav_buf - circular buffer of acknowledgeable packets
55 */ 55 */
56struct dccp_ackvec { 56struct dccp_ackvec {
57 unsigned int dccpav_buf_head;
58 unsigned int dccpav_buf_tail;
59 u64 dccpav_buf_ackno; 57 u64 dccpav_buf_ackno;
60 u64 dccpav_ack_seqno; 58 u64 dccpav_ack_seqno;
61 u64 dccpav_ack_ackno; 59 u64 dccpav_ack_ackno;
62 unsigned int dccpav_ack_ptr;
63 unsigned int dccpav_sent_len;
64 unsigned int dccpav_vec_len;
65 unsigned int dccpav_buf_len;
66 struct timeval dccpav_time; 60 struct timeval dccpav_time;
61 u8 dccpav_buf_head;
62 u8 dccpav_buf_tail;
63 u8 dccpav_ack_ptr;
64 u8 dccpav_sent_len;
65 u8 dccpav_vec_len;
66 u8 dccpav_buf_len;
67 u8 dccpav_buf_nonce; 67 u8 dccpav_buf_nonce;
68 u8 dccpav_ack_nonce; 68 u8 dccpav_ack_nonce;
69 u8 dccpav_buf[0]; 69 u8 dccpav_buf[0];
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index c37eeeaf5c6e..de681c6ad081 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -21,6 +21,8 @@
21 21
22#define CCID_MAX 255 22#define CCID_MAX 255
23 23
24struct tcp_info;
25
24struct ccid { 26struct ccid {
25 unsigned char ccid_id; 27 unsigned char ccid_id;
26 const char *ccid_name; 28 const char *ccid_name;
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index f97b85d55ad8..93f26dd6e6cb 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
59 59
60#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ 60#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
61 61
62extern struct proto dccp_v4_prot; 62extern struct proto dccp_prot;
63 63
64/* is seq1 < seq2 ? */ 64/* is seq1 < seq2 ? */
65static inline int before48(const u64 seq1, const u64 seq2) 65static inline int before48(const u64 seq1, const u64 seq2)
@@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
228extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 228extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
229 const struct dccp_hdr *dh, const unsigned len); 229 const struct dccp_hdr *dh, const unsigned len);
230 230
231extern int dccp_v4_init_sock(struct sock *sk);
232extern int dccp_v4_destroy_sock(struct sock *sk);
233
231extern void dccp_close(struct sock *sk, long timeout); 234extern void dccp_close(struct sock *sk, long timeout);
232extern struct sk_buff *dccp_make_response(struct sock *sk, 235extern struct sk_buff *dccp_make_response(struct sock *sk,
233 struct dst_entry *dst, 236 struct dst_entry *dst,
@@ -238,6 +241,7 @@ extern struct sk_buff *dccp_make_reset(struct sock *sk,
238 241
239extern int dccp_connect(struct sock *sk); 242extern int dccp_connect(struct sock *sk);
240extern int dccp_disconnect(struct sock *sk, int flags); 243extern int dccp_disconnect(struct sock *sk, int flags);
244extern void dccp_unhash(struct sock *sk);
241extern int dccp_getsockopt(struct sock *sk, int level, int optname, 245extern int dccp_getsockopt(struct sock *sk, int level, int optname,
242 char __user *optval, int __user *optlen); 246 char __user *optval, int __user *optlen);
243extern int dccp_setsockopt(struct sock *sk, int level, int optname, 247extern int dccp_setsockopt(struct sock *sk, int level, int optname,
@@ -249,6 +253,13 @@ extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
249 struct msghdr *msg, size_t len, int nonblock, 253 struct msghdr *msg, size_t len, int nonblock,
250 int flags, int *addr_len); 254 int flags, int *addr_len);
251extern void dccp_shutdown(struct sock *sk, int how); 255extern void dccp_shutdown(struct sock *sk, int how);
256extern int inet_dccp_listen(struct socket *sock, int backlog);
257extern unsigned int dccp_poll(struct file *file, struct socket *sock,
258 poll_table *wait);
259extern void dccp_v4_send_check(struct sock *sk, int len,
260 struct sk_buff *skb);
261extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
262 int addr_len);
252 263
253extern int dccp_v4_checksum(const struct sk_buff *skb, 264extern int dccp_v4_checksum(const struct sk_buff *skb,
254 const u32 saddr, const u32 daddr); 265 const u32 saddr, const u32 daddr);
@@ -256,6 +267,17 @@ extern int dccp_v4_checksum(const struct sk_buff *skb,
256extern int dccp_v4_send_reset(struct sock *sk, 267extern int dccp_v4_send_reset(struct sock *sk,
257 enum dccp_reset_codes code); 268 enum dccp_reset_codes code);
258extern void dccp_send_close(struct sock *sk, const int active); 269extern void dccp_send_close(struct sock *sk, const int active);
270extern int dccp_invalid_packet(struct sk_buff *skb);
271
272static inline int dccp_bad_service_code(const struct sock *sk,
273 const __u32 service)
274{
275 const struct dccp_sock *dp = dccp_sk(sk);
276
277 if (dp->dccps_service == service)
278 return 0;
279 return !dccp_list_has_service(dp->dccps_service_list, service);
280}
259 281
260struct dccp_skb_cb { 282struct dccp_skb_cb {
261 __u8 dccpd_type:4; 283 __u8 dccpd_type:4;
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index f675d8e642d3..3f78c00e3822 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
28 info->tcpi_retransmits = icsk->icsk_retransmits; 28 info->tcpi_retransmits = icsk->icsk_retransmits;
29 info->tcpi_probes = icsk->icsk_probes_out; 29 info->tcpi_probes = icsk->icsk_probes_out;
30 info->tcpi_backoff = icsk->icsk_backoff; 30 info->tcpi_backoff = icsk->icsk_backoff;
31 info->tcpi_pmtu = dp->dccps_pmtu_cookie; 31 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
32 32
33 if (dp->dccps_options.dccpo_send_ack_vector) 33 if (dp->dccps_options.dccpo_send_ack_vector)
34 info->tcpi_options |= TCPI_OPT_SACK; 34 info->tcpi_options |= TCPI_OPT_SACK;
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 3454d5941900..b6cba72b44e8 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
151 return 0; 151 return 0;
152} 152}
153 153
154int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 154static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
155 const struct dccp_hdr *dh, const unsigned len) 155 const struct dccp_hdr *dh,
156 const unsigned len)
156{ 157{
157 struct dccp_sock *dp = dccp_sk(sk); 158 struct dccp_sock *dp = dccp_sk(sk);
158 159
159 if (dccp_check_seqno(sk, skb))
160 goto discard;
161
162 if (dccp_parse_options(sk, skb))
163 goto discard;
164
165 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
166 dccp_event_ack_recv(sk, skb);
167
168 if (dp->dccps_options.dccpo_send_ack_vector &&
169 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
170 DCCP_SKB_CB(skb)->dccpd_seq,
171 DCCP_ACKVEC_STATE_RECEIVED))
172 goto discard;
173
174 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
175 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
176
177 switch (dccp_hdr(skb)->dccph_type) { 160 switch (dccp_hdr(skb)->dccph_type) {
178 case DCCP_PKT_DATAACK: 161 case DCCP_PKT_DATAACK:
179 case DCCP_PKT_DATA: 162 case DCCP_PKT_DATA:
@@ -250,6 +233,37 @@ discard:
250 return 0; 233 return 0;
251} 234}
252 235
236int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
237 const struct dccp_hdr *dh, const unsigned len)
238{
239 struct dccp_sock *dp = dccp_sk(sk);
240
241 if (dccp_check_seqno(sk, skb))
242 goto discard;
243
244 if (dccp_parse_options(sk, skb))
245 goto discard;
246
247 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
248 dccp_event_ack_recv(sk, skb);
249
250 if (dp->dccps_options.dccpo_send_ack_vector &&
251 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
252 DCCP_SKB_CB(skb)->dccpd_seq,
253 DCCP_ACKVEC_STATE_RECEIVED))
254 goto discard;
255
256 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
257 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
258
259 return __dccp_rcv_established(sk, skb, dh, len);
260discard:
261 __kfree_skb(skb);
262 return 0;
263}
264
265EXPORT_SYMBOL_GPL(dccp_rcv_established);
266
253static int dccp_rcv_request_sent_state_process(struct sock *sk, 267static int dccp_rcv_request_sent_state_process(struct sock *sk,
254 struct sk_buff *skb, 268 struct sk_buff *skb,
255 const struct dccp_hdr *dh, 269 const struct dccp_hdr *dh,
@@ -286,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
286 goto out_invalid_packet; 300 goto out_invalid_packet;
287 } 301 }
288 302
303 if (dp->dccps_options.dccpo_send_ack_vector &&
304 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
305 DCCP_SKB_CB(skb)->dccpd_seq,
306 DCCP_ACKVEC_STATE_RECEIVED))
307 goto out_invalid_packet; /* FIXME: change error code */
308
289 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; 309 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
290 dccp_update_gsr(sk, dp->dccps_isr); 310 dccp_update_gsr(sk, dp->dccps_isr);
291 /* 311 /*
@@ -309,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
309 goto out_invalid_packet; 329 goto out_invalid_packet;
310 } 330 }
311 331
312 dccp_sync_mss(sk, dp->dccps_pmtu_cookie); 332 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
313 333
314 /* 334 /*
315 * Step 10: Process REQUEST state (second part) 335 * Step 10: Process REQUEST state (second part)
@@ -329,7 +349,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
329 dccp_set_state(sk, DCCP_PARTOPEN); 349 dccp_set_state(sk, DCCP_PARTOPEN);
330 350
331 /* Make sure socket is routed, for correct metrics. */ 351 /* Make sure socket is routed, for correct metrics. */
332 inet_sk_rebuild_header(sk); 352 icsk->icsk_af_ops->rebuild_header(sk);
333 353
334 if (!sock_flag(sk, SOCK_DEAD)) { 354 if (!sock_flag(sk, SOCK_DEAD)) {
335 sk->sk_state_change(sk); 355 sk->sk_state_change(sk);
@@ -398,9 +418,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
398 418
399 if (dh->dccph_type == DCCP_PKT_DATAACK || 419 if (dh->dccph_type == DCCP_PKT_DATAACK ||
400 dh->dccph_type == DCCP_PKT_DATA) { 420 dh->dccph_type == DCCP_PKT_DATA) {
401 dccp_rcv_established(sk, skb, dh, len); 421 __dccp_rcv_established(sk, skb, dh, len);
402 queued = 1; /* packet was queued 422 queued = 1; /* packet was queued
403 (by dccp_rcv_established) */ 423 (by __dccp_rcv_established) */
404 } 424 }
405 break; 425 break;
406 } 426 }
@@ -444,7 +464,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
444 */ 464 */
445 if (sk->sk_state == DCCP_LISTEN) { 465 if (sk->sk_state == DCCP_LISTEN) {
446 if (dh->dccph_type == DCCP_PKT_REQUEST) { 466 if (dh->dccph_type == DCCP_PKT_REQUEST) {
447 if (dccp_v4_conn_request(sk, skb) < 0) 467 if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
468 skb) < 0)
448 return 1; 469 return 1;
449 470
450 /* FIXME: do congestion control initialization */ 471 /* FIXME: do congestion control initialization */
@@ -471,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
471 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 492 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
472 dccp_event_ack_recv(sk, skb); 493 dccp_event_ack_recv(sk, skb);
473 494
474 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
475 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
476
477 if (dp->dccps_options.dccpo_send_ack_vector && 495 if (dp->dccps_options.dccpo_send_ack_vector &&
478 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, 496 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
479 DCCP_SKB_CB(skb)->dccpd_seq, 497 DCCP_SKB_CB(skb)->dccpd_seq,
480 DCCP_ACKVEC_STATE_RECEIVED)) 498 DCCP_ACKVEC_STATE_RECEIVED))
481 goto discard; 499 goto discard;
500
501 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
502 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
482 } 503 }
483 504
484 /* 505 /*
@@ -566,3 +587,5 @@ discard:
566 } 587 }
567 return 0; 588 return 0;
568} 589}
590
591EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 656e13e38cfb..3f244670764a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -19,7 +19,9 @@
19 19
20#include <net/icmp.h> 20#include <net/icmp.h>
21#include <net/inet_hashtables.h> 21#include <net/inet_hashtables.h>
22#include <net/inet_sock.h>
22#include <net/sock.h> 23#include <net/sock.h>
24#include <net/timewait_sock.h>
23#include <net/tcp_states.h> 25#include <net/tcp_states.h>
24#include <net/xfrm.h> 26#include <net/xfrm.h>
25 27
@@ -37,7 +39,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
37 39
38static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) 40static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
39{ 41{
40 return inet_csk_get_port(&dccp_hashinfo, sk, snum); 42 return inet_csk_get_port(&dccp_hashinfo, sk, snum,
43 inet_csk_bind_conflict);
41} 44}
42 45
43static void dccp_v4_hash(struct sock *sk) 46static void dccp_v4_hash(struct sock *sk)
@@ -45,171 +48,14 @@ static void dccp_v4_hash(struct sock *sk)
45 inet_hash(&dccp_hashinfo, sk); 48 inet_hash(&dccp_hashinfo, sk);
46} 49}
47 50
48static void dccp_v4_unhash(struct sock *sk) 51void dccp_unhash(struct sock *sk)
49{ 52{
50 inet_unhash(&dccp_hashinfo, sk); 53 inet_unhash(&dccp_hashinfo, sk);
51} 54}
52 55
53/* called with local bh disabled */ 56EXPORT_SYMBOL_GPL(dccp_unhash);
54static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
55 struct inet_timewait_sock **twp)
56{
57 struct inet_sock *inet = inet_sk(sk);
58 const u32 daddr = inet->rcv_saddr;
59 const u32 saddr = inet->daddr;
60 const int dif = sk->sk_bound_dev_if;
61 INET_ADDR_COOKIE(acookie, saddr, daddr)
62 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
63 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
64 struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
65 const struct sock *sk2;
66 const struct hlist_node *node;
67 struct inet_timewait_sock *tw;
68
69 prefetch(head->chain.first);
70 write_lock(&head->lock);
71
72 /* Check TIME-WAIT sockets first. */
73 sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
74 tw = inet_twsk(sk2);
75
76 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
77 goto not_unique;
78 }
79 tw = NULL;
80
81 /* And established part... */
82 sk_for_each(sk2, node, &head->chain) {
83 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
84 goto not_unique;
85 }
86 57
87 /* Must record num and sport now. Otherwise we will see 58int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
88 * in hash table socket with a funny identity. */
89 inet->num = lport;
90 inet->sport = htons(lport);
91 sk->sk_hash = hash;
92 BUG_TRAP(sk_unhashed(sk));
93 __sk_add_node(sk, &head->chain);
94 sock_prot_inc_use(sk->sk_prot);
95 write_unlock(&head->lock);
96
97 if (twp != NULL) {
98 *twp = tw;
99 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
100 } else if (tw != NULL) {
101 /* Silly. Should hash-dance instead... */
102 inet_twsk_deschedule(tw, &dccp_death_row);
103 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
104
105 inet_twsk_put(tw);
106 }
107
108 return 0;
109
110not_unique:
111 write_unlock(&head->lock);
112 return -EADDRNOTAVAIL;
113}
114
115/*
116 * Bind a port for a connect operation and hash it.
117 */
118static int dccp_v4_hash_connect(struct sock *sk)
119{
120 const unsigned short snum = inet_sk(sk)->num;
121 struct inet_bind_hashbucket *head;
122 struct inet_bind_bucket *tb;
123 int ret;
124
125 if (snum == 0) {
126 int low = sysctl_local_port_range[0];
127 int high = sysctl_local_port_range[1];
128 int remaining = (high - low) + 1;
129 int rover = net_random() % (high - low) + low;
130 struct hlist_node *node;
131 struct inet_timewait_sock *tw = NULL;
132
133 local_bh_disable();
134 do {
135 head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
136 dccp_hashinfo.bhash_size)];
137 spin_lock(&head->lock);
138
139 /* Does not bother with rcv_saddr checks,
140 * because the established check is already
141 * unique enough.
142 */
143 inet_bind_bucket_for_each(tb, node, &head->chain) {
144 if (tb->port == rover) {
145 BUG_TRAP(!hlist_empty(&tb->owners));
146 if (tb->fastreuse >= 0)
147 goto next_port;
148 if (!__dccp_v4_check_established(sk,
149 rover,
150 &tw))
151 goto ok;
152 goto next_port;
153 }
154 }
155
156 tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
157 head, rover);
158 if (tb == NULL) {
159 spin_unlock(&head->lock);
160 break;
161 }
162 tb->fastreuse = -1;
163 goto ok;
164
165 next_port:
166 spin_unlock(&head->lock);
167 if (++rover > high)
168 rover = low;
169 } while (--remaining > 0);
170
171 local_bh_enable();
172
173 return -EADDRNOTAVAIL;
174
175ok:
176 /* All locks still held and bhs disabled */
177 inet_bind_hash(sk, tb, rover);
178 if (sk_unhashed(sk)) {
179 inet_sk(sk)->sport = htons(rover);
180 __inet_hash(&dccp_hashinfo, sk, 0);
181 }
182 spin_unlock(&head->lock);
183
184 if (tw != NULL) {
185 inet_twsk_deschedule(tw, &dccp_death_row);
186 inet_twsk_put(tw);
187 }
188
189 ret = 0;
190 goto out;
191 }
192
193 head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
194 dccp_hashinfo.bhash_size)];
195 tb = inet_csk(sk)->icsk_bind_hash;
196 spin_lock_bh(&head->lock);
197 if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
198 __inet_hash(&dccp_hashinfo, sk, 0);
199 spin_unlock_bh(&head->lock);
200 return 0;
201 } else {
202 spin_unlock(&head->lock);
203 /* No definite answer... Walk to established hash table */
204 ret = __dccp_v4_check_established(sk, snum, NULL);
205out:
206 local_bh_enable();
207 return ret;
208 }
209}
210
211static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
212 int addr_len)
213{ 59{
214 struct inet_sock *inet = inet_sk(sk); 60 struct inet_sock *inet = inet_sk(sk);
215 struct dccp_sock *dp = dccp_sk(sk); 61 struct dccp_sock *dp = dccp_sk(sk);
@@ -259,9 +105,9 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
259 inet->dport = usin->sin_port; 105 inet->dport = usin->sin_port;
260 inet->daddr = daddr; 106 inet->daddr = daddr;
261 107
262 dp->dccps_ext_header_len = 0; 108 inet_csk(sk)->icsk_ext_hdr_len = 0;
263 if (inet->opt != NULL) 109 if (inet->opt != NULL)
264 dp->dccps_ext_header_len = inet->opt->optlen; 110 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
265 /* 111 /*
266 * Socket identity is still unknown (sport may be zero). 112 * Socket identity is still unknown (sport may be zero).
267 * However we set state to DCCP_REQUESTING and not releasing socket 113 * However we set state to DCCP_REQUESTING and not releasing socket
@@ -269,7 +115,7 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
269 * complete initialization after this. 115 * complete initialization after this.
270 */ 116 */
271 dccp_set_state(sk, DCCP_REQUESTING); 117 dccp_set_state(sk, DCCP_REQUESTING);
272 err = dccp_v4_hash_connect(sk); 118 err = inet_hash_connect(&dccp_death_row, sk);
273 if (err != 0) 119 if (err != 0)
274 goto failure; 120 goto failure;
275 121
@@ -287,16 +133,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
287 usin->sin_port); 133 usin->sin_port);
288 dccp_update_gss(sk, dp->dccps_iss); 134 dccp_update_gss(sk, dp->dccps_iss);
289 135
290 /*
291 * SWL and AWL are initially adjusted so that they are not less than
292 * the initial Sequence Numbers received and sent, respectively:
293 * SWL := max(GSR + 1 - floor(W/4), ISR),
294 * AWL := max(GSS - W' + 1, ISS).
295 * These adjustments MUST be applied only at the beginning of the
296 * connection.
297 */
298 dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
299
300 inet->id = dp->dccps_iss ^ jiffies; 136 inet->id = dp->dccps_iss ^ jiffies;
301 137
302 err = dccp_connect(sk); 138 err = dccp_connect(sk);
@@ -316,6 +152,8 @@ failure:
316 goto out; 152 goto out;
317} 153}
318 154
155EXPORT_SYMBOL_GPL(dccp_v4_connect);
156
319/* 157/*
320 * This routine does path mtu discovery as defined in RFC1191. 158 * This routine does path mtu discovery as defined in RFC1191.
321 */ 159 */
@@ -354,7 +192,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
354 mtu = dst_mtu(dst); 192 mtu = dst_mtu(dst);
355 193
356 if (inet->pmtudisc != IP_PMTUDISC_DONT && 194 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
357 dp->dccps_pmtu_cookie > mtu) { 195 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
358 dccp_sync_mss(sk, mtu); 196 dccp_sync_mss(sk, mtu);
359 197
360 /* 198 /*
@@ -606,6 +444,17 @@ out:
606 sock_put(sk); 444 sock_put(sk);
607} 445}
608 446
447/* This routine computes an IPv4 DCCP checksum. */
448void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
449{
450 const struct inet_sock *inet = inet_sk(sk);
451 struct dccp_hdr *dh = dccp_hdr(skb);
452
453 dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr);
454}
455
456EXPORT_SYMBOL_GPL(dccp_v4_send_check);
457
609int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) 458int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
610{ 459{
611 struct sk_buff *skb; 460 struct sk_buff *skb;
@@ -641,16 +490,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk,
641 dccp_hdr(skb)->dccph_sport); 490 dccp_hdr(skb)->dccph_sport);
642} 491}
643 492
644static inline int dccp_bad_service_code(const struct sock *sk,
645 const __u32 service)
646{
647 const struct dccp_sock *dp = dccp_sk(sk);
648
649 if (dp->dccps_service == service)
650 return 0;
651 return !dccp_list_has_service(dp->dccps_service_list, service);
652}
653
654int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 493int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
655{ 494{
656 struct inet_request_sock *ireq; 495 struct inet_request_sock *ireq;
@@ -662,7 +501,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
662 const __u32 service = dccp_hdr_request(skb)->dccph_req_service; 501 const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
663 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 502 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
664 __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; 503 __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
665 struct dst_entry *dst = NULL;
666 504
667 /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ 505 /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
668 if (((struct rtable *)skb->dst)->rt_flags & 506 if (((struct rtable *)skb->dst)->rt_flags &
@@ -703,7 +541,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
703 ireq = inet_rsk(req); 541 ireq = inet_rsk(req);
704 ireq->loc_addr = daddr; 542 ireq->loc_addr = daddr;
705 ireq->rmt_addr = saddr; 543 ireq->rmt_addr = saddr;
706 /* FIXME: Merge Aristeu's option parsing code when ready */
707 req->rcv_wnd = 100; /* Fake, option parsing will get the 544 req->rcv_wnd = 100; /* Fake, option parsing will get the
708 right value */ 545 right value */
709 ireq->opt = NULL; 546 ireq->opt = NULL;
@@ -721,23 +558,22 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
721 dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); 558 dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
722 dreq->dreq_service = service; 559 dreq->dreq_service = service;
723 560
724 if (dccp_v4_send_response(sk, req, dst)) 561 if (dccp_v4_send_response(sk, req, NULL))
725 goto drop_and_free; 562 goto drop_and_free;
726 563
727 inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); 564 inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
728 return 0; 565 return 0;
729 566
730drop_and_free: 567drop_and_free:
731 /* 568 reqsk_free(req);
732 * FIXME: should be reqsk_free after implementing req->rsk_ops
733 */
734 __reqsk_free(req);
735drop: 569drop:
736 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); 570 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
737 dcb->dccpd_reset_code = reset_code; 571 dcb->dccpd_reset_code = reset_code;
738 return -1; 572 return -1;
739} 573}
740 574
575EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
576
741/* 577/*
742 * The three way handshake has completed - we got a valid ACK or DATAACK - 578 * The three way handshake has completed - we got a valid ACK or DATAACK -
743 * now create the new socket. 579 * now create the new socket.
@@ -792,6 +628,8 @@ exit:
792 return NULL; 628 return NULL;
793} 629}
794 630
631EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
632
795static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) 633static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
796{ 634{
797 const struct dccp_hdr *dh = dccp_hdr(skb); 635 const struct dccp_hdr *dh = dccp_hdr(skb);
@@ -1011,7 +849,9 @@ discard:
1011 return 0; 849 return 0;
1012} 850}
1013 851
1014static inline int dccp_invalid_packet(struct sk_buff *skb) 852EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
853
854int dccp_invalid_packet(struct sk_buff *skb)
1015{ 855{
1016 const struct dccp_hdr *dh; 856 const struct dccp_hdr *dh;
1017 857
@@ -1065,29 +905,30 @@ static inline int dccp_invalid_packet(struct sk_buff *skb)
1065 return 1; 905 return 1;
1066 } 906 }
1067 907
1068 /* If the header checksum is incorrect, drop packet and return */
1069 if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
1070 skb->nh.iph->daddr) < 0) {
1071 LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
1072 "incorrect\n");
1073 return 1;
1074 }
1075
1076 return 0; 908 return 0;
1077} 909}
1078 910
911EXPORT_SYMBOL_GPL(dccp_invalid_packet);
912
1079/* this is called when real data arrives */ 913/* this is called when real data arrives */
1080int dccp_v4_rcv(struct sk_buff *skb) 914int dccp_v4_rcv(struct sk_buff *skb)
1081{ 915{
1082 const struct dccp_hdr *dh; 916 const struct dccp_hdr *dh;
1083 struct sock *sk; 917 struct sock *sk;
1084 int rc;
1085 918
1086 /* Step 1: Check header basics: */ 919 /* Step 1: Check header basics: */
1087 920
1088 if (dccp_invalid_packet(skb)) 921 if (dccp_invalid_packet(skb))
1089 goto discard_it; 922 goto discard_it;
1090 923
924 /* If the header checksum is incorrect, drop packet and return */
925 if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
926 skb->nh.iph->daddr) < 0) {
927 LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n",
928 __FUNCTION__);
929 goto discard_it;
930 }
931
1091 dh = dccp_hdr(skb); 932 dh = dccp_hdr(skb);
1092 933
1093 DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); 934 DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
@@ -1143,28 +984,10 @@ int dccp_v4_rcv(struct sk_buff *skb)
1143 goto do_time_wait; 984 goto do_time_wait;
1144 } 985 }
1145 986
1146 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 987 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1147 dccp_pr_debug("xfrm4_policy_check failed\n");
1148 goto discard_and_relse; 988 goto discard_and_relse;
1149 }
1150
1151 if (sk_filter(sk, skb, 0)) {
1152 dccp_pr_debug("sk_filter failed\n");
1153 goto discard_and_relse;
1154 }
1155
1156 skb->dev = NULL;
1157
1158 bh_lock_sock(sk);
1159 rc = 0;
1160 if (!sock_owned_by_user(sk))
1161 rc = dccp_v4_do_rcv(sk, skb);
1162 else
1163 sk_add_backlog(sk, skb);
1164 bh_unlock_sock(sk);
1165 989
1166 sock_put(sk); 990 return sk_receive_skb(sk, skb);
1167 return rc;
1168 991
1169no_dccp_socket: 992no_dccp_socket:
1170 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 993 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -1194,9 +1017,23 @@ do_time_wait:
1194 goto no_dccp_socket; 1017 goto no_dccp_socket;
1195} 1018}
1196 1019
1197static int dccp_v4_init_sock(struct sock *sk) 1020struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
1021 .queue_xmit = ip_queue_xmit,
1022 .send_check = dccp_v4_send_check,
1023 .rebuild_header = inet_sk_rebuild_header,
1024 .conn_request = dccp_v4_conn_request,
1025 .syn_recv_sock = dccp_v4_request_recv_sock,
1026 .net_header_len = sizeof(struct iphdr),
1027 .setsockopt = ip_setsockopt,
1028 .getsockopt = ip_getsockopt,
1029 .addr2sockaddr = inet_csk_addr2sockaddr,
1030 .sockaddr_len = sizeof(struct sockaddr_in),
1031};
1032
1033int dccp_v4_init_sock(struct sock *sk)
1198{ 1034{
1199 struct dccp_sock *dp = dccp_sk(sk); 1035 struct dccp_sock *dp = dccp_sk(sk);
1036 struct inet_connection_sock *icsk = inet_csk(sk);
1200 static int dccp_ctl_socket_init = 1; 1037 static int dccp_ctl_socket_init = 1;
1201 1038
1202 dccp_options_init(&dp->dccps_options); 1039 dccp_options_init(&dp->dccps_options);
@@ -1236,9 +1073,11 @@ static int dccp_v4_init_sock(struct sock *sk)
1236 dccp_ctl_socket_init = 0; 1073 dccp_ctl_socket_init = 0;
1237 1074
1238 dccp_init_xmit_timers(sk); 1075 dccp_init_xmit_timers(sk);
1239 inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; 1076 icsk->icsk_rto = DCCP_TIMEOUT_INIT;
1240 sk->sk_state = DCCP_CLOSED; 1077 sk->sk_state = DCCP_CLOSED;
1241 sk->sk_write_space = dccp_write_space; 1078 sk->sk_write_space = dccp_write_space;
1079 icsk->icsk_af_ops = &dccp_ipv4_af_ops;
1080 icsk->icsk_sync_mss = dccp_sync_mss;
1242 dp->dccps_mss_cache = 536; 1081 dp->dccps_mss_cache = 536;
1243 dp->dccps_role = DCCP_ROLE_UNDEFINED; 1082 dp->dccps_role = DCCP_ROLE_UNDEFINED;
1244 dp->dccps_service = DCCP_SERVICE_INVALID_VALUE; 1083 dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
@@ -1246,7 +1085,9 @@ static int dccp_v4_init_sock(struct sock *sk)
1246 return 0; 1085 return 0;
1247} 1086}
1248 1087
1249static int dccp_v4_destroy_sock(struct sock *sk) 1088EXPORT_SYMBOL_GPL(dccp_v4_init_sock);
1089
1090int dccp_v4_destroy_sock(struct sock *sk)
1250{ 1091{
1251 struct dccp_sock *dp = dccp_sk(sk); 1092 struct dccp_sock *dp = dccp_sk(sk);
1252 1093
@@ -1279,6 +1120,8 @@ static int dccp_v4_destroy_sock(struct sock *sk)
1279 return 0; 1120 return 0;
1280} 1121}
1281 1122
1123EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock);
1124
1282static void dccp_v4_reqsk_destructor(struct request_sock *req) 1125static void dccp_v4_reqsk_destructor(struct request_sock *req)
1283{ 1126{
1284 kfree(inet_rsk(req)->opt); 1127 kfree(inet_rsk(req)->opt);
@@ -1293,7 +1136,11 @@ static struct request_sock_ops dccp_request_sock_ops = {
1293 .send_reset = dccp_v4_ctl_send_reset, 1136 .send_reset = dccp_v4_ctl_send_reset,
1294}; 1137};
1295 1138
1296struct proto dccp_v4_prot = { 1139static struct timewait_sock_ops dccp_timewait_sock_ops = {
1140 .twsk_obj_size = sizeof(struct inet_timewait_sock),
1141};
1142
1143struct proto dccp_prot = {
1297 .name = "DCCP", 1144 .name = "DCCP",
1298 .owner = THIS_MODULE, 1145 .owner = THIS_MODULE,
1299 .close = dccp_close, 1146 .close = dccp_close,
@@ -1307,7 +1154,7 @@ struct proto dccp_v4_prot = {
1307 .recvmsg = dccp_recvmsg, 1154 .recvmsg = dccp_recvmsg,
1308 .backlog_rcv = dccp_v4_do_rcv, 1155 .backlog_rcv = dccp_v4_do_rcv,
1309 .hash = dccp_v4_hash, 1156 .hash = dccp_v4_hash,
1310 .unhash = dccp_v4_unhash, 1157 .unhash = dccp_unhash,
1311 .accept = inet_csk_accept, 1158 .accept = inet_csk_accept,
1312 .get_port = dccp_v4_get_port, 1159 .get_port = dccp_v4_get_port,
1313 .shutdown = dccp_shutdown, 1160 .shutdown = dccp_shutdown,
@@ -1316,5 +1163,7 @@ struct proto dccp_v4_prot = {
1316 .max_header = MAX_DCCP_HEADER, 1163 .max_header = MAX_DCCP_HEADER,
1317 .obj_size = sizeof(struct dccp_sock), 1164 .obj_size = sizeof(struct dccp_sock),
1318 .rsk_prot = &dccp_request_sock_ops, 1165 .rsk_prot = &dccp_request_sock_ops,
1319 .twsk_obj_size = sizeof(struct inet_timewait_sock), 1166 .twsk_prot = &dccp_timewait_sock_ops,
1320}; 1167};
1168
1169EXPORT_SYMBOL_GPL(dccp_prot);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
new file mode 100644
index 000000000000..c609dc78f487
--- /dev/null
+++ b/net/dccp/ipv6.c
@@ -0,0 +1,1261 @@
1/*
2 * DCCP over IPv6
3 * Linux INET6 implementation
4 *
5 * Based on net/dccp6/ipv6.c
6 *
7 * Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/random.h>
18#include <linux/xfrm.h>
19
20#include <net/addrconf.h>
21#include <net/inet_common.h>
22#include <net/inet_hashtables.h>
23#include <net/inet_sock.h>
24#include <net/inet6_connection_sock.h>
25#include <net/inet6_hashtables.h>
26#include <net/ip6_route.h>
27#include <net/ipv6.h>
28#include <net/protocol.h>
29#include <net/transp_v6.h>
30#include <net/xfrm.h>
31
32#include "dccp.h"
33#include "ipv6.h"
34
35static void dccp_v6_ctl_send_reset(struct sk_buff *skb);
36static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
37 struct request_sock *req);
38static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb);
39
40static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
41
42static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
43static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
44
45static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
46{
47 return inet_csk_get_port(&dccp_hashinfo, sk, snum,
48 inet6_csk_bind_conflict);
49}
50
51static void dccp_v6_hash(struct sock *sk)
52{
53 if (sk->sk_state != DCCP_CLOSED) {
54 if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
55 dccp_prot.hash(sk);
56 return;
57 }
58 local_bh_disable();
59 __inet6_hash(&dccp_hashinfo, sk);
60 local_bh_enable();
61 }
62}
63
64static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len,
65 struct in6_addr *saddr,
66 struct in6_addr *daddr,
67 unsigned long base)
68{
69 return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base);
70}
71
72static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
73{
74 const struct dccp_hdr *dh = dccp_hdr(skb);
75
76 if (skb->protocol == htons(ETH_P_IPV6))
77 return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32,
78 skb->nh.ipv6h->saddr.s6_addr32,
79 dh->dccph_dport,
80 dh->dccph_sport);
81 else
82 return secure_dccp_sequence_number(skb->nh.iph->daddr,
83 skb->nh.iph->saddr,
84 dh->dccph_dport,
85 dh->dccph_sport);
86}
87
88static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
89 int addr_len)
90{
91 struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
92 struct inet_connection_sock *icsk = inet_csk(sk);
93 struct inet_sock *inet = inet_sk(sk);
94 struct ipv6_pinfo *np = inet6_sk(sk);
95 struct dccp_sock *dp = dccp_sk(sk);
96 struct in6_addr *saddr = NULL, *final_p = NULL, final;
97 struct flowi fl;
98 struct dst_entry *dst;
99 int addr_type;
100 int err;
101
102 dp->dccps_role = DCCP_ROLE_CLIENT;
103
104 if (addr_len < SIN6_LEN_RFC2133)
105 return -EINVAL;
106
107 if (usin->sin6_family != AF_INET6)
108 return -EAFNOSUPPORT;
109
110 memset(&fl, 0, sizeof(fl));
111
112 if (np->sndflow) {
113 fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
114 IP6_ECN_flow_init(fl.fl6_flowlabel);
115 if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
116 struct ip6_flowlabel *flowlabel;
117 flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
118 if (flowlabel == NULL)
119 return -EINVAL;
120 ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
121 fl6_sock_release(flowlabel);
122 }
123 }
124
125 /*
126 * connect() to INADDR_ANY means loopback (BSD'ism).
127 */
128
129 if (ipv6_addr_any(&usin->sin6_addr))
130 usin->sin6_addr.s6_addr[15] = 0x1;
131
132 addr_type = ipv6_addr_type(&usin->sin6_addr);
133
134 if(addr_type & IPV6_ADDR_MULTICAST)
135 return -ENETUNREACH;
136
137 if (addr_type & IPV6_ADDR_LINKLOCAL) {
138 if (addr_len >= sizeof(struct sockaddr_in6) &&
139 usin->sin6_scope_id) {
140 /* If interface is set while binding, indices
141 * must coincide.
142 */
143 if (sk->sk_bound_dev_if &&
144 sk->sk_bound_dev_if != usin->sin6_scope_id)
145 return -EINVAL;
146
147 sk->sk_bound_dev_if = usin->sin6_scope_id;
148 }
149
150 /* Connect to link-local address requires an interface */
151 if (!sk->sk_bound_dev_if)
152 return -EINVAL;
153 }
154
155 ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
156 np->flow_label = fl.fl6_flowlabel;
157
158 /*
159 * DCCP over IPv4
160 */
161
162 if (addr_type == IPV6_ADDR_MAPPED) {
163 u32 exthdrlen = icsk->icsk_ext_hdr_len;
164 struct sockaddr_in sin;
165
166 SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
167
168 if (__ipv6_only_sock(sk))
169 return -ENETUNREACH;
170
171 sin.sin_family = AF_INET;
172 sin.sin_port = usin->sin6_port;
173 sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
174
175 icsk->icsk_af_ops = &dccp_ipv6_mapped;
176 sk->sk_backlog_rcv = dccp_v4_do_rcv;
177
178 err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
179
180 if (err) {
181 icsk->icsk_ext_hdr_len = exthdrlen;
182 icsk->icsk_af_ops = &dccp_ipv6_af_ops;
183 sk->sk_backlog_rcv = dccp_v6_do_rcv;
184 goto failure;
185 } else {
186 ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
187 inet->saddr);
188 ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
189 inet->rcv_saddr);
190 }
191
192 return err;
193 }
194
195 if (!ipv6_addr_any(&np->rcv_saddr))
196 saddr = &np->rcv_saddr;
197
198 fl.proto = IPPROTO_DCCP;
199 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
200 ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
201 fl.oif = sk->sk_bound_dev_if;
202 fl.fl_ip_dport = usin->sin6_port;
203 fl.fl_ip_sport = inet->sport;
204
205 if (np->opt && np->opt->srcrt) {
206 struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
207 ipv6_addr_copy(&final, &fl.fl6_dst);
208 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
209 final_p = &final;
210 }
211
212 err = ip6_dst_lookup(sk, &dst, &fl);
213 if (err)
214 goto failure;
215 if (final_p)
216 ipv6_addr_copy(&fl.fl6_dst, final_p);
217
218 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
219 goto failure;
220
221 if (saddr == NULL) {
222 saddr = &fl.fl6_src;
223 ipv6_addr_copy(&np->rcv_saddr, saddr);
224 }
225
226 /* set the source address */
227 ipv6_addr_copy(&np->saddr, saddr);
228 inet->rcv_saddr = LOOPBACK4_IPV6;
229
230 ip6_dst_store(sk, dst, NULL);
231
232 icsk->icsk_ext_hdr_len = 0;
233 if (np->opt)
234 icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
235 np->opt->opt_nflen);
236
237 inet->dport = usin->sin6_port;
238
239 dccp_set_state(sk, DCCP_REQUESTING);
240 err = inet6_hash_connect(&dccp_death_row, sk);
241 if (err)
242 goto late_failure;
243 /* FIXME */
244#if 0
245 dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32,
246 np->daddr.s6_addr32,
247 inet->sport,
248 inet->dport);
249#endif
250 err = dccp_connect(sk);
251 if (err)
252 goto late_failure;
253
254 return 0;
255
256late_failure:
257 dccp_set_state(sk, DCCP_CLOSED);
258 __sk_dst_reset(sk);
259failure:
260 inet->dport = 0;
261 sk->sk_route_caps = 0;
262 return err;
263}
264
265static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
266 int type, int code, int offset, __u32 info)
267{
268 struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
269 const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
270 struct ipv6_pinfo *np;
271 struct sock *sk;
272 int err;
273 __u64 seq;
274
275 sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport,
276 &hdr->saddr, dh->dccph_sport, skb->dev->ifindex);
277
278 if (sk == NULL) {
279 ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
280 return;
281 }
282
283 if (sk->sk_state == DCCP_TIME_WAIT) {
284 inet_twsk_put((struct inet_timewait_sock *)sk);
285 return;
286 }
287
288 bh_lock_sock(sk);
289 if (sock_owned_by_user(sk))
290 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
291
292 if (sk->sk_state == DCCP_CLOSED)
293 goto out;
294
295 np = inet6_sk(sk);
296
297 if (type == ICMPV6_PKT_TOOBIG) {
298 struct dst_entry *dst = NULL;
299
300 if (sock_owned_by_user(sk))
301 goto out;
302 if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
303 goto out;
304
305 /* icmp should have updated the destination cache entry */
306 dst = __sk_dst_check(sk, np->dst_cookie);
307
308 if (dst == NULL) {
309 struct inet_sock *inet = inet_sk(sk);
310 struct flowi fl;
311
312 /* BUGGG_FUTURE: Again, it is not clear how
313 to handle rthdr case. Ignore this complexity
314 for now.
315 */
316 memset(&fl, 0, sizeof(fl));
317 fl.proto = IPPROTO_DCCP;
318 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
319 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
320 fl.oif = sk->sk_bound_dev_if;
321 fl.fl_ip_dport = inet->dport;
322 fl.fl_ip_sport = inet->sport;
323
324 if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
325 sk->sk_err_soft = -err;
326 goto out;
327 }
328
329 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
330 sk->sk_err_soft = -err;
331 goto out;
332 }
333
334 } else
335 dst_hold(dst);
336
337 if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
338 dccp_sync_mss(sk, dst_mtu(dst));
339 } /* else let the usual retransmit timer handle it */
340 dst_release(dst);
341 goto out;
342 }
343
344 icmpv6_err_convert(type, code, &err);
345
346 seq = DCCP_SKB_CB(skb)->dccpd_seq;
347 /* Might be for an request_sock */
348 switch (sk->sk_state) {
349 struct request_sock *req, **prev;
350 case DCCP_LISTEN:
351 if (sock_owned_by_user(sk))
352 goto out;
353
354 req = inet6_csk_search_req(sk, &prev, dh->dccph_dport,
355 &hdr->daddr, &hdr->saddr,
356 inet6_iif(skb));
357 if (!req)
358 goto out;
359
360 /* ICMPs are not backlogged, hence we cannot get
361 * an established socket here.
362 */
363 BUG_TRAP(req->sk == NULL);
364
365 if (seq != dccp_rsk(req)->dreq_iss) {
366 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
367 goto out;
368 }
369
370 inet_csk_reqsk_queue_drop(sk, req, prev);
371 goto out;
372
373 case DCCP_REQUESTING:
374 case DCCP_RESPOND: /* Cannot happen.
375 It can, it SYNs are crossed. --ANK */
376 if (!sock_owned_by_user(sk)) {
377 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
378 sk->sk_err = err;
379 /*
380 * Wake people up to see the error
381 * (see connect in sock.c)
382 */
383 sk->sk_error_report(sk);
384
385 dccp_done(sk);
386 } else
387 sk->sk_err_soft = err;
388 goto out;
389 }
390
391 if (!sock_owned_by_user(sk) && np->recverr) {
392 sk->sk_err = err;
393 sk->sk_error_report(sk);
394 } else
395 sk->sk_err_soft = err;
396
397out:
398 bh_unlock_sock(sk);
399 sock_put(sk);
400}
401
402
403static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
404 struct dst_entry *dst)
405{
406 struct inet6_request_sock *ireq6 = inet6_rsk(req);
407 struct ipv6_pinfo *np = inet6_sk(sk);
408 struct sk_buff *skb;
409 struct ipv6_txoptions *opt = NULL;
410 struct in6_addr *final_p = NULL, final;
411 struct flowi fl;
412 int err = -1;
413
414 memset(&fl, 0, sizeof(fl));
415 fl.proto = IPPROTO_DCCP;
416 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
417 ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
418 fl.fl6_flowlabel = 0;
419 fl.oif = ireq6->iif;
420 fl.fl_ip_dport = inet_rsk(req)->rmt_port;
421 fl.fl_ip_sport = inet_sk(sk)->sport;
422
423 if (dst == NULL) {
424 opt = np->opt;
425 if (opt == NULL &&
426 np->rxopt.bits.osrcrt == 2 &&
427 ireq6->pktopts) {
428 struct sk_buff *pktopts = ireq6->pktopts;
429 struct inet6_skb_parm *rxopt = IP6CB(pktopts);
430 if (rxopt->srcrt)
431 opt = ipv6_invert_rthdr(sk,
432 (struct ipv6_rt_hdr *)(pktopts->nh.raw +
433 rxopt->srcrt));
434 }
435
436 if (opt && opt->srcrt) {
437 struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
438 ipv6_addr_copy(&final, &fl.fl6_dst);
439 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
440 final_p = &final;
441 }
442
443 err = ip6_dst_lookup(sk, &dst, &fl);
444 if (err)
445 goto done;
446 if (final_p)
447 ipv6_addr_copy(&fl.fl6_dst, final_p);
448 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
449 goto done;
450 }
451
452 skb = dccp_make_response(sk, dst, req);
453 if (skb != NULL) {
454 struct dccp_hdr *dh = dccp_hdr(skb);
455 dh->dccph_checksum = dccp_v6_check(dh, skb->len,
456 &ireq6->loc_addr,
457 &ireq6->rmt_addr,
458 csum_partial((char *)dh,
459 skb->len,
460 skb->csum));
461 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
462 err = ip6_xmit(sk, skb, &fl, opt, 0);
463 if (err == NET_XMIT_CN)
464 err = 0;
465 }
466
467done:
468 if (opt && opt != np->opt)
469 sock_kfree_s(sk, opt, opt->tot_len);
470 return err;
471}
472
473static void dccp_v6_reqsk_destructor(struct request_sock *req)
474{
475 if (inet6_rsk(req)->pktopts != NULL)
476 kfree_skb(inet6_rsk(req)->pktopts);
477}
478
479static struct request_sock_ops dccp6_request_sock_ops = {
480 .family = AF_INET6,
481 .obj_size = sizeof(struct dccp6_request_sock),
482 .rtx_syn_ack = dccp_v6_send_response,
483 .send_ack = dccp_v6_reqsk_send_ack,
484 .destructor = dccp_v6_reqsk_destructor,
485 .send_reset = dccp_v6_ctl_send_reset,
486};
487
488static struct timewait_sock_ops dccp6_timewait_sock_ops = {
489 .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
490};
491
492static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
493{
494 struct ipv6_pinfo *np = inet6_sk(sk);
495 struct dccp_hdr *dh = dccp_hdr(skb);
496
497 dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr,
498 len, IPPROTO_DCCP,
499 csum_partial((char *)dh,
500 dh->dccph_doff << 2,
501 skb->csum));
502}
503
504static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb)
505{
506 struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
507 const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
508 sizeof(struct dccp_hdr_ext) +
509 sizeof(struct dccp_hdr_reset);
510 struct sk_buff *skb;
511 struct flowi fl;
512 u64 seqno;
513
514 if (rxdh->dccph_type == DCCP_PKT_RESET)
515 return;
516
517 if (!ipv6_unicast_destination(rxskb))
518 return;
519
520 /*
521 * We need to grab some memory, and put together an RST,
522 * and then put it into the queue to be sent.
523 */
524
525 skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
526 dccp_hdr_reset_len, GFP_ATOMIC);
527 if (skb == NULL)
528 return;
529
530 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
531 dccp_hdr_reset_len);
532
533 skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
534 dh = dccp_hdr(skb);
535 memset(dh, 0, dccp_hdr_reset_len);
536
537 /* Swap the send and the receive. */
538 dh->dccph_type = DCCP_PKT_RESET;
539 dh->dccph_sport = rxdh->dccph_dport;
540 dh->dccph_dport = rxdh->dccph_sport;
541 dh->dccph_doff = dccp_hdr_reset_len / 4;
542 dh->dccph_x = 1;
543 dccp_hdr_reset(skb)->dccph_reset_code =
544 DCCP_SKB_CB(rxskb)->dccpd_reset_code;
545
546 /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
547 seqno = 0;
548 if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
549 dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
550
551 dccp_hdr_set_seq(dh, seqno);
552 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
553 DCCP_SKB_CB(rxskb)->dccpd_seq);
554
555 memset(&fl, 0, sizeof(fl));
556 ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
557 ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
558 dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
559 sizeof(*dh), IPPROTO_DCCP,
560 skb->csum);
561 fl.proto = IPPROTO_DCCP;
562 fl.oif = inet6_iif(rxskb);
563 fl.fl_ip_dport = dh->dccph_dport;
564 fl.fl_ip_sport = dh->dccph_sport;
565
566 /* sk = NULL, but it is safe for now. RST socket required. */
567 if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
568 if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
569 ip6_xmit(NULL, skb, &fl, NULL, 0);
570 DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
571 DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
572 return;
573 }
574 }
575
576 kfree_skb(skb);
577}
578
579static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb)
580{
581 struct flowi fl;
582 struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
583 const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
584 sizeof(struct dccp_hdr_ext) +
585 sizeof(struct dccp_hdr_ack_bits);
586 struct sk_buff *skb;
587
588 skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
589 dccp_hdr_ack_len, GFP_ATOMIC);
590 if (skb == NULL)
591 return;
592
593 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
594 dccp_hdr_ack_len);
595
596 skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
597 dh = dccp_hdr(skb);
598 memset(dh, 0, dccp_hdr_ack_len);
599
600 /* Build DCCP header and checksum it. */
601 dh->dccph_type = DCCP_PKT_ACK;
602 dh->dccph_sport = rxdh->dccph_dport;
603 dh->dccph_dport = rxdh->dccph_sport;
604 dh->dccph_doff = dccp_hdr_ack_len / 4;
605 dh->dccph_x = 1;
606
607 dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
608 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
609 DCCP_SKB_CB(rxskb)->dccpd_seq);
610
611 memset(&fl, 0, sizeof(fl));
612 ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
613 ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
614
615 /* FIXME: calculate checksum, IPv4 also should... */
616
617 fl.proto = IPPROTO_DCCP;
618 fl.oif = inet6_iif(rxskb);
619 fl.fl_ip_dport = dh->dccph_dport;
620 fl.fl_ip_sport = dh->dccph_sport;
621
622 if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
623 if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
624 ip6_xmit(NULL, skb, &fl, NULL, 0);
625 DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
626 return;
627 }
628 }
629
630 kfree_skb(skb);
631}
632
/*
 * send_ack hook of dccp6_request_sock_ops: acknowledge @skb.
 * @req is not consulted; the reply is derived from the received packet only.
 */
static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
				   struct request_sock *req)
{
	dccp_v6_ctl_send_ack(skb);
}
638
639static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
640{
641 const struct dccp_hdr *dh = dccp_hdr(skb);
642 const struct ipv6hdr *iph = skb->nh.ipv6h;
643 struct sock *nsk;
644 struct request_sock **prev;
645 /* Find possible connection requests. */
646 struct request_sock *req = inet6_csk_search_req(sk, &prev,
647 dh->dccph_sport,
648 &iph->saddr,
649 &iph->daddr,
650 inet6_iif(skb));
651 if (req != NULL)
652 return dccp_check_req(sk, skb, req, prev);
653
654 nsk = __inet6_lookup_established(&dccp_hashinfo,
655 &iph->saddr, dh->dccph_sport,
656 &iph->daddr, ntohs(dh->dccph_dport),
657 inet6_iif(skb));
658
659 if (nsk != NULL) {
660 if (nsk->sk_state != DCCP_TIME_WAIT) {
661 bh_lock_sock(nsk);
662 return nsk;
663 }
664 inet_twsk_put((struct inet_timewait_sock *)nsk);
665 return NULL;
666 }
667
668 return sk;
669}
670
671static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
672{
673 struct inet_request_sock *ireq;
674 struct dccp_sock dp;
675 struct request_sock *req;
676 struct dccp_request_sock *dreq;
677 struct inet6_request_sock *ireq6;
678 struct ipv6_pinfo *np = inet6_sk(sk);
679 const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
680 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
681 __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
682
683 if (skb->protocol == htons(ETH_P_IP))
684 return dccp_v4_conn_request(sk, skb);
685
686 if (!ipv6_unicast_destination(skb))
687 goto drop;
688
689 if (dccp_bad_service_code(sk, service)) {
690 reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
691 goto drop;
692 }
693 /*
694 * There are no SYN attacks on IPv6, yet...
695 */
696 if (inet_csk_reqsk_queue_is_full(sk))
697 goto drop;
698
699 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
700 goto drop;
701
702 req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot);
703 if (req == NULL)
704 goto drop;
705
706 /* FIXME: process options */
707
708 dccp_openreq_init(req, &dp, skb);
709
710 ireq6 = inet6_rsk(req);
711 ireq = inet_rsk(req);
712 ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr);
713 ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr);
714 req->rcv_wnd = 100; /* Fake, option parsing will get the
715 right value */
716 ireq6->pktopts = NULL;
717
718 if (ipv6_opt_accepted(sk, skb) ||
719 np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
720 np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
721 atomic_inc(&skb->users);
722 ireq6->pktopts = skb;
723 }
724 ireq6->iif = sk->sk_bound_dev_if;
725
726 /* So that link locals have meaning */
727 if (!sk->sk_bound_dev_if &&
728 ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
729 ireq6->iif = inet6_iif(skb);
730
731 /*
732 * Step 3: Process LISTEN state
733 *
734 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
735 *
736 * In fact we defer setting S.GSR, S.SWL, S.SWH to
737 * dccp_create_openreq_child.
738 */
739 dreq = dccp_rsk(req);
740 dreq->dreq_isr = dcb->dccpd_seq;
741 dreq->dreq_iss = dccp_v6_init_sequence(sk, skb);
742 dreq->dreq_service = service;
743
744 if (dccp_v6_send_response(sk, req, NULL))
745 goto drop_and_free;
746
747 inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
748 return 0;
749
750drop_and_free:
751 reqsk_free(req);
752drop:
753 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
754 dcb->dccpd_reset_code = reset_code;
755 return -1;
756}
757
758static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
759 struct sk_buff *skb,
760 struct request_sock *req,
761 struct dst_entry *dst)
762{
763 struct inet6_request_sock *ireq6 = inet6_rsk(req);
764 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
765 struct inet_sock *newinet;
766 struct dccp_sock *newdp;
767 struct dccp6_sock *newdp6;
768 struct sock *newsk;
769 struct ipv6_txoptions *opt;
770
771 if (skb->protocol == htons(ETH_P_IP)) {
772 /*
773 * v6 mapped
774 */
775
776 newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
777 if (newsk == NULL)
778 return NULL;
779
780 newdp6 = (struct dccp6_sock *)newsk;
781 newdp = dccp_sk(newsk);
782 newinet = inet_sk(newsk);
783 newinet->pinet6 = &newdp6->inet6;
784 newnp = inet6_sk(newsk);
785
786 memcpy(newnp, np, sizeof(struct ipv6_pinfo));
787
788 ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
789 newinet->daddr);
790
791 ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
792 newinet->saddr);
793
794 ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
795
796 inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
797 newsk->sk_backlog_rcv = dccp_v4_do_rcv;
798 newnp->pktoptions = NULL;
799 newnp->opt = NULL;
800 newnp->mcast_oif = inet6_iif(skb);
801 newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
802
803 /*
804 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
805 * here, dccp_create_openreq_child now does this for us, see the comment in
806 * that function for the gory details. -acme
807 */
808
809 /* It is tricky place. Until this moment IPv4 tcp
810 worked with IPv6 icsk.icsk_af_ops.
811 Sync it now.
812 */
813 dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
814
815 return newsk;
816 }
817
818 opt = np->opt;
819
820 if (sk_acceptq_is_full(sk))
821 goto out_overflow;
822
823 if (np->rxopt.bits.osrcrt == 2 &&
824 opt == NULL && ireq6->pktopts) {
825 struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts);
826 if (rxopt->srcrt)
827 opt = ipv6_invert_rthdr(sk,
828 (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw +
829 rxopt->srcrt));
830 }
831
832 if (dst == NULL) {
833 struct in6_addr *final_p = NULL, final;
834 struct flowi fl;
835
836 memset(&fl, 0, sizeof(fl));
837 fl.proto = IPPROTO_DCCP;
838 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
839 if (opt && opt->srcrt) {
840 struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
841 ipv6_addr_copy(&final, &fl.fl6_dst);
842 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
843 final_p = &final;
844 }
845 ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
846 fl.oif = sk->sk_bound_dev_if;
847 fl.fl_ip_dport = inet_rsk(req)->rmt_port;
848 fl.fl_ip_sport = inet_sk(sk)->sport;
849
850 if (ip6_dst_lookup(sk, &dst, &fl))
851 goto out;
852
853 if (final_p)
854 ipv6_addr_copy(&fl.fl6_dst, final_p);
855
856 if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
857 goto out;
858 }
859
860 newsk = dccp_create_openreq_child(sk, req, skb);
861 if (newsk == NULL)
862 goto out;
863
864 /*
865 * No need to charge this sock to the relevant IPv6 refcnt debug socks
866 * count here, dccp_create_openreq_child now does this for us, see the
867 * comment in that function for the gory details. -acme
868 */
869
870 ip6_dst_store(newsk, dst, NULL);
871 newsk->sk_route_caps = dst->dev->features &
872 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
873
874 newdp6 = (struct dccp6_sock *)newsk;
875 newinet = inet_sk(newsk);
876 newinet->pinet6 = &newdp6->inet6;
877 newdp = dccp_sk(newsk);
878 newnp = inet6_sk(newsk);
879
880 memcpy(newnp, np, sizeof(struct ipv6_pinfo));
881
882 ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
883 ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
884 ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
885 newsk->sk_bound_dev_if = ireq6->iif;
886
887 /* Now IPv6 options...
888
889 First: no IPv4 options.
890 */
891 newinet->opt = NULL;
892
893 /* Clone RX bits */
894 newnp->rxopt.all = np->rxopt.all;
895
896 /* Clone pktoptions received with SYN */
897 newnp->pktoptions = NULL;
898 if (ireq6->pktopts != NULL) {
899 newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
900 kfree_skb(ireq6->pktopts);
901 ireq6->pktopts = NULL;
902 if (newnp->pktoptions)
903 skb_set_owner_r(newnp->pktoptions, newsk);
904 }
905 newnp->opt = NULL;
906 newnp->mcast_oif = inet6_iif(skb);
907 newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
908
909 /* Clone native IPv6 options from listening socket (if any)
910
911 Yes, keeping reference count would be much more clever,
912 but we make one more one thing there: reattach optmem
913 to newsk.
914 */
915 if (opt) {
916 newnp->opt = ipv6_dup_options(newsk, opt);
917 if (opt != np->opt)
918 sock_kfree_s(sk, opt, opt->tot_len);
919 }
920
921 inet_csk(newsk)->icsk_ext_hdr_len = 0;
922 if (newnp->opt)
923 inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
924 newnp->opt->opt_flen);
925
926 dccp_sync_mss(newsk, dst_mtu(dst));
927
928 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
929
930 __inet6_hash(&dccp_hashinfo, newsk);
931 inet_inherit_port(&dccp_hashinfo, sk, newsk);
932
933 return newsk;
934
935out_overflow:
936 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
937out:
938 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
939 if (opt && opt != np->opt)
940 sock_kfree_s(sk, opt, opt->tot_len);
941 dst_release(dst);
942 return NULL;
943}
944
945/* The socket must have it's spinlock held when we get
946 * here.
947 *
948 * We have a potential double-lock case here, so even when
949 * doing backlog processing we use the BH locking scheme.
950 * This is because we cannot sleep with the original spinlock
951 * held.
952 */
953static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
954{
955 struct ipv6_pinfo *np = inet6_sk(sk);
956 struct sk_buff *opt_skb = NULL;
957
958 /* Imagine: socket is IPv6. IPv4 packet arrives,
959 goes to IPv4 receive handler and backlogged.
960 From backlog it always goes here. Kerboom...
961 Fortunately, dccp_rcv_established and rcv_established
962 handle them correctly, but it is not case with
963 dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK
964 */
965
966 if (skb->protocol == htons(ETH_P_IP))
967 return dccp_v4_do_rcv(sk, skb);
968
969 if (sk_filter(sk, skb, 0))
970 goto discard;
971
972 /*
973 * socket locking is here for SMP purposes as backlog rcv
974 * is currently called with bh processing disabled.
975 */
976
977 /* Do Stevens' IPV6_PKTOPTIONS.
978
979 Yes, guys, it is the only place in our code, where we
980 may make it not affecting IPv4.
981 The rest of code is protocol independent,
982 and I do not like idea to uglify IPv4.
983
984 Actually, all the idea behind IPV6_PKTOPTIONS
985 looks not very well thought. For now we latch
986 options, received in the last packet, enqueued
987 by tcp. Feel free to propose better solution.
988 --ANK (980728)
989 */
990 if (np->rxopt.all)
991 opt_skb = skb_clone(skb, GFP_ATOMIC);
992
993 if (sk->sk_state == DCCP_OPEN) { /* Fast path */
994 if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
995 goto reset;
996 return 0;
997 }
998
999 if (sk->sk_state == DCCP_LISTEN) {
1000 struct sock *nsk = dccp_v6_hnd_req(sk, skb);
1001 if (!nsk)
1002 goto discard;
1003
1004 /*
1005 * Queue it on the new socket if the new socket is active,
1006 * otherwise we just shortcircuit this and continue with
1007 * the new socket..
1008 */
1009 if(nsk != sk) {
1010 if (dccp_child_process(sk, nsk, skb))
1011 goto reset;
1012 if (opt_skb)
1013 __kfree_skb(opt_skb);
1014 return 0;
1015 }
1016 }
1017
1018 if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
1019 goto reset;
1020 return 0;
1021
1022reset:
1023 dccp_v6_ctl_send_reset(skb);
1024discard:
1025 if (opt_skb)
1026 __kfree_skb(opt_skb);
1027 kfree_skb(skb);
1028 return 0;
1029}
1030
1031static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
1032{
1033 const struct dccp_hdr *dh;
1034 struct sk_buff *skb = *pskb;
1035 struct sock *sk;
1036
1037 /* Step 1: Check header basics: */
1038
1039 if (dccp_invalid_packet(skb))
1040 goto discard_it;
1041
1042 dh = dccp_hdr(skb);
1043
1044 DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
1045 DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
1046
1047 if (dccp_packet_without_ack(skb))
1048 DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
1049 else
1050 DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
1051
1052 /* Step 2:
1053 * Look up flow ID in table and get corresponding socket */
1054 sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr,
1055 dh->dccph_sport,
1056 &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport),
1057 inet6_iif(skb));
1058 /*
1059 * Step 2:
1060 * If no socket ...
1061 * Generate Reset(No Connection) unless P.type == Reset
1062 * Drop packet and return
1063 */
1064 if (sk == NULL)
1065 goto no_dccp_socket;
1066
1067 /*
1068 * Step 2:
1069 * ... or S.state == TIMEWAIT,
1070 * Generate Reset(No Connection) unless P.type == Reset
1071 * Drop packet and return
1072 */
1073
1074 if (sk->sk_state == DCCP_TIME_WAIT)
1075 goto do_time_wait;
1076
1077 if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
1078 goto discard_and_relse;
1079
1080 return sk_receive_skb(sk, skb) ? -1 : 0;
1081
1082no_dccp_socket:
1083 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
1084 goto discard_it;
1085 /*
1086 * Step 2:
1087 * Generate Reset(No Connection) unless P.type == Reset
1088 * Drop packet and return
1089 */
1090 if (dh->dccph_type != DCCP_PKT_RESET) {
1091 DCCP_SKB_CB(skb)->dccpd_reset_code =
1092 DCCP_RESET_CODE_NO_CONNECTION;
1093 dccp_v6_ctl_send_reset(skb);
1094 }
1095discard_it:
1096
1097 /*
1098 * Discard frame
1099 */
1100
1101 kfree_skb(skb);
1102 return 0;
1103
1104discard_and_relse:
1105 sock_put(sk);
1106 goto discard_it;
1107
1108do_time_wait:
1109 inet_twsk_put((struct inet_timewait_sock *)sk);
1110 goto no_dccp_socket;
1111}
1112
1113static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
1114 .queue_xmit = inet6_csk_xmit,
1115 .send_check = dccp_v6_send_check,
1116 .rebuild_header = inet6_sk_rebuild_header,
1117 .conn_request = dccp_v6_conn_request,
1118 .syn_recv_sock = dccp_v6_request_recv_sock,
1119 .net_header_len = sizeof(struct ipv6hdr),
1120 .setsockopt = ipv6_setsockopt,
1121 .getsockopt = ipv6_getsockopt,
1122 .addr2sockaddr = inet6_csk_addr2sockaddr,
1123 .sockaddr_len = sizeof(struct sockaddr_in6)
1124};
1125
1126/*
1127 * DCCP over IPv4 via INET6 API
1128 */
1129static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
1130 .queue_xmit = ip_queue_xmit,
1131 .send_check = dccp_v4_send_check,
1132 .rebuild_header = inet_sk_rebuild_header,
1133 .conn_request = dccp_v6_conn_request,
1134 .syn_recv_sock = dccp_v6_request_recv_sock,
1135 .net_header_len = sizeof(struct iphdr),
1136 .setsockopt = ipv6_setsockopt,
1137 .getsockopt = ipv6_getsockopt,
1138 .addr2sockaddr = inet6_csk_addr2sockaddr,
1139 .sockaddr_len = sizeof(struct sockaddr_in6)
1140};
1141
1142/* NOTE: A lot of things set to zero explicitly by call to
1143 * sk_alloc() so need not be done here.
1144 */
1145static int dccp_v6_init_sock(struct sock *sk)
1146{
1147 int err = dccp_v4_init_sock(sk);
1148
1149 if (err == 0)
1150 inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
1151
1152 return err;
1153}
1154
/*
 * Socket destructor: run the IPv4 DCCP teardown first, then the generic
 * IPv6 one, and return the latter's result.
 */
static int dccp_v6_destroy_sock(struct sock *sk)
{
	int rc;

	dccp_v4_destroy_sock(sk);
	rc = inet6_destroy_sock(sk);

	return rc;
}
1160
1161static struct proto dccp_v6_prot = {
1162 .name = "DCCPv6",
1163 .owner = THIS_MODULE,
1164 .close = dccp_close,
1165 .connect = dccp_v6_connect,
1166 .disconnect = dccp_disconnect,
1167 .ioctl = dccp_ioctl,
1168 .init = dccp_v6_init_sock,
1169 .setsockopt = dccp_setsockopt,
1170 .getsockopt = dccp_getsockopt,
1171 .sendmsg = dccp_sendmsg,
1172 .recvmsg = dccp_recvmsg,
1173 .backlog_rcv = dccp_v6_do_rcv,
1174 .hash = dccp_v6_hash,
1175 .unhash = dccp_unhash,
1176 .accept = inet_csk_accept,
1177 .get_port = dccp_v6_get_port,
1178 .shutdown = dccp_shutdown,
1179 .destroy = dccp_v6_destroy_sock,
1180 .orphan_count = &dccp_orphan_count,
1181 .max_header = MAX_DCCP_HEADER,
1182 .obj_size = sizeof(struct dccp6_sock),
1183 .rsk_prot = &dccp6_request_sock_ops,
1184 .twsk_prot = &dccp6_timewait_sock_ops,
1185};
1186
1187static struct inet6_protocol dccp_v6_protocol = {
1188 .handler = dccp_v6_rcv,
1189 .err_handler = dccp_v6_err,
1190 .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
1191};
1192
1193static struct proto_ops inet6_dccp_ops = {
1194 .family = PF_INET6,
1195 .owner = THIS_MODULE,
1196 .release = inet6_release,
1197 .bind = inet6_bind,
1198 .connect = inet_stream_connect,
1199 .socketpair = sock_no_socketpair,
1200 .accept = inet_accept,
1201 .getname = inet6_getname,
1202 .poll = dccp_poll,
1203 .ioctl = inet6_ioctl,
1204 .listen = inet_dccp_listen,
1205 .shutdown = inet_shutdown,
1206 .setsockopt = sock_common_setsockopt,
1207 .getsockopt = sock_common_getsockopt,
1208 .sendmsg = inet_sendmsg,
1209 .recvmsg = sock_common_recvmsg,
1210 .mmap = sock_no_mmap,
1211 .sendpage = sock_no_sendpage,
1212};
1213
1214static struct inet_protosw dccp_v6_protosw = {
1215 .type = SOCK_DCCP,
1216 .protocol = IPPROTO_DCCP,
1217 .prot = &dccp_v6_prot,
1218 .ops = &inet6_dccp_ops,
1219 .capability = -1,
1220 .flags = INET_PROTOSW_ICSK,
1221};
1222
1223static int __init dccp_v6_init(void)
1224{
1225 int err = proto_register(&dccp_v6_prot, 1);
1226
1227 if (err != 0)
1228 goto out;
1229
1230 err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
1231 if (err != 0)
1232 goto out_unregister_proto;
1233
1234 inet6_register_protosw(&dccp_v6_protosw);
1235out:
1236 return err;
1237out_unregister_proto:
1238 proto_unregister(&dccp_v6_prot);
1239 goto out;
1240}
1241
1242static void __exit dccp_v6_exit(void)
1243{
1244 inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
1245 inet6_unregister_protosw(&dccp_v6_protosw);
1246 proto_unregister(&dccp_v6_prot);
1247}
1248
1249module_init(dccp_v6_init);
1250module_exit(dccp_v6_exit);
1251
1252/*
1253 * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
1254 * values directly, Also cover the case where the protocol is not specified,
1255 * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
1256 */
1257MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6");
1258MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6");
1259MODULE_LICENSE("GPL");
1260MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
1261MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
new file mode 100644
index 000000000000..e4d4e9309270
--- /dev/null
+++ b/net/dccp/ipv6.h
@@ -0,0 +1,37 @@
1#ifndef _DCCP_IPV6_H
2#define _DCCP_IPV6_H
3/*
4 * net/dccp/ipv6.h
5 *
6 * An implementation of the DCCP protocol
7 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/config.h>
15#include <linux/dccp.h>
16#include <linux/ipv6.h>
17
18struct dccp6_sock {
19 struct dccp_sock dccp;
20 /*
21 * ipv6_pinfo has to be the last member of dccp6_sock,
22 * see inet6_sk_generic.
23 */
24 struct ipv6_pinfo inet6;
25};
26
27struct dccp6_request_sock {
28 struct dccp_request_sock dccp;
29 struct inet6_request_sock inet6;
30};
31
32struct dccp6_timewait_sock {
33 struct inet_timewait_sock inet;
34 struct inet6_timewait_sock tw6;
35};
36
37#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 1393461898bb..29261fc198e7 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = {
40 (unsigned long)&dccp_death_row), 40 (unsigned long)&dccp_death_row),
41}; 41};
42 42
43EXPORT_SYMBOL_GPL(dccp_death_row);
44
43void dccp_time_wait(struct sock *sk, int state, int timeo) 45void dccp_time_wait(struct sock *sk, int state, int timeo)
44{ 46{
45 struct inet_timewait_sock *tw = NULL; 47 struct inet_timewait_sock *tw = NULL;
@@ -50,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
50 if (tw != NULL) { 52 if (tw != NULL) {
51 const struct inet_connection_sock *icsk = inet_csk(sk); 53 const struct inet_connection_sock *icsk = inet_csk(sk);
52 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 54 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
53 55#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
56 if (tw->tw_family == PF_INET6) {
57 const struct ipv6_pinfo *np = inet6_sk(sk);
58 struct inet6_timewait_sock *tw6;
59
60 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
61 tw6 = inet6_twsk((struct sock *)tw);
62 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
63 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
64 tw->tw_ipv6only = np->ipv6only;
65 }
66#endif
54 /* Linkage updates. */ 67 /* Linkage updates. */
55 __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); 68 __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
56 69
@@ -170,6 +183,8 @@ out_free:
170 return newsk; 183 return newsk;
171} 184}
172 185
186EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
187
173/* 188/*
174 * Process an incoming packet for RESPOND sockets represented 189 * Process an incoming packet for RESPOND sockets represented
175 * as an request_sock. 190 * as an request_sock.
@@ -214,7 +229,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
214 goto drop; 229 goto drop;
215 } 230 }
216 231
217 child = dccp_v4_request_recv_sock(sk, skb, req, NULL); 232 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
218 if (child == NULL) 233 if (child == NULL)
219 goto listen_overflow; 234 goto listen_overflow;
220 235
@@ -236,6 +251,8 @@ drop:
236 goto out; 251 goto out;
237} 252}
238 253
254EXPORT_SYMBOL_GPL(dccp_check_req);
255
239/* 256/*
240 * Queue segment on the new socket if the new socket is active, 257 * Queue segment on the new socket if the new socket is active,
241 * otherwise we just shortcircuit this and continue with 258 * otherwise we just shortcircuit this and continue with
@@ -266,3 +283,5 @@ int dccp_child_process(struct sock *parent, struct sock *child,
266 sock_put(child); 283 sock_put(child);
267 return ret; 284 return ret;
268} 285}
286
287EXPORT_SYMBOL_GPL(dccp_child_process);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 74ff87025878..efd7ffb903a1 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -15,6 +15,7 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17 17
18#include <net/inet_sock.h>
18#include <net/sock.h> 19#include <net/sock.h>
19 20
20#include "ackvec.h" 21#include "ackvec.h"
@@ -43,6 +44,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
43{ 44{
44 if (likely(skb != NULL)) { 45 if (likely(skb != NULL)) {
45 const struct inet_sock *inet = inet_sk(sk); 46 const struct inet_sock *inet = inet_sk(sk);
47 const struct inet_connection_sock *icsk = inet_csk(sk);
46 struct dccp_sock *dp = dccp_sk(sk); 48 struct dccp_sock *dp = dccp_sk(sk);
47 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 49 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
48 struct dccp_hdr *dh; 50 struct dccp_hdr *dh;
@@ -108,8 +110,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
108 break; 110 break;
109 } 111 }
110 112
111 dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, 113 icsk->icsk_af_ops->send_check(sk, skb->len, skb);
112 inet->daddr);
113 114
114 if (set_ack) 115 if (set_ack)
115 dccp_event_ack_sent(sk); 116 dccp_event_ack_sent(sk);
@@ -117,7 +118,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
117 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 118 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
118 119
119 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 120 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
120 err = ip_queue_xmit(skb, 0); 121 err = icsk->icsk_af_ops->queue_xmit(skb, 0);
121 if (err <= 0) 122 if (err <= 0)
122 return err; 123 return err;
123 124
@@ -134,20 +135,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
134 135
135unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) 136unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
136{ 137{
138 struct inet_connection_sock *icsk = inet_csk(sk);
137 struct dccp_sock *dp = dccp_sk(sk); 139 struct dccp_sock *dp = dccp_sk(sk);
138 int mss_now; 140 int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
139 141 sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
140 /*
141 * FIXME: we really should be using the af_specific thing to support
142 * IPv6.
143 * mss_now = pmtu - tp->af_specific->net_header_len -
144 * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
145 */
146 mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
147 sizeof(struct dccp_hdr_ext);
148 142
149 /* Now subtract optional transport overhead */ 143 /* Now subtract optional transport overhead */
150 mss_now -= dp->dccps_ext_header_len; 144 mss_now -= icsk->icsk_ext_hdr_len;
151 145
152 /* 146 /*
153 * FIXME: this should come from the CCID infrastructure, where, say, 147 * FIXME: this should come from the CCID infrastructure, where, say,
@@ -160,12 +154,14 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
160 mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; 154 mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
161 155
162 /* And store cached results */ 156 /* And store cached results */
163 dp->dccps_pmtu_cookie = pmtu; 157 icsk->icsk_pmtu_cookie = pmtu;
164 dp->dccps_mss_cache = mss_now; 158 dp->dccps_mss_cache = mss_now;
165 159
166 return mss_now; 160 return mss_now;
167} 161}
168 162
163EXPORT_SYMBOL_GPL(dccp_sync_mss);
164
169void dccp_write_space(struct sock *sk) 165void dccp_write_space(struct sock *sk)
170{ 166{
171 read_lock(&sk->sk_callback_lock); 167 read_lock(&sk->sk_callback_lock);
@@ -266,7 +262,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
266 262
267int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 263int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
268{ 264{
269 if (inet_sk_rebuild_header(sk) != 0) 265 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
270 return -EHOSTUNREACH; /* Routing failure or similar. */ 266 return -EHOSTUNREACH; /* Routing failure or similar. */
271 267
272 return dccp_transmit_skb(sk, (skb_cloned(skb) ? 268 return dccp_transmit_skb(sk, (skb_cloned(skb) ?
@@ -321,6 +317,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
321 return skb; 317 return skb;
322} 318}
323 319
320EXPORT_SYMBOL_GPL(dccp_make_response);
321
324struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, 322struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
325 const enum dccp_reset_codes code) 323 const enum dccp_reset_codes code)
326 324
@@ -377,6 +375,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
377 */ 375 */
378static inline void dccp_connect_init(struct sock *sk) 376static inline void dccp_connect_init(struct sock *sk)
379{ 377{
378 struct dccp_sock *dp = dccp_sk(sk);
380 struct dst_entry *dst = __sk_dst_get(sk); 379 struct dst_entry *dst = __sk_dst_get(sk);
381 struct inet_connection_sock *icsk = inet_csk(sk); 380 struct inet_connection_sock *icsk = inet_csk(sk);
382 381
@@ -385,10 +384,16 @@ static inline void dccp_connect_init(struct sock *sk)
385 384
386 dccp_sync_mss(sk, dst_mtu(dst)); 385 dccp_sync_mss(sk, dst_mtu(dst));
387 386
388 /* 387 dccp_update_gss(sk, dp->dccps_iss);
389 * FIXME: set dp->{dccps_swh,dccps_swl}, with 388 /*
390 * something like dccp_inc_seq 389 * SWL and AWL are initially adjusted so that they are not less than
391 */ 390 * the initial Sequence Numbers received and sent, respectively:
391 * SWL := max(GSR + 1 - floor(W/4), ISR),
392 * AWL := max(GSS - W' + 1, ISS).
393 * These adjustments MUST be applied only at the beginning of the
394 * connection.
395 */
396 dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
392 397
393 icsk->icsk_retransmits = 0; 398 icsk->icsk_retransmits = 0;
394} 399}
@@ -420,6 +425,8 @@ int dccp_connect(struct sock *sk)
420 return 0; 425 return 0;
421} 426}
422 427
428EXPORT_SYMBOL_GPL(dccp_connect);
429
423void dccp_send_ack(struct sock *sk) 430void dccp_send_ack(struct sock *sk)
424{ 431{
425 /* If we have been reset, we may not send again. */ 432 /* If we have been reset, we may not send again. */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8a6b2a9e4581..65b11ea90d85 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,7 +24,7 @@
24#include <net/checksum.h> 24#include <net/checksum.h>
25 25
26#include <net/inet_common.h> 26#include <net/inet_common.h>
27#include <net/ip.h> 27#include <net/inet_sock.h>
28#include <net/protocol.h> 28#include <net/protocol.h>
29#include <net/sock.h> 29#include <net/sock.h>
30#include <net/xfrm.h> 30#include <net/xfrm.h>
@@ -34,15 +34,18 @@
34#include <linux/timer.h> 34#include <linux/timer.h>
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/poll.h> 36#include <linux/poll.h>
37#include <linux/dccp.h>
38 37
39#include "ccid.h" 38#include "ccid.h"
40#include "dccp.h" 39#include "dccp.h"
41 40
42DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; 41DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
43 42
43EXPORT_SYMBOL_GPL(dccp_statistics);
44
44atomic_t dccp_orphan_count = ATOMIC_INIT(0); 45atomic_t dccp_orphan_count = ATOMIC_INIT(0);
45 46
47EXPORT_SYMBOL_GPL(dccp_orphan_count);
48
46static struct net_protocol dccp_protocol = { 49static struct net_protocol dccp_protocol = {
47 .handler = dccp_v4_rcv, 50 .handler = dccp_v4_rcv,
48 .err_handler = dccp_v4_err, 51 .err_handler = dccp_v4_err,
@@ -149,6 +152,8 @@ int dccp_disconnect(struct sock *sk, int flags)
149 return err; 152 return err;
150} 153}
151 154
155EXPORT_SYMBOL_GPL(dccp_disconnect);
156
152/* 157/*
153 * Wait for a DCCP event. 158 * Wait for a DCCP event.
154 * 159 *
@@ -156,8 +161,8 @@ int dccp_disconnect(struct sock *sk, int flags)
156 * take care of normal races (between the test and the event) and we don't 161 * take care of normal races (between the test and the event) and we don't
157 * go look at any of the socket buffers directly. 162 * go look at any of the socket buffers directly.
158 */ 163 */
159static unsigned int dccp_poll(struct file *file, struct socket *sock, 164unsigned int dccp_poll(struct file *file, struct socket *sock,
160 poll_table *wait) 165 poll_table *wait)
161{ 166{
162 unsigned int mask; 167 unsigned int mask;
163 struct sock *sk = sock->sk; 168 struct sock *sk = sock->sk;
@@ -205,12 +210,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock,
205 return mask; 210 return mask;
206} 211}
207 212
213EXPORT_SYMBOL_GPL(dccp_poll);
214
208int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) 215int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
209{ 216{
210 dccp_pr_debug("entry\n"); 217 dccp_pr_debug("entry\n");
211 return -ENOIOCTLCMD; 218 return -ENOIOCTLCMD;
212} 219}
213 220
221EXPORT_SYMBOL_GPL(dccp_ioctl);
222
214static int dccp_setsockopt_service(struct sock *sk, const u32 service, 223static int dccp_setsockopt_service(struct sock *sk, const u32 service,
215 char __user *optval, int optlen) 224 char __user *optval, int optlen)
216{ 225{
@@ -254,7 +263,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
254 int val; 263 int val;
255 264
256 if (level != SOL_DCCP) 265 if (level != SOL_DCCP)
257 return ip_setsockopt(sk, level, optname, optval, optlen); 266 return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
267 optname, optval,
268 optlen);
258 269
259 if (optlen < sizeof(int)) 270 if (optlen < sizeof(int))
260 return -EINVAL; 271 return -EINVAL;
@@ -282,6 +293,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
282 return err; 293 return err;
283} 294}
284 295
296EXPORT_SYMBOL_GPL(dccp_setsockopt);
297
285static int dccp_getsockopt_service(struct sock *sk, int len, 298static int dccp_getsockopt_service(struct sock *sk, int len,
286 u32 __user *optval, 299 u32 __user *optval,
287 int __user *optlen) 300 int __user *optlen)
@@ -320,8 +333,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
320 int val, len; 333 int val, len;
321 334
322 if (level != SOL_DCCP) 335 if (level != SOL_DCCP)
323 return ip_getsockopt(sk, level, optname, optval, optlen); 336 return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
324 337 optname, optval,
338 optlen);
325 if (get_user(len, optlen)) 339 if (get_user(len, optlen))
326 return -EFAULT; 340 return -EFAULT;
327 341
@@ -354,6 +368,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
354 return 0; 368 return 0;
355} 369}
356 370
371EXPORT_SYMBOL_GPL(dccp_getsockopt);
372
357int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 373int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
358 size_t len) 374 size_t len)
359{ 375{
@@ -410,6 +426,8 @@ out_discard:
410 goto out_release; 426 goto out_release;
411} 427}
412 428
429EXPORT_SYMBOL_GPL(dccp_sendmsg);
430
413int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 431int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
414 size_t len, int nonblock, int flags, int *addr_len) 432 size_t len, int nonblock, int flags, int *addr_len)
415{ 433{
@@ -507,7 +525,9 @@ out:
507 return len; 525 return len;
508} 526}
509 527
510static int inet_dccp_listen(struct socket *sock, int backlog) 528EXPORT_SYMBOL_GPL(dccp_recvmsg);
529
530int inet_dccp_listen(struct socket *sock, int backlog)
511{ 531{
512 struct sock *sk = sock->sk; 532 struct sock *sk = sock->sk;
513 unsigned char old_state; 533 unsigned char old_state;
@@ -543,6 +563,8 @@ out:
543 return err; 563 return err;
544} 564}
545 565
566EXPORT_SYMBOL_GPL(inet_dccp_listen);
567
546static const unsigned char dccp_new_state[] = { 568static const unsigned char dccp_new_state[] = {
547 /* current state: new state: action: */ 569 /* current state: new state: action: */
548 [0] = DCCP_CLOSED, 570 [0] = DCCP_CLOSED,
@@ -648,12 +670,16 @@ adjudge_to_death:
648 sock_put(sk); 670 sock_put(sk);
649} 671}
650 672
673EXPORT_SYMBOL_GPL(dccp_close);
674
651void dccp_shutdown(struct sock *sk, int how) 675void dccp_shutdown(struct sock *sk, int how)
652{ 676{
653 dccp_pr_debug("entry\n"); 677 dccp_pr_debug("entry\n");
654} 678}
655 679
656static struct proto_ops inet_dccp_ops = { 680EXPORT_SYMBOL_GPL(dccp_shutdown);
681
682static const struct proto_ops inet_dccp_ops = {
657 .family = PF_INET, 683 .family = PF_INET,
658 .owner = THIS_MODULE, 684 .owner = THIS_MODULE,
659 .release = inet_release, 685 .release = inet_release,
@@ -681,11 +707,11 @@ extern struct net_proto_family inet_family_ops;
681static struct inet_protosw dccp_v4_protosw = { 707static struct inet_protosw dccp_v4_protosw = {
682 .type = SOCK_DCCP, 708 .type = SOCK_DCCP,
683 .protocol = IPPROTO_DCCP, 709 .protocol = IPPROTO_DCCP,
684 .prot = &dccp_v4_prot, 710 .prot = &dccp_prot,
685 .ops = &inet_dccp_ops, 711 .ops = &inet_dccp_ops,
686 .capability = -1, 712 .capability = -1,
687 .no_check = 0, 713 .no_check = 0,
688 .flags = 0, 714 .flags = INET_PROTOSW_ICSK,
689}; 715};
690 716
691/* 717/*
@@ -760,13 +786,15 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
760int dccp_debug; 786int dccp_debug;
761module_param(dccp_debug, int, 0444); 787module_param(dccp_debug, int, 0444);
762MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); 788MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
789
790EXPORT_SYMBOL_GPL(dccp_debug);
763#endif 791#endif
764 792
765static int __init dccp_init(void) 793static int __init dccp_init(void)
766{ 794{
767 unsigned long goal; 795 unsigned long goal;
768 int ehash_order, bhash_order, i; 796 int ehash_order, bhash_order, i;
769 int rc = proto_register(&dccp_v4_prot, 1); 797 int rc = proto_register(&dccp_prot, 1);
770 798
771 if (rc) 799 if (rc)
772 goto out; 800 goto out;
@@ -869,7 +897,7 @@ out_free_bind_bucket_cachep:
869 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); 897 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
870 dccp_hashinfo.bind_bucket_cachep = NULL; 898 dccp_hashinfo.bind_bucket_cachep = NULL;
871out_proto_unregister: 899out_proto_unregister:
872 proto_unregister(&dccp_v4_prot); 900 proto_unregister(&dccp_prot);
873 goto out; 901 goto out;
874} 902}
875 903
@@ -892,7 +920,7 @@ static void __exit dccp_fini(void)
892 get_order(dccp_hashinfo.ehash_size * 920 get_order(dccp_hashinfo.ehash_size *
893 sizeof(struct inet_ehash_bucket))); 921 sizeof(struct inet_ehash_bucket)));
894 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); 922 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
895 proto_unregister(&dccp_v4_prot); 923 proto_unregister(&dccp_prot);
896} 924}
897 925
898module_init(dccp_init); 926module_init(dccp_init);
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index d402e9020c68..78ec5344be86 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk);
149#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) 149#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
150 150
151 151
152static struct proto_ops dn_proto_ops; 152static const struct proto_ops dn_proto_ops;
153static DEFINE_RWLOCK(dn_hash_lock); 153static DEFINE_RWLOCK(dn_hash_lock);
154static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; 154static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
155static struct hlist_head dn_wild_sk; 155static struct hlist_head dn_wild_sk;
@@ -1252,7 +1252,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1252 break; 1252 break;
1253 1253
1254 default: 1254 default:
1255 err = dev_ioctl(cmd, (void __user *)arg); 1255 err = -ENOIOCTLCMD;
1256 break; 1256 break;
1257 } 1257 }
1258 1258
@@ -2342,7 +2342,7 @@ static struct net_proto_family dn_family_ops = {
2342 .owner = THIS_MODULE, 2342 .owner = THIS_MODULE,
2343}; 2343};
2344 2344
2345static struct proto_ops dn_proto_ops = { 2345static const struct proto_ops dn_proto_ops = {
2346 .family = AF_DECnet, 2346 .family = AF_DECnet,
2347 .owner = THIS_MODULE, 2347 .owner = THIS_MODULE,
2348 .release = dn_release, 2348 .release = dn_release,
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 8d0cc3cf3e49..33ab256cfd4a 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb)
408 } 408 }
409 } 409 }
410 410
411 if (!dn_db->router) { 411 /* Only use routers in our area */
412 dn_db->router = neigh_clone(neigh); 412 if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) {
413 } else { 413 if (!dn_db->router) {
414 if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) 414 dn_db->router = neigh_clone(neigh);
415 neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); 415 } else {
416 if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
417 neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
418 }
416 } 419 }
417 write_unlock(&neigh->lock); 420 write_unlock(&neigh->lock);
418 neigh_release(neigh); 421 neigh_release(neigh);
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 369f25b60f3f..44bda85e678f 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb)
793got_it: 793got_it:
794 if (sk != NULL) { 794 if (sk != NULL) {
795 struct dn_scp *scp = DN_SK(sk); 795 struct dn_scp *scp = DN_SK(sk);
796 int ret;
797 796
798 /* Reset backoff */ 797 /* Reset backoff */
799 scp->nsp_rxtshift = 0; 798 scp->nsp_rxtshift = 0;
@@ -807,21 +806,7 @@ got_it:
807 goto free_out; 806 goto free_out;
808 } 807 }
809 808
810 bh_lock_sock(sk); 809 return sk_receive_skb(sk, skb);
811 ret = NET_RX_SUCCESS;
812 if (decnet_debug_level & 8)
813 printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n",
814 (int)cb->rt_flags, (int)cb->nsp_flags,
815 (int)cb->src_port, (int)cb->dst_port,
816 !!sock_owned_by_user(sk));
817 if (!sock_owned_by_user(sk))
818 ret = dn_nsp_backlog_rcv(sk, skb);
819 else
820 sk_add_backlog(sk, skb);
821 bh_unlock_sock(sk);
822 sock_put(sk);
823
824 return ret;
825 } 810 }
826 811
827 return dn_nsp_no_socket(skb, reason); 812 return dn_nsp_no_socket(skb, reason);
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 34fdac51df96..c792994d7952 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -31,6 +31,7 @@
31#include <linux/if_arp.h> 31#include <linux/if_arp.h>
32#include <linux/wireless.h> 32#include <linux/wireless.h>
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <linux/udp.h>
34#include <net/sock.h> 35#include <net/sock.h>
35#include <net/inet_common.h> 36#include <net/inet_common.h>
36#include <linux/stat.h> 37#include <linux/stat.h>
@@ -45,7 +46,7 @@
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46#include <asm/system.h> 47#include <asm/system.h>
47 48
48static struct proto_ops econet_ops; 49static const struct proto_ops econet_ops;
49static struct hlist_head econet_sklist; 50static struct hlist_head econet_sklist;
50static DEFINE_RWLOCK(econet_lock); 51static DEFINE_RWLOCK(econet_lock);
51 52
@@ -56,7 +57,7 @@ static struct net_device *net2dev_map[256];
56#define EC_PORT_IP 0xd2 57#define EC_PORT_IP 0xd2
57 58
58#ifdef CONFIG_ECONET_AUNUDP 59#ifdef CONFIG_ECONET_AUNUDP
59static spinlock_t aun_queue_lock; 60static DEFINE_SPINLOCK(aun_queue_lock);
60static struct socket *udpsock; 61static struct socket *udpsock;
61#define AUN_PORT 0x8000 62#define AUN_PORT 0x8000
62 63
@@ -686,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg
686 break; 687 break;
687 688
688 default: 689 default:
689 return dev_ioctl(cmd, argp); 690 return -ENOIOCTLCMD;
690 } 691 }
691 /*NOTREACHED*/ 692 /*NOTREACHED*/
692 return 0; 693 return 0;
@@ -698,7 +699,7 @@ static struct net_proto_family econet_family_ops = {
698 .owner = THIS_MODULE, 699 .owner = THIS_MODULE,
699}; 700};
700 701
701static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { 702static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
702 .family = PF_ECONET, 703 .family = PF_ECONET,
703 .owner = THIS_MODULE, 704 .owner = THIS_MODULE,
704 .release = econet_release, 705 .release = econet_release,
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index 03efaacbdb73..4cc6f41c6930 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
410 return 1; 410 return 1;
411 } 411 }
412 412
413 if ((is_multicast_ether_addr(hdr->addr1) || 413 if (is_multicast_ether_addr(hdr->addr1)
414 is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt : 414 ? ieee->host_mc_decrypt : ieee->host_decrypt) {
415 ieee->host_decrypt) {
416 int idx = 0; 415 int idx = 0;
417 if (skb->len >= hdrlen + 3) 416 if (skb->len >= hdrlen + 3)
418 idx = skb->data[hdrlen + 3] >> 6; 417 idx = skb->data[hdrlen + 3] >> 6;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e55136ae09f4..011cca7ae02b 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -456,6 +456,14 @@ config TCP_CONG_BIC
456 increase provides TCP friendliness. 456 increase provides TCP friendliness.
457 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ 457 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
458 458
459config TCP_CONG_CUBIC
460 tristate "CUBIC TCP"
461 default m
462 ---help---
463 This is version 2.0 of BIC-TCP which uses a cubic growth function
464 among other techniques.
465 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
466
459config TCP_CONG_WESTWOOD 467config TCP_CONG_WESTWOOD
460 tristate "TCP Westwood+" 468 tristate "TCP Westwood+"
461 default m 469 default m
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f0435d00db6b..c54edd76de09 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
34obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 34obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
35obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 35obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
36obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 36obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
37obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
37obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o 38obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
38obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o 39obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
39obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o 40obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d368cf249000..966a071a408c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -93,6 +93,7 @@
93#include <linux/smp_lock.h> 93#include <linux/smp_lock.h>
94#include <linux/inet.h> 94#include <linux/inet.h>
95#include <linux/igmp.h> 95#include <linux/igmp.h>
96#include <linux/inetdevice.h>
96#include <linux/netdevice.h> 97#include <linux/netdevice.h>
97#include <net/ip.h> 98#include <net/ip.h>
98#include <net/protocol.h> 99#include <net/protocol.h>
@@ -302,6 +303,7 @@ lookup_protocol:
302 sk->sk_reuse = 1; 303 sk->sk_reuse = 1;
303 304
304 inet = inet_sk(sk); 305 inet = inet_sk(sk);
306 inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
305 307
306 if (SOCK_RAW == sock->type) { 308 if (SOCK_RAW == sock->type) {
307 inet->num = protocol; 309 inet->num = protocol;
@@ -775,16 +777,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
775 err = devinet_ioctl(cmd, (void __user *)arg); 777 err = devinet_ioctl(cmd, (void __user *)arg);
776 break; 778 break;
777 default: 779 default:
778 if (!sk->sk_prot->ioctl || 780 if (sk->sk_prot->ioctl)
779 (err = sk->sk_prot->ioctl(sk, cmd, arg)) == 781 err = sk->sk_prot->ioctl(sk, cmd, arg);
780 -ENOIOCTLCMD) 782 else
781 err = dev_ioctl(cmd, (void __user *)arg); 783 err = -ENOIOCTLCMD;
782 break; 784 break;
783 } 785 }
784 return err; 786 return err;
785} 787}
786 788
787struct proto_ops inet_stream_ops = { 789const struct proto_ops inet_stream_ops = {
788 .family = PF_INET, 790 .family = PF_INET,
789 .owner = THIS_MODULE, 791 .owner = THIS_MODULE,
790 .release = inet_release, 792 .release = inet_release,
@@ -805,7 +807,7 @@ struct proto_ops inet_stream_ops = {
805 .sendpage = tcp_sendpage 807 .sendpage = tcp_sendpage
806}; 808};
807 809
808struct proto_ops inet_dgram_ops = { 810const struct proto_ops inet_dgram_ops = {
809 .family = PF_INET, 811 .family = PF_INET,
810 .owner = THIS_MODULE, 812 .owner = THIS_MODULE,
811 .release = inet_release, 813 .release = inet_release,
@@ -830,7 +832,7 @@ struct proto_ops inet_dgram_ops = {
830 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without 832 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
831 * udp_poll 833 * udp_poll
832 */ 834 */
833static struct proto_ops inet_sockraw_ops = { 835static const struct proto_ops inet_sockraw_ops = {
834 .family = PF_INET, 836 .family = PF_INET,
835 .owner = THIS_MODULE, 837 .owner = THIS_MODULE,
836 .release = inet_release, 838 .release = inet_release,
@@ -869,7 +871,8 @@ static struct inet_protosw inetsw_array[] =
869 .ops = &inet_stream_ops, 871 .ops = &inet_stream_ops,
870 .capability = -1, 872 .capability = -1,
871 .no_check = 0, 873 .no_check = 0,
872 .flags = INET_PROTOSW_PERMANENT, 874 .flags = INET_PROTOSW_PERMANENT |
875 INET_PROTOSW_ICSK,
873 }, 876 },
874 877
875 { 878 {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 035ad2c9e1ba..aed537fa2c88 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -6,6 +6,7 @@
6#include <linux/crypto.h> 6#include <linux/crypto.h>
7#include <linux/pfkeyv2.h> 7#include <linux/pfkeyv2.h>
8#include <net/icmp.h> 8#include <net/icmp.h>
9#include <net/protocol.h>
9#include <asm/scatterlist.h> 10#include <asm/scatterlist.h>
10 11
11 12
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index b425748f02d7..37432088fe6d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -86,6 +86,7 @@
86#include <linux/in.h> 86#include <linux/in.h>
87#include <linux/mm.h> 87#include <linux/mm.h>
88#include <linux/inet.h> 88#include <linux/inet.h>
89#include <linux/inetdevice.h>
89#include <linux/netdevice.h> 90#include <linux/netdevice.h>
90#include <linux/etherdevice.h> 91#include <linux/etherdevice.h>
91#include <linux/fddidevice.h> 92#include <linux/fddidevice.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 04a6fe3e95a2..7b9bb28e2ee9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -58,6 +58,7 @@
58#endif 58#endif
59#include <linux/kmod.h> 59#include <linux/kmod.h>
60 60
61#include <net/arp.h>
61#include <net/ip.h> 62#include <net/ip.h>
62#include <net/route.h> 63#include <net/route.h>
63#include <net/ip_fib.h> 64#include <net/ip_fib.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1b18ce66e7b7..73bfcae8af9c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -9,6 +9,7 @@
9#include <linux/pfkeyv2.h> 9#include <linux/pfkeyv2.h>
10#include <linux/random.h> 10#include <linux/random.h>
11#include <net/icmp.h> 11#include <net/icmp.h>
12#include <net/protocol.h>
12#include <net/udp.h> 13#include <net/udp.h>
13 14
14/* decapsulation data for use when post-processing */ 15/* decapsulation data for use when post-processing */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 19b1b984d687..18f5e509281a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -30,6 +30,7 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/in.h> 31#include <linux/in.h>
32#include <linux/inet.h> 32#include <linux/inet.h>
33#include <linux/inetdevice.h>
33#include <linux/netdevice.h> 34#include <linux/netdevice.h>
34#include <linux/if_arp.h> 35#include <linux/if_arp.h>
35#include <linux/skbuff.h> 36#include <linux/skbuff.h>
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 7ea0209cb169..e2890ec8159e 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -29,6 +29,7 @@
29#include <linux/errno.h> 29#include <linux/errno.h>
30#include <linux/in.h> 30#include <linux/in.h>
31#include <linux/inet.h> 31#include <linux/inet.h>
32#include <linux/inetdevice.h>
32#include <linux/netdevice.h> 33#include <linux/netdevice.h>
33#include <linux/if_arp.h> 34#include <linux/if_arp.h>
34#include <linux/proc_fs.h> 35#include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 0b298bbc1518..0dd4d06e456d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -33,6 +33,7 @@
33#include <linux/errno.h> 33#include <linux/errno.h>
34#include <linux/in.h> 34#include <linux/in.h>
35#include <linux/inet.h> 35#include <linux/inet.h>
36#include <linux/inetdevice.h>
36#include <linux/netdevice.h> 37#include <linux/netdevice.h>
37#include <linux/if_arp.h> 38#include <linux/if_arp.h>
38#include <linux/proc_fs.h> 39#include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6d2a6ac070e3..ef4724de7350 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -29,6 +29,7 @@
29#include <linux/errno.h> 29#include <linux/errno.h>
30#include <linux/in.h> 30#include <linux/in.h>
31#include <linux/inet.h> 31#include <linux/inet.h>
32#include <linux/inetdevice.h>
32#include <linux/netdevice.h> 33#include <linux/netdevice.h>
33#include <linux/if_arp.h> 34#include <linux/if_arp.h>
34#include <linux/proc_fs.h> 35#include <linux/proc_fs.h>
@@ -36,6 +37,7 @@
36#include <linux/netlink.h> 37#include <linux/netlink.h>
37#include <linux/init.h> 38#include <linux/init.h>
38 39
40#include <net/arp.h>
39#include <net/ip.h> 41#include <net/ip.h>
40#include <net/protocol.h> 42#include <net/protocol.h>
41#include <net/route.h> 43#include <net/route.h>
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 705e3ce86df9..e320b32373e5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -41,6 +41,13 @@
41 * modify it under the terms of the GNU General Public License 41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version 42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
44 */ 51 */
45 52
46#define VERSION "0.404" 53#define VERSION "0.404"
@@ -59,6 +66,7 @@
59#include <linux/errno.h> 66#include <linux/errno.h>
60#include <linux/in.h> 67#include <linux/in.h>
61#include <linux/inet.h> 68#include <linux/inet.h>
69#include <linux/inetdevice.h>
62#include <linux/netdevice.h> 70#include <linux/netdevice.h>
63#include <linux/if_arp.h> 71#include <linux/if_arp.h>
64#include <linux/proc_fs.h> 72#include <linux/proc_fs.h>
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 92e23b2ad4d2..be5a519cd2f8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -73,6 +73,7 @@
73#include <linux/socket.h> 73#include <linux/socket.h>
74#include <linux/in.h> 74#include <linux/in.h>
75#include <linux/inet.h> 75#include <linux/inet.h>
76#include <linux/inetdevice.h>
76#include <linux/netdevice.h> 77#include <linux/netdevice.h>
77#include <linux/string.h> 78#include <linux/string.h>
78#include <linux/netfilter_ipv4.h> 79#include <linux/netfilter_ipv4.h>
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4a195c724f01..34758118c10c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -91,6 +91,8 @@
91#include <linux/if_arp.h> 91#include <linux/if_arp.h>
92#include <linux/rtnetlink.h> 92#include <linux/rtnetlink.h>
93#include <linux/times.h> 93#include <linux/times.h>
94
95#include <net/arp.h>
94#include <net/ip.h> 96#include <net/ip.h>
95#include <net/protocol.h> 97#include <net/protocol.h>
96#include <net/route.h> 98#include <net/route.h>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3fe021f1a566..ae20281d8deb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
37 */ 37 */
38int sysctl_local_port_range[2] = { 1024, 4999 }; 38int sysctl_local_port_range[2] = { 1024, 4999 };
39 39
40static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) 40int inet_csk_bind_conflict(const struct sock *sk,
41 const struct inet_bind_bucket *tb)
41{ 42{
42 const u32 sk_rcv_saddr = inet_rcv_saddr(sk); 43 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
43 struct sock *sk2; 44 struct sock *sk2;
@@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke
62 return node != NULL; 63 return node != NULL;
63} 64}
64 65
66EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
67
65/* Obtain a reference to a local port for the given sock, 68/* Obtain a reference to a local port for the given sock,
66 * if snum is zero it means select any available local port. 69 * if snum is zero it means select any available local port.
67 */ 70 */
68int inet_csk_get_port(struct inet_hashinfo *hashinfo, 71int inet_csk_get_port(struct inet_hashinfo *hashinfo,
69 struct sock *sk, unsigned short snum) 72 struct sock *sk, unsigned short snum,
73 int (*bind_conflict)(const struct sock *sk,
74 const struct inet_bind_bucket *tb))
70{ 75{
71 struct inet_bind_hashbucket *head; 76 struct inet_bind_hashbucket *head;
72 struct hlist_node *node; 77 struct hlist_node *node;
@@ -125,7 +130,7 @@ tb_found:
125 goto success; 130 goto success;
126 } else { 131 } else {
127 ret = 1; 132 ret = 1;
128 if (inet_csk_bind_conflict(sk, tb)) 133 if (bind_conflict(sk, tb))
129 goto fail_unlock; 134 goto fail_unlock;
130 } 135 }
131 } 136 }
@@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
380EXPORT_SYMBOL_GPL(inet_csk_search_req); 385EXPORT_SYMBOL_GPL(inet_csk_search_req);
381 386
382void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, 387void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
383 const unsigned timeout) 388 unsigned long timeout)
384{ 389{
385 struct inet_connection_sock *icsk = inet_csk(sk); 390 struct inet_connection_sock *icsk = inet_csk(sk);
386 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; 391 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
@@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk)
631} 636}
632 637
633EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 638EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
639
640void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
641{
642 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
643 const struct inet_sock *inet = inet_sk(sk);
644
645 sin->sin_family = AF_INET;
646 sin->sin_addr.s_addr = inet->daddr;
647 sin->sin_port = inet->dport;
648}
649
650EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 39061ed53cfd..c49908192047 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
112 r->idiag_inode = 0; 112 r->idiag_inode = 0;
113#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 113#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
114 if (r->idiag_family == AF_INET6) { 114 if (r->idiag_family == AF_INET6) {
115 const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); 115 const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
116 116
117 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, 117 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
118 &tcp6tw->tw_v6_rcv_saddr); 118 &tw6->tw_v6_rcv_saddr);
119 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, 119 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
120 &tcp6tw->tw_v6_daddr); 120 &tw6->tw_v6_daddr);
121 } 121 }
122#endif 122#endif
123 nlh->nlmsg_len = skb->tail - b; 123 nlh->nlmsg_len = skb->tail - b;
@@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
489#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 489#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
490 if (r->idiag_family == AF_INET6) { 490 if (r->idiag_family == AF_INET6) {
491 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, 491 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
492 &tcp6_rsk(req)->loc_addr); 492 &inet6_rsk(req)->loc_addr);
493 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, 493 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
494 &tcp6_rsk(req)->rmt_addr); 494 &inet6_rsk(req)->rmt_addr);
495 } 495 }
496#endif 496#endif
497 nlh->nlmsg_len = skb->tail - b; 497 nlh->nlmsg_len = skb->tail - b;
@@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
553 entry.saddr = 553 entry.saddr =
554#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 554#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
555 (entry.family == AF_INET6) ? 555 (entry.family == AF_INET6) ?
556 tcp6_rsk(req)->loc_addr.s6_addr32 : 556 inet6_rsk(req)->loc_addr.s6_addr32 :
557#endif 557#endif
558 &ireq->loc_addr; 558 &ireq->loc_addr;
559 entry.daddr = 559 entry.daddr =
560#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 560#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
561 (entry.family == AF_INET6) ? 561 (entry.family == AF_INET6) ?
562 tcp6_rsk(req)->rmt_addr.s6_addr32 : 562 inet6_rsk(req)->rmt_addr.s6_addr32 :
563#endif 563#endif
564 &ireq->rmt_addr; 564 &ireq->rmt_addr;
565 entry.dport = ntohs(ireq->rmt_port); 565 entry.dport = ntohs(ireq->rmt_port);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e8d29fe736d2..33228115cda4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -15,12 +15,14 @@
15 15
16#include <linux/config.h> 16#include <linux/config.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/random.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20#include <linux/wait.h> 21#include <linux/wait.h>
21 22
22#include <net/inet_connection_sock.h> 23#include <net/inet_connection_sock.h>
23#include <net/inet_hashtables.h> 24#include <net/inet_hashtables.h>
25#include <net/ip.h>
24 26
25/* 27/*
26 * Allocate and initialize a new local port bind bucket. 28 * Allocate and initialize a new local port bind bucket.
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad
163} 165}
164 166
165EXPORT_SYMBOL_GPL(__inet_lookup_listener); 167EXPORT_SYMBOL_GPL(__inet_lookup_listener);
168
169/* called with local bh disabled */
170static int __inet_check_established(struct inet_timewait_death_row *death_row,
171 struct sock *sk, __u16 lport,
172 struct inet_timewait_sock **twp)
173{
174 struct inet_hashinfo *hinfo = death_row->hashinfo;
175 struct inet_sock *inet = inet_sk(sk);
176 u32 daddr = inet->rcv_saddr;
177 u32 saddr = inet->daddr;
178 int dif = sk->sk_bound_dev_if;
179 INET_ADDR_COOKIE(acookie, saddr, daddr)
180 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
181 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
182 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
183 struct sock *sk2;
184 const struct hlist_node *node;
185 struct inet_timewait_sock *tw;
186
187 prefetch(head->chain.first);
188 write_lock(&head->lock);
189
190 /* Check TIME-WAIT sockets first. */
191 sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
192 tw = inet_twsk(sk2);
193
194 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
195 if (twsk_unique(sk, sk2, twp))
196 goto unique;
197 else
198 goto not_unique;
199 }
200 }
201 tw = NULL;
202
203 /* And established part... */
204 sk_for_each(sk2, node, &head->chain) {
205 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
206 goto not_unique;
207 }
208
209unique:
210 /* Must record num and sport now. Otherwise we will see
211 * in hash table socket with a funny identity. */
212 inet->num = lport;
213 inet->sport = htons(lport);
214 sk->sk_hash = hash;
215 BUG_TRAP(sk_unhashed(sk));
216 __sk_add_node(sk, &head->chain);
217 sock_prot_inc_use(sk->sk_prot);
218 write_unlock(&head->lock);
219
220 if (twp) {
221 *twp = tw;
222 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
223 } else if (tw) {
224 /* Silly. Should hash-dance instead... */
225 inet_twsk_deschedule(tw, death_row);
226 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
227
228 inet_twsk_put(tw);
229 }
230
231 return 0;
232
233not_unique:
234 write_unlock(&head->lock);
235 return -EADDRNOTAVAIL;
236}
237
238static inline u32 inet_sk_port_offset(const struct sock *sk)
239{
240 const struct inet_sock *inet = inet_sk(sk);
241 return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
242 inet->dport);
243}
244
245/*
246 * Bind a port for a connect operation and hash it.
247 */
248int inet_hash_connect(struct inet_timewait_death_row *death_row,
249 struct sock *sk)
250{
251 struct inet_hashinfo *hinfo = death_row->hashinfo;
252 const unsigned short snum = inet_sk(sk)->num;
253 struct inet_bind_hashbucket *head;
254 struct inet_bind_bucket *tb;
255 int ret;
256
257 if (!snum) {
258 int low = sysctl_local_port_range[0];
259 int high = sysctl_local_port_range[1];
260 int range = high - low;
261 int i;
262 int port;
263 static u32 hint;
264 u32 offset = hint + inet_sk_port_offset(sk);
265 struct hlist_node *node;
266 struct inet_timewait_sock *tw = NULL;
267
268 local_bh_disable();
269 for (i = 1; i <= range; i++) {
270 port = low + (i + offset) % range;
271 head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
272 spin_lock(&head->lock);
273
274 /* Does not bother with rcv_saddr checks,
275 * because the established check is already
276 * unique enough.
277 */
278 inet_bind_bucket_for_each(tb, node, &head->chain) {
279 if (tb->port == port) {
280 BUG_TRAP(!hlist_empty(&tb->owners));
281 if (tb->fastreuse >= 0)
282 goto next_port;
283 if (!__inet_check_established(death_row,
284 sk, port,
285 &tw))
286 goto ok;
287 goto next_port;
288 }
289 }
290
291 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
292 if (!tb) {
293 spin_unlock(&head->lock);
294 break;
295 }
296 tb->fastreuse = -1;
297 goto ok;
298
299 next_port:
300 spin_unlock(&head->lock);
301 }
302 local_bh_enable();
303
304 return -EADDRNOTAVAIL;
305
306ok:
307 hint += i;
308
309 /* Head lock still held and bh's disabled */
310 inet_bind_hash(sk, tb, port);
311 if (sk_unhashed(sk)) {
312 inet_sk(sk)->sport = htons(port);
313 __inet_hash(hinfo, sk, 0);
314 }
315 spin_unlock(&head->lock);
316
317 if (tw) {
318 inet_twsk_deschedule(tw, death_row);;
319 inet_twsk_put(tw);
320 }
321
322 ret = 0;
323 goto out;
324 }
325
326 head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
327 tb = inet_csk(sk)->icsk_bind_hash;
328 spin_lock_bh(&head->lock);
329 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
330 __inet_hash(hinfo, sk, 0);
331 spin_unlock_bh(&head->lock);
332 return 0;
333 } else {
334 spin_unlock(&head->lock);
335 /* No definite answer... Walk to established hash table */
336 ret = __inet_check_established(death_row, sk, snum, NULL);
337out:
338 local_bh_enable();
339 return ret;
340 }
341}
342
343EXPORT_SYMBOL_GPL(inet_hash_connect);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a010e9a68811..417f126c749e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
90 90
91struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) 91struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
92{ 92{
93 struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, 93 struct inet_timewait_sock *tw =
94 SLAB_ATOMIC); 94 kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
95 SLAB_ATOMIC);
95 if (tw != NULL) { 96 if (tw != NULL) {
96 const struct inet_sock *inet = inet_sk(sk); 97 const struct inet_sock *inet = inet_sk(sk);
97 98
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924f..ce5fe3f74a3d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
401 return NULL; 401 return NULL;
402 n->v4daddr = daddr; 402 n->v4daddr = daddr;
403 atomic_set(&n->refcnt, 1); 403 atomic_set(&n->refcnt, 1);
404 atomic_set(&n->rid, 0);
404 n->ip_id_count = secure_ip_id(daddr); 405 n->ip_id_count = secure_ip_id(daddr);
405 n->tcp_ts_stamp = 0; 406 n->tcp_ts_stamp = 0;
406 407
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48e..ce2b70ce4018 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
22 * Patrick McHardy : LRU queue of frag heads for evictor. 22 * Patrick McHardy : LRU queue of frag heads for evictor.
23 */ 23 */
24 24
25#include <linux/compiler.h>
25#include <linux/config.h> 26#include <linux/config.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/types.h> 28#include <linux/types.h>
@@ -38,6 +39,7 @@
38#include <net/ip.h> 39#include <net/ip.h>
39#include <net/icmp.h> 40#include <net/icmp.h>
40#include <net/checksum.h> 41#include <net/checksum.h>
42#include <net/inetpeer.h>
41#include <linux/tcp.h> 43#include <linux/tcp.h>
42#include <linux/udp.h> 44#include <linux/udp.h>
43#include <linux/inet.h> 45#include <linux/inet.h>
@@ -56,6 +58,8 @@
56int sysctl_ipfrag_high_thresh = 256*1024; 58int sysctl_ipfrag_high_thresh = 256*1024;
57int sysctl_ipfrag_low_thresh = 192*1024; 59int sysctl_ipfrag_low_thresh = 192*1024;
58 60
61int sysctl_ipfrag_max_dist = 64;
62
59/* Important NOTE! Fragment queue must be destroyed before MSL expires. 63/* Important NOTE! Fragment queue must be destroyed before MSL expires.
60 * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. 64 * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
61 */ 65 */
@@ -89,8 +93,10 @@ struct ipq {
89 spinlock_t lock; 93 spinlock_t lock;
90 atomic_t refcnt; 94 atomic_t refcnt;
91 struct timer_list timer; /* when will this queue expire? */ 95 struct timer_list timer; /* when will this queue expire? */
92 int iif;
93 struct timeval stamp; 96 struct timeval stamp;
97 int iif;
98 unsigned int rid;
99 struct inet_peer *peer;
94}; 100};
95 101
96/* Hash table. */ 102/* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
195 BUG_TRAP(qp->last_in&COMPLETE); 201 BUG_TRAP(qp->last_in&COMPLETE);
196 BUG_TRAP(del_timer(&qp->timer) == 0); 202 BUG_TRAP(del_timer(&qp->timer) == 0);
197 203
204 if (qp->peer)
205 inet_putpeer(qp->peer);
206
198 /* Release all fragment data. */ 207 /* Release all fragment data. */
199 fp = qp->fragments; 208 fp = qp->fragments;
200 while (fp) { 209 while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
353 qp->meat = 0; 362 qp->meat = 0;
354 qp->fragments = NULL; 363 qp->fragments = NULL;
355 qp->iif = 0; 364 qp->iif = 0;
365 qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
356 366
357 /* Initialize a timer for this entry. */ 367 /* Initialize a timer for this entry. */
358 init_timer(&qp->timer); 368 init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
398 return ip_frag_create(hash, iph, user); 408 return ip_frag_create(hash, iph, user);
399} 409}
400 410
411/* Is the fragment too far ahead to be part of ipq? */
412static inline int ip_frag_too_far(struct ipq *qp)
413{
414 struct inet_peer *peer = qp->peer;
415 unsigned int max = sysctl_ipfrag_max_dist;
416 unsigned int start, end;
417
418 int rc;
419
420 if (!peer || !max)
421 return 0;
422
423 start = qp->rid;
424 end = atomic_inc_return(&peer->rid);
425 qp->rid = end;
426
427 rc = qp->fragments && (end - start) > max;
428
429 if (rc) {
430 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
431 }
432
433 return rc;
434}
435
436static int ip_frag_reinit(struct ipq *qp)
437{
438 struct sk_buff *fp;
439
440 if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
441 atomic_inc(&qp->refcnt);
442 return -ETIMEDOUT;
443 }
444
445 fp = qp->fragments;
446 do {
447 struct sk_buff *xp = fp->next;
448 frag_kfree_skb(fp, NULL);
449 fp = xp;
450 } while (fp);
451
452 qp->last_in = 0;
453 qp->len = 0;
454 qp->meat = 0;
455 qp->fragments = NULL;
456 qp->iif = 0;
457
458 return 0;
459}
460
401/* Add new segment to existing queue. */ 461/* Add new segment to existing queue. */
402static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) 462static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
403{ 463{
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
408 if (qp->last_in & COMPLETE) 468 if (qp->last_in & COMPLETE)
409 goto err; 469 goto err;
410 470
471 if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
472 unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
473 ipq_kill(qp);
474 goto err;
475 }
476
411 offset = ntohs(skb->nh.iph->frag_off); 477 offset = ntohs(skb->nh.iph->frag_off);
412 flags = offset & ~IP_OFFSET; 478 flags = offset & ~IP_OFFSET;
413 offset &= IP_OFFSET; 479 offset &= IP_OFFSET;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 473d0f2b2e0d..e45846ae570b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -128,6 +128,7 @@
128#include <linux/sockios.h> 128#include <linux/sockios.h>
129#include <linux/in.h> 129#include <linux/in.h>
130#include <linux/inet.h> 130#include <linux/inet.h>
131#include <linux/inetdevice.h>
131#include <linux/netdevice.h> 132#include <linux/netdevice.h>
132#include <linux/etherdevice.h> 133#include <linux/etherdevice.h>
133 134
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index dbe12da8d8b3..d3f6c468faf4 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -22,6 +22,7 @@
22#include <net/sock.h> 22#include <net/sock.h>
23#include <net/ip.h> 23#include <net/ip.h>
24#include <net/icmp.h> 24#include <net/icmp.h>
25#include <net/route.h>
25 26
26/* 27/*
27 * Write options to IP header, record destination address to 28 * Write options to IP header, record destination address to
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd397..2a830de3a699 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
445 445
446 hlen = iph->ihl * 4; 446 hlen = iph->ihl * 4;
447 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ 447 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
448 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
448 449
449 /* When frag_list is given, use it. First, check its validity: 450 /* When frag_list is given, use it. First, check its validity:
450 * some transformers could create wrong frag_list or break existing 451 * some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4f2d87257309..6986e11d65cc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -25,12 +25,12 @@
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <linux/ip.h> 26#include <linux/ip.h>
27#include <linux/icmp.h> 27#include <linux/icmp.h>
28#include <linux/inetdevice.h>
28#include <linux/netdevice.h> 29#include <linux/netdevice.h>
29#include <net/sock.h> 30#include <net/sock.h>
30#include <net/ip.h> 31#include <net/ip.h>
31#include <net/icmp.h> 32#include <net/icmp.h>
32#include <net/tcp.h> 33#include <net/tcp_states.h>
33#include <linux/tcp.h>
34#include <linux/udp.h> 34#include <linux/udp.h>
35#include <linux/igmp.h> 35#include <linux/igmp.h>
36#include <linux/netfilter.h> 36#include <linux/netfilter.h>
@@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
427 err = ip_options_get_from_user(&opt, optval, optlen); 427 err = ip_options_get_from_user(&opt, optval, optlen);
428 if (err) 428 if (err)
429 break; 429 break;
430 if (sk->sk_type == SOCK_STREAM) { 430 if (inet->is_icsk) {
431 struct tcp_sock *tp = tcp_sk(sk); 431 struct inet_connection_sock *icsk = inet_csk(sk);
432#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 432#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
433 if (sk->sk_family == PF_INET || 433 if (sk->sk_family == PF_INET ||
434 (!((1 << sk->sk_state) & 434 (!((1 << sk->sk_state) &
@@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
436 inet->daddr != LOOPBACK4_IPV6)) { 436 inet->daddr != LOOPBACK4_IPV6)) {
437#endif 437#endif
438 if (inet->opt) 438 if (inet->opt)
439 tp->ext_header_len -= inet->opt->optlen; 439 icsk->icsk_ext_hdr_len -= inet->opt->optlen;
440 if (opt) 440 if (opt)
441 tp->ext_header_len += opt->optlen; 441 icsk->icsk_ext_hdr_len += opt->optlen;
442 tcp_sync_mss(sk, tp->pmtu_cookie); 442 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
443#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 443#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
444 } 444 }
445#endif 445#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index fc718df17b40..d64e2ec8da7b 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -28,6 +28,7 @@
28#include <net/xfrm.h> 28#include <net/xfrm.h>
29#include <net/icmp.h> 29#include <net/icmp.h>
30#include <net/ipcomp.h> 30#include <net/ipcomp.h>
31#include <net/protocol.h>
31 32
32struct ipcomp_tfms { 33struct ipcomp_tfms {
33 struct list_head list; 34 struct list_head list;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e8674baaa8d9..bb3613ec448c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -42,6 +42,7 @@
42#include <linux/in.h> 42#include <linux/in.h>
43#include <linux/if.h> 43#include <linux/if.h>
44#include <linux/inet.h> 44#include <linux/inet.h>
45#include <linux/inetdevice.h>
45#include <linux/netdevice.h> 46#include <linux/netdevice.h>
46#include <linux/if_arp.h> 47#include <linux/if_arp.h>
47#include <linux/skbuff.h> 48#include <linux/skbuff.h>
@@ -58,6 +59,7 @@
58#include <net/arp.h> 59#include <net/arp.h>
59#include <net/ip.h> 60#include <net/ip.h>
60#include <net/ipconfig.h> 61#include <net/ipconfig.h>
62#include <net/route.h>
61 63
62#include <asm/uaccess.h> 64#include <asm/uaccess.h>
63#include <net/checksum.h> 65#include <net/checksum.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 302b7eb507c9..caa3b7d2e48a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -52,6 +52,7 @@
52#include <net/ip.h> 52#include <net/ip.h>
53#include <net/protocol.h> 53#include <net/protocol.h>
54#include <linux/skbuff.h> 54#include <linux/skbuff.h>
55#include <net/route.h>
55#include <net/sock.h> 56#include <net/sock.h>
56#include <net/icmp.h> 57#include <net/icmp.h>
57#include <net/udp.h> 58#include <net/udp.h>
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d7eb680101c2..9b176a942ac5 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
224} 224}
225 225
226 226
227#if 0000
228/*
229 * Get reference to app by name (called from user context)
230 */
231struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
232{
233 struct ip_vs_app *app, *a = NULL;
234
235 down(&__ip_vs_app_mutex);
236
237 list_for_each_entry(ent, &ip_vs_app_list, a_list) {
238 if (strcmp(app->name, appname))
239 continue;
240
241 /* softirq may call ip_vs_app_get too, so the caller
242 must disable softirq on the current CPU */
243 if (ip_vs_app_get(app))
244 a = app;
245 break;
246 }
247
248 up(&__ip_vs_app_mutex);
249
250 return a;
251}
252#endif
253
254
255/* 227/*
256 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) 228 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
257 */ 229 */
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 2a3a8c59c655..81d90354c928 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -24,7 +24,10 @@
24 * 24 *
25 */ 25 */
26 26
27#include <linux/in.h>
28#include <linux/net.h>
27#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/module.h>
28#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
29#include <linux/proc_fs.h> /* for proc_net_* */ 32#include <linux/proc_fs.h> /* for proc_net_* */
30#include <linux/seq_file.h> 33#include <linux/seq_file.h>
@@ -219,7 +222,7 @@ struct ip_vs_conn *ip_vs_conn_in_get
219 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 222 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
220 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); 223 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
221 224
222 IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 225 IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
223 ip_vs_proto_name(protocol), 226 ip_vs_proto_name(protocol),
224 NIPQUAD(s_addr), ntohs(s_port), 227 NIPQUAD(s_addr), ntohs(s_port),
225 NIPQUAD(d_addr), ntohs(d_port), 228 NIPQUAD(d_addr), ntohs(d_port),
@@ -254,7 +257,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
254 out: 257 out:
255 ct_read_unlock(hash); 258 ct_read_unlock(hash);
256 259
257 IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 260 IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
258 ip_vs_proto_name(protocol), 261 ip_vs_proto_name(protocol),
259 NIPQUAD(s_addr), ntohs(s_port), 262 NIPQUAD(s_addr), ntohs(s_port),
260 NIPQUAD(d_addr), ntohs(d_port), 263 NIPQUAD(d_addr), ntohs(d_port),
@@ -295,7 +298,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
295 298
296 ct_read_unlock(hash); 299 ct_read_unlock(hash);
297 300
298 IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 301 IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
299 ip_vs_proto_name(protocol), 302 ip_vs_proto_name(protocol),
300 NIPQUAD(s_addr), ntohs(s_port), 303 NIPQUAD(s_addr), ntohs(s_port),
301 NIPQUAD(d_addr), ntohs(d_port), 304 NIPQUAD(d_addr), ntohs(d_port),
@@ -391,8 +394,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
391 cp->flags |= atomic_read(&dest->conn_flags); 394 cp->flags |= atomic_read(&dest->conn_flags);
392 cp->dest = dest; 395 cp->dest = dest;
393 396
394 IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 397 IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
395 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", 398 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
399 "dest->refcnt:%d\n",
396 ip_vs_proto_name(cp->protocol), 400 ip_vs_proto_name(cp->protocol),
397 NIPQUAD(cp->caddr), ntohs(cp->cport), 401 NIPQUAD(cp->caddr), ntohs(cp->cport),
398 NIPQUAD(cp->vaddr), ntohs(cp->vport), 402 NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -430,8 +434,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
430 if (!dest) 434 if (!dest)
431 return; 435 return;
432 436
433 IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 437 IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
434 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", 438 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
439 "dest->refcnt:%d\n",
435 ip_vs_proto_name(cp->protocol), 440 ip_vs_proto_name(cp->protocol),
436 NIPQUAD(cp->caddr), ntohs(cp->cport), 441 NIPQUAD(cp->caddr), ntohs(cp->cport),
437 NIPQUAD(cp->vaddr), ntohs(cp->vport), 442 NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -571,7 +576,7 @@ static void ip_vs_conn_expire(unsigned long data)
571 ip_vs_conn_hash(cp); 576 ip_vs_conn_hash(cp);
572 577
573 expire_later: 578 expire_later:
574 IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", 579 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
575 atomic_read(&cp->refcnt)-1, 580 atomic_read(&cp->refcnt)-1,
576 atomic_read(&cp->n_control)); 581 atomic_read(&cp->n_control));
577 582
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 1a0843cd58a9..1aca94a9fd8b 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
426 return NULL; 426 return NULL;
427 427
428 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " 428 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
429 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", 429 "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
430 ip_vs_fwd_tag(cp), 430 ip_vs_fwd_tag(cp),
431 NIPQUAD(cp->caddr), ntohs(cp->cport), 431 NIPQUAD(cp->caddr), ntohs(cp->cport),
432 NIPQUAD(cp->vaddr), ntohs(cp->vport), 432 NIPQUAD(cp->vaddr), ntohs(cp->vport),
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 9bdcf31b760e..c935c5086d33 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,7 @@
35#include <linux/netfilter_ipv4.h> 35#include <linux/netfilter_ipv4.h>
36 36
37#include <net/ip.h> 37#include <net/ip.h>
38#include <net/route.h>
38#include <net/sock.h> 39#include <net/sock.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
@@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
447 out: 448 out:
448 read_unlock(&__ip_vs_svc_lock); 449 read_unlock(&__ip_vs_svc_lock);
449 450
450 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", 451 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
451 fwmark, ip_vs_proto_name(protocol), 452 fwmark, ip_vs_proto_name(protocol),
452 NIPQUAD(vaddr), ntohs(vport), 453 NIPQUAD(vaddr), ntohs(vport),
453 svc?"hit":"not hit"); 454 svc?"hit":"not hit");
@@ -597,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
597 */ 598 */
598 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { 599 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
599 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " 600 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
600 "refcnt=%d\n", 601 "dest->refcnt=%d\n",
601 dest->vfwmark, 602 dest->vfwmark,
602 NIPQUAD(dest->addr), ntohs(dest->port), 603 NIPQUAD(dest->addr), ntohs(dest->port),
603 atomic_read(&dest->refcnt)); 604 atomic_read(&dest->refcnt));
@@ -804,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
804 dest = ip_vs_trash_get_dest(svc, daddr, dport); 805 dest = ip_vs_trash_get_dest(svc, daddr, dport);
805 if (dest != NULL) { 806 if (dest != NULL) {
806 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " 807 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
807 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", 808 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
808 NIPQUAD(daddr), ntohs(dport), 809 NIPQUAD(daddr), ntohs(dport),
809 atomic_read(&dest->refcnt), 810 atomic_read(&dest->refcnt),
810 dest->vfwmark, 811 dest->vfwmark,
@@ -949,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
949 atomic_dec(&dest->svc->refcnt); 950 atomic_dec(&dest->svc->refcnt);
950 kfree(dest); 951 kfree(dest);
951 } else { 952 } else {
952 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", 953 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
954 "dest->refcnt=%d\n",
953 NIPQUAD(dest->addr), ntohs(dest->port), 955 NIPQUAD(dest->addr), ntohs(dest->port),
954 atomic_read(&dest->refcnt)); 956 atomic_read(&dest->refcnt));
955 list_add(&dest->n_list, &ip_vs_dest_trash); 957 list_add(&dest->n_list, &ip_vs_dest_trash);
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index f3bc320dce93..9fee19c4c617 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -37,8 +37,10 @@
37 * 37 *
38 */ 38 */
39 39
40#include <linux/ip.h>
40#include <linux/module.h> 41#include <linux/module.h>
41#include <linux/kernel.h> 42#include <linux/kernel.h>
43#include <linux/skbuff.h>
42 44
43#include <net/ip_vs.h> 45#include <net/ip_vs.h>
44 46
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 67b3e2fc1fa1..e7004741ac73 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -13,7 +13,10 @@
13 * Changes: 13 * Changes:
14 * 14 *
15 */ 15 */
16#include <linux/config.h>
16#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/jiffies.h>
19#include <linux/slab.h>
17#include <linux/types.h> 20#include <linux/types.h>
18 21
19#include <net/ip_vs.h> 22#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 561cda326fa8..6e5cb92a5c83 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -41,8 +41,10 @@
41 * me to write this module. 41 * me to write this module.
42 */ 42 */
43 43
44#include <linux/ip.h>
44#include <linux/module.h> 45#include <linux/module.h>
45#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/skbuff.h>
46 48
47/* for sysctl */ 49/* for sysctl */
48#include <linux/fs.h> 50#include <linux/fs.h>
@@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
228} 230}
229 231
230 232
231#if 0000
232/*
233 * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
234 * returns bool success.
235 */
236static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
237 struct ip_vs_lblc_entry *en)
238{
239 if (list_empty(&en->list)) {
240 IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
241 "called from %p\n", __builtin_return_address(0));
242 return 0;
243 }
244
245 /*
246 * Remove it from the table
247 */
248 write_lock(&tbl->lock);
249 list_del(&en->list);
250 INIT_LIST_HEAD(&en->list);
251 write_unlock(&tbl->lock);
252
253 return 1;
254}
255#endif
256
257
258/* 233/*
259 * Get ip_vs_lblc_entry associated with supplied parameters. 234 * Get ip_vs_lblc_entry associated with supplied parameters.
260 */ 235 */
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index ce456dbf09a5..32ba37ba72d8 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -39,8 +39,10 @@
39 * 39 *
40 */ 40 */
41 41
42#include <linux/ip.h>
42#include <linux/module.h> 43#include <linux/module.h>
43#include <linux/kernel.h> 44#include <linux/kernel.h>
45#include <linux/skbuff.h>
44 46
45/* for sysctl */ 47/* for sysctl */
46#include <linux/fs.h> 48#include <linux/fs.h>
@@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
414} 416}
415 417
416 418
417#if 0000
418/*
419 * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
420 * returns bool success.
421 */
422static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
423 struct ip_vs_lblcr_entry *en)
424{
425 if (list_empty(&en->list)) {
426 IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
427 "called from %p\n", __builtin_return_address(0));
428 return 0;
429 }
430
431 /*
432 * Remove it from the table
433 */
434 write_lock(&tbl->lock);
435 list_del(&en->list);
436 INIT_LIST_HEAD(&en->list);
437 write_unlock(&tbl->lock);
438
439 return 1;
440}
441#endif
442
443
444/* 419/*
445 * Get ip_vs_lblcr_entry associated with supplied parameters. 420 * Get ip_vs_lblcr_entry associated with supplied parameters.
446 */ 421 */
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 453e94a0bbd7..8b0505b09317 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -12,6 +12,8 @@
12 * 12 *
13 */ 13 */
14 14
15#include <linux/in.h>
16#include <linux/ip.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/kernel.h> 18#include <linux/kernel.h>
17#include <linux/netfilter.h> 19#include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
index 478e5c7c7e8e..c36ccf057a19 100644
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -12,6 +12,8 @@
12 * 12 *
13 */ 13 */
14 14
15#include <linux/in.h>
16#include <linux/ip.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/kernel.h> 18#include <linux/kernel.h>
17#include <linux/netfilter.h> 19#include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 0e878fd6215c..bc28b1160a3a 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
275 [IP_VS_TCP_S_LAST] = 2*HZ, 275 [IP_VS_TCP_S_LAST] = 2*HZ,
276}; 276};
277 277
278
279#if 0
280
281/* FIXME: This is going to die */
282
283static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
284 [IP_VS_TCP_S_NONE] = 2*HZ,
285 [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
286 [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
287 [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
288 [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
289 [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
290 [IP_VS_TCP_S_CLOSE] = 10*HZ,
291 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
292 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
293 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
294 [IP_VS_TCP_S_SYNACK] = 100*HZ,
295 [IP_VS_TCP_S_LAST] = 2*HZ,
296};
297
298#endif
299
300static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { 278static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
301 [IP_VS_TCP_S_NONE] = "NONE", 279 [IP_VS_TCP_S_NONE] = "NONE",
302 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", 280 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
@@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
448 struct ip_vs_dest *dest = cp->dest; 426 struct ip_vs_dest *dest = cp->dest;
449 427
450 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" 428 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
451 "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", 429 "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
452 pp->name, 430 pp->name,
453 (state_off==TCP_DIR_OUTPUT)?"output ":"input ", 431 (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
454 th->syn? 'S' : '.', 432 th->syn? 'S' : '.',
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 8ae5f2e0aefa..89d9175d8f28 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -15,8 +15,11 @@
15 * 15 *
16 */ 16 */
17 17
18#include <linux/in.h>
19#include <linux/ip.h>
18#include <linux/kernel.h> 20#include <linux/kernel.h>
19#include <linux/netfilter_ipv4.h> 21#include <linux/netfilter_ipv4.h>
22#include <linux/udp.h>
20 23
21#include <net/ip_vs.h> 24#include <net/ip_vs.h>
22 25
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 6f7c50e44a39..7775e6cc68be 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -34,8 +34,10 @@
34 * 34 *
35 */ 35 */
36 36
37#include <linux/ip.h>
37#include <linux/module.h> 38#include <linux/module.h>
38#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/skbuff.h>
39 41
40#include <net/ip_vs.h> 42#include <net/ip_vs.h>
41 43
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 2e5ced3d8062..1bca714bda3d 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -21,12 +21,14 @@
21 21
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/inetdevice.h>
24#include <linux/net.h> 25#include <linux/net.h>
25#include <linux/completion.h> 26#include <linux/completion.h>
26#include <linux/delay.h> 27#include <linux/delay.h>
27#include <linux/skbuff.h> 28#include <linux/skbuff.h>
28#include <linux/in.h> 29#include <linux/in.h>
29#include <linux/igmp.h> /* for ip_mc_join_group */ 30#include <linux/igmp.h> /* for ip_mc_join_group */
31#include <linux/udp.h>
30 32
31#include <net/ip.h> 33#include <net/ip.h>
32#include <net/sock.h> 34#include <net/sock.h>
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3c2e9639bba6..bba156304695 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -68,19 +68,14 @@ struct arpt_table_info {
68 unsigned int initial_entries; 68 unsigned int initial_entries;
69 unsigned int hook_entry[NF_ARP_NUMHOOKS]; 69 unsigned int hook_entry[NF_ARP_NUMHOOKS];
70 unsigned int underflow[NF_ARP_NUMHOOKS]; 70 unsigned int underflow[NF_ARP_NUMHOOKS];
71 char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); 71 void *entries[NR_CPUS];
72}; 72};
73 73
74static LIST_HEAD(arpt_target); 74static LIST_HEAD(arpt_target);
75static LIST_HEAD(arpt_tables); 75static LIST_HEAD(arpt_tables);
76#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
76#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) 77#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
77 78
78#ifdef CONFIG_SMP
79#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
80#else
81#define TABLE_OFFSET(t,p) 0
82#endif
83
84static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, 79static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
85 char *hdr_addr, int len) 80 char *hdr_addr, int len)
86{ 81{
@@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
269 outdev = out ? out->name : nulldevname; 264 outdev = out ? out->name : nulldevname;
270 265
271 read_lock_bh(&table->lock); 266 read_lock_bh(&table->lock);
272 table_base = (void *)table->private->entries 267 table_base = (void *)table->private->entries[smp_processor_id()];
273 + TABLE_OFFSET(table->private,
274 smp_processor_id());
275 e = get_entry(table_base, table->private->hook_entry[hook]); 268 e = get_entry(table_base, table->private->hook_entry[hook]);
276 back = get_entry(table_base, table->private->underflow[hook]); 269 back = get_entry(table_base, table->private->underflow[hook]);
277 270
@@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp)
462/* Figures out from what hook each rule can be called: returns 0 if 455/* Figures out from what hook each rule can be called: returns 0 if
463 * there are loops. Puts hook bitmask in comefrom. 456 * there are loops. Puts hook bitmask in comefrom.
464 */ 457 */
465static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) 458static int mark_source_chains(struct arpt_table_info *newinfo,
459 unsigned int valid_hooks, void *entry0)
466{ 460{
467 unsigned int hook; 461 unsigned int hook;
468 462
@@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
472 for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { 466 for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
473 unsigned int pos = newinfo->hook_entry[hook]; 467 unsigned int pos = newinfo->hook_entry[hook];
474 struct arpt_entry *e 468 struct arpt_entry *e
475 = (struct arpt_entry *)(newinfo->entries + pos); 469 = (struct arpt_entry *)(entry0 + pos);
476 470
477 if (!(valid_hooks & (1 << hook))) 471 if (!(valid_hooks & (1 << hook)))
478 continue; 472 continue;
@@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
514 goto next; 508 goto next;
515 509
516 e = (struct arpt_entry *) 510 e = (struct arpt_entry *)
517 (newinfo->entries + pos); 511 (entry0 + pos);
518 } while (oldpos == pos + e->next_offset); 512 } while (oldpos == pos + e->next_offset);
519 513
520 /* Move along one */ 514 /* Move along one */
521 size = e->next_offset; 515 size = e->next_offset;
522 e = (struct arpt_entry *) 516 e = (struct arpt_entry *)
523 (newinfo->entries + pos + size); 517 (entry0 + pos + size);
524 e->counters.pcnt = pos; 518 e->counters.pcnt = pos;
525 pos += size; 519 pos += size;
526 } else { 520 } else {
@@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
537 newpos = pos + e->next_offset; 531 newpos = pos + e->next_offset;
538 } 532 }
539 e = (struct arpt_entry *) 533 e = (struct arpt_entry *)
540 (newinfo->entries + newpos); 534 (entry0 + newpos);
541 e->counters.pcnt = pos; 535 e->counters.pcnt = pos;
542 pos = newpos; 536 pos = newpos;
543 } 537 }
@@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
689static int translate_table(const char *name, 683static int translate_table(const char *name,
690 unsigned int valid_hooks, 684 unsigned int valid_hooks,
691 struct arpt_table_info *newinfo, 685 struct arpt_table_info *newinfo,
686 void *entry0,
692 unsigned int size, 687 unsigned int size,
693 unsigned int number, 688 unsigned int number,
694 const unsigned int *hook_entries, 689 const unsigned int *hook_entries,
@@ -710,11 +705,11 @@ static int translate_table(const char *name,
710 i = 0; 705 i = 0;
711 706
712 /* Walk through entries, checking offsets. */ 707 /* Walk through entries, checking offsets. */
713 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 708 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
714 check_entry_size_and_hooks, 709 check_entry_size_and_hooks,
715 newinfo, 710 newinfo,
716 newinfo->entries, 711 entry0,
717 newinfo->entries + size, 712 entry0 + size,
718 hook_entries, underflows, &i); 713 hook_entries, underflows, &i);
719 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); 714 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
720 if (ret != 0) 715 if (ret != 0)
@@ -743,29 +738,26 @@ static int translate_table(const char *name,
743 } 738 }
744 } 739 }
745 740
746 if (!mark_source_chains(newinfo, valid_hooks)) { 741 if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
747 duprintf("Looping hook\n"); 742 duprintf("Looping hook\n");
748 return -ELOOP; 743 return -ELOOP;
749 } 744 }
750 745
751 /* Finally, each sanity check must pass */ 746 /* Finally, each sanity check must pass */
752 i = 0; 747 i = 0;
753 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 748 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
754 check_entry, name, size, &i); 749 check_entry, name, size, &i);
755 750
756 if (ret != 0) { 751 if (ret != 0) {
757 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 752 ARPT_ENTRY_ITERATE(entry0, newinfo->size,
758 cleanup_entry, &i); 753 cleanup_entry, &i);
759 return ret; 754 return ret;
760 } 755 }
761 756
762 /* And one copy for every other CPU */ 757 /* And one copy for every other CPU */
763 for_each_cpu(i) { 758 for_each_cpu(i) {
764 if (i == 0) 759 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
765 continue; 760 memcpy(newinfo->entries[i], entry0, newinfo->size);
766 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
767 newinfo->entries,
768 SMP_ALIGN(newinfo->size));
769 } 761 }
770 762
771 return ret; 763 return ret;
@@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e,
807 return 0; 799 return 0;
808} 800}
809 801
802static inline int set_entry_to_counter(const struct arpt_entry *e,
803 struct arpt_counters total[],
804 unsigned int *i)
805{
806 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
807
808 (*i)++;
809 return 0;
810}
811
810static void get_counters(const struct arpt_table_info *t, 812static void get_counters(const struct arpt_table_info *t,
811 struct arpt_counters counters[]) 813 struct arpt_counters counters[])
812{ 814{
813 unsigned int cpu; 815 unsigned int cpu;
814 unsigned int i; 816 unsigned int i;
817 unsigned int curcpu;
818
819 /* Instead of clearing (by a previous call to memset())
820 * the counters and using adds, we set the counters
821 * with data used by 'current' CPU
822 * We dont care about preemption here.
823 */
824 curcpu = raw_smp_processor_id();
825
826 i = 0;
827 ARPT_ENTRY_ITERATE(t->entries[curcpu],
828 t->size,
829 set_entry_to_counter,
830 counters,
831 &i);
815 832
816 for_each_cpu(cpu) { 833 for_each_cpu(cpu) {
834 if (cpu == curcpu)
835 continue;
817 i = 0; 836 i = 0;
818 ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), 837 ARPT_ENTRY_ITERATE(t->entries[cpu],
819 t->size, 838 t->size,
820 add_entry_to_counter, 839 add_entry_to_counter,
821 counters, 840 counters,
@@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size,
831 struct arpt_entry *e; 850 struct arpt_entry *e;
832 struct arpt_counters *counters; 851 struct arpt_counters *counters;
833 int ret = 0; 852 int ret = 0;
853 void *loc_cpu_entry;
834 854
835 /* We need atomic snapshot of counters: rest doesn't change 855 /* We need atomic snapshot of counters: rest doesn't change
836 * (other than comefrom, which userspace doesn't care 856 * (other than comefrom, which userspace doesn't care
@@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size,
843 return -ENOMEM; 863 return -ENOMEM;
844 864
845 /* First, sum counters... */ 865 /* First, sum counters... */
846 memset(counters, 0, countersize);
847 write_lock_bh(&table->lock); 866 write_lock_bh(&table->lock);
848 get_counters(table->private, counters); 867 get_counters(table->private, counters);
849 write_unlock_bh(&table->lock); 868 write_unlock_bh(&table->lock);
850 869
851 /* ... then copy entire thing from CPU 0... */ 870 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
852 if (copy_to_user(userptr, table->private->entries, total_size) != 0) { 871 /* ... then copy entire thing ... */
872 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
853 ret = -EFAULT; 873 ret = -EFAULT;
854 goto free_counters; 874 goto free_counters;
855 } 875 }
@@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size,
859 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 879 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
860 struct arpt_entry_target *t; 880 struct arpt_entry_target *t;
861 881
862 e = (struct arpt_entry *)(table->private->entries + off); 882 e = (struct arpt_entry *)(loc_cpu_entry + off);
863 if (copy_to_user(userptr + off 883 if (copy_to_user(userptr + off
864 + offsetof(struct arpt_entry, counters), 884 + offsetof(struct arpt_entry, counters),
865 &counters[num], 885 &counters[num],
@@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries,
911 return ret; 931 return ret;
912} 932}
913 933
934static void free_table_info(struct arpt_table_info *info)
935{
936 int cpu;
937 for_each_cpu(cpu) {
938 if (info->size <= PAGE_SIZE)
939 kfree(info->entries[cpu]);
940 else
941 vfree(info->entries[cpu]);
942 }
943 kfree(info);
944}
945
946static struct arpt_table_info *alloc_table_info(unsigned int size)
947{
948 struct arpt_table_info *newinfo;
949 int cpu;
950
951 newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL);
952 if (!newinfo)
953 return NULL;
954
955 newinfo->size = size;
956
957 for_each_cpu(cpu) {
958 if (size <= PAGE_SIZE)
959 newinfo->entries[cpu] = kmalloc_node(size,
960 GFP_KERNEL,
961 cpu_to_node(cpu));
962 else
963 newinfo->entries[cpu] = vmalloc_node(size,
964 cpu_to_node(cpu));
965
966 if (newinfo->entries[cpu] == NULL) {
967 free_table_info(newinfo);
968 return NULL;
969 }
970 }
971
972 return newinfo;
973}
974
914static int do_replace(void __user *user, unsigned int len) 975static int do_replace(void __user *user, unsigned int len)
915{ 976{
916 int ret; 977 int ret;
@@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len)
918 struct arpt_table *t; 979 struct arpt_table *t;
919 struct arpt_table_info *newinfo, *oldinfo; 980 struct arpt_table_info *newinfo, *oldinfo;
920 struct arpt_counters *counters; 981 struct arpt_counters *counters;
982 void *loc_cpu_entry, *loc_cpu_old_entry;
921 983
922 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 984 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
923 return -EFAULT; 985 return -EFAULT;
@@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len)
930 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) 992 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
931 return -ENOMEM; 993 return -ENOMEM;
932 994
933 newinfo = vmalloc(sizeof(struct arpt_table_info) 995 newinfo = alloc_table_info(tmp.size);
934 + SMP_ALIGN(tmp.size) *
935 (highest_possible_processor_id()+1));
936 if (!newinfo) 996 if (!newinfo)
937 return -ENOMEM; 997 return -ENOMEM;
938 998
939 if (copy_from_user(newinfo->entries, user + sizeof(tmp), 999 /* choose the copy that is on our node/cpu */
1000 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1001 if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
940 tmp.size) != 0) { 1002 tmp.size) != 0) {
941 ret = -EFAULT; 1003 ret = -EFAULT;
942 goto free_newinfo; 1004 goto free_newinfo;
@@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len)
947 ret = -ENOMEM; 1009 ret = -ENOMEM;
948 goto free_newinfo; 1010 goto free_newinfo;
949 } 1011 }
950 memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
951 1012
952 ret = translate_table(tmp.name, tmp.valid_hooks, 1013 ret = translate_table(tmp.name, tmp.valid_hooks,
953 newinfo, tmp.size, tmp.num_entries, 1014 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
954 tmp.hook_entry, tmp.underflow); 1015 tmp.hook_entry, tmp.underflow);
955 if (ret != 0) 1016 if (ret != 0)
956 goto free_newinfo_counters; 1017 goto free_newinfo_counters;
@@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len)
989 /* Get the old counters. */ 1050 /* Get the old counters. */
990 get_counters(oldinfo, counters); 1051 get_counters(oldinfo, counters);
991 /* Decrease module usage counts and free resource */ 1052 /* Decrease module usage counts and free resource */
992 ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); 1053 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
993 vfree(oldinfo); 1054 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
1055
1056 free_table_info(oldinfo);
994 if (copy_to_user(tmp.counters, counters, 1057 if (copy_to_user(tmp.counters, counters,
995 sizeof(struct arpt_counters) * tmp.num_counters) != 0) 1058 sizeof(struct arpt_counters) * tmp.num_counters) != 0)
996 ret = -EFAULT; 1059 ret = -EFAULT;
@@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len)
1002 module_put(t->me); 1065 module_put(t->me);
1003 up(&arpt_mutex); 1066 up(&arpt_mutex);
1004 free_newinfo_counters_untrans: 1067 free_newinfo_counters_untrans:
1005 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); 1068 ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
1006 free_newinfo_counters: 1069 free_newinfo_counters:
1007 vfree(counters); 1070 vfree(counters);
1008 free_newinfo: 1071 free_newinfo:
1009 vfree(newinfo); 1072 free_table_info(newinfo);
1010 return ret; 1073 return ret;
1011} 1074}
1012 1075
@@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len)
1030 struct arpt_counters_info tmp, *paddc; 1093 struct arpt_counters_info tmp, *paddc;
1031 struct arpt_table *t; 1094 struct arpt_table *t;
1032 int ret = 0; 1095 int ret = 0;
1096 void *loc_cpu_entry;
1033 1097
1034 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1098 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1035 return -EFAULT; 1099 return -EFAULT;
@@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len)
1059 } 1123 }
1060 1124
1061 i = 0; 1125 i = 0;
1062 ARPT_ENTRY_ITERATE(t->private->entries, 1126 /* Choose the copy that is on our node */
1127 loc_cpu_entry = t->private->entries[smp_processor_id()];
1128 ARPT_ENTRY_ITERATE(loc_cpu_entry,
1063 t->private->size, 1129 t->private->size,
1064 add_counter_to_entry, 1130 add_counter_to_entry,
1065 paddc->counters, 1131 paddc->counters,
@@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table,
1220 struct arpt_table_info *newinfo; 1286 struct arpt_table_info *newinfo;
1221 static struct arpt_table_info bootstrap 1287 static struct arpt_table_info bootstrap
1222 = { 0, 0, 0, { 0 }, { 0 }, { } }; 1288 = { 0, 0, 0, { 0 }, { 0 }, { } };
1289 void *loc_cpu_entry;
1223 1290
1224 newinfo = vmalloc(sizeof(struct arpt_table_info) 1291 newinfo = alloc_table_info(repl->size);
1225 + SMP_ALIGN(repl->size) *
1226 (highest_possible_processor_id()+1));
1227 if (!newinfo) { 1292 if (!newinfo) {
1228 ret = -ENOMEM; 1293 ret = -ENOMEM;
1229 return ret; 1294 return ret;
1230 } 1295 }
1231 memcpy(newinfo->entries, repl->entries, repl->size); 1296
1297 /* choose the copy on our node/cpu */
1298 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1299 memcpy(loc_cpu_entry, repl->entries, repl->size);
1232 1300
1233 ret = translate_table(table->name, table->valid_hooks, 1301 ret = translate_table(table->name, table->valid_hooks,
1234 newinfo, repl->size, 1302 newinfo, loc_cpu_entry, repl->size,
1235 repl->num_entries, 1303 repl->num_entries,
1236 repl->hook_entry, 1304 repl->hook_entry,
1237 repl->underflow); 1305 repl->underflow);
1238 duprintf("arpt_register_table: translate table gives %d\n", ret); 1306 duprintf("arpt_register_table: translate table gives %d\n", ret);
1239 if (ret != 0) { 1307 if (ret != 0) {
1240 vfree(newinfo); 1308 free_table_info(newinfo);
1241 return ret; 1309 return ret;
1242 } 1310 }
1243 1311
1244 ret = down_interruptible(&arpt_mutex); 1312 ret = down_interruptible(&arpt_mutex);
1245 if (ret != 0) { 1313 if (ret != 0) {
1246 vfree(newinfo); 1314 free_table_info(newinfo);
1247 return ret; 1315 return ret;
1248 } 1316 }
1249 1317
@@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table,
1272 return ret; 1340 return ret;
1273 1341
1274 free_unlock: 1342 free_unlock:
1275 vfree(newinfo); 1343 free_table_info(newinfo);
1276 goto unlock; 1344 goto unlock;
1277} 1345}
1278 1346
1279void arpt_unregister_table(struct arpt_table *table) 1347void arpt_unregister_table(struct arpt_table *table)
1280{ 1348{
1349 void *loc_cpu_entry;
1350
1281 down(&arpt_mutex); 1351 down(&arpt_mutex);
1282 LIST_DELETE(&arpt_tables, table); 1352 LIST_DELETE(&arpt_tables, table);
1283 up(&arpt_mutex); 1353 up(&arpt_mutex);
1284 1354
1285 /* Decrease module usage counts and free resources */ 1355 /* Decrease module usage counts and free resources */
1286 ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, 1356 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1357 ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
1287 cleanup_entry, NULL); 1358 cleanup_entry, NULL);
1288 vfree(table->private); 1359 free_table_info(table->private);
1289} 1360}
1290 1361
1291/* The built-in targets: standard (NULL) and error. */ 1362/* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index e52847fa10f5..0366eedb4d70 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -18,11 +18,13 @@
18 * 18 *
19 */ 19 */
20 20
21#include <linux/in.h>
21#include <linux/kernel.h> 22#include <linux/kernel.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/netfilter.h> 24#include <linux/netfilter.h>
24#include <linux/ip.h> 25#include <linux/ip.h>
25#include <linux/moduleparam.h> 26#include <linux/moduleparam.h>
27#include <linux/udp.h>
26#include <net/checksum.h> 28#include <net/checksum.h>
27#include <net/udp.h> 29#include <net/udp.h>
28 30
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 744abb9d377a..57956dee60c8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -31,6 +31,7 @@
31#include <linux/ip.h> 31#include <linux/ip.h>
32#include <linux/in.h> 32#include <linux/in.h>
33#include <linux/list.h> 33#include <linux/list.h>
34#include <linux/seq_file.h>
34 35
35static DEFINE_RWLOCK(ip_ct_gre_lock); 36static DEFINE_RWLOCK(ip_ct_gre_lock);
36#define ASSERT_READ_LOCK(x) 37#define ASSERT_READ_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index f2dcac7c7660..46becbe4fe58 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -11,6 +11,7 @@
11#include <linux/timer.h> 11#include <linux/timer.h>
12#include <linux/netfilter.h> 12#include <linux/netfilter.h>
13#include <linux/in.h> 13#include <linux/in.h>
14#include <linux/ip.h>
14#include <linux/udp.h> 15#include <linux/udp.h>
15#include <linux/seq_file.h> 16#include <linux/seq_file.h>
16#include <net/checksum.h> 17#include <net/checksum.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index dd476b191f4b..a88bcc551244 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -27,6 +27,7 @@
27#endif 27#endif
28#include <net/checksum.h> 28#include <net/checksum.h>
29#include <net/ip.h> 29#include <net/ip.h>
30#include <net/route.h>
30 31
31#define ASSERT_READ_LOCK(x) 32#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) 33#define ASSERT_WRITE_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 8acb7ed40b47..4f95d477805c 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -44,6 +44,7 @@
44 * 44 *
45 */ 45 */
46#include <linux/config.h> 46#include <linux/config.h>
47#include <linux/in.h>
47#include <linux/module.h> 48#include <linux/module.h>
48#include <linux/types.h> 49#include <linux/types.h>
49#include <linux/kernel.h> 50#include <linux/kernel.h>
@@ -53,6 +54,7 @@
53#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 54#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
54#include <linux/netfilter_ipv4/ip_nat_helper.h> 55#include <linux/netfilter_ipv4/ip_nat_helper.h>
55#include <linux/ip.h> 56#include <linux/ip.h>
57#include <linux/udp.h>
56#include <net/checksum.h> 58#include <net/checksum.h>
57#include <net/udp.h> 59#include <net/udp.h>
58#include <asm/uaccess.h> 60#include <asm/uaccess.h>
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 45886c8475e8..2a26d167e149 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex);
83 context stops packets coming through and allows user context to read 83 context stops packets coming through and allows user context to read
84 the counters or update the rules. 84 the counters or update the rules.
85 85
86 To be cache friendly on SMP, we arrange them like so:
87 [ n-entries ]
88 ... cache-align padding ...
89 [ n-entries ]
90
91 Hence the start of any table is given by get_table() below. */ 86 Hence the start of any table is given by get_table() below. */
92 87
93/* The table itself */ 88/* The table itself */
@@ -105,20 +100,15 @@ struct ipt_table_info
105 unsigned int underflow[NF_IP_NUMHOOKS]; 100 unsigned int underflow[NF_IP_NUMHOOKS];
106 101
107 /* ipt_entry tables: one per CPU */ 102 /* ipt_entry tables: one per CPU */
108 char entries[0] ____cacheline_aligned; 103 void *entries[NR_CPUS];
109}; 104};
110 105
111static LIST_HEAD(ipt_target); 106static LIST_HEAD(ipt_target);
112static LIST_HEAD(ipt_match); 107static LIST_HEAD(ipt_match);
113static LIST_HEAD(ipt_tables); 108static LIST_HEAD(ipt_tables);
109#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
114#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) 110#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
115 111
116#ifdef CONFIG_SMP
117#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
118#else
119#define TABLE_OFFSET(t,p) 0
120#endif
121
122#if 0 112#if 0
123#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) 113#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
124#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) 114#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb,
290 280
291 read_lock_bh(&table->lock); 281 read_lock_bh(&table->lock);
292 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 282 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
293 table_base = (void *)table->private->entries 283 table_base = (void *)table->private->entries[smp_processor_id()];
294 + TABLE_OFFSET(table->private, smp_processor_id());
295 e = get_entry(table_base, table->private->hook_entry[hook]); 284 e = get_entry(table_base, table->private->hook_entry[hook]);
296 285
297#ifdef CONFIG_NETFILTER_DEBUG 286#ifdef CONFIG_NETFILTER_DEBUG
@@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip)
563/* Figures out from what hook each rule can be called: returns 0 if 552/* Figures out from what hook each rule can be called: returns 0 if
564 there are loops. Puts hook bitmask in comefrom. */ 553 there are loops. Puts hook bitmask in comefrom. */
565static int 554static int
566mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) 555mark_source_chains(struct ipt_table_info *newinfo,
556 unsigned int valid_hooks, void *entry0)
567{ 557{
568 unsigned int hook; 558 unsigned int hook;
569 559
@@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
572 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { 562 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
573 unsigned int pos = newinfo->hook_entry[hook]; 563 unsigned int pos = newinfo->hook_entry[hook];
574 struct ipt_entry *e 564 struct ipt_entry *e
575 = (struct ipt_entry *)(newinfo->entries + pos); 565 = (struct ipt_entry *)(entry0 + pos);
576 566
577 if (!(valid_hooks & (1 << hook))) 567 if (!(valid_hooks & (1 << hook)))
578 continue; 568 continue;
@@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
622 goto next; 612 goto next;
623 613
624 e = (struct ipt_entry *) 614 e = (struct ipt_entry *)
625 (newinfo->entries + pos); 615 (entry0 + pos);
626 } while (oldpos == pos + e->next_offset); 616 } while (oldpos == pos + e->next_offset);
627 617
628 /* Move along one */ 618 /* Move along one */
629 size = e->next_offset; 619 size = e->next_offset;
630 e = (struct ipt_entry *) 620 e = (struct ipt_entry *)
631 (newinfo->entries + pos + size); 621 (entry0 + pos + size);
632 e->counters.pcnt = pos; 622 e->counters.pcnt = pos;
633 pos += size; 623 pos += size;
634 } else { 624 } else {
@@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
645 newpos = pos + e->next_offset; 635 newpos = pos + e->next_offset;
646 } 636 }
647 e = (struct ipt_entry *) 637 e = (struct ipt_entry *)
648 (newinfo->entries + newpos); 638 (entry0 + newpos);
649 e->counters.pcnt = pos; 639 e->counters.pcnt = pos;
650 pos = newpos; 640 pos = newpos;
651 } 641 }
@@ -855,6 +845,7 @@ static int
855translate_table(const char *name, 845translate_table(const char *name,
856 unsigned int valid_hooks, 846 unsigned int valid_hooks,
857 struct ipt_table_info *newinfo, 847 struct ipt_table_info *newinfo,
848 void *entry0,
858 unsigned int size, 849 unsigned int size,
859 unsigned int number, 850 unsigned int number,
860 const unsigned int *hook_entries, 851 const unsigned int *hook_entries,
@@ -875,11 +866,11 @@ translate_table(const char *name,
875 duprintf("translate_table: size %u\n", newinfo->size); 866 duprintf("translate_table: size %u\n", newinfo->size);
876 i = 0; 867 i = 0;
877 /* Walk through entries, checking offsets. */ 868 /* Walk through entries, checking offsets. */
878 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 869 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
879 check_entry_size_and_hooks, 870 check_entry_size_and_hooks,
880 newinfo, 871 newinfo,
881 newinfo->entries, 872 entry0,
882 newinfo->entries + size, 873 entry0 + size,
883 hook_entries, underflows, &i); 874 hook_entries, underflows, &i);
884 if (ret != 0) 875 if (ret != 0)
885 return ret; 876 return ret;
@@ -907,27 +898,24 @@ translate_table(const char *name,
907 } 898 }
908 } 899 }
909 900
910 if (!mark_source_chains(newinfo, valid_hooks)) 901 if (!mark_source_chains(newinfo, valid_hooks, entry0))
911 return -ELOOP; 902 return -ELOOP;
912 903
913 /* Finally, each sanity check must pass */ 904 /* Finally, each sanity check must pass */
914 i = 0; 905 i = 0;
915 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 906 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
916 check_entry, name, size, &i); 907 check_entry, name, size, &i);
917 908
918 if (ret != 0) { 909 if (ret != 0) {
919 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 910 IPT_ENTRY_ITERATE(entry0, newinfo->size,
920 cleanup_entry, &i); 911 cleanup_entry, &i);
921 return ret; 912 return ret;
922 } 913 }
923 914
924 /* And one copy for every other CPU */ 915 /* And one copy for every other CPU */
925 for_each_cpu(i) { 916 for_each_cpu(i) {
926 if (i == 0) 917 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
927 continue; 918 memcpy(newinfo->entries[i], entry0, newinfo->size);
928 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
929 newinfo->entries,
930 SMP_ALIGN(newinfo->size));
931 } 919 }
932 920
933 return ret; 921 return ret;
@@ -943,15 +931,12 @@ replace_table(struct ipt_table *table,
943 931
944#ifdef CONFIG_NETFILTER_DEBUG 932#ifdef CONFIG_NETFILTER_DEBUG
945 { 933 {
946 struct ipt_entry *table_base; 934 int cpu;
947 unsigned int i;
948 935
949 for_each_cpu(i) { 936 for_each_cpu(cpu) {
950 table_base = 937 struct ipt_entry *table_base = newinfo->entries[cpu];
951 (void *)newinfo->entries 938 if (table_base)
952 + TABLE_OFFSET(newinfo, i); 939 table_base->comefrom = 0xdead57ac;
953
954 table_base->comefrom = 0xdead57ac;
955 } 940 }
956 } 941 }
957#endif 942#endif
@@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e,
986 return 0; 971 return 0;
987} 972}
988 973
974static inline int
975set_entry_to_counter(const struct ipt_entry *e,
976 struct ipt_counters total[],
977 unsigned int *i)
978{
979 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
980
981 (*i)++;
982 return 0;
983}
984
989static void 985static void
990get_counters(const struct ipt_table_info *t, 986get_counters(const struct ipt_table_info *t,
991 struct ipt_counters counters[]) 987 struct ipt_counters counters[])
992{ 988{
993 unsigned int cpu; 989 unsigned int cpu;
994 unsigned int i; 990 unsigned int i;
991 unsigned int curcpu;
992
993 /* Instead of clearing (by a previous call to memset())
994 * the counters and using adds, we set the counters
995 * with data used by 'current' CPU
996 * We dont care about preemption here.
997 */
998 curcpu = raw_smp_processor_id();
999
1000 i = 0;
1001 IPT_ENTRY_ITERATE(t->entries[curcpu],
1002 t->size,
1003 set_entry_to_counter,
1004 counters,
1005 &i);
995 1006
996 for_each_cpu(cpu) { 1007 for_each_cpu(cpu) {
1008 if (cpu == curcpu)
1009 continue;
997 i = 0; 1010 i = 0;
998 IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), 1011 IPT_ENTRY_ITERATE(t->entries[cpu],
999 t->size, 1012 t->size,
1000 add_entry_to_counter, 1013 add_entry_to_counter,
1001 counters, 1014 counters,
@@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size,
1012 struct ipt_entry *e; 1025 struct ipt_entry *e;
1013 struct ipt_counters *counters; 1026 struct ipt_counters *counters;
1014 int ret = 0; 1027 int ret = 0;
1028 void *loc_cpu_entry;
1015 1029
1016 /* We need atomic snapshot of counters: rest doesn't change 1030 /* We need atomic snapshot of counters: rest doesn't change
1017 (other than comefrom, which userspace doesn't care 1031 (other than comefrom, which userspace doesn't care
1018 about). */ 1032 about). */
1019 countersize = sizeof(struct ipt_counters) * table->private->number; 1033 countersize = sizeof(struct ipt_counters) * table->private->number;
1020 counters = vmalloc(countersize); 1034 counters = vmalloc_node(countersize, numa_node_id());
1021 1035
1022 if (counters == NULL) 1036 if (counters == NULL)
1023 return -ENOMEM; 1037 return -ENOMEM;
1024 1038
1025 /* First, sum counters... */ 1039 /* First, sum counters... */
1026 memset(counters, 0, countersize);
1027 write_lock_bh(&table->lock); 1040 write_lock_bh(&table->lock);
1028 get_counters(table->private, counters); 1041 get_counters(table->private, counters);
1029 write_unlock_bh(&table->lock); 1042 write_unlock_bh(&table->lock);
1030 1043
1031 /* ... then copy entire thing from CPU 0... */ 1044 /* choose the copy that is on our node/cpu, ...
1032 if (copy_to_user(userptr, table->private->entries, total_size) != 0) { 1045 * This choice is lazy (because current thread is
1046 * allowed to migrate to another cpu)
1047 */
1048 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1049 /* ... then copy entire thing ... */
1050 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
1033 ret = -EFAULT; 1051 ret = -EFAULT;
1034 goto free_counters; 1052 goto free_counters;
1035 } 1053 }
@@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size,
1041 struct ipt_entry_match *m; 1059 struct ipt_entry_match *m;
1042 struct ipt_entry_target *t; 1060 struct ipt_entry_target *t;
1043 1061
1044 e = (struct ipt_entry *)(table->private->entries + off); 1062 e = (struct ipt_entry *)(loc_cpu_entry + off);
1045 if (copy_to_user(userptr + off 1063 if (copy_to_user(userptr + off
1046 + offsetof(struct ipt_entry, counters), 1064 + offsetof(struct ipt_entry, counters),
1047 &counters[num], 1065 &counters[num],
@@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries,
1110 return ret; 1128 return ret;
1111} 1129}
1112 1130
1131static void free_table_info(struct ipt_table_info *info)
1132{
1133 int cpu;
1134 for_each_cpu(cpu) {
1135 if (info->size <= PAGE_SIZE)
1136 kfree(info->entries[cpu]);
1137 else
1138 vfree(info->entries[cpu]);
1139 }
1140 kfree(info);
1141}
1142
1143static struct ipt_table_info *alloc_table_info(unsigned int size)
1144{
1145 struct ipt_table_info *newinfo;
1146 int cpu;
1147
1148 newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
1149 if (!newinfo)
1150 return NULL;
1151
1152 newinfo->size = size;
1153
1154 for_each_cpu(cpu) {
1155 if (size <= PAGE_SIZE)
1156 newinfo->entries[cpu] = kmalloc_node(size,
1157 GFP_KERNEL,
1158 cpu_to_node(cpu));
1159 else
1160 newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu));
1161 if (newinfo->entries[cpu] == 0) {
1162 free_table_info(newinfo);
1163 return NULL;
1164 }
1165 }
1166
1167 return newinfo;
1168}
1169
1113static int 1170static int
1114do_replace(void __user *user, unsigned int len) 1171do_replace(void __user *user, unsigned int len)
1115{ 1172{
@@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len)
1118 struct ipt_table *t; 1175 struct ipt_table *t;
1119 struct ipt_table_info *newinfo, *oldinfo; 1176 struct ipt_table_info *newinfo, *oldinfo;
1120 struct ipt_counters *counters; 1177 struct ipt_counters *counters;
1178 void *loc_cpu_entry, *loc_cpu_old_entry;
1121 1179
1122 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1180 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1123 return -EFAULT; 1181 return -EFAULT;
@@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len)
1130 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) 1188 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
1131 return -ENOMEM; 1189 return -ENOMEM;
1132 1190
1133 newinfo = vmalloc(sizeof(struct ipt_table_info) 1191 newinfo = alloc_table_info(tmp.size);
1134 + SMP_ALIGN(tmp.size) *
1135 (highest_possible_processor_id()+1));
1136 if (!newinfo) 1192 if (!newinfo)
1137 return -ENOMEM; 1193 return -ENOMEM;
1138 1194
1139 if (copy_from_user(newinfo->entries, user + sizeof(tmp), 1195 /* choose the copy that is our node/cpu */
1196 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1197 if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
1140 tmp.size) != 0) { 1198 tmp.size) != 0) {
1141 ret = -EFAULT; 1199 ret = -EFAULT;
1142 goto free_newinfo; 1200 goto free_newinfo;
@@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len)
1147 ret = -ENOMEM; 1205 ret = -ENOMEM;
1148 goto free_newinfo; 1206 goto free_newinfo;
1149 } 1207 }
1150 memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
1151 1208
1152 ret = translate_table(tmp.name, tmp.valid_hooks, 1209 ret = translate_table(tmp.name, tmp.valid_hooks,
1153 newinfo, tmp.size, tmp.num_entries, 1210 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1154 tmp.hook_entry, tmp.underflow); 1211 tmp.hook_entry, tmp.underflow);
1155 if (ret != 0) 1212 if (ret != 0)
1156 goto free_newinfo_counters; 1213 goto free_newinfo_counters;
@@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len)
1189 /* Get the old counters. */ 1246 /* Get the old counters. */
1190 get_counters(oldinfo, counters); 1247 get_counters(oldinfo, counters);
1191 /* Decrease module usage counts and free resource */ 1248 /* Decrease module usage counts and free resource */
1192 IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); 1249 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1193 vfree(oldinfo); 1250 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
1251 free_table_info(oldinfo);
1194 if (copy_to_user(tmp.counters, counters, 1252 if (copy_to_user(tmp.counters, counters,
1195 sizeof(struct ipt_counters) * tmp.num_counters) != 0) 1253 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
1196 ret = -EFAULT; 1254 ret = -EFAULT;
@@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len)
1202 module_put(t->me); 1260 module_put(t->me);
1203 up(&ipt_mutex); 1261 up(&ipt_mutex);
1204 free_newinfo_counters_untrans: 1262 free_newinfo_counters_untrans:
1205 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); 1263 IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
1206 free_newinfo_counters: 1264 free_newinfo_counters:
1207 vfree(counters); 1265 vfree(counters);
1208 free_newinfo: 1266 free_newinfo:
1209 vfree(newinfo); 1267 free_table_info(newinfo);
1210 return ret; 1268 return ret;
1211} 1269}
1212 1270
@@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len)
1239 struct ipt_counters_info tmp, *paddc; 1297 struct ipt_counters_info tmp, *paddc;
1240 struct ipt_table *t; 1298 struct ipt_table *t;
1241 int ret = 0; 1299 int ret = 0;
1300 void *loc_cpu_entry;
1242 1301
1243 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1302 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1244 return -EFAULT; 1303 return -EFAULT;
@@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len)
1246 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) 1305 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
1247 return -EINVAL; 1306 return -EINVAL;
1248 1307
1249 paddc = vmalloc(len); 1308 paddc = vmalloc_node(len, numa_node_id());
1250 if (!paddc) 1309 if (!paddc)
1251 return -ENOMEM; 1310 return -ENOMEM;
1252 1311
@@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len)
1268 } 1327 }
1269 1328
1270 i = 0; 1329 i = 0;
1271 IPT_ENTRY_ITERATE(t->private->entries, 1330 /* Choose the copy that is on our node */
1331 loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
1332 IPT_ENTRY_ITERATE(loc_cpu_entry,
1272 t->private->size, 1333 t->private->size,
1273 add_counter_to_entry, 1334 add_counter_to_entry,
1274 paddc->counters, 1335 paddc->counters,
@@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
1460 struct ipt_table_info *newinfo; 1521 struct ipt_table_info *newinfo;
1461 static struct ipt_table_info bootstrap 1522 static struct ipt_table_info bootstrap
1462 = { 0, 0, 0, { 0 }, { 0 }, { } }; 1523 = { 0, 0, 0, { 0 }, { 0 }, { } };
1524 void *loc_cpu_entry;
1463 1525
1464 newinfo = vmalloc(sizeof(struct ipt_table_info) 1526 newinfo = alloc_table_info(repl->size);
1465 + SMP_ALIGN(repl->size) *
1466 (highest_possible_processor_id()+1));
1467 if (!newinfo) 1527 if (!newinfo)
1468 return -ENOMEM; 1528 return -ENOMEM;
1469 1529
1470 memcpy(newinfo->entries, repl->entries, repl->size); 1530 /* choose the copy on our node/cpu
1531 * but dont care of preemption
1532 */
1533 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1534 memcpy(loc_cpu_entry, repl->entries, repl->size);
1471 1535
1472 ret = translate_table(table->name, table->valid_hooks, 1536 ret = translate_table(table->name, table->valid_hooks,
1473 newinfo, repl->size, 1537 newinfo, loc_cpu_entry, repl->size,
1474 repl->num_entries, 1538 repl->num_entries,
1475 repl->hook_entry, 1539 repl->hook_entry,
1476 repl->underflow); 1540 repl->underflow);
1477 if (ret != 0) { 1541 if (ret != 0) {
1478 vfree(newinfo); 1542 free_table_info(newinfo);
1479 return ret; 1543 return ret;
1480 } 1544 }
1481 1545
1482 ret = down_interruptible(&ipt_mutex); 1546 ret = down_interruptible(&ipt_mutex);
1483 if (ret != 0) { 1547 if (ret != 0) {
1484 vfree(newinfo); 1548 free_table_info(newinfo);
1485 return ret; 1549 return ret;
1486 } 1550 }
1487 1551
@@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
1510 return ret; 1574 return ret;
1511 1575
1512 free_unlock: 1576 free_unlock:
1513 vfree(newinfo); 1577 free_table_info(newinfo);
1514 goto unlock; 1578 goto unlock;
1515} 1579}
1516 1580
1517void ipt_unregister_table(struct ipt_table *table) 1581void ipt_unregister_table(struct ipt_table *table)
1518{ 1582{
1583 void *loc_cpu_entry;
1584
1519 down(&ipt_mutex); 1585 down(&ipt_mutex);
1520 LIST_DELETE(&ipt_tables, table); 1586 LIST_DELETE(&ipt_tables, table);
1521 up(&ipt_mutex); 1587 up(&ipt_mutex);
1522 1588
1523 /* Decrease module usage counts and free resources */ 1589 /* Decrease module usage counts and free resources */
1524 IPT_ENTRY_ITERATE(table->private->entries, table->private->size, 1590 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1591 IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
1525 cleanup_entry, NULL); 1592 cleanup_entry, NULL);
1526 vfree(table->private); 1593 free_table_info(table->private);
1527} 1594}
1528 1595
1529/* Returns 1 if the port is matched by the range, 0 otherwise */ 1596/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 275a174c6fe6..27860510ca6d 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/inetdevice.h>
14#include <linux/ip.h> 15#include <linux/ip.h>
15#include <linux/timer.h> 16#include <linux/timer.h>
16#include <linux/module.h> 17#include <linux/module.h>
@@ -18,6 +19,7 @@
18#include <net/protocol.h> 19#include <net/protocol.h>
19#include <net/ip.h> 20#include <net/ip.h>
20#include <net/checksum.h> 21#include <net/checksum.h>
22#include <net/route.h>
21#include <linux/netfilter_ipv4.h> 23#include <linux/netfilter_ipv4.h>
22#include <linux/netfilter_ipv4/ip_nat_rule.h> 24#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#include <linux/netfilter_ipv4/ip_tables.h> 25#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
index 1a53924041fc..03f554857a4d 100644
--- a/net/ipv4/netfilter/ipt_physdev.c
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/netdevice.h>
12#include <linux/skbuff.h> 13#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ipt_physdev.h> 14#include <linux/netfilter_ipv4/ipt_physdev.h>
14#include <linux/netfilter_ipv4/ip_tables.h> 15#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0d7dc668db46..39d49dc333a7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -38,6 +38,7 @@
38#include <net/protocol.h> 38#include <net/protocol.h>
39#include <net/tcp.h> 39#include <net/tcp.h>
40#include <net/udp.h> 40#include <net/udp.h>
41#include <linux/inetdevice.h>
41#include <linux/proc_fs.h> 42#include <linux/proc_fs.h>
42#include <linux/seq_file.h> 43#include <linux/seq_file.h>
43#include <net/sock.h> 44#include <net/sock.h>
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a34e60ea48a1..e20be3331f67 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
173 struct request_sock *req, 173 struct request_sock *req,
174 struct dst_entry *dst) 174 struct dst_entry *dst)
175{ 175{
176 struct tcp_sock *tp = tcp_sk(sk); 176 struct inet_connection_sock *icsk = inet_csk(sk);
177 struct sock *child; 177 struct sock *child;
178 178
179 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); 179 child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
180 if (child) 180 if (child)
181 inet_csk_reqsk_queue_add(sk, req, child); 181 inet_csk_reqsk_queue_add(sk, req, child);
182 else 182 else
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48b..16984d4a8a06 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
12#include <linux/sysctl.h> 12#include <linux/sysctl.h>
13#include <linux/config.h> 13#include <linux/config.h>
14#include <linux/igmp.h> 14#include <linux/igmp.h>
15#include <linux/inetdevice.h>
15#include <net/snmp.h> 16#include <net/snmp.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -22,6 +23,7 @@
22extern int sysctl_ip_nonlocal_bind; 23extern int sysctl_ip_nonlocal_bind;
23 24
24#ifdef CONFIG_SYSCTL 25#ifdef CONFIG_SYSCTL
26static int zero;
25static int tcp_retr1_max = 255; 27static int tcp_retr1_max = 255;
26static int ip_local_port_range_min[] = { 1, 1 }; 28static int ip_local_port_range_min[] = { 1, 1 };
27static int ip_local_port_range_max[] = { 65535, 65535 }; 29static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -614,6 +616,15 @@ ctl_table ipv4_table[] = {
614 .strategy = &sysctl_jiffies 616 .strategy = &sysctl_jiffies
615 }, 617 },
616 { 618 {
619 .ctl_name = NET_IPV4_IPFRAG_MAX_DIST,
620 .procname = "ipfrag_max_dist",
621 .data = &sysctl_ipfrag_max_dist,
622 .maxlen = sizeof(int),
623 .mode = 0644,
624 .proc_handler = &proc_dointvec_minmax,
625 .extra1 = &zero
626 },
627 {
617 .ctl_name = NET_TCP_NO_METRICS_SAVE, 628 .ctl_name = NET_TCP_NO_METRICS_SAVE,
618 .procname = "tcp_no_metrics_save", 629 .procname = "tcp_no_metrics_save",
619 .data = &sysctl_tcp_nometrics_save, 630 .data = &sysctl_tcp_nometrics_save,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14ac56d..00aa80e93243 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1696 int err = 0; 1696 int err = 0;
1697 1697
1698 if (level != SOL_TCP) 1698 if (level != SOL_TCP)
1699 return tp->af_specific->setsockopt(sk, level, optname, 1699 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1700 optval, optlen); 1700 optval, optlen);
1701 1701
1702 /* This is a string value all the others are int's */ 1702 /* This is a string value all the others are int's */
1703 if (optname == TCP_CONGESTION) { 1703 if (optname == TCP_CONGESTION) {
@@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
1914 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); 1914 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1915 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 1915 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1916 1916
1917 info->tcpi_pmtu = tp->pmtu_cookie; 1917 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
1918 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; 1918 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1919 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; 1919 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1920 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; 1920 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
@@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1939 int val, len; 1939 int val, len;
1940 1940
1941 if (level != SOL_TCP) 1941 if (level != SOL_TCP)
1942 return tp->af_specific->getsockopt(sk, level, optname, 1942 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
1943 optval, optlen); 1943 optval, optlen);
1944 1944
1945 if (get_user(len, optlen)) 1945 if (get_user(len, optlen))
1946 return -EFAULT; 1946 return -EFAULT;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 1d0cd86621b1..035f2092d73a 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -30,8 +30,6 @@ static int fast_convergence = 1;
30static int max_increment = 16; 30static int max_increment = 16;
31static int low_window = 14; 31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ 32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100; 33static int initial_ssthresh = 100;
36static int smooth_part = 20; 34static int smooth_part = 20;
37 35
@@ -43,10 +41,6 @@ module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); 41MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644); 42module_param(beta, int, 0644);
45MODULE_PARM_DESC(beta, "beta for multiplicative increase"); 43MODULE_PARM_DESC(beta, "beta for multiplicative increase");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
49MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)");
50module_param(initial_ssthresh, int, 0644); 44module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); 45MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644); 46module_param(smooth_part, int, 0644);
@@ -60,11 +54,6 @@ struct bictcp {
60 u32 loss_cwnd; /* congestion window at last loss */ 54 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */ 55 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */ 56 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */ 57 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4 58#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ 59 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
@@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca)
77 ca->loss_cwnd = 0; 66 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0; 67 ca->last_cwnd = 0;
79 ca->last_time = 0; 68 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0; 69 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT; 70 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87} 71}
@@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
143 } 127 }
144 128
145 /* if in slow start or link utilization is very low */ 129 /* if in slow start or link utilization is very low */
146 if ( ca->loss_cwnd == 0 || 130 if (ca->loss_cwnd == 0) {
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */ 131 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20; 132 ca->cnt = 20;
150 } 133 }
@@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
154 ca->cnt = 1; 137 ca->cnt = 1;
155} 138}
156 139
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct sock *sk, int flag)
160{
161 const struct tcp_sock *tp = tcp_sk(sk);
162 struct bictcp *ca = inet_csk_ca(sk);
163 u32 dist, delay;
164
165 /* No time stamp */
166 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
167 /* Discard delay samples right after fast recovery */
168 tcp_time_stamp < ca->epoch_start + HZ ||
169 /* this delay samples may not be accurate */
170 flag == 0) {
171 ca->last_delay = 0;
172 goto notlow;
173 }
174
175 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
176 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
177 if (delay == 0) /* no previous delay sample */
178 goto notlow;
179
180 /* first time call or link delay decreases */
181 if (ca->delay_min == 0 || ca->delay_min > delay) {
182 ca->delay_min = ca->delay_max = delay;
183 goto notlow;
184 }
185
186 if (ca->delay_max < delay)
187 ca->delay_max = delay;
188
189 /* utilization is low, if avg delay < dist*threshold
190 for checking_period time */
191 dist = ca->delay_max - ca->delay_min;
192 if (dist <= ca->delay_min>>6 ||
193 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
194 goto notlow;
195
196 if (ca->low_utilization_start == 0) {
197 ca->low_utilization = 0;
198 ca->low_utilization_start = tcp_time_stamp;
199 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
200 > low_utilization_period*HZ) {
201 ca->low_utilization = 1;
202 }
203
204 return;
205
206 notlow:
207 ca->low_utilization = 0;
208 ca->low_utilization_start = 0;
209
210}
211
212static void bictcp_cong_avoid(struct sock *sk, u32 ack, 140static void bictcp_cong_avoid(struct sock *sk, u32 ack,
213 u32 seq_rtt, u32 in_flight, int data_acked) 141 u32 seq_rtt, u32 in_flight, int data_acked)
214{ 142{
215 struct tcp_sock *tp = tcp_sk(sk); 143 struct tcp_sock *tp = tcp_sk(sk);
216 struct bictcp *ca = inet_csk_ca(sk); 144 struct bictcp *ca = inet_csk_ca(sk);
217 145
218 bictcp_low_utilization(sk, data_acked);
219
220 if (!tcp_is_cwnd_limited(sk, in_flight)) 146 if (!tcp_is_cwnd_limited(sk, in_flight))
221 return; 147 return;
222 148
@@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
249 175
250 ca->epoch_start = 0; /* end of epoch */ 176 ca->epoch_start = 0; /* end of epoch */
251 177
252 /* in case of wrong delay_max*/
253 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
254 ca->delay_max = ca->delay_min
255 + ((ca->delay_max - ca->delay_min)* 90) / 100;
256
257 /* Wmax and fast convergence */ 178 /* Wmax and fast convergence */
258 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) 179 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
259 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) 180 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
@@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state)
289 bictcp_reset(inet_csk_ca(sk)); 210 bictcp_reset(inet_csk_ca(sk));
290} 211}
291 212
292/* Track delayed acknowledgement ratio using sliding window 213/* Track delayed acknowledgment ratio using sliding window
293 * ratio = (15*ratio + sample) / 16 214 * ratio = (15*ratio + sample) / 16
294 */ 215 */
295static void bictcp_acked(struct sock *sk, u32 cnt) 216static void bictcp_acked(struct sock *sk, u32 cnt)
296{ 217{
297 const struct inet_connection_sock *icsk = inet_csk(sk); 218 const struct inet_connection_sock *icsk = inet_csk(sk);
298 219
299 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { 220 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
300 struct bictcp *ca = inet_csk_ca(sk); 221 struct bictcp *ca = inet_csk_ca(sk);
301 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 222 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
302 ca->delayed_ack += cnt; 223 ca->delayed_ack += cnt;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c7cc62c8dc12..e688c687d62d 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
174 return err; 174 return err;
175} 175}
176 176
177
178/*
179 * Linear increase during slow start
180 */
181void tcp_slow_start(struct tcp_sock *tp)
182{
183 if (sysctl_tcp_abc) {
184 /* RFC3465: Slow Start
185 * TCP sender SHOULD increase cwnd by the number of
186 * previously unacknowledged bytes ACKed by each incoming
187 * acknowledgment, provided the increase is not more than L
188 */
189 if (tp->bytes_acked < tp->mss_cache)
190 return;
191
192 /* We MAY increase by 2 if discovered delayed ack */
193 if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
194 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
195 tp->snd_cwnd++;
196 }
197 }
198 tp->bytes_acked = 0;
199
200 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
201 tp->snd_cwnd++;
202}
203EXPORT_SYMBOL_GPL(tcp_slow_start);
204
177/* 205/*
178 * TCP Reno congestion control 206 * TCP Reno congestion control
179 * This is special case used for fallback as well. 207 * This is special case used for fallback as well.
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 000000000000..31a4986dfbf7
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,411 @@
1/*
2 * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
3 *
4 * This is from the implementation of CUBIC TCP in
5 * Injong Rhee, Lisong Xu.
6 * "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
7 * in PFLDnet 2005
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
10 *
11 * Unless CUBIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19#include <asm/div64.h>
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
29
30static int fast_convergence = 1;
31static int max_increment = 16;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int initial_ssthresh = 100;
34static int bic_scale = 41;
35static int tcp_friendliness = 1;
36
37static u32 cube_rtt_scale;
38static u32 beta_scale;
39static u64 cube_factor;
40
41/* Note parameters that are used for precomputing scale factors are read-only */
42module_param(fast_convergence, int, 0644);
43MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
44module_param(max_increment, int, 0644);
45MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
46module_param(beta, int, 0444);
47MODULE_PARM_DESC(beta, "beta for multiplicative increase");
48module_param(initial_ssthresh, int, 0644);
49MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
50module_param(bic_scale, int, 0444);
51MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
52module_param(tcp_friendliness, int, 0644);
53MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
54
55#include <asm/div64.h>
56
57/* BIC TCP Parameters */
58struct bictcp {
59 u32 cnt; /* increase cwnd by 1 after ACKs */
60 u32 last_max_cwnd; /* last maximum snd_cwnd */
61 u32 loss_cwnd; /* congestion window at last loss */
62 u32 last_cwnd; /* the last snd_cwnd */
63 u32 last_time; /* time when updated last_cwnd */
64 u32 bic_origin_point;/* origin point of bic function */
65 u32 bic_K; /* time to origin point from the beginning of the current epoch */
66 u32 delay_min; /* min delay */
67 u32 epoch_start; /* beginning of an epoch */
68 u32 ack_cnt; /* number of acks */
69 u32 tcp_cwnd; /* estimated tcp cwnd */
70#define ACK_RATIO_SHIFT 4
71 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
72};
73
74static inline void bictcp_reset(struct bictcp *ca)
75{
76 ca->cnt = 0;
77 ca->last_max_cwnd = 0;
78 ca->loss_cwnd = 0;
79 ca->last_cwnd = 0;
80 ca->last_time = 0;
81 ca->bic_origin_point = 0;
82 ca->bic_K = 0;
83 ca->delay_min = 0;
84 ca->epoch_start = 0;
85 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
86 ca->ack_cnt = 0;
87 ca->tcp_cwnd = 0;
88}
89
90static void bictcp_init(struct sock *sk)
91{
92 bictcp_reset(inet_csk_ca(sk));
93 if (initial_ssthresh)
94 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
95}
96
97/* 64bit divisor, dividend and result. dynamic precision */
98static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
99{
100 u_int32_t d = divisor;
101
102 if (divisor > 0xffffffffULL) {
103 unsigned int shift = fls(divisor >> 32);
104
105 d = divisor >> shift;
106 dividend >>= shift;
107 }
108
109 /* avoid 64 bit division if possible */
110 if (dividend >> 32)
111 do_div(dividend, d);
112 else
113 dividend = (uint32_t) dividend / d;
114
115 return dividend;
116}
117
118/*
119 * calculate the cubic root of x using Newton-Raphson
120 */
121static u32 cubic_root(u64 a)
122{
123 u32 x, x1;
124
125 /* Initial estimate is based on:
126 * cbrt(x) = exp(log(x) / 3)
127 */
128 x = 1u << (fls64(a)/3);
129
130 /*
131 * Iteration based on:
132 * 2
133 * x = ( 2 * x + a / x ) / 3
134 * k+1 k k
135 */
136 do {
137 x1 = x;
138 x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
139 } while (abs(x1 - x) > 1);
140
141 return x;
142}
143
144/*
145 * Compute congestion window to use.
146 */
147static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
148{
149 u64 offs;
150 u32 delta, t, bic_target, min_cnt, max_cnt;
151
152 ca->ack_cnt++; /* count the number of ACKs */
153
154 if (ca->last_cwnd == cwnd &&
155 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
156 return;
157
158 ca->last_cwnd = cwnd;
159 ca->last_time = tcp_time_stamp;
160
161 if (ca->epoch_start == 0) {
162 ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
163 ca->ack_cnt = 1; /* start counting */
164 ca->tcp_cwnd = cwnd; /* syn with cubic */
165
166 if (ca->last_max_cwnd <= cwnd) {
167 ca->bic_K = 0;
168 ca->bic_origin_point = cwnd;
169 } else {
170 /* Compute new K based on
171 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
172 */
173 ca->bic_K = cubic_root(cube_factor
174 * (ca->last_max_cwnd - cwnd));
175 ca->bic_origin_point = ca->last_max_cwnd;
176 }
177 }
178
179 /* cubic function - calc*/
180 /* calculate c * time^3 / rtt,
181 * while considering overflow in calculation of time^3
182 * (so time^3 is done by using 64 bit)
183 * and without the support of division of 64bit numbers
184 * (so all divisions are done by using 32 bit)
185 * also NOTE the unit of those veriables
186 * time = (t - K) / 2^bictcp_HZ
187 * c = bic_scale >> 10
188 * rtt = (srtt >> 3) / HZ
189 * !!! The following code does not have overflow problems,
190 * if the cwnd < 1 million packets !!!
191 */
192
193 /* change the unit from HZ to bictcp_HZ */
194 t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
195 << BICTCP_HZ) / HZ;
196
197 if (t < ca->bic_K) /* t - K */
198 offs = ca->bic_K - t;
199 else
200 offs = t - ca->bic_K;
201
202 /* c/rtt * (t-K)^3 */
203 delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
204 if (t < ca->bic_K) /* below origin*/
205 bic_target = ca->bic_origin_point - delta;
206 else /* above origin*/
207 bic_target = ca->bic_origin_point + delta;
208
209 /* cubic function - calc bictcp_cnt*/
210 if (bic_target > cwnd) {
211 ca->cnt = cwnd / (bic_target - cwnd);
212 } else {
213 ca->cnt = 100 * cwnd; /* very small increment*/
214 }
215
216 if (ca->delay_min > 0) {
217 /* max increment = Smax * rtt / 0.1 */
218 min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
219 if (ca->cnt < min_cnt)
220 ca->cnt = min_cnt;
221 }
222
223 /* slow start and low utilization */
224 if (ca->loss_cwnd == 0) /* could be aggressive in slow start */
225 ca->cnt = 50;
226
227 /* TCP Friendly */
228 if (tcp_friendliness) {
229 u32 scale = beta_scale;
230 delta = (cwnd * scale) >> 3;
231 while (ca->ack_cnt > delta) { /* update tcp cwnd */
232 ca->ack_cnt -= delta;
233 ca->tcp_cwnd++;
234 }
235
236 if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
237 delta = ca->tcp_cwnd - cwnd;
238 max_cnt = cwnd / delta;
239 if (ca->cnt > max_cnt)
240 ca->cnt = max_cnt;
241 }
242 }
243
244 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
245 if (ca->cnt == 0) /* cannot be zero */
246 ca->cnt = 1;
247}
248
249
250/* Keep track of minimum rtt */
251static inline void measure_delay(struct sock *sk)
252{
253 const struct tcp_sock *tp = tcp_sk(sk);
254 struct bictcp *ca = inet_csk_ca(sk);
255 u32 delay;
256
257 /* No time stamp */
258 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
259 /* Discard delay samples right after fast recovery */
260 (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
261 return;
262
263 delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
264 if (delay == 0)
265 delay = 1;
266
267 /* first time call or link delay decreases */
268 if (ca->delay_min == 0 || ca->delay_min > delay)
269 ca->delay_min = delay;
270}
271
272static void bictcp_cong_avoid(struct sock *sk, u32 ack,
273 u32 seq_rtt, u32 in_flight, int data_acked)
274{
275 struct tcp_sock *tp = tcp_sk(sk);
276 struct bictcp *ca = inet_csk_ca(sk);
277
278 if (data_acked)
279 measure_delay(sk);
280
281 if (!tcp_is_cwnd_limited(sk, in_flight))
282 return;
283
284 if (tp->snd_cwnd <= tp->snd_ssthresh)
285 tcp_slow_start(tp);
286 else {
287 bictcp_update(ca, tp->snd_cwnd);
288
289 /* In dangerous area, increase slowly.
290 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
291 */
292 if (tp->snd_cwnd_cnt >= ca->cnt) {
293 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
294 tp->snd_cwnd++;
295 tp->snd_cwnd_cnt = 0;
296 } else
297 tp->snd_cwnd_cnt++;
298 }
299
300}
301
302static u32 bictcp_recalc_ssthresh(struct sock *sk)
303{
304 const struct tcp_sock *tp = tcp_sk(sk);
305 struct bictcp *ca = inet_csk_ca(sk);
306
307 ca->epoch_start = 0; /* end of epoch */
308
309 /* Wmax and fast convergence */
310 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
311 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
312 / (2 * BICTCP_BETA_SCALE);
313 else
314 ca->last_max_cwnd = tp->snd_cwnd;
315
316 ca->loss_cwnd = tp->snd_cwnd;
317
318 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
319}
320
321static u32 bictcp_undo_cwnd(struct sock *sk)
322{
323 struct bictcp *ca = inet_csk_ca(sk);
324
325 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
326}
327
328static u32 bictcp_min_cwnd(struct sock *sk)
329{
330 return tcp_sk(sk)->snd_ssthresh;
331}
332
333static void bictcp_state(struct sock *sk, u8 new_state)
334{
335 if (new_state == TCP_CA_Loss)
336 bictcp_reset(inet_csk_ca(sk));
337}
338
339/* Track delayed acknowledgment ratio using sliding window
340 * ratio = (15*ratio + sample) / 16
341 */
342static void bictcp_acked(struct sock *sk, u32 cnt)
343{
344 const struct inet_connection_sock *icsk = inet_csk(sk);
345
346 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
347 struct bictcp *ca = inet_csk_ca(sk);
348 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
349 ca->delayed_ack += cnt;
350 }
351}
352
353
354static struct tcp_congestion_ops cubictcp = {
355 .init = bictcp_init,
356 .ssthresh = bictcp_recalc_ssthresh,
357 .cong_avoid = bictcp_cong_avoid,
358 .set_state = bictcp_state,
359 .undo_cwnd = bictcp_undo_cwnd,
360 .min_cwnd = bictcp_min_cwnd,
361 .pkts_acked = bictcp_acked,
362 .owner = THIS_MODULE,
363 .name = "cubic",
364};
365
366static int __init cubictcp_register(void)
367{
368 BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
369
370 /* Precompute a bunch of the scaling factors that are used per-packet
371 * based on SRTT of 100ms
372 */
373
374 beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
375
376 cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */
377
378 /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
379 * so K = cubic_root( (wmax-cwnd)*rtt/c )
380 * the unit of K is bictcp_HZ=2^10, not HZ
381 *
382 * c = bic_scale >> 10
383 * rtt = 100ms
384 *
385 * the following code has been designed and tested for
386 * cwnd < 1 million packets
387 * RTT < 100 seconds
388 * HZ < 1,000,00 (corresponding to 10 nano-second)
389 */
390
391 /* 1/c * 2^2*bictcp_HZ * srtt */
392 cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
393
394 /* divide by bic_scale and by constant Srtt (100ms) */
395 do_div(cube_factor, bic_scale * 10);
396
397 return tcp_register_congestion_control(&cubictcp);
398}
399
400static void __exit cubictcp_unregister(void)
401{
402 tcp_unregister_congestion_control(&cubictcp);
403}
404
405module_init(cubictcp_register);
406module_exit(cubictcp_unregister);
407
408MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
409MODULE_LICENSE("GPL");
410MODULE_DESCRIPTION("CUBIC TCP");
411MODULE_VERSION("2.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e23086bce..0a461232329f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1;
115/* Adapt the MSS value used to make delayed ack decision to the 115/* Adapt the MSS value used to make delayed ack decision to the
116 * real world. 116 * real world.
117 */ 117 */
118static inline void tcp_measure_rcv_mss(struct sock *sk, 118static void tcp_measure_rcv_mss(struct sock *sk,
119 const struct sk_buff *skb) 119 const struct sk_buff *skb)
120{ 120{
121 struct inet_connection_sock *icsk = inet_csk(sk); 121 struct inet_connection_sock *icsk = inet_csk(sk);
122 const unsigned int lss = icsk->icsk_ack.last_seg_size; 122 const unsigned int lss = icsk->icsk_ack.last_seg_size;
@@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
246 return 0; 246 return 0;
247} 247}
248 248
249static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, 249static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
250 struct sk_buff *skb) 250 struct sk_buff *skb)
251{ 251{
252 /* Check #1 */ 252 /* Check #1 */
253 if (tp->rcv_ssthresh < tp->window_clamp && 253 if (tp->rcv_ssthresh < tp->window_clamp &&
@@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
341 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); 341 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
342} 342}
343 343
344
345/* Initialize RCV_MSS value.
346 * RCV_MSS is an our guess about MSS used by the peer.
347 * We haven't any direct information about the MSS.
348 * It's better to underestimate the RCV_MSS rather than overestimate.
349 * Overestimations make us ACKing less frequently than needed.
350 * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
351 */
352void tcp_initialize_rcv_mss(struct sock *sk)
353{
354 struct tcp_sock *tp = tcp_sk(sk);
355 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
356
357 hint = min(hint, tp->rcv_wnd/2);
358 hint = min(hint, TCP_MIN_RCVMSS);
359 hint = max(hint, TCP_MIN_MSS);
360
361 inet_csk(sk)->icsk_ack.rcv_mss = hint;
362}
363
344/* Receiver "autotuning" code. 364/* Receiver "autotuning" code.
345 * 365 *
346 * The algorithm for RTT estimation w/o timestamps is based on 366 * The algorithm for RTT estimation w/o timestamps is based on
@@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
735 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 755 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
736} 756}
737 757
758/* Set slow start threshold and cwnd not falling to slow start */
759void tcp_enter_cwr(struct sock *sk)
760{
761 struct tcp_sock *tp = tcp_sk(sk);
762
763 tp->prior_ssthresh = 0;
764 tp->bytes_acked = 0;
765 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
766 tp->undo_marker = 0;
767 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
768 tp->snd_cwnd = min(tp->snd_cwnd,
769 tcp_packets_in_flight(tp) + 1U);
770 tp->snd_cwnd_cnt = 0;
771 tp->high_seq = tp->snd_nxt;
772 tp->snd_cwnd_stamp = tcp_time_stamp;
773 TCP_ECN_queue_cwr(tp);
774
775 tcp_set_ca_state(sk, TCP_CA_CWR);
776 }
777}
778
738/* Initialize metrics on socket. */ 779/* Initialize metrics on socket. */
739 780
740static void tcp_init_metrics(struct sock *sk) 781static void tcp_init_metrics(struct sock *sk)
@@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
2070 tcp_ack_no_tstamp(sk, seq_rtt, flag); 2111 tcp_ack_no_tstamp(sk, seq_rtt, flag);
2071} 2112}
2072 2113
2073static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, 2114static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
2074 u32 in_flight, int good) 2115 u32 in_flight, int good)
2075{ 2116{
2076 const struct inet_connection_sock *icsk = inet_csk(sk); 2117 const struct inet_connection_sock *icsk = inet_csk(sk);
2077 icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); 2118 icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
@@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
2082 * RFC2988 recommends to restart timer to now+rto. 2123 * RFC2988 recommends to restart timer to now+rto.
2083 */ 2124 */
2084 2125
2085static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) 2126static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2086{ 2127{
2087 if (!tp->packets_out) { 2128 if (!tp->packets_out) {
2088 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 2129 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
@@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2147 return acked; 2188 return acked;
2148} 2189}
2149 2190
2150static inline u32 tcp_usrtt(const struct sk_buff *skb) 2191static u32 tcp_usrtt(const struct sk_buff *skb)
2151{ 2192{
2152 struct timeval tv, now; 2193 struct timeval tv, now;
2153 2194
@@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
2342 2383
2343 if (nwin > tp->max_window) { 2384 if (nwin > tp->max_window) {
2344 tp->max_window = nwin; 2385 tp->max_window = nwin;
2345 tcp_sync_mss(sk, tp->pmtu_cookie); 2386 tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
2346 } 2387 }
2347 } 2388 }
2348 } 2389 }
@@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2583/* Fast parse options. This hopes to only see timestamps. 2624/* Fast parse options. This hopes to only see timestamps.
2584 * If it is wrong it falls back on tcp_parse_options(). 2625 * If it is wrong it falls back on tcp_parse_options().
2585 */ 2626 */
2586static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, 2627static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
2587 struct tcp_sock *tp) 2628 struct tcp_sock *tp)
2588{ 2629{
2589 if (th->doff == sizeof(struct tcphdr)>>2) { 2630 if (th->doff == sizeof(struct tcphdr)>>2) {
2590 tp->rx_opt.saw_tstamp = 0; 2631 tp->rx_opt.saw_tstamp = 0;
@@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
2804 } 2845 }
2805} 2846}
2806 2847
2807static __inline__ int 2848static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2808tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2809{ 2849{
2810 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { 2850 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
2811 if (before(seq, sp->start_seq)) 2851 if (before(seq, sp->start_seq))
@@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2817 return 0; 2857 return 0;
2818} 2858}
2819 2859
2820static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) 2860static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
2821{ 2861{
2822 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { 2862 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
2823 if (before(seq, tp->rcv_nxt)) 2863 if (before(seq, tp->rcv_nxt))
@@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
2832 } 2872 }
2833} 2873}
2834 2874
2835static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) 2875static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
2836{ 2876{
2837 if (!tp->rx_opt.dsack) 2877 if (!tp->rx_opt.dsack)
2838 tcp_dsack_set(tp, seq, end_seq); 2878 tcp_dsack_set(tp, seq, end_seq);
@@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
2890 } 2930 }
2891} 2931}
2892 2932
2893static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) 2933static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
2894{ 2934{
2895 __u32 tmp; 2935 __u32 tmp;
2896 2936
@@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
3455 tp->snd_cwnd_stamp = tcp_time_stamp; 3495 tp->snd_cwnd_stamp = tcp_time_stamp;
3456} 3496}
3457 3497
3458static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) 3498static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3459{ 3499{
3460 /* If the user specified a specific send buffer setting, do 3500 /* If the user specified a specific send buffer setting, do
3461 * not modify it. 3501 * not modify it.
@@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk)
3502 sk->sk_write_space(sk); 3542 sk->sk_write_space(sk);
3503} 3543}
3504 3544
3505static inline void tcp_check_space(struct sock *sk) 3545static void tcp_check_space(struct sock *sk)
3506{ 3546{
3507 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { 3547 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3508 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); 3548 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
@@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk)
3512 } 3552 }
3513} 3553}
3514 3554
3515static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) 3555static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3516{ 3556{
3517 tcp_push_pending_frames(sk, tp); 3557 tcp_push_pending_frames(sk, tp);
3518 tcp_check_space(sk); 3558 tcp_check_space(sk);
@@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3544 } 3584 }
3545} 3585}
3546 3586
3547static __inline__ void tcp_ack_snd_check(struct sock *sk) 3587static inline void tcp_ack_snd_check(struct sock *sk)
3548{ 3588{
3549 if (!inet_csk_ack_scheduled(sk)) { 3589 if (!inet_csk_ack_scheduled(sk)) {
3550 /* We sent a data segment already. */ 3590 /* We sent a data segment already. */
@@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3692 return result; 3732 return result;
3693} 3733}
3694 3734
3695static __inline__ int 3735static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3696tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3697{ 3736{
3698 return skb->ip_summed != CHECKSUM_UNNECESSARY && 3737 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
3699 __tcp_checksum_complete_user(sk, skb); 3738 __tcp_checksum_complete_user(sk, skb);
@@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3967 struct tcphdr *th, unsigned len) 4006 struct tcphdr *th, unsigned len)
3968{ 4007{
3969 struct tcp_sock *tp = tcp_sk(sk); 4008 struct tcp_sock *tp = tcp_sk(sk);
4009 struct inet_connection_sock *icsk = inet_csk(sk);
3970 int saved_clamp = tp->rx_opt.mss_clamp; 4010 int saved_clamp = tp->rx_opt.mss_clamp;
3971 4011
3972 tcp_parse_options(skb, &tp->rx_opt, 0); 4012 tcp_parse_options(skb, &tp->rx_opt, 0);
3973 4013
3974 if (th->ack) { 4014 if (th->ack) {
3975 struct inet_connection_sock *icsk;
3976 /* rfc793: 4015 /* rfc793:
3977 * "If the state is SYN-SENT then 4016 * "If the state is SYN-SENT then
3978 * first check the ACK bit 4017 * first check the ACK bit
@@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4061 if (tp->rx_opt.sack_ok && sysctl_tcp_fack) 4100 if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
4062 tp->rx_opt.sack_ok |= 2; 4101 tp->rx_opt.sack_ok |= 2;
4063 4102
4064 tcp_sync_mss(sk, tp->pmtu_cookie); 4103 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
4065 tcp_initialize_rcv_mss(sk); 4104 tcp_initialize_rcv_mss(sk);
4066 4105
4067 /* Remember, tcp_poll() does not lock socket! 4106 /* Remember, tcp_poll() does not lock socket!
@@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4072 tcp_set_state(sk, TCP_ESTABLISHED); 4111 tcp_set_state(sk, TCP_ESTABLISHED);
4073 4112
4074 /* Make sure socket is routed, for correct metrics. */ 4113 /* Make sure socket is routed, for correct metrics. */
4075 tp->af_specific->rebuild_header(sk); 4114 icsk->icsk_af_ops->rebuild_header(sk);
4076 4115
4077 tcp_init_metrics(sk); 4116 tcp_init_metrics(sk);
4078 4117
@@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4098 sk_wake_async(sk, 0, POLL_OUT); 4137 sk_wake_async(sk, 0, POLL_OUT);
4099 } 4138 }
4100 4139
4101 icsk = inet_csk(sk);
4102
4103 if (sk->sk_write_pending || 4140 if (sk->sk_write_pending ||
4104 icsk->icsk_accept_queue.rskq_defer_accept || 4141 icsk->icsk_accept_queue.rskq_defer_accept ||
4105 icsk->icsk_ack.pingpong) { 4142 icsk->icsk_ack.pingpong) {
@@ -4173,7 +4210,7 @@ discard:
4173 if (tp->ecn_flags&TCP_ECN_OK) 4210 if (tp->ecn_flags&TCP_ECN_OK)
4174 sock_set_flag(sk, SOCK_NO_LARGESEND); 4211 sock_set_flag(sk, SOCK_NO_LARGESEND);
4175 4212
4176 tcp_sync_mss(sk, tp->pmtu_cookie); 4213 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
4177 tcp_initialize_rcv_mss(sk); 4214 tcp_initialize_rcv_mss(sk);
4178 4215
4179 4216
@@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4220 struct tcphdr *th, unsigned len) 4257 struct tcphdr *th, unsigned len)
4221{ 4258{
4222 struct tcp_sock *tp = tcp_sk(sk); 4259 struct tcp_sock *tp = tcp_sk(sk);
4260 struct inet_connection_sock *icsk = inet_csk(sk);
4223 int queued = 0; 4261 int queued = 0;
4224 4262
4225 tp->rx_opt.saw_tstamp = 0; 4263 tp->rx_opt.saw_tstamp = 0;
@@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4236 goto discard; 4274 goto discard;
4237 4275
4238 if(th->syn) { 4276 if(th->syn) {
4239 if(tp->af_specific->conn_request(sk, skb) < 0) 4277 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
4240 return 1; 4278 return 1;
4241 4279
4242 /* Now we have several options: In theory there is 4280 /* Now we have several options: In theory there is
@@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4349 /* Make sure socket is routed, for 4387 /* Make sure socket is routed, for
4350 * correct metrics. 4388 * correct metrics.
4351 */ 4389 */
4352 tp->af_specific->rebuild_header(sk); 4390 icsk->icsk_af_ops->rebuild_header(sk);
4353 4391
4354 tcp_init_metrics(sk); 4392 tcp_init_metrics(sk);
4355 4393
@@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc);
4475EXPORT_SYMBOL(tcp_parse_options); 4513EXPORT_SYMBOL(tcp_parse_options);
4476EXPORT_SYMBOL(tcp_rcv_established); 4514EXPORT_SYMBOL(tcp_rcv_established);
4477EXPORT_SYMBOL(tcp_rcv_state_process); 4515EXPORT_SYMBOL(tcp_rcv_state_process);
4516EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e1929b..e9f83e5b28ce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -69,6 +69,7 @@
69#include <net/transp_v6.h> 69#include <net/transp_v6.h>
70#include <net/ipv6.h> 70#include <net/ipv6.h>
71#include <net/inet_common.h> 71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
72#include <net/xfrm.h> 73#include <net/xfrm.h>
73 74
74#include <linux/inet.h> 75#include <linux/inet.h>
@@ -86,8 +87,7 @@ int sysctl_tcp_low_latency;
86/* Socket used for sending RSTs */ 87/* Socket used for sending RSTs */
87static struct socket *tcp_socket; 88static struct socket *tcp_socket;
88 89
89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 90void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
90 struct sk_buff *skb);
91 91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { 92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED, 93 .lhash_lock = RW_LOCK_UNLOCKED,
@@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
97 97
98static int tcp_v4_get_port(struct sock *sk, unsigned short snum) 98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99{ 99{
100 return inet_csk_get_port(&tcp_hashinfo, sk, snum); 100 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101 inet_csk_bind_conflict);
101} 102}
102 103
103static void tcp_v4_hash(struct sock *sk) 104static void tcp_v4_hash(struct sock *sk)
@@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
118 skb->h.th->source); 119 skb->h.th->source);
119} 120}
120 121
121/* called with local bh disabled */ 122int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
122static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
123 struct inet_timewait_sock **twp)
124{ 123{
125 struct inet_sock *inet = inet_sk(sk); 124 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
126 u32 daddr = inet->rcv_saddr; 125 struct tcp_sock *tp = tcp_sk(sk);
127 u32 saddr = inet->daddr;
128 int dif = sk->sk_bound_dev_if;
129 INET_ADDR_COOKIE(acookie, saddr, daddr)
130 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
131 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
132 struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
133 struct sock *sk2;
134 const struct hlist_node *node;
135 struct inet_timewait_sock *tw;
136
137 prefetch(head->chain.first);
138 write_lock(&head->lock);
139
140 /* Check TIME-WAIT sockets first. */
141 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
142 tw = inet_twsk(sk2);
143
144 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
145 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
146 struct tcp_sock *tp = tcp_sk(sk);
147
148 /* With PAWS, it is safe from the viewpoint
149 of data integrity. Even without PAWS it
150 is safe provided sequence spaces do not
151 overlap i.e. at data rates <= 80Mbit/sec.
152
153 Actually, the idea is close to VJ's one,
154 only timestamp cache is held not per host,
155 but per port pair and TW bucket is used
156 as state holder.
157 126
158 If TW bucket has been already destroyed we 127 /* With PAWS, it is safe from the viewpoint
159 fall back to VJ's scheme and use initial 128 of data integrity. Even without PAWS it is safe provided sequence
160 timestamp retrieved from peer table. 129 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
161 */
162 if (tcptw->tw_ts_recent_stamp &&
163 (!twp || (sysctl_tcp_tw_reuse &&
164 xtime.tv_sec -
165 tcptw->tw_ts_recent_stamp > 1))) {
166 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
167 if (tp->write_seq == 0)
168 tp->write_seq = 1;
169 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
170 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
171 sock_hold(sk2);
172 goto unique;
173 } else
174 goto not_unique;
175 }
176 }
177 tw = NULL;
178 130
179 /* And established part... */ 131 Actually, the idea is close to VJ's one, only timestamp cache is
180 sk_for_each(sk2, node, &head->chain) { 132 held not per host, but per port pair and TW bucket is used as state
181 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) 133 holder.
182 goto not_unique;
183 }
184 134
185unique: 135 If TW bucket has been already destroyed we fall back to VJ's scheme
186 /* Must record num and sport now. Otherwise we will see 136 and use initial timestamp retrieved from peer table.
187 * in hash table socket with a funny identity. */ 137 */
188 inet->num = lport; 138 if (tcptw->tw_ts_recent_stamp &&
189 inet->sport = htons(lport); 139 (twp == NULL || (sysctl_tcp_tw_reuse &&
190 sk->sk_hash = hash; 140 xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
191 BUG_TRAP(sk_unhashed(sk)); 141 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
192 __sk_add_node(sk, &head->chain); 142 if (tp->write_seq == 0)
193 sock_prot_inc_use(sk->sk_prot); 143 tp->write_seq = 1;
194 write_unlock(&head->lock); 144 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
195 145 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
196 if (twp) { 146 sock_hold(sktw);
197 *twp = tw; 147 return 1;
198 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
199 } else if (tw) {
200 /* Silly. Should hash-dance instead... */
201 inet_twsk_deschedule(tw, &tcp_death_row);
202 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
203
204 inet_twsk_put(tw);
205 } 148 }
206 149
207 return 0; 150 return 0;
208
209not_unique:
210 write_unlock(&head->lock);
211 return -EADDRNOTAVAIL;
212} 151}
213 152
214static inline u32 connect_port_offset(const struct sock *sk) 153EXPORT_SYMBOL_GPL(tcp_twsk_unique);
215{
216 const struct inet_sock *inet = inet_sk(sk);
217
218 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
219 inet->dport);
220}
221
222/*
223 * Bind a port for a connect operation and hash it.
224 */
225static inline int tcp_v4_hash_connect(struct sock *sk)
226{
227 const unsigned short snum = inet_sk(sk)->num;
228 struct inet_bind_hashbucket *head;
229 struct inet_bind_bucket *tb;
230 int ret;
231
232 if (!snum) {
233 int low = sysctl_local_port_range[0];
234 int high = sysctl_local_port_range[1];
235 int range = high - low;
236 int i;
237 int port;
238 static u32 hint;
239 u32 offset = hint + connect_port_offset(sk);
240 struct hlist_node *node;
241 struct inet_timewait_sock *tw = NULL;
242
243 local_bh_disable();
244 for (i = 1; i <= range; i++) {
245 port = low + (i + offset) % range;
246 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
247 spin_lock(&head->lock);
248
249 /* Does not bother with rcv_saddr checks,
250 * because the established check is already
251 * unique enough.
252 */
253 inet_bind_bucket_for_each(tb, node, &head->chain) {
254 if (tb->port == port) {
255 BUG_TRAP(!hlist_empty(&tb->owners));
256 if (tb->fastreuse >= 0)
257 goto next_port;
258 if (!__tcp_v4_check_established(sk,
259 port,
260 &tw))
261 goto ok;
262 goto next_port;
263 }
264 }
265
266 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
267 if (!tb) {
268 spin_unlock(&head->lock);
269 break;
270 }
271 tb->fastreuse = -1;
272 goto ok;
273
274 next_port:
275 spin_unlock(&head->lock);
276 }
277 local_bh_enable();
278
279 return -EADDRNOTAVAIL;
280
281ok:
282 hint += i;
283
284 /* Head lock still held and bh's disabled */
285 inet_bind_hash(sk, tb, port);
286 if (sk_unhashed(sk)) {
287 inet_sk(sk)->sport = htons(port);
288 __inet_hash(&tcp_hashinfo, sk, 0);
289 }
290 spin_unlock(&head->lock);
291
292 if (tw) {
293 inet_twsk_deschedule(tw, &tcp_death_row);;
294 inet_twsk_put(tw);
295 }
296
297 ret = 0;
298 goto out;
299 }
300
301 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
302 tb = inet_csk(sk)->icsk_bind_hash;
303 spin_lock_bh(&head->lock);
304 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
305 __inet_hash(&tcp_hashinfo, sk, 0);
306 spin_unlock_bh(&head->lock);
307 return 0;
308 } else {
309 spin_unlock(&head->lock);
310 /* No definite answer... Walk to established hash table */
311 ret = __tcp_v4_check_established(sk, snum, NULL);
312out:
313 local_bh_enable();
314 return ret;
315 }
316}
317 154
318/* This will initiate an outgoing connection. */ 155/* This will initiate an outgoing connection. */
319int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 156int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
@@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
383 inet->dport = usin->sin_port; 220 inet->dport = usin->sin_port;
384 inet->daddr = daddr; 221 inet->daddr = daddr;
385 222
386 tp->ext_header_len = 0; 223 inet_csk(sk)->icsk_ext_hdr_len = 0;
387 if (inet->opt) 224 if (inet->opt)
388 tp->ext_header_len = inet->opt->optlen; 225 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
389 226
390 tp->rx_opt.mss_clamp = 536; 227 tp->rx_opt.mss_clamp = 536;
391 228
@@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
395 * complete initialization after this. 232 * complete initialization after this.
396 */ 233 */
397 tcp_set_state(sk, TCP_SYN_SENT); 234 tcp_set_state(sk, TCP_SYN_SENT);
398 err = tcp_v4_hash_connect(sk); 235 err = inet_hash_connect(&tcp_death_row, sk);
399 if (err) 236 if (err)
400 goto failure; 237 goto failure;
401 238
@@ -433,12 +270,10 @@ failure:
433/* 270/*
434 * This routine does path mtu discovery as defined in RFC1191. 271 * This routine does path mtu discovery as defined in RFC1191.
435 */ 272 */
436static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, 273static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
437 u32 mtu)
438{ 274{
439 struct dst_entry *dst; 275 struct dst_entry *dst;
440 struct inet_sock *inet = inet_sk(sk); 276 struct inet_sock *inet = inet_sk(sk);
441 struct tcp_sock *tp = tcp_sk(sk);
442 277
443 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs 278 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
444 * sent out by Linux are always <576bytes so they should go through 279 * sent out by Linux are always <576bytes so they should go through
@@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
467 mtu = dst_mtu(dst); 302 mtu = dst_mtu(dst);
468 303
469 if (inet->pmtudisc != IP_PMTUDISC_DONT && 304 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
470 tp->pmtu_cookie > mtu) { 305 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
471 tcp_sync_mss(sk, mtu); 306 tcp_sync_mss(sk, mtu);
472 307
473 /* Resend the TCP packet because it's 308 /* Resend the TCP packet because it's
@@ -644,10 +479,10 @@ out:
644} 479}
645 480
646/* This routine computes an IPv4 TCP checksum. */ 481/* This routine computes an IPv4 TCP checksum. */
647void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 482void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
648 struct sk_buff *skb)
649{ 483{
650 struct inet_sock *inet = inet_sk(sk); 484 struct inet_sock *inet = inet_sk(sk);
485 struct tcphdr *th = skb->h.th;
651 486
652 if (skb->ip_summed == CHECKSUM_HW) { 487 if (skb->ip_summed == CHECKSUM_HW) {
653 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); 488 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
@@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
826 kfree(inet_rsk(req)->opt); 661 kfree(inet_rsk(req)->opt);
827} 662}
828 663
829static inline void syn_flood_warning(struct sk_buff *skb) 664#ifdef CONFIG_SYN_COOKIES
665static void syn_flood_warning(struct sk_buff *skb)
830{ 666{
831 static unsigned long warntime; 667 static unsigned long warntime;
832 668
@@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb)
837 ntohs(skb->h.th->dest)); 673 ntohs(skb->h.th->dest));
838 } 674 }
839} 675}
676#endif
840 677
841/* 678/*
842 * Save and compile IPv4 options into the request_sock if needed. 679 * Save and compile IPv4 options into the request_sock if needed.
843 */ 680 */
844static inline struct ip_options *tcp_v4_save_options(struct sock *sk, 681static struct ip_options *tcp_v4_save_options(struct sock *sk,
845 struct sk_buff *skb) 682 struct sk_buff *skb)
846{ 683{
847 struct ip_options *opt = &(IPCB(skb)->opt); 684 struct ip_options *opt = &(IPCB(skb)->opt);
848 struct ip_options *dopt = NULL; 685 struct ip_options *dopt = NULL;
@@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = {
869 .send_reset = tcp_v4_send_reset, 706 .send_reset = tcp_v4_send_reset,
870}; 707};
871 708
709static struct timewait_sock_ops tcp_timewait_sock_ops = {
710 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
711 .twsk_unique = tcp_twsk_unique,
712};
713
872int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 714int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
873{ 715{
874 struct inet_request_sock *ireq; 716 struct inet_request_sock *ireq;
@@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1053 ireq->opt = NULL; 895 ireq->opt = NULL;
1054 newinet->mc_index = inet_iif(skb); 896 newinet->mc_index = inet_iif(skb);
1055 newinet->mc_ttl = skb->nh.iph->ttl; 897 newinet->mc_ttl = skb->nh.iph->ttl;
1056 newtp->ext_header_len = 0; 898 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1057 if (newinet->opt) 899 if (newinet->opt)
1058 newtp->ext_header_len = newinet->opt->optlen; 900 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1059 newinet->id = newtp->write_seq ^ jiffies; 901 newinet->id = newtp->write_seq ^ jiffies;
1060 902
1061 tcp_sync_mss(newsk, dst_mtu(dst)); 903 tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1314,16 +1156,6 @@ do_time_wait:
1314 goto discard_it; 1156 goto discard_it;
1315} 1157}
1316 1158
1317static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1318{
1319 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1320 struct inet_sock *inet = inet_sk(sk);
1321
1322 sin->sin_family = AF_INET;
1323 sin->sin_addr.s_addr = inet->daddr;
1324 sin->sin_port = inet->dport;
1325}
1326
1327/* VJ's idea. Save last timestamp seen from this destination 1159/* VJ's idea. Save last timestamp seen from this destination
1328 * and hold it at least for normal timewait interval to use for duplicate 1160 * and hold it at least for normal timewait interval to use for duplicate
1329 * segment detection in subsequent connections, before they enter synchronized 1161 * segment detection in subsequent connections, before they enter synchronized
@@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1382 return 0; 1214 return 0;
1383} 1215}
1384 1216
1385struct tcp_func ipv4_specific = { 1217struct inet_connection_sock_af_ops ipv4_specific = {
1386 .queue_xmit = ip_queue_xmit, 1218 .queue_xmit = ip_queue_xmit,
1387 .send_check = tcp_v4_send_check, 1219 .send_check = tcp_v4_send_check,
1388 .rebuild_header = inet_sk_rebuild_header, 1220 .rebuild_header = inet_sk_rebuild_header,
@@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = {
1392 .net_header_len = sizeof(struct iphdr), 1224 .net_header_len = sizeof(struct iphdr),
1393 .setsockopt = ip_setsockopt, 1225 .setsockopt = ip_setsockopt,
1394 .getsockopt = ip_getsockopt, 1226 .getsockopt = ip_getsockopt,
1395 .addr2sockaddr = v4_addr2sockaddr, 1227 .addr2sockaddr = inet_csk_addr2sockaddr,
1396 .sockaddr_len = sizeof(struct sockaddr_in), 1228 .sockaddr_len = sizeof(struct sockaddr_in),
1397}; 1229};
1398 1230
@@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk)
1433 sk->sk_write_space = sk_stream_write_space; 1265 sk->sk_write_space = sk_stream_write_space;
1434 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 1266 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1435 1267
1436 tp->af_specific = &ipv4_specific; 1268 icsk->icsk_af_ops = &ipv4_specific;
1269 icsk->icsk_sync_mss = tcp_sync_mss;
1437 1270
1438 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 1271 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1439 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1272 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
@@ -1989,7 +1822,7 @@ struct proto tcp_prot = {
1989 .sysctl_rmem = sysctl_tcp_rmem, 1822 .sysctl_rmem = sysctl_tcp_rmem,
1990 .max_header = MAX_TCP_HEADER, 1823 .max_header = MAX_TCP_HEADER,
1991 .obj_size = sizeof(struct tcp_sock), 1824 .obj_size = sizeof(struct tcp_sock),
1992 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1825 .twsk_prot = &tcp_timewait_sock_ops,
1993 .rsk_prot = &tcp_request_sock_ops, 1826 .rsk_prot = &tcp_request_sock_ops,
1994}; 1827};
1995 1828
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1b66a2ac4321..2b9b7f6c7f7c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,18 +274,18 @@ kill:
274void tcp_time_wait(struct sock *sk, int state, int timeo) 274void tcp_time_wait(struct sock *sk, int state, int timeo)
275{ 275{
276 struct inet_timewait_sock *tw = NULL; 276 struct inet_timewait_sock *tw = NULL;
277 const struct inet_connection_sock *icsk = inet_csk(sk);
277 const struct tcp_sock *tp = tcp_sk(sk); 278 const struct tcp_sock *tp = tcp_sk(sk);
278 int recycle_ok = 0; 279 int recycle_ok = 0;
279 280
280 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 281 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
281 recycle_ok = tp->af_specific->remember_stamp(sk); 282 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
282 283
283 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 284 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
284 tw = inet_twsk_alloc(sk, state); 285 tw = inet_twsk_alloc(sk, state);
285 286
286 if (tw != NULL) { 287 if (tw != NULL) {
287 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 288 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
288 const struct inet_connection_sock *icsk = inet_csk(sk);
289 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 289 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
290 290
291 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 291 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
298#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 298#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
299 if (tw->tw_family == PF_INET6) { 299 if (tw->tw_family == PF_INET6) {
300 struct ipv6_pinfo *np = inet6_sk(sk); 300 struct ipv6_pinfo *np = inet6_sk(sk);
301 struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); 301 struct inet6_timewait_sock *tw6;
302 302
303 ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); 303 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
304 ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); 304 tw6 = inet6_twsk((struct sock *)tw);
305 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
306 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
305 tw->tw_ipv6only = np->ipv6only; 307 tw->tw_ipv6only = np->ipv6only;
306 } 308 }
307#endif 309#endif
@@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
456 struct request_sock **prev) 458 struct request_sock **prev)
457{ 459{
458 struct tcphdr *th = skb->h.th; 460 struct tcphdr *th = skb->h.th;
459 struct tcp_sock *tp = tcp_sk(sk);
460 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 461 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
461 int paws_reject = 0; 462 int paws_reject = 0;
462 struct tcp_options_received tmp_opt; 463 struct tcp_options_received tmp_opt;
@@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
613 * ESTABLISHED STATE. If it will be dropped after 614 * ESTABLISHED STATE. If it will be dropped after
614 * socket is created, wait for troubles. 615 * socket is created, wait for troubles.
615 */ 616 */
616 child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); 617 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
618 req, NULL);
617 if (child == NULL) 619 if (child == NULL)
618 goto listen_overflow; 620 goto listen_overflow;
619 621
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b7325e0b406a..a7623ead39a8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1;
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 3; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
56{ 56{
57 sk->sk_send_head = skb->next; 57 sk->sk_send_head = skb->next;
58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) 58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
@@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
124 tp->snd_cwnd_used = 0; 124 tp->snd_cwnd_used = 0;
125} 125}
126 126
127static inline void tcp_event_data_sent(struct tcp_sock *tp, 127static void tcp_event_data_sent(struct tcp_sock *tp,
128 struct sk_buff *skb, struct sock *sk) 128 struct sk_buff *skb, struct sock *sk)
129{ 129{
130 struct inet_connection_sock *icsk = inet_csk(sk); 130 struct inet_connection_sock *icsk = inet_csk(sk);
131 const u32 now = tcp_time_stamp; 131 const u32 now = tcp_time_stamp;
@@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
142 icsk->icsk_ack.pingpong = 1; 142 icsk->icsk_ack.pingpong = 1;
143} 143}
144 144
145static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 145static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
146{ 146{
147 tcp_dec_quickack_mode(sk, pkts); 147 tcp_dec_quickack_mode(sk, pkts);
148 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 148 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
@@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
212 * value can be stuffed directly into th->window for an outgoing 212 * value can be stuffed directly into th->window for an outgoing
213 * frame. 213 * frame.
214 */ 214 */
215static __inline__ u16 tcp_select_window(struct sock *sk) 215static u16 tcp_select_window(struct sock *sk)
216{ 216{
217 struct tcp_sock *tp = tcp_sk(sk); 217 struct tcp_sock *tp = tcp_sk(sk);
218 u32 cur_win = tcp_receive_window(tp); 218 u32 cur_win = tcp_receive_window(tp);
@@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
250 return new_win; 250 return new_win;
251} 251}
252 252
253static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp,
254 __u32 tstamp)
255{
256 if (tp->rx_opt.tstamp_ok) {
257 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
258 (TCPOPT_NOP << 16) |
259 (TCPOPT_TIMESTAMP << 8) |
260 TCPOLEN_TIMESTAMP);
261 *ptr++ = htonl(tstamp);
262 *ptr++ = htonl(tp->rx_opt.ts_recent);
263 }
264 if (tp->rx_opt.eff_sacks) {
265 struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
266 int this_sack;
267
268 *ptr++ = htonl((TCPOPT_NOP << 24) |
269 (TCPOPT_NOP << 16) |
270 (TCPOPT_SACK << 8) |
271 (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
272 TCPOLEN_SACK_PERBLOCK)));
273 for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
274 *ptr++ = htonl(sp[this_sack].start_seq);
275 *ptr++ = htonl(sp[this_sack].end_seq);
276 }
277 if (tp->rx_opt.dsack) {
278 tp->rx_opt.dsack = 0;
279 tp->rx_opt.eff_sacks--;
280 }
281 }
282}
283
284/* Construct a tcp options header for a SYN or SYN_ACK packet.
285 * If this is ever changed make sure to change the definition of
286 * MAX_SYN_SIZE to match the new maximum number of options that you
287 * can generate.
288 */
289static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
290 int offer_wscale, int wscale, __u32 tstamp,
291 __u32 ts_recent)
292{
293 /* We always get an MSS option.
294 * The option bytes which will be seen in normal data
295 * packets should timestamps be used, must be in the MSS
296 * advertised. But we subtract them from tp->mss_cache so
297 * that calculations in tcp_sendmsg are simpler etc.
298 * So account for this fact here if necessary. If we
299 * don't do this correctly, as a receiver we won't
300 * recognize data packets as being full sized when we
301 * should, and thus we won't abide by the delayed ACK
302 * rules correctly.
303 * SACKs don't matter, we never delay an ACK when we
304 * have any of those going out.
305 */
306 *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
307 if (ts) {
308 if(sack)
309 *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
310 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
311 else
312 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
313 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
314 *ptr++ = htonl(tstamp); /* TSVAL */
315 *ptr++ = htonl(ts_recent); /* TSECR */
316 } else if(sack)
317 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
318 (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
319 if (offer_wscale)
320 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
321}
253 322
254/* This routine actually transmits TCP packets queued in by 323/* This routine actually transmits TCP packets queued in by
255 * tcp_do_sendmsg(). This is used by both the initial 324 * tcp_do_sendmsg(). This is used by both the initial
@@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
371 TCP_ECN_send(sk, tp, skb, tcp_header_size); 440 TCP_ECN_send(sk, tp, skb, tcp_header_size);
372 } 441 }
373 442
374 tp->af_specific->send_check(sk, th, skb->len, skb); 443 icsk->icsk_af_ops->send_check(sk, skb->len, skb);
375 444
376 if (likely(tcb->flags & TCPCB_FLAG_ACK)) 445 if (likely(tcb->flags & TCPCB_FLAG_ACK))
377 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 446 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
@@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
381 450
382 TCP_INC_STATS(TCP_MIB_OUTSEGS); 451 TCP_INC_STATS(TCP_MIB_OUTSEGS);
383 452
384 err = tp->af_specific->queue_xmit(skb, 0); 453 err = icsk->icsk_af_ops->queue_xmit(skb, 0);
385 if (unlikely(err <= 0)) 454 if (unlikely(err <= 0))
386 return err; 455 return err;
387 456
@@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
621 It is minimum of user_mss and mss received with SYN. 690 It is minimum of user_mss and mss received with SYN.
622 It also does not include TCP options. 691 It also does not include TCP options.
623 692
624 tp->pmtu_cookie is last pmtu, seen by this function. 693 inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
625 694
626 tp->mss_cache is current effective sending mss, including 695 tp->mss_cache is current effective sending mss, including
627 all tcp options except for SACKs. It is evaluated, 696 all tcp options except for SACKs. It is evaluated,
@@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
631 NOTE1. rfc1122 clearly states that advertised MSS 700 NOTE1. rfc1122 clearly states that advertised MSS
632 DOES NOT include either tcp or ip options. 701 DOES NOT include either tcp or ip options.
633 702
634 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside 703 NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
635 this function. --ANK (980731) 704 are READ ONLY outside this function. --ANK (980731)
636 */ 705 */
637 706
638unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) 707unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
639{ 708{
640 struct tcp_sock *tp = tcp_sk(sk); 709 struct tcp_sock *tp = tcp_sk(sk);
641 int mss_now; 710 struct inet_connection_sock *icsk = inet_csk(sk);
642
643 /* Calculate base mss without TCP options: 711 /* Calculate base mss without TCP options:
644 It is MMS_S - sizeof(tcphdr) of rfc1122 712 It is MMS_S - sizeof(tcphdr) of rfc1122
645 */ 713 */
646 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); 714 int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
715 sizeof(struct tcphdr));
647 716
648 /* Clamp it (mss_clamp does not include tcp options) */ 717 /* Clamp it (mss_clamp does not include tcp options) */
649 if (mss_now > tp->rx_opt.mss_clamp) 718 if (mss_now > tp->rx_opt.mss_clamp)
650 mss_now = tp->rx_opt.mss_clamp; 719 mss_now = tp->rx_opt.mss_clamp;
651 720
652 /* Now subtract optional transport overhead */ 721 /* Now subtract optional transport overhead */
653 mss_now -= tp->ext_header_len; 722 mss_now -= icsk->icsk_ext_hdr_len;
654 723
655 /* Then reserve room for full set of TCP options and 8 bytes of data */ 724 /* Then reserve room for full set of TCP options and 8 bytes of data */
656 if (mss_now < 48) 725 if (mss_now < 48)
@@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
664 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); 733 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
665 734
666 /* And store cached results */ 735 /* And store cached results */
667 tp->pmtu_cookie = pmtu; 736 icsk->icsk_pmtu_cookie = pmtu;
668 tp->mss_cache = mss_now; 737 tp->mss_cache = mss_now;
669 738
670 return mss_now; 739 return mss_now;
@@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
694 763
695 if (dst) { 764 if (dst) {
696 u32 mtu = dst_mtu(dst); 765 u32 mtu = dst_mtu(dst);
697 if (mtu != tp->pmtu_cookie) 766 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
698 mss_now = tcp_sync_mss(sk, mtu); 767 mss_now = tcp_sync_mss(sk, mtu);
699 } 768 }
700 769
@@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
705 xmit_size_goal = mss_now; 774 xmit_size_goal = mss_now;
706 775
707 if (doing_tso) { 776 if (doing_tso) {
708 xmit_size_goal = 65535 - 777 xmit_size_goal = (65535 -
709 tp->af_specific->net_header_len - 778 inet_csk(sk)->icsk_af_ops->net_header_len -
710 tp->ext_header_len - tp->tcp_header_len; 779 inet_csk(sk)->icsk_ext_hdr_len -
780 tp->tcp_header_len);
711 781
712 if (tp->max_window && 782 if (tp->max_window &&
713 (xmit_size_goal > (tp->max_window >> 1))) 783 (xmit_size_goal > (tp->max_window >> 1)))
@@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
723 793
724/* Congestion window validation. (RFC2861) */ 794/* Congestion window validation. (RFC2861) */
725 795
726static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) 796static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
727{ 797{
728 __u32 packets_out = tp->packets_out; 798 __u32 packets_out = tp->packets_out;
729 799
@@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
772/* This must be invoked the first time we consider transmitting 842/* This must be invoked the first time we consider transmitting
773 * SKB onto the wire. 843 * SKB onto the wire.
774 */ 844 */
775static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) 845static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
776{ 846{
777 int tso_segs = tcp_skb_pcount(skb); 847 int tso_segs = tcp_skb_pcount(skb);
778 848
@@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1422 (sysctl_tcp_retrans_collapse != 0)) 1492 (sysctl_tcp_retrans_collapse != 0))
1423 tcp_retrans_try_collapse(sk, skb, cur_mss); 1493 tcp_retrans_try_collapse(sk, skb, cur_mss);
1424 1494
1425 if(tp->af_specific->rebuild_header(sk)) 1495 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
1426 return -EHOSTUNREACH; /* Routing failure or similar. */ 1496 return -EHOSTUNREACH; /* Routing failure or similar. */
1427 1497
1428 /* Some Solaris stacks overoptimize and ignore the FIN on a 1498 /* Some Solaris stacks overoptimize and ignore the FIN on a
@@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1793/* 1863/*
1794 * Do all connect socket setups that can be done AF independent. 1864 * Do all connect socket setups that can be done AF independent.
1795 */ 1865 */
1796static inline void tcp_connect_init(struct sock *sk) 1866static void tcp_connect_init(struct sock *sk)
1797{ 1867{
1798 struct dst_entry *dst = __sk_dst_get(sk); 1868 struct dst_entry *dst = __sk_dst_get(sk);
1799 struct tcp_sock *tp = tcp_sk(sk); 1869 struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13e7e6e8df16..3b7403495052 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
330 vegas->cntRTT = 0; 330 vegas->cntRTT = 0;
331 vegas->minRTT = 0x7fffffff; 331 vegas->minRTT = 0x7fffffff;
332 } 332 }
333 /* Use normal slow start */
334 else if (tp->snd_cwnd <= tp->snd_ssthresh)
335 tcp_slow_start(tp);
336
333} 337}
334 338
335/* Extract info for Tcp socket info provided via netlink. */ 339/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2422a5f7195d..223abaa72bc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -86,6 +86,7 @@
86#include <linux/module.h> 86#include <linux/module.h>
87#include <linux/socket.h> 87#include <linux/socket.h>
88#include <linux/sockios.h> 88#include <linux/sockios.h>
89#include <linux/igmp.h>
89#include <linux/in.h> 90#include <linux/in.h>
90#include <linux/errno.h> 91#include <linux/errno.h>
91#include <linux/timer.h> 92#include <linux/timer.h>
@@ -846,20 +847,7 @@ out:
846csum_copy_err: 847csum_copy_err:
847 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 848 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
848 849
849 /* Clear queue. */ 850 skb_kill_datagram(sk, skb, flags);
850 if (flags&MSG_PEEK) {
851 int clear = 0;
852 spin_lock_bh(&sk->sk_receive_queue.lock);
853 if (skb == skb_peek(&sk->sk_receive_queue)) {
854 __skb_unlink(skb, &sk->sk_receive_queue);
855 clear = 1;
856 }
857 spin_unlock_bh(&sk->sk_receive_queue.lock);
858 if (clear)
859 kfree_skb(skb);
860 }
861
862 skb_free_datagram(sk, skb);
863 851
864 if (noblock) 852 if (noblock)
865 return -EAGAIN; 853 return -EAGAIN;
@@ -1094,7 +1082,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
1094 * Otherwise, csum completion requires checksumming packet body, 1082 * Otherwise, csum completion requires checksumming packet body,
1095 * including udp header and folding it to skb->csum. 1083 * including udp header and folding it to skb->csum.
1096 */ 1084 */
1097static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, 1085static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1098 unsigned short ulen, u32 saddr, u32 daddr) 1086 unsigned short ulen, u32 saddr, u32 daddr)
1099{ 1087{
1100 if (uh->check == 0) { 1088 if (uh->check == 0) {
@@ -1108,7 +1096,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1108 /* Probably, we should checksum udp header (it should be in cache 1096 /* Probably, we should checksum udp header (it should be in cache
1109 * in any case) and data in tiny packets (< rx copybreak). 1097 * in any case) and data in tiny packets (< rx copybreak).
1110 */ 1098 */
1111 return 0;
1112} 1099}
1113 1100
1114/* 1101/*
@@ -1141,8 +1128,7 @@ int udp_rcv(struct sk_buff *skb)
1141 if (pskb_trim_rcsum(skb, ulen)) 1128 if (pskb_trim_rcsum(skb, ulen))
1142 goto short_packet; 1129 goto short_packet;
1143 1130
1144 if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) 1131 udp_checksum_init(skb, uh, ulen, saddr, daddr);
1145 goto csum_error;
1146 1132
1147 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) 1133 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1148 return udp_v4_mcast_deliver(skb, uh, saddr, daddr); 1134 return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 6460eec834b7..9601fd7f9d66 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
8 route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ 8 route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
9 protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ 9 protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
10 exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ 10 exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
11 ip6_flowlabel.o ipv6_syms.o netfilter.o 11 ip6_flowlabel.o ipv6_syms.o netfilter.o \
12 inet6_connection_sock.o
12 13
13ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ 14ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
14 xfrm6_output.o 15 xfrm6_output.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a60585fd85ad..704fb73e6c5f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *
1195int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) 1195int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
1196{ 1196{
1197 const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; 1197 const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
1198 const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); 1198 const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
1199 u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; 1199 u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
1200 u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 1200 u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
1201 int sk_ipv6only = ipv6_only_sock(sk); 1201 int sk_ipv6only = ipv6_only_sock(sk);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d9546380fa04..68afc53be662 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -167,6 +167,7 @@ lookup_protocol:
167 sk->sk_reuse = 1; 167 sk->sk_reuse = 1;
168 168
169 inet = inet_sk(sk); 169 inet = inet_sk(sk);
170 inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
170 171
171 if (SOCK_RAW == sock->type) { 172 if (SOCK_RAW == sock->type) {
172 inet->num = protocol; 173 inet->num = protocol;
@@ -389,6 +390,8 @@ int inet6_destroy_sock(struct sock *sk)
389 return 0; 390 return 0;
390} 391}
391 392
393EXPORT_SYMBOL_GPL(inet6_destroy_sock);
394
392/* 395/*
393 * This does both peername and sockname. 396 * This does both peername and sockname.
394 */ 397 */
@@ -431,7 +434,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
431int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 434int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
432{ 435{
433 struct sock *sk = sock->sk; 436 struct sock *sk = sock->sk;
434 int err = -EINVAL;
435 437
436 switch(cmd) 438 switch(cmd)
437 { 439 {
@@ -450,16 +452,15 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
450 case SIOCSIFDSTADDR: 452 case SIOCSIFDSTADDR:
451 return addrconf_set_dstaddr((void __user *) arg); 453 return addrconf_set_dstaddr((void __user *) arg);
452 default: 454 default:
453 if (!sk->sk_prot->ioctl || 455 if (!sk->sk_prot->ioctl)
454 (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD) 456 return -ENOIOCTLCMD;
455 return(dev_ioctl(cmd,(void __user *) arg)); 457 return sk->sk_prot->ioctl(sk, cmd, arg);
456 return err;
457 } 458 }
458 /*NOTREACHED*/ 459 /*NOTREACHED*/
459 return(0); 460 return(0);
460} 461}
461 462
462struct proto_ops inet6_stream_ops = { 463const struct proto_ops inet6_stream_ops = {
463 .family = PF_INET6, 464 .family = PF_INET6,
464 .owner = THIS_MODULE, 465 .owner = THIS_MODULE,
465 .release = inet6_release, 466 .release = inet6_release,
@@ -480,7 +481,7 @@ struct proto_ops inet6_stream_ops = {
480 .sendpage = tcp_sendpage 481 .sendpage = tcp_sendpage
481}; 482};
482 483
483struct proto_ops inet6_dgram_ops = { 484const struct proto_ops inet6_dgram_ops = {
484 .family = PF_INET6, 485 .family = PF_INET6,
485 .owner = THIS_MODULE, 486 .owner = THIS_MODULE,
486 .release = inet6_release, 487 .release = inet6_release,
@@ -508,7 +509,7 @@ static struct net_proto_family inet6_family_ops = {
508}; 509};
509 510
510/* Same as inet6_dgram_ops, sans udp_poll. */ 511/* Same as inet6_dgram_ops, sans udp_poll. */
511static struct proto_ops inet6_sockraw_ops = { 512static const struct proto_ops inet6_sockraw_ops = {
512 .family = PF_INET6, 513 .family = PF_INET6,
513 .owner = THIS_MODULE, 514 .owner = THIS_MODULE,
514 .release = inet6_release, 515 .release = inet6_release,
@@ -609,6 +610,79 @@ inet6_unregister_protosw(struct inet_protosw *p)
609 } 610 }
610} 611}
611 612
613int inet6_sk_rebuild_header(struct sock *sk)
614{
615 int err;
616 struct dst_entry *dst;
617 struct ipv6_pinfo *np = inet6_sk(sk);
618
619 dst = __sk_dst_check(sk, np->dst_cookie);
620
621 if (dst == NULL) {
622 struct inet_sock *inet = inet_sk(sk);
623 struct in6_addr *final_p = NULL, final;
624 struct flowi fl;
625
626 memset(&fl, 0, sizeof(fl));
627 fl.proto = sk->sk_protocol;
628 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
629 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
630 fl.fl6_flowlabel = np->flow_label;
631 fl.oif = sk->sk_bound_dev_if;
632 fl.fl_ip_dport = inet->dport;
633 fl.fl_ip_sport = inet->sport;
634
635 if (np->opt && np->opt->srcrt) {
636 struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
637 ipv6_addr_copy(&final, &fl.fl6_dst);
638 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
639 final_p = &final;
640 }
641
642 err = ip6_dst_lookup(sk, &dst, &fl);
643 if (err) {
644 sk->sk_route_caps = 0;
645 return err;
646 }
647 if (final_p)
648 ipv6_addr_copy(&fl.fl6_dst, final_p);
649
650 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
651 sk->sk_err_soft = -err;
652 return err;
653 }
654
655 ip6_dst_store(sk, dst, NULL);
656 sk->sk_route_caps = dst->dev->features &
657 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
658 }
659
660 return 0;
661}
662
663EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
664
665int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
666{
667 struct ipv6_pinfo *np = inet6_sk(sk);
668 struct inet6_skb_parm *opt = IP6CB(skb);
669
670 if (np->rxopt.all) {
671 if ((opt->hop && (np->rxopt.bits.hopopts ||
672 np->rxopt.bits.ohopopts)) ||
673 ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) &&
674 np->rxopt.bits.rxflow) ||
675 (opt->srcrt && (np->rxopt.bits.srcrt ||
676 np->rxopt.bits.osrcrt)) ||
677 ((opt->dst1 || opt->dst0) &&
678 (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
679 return 1;
680 }
681 return 0;
682}
683
684EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
685
612int 686int
613snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) 687snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
614{ 688{
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index f3629730eb15..13cc7f895583 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <net/icmp.h> 34#include <net/icmp.h>
35#include <net/ipv6.h> 35#include <net/ipv6.h>
36#include <net/protocol.h>
36#include <net/xfrm.h> 37#include <net/xfrm.h>
37#include <asm/scatterlist.h> 38#include <asm/scatterlist.h>
38 39
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 8bfbe9970793..6de8ee1a5ad9 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -36,6 +36,7 @@
36#include <linux/random.h> 36#include <linux/random.h>
37#include <net/icmp.h> 37#include <net/icmp.h>
38#include <net/ipv6.h> 38#include <net/ipv6.h>
39#include <net/protocol.h>
39#include <linux/icmpv6.h> 40#include <linux/icmpv6.h>
40 41
41static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) 42static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index be6faf311387..113374dc342c 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
413 return opt; 413 return opt;
414} 414}
415 415
416EXPORT_SYMBOL_GPL(ipv6_invert_rthdr);
417
416/********************************** 418/**********************************
417 Hop-by-hop options. 419 Hop-by-hop options.
418 **********************************/ 420 **********************************/
@@ -579,6 +581,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
579 return opt2; 581 return opt2;
580} 582}
581 583
584EXPORT_SYMBOL_GPL(ipv6_dup_options);
585
582static int ipv6_renew_option(void *ohdr, 586static int ipv6_renew_option(void *ohdr,
583 struct ipv6_opt_hdr __user *newopt, int newoptlen, 587 struct ipv6_opt_hdr __user *newopt, int newoptlen,
584 int inherit, 588 int inherit,
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
new file mode 100644
index 000000000000..792f90f0f9ec
--- /dev/null
+++ b/net/ipv6/inet6_connection_sock.c
@@ -0,0 +1,199 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Support for INET6 connection oriented protocols.
7 *
8 * Authors: See the TCPv6 sources
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/in6.h>
19#include <linux/ipv6.h>
20#include <linux/jhash.h>
21
22#include <net/addrconf.h>
23#include <net/inet_connection_sock.h>
24#include <net/inet_ecn.h>
25#include <net/inet_hashtables.h>
26#include <net/ip6_route.h>
27#include <net/sock.h>
28
29int inet6_csk_bind_conflict(const struct sock *sk,
30 const struct inet_bind_bucket *tb)
31{
32 const struct sock *sk2;
33 const struct hlist_node *node;
34
35 /* We must walk the whole port owner list in this case. -DaveM */
36 sk_for_each_bound(sk2, node, &tb->owners) {
37 if (sk != sk2 &&
38 (!sk->sk_bound_dev_if ||
39 !sk2->sk_bound_dev_if ||
40 sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
41 (!sk->sk_reuse || !sk2->sk_reuse ||
42 sk2->sk_state == TCP_LISTEN) &&
43 ipv6_rcv_saddr_equal(sk, sk2))
44 break;
45 }
46
47 return node != NULL;
48}
49
50EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
51
52/*
53 * request_sock (formerly open request) hash tables.
54 */
55static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport,
56 const u32 rnd, const u16 synq_hsize)
57{
58 u32 a = raddr->s6_addr32[0];
59 u32 b = raddr->s6_addr32[1];
60 u32 c = raddr->s6_addr32[2];
61
62 a += JHASH_GOLDEN_RATIO;
63 b += JHASH_GOLDEN_RATIO;
64 c += rnd;
65 __jhash_mix(a, b, c);
66
67 a += raddr->s6_addr32[3];
68 b += (u32)rport;
69 __jhash_mix(a, b, c);
70
71 return c & (synq_hsize - 1);
72}
73
74struct request_sock *inet6_csk_search_req(const struct sock *sk,
75 struct request_sock ***prevp,
76 const __u16 rport,
77 const struct in6_addr *raddr,
78 const struct in6_addr *laddr,
79 const int iif)
80{
81 const struct inet_connection_sock *icsk = inet_csk(sk);
82 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
83 struct request_sock *req, **prev;
84
85 for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport,
86 lopt->hash_rnd,
87 lopt->nr_table_entries)];
88 (req = *prev) != NULL;
89 prev = &req->dl_next) {
90 const struct inet6_request_sock *treq = inet6_rsk(req);
91
92 if (inet_rsk(req)->rmt_port == rport &&
93 req->rsk_ops->family == AF_INET6 &&
94 ipv6_addr_equal(&treq->rmt_addr, raddr) &&
95 ipv6_addr_equal(&treq->loc_addr, laddr) &&
96 (!treq->iif || treq->iif == iif)) {
97 BUG_TRAP(req->sk == NULL);
98 *prevp = prev;
99 return req;
100 }
101 }
102
103 return NULL;
104}
105
106EXPORT_SYMBOL_GPL(inet6_csk_search_req);
107
108void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
109 struct request_sock *req,
110 const unsigned long timeout)
111{
112 struct inet_connection_sock *icsk = inet_csk(sk);
113 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
114 const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
115 inet_rsk(req)->rmt_port,
116 lopt->hash_rnd, lopt->nr_table_entries);
117
118 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
119 inet_csk_reqsk_queue_added(sk, timeout);
120}
121
122EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
123
124void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
125{
126 struct ipv6_pinfo *np = inet6_sk(sk);
127 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
128
129 sin6->sin6_family = AF_INET6;
130 ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
131 sin6->sin6_port = inet_sk(sk)->dport;
132 /* We do not store received flowlabel for TCP */
133 sin6->sin6_flowinfo = 0;
134 sin6->sin6_scope_id = 0;
135 if (sk->sk_bound_dev_if &&
136 ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
137 sin6->sin6_scope_id = sk->sk_bound_dev_if;
138}
139
140EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
141
142int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
143{
144 struct sock *sk = skb->sk;
145 struct inet_sock *inet = inet_sk(sk);
146 struct ipv6_pinfo *np = inet6_sk(sk);
147 struct flowi fl;
148 struct dst_entry *dst;
149 struct in6_addr *final_p = NULL, final;
150
151 memset(&fl, 0, sizeof(fl));
152 fl.proto = sk->sk_protocol;
153 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
154 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
155 fl.fl6_flowlabel = np->flow_label;
156 IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
157 fl.oif = sk->sk_bound_dev_if;
158 fl.fl_ip_sport = inet->sport;
159 fl.fl_ip_dport = inet->dport;
160
161 if (np->opt && np->opt->srcrt) {
162 struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
163 ipv6_addr_copy(&final, &fl.fl6_dst);
164 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
165 final_p = &final;
166 }
167
168 dst = __sk_dst_check(sk, np->dst_cookie);
169
170 if (dst == NULL) {
171 int err = ip6_dst_lookup(sk, &dst, &fl);
172
173 if (err) {
174 sk->sk_err_soft = -err;
175 return err;
176 }
177
178 if (final_p)
179 ipv6_addr_copy(&fl.fl6_dst, final_p);
180
181 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
182 sk->sk_route_caps = 0;
183 return err;
184 }
185
186 ip6_dst_store(sk, dst, NULL);
187 sk->sk_route_caps = dst->dev->features &
188 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
189 }
190
191 skb->dst = dst_clone(dst);
192
193 /* Restore final destination back after routing done */
194 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
195
196 return ip6_xmit(sk, skb, &fl, np->opt, 0);
197}
198
199EXPORT_SYMBOL_GPL(inet6_csk_xmit);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 01d5f46d4e40..4154f3a8b6cf 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -5,7 +5,8 @@
5 * 5 *
6 * Generic INET6 transport hashtables 6 * Generic INET6 transport hashtables
7 * 7 *
8 * Authors: Lotsa people, from code originally in tcp 8 * Authors: Lotsa people, from code originally in tcp, generalised here
9 * by Arnaldo Carvalho de Melo <acme@mandriva.com>
9 * 10 *
10 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
@@ -14,12 +15,13 @@
14 */ 15 */
15 16
16#include <linux/config.h> 17#include <linux/config.h>
17
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/random.h>
19 20
20#include <net/inet_connection_sock.h> 21#include <net/inet_connection_sock.h>
21#include <net/inet_hashtables.h> 22#include <net/inet_hashtables.h>
22#include <net/inet6_hashtables.h> 23#include <net/inet6_hashtables.h>
24#include <net/ip.h>
23 25
24struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, 26struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
25 const struct in6_addr *daddr, 27 const struct in6_addr *daddr,
@@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
79} 81}
80 82
81EXPORT_SYMBOL_GPL(inet6_lookup); 83EXPORT_SYMBOL_GPL(inet6_lookup);
84
85static int __inet6_check_established(struct inet_timewait_death_row *death_row,
86 struct sock *sk, const __u16 lport,
87 struct inet_timewait_sock **twp)
88{
89 struct inet_hashinfo *hinfo = death_row->hashinfo;
90 const struct inet_sock *inet = inet_sk(sk);
91 const struct ipv6_pinfo *np = inet6_sk(sk);
92 const struct in6_addr *daddr = &np->rcv_saddr;
93 const struct in6_addr *saddr = &np->daddr;
94 const int dif = sk->sk_bound_dev_if;
95 const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
96 const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr,
97 inet->dport);
98 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
99 struct sock *sk2;
100 const struct hlist_node *node;
101 struct inet_timewait_sock *tw;
102
103 prefetch(head->chain.first);
104 write_lock(&head->lock);
105
106 /* Check TIME-WAIT sockets first. */
107 sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
108 const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
109
110 tw = inet_twsk(sk2);
111
112 if(*((__u32 *)&(tw->tw_dport)) == ports &&
113 sk2->sk_family == PF_INET6 &&
114 ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
115 ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
116 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
117 if (twsk_unique(sk, sk2, twp))
118 goto unique;
119 else
120 goto not_unique;
121 }
122 }
123 tw = NULL;
124
125 /* And established part... */
126 sk_for_each(sk2, node, &head->chain) {
127 if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
128 goto not_unique;
129 }
130
131unique:
132 BUG_TRAP(sk_unhashed(sk));
133 __sk_add_node(sk, &head->chain);
134 sk->sk_hash = hash;
135 sock_prot_inc_use(sk->sk_prot);
136 write_unlock(&head->lock);
137
138 if (twp != NULL) {
139 *twp = tw;
140 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
141 } else if (tw != NULL) {
142 /* Silly. Should hash-dance instead... */
143 inet_twsk_deschedule(tw, death_row);
144 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
145
146 inet_twsk_put(tw);
147 }
148 return 0;
149
150not_unique:
151 write_unlock(&head->lock);
152 return -EADDRNOTAVAIL;
153}
154
155static inline u32 inet6_sk_port_offset(const struct sock *sk)
156{
157 const struct inet_sock *inet = inet_sk(sk);
158 const struct ipv6_pinfo *np = inet6_sk(sk);
159 return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
160 np->daddr.s6_addr32,
161 inet->dport);
162}
163
164int inet6_hash_connect(struct inet_timewait_death_row *death_row,
165 struct sock *sk)
166{
167 struct inet_hashinfo *hinfo = death_row->hashinfo;
168 const unsigned short snum = inet_sk(sk)->num;
169 struct inet_bind_hashbucket *head;
170 struct inet_bind_bucket *tb;
171 int ret;
172
173 if (snum == 0) {
174 const int low = sysctl_local_port_range[0];
175 const int high = sysctl_local_port_range[1];
176 const int range = high - low;
177 int i, port;
178 static u32 hint;
179 const u32 offset = hint + inet6_sk_port_offset(sk);
180 struct hlist_node *node;
181 struct inet_timewait_sock *tw = NULL;
182
183 local_bh_disable();
184 for (i = 1; i <= range; i++) {
185 port = low + (i + offset) % range;
186 head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
187 spin_lock(&head->lock);
188
189 /* Does not bother with rcv_saddr checks,
190 * because the established check is already
191 * unique enough.
192 */
193 inet_bind_bucket_for_each(tb, node, &head->chain) {
194 if (tb->port == port) {
195 BUG_TRAP(!hlist_empty(&tb->owners));
196 if (tb->fastreuse >= 0)
197 goto next_port;
198 if (!__inet6_check_established(death_row,
199 sk, port,
200 &tw))
201 goto ok;
202 goto next_port;
203 }
204 }
205
206 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
207 head, port);
208 if (!tb) {
209 spin_unlock(&head->lock);
210 break;
211 }
212 tb->fastreuse = -1;
213 goto ok;
214
215 next_port:
216 spin_unlock(&head->lock);
217 }
218 local_bh_enable();
219
220 return -EADDRNOTAVAIL;
221
222ok:
223 hint += i;
224
225 /* Head lock still held and bh's disabled */
226 inet_bind_hash(sk, tb, port);
227 if (sk_unhashed(sk)) {
228 inet_sk(sk)->sport = htons(port);
229 __inet6_hash(hinfo, sk);
230 }
231 spin_unlock(&head->lock);
232
233 if (tw) {
234 inet_twsk_deschedule(tw, death_row);
235 inet_twsk_put(tw);
236 }
237
238 ret = 0;
239 goto out;
240 }
241
242 head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
243 tb = inet_csk(sk)->icsk_bind_hash;
244 spin_lock_bh(&head->lock);
245
246 if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
247 __inet6_hash(hinfo, sk);
248 spin_unlock_bh(&head->lock);
249 return 0;
250 } else {
251 spin_unlock(&head->lock);
252 /* No definite answer... Walk to established hash table */
253 ret = __inet6_check_established(death_row, sk, snum, NULL);
254out:
255 local_bh_enable();
256 return ret;
257 }
258}
259
260EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 1cf02765fb5c..89d12b4817a9 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -200,6 +200,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label)
200 return NULL; 200 return NULL;
201} 201}
202 202
203EXPORT_SYMBOL_GPL(fl6_sock_lookup);
204
203void fl6_free_socklist(struct sock *sk) 205void fl6_free_socklist(struct sock *sk)
204{ 206{
205 struct ipv6_pinfo *np = inet6_sk(sk); 207 struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8523c76ebf76..b4c4beba0ede 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -775,6 +775,8 @@ out_err_release:
775 return err; 775 return err;
776} 776}
777 777
778EXPORT_SYMBOL_GPL(ip6_dst_lookup);
779
778static inline int ip6_ufo_append_data(struct sock *sk, 780static inline int ip6_ufo_append_data(struct sock *sk,
779 int getfrag(void *from, char *to, int offset, int len, 781 int getfrag(void *from, char *to, int offset, int len,
780 int odd, struct sk_buff *skb), 782 int odd, struct sk_buff *skb),
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 55917fb17094..626dd39685f2 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -47,6 +47,7 @@
47#include <linux/rtnetlink.h> 47#include <linux/rtnetlink.h>
48#include <net/icmp.h> 48#include <net/icmp.h>
49#include <net/ipv6.h> 49#include <net/ipv6.h>
50#include <net/protocol.h>
50#include <linux/ipv6.h> 51#include <linux/ipv6.h>
51#include <linux/icmpv6.h> 52#include <linux/icmpv6.h>
52 53
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3620718defe6..c63868dd2ca2 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
163 sk_refcnt_debug_dec(sk); 163 sk_refcnt_debug_dec(sk);
164 164
165 if (sk->sk_protocol == IPPROTO_TCP) { 165 if (sk->sk_protocol == IPPROTO_TCP) {
166 struct tcp_sock *tp = tcp_sk(sk); 166 struct inet_connection_sock *icsk = inet_csk(sk);
167 167
168 local_bh_disable(); 168 local_bh_disable();
169 sock_prot_dec_use(sk->sk_prot); 169 sock_prot_dec_use(sk->sk_prot);
170 sock_prot_inc_use(&tcp_prot); 170 sock_prot_inc_use(&tcp_prot);
171 local_bh_enable(); 171 local_bh_enable();
172 sk->sk_prot = &tcp_prot; 172 sk->sk_prot = &tcp_prot;
173 tp->af_specific = &ipv4_specific; 173 icsk->icsk_af_ops = &ipv4_specific;
174 sk->sk_socket->ops = &inet_stream_ops; 174 sk->sk_socket->ops = &inet_stream_ops;
175 sk->sk_family = PF_INET; 175 sk->sk_family = PF_INET;
176 tcp_sync_mss(sk, tp->pmtu_cookie); 176 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
177 } else { 177 } else {
178 local_bh_disable(); 178 local_bh_disable();
179 sock_prot_dec_use(sk->sk_prot); 179 sock_prot_dec_use(sk->sk_prot);
@@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
317 } 317 }
318 318
319 retv = 0; 319 retv = 0;
320 if (sk->sk_type == SOCK_STREAM) { 320 if (inet_sk(sk)->is_icsk) {
321 if (opt) { 321 if (opt) {
322 struct tcp_sock *tp = tcp_sk(sk); 322 struct inet_connection_sock *icsk = inet_csk(sk);
323 if (!((1 << sk->sk_state) & 323 if (!((1 << sk->sk_state) &
324 (TCPF_LISTEN | TCPF_CLOSE)) 324 (TCPF_LISTEN | TCPF_CLOSE))
325 && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { 325 && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
326 tp->ext_header_len = opt->opt_flen + opt->opt_nflen; 326 icsk->icsk_ext_hdr_len =
327 tcp_sync_mss(sk, tp->pmtu_cookie); 327 opt->opt_flen + opt->opt_nflen;
328 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
328 } 329 }
329 } 330 }
330 opt = xchg(&np->opt, opt); 331 opt = xchg(&np->opt, opt);
@@ -380,14 +381,15 @@ sticky_done:
380 goto done; 381 goto done;
381update: 382update:
382 retv = 0; 383 retv = 0;
383 if (sk->sk_type == SOCK_STREAM) { 384 if (inet_sk(sk)->is_icsk) {
384 if (opt) { 385 if (opt) {
385 struct tcp_sock *tp = tcp_sk(sk); 386 struct inet_connection_sock *icsk = inet_csk(sk);
386 if (!((1 << sk->sk_state) & 387 if (!((1 << sk->sk_state) &
387 (TCPF_LISTEN | TCPF_CLOSE)) 388 (TCPF_LISTEN | TCPF_CLOSE))
388 && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { 389 && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
389 tp->ext_header_len = opt->opt_flen + opt->opt_nflen; 390 icsk->icsk_ext_hdr_len =
390 tcp_sync_mss(sk, tp->pmtu_cookie); 391 opt->opt_flen + opt->opt_nflen;
392 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
391 } 393 }
392 } 394 }
393 opt = xchg(&np->opt, opt); 395 opt = xchg(&np->opt, opt);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f829a4ad3ccc..1cf305a9f8dd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
224 224
225 mc_lst->ifindex = dev->ifindex; 225 mc_lst->ifindex = dev->ifindex;
226 mc_lst->sfmode = MCAST_EXCLUDE; 226 mc_lst->sfmode = MCAST_EXCLUDE;
227 mc_lst->sflock = RW_LOCK_UNLOCKED; 227 rwlock_init(&mc_lst->sflock);
228 mc_lst->sflist = NULL; 228 mc_lst->sflist = NULL;
229 229
230 /* 230 /*
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 95d469271c4d..ea43ef1d94a7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -15,6 +15,7 @@
15 * - new extension header parser code 15 * - new extension header parser code
16 */ 16 */
17#include <linux/config.h> 17#include <linux/config.h>
18#include <linux/in.h>
18#include <linux/skbuff.h> 19#include <linux/skbuff.h>
19#include <linux/kmod.h> 20#include <linux/kmod.h>
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
@@ -86,11 +87,6 @@ static DECLARE_MUTEX(ip6t_mutex);
86 context stops packets coming through and allows user context to read 87 context stops packets coming through and allows user context to read
87 the counters or update the rules. 88 the counters or update the rules.
88 89
89 To be cache friendly on SMP, we arrange them like so:
90 [ n-entries ]
91 ... cache-align padding ...
92 [ n-entries ]
93
94 Hence the start of any table is given by get_table() below. */ 90 Hence the start of any table is given by get_table() below. */
95 91
96/* The table itself */ 92/* The table itself */
@@ -108,20 +104,15 @@ struct ip6t_table_info
108 unsigned int underflow[NF_IP6_NUMHOOKS]; 104 unsigned int underflow[NF_IP6_NUMHOOKS];
109 105
110 /* ip6t_entry tables: one per CPU */ 106 /* ip6t_entry tables: one per CPU */
111 char entries[0] ____cacheline_aligned; 107 void *entries[NR_CPUS];
112}; 108};
113 109
114static LIST_HEAD(ip6t_target); 110static LIST_HEAD(ip6t_target);
115static LIST_HEAD(ip6t_match); 111static LIST_HEAD(ip6t_match);
116static LIST_HEAD(ip6t_tables); 112static LIST_HEAD(ip6t_tables);
113#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
117#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) 114#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
118 115
119#ifdef CONFIG_SMP
120#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
121#else
122#define TABLE_OFFSET(t,p) 0
123#endif
124
125#if 0 116#if 0
126#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) 117#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
127#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) 118#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -376,8 +367,7 @@ ip6t_do_table(struct sk_buff **pskb,
376 367
377 read_lock_bh(&table->lock); 368 read_lock_bh(&table->lock);
378 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 369 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
379 table_base = (void *)table->private->entries 370 table_base = (void *)table->private->entries[smp_processor_id()];
380 + TABLE_OFFSET(table->private, smp_processor_id());
381 e = get_entry(table_base, table->private->hook_entry[hook]); 371 e = get_entry(table_base, table->private->hook_entry[hook]);
382 372
383#ifdef CONFIG_NETFILTER_DEBUG 373#ifdef CONFIG_NETFILTER_DEBUG
@@ -649,7 +639,8 @@ unconditional(const struct ip6t_ip6 *ipv6)
649/* Figures out from what hook each rule can be called: returns 0 if 639/* Figures out from what hook each rule can be called: returns 0 if
650 there are loops. Puts hook bitmask in comefrom. */ 640 there are loops. Puts hook bitmask in comefrom. */
651static int 641static int
652mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) 642mark_source_chains(struct ip6t_table_info *newinfo,
643 unsigned int valid_hooks, void *entry0)
653{ 644{
654 unsigned int hook; 645 unsigned int hook;
655 646
@@ -658,7 +649,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
658 for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { 649 for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) {
659 unsigned int pos = newinfo->hook_entry[hook]; 650 unsigned int pos = newinfo->hook_entry[hook];
660 struct ip6t_entry *e 651 struct ip6t_entry *e
661 = (struct ip6t_entry *)(newinfo->entries + pos); 652 = (struct ip6t_entry *)(entry0 + pos);
662 653
663 if (!(valid_hooks & (1 << hook))) 654 if (!(valid_hooks & (1 << hook)))
664 continue; 655 continue;
@@ -708,13 +699,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
708 goto next; 699 goto next;
709 700
710 e = (struct ip6t_entry *) 701 e = (struct ip6t_entry *)
711 (newinfo->entries + pos); 702 (entry0 + pos);
712 } while (oldpos == pos + e->next_offset); 703 } while (oldpos == pos + e->next_offset);
713 704
714 /* Move along one */ 705 /* Move along one */
715 size = e->next_offset; 706 size = e->next_offset;
716 e = (struct ip6t_entry *) 707 e = (struct ip6t_entry *)
717 (newinfo->entries + pos + size); 708 (entry0 + pos + size);
718 e->counters.pcnt = pos; 709 e->counters.pcnt = pos;
719 pos += size; 710 pos += size;
720 } else { 711 } else {
@@ -731,7 +722,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
731 newpos = pos + e->next_offset; 722 newpos = pos + e->next_offset;
732 } 723 }
733 e = (struct ip6t_entry *) 724 e = (struct ip6t_entry *)
734 (newinfo->entries + newpos); 725 (entry0 + newpos);
735 e->counters.pcnt = pos; 726 e->counters.pcnt = pos;
736 pos = newpos; 727 pos = newpos;
737 } 728 }
@@ -941,6 +932,7 @@ static int
941translate_table(const char *name, 932translate_table(const char *name,
942 unsigned int valid_hooks, 933 unsigned int valid_hooks,
943 struct ip6t_table_info *newinfo, 934 struct ip6t_table_info *newinfo,
935 void *entry0,
944 unsigned int size, 936 unsigned int size,
945 unsigned int number, 937 unsigned int number,
946 const unsigned int *hook_entries, 938 const unsigned int *hook_entries,
@@ -961,11 +953,11 @@ translate_table(const char *name,
961 duprintf("translate_table: size %u\n", newinfo->size); 953 duprintf("translate_table: size %u\n", newinfo->size);
962 i = 0; 954 i = 0;
963 /* Walk through entries, checking offsets. */ 955 /* Walk through entries, checking offsets. */
964 ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, 956 ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
965 check_entry_size_and_hooks, 957 check_entry_size_and_hooks,
966 newinfo, 958 newinfo,
967 newinfo->entries, 959 entry0,
968 newinfo->entries + size, 960 entry0 + size,
969 hook_entries, underflows, &i); 961 hook_entries, underflows, &i);
970 if (ret != 0) 962 if (ret != 0)
971 return ret; 963 return ret;
@@ -993,27 +985,24 @@ translate_table(const char *name,
993 } 985 }
994 } 986 }
995 987
996 if (!mark_source_chains(newinfo, valid_hooks)) 988 if (!mark_source_chains(newinfo, valid_hooks, entry0))
997 return -ELOOP; 989 return -ELOOP;
998 990
999 /* Finally, each sanity check must pass */ 991 /* Finally, each sanity check must pass */
1000 i = 0; 992 i = 0;
1001 ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, 993 ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
1002 check_entry, name, size, &i); 994 check_entry, name, size, &i);
1003 995
1004 if (ret != 0) { 996 if (ret != 0) {
1005 IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, 997 IP6T_ENTRY_ITERATE(entry0, newinfo->size,
1006 cleanup_entry, &i); 998 cleanup_entry, &i);
1007 return ret; 999 return ret;
1008 } 1000 }
1009 1001
1010 /* And one copy for every other CPU */ 1002 /* And one copy for every other CPU */
1011 for_each_cpu(i) { 1003 for_each_cpu(i) {
1012 if (i == 0) 1004 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
1013 continue; 1005 memcpy(newinfo->entries[i], entry0, newinfo->size);
1014 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
1015 newinfo->entries,
1016 SMP_ALIGN(newinfo->size));
1017 } 1006 }
1018 1007
1019 return ret; 1008 return ret;
@@ -1029,15 +1018,12 @@ replace_table(struct ip6t_table *table,
1029 1018
1030#ifdef CONFIG_NETFILTER_DEBUG 1019#ifdef CONFIG_NETFILTER_DEBUG
1031 { 1020 {
1032 struct ip6t_entry *table_base; 1021 int cpu;
1033 unsigned int i;
1034 1022
1035 for_each_cpu(i) { 1023 for_each_cpu(cpu) {
1036 table_base = 1024 struct ip6t_entry *table_base = newinfo->entries[cpu];
1037 (void *)newinfo->entries 1025 if (table_base)
1038 + TABLE_OFFSET(newinfo, i); 1026 table_base->comefrom = 0xdead57ac;
1039
1040 table_base->comefrom = 0xdead57ac;
1041 } 1027 }
1042 } 1028 }
1043#endif 1029#endif
@@ -1072,16 +1058,44 @@ add_entry_to_counter(const struct ip6t_entry *e,
1072 return 0; 1058 return 0;
1073} 1059}
1074 1060
1061static inline int
1062set_entry_to_counter(const struct ip6t_entry *e,
1063 struct ip6t_counters total[],
1064 unsigned int *i)
1065{
1066 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
1067
1068 (*i)++;
1069 return 0;
1070}
1071
1075static void 1072static void
1076get_counters(const struct ip6t_table_info *t, 1073get_counters(const struct ip6t_table_info *t,
1077 struct ip6t_counters counters[]) 1074 struct ip6t_counters counters[])
1078{ 1075{
1079 unsigned int cpu; 1076 unsigned int cpu;
1080 unsigned int i; 1077 unsigned int i;
1078 unsigned int curcpu;
1079
1080 /* Instead of clearing (by a previous call to memset())
1081 * the counters and using adds, we set the counters
1082 * with data used by 'current' CPU
1083 * We dont care about preemption here.
1084 */
1085 curcpu = raw_smp_processor_id();
1086
1087 i = 0;
1088 IP6T_ENTRY_ITERATE(t->entries[curcpu],
1089 t->size,
1090 set_entry_to_counter,
1091 counters,
1092 &i);
1081 1093
1082 for_each_cpu(cpu) { 1094 for_each_cpu(cpu) {
1095 if (cpu == curcpu)
1096 continue;
1083 i = 0; 1097 i = 0;
1084 IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), 1098 IP6T_ENTRY_ITERATE(t->entries[cpu],
1085 t->size, 1099 t->size,
1086 add_entry_to_counter, 1100 add_entry_to_counter,
1087 counters, 1101 counters,
@@ -1098,6 +1112,7 @@ copy_entries_to_user(unsigned int total_size,
1098 struct ip6t_entry *e; 1112 struct ip6t_entry *e;
1099 struct ip6t_counters *counters; 1113 struct ip6t_counters *counters;
1100 int ret = 0; 1114 int ret = 0;
1115 void *loc_cpu_entry;
1101 1116
1102 /* We need atomic snapshot of counters: rest doesn't change 1117 /* We need atomic snapshot of counters: rest doesn't change
1103 (other than comefrom, which userspace doesn't care 1118 (other than comefrom, which userspace doesn't care
@@ -1109,13 +1124,13 @@ copy_entries_to_user(unsigned int total_size,
1109 return -ENOMEM; 1124 return -ENOMEM;
1110 1125
1111 /* First, sum counters... */ 1126 /* First, sum counters... */
1112 memset(counters, 0, countersize);
1113 write_lock_bh(&table->lock); 1127 write_lock_bh(&table->lock);
1114 get_counters(table->private, counters); 1128 get_counters(table->private, counters);
1115 write_unlock_bh(&table->lock); 1129 write_unlock_bh(&table->lock);
1116 1130
1117 /* ... then copy entire thing from CPU 0... */ 1131 /* choose the copy that is on our node/cpu */
1118 if (copy_to_user(userptr, table->private->entries, total_size) != 0) { 1132 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1133 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
1119 ret = -EFAULT; 1134 ret = -EFAULT;
1120 goto free_counters; 1135 goto free_counters;
1121 } 1136 }
@@ -1127,7 +1142,7 @@ copy_entries_to_user(unsigned int total_size,
1127 struct ip6t_entry_match *m; 1142 struct ip6t_entry_match *m;
1128 struct ip6t_entry_target *t; 1143 struct ip6t_entry_target *t;
1129 1144
1130 e = (struct ip6t_entry *)(table->private->entries + off); 1145 e = (struct ip6t_entry *)(loc_cpu_entry + off);
1131 if (copy_to_user(userptr + off 1146 if (copy_to_user(userptr + off
1132 + offsetof(struct ip6t_entry, counters), 1147 + offsetof(struct ip6t_entry, counters),
1133 &counters[num], 1148 &counters[num],
@@ -1196,6 +1211,46 @@ get_entries(const struct ip6t_get_entries *entries,
1196 return ret; 1211 return ret;
1197} 1212}
1198 1213
1214static void free_table_info(struct ip6t_table_info *info)
1215{
1216 int cpu;
1217 for_each_cpu(cpu) {
1218 if (info->size <= PAGE_SIZE)
1219 kfree(info->entries[cpu]);
1220 else
1221 vfree(info->entries[cpu]);
1222 }
1223 kfree(info);
1224}
1225
1226static struct ip6t_table_info *alloc_table_info(unsigned int size)
1227{
1228 struct ip6t_table_info *newinfo;
1229 int cpu;
1230
1231 newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL);
1232 if (!newinfo)
1233 return NULL;
1234
1235 newinfo->size = size;
1236
1237 for_each_cpu(cpu) {
1238 if (size <= PAGE_SIZE)
1239 newinfo->entries[cpu] = kmalloc_node(size,
1240 GFP_KERNEL,
1241 cpu_to_node(cpu));
1242 else
1243 newinfo->entries[cpu] = vmalloc_node(size,
1244 cpu_to_node(cpu));
1245 if (newinfo->entries[cpu] == NULL) {
1246 free_table_info(newinfo);
1247 return NULL;
1248 }
1249 }
1250
1251 return newinfo;
1252}
1253
1199static int 1254static int
1200do_replace(void __user *user, unsigned int len) 1255do_replace(void __user *user, unsigned int len)
1201{ 1256{
@@ -1204,6 +1259,7 @@ do_replace(void __user *user, unsigned int len)
1204 struct ip6t_table *t; 1259 struct ip6t_table *t;
1205 struct ip6t_table_info *newinfo, *oldinfo; 1260 struct ip6t_table_info *newinfo, *oldinfo;
1206 struct ip6t_counters *counters; 1261 struct ip6t_counters *counters;
1262 void *loc_cpu_entry, *loc_cpu_old_entry;
1207 1263
1208 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1264 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1209 return -EFAULT; 1265 return -EFAULT;
@@ -1212,13 +1268,13 @@ do_replace(void __user *user, unsigned int len)
1212 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) 1268 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
1213 return -ENOMEM; 1269 return -ENOMEM;
1214 1270
1215 newinfo = vmalloc(sizeof(struct ip6t_table_info) 1271 newinfo = alloc_table_info(tmp.size);
1216 + SMP_ALIGN(tmp.size) *
1217 (highest_possible_processor_id()+1));
1218 if (!newinfo) 1272 if (!newinfo)
1219 return -ENOMEM; 1273 return -ENOMEM;
1220 1274
1221 if (copy_from_user(newinfo->entries, user + sizeof(tmp), 1275 /* choose the copy that is on our node/cpu */
1276 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1277 if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
1222 tmp.size) != 0) { 1278 tmp.size) != 0) {
1223 ret = -EFAULT; 1279 ret = -EFAULT;
1224 goto free_newinfo; 1280 goto free_newinfo;
@@ -1229,10 +1285,9 @@ do_replace(void __user *user, unsigned int len)
1229 ret = -ENOMEM; 1285 ret = -ENOMEM;
1230 goto free_newinfo; 1286 goto free_newinfo;
1231 } 1287 }
1232 memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters));
1233 1288
1234 ret = translate_table(tmp.name, tmp.valid_hooks, 1289 ret = translate_table(tmp.name, tmp.valid_hooks,
1235 newinfo, tmp.size, tmp.num_entries, 1290 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1236 tmp.hook_entry, tmp.underflow); 1291 tmp.hook_entry, tmp.underflow);
1237 if (ret != 0) 1292 if (ret != 0)
1238 goto free_newinfo_counters; 1293 goto free_newinfo_counters;
@@ -1271,8 +1326,9 @@ do_replace(void __user *user, unsigned int len)
1271 /* Get the old counters. */ 1326 /* Get the old counters. */
1272 get_counters(oldinfo, counters); 1327 get_counters(oldinfo, counters);
1273 /* Decrease module usage counts and free resource */ 1328 /* Decrease module usage counts and free resource */
1274 IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); 1329 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1275 vfree(oldinfo); 1330 IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
1331 free_table_info(oldinfo);
1276 if (copy_to_user(tmp.counters, counters, 1332 if (copy_to_user(tmp.counters, counters,
1277 sizeof(struct ip6t_counters) * tmp.num_counters) != 0) 1333 sizeof(struct ip6t_counters) * tmp.num_counters) != 0)
1278 ret = -EFAULT; 1334 ret = -EFAULT;
@@ -1284,11 +1340,11 @@ do_replace(void __user *user, unsigned int len)
1284 module_put(t->me); 1340 module_put(t->me);
1285 up(&ip6t_mutex); 1341 up(&ip6t_mutex);
1286 free_newinfo_counters_untrans: 1342 free_newinfo_counters_untrans:
1287 IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); 1343 IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
1288 free_newinfo_counters: 1344 free_newinfo_counters:
1289 vfree(counters); 1345 vfree(counters);
1290 free_newinfo: 1346 free_newinfo:
1291 vfree(newinfo); 1347 free_table_info(newinfo);
1292 return ret; 1348 return ret;
1293} 1349}
1294 1350
@@ -1321,6 +1377,7 @@ do_add_counters(void __user *user, unsigned int len)
1321 struct ip6t_counters_info tmp, *paddc; 1377 struct ip6t_counters_info tmp, *paddc;
1322 struct ip6t_table *t; 1378 struct ip6t_table *t;
1323 int ret = 0; 1379 int ret = 0;
1380 void *loc_cpu_entry;
1324 1381
1325 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1382 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1326 return -EFAULT; 1383 return -EFAULT;
@@ -1350,7 +1407,9 @@ do_add_counters(void __user *user, unsigned int len)
1350 } 1407 }
1351 1408
1352 i = 0; 1409 i = 0;
1353 IP6T_ENTRY_ITERATE(t->private->entries, 1410 /* Choose the copy that is on our node */
1411 loc_cpu_entry = t->private->entries[smp_processor_id()];
1412 IP6T_ENTRY_ITERATE(loc_cpu_entry,
1354 t->private->size, 1413 t->private->size,
1355 add_counter_to_entry, 1414 add_counter_to_entry,
1356 paddc->counters, 1415 paddc->counters,
@@ -1543,28 +1602,29 @@ int ip6t_register_table(struct ip6t_table *table,
1543 struct ip6t_table_info *newinfo; 1602 struct ip6t_table_info *newinfo;
1544 static struct ip6t_table_info bootstrap 1603 static struct ip6t_table_info bootstrap
1545 = { 0, 0, 0, { 0 }, { 0 }, { } }; 1604 = { 0, 0, 0, { 0 }, { 0 }, { } };
1605 void *loc_cpu_entry;
1546 1606
1547 newinfo = vmalloc(sizeof(struct ip6t_table_info) 1607 newinfo = alloc_table_info(repl->size);
1548 + SMP_ALIGN(repl->size) *
1549 (highest_possible_processor_id()+1));
1550 if (!newinfo) 1608 if (!newinfo)
1551 return -ENOMEM; 1609 return -ENOMEM;
1552 1610
1553 memcpy(newinfo->entries, repl->entries, repl->size); 1611 /* choose the copy on our node/cpu */
1612 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1613 memcpy(loc_cpu_entry, repl->entries, repl->size);
1554 1614
1555 ret = translate_table(table->name, table->valid_hooks, 1615 ret = translate_table(table->name, table->valid_hooks,
1556 newinfo, repl->size, 1616 newinfo, loc_cpu_entry, repl->size,
1557 repl->num_entries, 1617 repl->num_entries,
1558 repl->hook_entry, 1618 repl->hook_entry,
1559 repl->underflow); 1619 repl->underflow);
1560 if (ret != 0) { 1620 if (ret != 0) {
1561 vfree(newinfo); 1621 free_table_info(newinfo);
1562 return ret; 1622 return ret;
1563 } 1623 }
1564 1624
1565 ret = down_interruptible(&ip6t_mutex); 1625 ret = down_interruptible(&ip6t_mutex);
1566 if (ret != 0) { 1626 if (ret != 0) {
1567 vfree(newinfo); 1627 free_table_info(newinfo);
1568 return ret; 1628 return ret;
1569 } 1629 }
1570 1630
@@ -1593,20 +1653,23 @@ int ip6t_register_table(struct ip6t_table *table,
1593 return ret; 1653 return ret;
1594 1654
1595 free_unlock: 1655 free_unlock:
1596 vfree(newinfo); 1656 free_table_info(newinfo);
1597 goto unlock; 1657 goto unlock;
1598} 1658}
1599 1659
1600void ip6t_unregister_table(struct ip6t_table *table) 1660void ip6t_unregister_table(struct ip6t_table *table)
1601{ 1661{
1662 void *loc_cpu_entry;
1663
1602 down(&ip6t_mutex); 1664 down(&ip6t_mutex);
1603 LIST_DELETE(&ip6t_tables, table); 1665 LIST_DELETE(&ip6t_tables, table);
1604 up(&ip6t_mutex); 1666 up(&ip6t_mutex);
1605 1667
1606 /* Decrease module usage counts and free resources */ 1668 /* Decrease module usage counts and free resources */
1607 IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, 1669 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1670 IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
1608 cleanup_entry, NULL); 1671 cleanup_entry, NULL);
1609 vfree(table->private); 1672 free_table_info(table->private);
1610} 1673}
1611 1674
1612/* Returns 1 if the port is matched by the range, 0 otherwise */ 1675/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0cd1d1bd9033..ae4653bfd654 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/if_arp.h>
16#include <linux/ip.h> 17#include <linux/ip.h>
17#include <linux/spinlock.h> 18#include <linux/spinlock.h>
18#include <linux/icmpv6.h> 19#include <linux/icmpv6.h>
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index dde37793d20b..268918d5deea 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/ip.h>
12#include <linux/ipv6.h> 13#include <linux/ipv6.h>
13#include <linux/types.h> 14#include <linux/types.h>
14#include <net/checksum.h> 15#include <net/checksum.h>
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index 24bc0cde43a1..65937de1b58c 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/ip.h>
12#include <linux/ipv6.h> 13#include <linux/ipv6.h>
13#include <linux/types.h> 14#include <linux/types.h>
14#include <net/checksum.h> 15#include <net/checksum.h>
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c2c52af9e560..f3e5ffbd592f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -98,7 +98,7 @@ struct nf_ct_frag6_queue
98#define FRAG6Q_HASHSZ 64 98#define FRAG6Q_HASHSZ 64
99 99
100static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; 100static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ];
101static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; 101static DEFINE_RWLOCK(nf_ct_frag6_lock);
102static u32 nf_ct_frag6_hash_rnd; 102static u32 nf_ct_frag6_hash_rnd;
103static LIST_HEAD(nf_ct_frag6_lru_list); 103static LIST_HEAD(nf_ct_frag6_lru_list);
104int nf_ct_frag6_nqueues = 0; 104int nf_ct_frag6_nqueues = 0;
@@ -371,7 +371,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct
371 init_timer(&fq->timer); 371 init_timer(&fq->timer);
372 fq->timer.function = nf_ct_frag6_expire; 372 fq->timer.function = nf_ct_frag6_expire;
373 fq->timer.data = (long) fq; 373 fq->timer.data = (long) fq;
374 fq->lock = SPIN_LOCK_UNLOCKED; 374 spin_lock_init(&fq->lock);
375 atomic_set(&fq->refcnt, 1); 375 atomic_set(&fq->refcnt, 1);
376 376
377 return nf_ct_frag6_intern(hash, fq); 377 return nf_ct_frag6_intern(hash, fq);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a66900cda2af..66f1d12ea578 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -32,6 +32,7 @@
32#include <linux/icmpv6.h> 32#include <linux/icmpv6.h>
33#include <linux/netfilter.h> 33#include <linux/netfilter.h>
34#include <linux/netfilter_ipv6.h> 34#include <linux/netfilter_ipv6.h>
35#include <linux/skbuff.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/ioctls.h> 37#include <asm/ioctls.h>
37#include <asm/bug.h> 38#include <asm/bug.h>
@@ -433,25 +434,14 @@ out:
433 return err; 434 return err;
434 435
435csum_copy_err: 436csum_copy_err:
436 /* Clear queue. */ 437 skb_kill_datagram(sk, skb, flags);
437 if (flags&MSG_PEEK) {
438 int clear = 0;
439 spin_lock_bh(&sk->sk_receive_queue.lock);
440 if (skb == skb_peek(&sk->sk_receive_queue)) {
441 __skb_unlink(skb, &sk->sk_receive_queue);
442 clear = 1;
443 }
444 spin_unlock_bh(&sk->sk_receive_queue.lock);
445 if (clear)
446 kfree_skb(skb);
447 }
448 438
449 /* Error for blocking case is chosen to masquerade 439 /* Error for blocking case is chosen to masquerade
450 as some normal condition. 440 as some normal condition.
451 */ 441 */
452 err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; 442 err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
453 /* FIXME: increment a raw6 drops counter here */ 443 /* FIXME: increment a raw6 drops counter here */
454 goto out_free; 444 goto out;
455} 445}
456 446
457static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, 447static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8827389abaf7..2947bc56d8a0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -48,6 +48,7 @@
48#include <net/tcp.h> 48#include <net/tcp.h>
49#include <net/ndisc.h> 49#include <net/ndisc.h>
50#include <net/inet6_hashtables.h> 50#include <net/inet6_hashtables.h>
51#include <net/inet6_connection_sock.h>
51#include <net/ipv6.h> 52#include <net/ipv6.h>
52#include <net/transp_v6.h> 53#include <net/transp_v6.h>
53#include <net/addrconf.h> 54#include <net/addrconf.h>
@@ -59,6 +60,7 @@
59#include <net/addrconf.h> 60#include <net/addrconf.h>
60#include <net/snmp.h> 61#include <net/snmp.h>
61#include <net/dsfield.h> 62#include <net/dsfield.h>
63#include <net/timewait_sock.h>
62 64
63#include <asm/uaccess.h> 65#include <asm/uaccess.h>
64 66
@@ -67,224 +69,33 @@
67 69
68static void tcp_v6_send_reset(struct sk_buff *skb); 70static void tcp_v6_send_reset(struct sk_buff *skb);
69static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); 71static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
70static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 72static void tcp_v6_send_check(struct sock *sk, int len,
71 struct sk_buff *skb); 73 struct sk_buff *skb);
72 74
73static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); 75static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
74static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
75 76
76static struct tcp_func ipv6_mapped; 77static struct inet_connection_sock_af_ops ipv6_mapped;
77static struct tcp_func ipv6_specific; 78static struct inet_connection_sock_af_ops ipv6_specific;
78 79
79static inline int tcp_v6_bind_conflict(const struct sock *sk,
80 const struct inet_bind_bucket *tb)
81{
82 const struct sock *sk2;
83 const struct hlist_node *node;
84
85 /* We must walk the whole port owner list in this case. -DaveM */
86 sk_for_each_bound(sk2, node, &tb->owners) {
87 if (sk != sk2 &&
88 (!sk->sk_bound_dev_if ||
89 !sk2->sk_bound_dev_if ||
90 sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
91 (!sk->sk_reuse || !sk2->sk_reuse ||
92 sk2->sk_state == TCP_LISTEN) &&
93 ipv6_rcv_saddr_equal(sk, sk2))
94 break;
95 }
96
97 return node != NULL;
98}
99
100/* Grrr, addr_type already calculated by caller, but I don't want
101 * to add some silly "cookie" argument to this method just for that.
102 * But it doesn't matter, the recalculation is in the rarest path
103 * this function ever takes.
104 */
105static int tcp_v6_get_port(struct sock *sk, unsigned short snum) 80static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
106{ 81{
107 struct inet_bind_hashbucket *head; 82 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
108 struct inet_bind_bucket *tb; 83 inet6_csk_bind_conflict);
109 struct hlist_node *node;
110 int ret;
111
112 local_bh_disable();
113 if (snum == 0) {
114 int low = sysctl_local_port_range[0];
115 int high = sysctl_local_port_range[1];
116 int remaining = (high - low) + 1;
117 int rover = net_random() % (high - low) + low;
118
119 do {
120 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
121 spin_lock(&head->lock);
122 inet_bind_bucket_for_each(tb, node, &head->chain)
123 if (tb->port == rover)
124 goto next;
125 break;
126 next:
127 spin_unlock(&head->lock);
128 if (++rover > high)
129 rover = low;
130 } while (--remaining > 0);
131
132 /* Exhausted local port range during search? It is not
133 * possible for us to be holding one of the bind hash
134 * locks if this test triggers, because if 'remaining'
135 * drops to zero, we broke out of the do/while loop at
136 * the top level, not from the 'break;' statement.
137 */
138 ret = 1;
139 if (unlikely(remaining <= 0))
140 goto fail;
141
142 /* OK, here is the one we will use. */
143 snum = rover;
144 } else {
145 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
146 spin_lock(&head->lock);
147 inet_bind_bucket_for_each(tb, node, &head->chain)
148 if (tb->port == snum)
149 goto tb_found;
150 }
151 tb = NULL;
152 goto tb_not_found;
153tb_found:
154 if (tb && !hlist_empty(&tb->owners)) {
155 if (tb->fastreuse > 0 && sk->sk_reuse &&
156 sk->sk_state != TCP_LISTEN) {
157 goto success;
158 } else {
159 ret = 1;
160 if (tcp_v6_bind_conflict(sk, tb))
161 goto fail_unlock;
162 }
163 }
164tb_not_found:
165 ret = 1;
166 if (tb == NULL) {
167 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
168 if (tb == NULL)
169 goto fail_unlock;
170 }
171 if (hlist_empty(&tb->owners)) {
172 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
173 tb->fastreuse = 1;
174 else
175 tb->fastreuse = 0;
176 } else if (tb->fastreuse &&
177 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
178 tb->fastreuse = 0;
179
180success:
181 if (!inet_csk(sk)->icsk_bind_hash)
182 inet_bind_hash(sk, tb, snum);
183 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
184 ret = 0;
185
186fail_unlock:
187 spin_unlock(&head->lock);
188fail:
189 local_bh_enable();
190 return ret;
191}
192
193static __inline__ void __tcp_v6_hash(struct sock *sk)
194{
195 struct hlist_head *list;
196 rwlock_t *lock;
197
198 BUG_TRAP(sk_unhashed(sk));
199
200 if (sk->sk_state == TCP_LISTEN) {
201 list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
202 lock = &tcp_hashinfo.lhash_lock;
203 inet_listen_wlock(&tcp_hashinfo);
204 } else {
205 unsigned int hash;
206 sk->sk_hash = hash = inet6_sk_ehashfn(sk);
207 hash &= (tcp_hashinfo.ehash_size - 1);
208 list = &tcp_hashinfo.ehash[hash].chain;
209 lock = &tcp_hashinfo.ehash[hash].lock;
210 write_lock(lock);
211 }
212
213 __sk_add_node(sk, list);
214 sock_prot_inc_use(sk->sk_prot);
215 write_unlock(lock);
216} 84}
217 85
218
219static void tcp_v6_hash(struct sock *sk) 86static void tcp_v6_hash(struct sock *sk)
220{ 87{
221 if (sk->sk_state != TCP_CLOSE) { 88 if (sk->sk_state != TCP_CLOSE) {
222 struct tcp_sock *tp = tcp_sk(sk); 89 if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
223
224 if (tp->af_specific == &ipv6_mapped) {
225 tcp_prot.hash(sk); 90 tcp_prot.hash(sk);
226 return; 91 return;
227 } 92 }
228 local_bh_disable(); 93 local_bh_disable();
229 __tcp_v6_hash(sk); 94 __inet6_hash(&tcp_hashinfo, sk);
230 local_bh_enable(); 95 local_bh_enable();
231 } 96 }
232} 97}
233 98
234/*
235 * Open request hash tables.
236 */
237
238static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
239{
240 u32 a, b, c;
241
242 a = raddr->s6_addr32[0];
243 b = raddr->s6_addr32[1];
244 c = raddr->s6_addr32[2];
245
246 a += JHASH_GOLDEN_RATIO;
247 b += JHASH_GOLDEN_RATIO;
248 c += rnd;
249 __jhash_mix(a, b, c);
250
251 a += raddr->s6_addr32[3];
252 b += (u32) rport;
253 __jhash_mix(a, b, c);
254
255 return c & (TCP_SYNQ_HSIZE - 1);
256}
257
258static struct request_sock *tcp_v6_search_req(const struct sock *sk,
259 struct request_sock ***prevp,
260 __u16 rport,
261 struct in6_addr *raddr,
262 struct in6_addr *laddr,
263 int iif)
264{
265 const struct inet_connection_sock *icsk = inet_csk(sk);
266 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
267 struct request_sock *req, **prev;
268
269 for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
270 (req = *prev) != NULL;
271 prev = &req->dl_next) {
272 const struct tcp6_request_sock *treq = tcp6_rsk(req);
273
274 if (inet_rsk(req)->rmt_port == rport &&
275 req->rsk_ops->family == AF_INET6 &&
276 ipv6_addr_equal(&treq->rmt_addr, raddr) &&
277 ipv6_addr_equal(&treq->loc_addr, laddr) &&
278 (!treq->iif || treq->iif == iif)) {
279 BUG_TRAP(req->sk == NULL);
280 *prevp = prev;
281 return req;
282 }
283 }
284
285 return NULL;
286}
287
288static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, 99static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len,
289 struct in6_addr *saddr, 100 struct in6_addr *saddr,
290 struct in6_addr *daddr, 101 struct in6_addr *daddr,
@@ -308,195 +119,12 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
308 } 119 }
309} 120}
310 121
311static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
312 struct inet_timewait_sock **twp)
313{
314 struct inet_sock *inet = inet_sk(sk);
315 const struct ipv6_pinfo *np = inet6_sk(sk);
316 const struct in6_addr *daddr = &np->rcv_saddr;
317 const struct in6_addr *saddr = &np->daddr;
318 const int dif = sk->sk_bound_dev_if;
319 const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
320 unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport);
321 struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
322 struct sock *sk2;
323 const struct hlist_node *node;
324 struct inet_timewait_sock *tw;
325
326 prefetch(head->chain.first);
327 write_lock(&head->lock);
328
329 /* Check TIME-WAIT sockets first. */
330 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
331 const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
332
333 tw = inet_twsk(sk2);
334
335 if(*((__u32 *)&(tw->tw_dport)) == ports &&
336 sk2->sk_family == PF_INET6 &&
337 ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) &&
338 ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) &&
339 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
340 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
341 struct tcp_sock *tp = tcp_sk(sk);
342
343 if (tcptw->tw_ts_recent_stamp &&
344 (!twp ||
345 (sysctl_tcp_tw_reuse &&
346 xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
347 /* See comment in tcp_ipv4.c */
348 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
349 if (!tp->write_seq)
350 tp->write_seq = 1;
351 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
352 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
353 sock_hold(sk2);
354 goto unique;
355 } else
356 goto not_unique;
357 }
358 }
359 tw = NULL;
360
361 /* And established part... */
362 sk_for_each(sk2, node, &head->chain) {
363 if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
364 goto not_unique;
365 }
366
367unique:
368 BUG_TRAP(sk_unhashed(sk));
369 __sk_add_node(sk, &head->chain);
370 sk->sk_hash = hash;
371 sock_prot_inc_use(sk->sk_prot);
372 write_unlock(&head->lock);
373
374 if (twp) {
375 *twp = tw;
376 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
377 } else if (tw) {
378 /* Silly. Should hash-dance instead... */
379 inet_twsk_deschedule(tw, &tcp_death_row);
380 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
381
382 inet_twsk_put(tw);
383 }
384 return 0;
385
386not_unique:
387 write_unlock(&head->lock);
388 return -EADDRNOTAVAIL;
389}
390
391static inline u32 tcpv6_port_offset(const struct sock *sk)
392{
393 const struct inet_sock *inet = inet_sk(sk);
394 const struct ipv6_pinfo *np = inet6_sk(sk);
395
396 return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
397 np->daddr.s6_addr32,
398 inet->dport);
399}
400
401static int tcp_v6_hash_connect(struct sock *sk)
402{
403 unsigned short snum = inet_sk(sk)->num;
404 struct inet_bind_hashbucket *head;
405 struct inet_bind_bucket *tb;
406 int ret;
407
408 if (!snum) {
409 int low = sysctl_local_port_range[0];
410 int high = sysctl_local_port_range[1];
411 int range = high - low;
412 int i;
413 int port;
414 static u32 hint;
415 u32 offset = hint + tcpv6_port_offset(sk);
416 struct hlist_node *node;
417 struct inet_timewait_sock *tw = NULL;
418
419 local_bh_disable();
420 for (i = 1; i <= range; i++) {
421 port = low + (i + offset) % range;
422 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
423 spin_lock(&head->lock);
424
425 /* Does not bother with rcv_saddr checks,
426 * because the established check is already
427 * unique enough.
428 */
429 inet_bind_bucket_for_each(tb, node, &head->chain) {
430 if (tb->port == port) {
431 BUG_TRAP(!hlist_empty(&tb->owners));
432 if (tb->fastreuse >= 0)
433 goto next_port;
434 if (!__tcp_v6_check_established(sk,
435 port,
436 &tw))
437 goto ok;
438 goto next_port;
439 }
440 }
441
442 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
443 if (!tb) {
444 spin_unlock(&head->lock);
445 break;
446 }
447 tb->fastreuse = -1;
448 goto ok;
449
450 next_port:
451 spin_unlock(&head->lock);
452 }
453 local_bh_enable();
454
455 return -EADDRNOTAVAIL;
456
457ok:
458 hint += i;
459
460 /* Head lock still held and bh's disabled */
461 inet_bind_hash(sk, tb, port);
462 if (sk_unhashed(sk)) {
463 inet_sk(sk)->sport = htons(port);
464 __tcp_v6_hash(sk);
465 }
466 spin_unlock(&head->lock);
467
468 if (tw) {
469 inet_twsk_deschedule(tw, &tcp_death_row);
470 inet_twsk_put(tw);
471 }
472
473 ret = 0;
474 goto out;
475 }
476
477 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
478 tb = inet_csk(sk)->icsk_bind_hash;
479 spin_lock_bh(&head->lock);
480
481 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
482 __tcp_v6_hash(sk);
483 spin_unlock_bh(&head->lock);
484 return 0;
485 } else {
486 spin_unlock(&head->lock);
487 /* No definite answer... Walk to established hash table */
488 ret = __tcp_v6_check_established(sk, snum, NULL);
489out:
490 local_bh_enable();
491 return ret;
492 }
493}
494
495static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 122static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
496 int addr_len) 123 int addr_len)
497{ 124{
498 struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; 125 struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
499 struct inet_sock *inet = inet_sk(sk); 126 struct inet_sock *inet = inet_sk(sk);
127 struct inet_connection_sock *icsk = inet_csk(sk);
500 struct ipv6_pinfo *np = inet6_sk(sk); 128 struct ipv6_pinfo *np = inet6_sk(sk);
501 struct tcp_sock *tp = tcp_sk(sk); 129 struct tcp_sock *tp = tcp_sk(sk);
502 struct in6_addr *saddr = NULL, *final_p = NULL, final; 130 struct in6_addr *saddr = NULL, *final_p = NULL, final;
@@ -571,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
571 */ 199 */
572 200
573 if (addr_type == IPV6_ADDR_MAPPED) { 201 if (addr_type == IPV6_ADDR_MAPPED) {
574 u32 exthdrlen = tp->ext_header_len; 202 u32 exthdrlen = icsk->icsk_ext_hdr_len;
575 struct sockaddr_in sin; 203 struct sockaddr_in sin;
576 204
577 SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); 205 SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -583,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
583 sin.sin_port = usin->sin6_port; 211 sin.sin_port = usin->sin6_port;
584 sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; 212 sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
585 213
586 tp->af_specific = &ipv6_mapped; 214 icsk->icsk_af_ops = &ipv6_mapped;
587 sk->sk_backlog_rcv = tcp_v4_do_rcv; 215 sk->sk_backlog_rcv = tcp_v4_do_rcv;
588 216
589 err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); 217 err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
590 218
591 if (err) { 219 if (err) {
592 tp->ext_header_len = exthdrlen; 220 icsk->icsk_ext_hdr_len = exthdrlen;
593 tp->af_specific = &ipv6_specific; 221 icsk->icsk_af_ops = &ipv6_specific;
594 sk->sk_backlog_rcv = tcp_v6_do_rcv; 222 sk->sk_backlog_rcv = tcp_v6_do_rcv;
595 goto failure; 223 goto failure;
596 } else { 224 } else {
@@ -643,16 +271,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
643 sk->sk_route_caps = dst->dev->features & 271 sk->sk_route_caps = dst->dev->features &
644 ~(NETIF_F_IP_CSUM | NETIF_F_TSO); 272 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
645 273
646 tp->ext_header_len = 0; 274 icsk->icsk_ext_hdr_len = 0;
647 if (np->opt) 275 if (np->opt)
648 tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; 276 icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
277 np->opt->opt_nflen);
649 278
650 tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); 279 tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
651 280
652 inet->dport = usin->sin6_port; 281 inet->dport = usin->sin6_port;
653 282
654 tcp_set_state(sk, TCP_SYN_SENT); 283 tcp_set_state(sk, TCP_SYN_SENT);
655 err = tcp_v6_hash_connect(sk); 284 err = inet6_hash_connect(&tcp_death_row, sk);
656 if (err) 285 if (err)
657 goto late_failure; 286 goto late_failure;
658 287
@@ -758,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
758 } else 387 } else
759 dst_hold(dst); 388 dst_hold(dst);
760 389
761 if (tp->pmtu_cookie > dst_mtu(dst)) { 390 if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
762 tcp_sync_mss(sk, dst_mtu(dst)); 391 tcp_sync_mss(sk, dst_mtu(dst));
763 tcp_simple_retransmit(sk); 392 tcp_simple_retransmit(sk);
764 } /* else let the usual retransmit timer handle it */ 393 } /* else let the usual retransmit timer handle it */
@@ -775,8 +404,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
775 if (sock_owned_by_user(sk)) 404 if (sock_owned_by_user(sk))
776 goto out; 405 goto out;
777 406
778 req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, 407 req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
779 &hdr->saddr, inet6_iif(skb)); 408 &hdr->saddr, inet6_iif(skb));
780 if (!req) 409 if (!req)
781 goto out; 410 goto out;
782 411
@@ -822,7 +451,7 @@ out:
822static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, 451static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
823 struct dst_entry *dst) 452 struct dst_entry *dst)
824{ 453{
825 struct tcp6_request_sock *treq = tcp6_rsk(req); 454 struct inet6_request_sock *treq = inet6_rsk(req);
826 struct ipv6_pinfo *np = inet6_sk(sk); 455 struct ipv6_pinfo *np = inet6_sk(sk);
827 struct sk_buff * skb; 456 struct sk_buff * skb;
828 struct ipv6_txoptions *opt = NULL; 457 struct ipv6_txoptions *opt = NULL;
@@ -888,8 +517,8 @@ done:
888 517
889static void tcp_v6_reqsk_destructor(struct request_sock *req) 518static void tcp_v6_reqsk_destructor(struct request_sock *req)
890{ 519{
891 if (tcp6_rsk(req)->pktopts) 520 if (inet6_rsk(req)->pktopts)
892 kfree_skb(tcp6_rsk(req)->pktopts); 521 kfree_skb(inet6_rsk(req)->pktopts);
893} 522}
894 523
895static struct request_sock_ops tcp6_request_sock_ops = { 524static struct request_sock_ops tcp6_request_sock_ops = {
@@ -901,26 +530,15 @@ static struct request_sock_ops tcp6_request_sock_ops = {
901 .send_reset = tcp_v6_send_reset 530 .send_reset = tcp_v6_send_reset
902}; 531};
903 532
904static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) 533static struct timewait_sock_ops tcp6_timewait_sock_ops = {
905{ 534 .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
906 struct ipv6_pinfo *np = inet6_sk(sk); 535 .twsk_unique = tcp_twsk_unique,
907 struct inet6_skb_parm *opt = IP6CB(skb); 536};
908
909 if (np->rxopt.all) {
910 if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
911 ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) ||
912 (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) ||
913 ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
914 return 1;
915 }
916 return 0;
917}
918
919 537
920static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 538static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
921 struct sk_buff *skb)
922{ 539{
923 struct ipv6_pinfo *np = inet6_sk(sk); 540 struct ipv6_pinfo *np = inet6_sk(sk);
541 struct tcphdr *th = skb->h.th;
924 542
925 if (skb->ip_summed == CHECKSUM_HW) { 543 if (skb->ip_summed == CHECKSUM_HW) {
926 th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); 544 th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0);
@@ -1091,8 +709,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
1091 struct sock *nsk; 709 struct sock *nsk;
1092 710
1093 /* Find possible connection requests. */ 711 /* Find possible connection requests. */
1094 req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, 712 req = inet6_csk_search_req(sk, &prev, th->source,
1095 &skb->nh.ipv6h->daddr, inet6_iif(skb)); 713 &skb->nh.ipv6h->saddr,
714 &skb->nh.ipv6h->daddr, inet6_iif(skb));
1096 if (req) 715 if (req)
1097 return tcp_check_req(sk, skb, req, prev); 716 return tcp_check_req(sk, skb, req, prev);
1098 717
@@ -1116,23 +735,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
1116 return sk; 735 return sk;
1117} 736}
1118 737
1119static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
1120{
1121 struct inet_connection_sock *icsk = inet_csk(sk);
1122 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
1123 const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
1124
1125 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
1126 inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
1127}
1128
1129
1130/* FIXME: this is substantially similar to the ipv4 code. 738/* FIXME: this is substantially similar to the ipv4 code.
1131 * Can some kind of merge be done? -- erics 739 * Can some kind of merge be done? -- erics
1132 */ 740 */
1133static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) 741static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1134{ 742{
1135 struct tcp6_request_sock *treq; 743 struct inet6_request_sock *treq;
1136 struct ipv6_pinfo *np = inet6_sk(sk); 744 struct ipv6_pinfo *np = inet6_sk(sk);
1137 struct tcp_options_received tmp_opt; 745 struct tcp_options_received tmp_opt;
1138 struct tcp_sock *tp = tcp_sk(sk); 746 struct tcp_sock *tp = tcp_sk(sk);
@@ -1157,7 +765,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1157 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) 765 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1158 goto drop; 766 goto drop;
1159 767
1160 req = reqsk_alloc(&tcp6_request_sock_ops); 768 req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
1161 if (req == NULL) 769 if (req == NULL)
1162 goto drop; 770 goto drop;
1163 771
@@ -1170,7 +778,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1170 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 778 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1171 tcp_openreq_init(req, &tmp_opt, skb); 779 tcp_openreq_init(req, &tmp_opt, skb);
1172 780
1173 treq = tcp6_rsk(req); 781 treq = inet6_rsk(req);
1174 ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr); 782 ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
1175 ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr); 783 ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
1176 TCP_ECN_create_request(req, skb->h.th); 784 TCP_ECN_create_request(req, skb->h.th);
@@ -1196,8 +804,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1196 if (tcp_v6_send_synack(sk, req, NULL)) 804 if (tcp_v6_send_synack(sk, req, NULL))
1197 goto drop; 805 goto drop;
1198 806
1199 tcp_v6_synq_add(sk, req); 807 inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1200
1201 return 0; 808 return 0;
1202 809
1203drop: 810drop:
@@ -1212,7 +819,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1212 struct request_sock *req, 819 struct request_sock *req,
1213 struct dst_entry *dst) 820 struct dst_entry *dst)
1214{ 821{
1215 struct tcp6_request_sock *treq = tcp6_rsk(req); 822 struct inet6_request_sock *treq = inet6_rsk(req);
1216 struct ipv6_pinfo *newnp, *np = inet6_sk(sk); 823 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
1217 struct tcp6_sock *newtcp6sk; 824 struct tcp6_sock *newtcp6sk;
1218 struct inet_sock *newinet; 825 struct inet_sock *newinet;
@@ -1247,7 +854,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1247 854
1248 ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); 855 ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
1249 856
1250 newtp->af_specific = &ipv6_mapped; 857 inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
1251 newsk->sk_backlog_rcv = tcp_v4_do_rcv; 858 newsk->sk_backlog_rcv = tcp_v4_do_rcv;
1252 newnp->pktoptions = NULL; 859 newnp->pktoptions = NULL;
1253 newnp->opt = NULL; 860 newnp->opt = NULL;
@@ -1261,10 +868,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1261 */ 868 */
1262 869
1263 /* It is tricky place. Until this moment IPv4 tcp 870 /* It is tricky place. Until this moment IPv4 tcp
1264 worked with IPv6 af_tcp.af_specific. 871 worked with IPv6 icsk.icsk_af_ops.
1265 Sync it now. 872 Sync it now.
1266 */ 873 */
1267 tcp_sync_mss(newsk, newtp->pmtu_cookie); 874 tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
1268 875
1269 return newsk; 876 return newsk;
1270 } 877 }
@@ -1371,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1371 sock_kfree_s(sk, opt, opt->tot_len); 978 sock_kfree_s(sk, opt, opt->tot_len);
1372 } 979 }
1373 980
1374 newtp->ext_header_len = 0; 981 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1375 if (newnp->opt) 982 if (newnp->opt)
1376 newtp->ext_header_len = newnp->opt->opt_nflen + 983 inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
1377 newnp->opt->opt_flen; 984 newnp->opt->opt_flen);
1378 985
1379 tcp_sync_mss(newsk, dst_mtu(dst)); 986 tcp_sync_mss(newsk, dst_mtu(dst));
1380 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 987 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
@@ -1382,7 +989,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1382 989
1383 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; 990 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
1384 991
1385 __tcp_v6_hash(newsk); 992 __inet6_hash(&tcp_hashinfo, newsk);
1386 inet_inherit_port(&tcp_hashinfo, sk, newsk); 993 inet_inherit_port(&tcp_hashinfo, sk, newsk);
1387 994
1388 return newsk; 995 return newsk;
@@ -1679,139 +1286,16 @@ do_time_wait:
1679 goto discard_it; 1286 goto discard_it;
1680} 1287}
1681 1288
1682static int tcp_v6_rebuild_header(struct sock *sk)
1683{
1684 int err;
1685 struct dst_entry *dst;
1686 struct ipv6_pinfo *np = inet6_sk(sk);
1687
1688 dst = __sk_dst_check(sk, np->dst_cookie);
1689
1690 if (dst == NULL) {
1691 struct inet_sock *inet = inet_sk(sk);
1692 struct in6_addr *final_p = NULL, final;
1693 struct flowi fl;
1694
1695 memset(&fl, 0, sizeof(fl));
1696 fl.proto = IPPROTO_TCP;
1697 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
1698 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
1699 fl.fl6_flowlabel = np->flow_label;
1700 fl.oif = sk->sk_bound_dev_if;
1701 fl.fl_ip_dport = inet->dport;
1702 fl.fl_ip_sport = inet->sport;
1703
1704 if (np->opt && np->opt->srcrt) {
1705 struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
1706 ipv6_addr_copy(&final, &fl.fl6_dst);
1707 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
1708 final_p = &final;
1709 }
1710
1711 err = ip6_dst_lookup(sk, &dst, &fl);
1712 if (err) {
1713 sk->sk_route_caps = 0;
1714 return err;
1715 }
1716 if (final_p)
1717 ipv6_addr_copy(&fl.fl6_dst, final_p);
1718
1719 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
1720 sk->sk_err_soft = -err;
1721 return err;
1722 }
1723
1724 ip6_dst_store(sk, dst, NULL);
1725 sk->sk_route_caps = dst->dev->features &
1726 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
1727 }
1728
1729 return 0;
1730}
1731
1732static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok)
1733{
1734 struct sock *sk = skb->sk;
1735 struct inet_sock *inet = inet_sk(sk);
1736 struct ipv6_pinfo *np = inet6_sk(sk);
1737 struct flowi fl;
1738 struct dst_entry *dst;
1739 struct in6_addr *final_p = NULL, final;
1740
1741 memset(&fl, 0, sizeof(fl));
1742 fl.proto = IPPROTO_TCP;
1743 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
1744 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
1745 fl.fl6_flowlabel = np->flow_label;
1746 IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
1747 fl.oif = sk->sk_bound_dev_if;
1748 fl.fl_ip_sport = inet->sport;
1749 fl.fl_ip_dport = inet->dport;
1750
1751 if (np->opt && np->opt->srcrt) {
1752 struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
1753 ipv6_addr_copy(&final, &fl.fl6_dst);
1754 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
1755 final_p = &final;
1756 }
1757
1758 dst = __sk_dst_check(sk, np->dst_cookie);
1759
1760 if (dst == NULL) {
1761 int err = ip6_dst_lookup(sk, &dst, &fl);
1762
1763 if (err) {
1764 sk->sk_err_soft = -err;
1765 return err;
1766 }
1767
1768 if (final_p)
1769 ipv6_addr_copy(&fl.fl6_dst, final_p);
1770
1771 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
1772 sk->sk_route_caps = 0;
1773 return err;
1774 }
1775
1776 ip6_dst_store(sk, dst, NULL);
1777 sk->sk_route_caps = dst->dev->features &
1778 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
1779 }
1780
1781 skb->dst = dst_clone(dst);
1782
1783 /* Restore final destination back after routing done */
1784 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
1785
1786 return ip6_xmit(sk, skb, &fl, np->opt, 0);
1787}
1788
1789static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1790{
1791 struct ipv6_pinfo *np = inet6_sk(sk);
1792 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
1793
1794 sin6->sin6_family = AF_INET6;
1795 ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
1796 sin6->sin6_port = inet_sk(sk)->dport;
1797 /* We do not store received flowlabel for TCP */
1798 sin6->sin6_flowinfo = 0;
1799 sin6->sin6_scope_id = 0;
1800 if (sk->sk_bound_dev_if &&
1801 ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
1802 sin6->sin6_scope_id = sk->sk_bound_dev_if;
1803}
1804
1805static int tcp_v6_remember_stamp(struct sock *sk) 1289static int tcp_v6_remember_stamp(struct sock *sk)
1806{ 1290{
1807 /* Alas, not yet... */ 1291 /* Alas, not yet... */
1808 return 0; 1292 return 0;
1809} 1293}
1810 1294
1811static struct tcp_func ipv6_specific = { 1295static struct inet_connection_sock_af_ops ipv6_specific = {
1812 .queue_xmit = tcp_v6_xmit, 1296 .queue_xmit = inet6_csk_xmit,
1813 .send_check = tcp_v6_send_check, 1297 .send_check = tcp_v6_send_check,
1814 .rebuild_header = tcp_v6_rebuild_header, 1298 .rebuild_header = inet6_sk_rebuild_header,
1815 .conn_request = tcp_v6_conn_request, 1299 .conn_request = tcp_v6_conn_request,
1816 .syn_recv_sock = tcp_v6_syn_recv_sock, 1300 .syn_recv_sock = tcp_v6_syn_recv_sock,
1817 .remember_stamp = tcp_v6_remember_stamp, 1301 .remember_stamp = tcp_v6_remember_stamp,
@@ -1819,7 +1303,7 @@ static struct tcp_func ipv6_specific = {
1819 1303
1820 .setsockopt = ipv6_setsockopt, 1304 .setsockopt = ipv6_setsockopt,
1821 .getsockopt = ipv6_getsockopt, 1305 .getsockopt = ipv6_getsockopt,
1822 .addr2sockaddr = v6_addr2sockaddr, 1306 .addr2sockaddr = inet6_csk_addr2sockaddr,
1823 .sockaddr_len = sizeof(struct sockaddr_in6) 1307 .sockaddr_len = sizeof(struct sockaddr_in6)
1824}; 1308};
1825 1309
@@ -1827,7 +1311,7 @@ static struct tcp_func ipv6_specific = {
1827 * TCP over IPv4 via INET6 API 1311 * TCP over IPv4 via INET6 API
1828 */ 1312 */
1829 1313
1830static struct tcp_func ipv6_mapped = { 1314static struct inet_connection_sock_af_ops ipv6_mapped = {
1831 .queue_xmit = ip_queue_xmit, 1315 .queue_xmit = ip_queue_xmit,
1832 .send_check = tcp_v4_send_check, 1316 .send_check = tcp_v4_send_check,
1833 .rebuild_header = inet_sk_rebuild_header, 1317 .rebuild_header = inet_sk_rebuild_header,
@@ -1838,7 +1322,7 @@ static struct tcp_func ipv6_mapped = {
1838 1322
1839 .setsockopt = ipv6_setsockopt, 1323 .setsockopt = ipv6_setsockopt,
1840 .getsockopt = ipv6_getsockopt, 1324 .getsockopt = ipv6_getsockopt,
1841 .addr2sockaddr = v6_addr2sockaddr, 1325 .addr2sockaddr = inet6_csk_addr2sockaddr,
1842 .sockaddr_len = sizeof(struct sockaddr_in6) 1326 .sockaddr_len = sizeof(struct sockaddr_in6)
1843}; 1327};
1844 1328
@@ -1877,8 +1361,9 @@ static int tcp_v6_init_sock(struct sock *sk)
1877 1361
1878 sk->sk_state = TCP_CLOSE; 1362 sk->sk_state = TCP_CLOSE;
1879 1363
1880 tp->af_specific = &ipv6_specific; 1364 icsk->icsk_af_ops = &ipv6_specific;
1881 icsk->icsk_ca_ops = &tcp_init_congestion_ops; 1365 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1366 icsk->icsk_sync_mss = tcp_sync_mss;
1882 sk->sk_write_space = sk_stream_write_space; 1367 sk->sk_write_space = sk_stream_write_space;
1883 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 1368 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1884 1369
@@ -1900,14 +1385,13 @@ static int tcp_v6_destroy_sock(struct sock *sk)
1900static void get_openreq6(struct seq_file *seq, 1385static void get_openreq6(struct seq_file *seq,
1901 struct sock *sk, struct request_sock *req, int i, int uid) 1386 struct sock *sk, struct request_sock *req, int i, int uid)
1902{ 1387{
1903 struct in6_addr *dest, *src;
1904 int ttd = req->expires - jiffies; 1388 int ttd = req->expires - jiffies;
1389 struct in6_addr *src = &inet6_rsk(req)->loc_addr;
1390 struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
1905 1391
1906 if (ttd < 0) 1392 if (ttd < 0)
1907 ttd = 0; 1393 ttd = 0;
1908 1394
1909 src = &tcp6_rsk(req)->loc_addr;
1910 dest = &tcp6_rsk(req)->rmt_addr;
1911 seq_printf(seq, 1395 seq_printf(seq,
1912 "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " 1396 "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
1913 "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", 1397 "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
@@ -1988,14 +1472,14 @@ static void get_timewait6_sock(struct seq_file *seq,
1988{ 1472{
1989 struct in6_addr *dest, *src; 1473 struct in6_addr *dest, *src;
1990 __u16 destp, srcp; 1474 __u16 destp, srcp;
1991 struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); 1475 struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
1992 int ttd = tw->tw_ttd - jiffies; 1476 int ttd = tw->tw_ttd - jiffies;
1993 1477
1994 if (ttd < 0) 1478 if (ttd < 0)
1995 ttd = 0; 1479 ttd = 0;
1996 1480
1997 dest = &tcp6tw->tw_v6_daddr; 1481 dest = &tw6->tw_v6_daddr;
1998 src = &tcp6tw->tw_v6_rcv_saddr; 1482 src = &tw6->tw_v6_rcv_saddr;
1999 destp = ntohs(tw->tw_dport); 1483 destp = ntohs(tw->tw_dport);
2000 srcp = ntohs(tw->tw_sport); 1484 srcp = ntohs(tw->tw_sport);
2001 1485
@@ -2093,7 +1577,7 @@ struct proto tcpv6_prot = {
2093 .sysctl_rmem = sysctl_tcp_rmem, 1577 .sysctl_rmem = sysctl_tcp_rmem,
2094 .max_header = MAX_TCP_HEADER, 1578 .max_header = MAX_TCP_HEADER,
2095 .obj_size = sizeof(struct tcp6_sock), 1579 .obj_size = sizeof(struct tcp6_sock),
2096 .twsk_obj_size = sizeof(struct tcp6_timewait_sock), 1580 .twsk_prot = &tcp6_timewait_sock_ops,
2097 .rsk_prot = &tcp6_request_sock_ops, 1581 .rsk_prot = &tcp6_request_sock_ops,
2098}; 1582};
2099 1583
@@ -2110,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = {
2110 .ops = &inet6_stream_ops, 1594 .ops = &inet6_stream_ops,
2111 .capability = -1, 1595 .capability = -1,
2112 .no_check = 0, 1596 .no_check = 0,
2113 .flags = INET_PROTOSW_PERMANENT, 1597 .flags = INET_PROTOSW_PERMANENT |
1598 INET_PROTOSW_ICSK,
2114}; 1599};
2115 1600
2116void __init tcpv6_init(void) 1601void __init tcpv6_init(void)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5cc8731eb55b..d8538dcea813 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -36,6 +36,7 @@
36#include <linux/ipv6.h> 36#include <linux/ipv6.h>
37#include <linux/icmpv6.h> 37#include <linux/icmpv6.h>
38#include <linux/init.h> 38#include <linux/init.h>
39#include <linux/skbuff.h>
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
41#include <net/sock.h> 42#include <net/sock.h>
@@ -300,20 +301,7 @@ out:
300 return err; 301 return err;
301 302
302csum_copy_err: 303csum_copy_err:
303 /* Clear queue. */ 304 skb_kill_datagram(sk, skb, flags);
304 if (flags&MSG_PEEK) {
305 int clear = 0;
306 spin_lock_bh(&sk->sk_receive_queue.lock);
307 if (skb == skb_peek(&sk->sk_receive_queue)) {
308 __skb_unlink(skb, &sk->sk_receive_queue);
309 clear = 1;
310 }
311 spin_unlock_bh(&sk->sk_receive_queue.lock);
312 if (clear)
313 kfree_skb(skb);
314 }
315
316 skb_free_datagram(sk, skb);
317 305
318 if (flags & MSG_DONTWAIT) { 306 if (flags & MSG_DONTWAIT) {
319 UDP6_INC_STATS_USER(UDP_MIB_INERRORS); 307 UDP6_INC_STATS_USER(UDP_MIB_INERRORS);
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 34b3bb868409..0dc519b40404 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink;
75static struct datalink_proto *p8023_datalink; 75static struct datalink_proto *p8023_datalink;
76static struct datalink_proto *pSNAP_datalink; 76static struct datalink_proto *pSNAP_datalink;
77 77
78static struct proto_ops ipx_dgram_ops; 78static const struct proto_ops ipx_dgram_ops;
79 79
80LIST_HEAD(ipx_interfaces); 80LIST_HEAD(ipx_interfaces);
81DEFINE_SPINLOCK(ipx_interfaces_lock); 81DEFINE_SPINLOCK(ipx_interfaces_lock);
@@ -1884,7 +1884,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1884 rc = -EINVAL; 1884 rc = -EINVAL;
1885 break; 1885 break;
1886 default: 1886 default:
1887 rc = dev_ioctl(cmd, argp); 1887 rc = -ENOIOCTLCMD;
1888 break; 1888 break;
1889 } 1889 }
1890 1890
@@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = {
1901 .owner = THIS_MODULE, 1901 .owner = THIS_MODULE,
1902}; 1902};
1903 1903
1904static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { 1904static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
1905 .family = PF_IPX, 1905 .family = PF_IPX,
1906 .owner = THIS_MODULE, 1906 .owner = THIS_MODULE,
1907 .release = ipx_release, 1907 .release = ipx_release,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 6f92f9c62990..fbfa96754417 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -62,12 +62,12 @@
62 62
63static int irda_create(struct socket *sock, int protocol); 63static int irda_create(struct socket *sock, int protocol);
64 64
65static struct proto_ops irda_stream_ops; 65static const struct proto_ops irda_stream_ops;
66static struct proto_ops irda_seqpacket_ops; 66static const struct proto_ops irda_seqpacket_ops;
67static struct proto_ops irda_dgram_ops; 67static const struct proto_ops irda_dgram_ops;
68 68
69#ifdef CONFIG_IRDA_ULTRA 69#ifdef CONFIG_IRDA_ULTRA
70static struct proto_ops irda_ultra_ops; 70static const struct proto_ops irda_ultra_ops;
71#define ULTRA_MAX_DATA 382 71#define ULTRA_MAX_DATA 382
72#endif /* CONFIG_IRDA_ULTRA */ 72#endif /* CONFIG_IRDA_ULTRA */
73 73
@@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
1438 /* 1438 /*
1439 * POSIX 1003.1g mandates this order. 1439 * POSIX 1003.1g mandates this order.
1440 */ 1440 */
1441 if (sk->sk_err) 1441 ret = sock_error(sk);
1442 ret = sock_error(sk); 1442 if (ret)
1443 break;
1443 else if (sk->sk_shutdown & RCV_SHUTDOWN) 1444 else if (sk->sk_shutdown & RCV_SHUTDOWN)
1444 ; 1445 ;
1445 else if (noblock) 1446 else if (noblock)
@@ -1821,7 +1822,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1821 return -EINVAL; 1822 return -EINVAL;
1822 default: 1823 default:
1823 IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__); 1824 IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__);
1824 return dev_ioctl(cmd, (void __user *) arg); 1825 return -ENOIOCTLCMD;
1825 } 1826 }
1826 1827
1827 /*NOTREACHED*/ 1828 /*NOTREACHED*/
@@ -2463,7 +2464,7 @@ static struct net_proto_family irda_family_ops = {
2463 .owner = THIS_MODULE, 2464 .owner = THIS_MODULE,
2464}; 2465};
2465 2466
2466static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { 2467static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
2467 .family = PF_IRDA, 2468 .family = PF_IRDA,
2468 .owner = THIS_MODULE, 2469 .owner = THIS_MODULE,
2469 .release = irda_release, 2470 .release = irda_release,
@@ -2484,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
2484 .sendpage = sock_no_sendpage, 2485 .sendpage = sock_no_sendpage,
2485}; 2486};
2486 2487
2487static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { 2488static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
2488 .family = PF_IRDA, 2489 .family = PF_IRDA,
2489 .owner = THIS_MODULE, 2490 .owner = THIS_MODULE,
2490 .release = irda_release, 2491 .release = irda_release,
@@ -2505,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
2505 .sendpage = sock_no_sendpage, 2506 .sendpage = sock_no_sendpage,
2506}; 2507};
2507 2508
2508static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { 2509static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
2509 .family = PF_IRDA, 2510 .family = PF_IRDA,
2510 .owner = THIS_MODULE, 2511 .owner = THIS_MODULE,
2511 .release = irda_release, 2512 .release = irda_release,
@@ -2527,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
2527}; 2528};
2528 2529
2529#ifdef CONFIG_IRDA_ULTRA 2530#ifdef CONFIG_IRDA_ULTRA
2530static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { 2531static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
2531 .family = PF_IRDA, 2532 .family = PF_IRDA,
2532 .owner = THIS_MODULE, 2533 .owner = THIS_MODULE,
2533 .release = irda_release, 2534 .release = irda_release,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 39031684b65c..52efd04cbedb 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void)
113} 113}
114 114
115 115
116static struct proto_ops pfkey_ops; 116static const struct proto_ops pfkey_ops;
117 117
118static void pfkey_insert(struct sock *sk) 118static void pfkey_insert(struct sock *sk)
119{ 119{
@@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = {
336 [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), 336 [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
337 [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), 337 [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
338 [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), 338 [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address),
339 [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx),
339}; 340};
340 341
341/* Verify sadb_address_{len,prefixlen} against sa_family. */ 342/* Verify sadb_address_{len,prefixlen} against sa_family. */
@@ -383,6 +384,55 @@ static int verify_address_len(void *p)
383 return 0; 384 return 0;
384} 385}
385 386
387static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx)
388{
389 int len = 0;
390
391 len += sizeof(struct sadb_x_sec_ctx);
392 len += sec_ctx->sadb_x_ctx_len;
393 len += sizeof(uint64_t) - 1;
394 len /= sizeof(uint64_t);
395
396 return len;
397}
398
399static inline int verify_sec_ctx_len(void *p)
400{
401 struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p;
402 int len;
403
404 if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE)
405 return -EINVAL;
406
407 len = pfkey_sec_ctx_len(sec_ctx);
408
409 if (sec_ctx->sadb_x_sec_len != len)
410 return -EINVAL;
411
412 return 0;
413}
414
415static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx)
416{
417 struct xfrm_user_sec_ctx *uctx = NULL;
418 int ctx_size = sec_ctx->sadb_x_ctx_len;
419
420 uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL);
421
422 if (!uctx)
423 return NULL;
424
425 uctx->len = pfkey_sec_ctx_len(sec_ctx);
426 uctx->exttype = sec_ctx->sadb_x_sec_exttype;
427 uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi;
428 uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg;
429 uctx->ctx_len = sec_ctx->sadb_x_ctx_len;
430 memcpy(uctx + 1, sec_ctx + 1,
431 uctx->ctx_len);
432
433 return uctx;
434}
435
386static int present_and_same_family(struct sadb_address *src, 436static int present_and_same_family(struct sadb_address *src,
387 struct sadb_address *dst) 437 struct sadb_address *dst)
388{ 438{
@@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h
438 if (verify_address_len(p)) 488 if (verify_address_len(p))
439 return -EINVAL; 489 return -EINVAL;
440 } 490 }
491 if (ext_type == SADB_X_EXT_SEC_CTX) {
492 if (verify_sec_ctx_len(p))
493 return -EINVAL;
494 }
441 ext_hdrs[ext_type-1] = p; 495 ext_hdrs[ext_type-1] = p;
442 } 496 }
443 p += ext_len; 497 p += ext_len;
@@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
586 struct sadb_key *key; 640 struct sadb_key *key;
587 struct sadb_x_sa2 *sa2; 641 struct sadb_x_sa2 *sa2;
588 struct sockaddr_in *sin; 642 struct sockaddr_in *sin;
643 struct sadb_x_sec_ctx *sec_ctx;
644 struct xfrm_sec_ctx *xfrm_ctx;
645 int ctx_size = 0;
589#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 646#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
590 struct sockaddr_in6 *sin6; 647 struct sockaddr_in6 *sin6;
591#endif 648#endif
@@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
609 sizeof(struct sadb_address)*2 + 666 sizeof(struct sadb_address)*2 +
610 sockaddr_size*2 + 667 sockaddr_size*2 +
611 sizeof(struct sadb_x_sa2); 668 sizeof(struct sadb_x_sa2);
669
670 if ((xfrm_ctx = x->security)) {
671 ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
672 size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
673 }
674
612 /* identity & sensitivity */ 675 /* identity & sensitivity */
613 676
614 if ((x->props.family == AF_INET && 677 if ((x->props.family == AF_INET &&
@@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
899 n_port->sadb_x_nat_t_port_reserved = 0; 962 n_port->sadb_x_nat_t_port_reserved = 0;
900 } 963 }
901 964
965 /* security context */
966 if (xfrm_ctx) {
967 sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
968 sizeof(struct sadb_x_sec_ctx) + ctx_size);
969 sec_ctx->sadb_x_sec_len =
970 (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
971 sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
972 sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
973 sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
974 sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
975 memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
976 xfrm_ctx->ctx_len);
977 }
978
902 return skb; 979 return skb;
903} 980}
904 981
@@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
909 struct sadb_lifetime *lifetime; 986 struct sadb_lifetime *lifetime;
910 struct sadb_sa *sa; 987 struct sadb_sa *sa;
911 struct sadb_key *key; 988 struct sadb_key *key;
989 struct sadb_x_sec_ctx *sec_ctx;
912 uint16_t proto; 990 uint16_t proto;
913 int err; 991 int err;
914 992
@@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
993 x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; 1071 x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
994 x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; 1072 x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
995 } 1073 }
1074
1075 sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
1076 if (sec_ctx != NULL) {
1077 struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
1078
1079 if (!uctx)
1080 goto out;
1081
1082 err = security_xfrm_state_alloc(x, uctx);
1083 kfree(uctx);
1084
1085 if (err)
1086 goto out;
1087 }
1088
996 key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; 1089 key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
997 if (sa->sadb_sa_auth) { 1090 if (sa->sadb_sa_auth) {
998 int keysize = 0; 1091 int keysize = 0;
@@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
1720 return 0; 1813 return 0;
1721} 1814}
1722 1815
1816static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp)
1817{
1818 struct xfrm_sec_ctx *xfrm_ctx = xp->security;
1819
1820 if (xfrm_ctx) {
1821 int len = sizeof(struct sadb_x_sec_ctx);
1822 len += xfrm_ctx->ctx_len;
1823 return PFKEY_ALIGN8(len);
1824 }
1825 return 0;
1826}
1827
1723static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) 1828static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
1724{ 1829{
1725 int sockaddr_size = pfkey_sockaddr_size(xp->family); 1830 int sockaddr_size = pfkey_sockaddr_size(xp->family);
@@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
1733 (sockaddr_size * 2) + 1838 (sockaddr_size * 2) +
1734 sizeof(struct sadb_x_policy) + 1839 sizeof(struct sadb_x_policy) +
1735 (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) + 1840 (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) +
1736 (socklen * 2))); 1841 (socklen * 2))) +
1842 pfkey_xfrm_policy2sec_ctx_size(xp);
1737} 1843}
1738 1844
1739static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp) 1845static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
@@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
1757 struct sadb_lifetime *lifetime; 1863 struct sadb_lifetime *lifetime;
1758 struct sadb_x_policy *pol; 1864 struct sadb_x_policy *pol;
1759 struct sockaddr_in *sin; 1865 struct sockaddr_in *sin;
1866 struct sadb_x_sec_ctx *sec_ctx;
1867 struct xfrm_sec_ctx *xfrm_ctx;
1760#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1868#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1761 struct sockaddr_in6 *sin6; 1869 struct sockaddr_in6 *sin6;
1762#endif 1870#endif
@@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
1941 } 2049 }
1942 } 2050 }
1943 } 2051 }
2052
2053 /* security context */
2054 if ((xfrm_ctx = xp->security)) {
2055 int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp);
2056
2057 sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size);
2058 sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t);
2059 sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
2060 sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
2061 sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
2062 sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
2063 memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
2064 xfrm_ctx->ctx_len);
2065 }
2066
1944 hdr->sadb_msg_len = size / sizeof(uint64_t); 2067 hdr->sadb_msg_len = size / sizeof(uint64_t);
1945 hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); 2068 hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
1946} 2069}
@@ -1976,12 +2099,13 @@ out:
1976 2099
1977static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 2100static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1978{ 2101{
1979 int err; 2102 int err = 0;
1980 struct sadb_lifetime *lifetime; 2103 struct sadb_lifetime *lifetime;
1981 struct sadb_address *sa; 2104 struct sadb_address *sa;
1982 struct sadb_x_policy *pol; 2105 struct sadb_x_policy *pol;
1983 struct xfrm_policy *xp; 2106 struct xfrm_policy *xp;
1984 struct km_event c; 2107 struct km_event c;
2108 struct sadb_x_sec_ctx *sec_ctx;
1985 2109
1986 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 2110 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
1987 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || 2111 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2028 if (xp->selector.dport) 2152 if (xp->selector.dport)
2029 xp->selector.dport_mask = ~0; 2153 xp->selector.dport_mask = ~0;
2030 2154
2155 sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
2156 if (sec_ctx != NULL) {
2157 struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
2158
2159 if (!uctx) {
2160 err = -ENOBUFS;
2161 goto out;
2162 }
2163
2164 err = security_xfrm_policy_alloc(xp, uctx);
2165 kfree(uctx);
2166
2167 if (err)
2168 goto out;
2169 }
2170
2031 xp->lft.soft_byte_limit = XFRM_INF; 2171 xp->lft.soft_byte_limit = XFRM_INF;
2032 xp->lft.hard_byte_limit = XFRM_INF; 2172 xp->lft.hard_byte_limit = XFRM_INF;
2033 xp->lft.soft_packet_limit = XFRM_INF; 2173 xp->lft.soft_packet_limit = XFRM_INF;
@@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2051 2191
2052 err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, 2192 err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
2053 hdr->sadb_msg_type != SADB_X_SPDUPDATE); 2193 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
2054 if (err) { 2194
2055 kfree(xp); 2195 if (err)
2056 return err; 2196 goto out;
2057 }
2058 2197
2059 if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) 2198 if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
2060 c.event = XFRM_MSG_UPDPOLICY; 2199 c.event = XFRM_MSG_UPDPOLICY;
@@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2069 return 0; 2208 return 0;
2070 2209
2071out: 2210out:
2211 security_xfrm_policy_free(xp);
2072 kfree(xp); 2212 kfree(xp);
2073 return err; 2213 return err;
2074} 2214}
@@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
2078 int err; 2218 int err;
2079 struct sadb_address *sa; 2219 struct sadb_address *sa;
2080 struct sadb_x_policy *pol; 2220 struct sadb_x_policy *pol;
2081 struct xfrm_policy *xp; 2221 struct xfrm_policy *xp, tmp;
2082 struct xfrm_selector sel; 2222 struct xfrm_selector sel;
2083 struct km_event c; 2223 struct km_event c;
2224 struct sadb_x_sec_ctx *sec_ctx;
2084 2225
2085 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 2226 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
2086 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || 2227 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
2109 if (sel.dport) 2250 if (sel.dport)
2110 sel.dport_mask = ~0; 2251 sel.dport_mask = ~0;
2111 2252
2112 xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1); 2253 sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
2254 memset(&tmp, 0, sizeof(struct xfrm_policy));
2255
2256 if (sec_ctx != NULL) {
2257 struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
2258
2259 if (!uctx)
2260 return -ENOMEM;
2261
2262 err = security_xfrm_policy_alloc(&tmp, uctx);
2263 kfree(uctx);
2264
2265 if (err)
2266 return err;
2267 }
2268
2269 xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1);
2270 security_xfrm_policy_free(&tmp);
2113 if (xp == NULL) 2271 if (xp == NULL)
2114 return -ENOENT; 2272 return -ENOENT;
2115 2273
@@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
2660{ 2818{
2661 struct xfrm_policy *xp; 2819 struct xfrm_policy *xp;
2662 struct sadb_x_policy *pol = (struct sadb_x_policy*)data; 2820 struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
2821 struct sadb_x_sec_ctx *sec_ctx;
2663 2822
2664 switch (family) { 2823 switch (family) {
2665 case AF_INET: 2824 case AF_INET:
@@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
2709 (*dir = parse_ipsecrequests(xp, pol)) < 0) 2868 (*dir = parse_ipsecrequests(xp, pol)) < 0)
2710 goto out; 2869 goto out;
2711 2870
2871 /* security context too */
2872 if (len >= (pol->sadb_x_policy_len*8 +
2873 sizeof(struct sadb_x_sec_ctx))) {
2874 char *p = (char *)pol;
2875 struct xfrm_user_sec_ctx *uctx;
2876
2877 p += pol->sadb_x_policy_len*8;
2878 sec_ctx = (struct sadb_x_sec_ctx *)p;
2879 if (len < pol->sadb_x_policy_len*8 +
2880 sec_ctx->sadb_x_sec_len)
2881 goto out;
2882 if ((*dir = verify_sec_ctx_len(p)))
2883 goto out;
2884 uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
2885 *dir = security_xfrm_policy_alloc(xp, uctx);
2886 kfree(uctx);
2887
2888 if (*dir)
2889 goto out;
2890 }
2891
2712 *dir = pol->sadb_x_policy_dir-1; 2892 *dir = pol->sadb_x_policy_dir-1;
2713 return xp; 2893 return xp;
2714 2894
2715out: 2895out:
2896 security_xfrm_policy_free(xp);
2716 kfree(xp); 2897 kfree(xp);
2717 return NULL; 2898 return NULL;
2718} 2899}
@@ -2946,7 +3127,7 @@ out:
2946 return err; 3127 return err;
2947} 3128}
2948 3129
2949static struct proto_ops pfkey_ops = { 3130static const struct proto_ops pfkey_ops = {
2950 .family = PF_KEY, 3131 .family = PF_KEY,
2951 .owner = THIS_MODULE, 3132 .owner = THIS_MODULE,
2952 /* Operations that make no sense on pfkey sockets. */ 3133 /* Operations that make no sense on pfkey sockets. */
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index c3f0b0783453..8171c53bc0ed 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -36,7 +36,7 @@
36static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; 36static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
37static u16 llc_ui_sap_link_no_max[256]; 37static u16 llc_ui_sap_link_no_max[256];
38static struct sockaddr_llc llc_ui_addrnull; 38static struct sockaddr_llc llc_ui_addrnull;
39static struct proto_ops llc_ui_ops; 39static const struct proto_ops llc_ui_ops;
40 40
41static int llc_ui_wait_for_conn(struct sock *sk, long timeout); 41static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
42static int llc_ui_wait_for_disc(struct sock *sk, long timeout); 42static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
@@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo)
566 /* 566 /*
567 * POSIX 1003.1g mandates this order. 567 * POSIX 1003.1g mandates this order.
568 */ 568 */
569 if (sk->sk_err) { 569 rc = sock_error(sk);
570 rc = sock_error(sk); 570 if (rc)
571 break; 571 break;
572 }
573 rc = 0; 572 rc = 0;
574 if (sk->sk_shutdown & RCV_SHUTDOWN) 573 if (sk->sk_shutdown & RCV_SHUTDOWN)
575 break; 574 break;
@@ -960,7 +959,7 @@ out:
960static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, 959static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
961 unsigned long arg) 960 unsigned long arg)
962{ 961{
963 return dev_ioctl(cmd, (void __user *)arg); 962 return -ENOIOCTLCMD;
964} 963}
965 964
966/** 965/**
@@ -1099,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = {
1099 .owner = THIS_MODULE, 1098 .owner = THIS_MODULE,
1100}; 1099};
1101 1100
1102static struct proto_ops llc_ui_ops = { 1101static const struct proto_ops llc_ui_ops = {
1103 .family = PF_LLC, 1102 .family = PF_LLC,
1104 .owner = THIS_MODULE, 1103 .owner = THIS_MODULE,
1105 .release = llc_ui_release, 1104 .release = llc_ui_release,
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cba63729313d..e10512e229b6 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid)
151 goto out_unlock; 151 goto out_unlock;
152 152
153 INIT_HLIST_NODE(&inst->hlist); 153 INIT_HLIST_NODE(&inst->hlist);
154 inst->lock = SPIN_LOCK_UNLOCKED; 154 spin_lock_init(&inst->lock);
155 /* needs to be two, since we _put() after creation */ 155 /* needs to be two, since we _put() after creation */
156 atomic_set(&inst->use, 2); 156 atomic_set(&inst->use, 2);
157 157
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f28460b61e47..55afdda3d940 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid)
148 atomic_set(&inst->id_sequence, 0); 148 atomic_set(&inst->id_sequence, 0);
149 /* needs to be two, since we _put() after creation */ 149 /* needs to be two, since we _put() after creation */
150 atomic_set(&inst->use, 2); 150 atomic_set(&inst->use, 2);
151 inst->lock = SPIN_LOCK_UNLOCKED; 151 spin_lock_init(&inst->lock);
152 INIT_LIST_HEAD(&inst->queue_list); 152 INIT_LIST_HEAD(&inst->queue_list);
153 153
154 if (!try_module_get(THIS_MODULE)) 154 if (!try_module_get(THIS_MODULE))
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 96020d7087e8..7849cac14d3a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len)
293 return 0; 293 return 0;
294} 294}
295 295
296static struct proto_ops netlink_ops; 296static const struct proto_ops netlink_ops;
297 297
298static int netlink_insert(struct sock *sk, u32 pid) 298static int netlink_insert(struct sock *sk, u32 pid)
299{ 299{
@@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb)
1656 return notifier_chain_unregister(&netlink_chain, nb); 1656 return notifier_chain_unregister(&netlink_chain, nb);
1657} 1657}
1658 1658
1659static struct proto_ops netlink_ops = { 1659static const struct proto_ops netlink_ops = {
1660 .family = PF_NETLINK, 1660 .family = PF_NETLINK,
1661 .owner = THIS_MODULE, 1661 .owner = THIS_MODULE,
1662 .release = netlink_release, 1662 .release = netlink_release,
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 287cfcc56951..3b1378498d50 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -441,7 +441,7 @@ errout:
441} 441}
442 442
443static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, 443static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
444 int seq, int cmd) 444 int seq, u8 cmd)
445{ 445{
446 struct sk_buff *skb; 446 struct sk_buff *skb;
447 int err; 447 int err;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index e5d82d711cae..63b0e4afeb33 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -63,7 +63,7 @@ static unsigned short circuit = 0x101;
63static HLIST_HEAD(nr_list); 63static HLIST_HEAD(nr_list);
64static DEFINE_SPINLOCK(nr_list_lock); 64static DEFINE_SPINLOCK(nr_list_lock);
65 65
66static struct proto_ops nr_proto_ops; 66static const struct proto_ops nr_proto_ops;
67 67
68/* 68/*
69 * Socket removal during an interrupt is now safe. 69 * Socket removal during an interrupt is now safe.
@@ -1166,10 +1166,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1166 void __user *argp = (void __user *)arg; 1166 void __user *argp = (void __user *)arg;
1167 int ret; 1167 int ret;
1168 1168
1169 lock_sock(sk);
1170 switch (cmd) { 1169 switch (cmd) {
1171 case TIOCOUTQ: { 1170 case TIOCOUTQ: {
1172 long amount; 1171 long amount;
1172
1173 lock_sock(sk);
1173 amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); 1174 amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
1174 if (amount < 0) 1175 if (amount < 0)
1175 amount = 0; 1176 amount = 0;
@@ -1180,6 +1181,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1180 case TIOCINQ: { 1181 case TIOCINQ: {
1181 struct sk_buff *skb; 1182 struct sk_buff *skb;
1182 long amount = 0L; 1183 long amount = 0L;
1184
1185 lock_sock(sk);
1183 /* These two are safe on a single CPU system as only user tasks fiddle here */ 1186 /* These two are safe on a single CPU system as only user tasks fiddle here */
1184 if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) 1187 if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
1185 amount = skb->len; 1188 amount = skb->len;
@@ -1188,6 +1191,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1188 } 1191 }
1189 1192
1190 case SIOCGSTAMP: 1193 case SIOCGSTAMP:
1194 lock_sock(sk);
1191 ret = sock_get_timestamp(sk, argp); 1195 ret = sock_get_timestamp(sk, argp);
1192 release_sock(sk); 1196 release_sock(sk);
1193 return ret; 1197 return ret;
@@ -1202,21 +1206,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1202 case SIOCSIFNETMASK: 1206 case SIOCSIFNETMASK:
1203 case SIOCGIFMETRIC: 1207 case SIOCGIFMETRIC:
1204 case SIOCSIFMETRIC: 1208 case SIOCSIFMETRIC:
1205 release_sock(sk);
1206 return -EINVAL; 1209 return -EINVAL;
1207 1210
1208 case SIOCADDRT: 1211 case SIOCADDRT:
1209 case SIOCDELRT: 1212 case SIOCDELRT:
1210 case SIOCNRDECOBS: 1213 case SIOCNRDECOBS:
1211 release_sock(sk);
1212 if (!capable(CAP_NET_ADMIN)) return -EPERM; 1214 if (!capable(CAP_NET_ADMIN)) return -EPERM;
1213 return nr_rt_ioctl(cmd, argp); 1215 return nr_rt_ioctl(cmd, argp);
1214 1216
1215 default: 1217 default:
1216 release_sock(sk); 1218 return -ENOIOCTLCMD;
1217 return dev_ioctl(cmd, argp);
1218 } 1219 }
1219 release_sock(sk);
1220 1220
1221 return 0; 1221 return 0;
1222} 1222}
@@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = {
1337 .owner = THIS_MODULE, 1337 .owner = THIS_MODULE,
1338}; 1338};
1339 1339
1340static struct proto_ops nr_proto_ops = { 1340static const struct proto_ops nr_proto_ops = {
1341 .family = PF_NETROM, 1341 .family = PF_NETROM,
1342 .owner = THIS_MODULE, 1342 .owner = THIS_MODULE,
1343 .release = nr_release, 1343 .release = nr_release,
diff --git a/net/nonet.c b/net/nonet.c
index e5241dceaa57..1230f0ae832e 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -14,11 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16 16
17void __init sock_init(void)
18{
19 printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n");
20}
21
22static int sock_no_open(struct inode *irrelevant, struct file *dontcare) 17static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
23{ 18{
24 return -ENXIO; 19 return -ENXIO;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3e2462760413..f69e5ed9bd06 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk)
251} 251}
252 252
253 253
254static struct proto_ops packet_ops; 254static const struct proto_ops packet_ops;
255 255
256#ifdef CONFIG_SOCK_PACKET 256#ifdef CONFIG_SOCK_PACKET
257static struct proto_ops packet_ops_spkt; 257static const struct proto_ops packet_ops_spkt;
258 258
259static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) 259static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
260{ 260{
@@ -1521,7 +1521,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
1521#endif 1521#endif
1522 1522
1523 default: 1523 default:
1524 return dev_ioctl(cmd, (void __user *)arg); 1524 return -ENOIOCTLCMD;
1525 } 1525 }
1526 return 0; 1526 return 0;
1527} 1527}
@@ -1784,7 +1784,7 @@ out:
1784 1784
1785 1785
1786#ifdef CONFIG_SOCK_PACKET 1786#ifdef CONFIG_SOCK_PACKET
1787static struct proto_ops packet_ops_spkt = { 1787static const struct proto_ops packet_ops_spkt = {
1788 .family = PF_PACKET, 1788 .family = PF_PACKET,
1789 .owner = THIS_MODULE, 1789 .owner = THIS_MODULE,
1790 .release = packet_release, 1790 .release = packet_release,
@@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = {
1806}; 1806};
1807#endif 1807#endif
1808 1808
1809static struct proto_ops packet_ops = { 1809static const struct proto_ops packet_ops = {
1810 .family = PF_PACKET, 1810 .family = PF_PACKET,
1811 .owner = THIS_MODULE, 1811 .owner = THIS_MODULE,
1812 .release = packet_release, 1812 .release = packet_release,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 829fdbc4400b..63090be2315a 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1320,7 +1320,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1320 return 0; 1320 return 0;
1321 1321
1322 default: 1322 default:
1323 return dev_ioctl(cmd, argp); 1323 return -ENOIOCTLCMD;
1324 } 1324 }
1325 1325
1326 return 0; 1326 return 0;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 82fb07aa06a5..ba5283204837 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,7 +25,7 @@
25 25
26#include <net/pkt_sched.h> 26#include <net/pkt_sched.h>
27 27
28#define VERSION "1.1" 28#define VERSION "1.2"
29 29
30/* Network Emulation Queuing algorithm. 30/* Network Emulation Queuing algorithm.
31 ==================================== 31 ====================================
@@ -65,11 +65,12 @@ struct netem_sched_data {
65 u32 jitter; 65 u32 jitter;
66 u32 duplicate; 66 u32 duplicate;
67 u32 reorder; 67 u32 reorder;
68 u32 corrupt;
68 69
69 struct crndstate { 70 struct crndstate {
70 unsigned long last; 71 unsigned long last;
71 unsigned long rho; 72 unsigned long rho;
72 } delay_cor, loss_cor, dup_cor, reorder_cor; 73 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
73 74
74 struct disttable { 75 struct disttable {
75 u32 size; 76 u32 size;
@@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
183 q->duplicate = dupsave; 184 q->duplicate = dupsave;
184 } 185 }
185 186
187 /*
188 * Randomized packet corruption.
189 * Make copy if needed since we are modifying
190 * If packet is going to be hardware checksummed, then
191 * do it now in software before we mangle it.
192 */
193 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
194 if (!(skb = skb_unshare(skb, GFP_ATOMIC))
195 || (skb->ip_summed == CHECKSUM_HW
196 && skb_checksum_help(skb, 0))) {
197 sch->qstats.drops++;
198 return NET_XMIT_DROP;
199 }
200
201 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
202 }
203
186 if (q->gap == 0 /* not doing reordering */ 204 if (q->gap == 0 /* not doing reordering */
187 || q->counter < q->gap /* inside last reordering gap */ 205 || q->counter < q->gap /* inside last reordering gap */
188 || q->reorder < get_crandom(&q->reorder_cor)) { 206 || q->reorder < get_crandom(&q->reorder_cor)) {
@@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
382 return 0; 400 return 0;
383} 401}
384 402
403static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
404{
405 struct netem_sched_data *q = qdisc_priv(sch);
406 const struct tc_netem_corrupt *r = RTA_DATA(attr);
407
408 if (RTA_PAYLOAD(attr) != sizeof(*r))
409 return -EINVAL;
410
411 q->corrupt = r->probability;
412 init_crandom(&q->corrupt_cor, r->correlation);
413 return 0;
414}
415
416/* Parse netlink message to set options */
385static int netem_change(struct Qdisc *sch, struct rtattr *opt) 417static int netem_change(struct Qdisc *sch, struct rtattr *opt)
386{ 418{
387 struct netem_sched_data *q = qdisc_priv(sch); 419 struct netem_sched_data *q = qdisc_priv(sch);
@@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
432 if (ret) 464 if (ret)
433 return ret; 465 return ret;
434 } 466 }
467
435 if (tb[TCA_NETEM_REORDER-1]) { 468 if (tb[TCA_NETEM_REORDER-1]) {
436 ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); 469 ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
437 if (ret) 470 if (ret)
438 return ret; 471 return ret;
439 } 472 }
440 }
441 473
474 if (tb[TCA_NETEM_CORRUPT-1]) {
475 ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
476 if (ret)
477 return ret;
478 }
479 }
442 480
443 return 0; 481 return 0;
444} 482}
@@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
564 struct tc_netem_qopt qopt; 602 struct tc_netem_qopt qopt;
565 struct tc_netem_corr cor; 603 struct tc_netem_corr cor;
566 struct tc_netem_reorder reorder; 604 struct tc_netem_reorder reorder;
605 struct tc_netem_corrupt corrupt;
567 606
568 qopt.latency = q->latency; 607 qopt.latency = q->latency;
569 qopt.jitter = q->jitter; 608 qopt.jitter = q->jitter;
@@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
582 reorder.correlation = q->reorder_cor.rho; 621 reorder.correlation = q->reorder_cor.rho;
583 RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); 622 RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
584 623
624 corrupt.probability = q->corrupt;
625 corrupt.correlation = q->corrupt_cor.rho;
626 RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
627
585 rta->rta_len = skb->tail - b; 628 rta->rta_len = skb->tail - b;
586 629
587 return skb->len; 630 return skb->len;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 6cf0342706b5..c4a2a8c4c339 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -22,6 +22,7 @@
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/if_arp.h>
25#include <linux/if_ether.h> 26#include <linux/if_ether.h>
26#include <linux/inet.h> 27#include <linux/inet.h>
27#include <linux/netdevice.h> 28#include <linux/netdevice.h>
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index dec68a604773..9d05e13e92f6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
110 asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; 110 asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000;
111 asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) 111 asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000)
112 * 1000; 112 * 1000;
113 asoc->pmtu = 0;
114 asoc->frag_point = 0; 113 asoc->frag_point = 0;
115 114
116 /* Set the association max_retrans and RTO values from the 115 /* Set the association max_retrans and RTO values from the
@@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
123 122
124 asoc->overall_error_count = 0; 123 asoc->overall_error_count = 0;
125 124
125 /* Initialize the association's heartbeat interval based on the
126 * sock configured value.
127 */
128 asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
129
130 /* Initialize path max retrans value. */
131 asoc->pathmaxrxt = sp->pathmaxrxt;
132
133 /* Initialize default path MTU. */
134 asoc->pathmtu = sp->pathmtu;
135
136 /* Set association default SACK delay */
137 asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
138
139 /* Set the association default flags controlling
140 * Heartbeat, SACK delay, and Path MTU Discovery.
141 */
142 asoc->param_flags = sp->param_flags;
143
126 /* Initialize the maximum mumber of new data packets that can be sent 144 /* Initialize the maximum mumber of new data packets that can be sent
127 * in a burst. 145 * in a burst.
128 */ 146 */
@@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
144 = 5 * asoc->rto_max; 162 = 5 * asoc->rto_max;
145 163
146 asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; 164 asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
147 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 165 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
148 SCTP_DEFAULT_TIMEOUT_SACK;
149 asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = 166 asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
150 sp->autoclose * HZ; 167 sp->autoclose * HZ;
151 168
@@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
540 557
541 sctp_transport_set_owner(peer, asoc); 558 sctp_transport_set_owner(peer, asoc);
542 559
560 /* Initialize the peer's heartbeat interval based on the
561 * association configured value.
562 */
563 peer->hbinterval = asoc->hbinterval;
564
565 /* Set the path max_retrans. */
566 peer->pathmaxrxt = asoc->pathmaxrxt;
567
568 /* Initialize the peer's SACK delay timeout based on the
569 * association configured value.
570 */
571 peer->sackdelay = asoc->sackdelay;
572
573 /* Enable/disable heartbeat, SACK delay, and path MTU discovery
574 * based on association setting.
575 */
576 peer->param_flags = asoc->param_flags;
577
543 /* Initialize the pmtu of the transport. */ 578 /* Initialize the pmtu of the transport. */
544 sctp_transport_pmtu(peer); 579 if (peer->param_flags & SPP_PMTUD_ENABLE)
580 sctp_transport_pmtu(peer);
581 else if (asoc->pathmtu)
582 peer->pathmtu = asoc->pathmtu;
583 else
584 peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
545 585
546 /* If this is the first transport addr on this association, 586 /* If this is the first transport addr on this association,
547 * initialize the association PMTU to the peer's PMTU. 587 * initialize the association PMTU to the peer's PMTU.
548 * If not and the current association PMTU is higher than the new 588 * If not and the current association PMTU is higher than the new
549 * peer's PMTU, reset the association PMTU to the new peer's PMTU. 589 * peer's PMTU, reset the association PMTU to the new peer's PMTU.
550 */ 590 */
551 if (asoc->pmtu) 591 if (asoc->pathmtu)
552 asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu); 592 asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
553 else 593 else
554 asoc->pmtu = peer->pmtu; 594 asoc->pathmtu = peer->pathmtu;
555 595
556 SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " 596 SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to "
557 "%d\n", asoc, asoc->pmtu); 597 "%d\n", asoc, asoc->pathmtu);
558 598
559 asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); 599 asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
560 600
561 /* The asoc->peer.port might not be meaningful yet, but 601 /* The asoc->peer.port might not be meaningful yet, but
562 * initialize the packet structure anyway. 602 * initialize the packet structure anyway.
@@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
574 * (for example, implementations MAY use the size of the 614 * (for example, implementations MAY use the size of the
575 * receiver advertised window). 615 * receiver advertised window).
576 */ 616 */
577 peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380)); 617 peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
578 618
579 /* At this point, we may not have the receiver's advertised window, 619 /* At this point, we may not have the receiver's advertised window,
580 * so initialize ssthresh to the default value and it will be set 620 * so initialize ssthresh to the default value and it will be set
@@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
585 peer->partial_bytes_acked = 0; 625 peer->partial_bytes_acked = 0;
586 peer->flight_size = 0; 626 peer->flight_size = 0;
587 627
588 /* By default, enable heartbeat for peer address. */
589 peer->hb_allowed = 1;
590
591 /* Initialize the peer's heartbeat interval based on the
592 * sock configured value.
593 */
594 peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval);
595
596 /* Set the path max_retrans. */
597 peer->max_retrans = sp->paddrparam.spp_pathmaxrxt;
598
599 /* Set the transport's RTO.initial value */ 628 /* Set the transport's RTO.initial value */
600 peer->rto = asoc->rto_initial; 629 peer->rto = asoc->rto_initial;
601 630
@@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
1155 /* Get the lowest pmtu of all the transports. */ 1184 /* Get the lowest pmtu of all the transports. */
1156 list_for_each(pos, &asoc->peer.transport_addr_list) { 1185 list_for_each(pos, &asoc->peer.transport_addr_list) {
1157 t = list_entry(pos, struct sctp_transport, transports); 1186 t = list_entry(pos, struct sctp_transport, transports);
1158 if (!pmtu || (t->pmtu < pmtu)) 1187 if (!pmtu || (t->pathmtu < pmtu))
1159 pmtu = t->pmtu; 1188 pmtu = t->pathmtu;
1160 } 1189 }
1161 1190
1162 if (pmtu) { 1191 if (pmtu) {
1163 struct sctp_sock *sp = sctp_sk(asoc->base.sk); 1192 struct sctp_sock *sp = sctp_sk(asoc->base.sk);
1164 asoc->pmtu = pmtu; 1193 asoc->pathmtu = pmtu;
1165 asoc->frag_point = sctp_frag_point(sp, pmtu); 1194 asoc->frag_point = sctp_frag_point(sp, pmtu);
1166 } 1195 }
1167 1196
1168 SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", 1197 SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n",
1169 __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point); 1198 __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point);
1170} 1199}
1171 1200
1172/* Should we send a SACK to update our peer? */ 1201/* Should we send a SACK to update our peer? */
@@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc)
1179 case SCTP_STATE_SHUTDOWN_SENT: 1208 case SCTP_STATE_SHUTDOWN_SENT:
1180 if ((asoc->rwnd > asoc->a_rwnd) && 1209 if ((asoc->rwnd > asoc->a_rwnd) &&
1181 ((asoc->rwnd - asoc->a_rwnd) >= 1210 ((asoc->rwnd - asoc->a_rwnd) >=
1182 min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu))) 1211 min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu)))
1183 return 1; 1212 return 1;
1184 break; 1213 break;
1185 default: 1214 default:
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b24ff2c1aef5..238f1bffa684 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
305void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, 305void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
306 struct sctp_transport *t, __u32 pmtu) 306 struct sctp_transport *t, __u32 pmtu)
307{ 307{
308 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { 308 if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu))
309 printk(KERN_WARNING "%s: Reported pmtu %d too low, " 309 return;
310 "using default minimum of %d\n", __FUNCTION__, pmtu,
311 SCTP_DEFAULT_MINSEGMENT);
312 pmtu = SCTP_DEFAULT_MINSEGMENT;
313 }
314 310
315 if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) { 311 if (t->param_flags & SPP_PMTUD_ENABLE) {
316 t->pmtu = pmtu; 312 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
313 printk(KERN_WARNING "%s: Reported pmtu %d too low, "
314 "using default minimum of %d\n",
315 __FUNCTION__, pmtu,
316 SCTP_DEFAULT_MINSEGMENT);
317 /* Use default minimum segment size and disable
318 * pmtu discovery on this transport.
319 */
320 t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
321 t->param_flags = (t->param_flags & ~SPP_HB) |
322 SPP_PMTUD_DISABLE;
323 } else {
324 t->pathmtu = pmtu;
325 }
326
327 /* Update association pmtu. */
317 sctp_assoc_sync_pmtu(asoc); 328 sctp_assoc_sync_pmtu(asoc);
318 sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
319 } 329 }
330
331 /* Retransmit with the new pmtu setting.
332 * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
333 * Needed will never be sent, but if a message was sent before
334 * PMTU discovery was disabled that was larger than the PMTU, it
335 * would not be fragmented, so it must be re-transmitted fragmented.
336 */
337 sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
320} 338}
321 339
322/* 340/*
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fa3be2b8fb5f..15c05165c905 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
866 return 2; 866 return 2;
867} 867}
868 868
869static struct proto_ops inet6_seqpacket_ops = { 869static const struct proto_ops inet6_seqpacket_ops = {
870 .family = PF_INET6, 870 .family = PF_INET6,
871 .owner = THIS_MODULE, 871 .owner = THIS_MODULE,
872 .release = inet6_release, 872 .release = inet6_release,
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 931371633464..a40991ef72c9 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet,
234 goto finish; 234 goto finish;
235 235
236 pmtu = ((packet->transport->asoc) ? 236 pmtu = ((packet->transport->asoc) ?
237 (packet->transport->asoc->pmtu) : 237 (packet->transport->asoc->pathmtu) :
238 (packet->transport->pmtu)); 238 (packet->transport->pathmtu));
239 239
240 too_big = (psize + chunk_len > pmtu); 240 too_big = (psize + chunk_len > pmtu);
241 241
@@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet)
482 if (!dst || (dst->obsolete > 1)) { 482 if (!dst || (dst->obsolete > 1)) {
483 dst_release(dst); 483 dst_release(dst);
484 sctp_transport_route(tp, NULL, sctp_sk(sk)); 484 sctp_transport_route(tp, NULL, sctp_sk(sk));
485 sctp_assoc_sync_pmtu(asoc); 485 if (asoc->param_flags & SPP_PMTUD_ENABLE) {
486 sctp_assoc_sync_pmtu(asoc);
487 }
486 } 488 }
487 489
488 nskb->dst = dst_clone(tp->dst); 490 nskb->dst = dst_clone(tp->dst);
@@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet)
492 SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", 494 SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n",
493 nskb->len); 495 nskb->len);
494 496
495 (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); 497 if (tp->param_flags & SPP_PMTUD_ENABLE)
498 (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok);
499 else
500 (*tp->af_specific->sctp_xmit)(nskb, tp, 1);
496 501
497out: 502out:
498 packet->size = packet->overhead; 503 packet->size = packet->overhead;
@@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
577 * if ((flightsize + Max.Burst * MTU) < cwnd) 582 * if ((flightsize + Max.Burst * MTU) < cwnd)
578 * cwnd = flightsize + Max.Burst * MTU 583 * cwnd = flightsize + Max.Burst * MTU
579 */ 584 */
580 max_burst_bytes = asoc->max_burst * asoc->pmtu; 585 max_burst_bytes = asoc->max_burst * asoc->pathmtu;
581 if ((transport->flight_size + max_burst_bytes) < transport->cwnd) { 586 if ((transport->flight_size + max_burst_bytes) < transport->cwnd) {
582 transport->cwnd = transport->flight_size + max_burst_bytes; 587 transport->cwnd = transport->flight_size + max_burst_bytes;
583 SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: " 588 SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: "
@@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
622 * data will fit or delay in hopes of bundling a full 627 * data will fit or delay in hopes of bundling a full
623 * sized packet. 628 * sized packet.
624 */ 629 */
625 if (len < asoc->pmtu - packet->overhead) { 630 if (len < asoc->pathmtu - packet->overhead) {
626 retval = SCTP_XMIT_NAGLE_DELAY; 631 retval = SCTP_XMIT_NAGLE_DELAY;
627 goto finish; 632 goto finish;
628 } 633 }
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f775d78aa59d..de693b43c8ea 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -54,6 +54,7 @@
54#include <net/protocol.h> 54#include <net/protocol.h>
55#include <net/ip.h> 55#include <net/ip.h>
56#include <net/ipv6.h> 56#include <net/ipv6.h>
57#include <net/route.h>
57#include <net/sctp/sctp.h> 58#include <net/sctp/sctp.h>
58#include <net/addrconf.h> 59#include <net/addrconf.h>
59#include <net/inet_common.h> 60#include <net/inet_common.h>
@@ -829,7 +830,7 @@ static struct notifier_block sctp_inetaddr_notifier = {
829}; 830};
830 831
831/* Socket operations. */ 832/* Socket operations. */
832static struct proto_ops inet_seqpacket_ops = { 833static const struct proto_ops inet_seqpacket_ops = {
833 .family = PF_INET, 834 .family = PF_INET,
834 .owner = THIS_MODULE, 835 .owner = THIS_MODULE,
835 .release = inet_release, /* Needs to be wrapped... */ 836 .release = inet_release, /* Needs to be wrapped... */
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 823947170a33..2d7d8a5db2ac 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
157{ 157{
158 __u32 ctsn, max_tsn_seen; 158 __u32 ctsn, max_tsn_seen;
159 struct sctp_chunk *sack; 159 struct sctp_chunk *sack;
160 struct sctp_transport *trans = asoc->peer.last_data_from;
160 int error = 0; 161 int error = 0;
161 162
162 if (force) 163 if (force ||
164 (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) ||
165 (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE)))
163 asoc->peer.sack_needed = 1; 166 asoc->peer.sack_needed = 1;
164 167
165 ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); 168 ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
@@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
189 if (!asoc->peer.sack_needed) { 192 if (!asoc->peer.sack_needed) {
190 /* We will need a SACK for the next packet. */ 193 /* We will need a SACK for the next packet. */
191 asoc->peer.sack_needed = 1; 194 asoc->peer.sack_needed = 1;
192 goto out; 195
196 /* Set the SACK delay timeout based on the
197 * SACK delay for the last transport
198 * data was received from, or the default
199 * for the association.
200 */
201 if (trans)
202 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
203 trans->sackdelay;
204 else
205 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
206 asoc->sackdelay;
207
208 /* Restart the SACK timer. */
209 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
210 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
193 } else { 211 } else {
194 if (asoc->a_rwnd > asoc->rwnd) 212 if (asoc->a_rwnd > asoc->rwnd)
195 asoc->a_rwnd = asoc->rwnd; 213 asoc->a_rwnd = asoc->rwnd;
@@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
205 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, 223 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
206 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); 224 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
207 } 225 }
208out: 226
209 return error; 227 return error;
210nomem: 228nomem:
211 error = -ENOMEM; 229 error = -ENOMEM;
@@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
415 asoc->overall_error_count++; 433 asoc->overall_error_count++;
416 434
417 if (transport->state != SCTP_INACTIVE && 435 if (transport->state != SCTP_INACTIVE &&
418 (transport->error_count++ >= transport->max_retrans)) { 436 (transport->error_count++ >= transport->pathmaxrxt)) {
419 SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p", 437 SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
420 " transport IP: port:%d failed.\n", 438 " transport IP: port:%d failed.\n",
421 asoc, 439 asoc,
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 475bfb4972d9..557a7d90b92a 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
900 * HEARTBEAT is sent (see Section 8.3). 900 * HEARTBEAT is sent (see Section 8.3).
901 */ 901 */
902 902
903 if (transport->hb_allowed) { 903 if (transport->param_flags & SPP_HB_ENABLE) {
904 if (SCTP_DISPOSITION_NOMEM == 904 if (SCTP_DISPOSITION_NOMEM ==
905 sctp_sf_heartbeat(ep, asoc, type, arg, 905 sctp_sf_heartbeat(ep, asoc, type, arg,
906 commands)) 906 commands))
@@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
1051 return SCTP_DISPOSITION_DISCARD; 1051 return SCTP_DISPOSITION_DISCARD;
1052 } 1052 }
1053 1053
1054 max_interval = link->hb_interval + link->rto; 1054 max_interval = link->hbinterval + link->rto;
1055 1055
1056 /* Check if the timestamp looks valid. */ 1056 /* Check if the timestamp looks valid. */
1057 if (time_after(hbinfo->sent_at, jiffies) || 1057 if (time_after(hbinfo->sent_at, jiffies) ||
@@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep,
2691 * document allow. However, an SCTP transmitter MUST NOT be 2691 * document allow. However, an SCTP transmitter MUST NOT be
2692 * more aggressive than the following algorithms allow. 2692 * more aggressive than the following algorithms allow.
2693 */ 2693 */
2694 if (chunk->end_of_packet) { 2694 if (chunk->end_of_packet)
2695 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); 2695 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
2696 2696
2697 /* Start the SACK timer. */
2698 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
2699 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
2700 }
2701
2702 return SCTP_DISPOSITION_CONSUME; 2697 return SCTP_DISPOSITION_CONSUME;
2703 2698
2704discard_force: 2699discard_force:
@@ -2721,13 +2716,9 @@ discard_force:
2721 return SCTP_DISPOSITION_DISCARD; 2716 return SCTP_DISPOSITION_DISCARD;
2722 2717
2723discard_noforce: 2718discard_noforce:
2724 if (chunk->end_of_packet) { 2719 if (chunk->end_of_packet)
2725 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); 2720 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
2726 2721
2727 /* Start the SACK timer. */
2728 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
2729 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
2730 }
2731 return SCTP_DISPOSITION_DISCARD; 2722 return SCTP_DISPOSITION_DISCARD;
2732consume: 2723consume:
2733 return SCTP_DISPOSITION_CONSUME; 2724 return SCTP_DISPOSITION_CONSUME;
@@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep,
3442 * send another. 3433 * send another.
3443 */ 3434 */
3444 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); 3435 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
3445 /* Start the SACK timer. */
3446 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
3447 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
3448 3436
3449 return SCTP_DISPOSITION_CONSUME; 3437 return SCTP_DISPOSITION_CONSUME;
3450 3438
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9df888e932c5..fc04d185fa33 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1941,107 +1941,379 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
1941 * address's parameters: 1941 * address's parameters:
1942 * 1942 *
1943 * struct sctp_paddrparams { 1943 * struct sctp_paddrparams {
1944 * sctp_assoc_t spp_assoc_id; 1944 * sctp_assoc_t spp_assoc_id;
1945 * struct sockaddr_storage spp_address; 1945 * struct sockaddr_storage spp_address;
1946 * uint32_t spp_hbinterval; 1946 * uint32_t spp_hbinterval;
1947 * uint16_t spp_pathmaxrxt; 1947 * uint16_t spp_pathmaxrxt;
1948 * }; 1948 * uint32_t spp_pathmtu;
1949 * 1949 * uint32_t spp_sackdelay;
1950 * spp_assoc_id - (UDP style socket) This is filled in the application, 1950 * uint32_t spp_flags;
1951 * and identifies the association for this query. 1951 * };
1952 *
1953 * spp_assoc_id - (one-to-many style socket) This is filled in the
1954 * application, and identifies the association for
1955 * this query.
1952 * spp_address - This specifies which address is of interest. 1956 * spp_address - This specifies which address is of interest.
1953 * spp_hbinterval - This contains the value of the heartbeat interval, 1957 * spp_hbinterval - This contains the value of the heartbeat interval,
1954 * in milliseconds. A value of 0, when modifying the 1958 * in milliseconds. If a value of zero
1955 * parameter, specifies that the heartbeat on this 1959 * is present in this field then no changes are to
1956 * address should be disabled. A value of UINT32_MAX 1960 * be made to this parameter.
1957 * (4294967295), when modifying the parameter,
1958 * specifies that a heartbeat should be sent
1959 * immediately to the peer address, and the current
1960 * interval should remain unchanged.
1961 * spp_pathmaxrxt - This contains the maximum number of 1961 * spp_pathmaxrxt - This contains the maximum number of
1962 * retransmissions before this address shall be 1962 * retransmissions before this address shall be
1963 * considered unreachable. 1963 * considered unreachable. If a value of zero
1964 * is present in this field then no changes are to
1965 * be made to this parameter.
1966 * spp_pathmtu - When Path MTU discovery is disabled the value
1967 * specified here will be the "fixed" path mtu.
1968 * Note that if the spp_address field is empty
1969 * then all associations on this address will
1970 * have this fixed path mtu set upon them.
1971 *
1972 * spp_sackdelay - When delayed sack is enabled, this value specifies
1973 * the number of milliseconds that sacks will be delayed
1974 * for. This value will apply to all addresses of an
1975 * association if the spp_address field is empty. Note
1976 * also, that if delayed sack is enabled and this
1977 * value is set to 0, no change is made to the last
1978 * recorded delayed sack timer value.
1979 *
1980 * spp_flags - These flags are used to control various features
1981 * on an association. The flag field may contain
1982 * zero or more of the following options.
1983 *
1984 * SPP_HB_ENABLE - Enable heartbeats on the
1985 * specified address. Note that if the address
1986 * field is empty all addresses for the association
1987 * have heartbeats enabled upon them.
1988 *
1989 * SPP_HB_DISABLE - Disable heartbeats on the
1990 * specified address. Note that if the address
1991 * field is empty all addresses for the association
1992 * will have their heartbeats disabled. Note also
1993 * that SPP_HB_ENABLE and SPP_HB_DISABLE are
1994 * mutually exclusive, only one of these two should
1995 * be specified. Enabling both fields will have
1996 * undetermined results.
1997 *
1998 * SPP_HB_DEMAND - Request a user initiated heartbeat
1999 * to be made immediately.
2000 *
2001 * SPP_PMTUD_ENABLE - This field will enable PMTU
2002 * discovery upon the specified address. Note that
2003 * if the address field is empty then all addresses
2004 * on the association are affected.
2005 *
2006 * SPP_PMTUD_DISABLE - This field will disable PMTU
2007 * discovery upon the specified address. Note that
2008 * if the address field is empty then all addresses
2009 * on the association are affected. Note also that
2010 * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
2011 * exclusive. Enabling both will have undetermined
2012 * results.
2013 *
2014 * SPP_SACKDELAY_ENABLE - Setting this flag turns
2015 * on delayed sack. The time specified in spp_sackdelay
2016 * is used to specify the sack delay for this address. Note
2017 * that if spp_address is empty then all addresses will
2018 * enable delayed sack and take on the sack delay
2019 * value specified in spp_sackdelay.
2020 * SPP_SACKDELAY_DISABLE - Setting this flag turns
2021 * off delayed sack. If the spp_address field is blank then
2022 * delayed sack is disabled for the entire association. Note
2023 * also that this field is mutually exclusive to
2024 * SPP_SACKDELAY_ENABLE, setting both will have undefined
2025 * results.
1964 */ 2026 */
2027int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2028 struct sctp_transport *trans,
2029 struct sctp_association *asoc,
2030 struct sctp_sock *sp,
2031 int hb_change,
2032 int pmtud_change,
2033 int sackdelay_change)
2034{
2035 int error;
2036
2037 if (params->spp_flags & SPP_HB_DEMAND && trans) {
2038 error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans);
2039 if (error)
2040 return error;
2041 }
2042
2043 if (params->spp_hbinterval) {
2044 if (trans) {
2045 trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
2046 } else if (asoc) {
2047 asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
2048 } else {
2049 sp->hbinterval = params->spp_hbinterval;
2050 }
2051 }
2052
2053 if (hb_change) {
2054 if (trans) {
2055 trans->param_flags =
2056 (trans->param_flags & ~SPP_HB) | hb_change;
2057 } else if (asoc) {
2058 asoc->param_flags =
2059 (asoc->param_flags & ~SPP_HB) | hb_change;
2060 } else {
2061 sp->param_flags =
2062 (sp->param_flags & ~SPP_HB) | hb_change;
2063 }
2064 }
2065
2066 if (params->spp_pathmtu) {
2067 if (trans) {
2068 trans->pathmtu = params->spp_pathmtu;
2069 sctp_assoc_sync_pmtu(asoc);
2070 } else if (asoc) {
2071 asoc->pathmtu = params->spp_pathmtu;
2072 sctp_frag_point(sp, params->spp_pathmtu);
2073 } else {
2074 sp->pathmtu = params->spp_pathmtu;
2075 }
2076 }
2077
2078 if (pmtud_change) {
2079 if (trans) {
2080 int update = (trans->param_flags & SPP_PMTUD_DISABLE) &&
2081 (params->spp_flags & SPP_PMTUD_ENABLE);
2082 trans->param_flags =
2083 (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
2084 if (update) {
2085 sctp_transport_pmtu(trans);
2086 sctp_assoc_sync_pmtu(asoc);
2087 }
2088 } else if (asoc) {
2089 asoc->param_flags =
2090 (asoc->param_flags & ~SPP_PMTUD) | pmtud_change;
2091 } else {
2092 sp->param_flags =
2093 (sp->param_flags & ~SPP_PMTUD) | pmtud_change;
2094 }
2095 }
2096
2097 if (params->spp_sackdelay) {
2098 if (trans) {
2099 trans->sackdelay =
2100 msecs_to_jiffies(params->spp_sackdelay);
2101 } else if (asoc) {
2102 asoc->sackdelay =
2103 msecs_to_jiffies(params->spp_sackdelay);
2104 } else {
2105 sp->sackdelay = params->spp_sackdelay;
2106 }
2107 }
2108
2109 if (sackdelay_change) {
2110 if (trans) {
2111 trans->param_flags =
2112 (trans->param_flags & ~SPP_SACKDELAY) |
2113 sackdelay_change;
2114 } else if (asoc) {
2115 asoc->param_flags =
2116 (asoc->param_flags & ~SPP_SACKDELAY) |
2117 sackdelay_change;
2118 } else {
2119 sp->param_flags =
2120 (sp->param_flags & ~SPP_SACKDELAY) |
2121 sackdelay_change;
2122 }
2123 }
2124
2125 if (params->spp_pathmaxrxt) {
2126 if (trans) {
2127 trans->pathmaxrxt = params->spp_pathmaxrxt;
2128 } else if (asoc) {
2129 asoc->pathmaxrxt = params->spp_pathmaxrxt;
2130 } else {
2131 sp->pathmaxrxt = params->spp_pathmaxrxt;
2132 }
2133 }
2134
2135 return 0;
2136}
2137
1965static int sctp_setsockopt_peer_addr_params(struct sock *sk, 2138static int sctp_setsockopt_peer_addr_params(struct sock *sk,
1966 char __user *optval, int optlen) 2139 char __user *optval, int optlen)
1967{ 2140{
1968 struct sctp_paddrparams params; 2141 struct sctp_paddrparams params;
1969 struct sctp_transport *trans; 2142 struct sctp_transport *trans = NULL;
2143 struct sctp_association *asoc = NULL;
2144 struct sctp_sock *sp = sctp_sk(sk);
1970 int error; 2145 int error;
2146 int hb_change, pmtud_change, sackdelay_change;
1971 2147
1972 if (optlen != sizeof(struct sctp_paddrparams)) 2148 if (optlen != sizeof(struct sctp_paddrparams))
1973 return -EINVAL; 2149 return - EINVAL;
2150
1974 if (copy_from_user(&params, optval, optlen)) 2151 if (copy_from_user(&params, optval, optlen))
1975 return -EFAULT; 2152 return -EFAULT;
1976 2153
1977 /* 2154 /* Validate flags and value parameters. */
1978 * API 7. Socket Options (setting the default value for the endpoint) 2155 hb_change = params.spp_flags & SPP_HB;
1979 * All options that support specific settings on an association by 2156 pmtud_change = params.spp_flags & SPP_PMTUD;
1980 * filling in either an association id variable or a sockaddr_storage 2157 sackdelay_change = params.spp_flags & SPP_SACKDELAY;
1981 * SHOULD also support setting of the same value for the entire endpoint 2158
1982 * (i.e. future associations). To accomplish this the following logic is 2159 if (hb_change == SPP_HB ||
1983 * used when setting one of these options: 2160 pmtud_change == SPP_PMTUD ||
1984 2161 sackdelay_change == SPP_SACKDELAY ||
1985 * c) If neither the sockaddr_storage or association identification is 2162 params.spp_sackdelay > 500 ||
1986 * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and 2163 (params.spp_pathmtu
1987 * the association identification is 0, the settings are a default 2164 && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT))
1988 * and to be applied to the endpoint (all future associations). 2165 return -EINVAL;
1989 */
1990 2166
1991 /* update default value for endpoint (all future associations) */ 2167 /* If an address other than INADDR_ANY is specified, and
1992 if (!params.spp_assoc_id && 2168 * no transport is found, then the request is invalid.
1993 sctp_is_any(( union sctp_addr *)&params.spp_address)) { 2169 */
1994 /* Manual heartbeat on an endpoint is invalid. */ 2170 if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
1995 if (0xffffffff == params.spp_hbinterval) 2171 trans = sctp_addr_id2transport(sk, &params.spp_address,
2172 params.spp_assoc_id);
2173 if (!trans)
1996 return -EINVAL; 2174 return -EINVAL;
1997 else if (params.spp_hbinterval)
1998 sctp_sk(sk)->paddrparam.spp_hbinterval =
1999 params.spp_hbinterval;
2000 if (params.spp_pathmaxrxt)
2001 sctp_sk(sk)->paddrparam.spp_pathmaxrxt =
2002 params.spp_pathmaxrxt;
2003 return 0;
2004 } 2175 }
2005 2176
2006 trans = sctp_addr_id2transport(sk, &params.spp_address, 2177 /* Get association, if assoc_id != 0 and the socket is a one
2007 params.spp_assoc_id); 2178 * to many style socket, and an association was not found, then
2008 if (!trans) 2179 * the id was invalid.
2180 */
2181 asoc = sctp_id2assoc(sk, params.spp_assoc_id);
2182 if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP))
2009 return -EINVAL; 2183 return -EINVAL;
2010 2184
2011 /* Applications can enable or disable heartbeats for any peer address 2185 /* Heartbeat demand can only be sent on a transport or
2012 * of an association, modify an address's heartbeat interval, force a 2186 * association, but not a socket.
2013 * heartbeat to be sent immediately, and adjust the address's maximum
2014 * number of retransmissions sent before an address is considered
2015 * unreachable.
2016 *
2017 * The value of the heartbeat interval, in milliseconds. A value of
2018 * UINT32_MAX (4294967295), when modifying the parameter, specifies
2019 * that a heartbeat should be sent immediately to the peer address,
2020 * and the current interval should remain unchanged.
2021 */ 2187 */
2022 if (0xffffffff == params.spp_hbinterval) { 2188 if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc)
2023 error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); 2189 return -EINVAL;
2024 if (error) 2190
2025 return error; 2191 /* Process parameters. */
2026 } else { 2192 error = sctp_apply_peer_addr_params(&params, trans, asoc, sp,
2027 /* The value of the heartbeat interval, in milliseconds. A value of 0, 2193 hb_change, pmtud_change,
2028 * when modifying the parameter, specifies that the heartbeat on this 2194 sackdelay_change);
2029 * address should be disabled. 2195
2196 if (error)
2197 return error;
2198
2199 /* If changes are for association, also apply parameters to each
2200 * transport.
2030 */ 2201 */
2031 if (params.spp_hbinterval) { 2202 if (!trans && asoc) {
2032 trans->hb_allowed = 1; 2203 struct list_head *pos;
2033 trans->hb_interval = 2204
2034 msecs_to_jiffies(params.spp_hbinterval); 2205 list_for_each(pos, &asoc->peer.transport_addr_list) {
2035 } else 2206 trans = list_entry(pos, struct sctp_transport,
2036 trans->hb_allowed = 0; 2207 transports);
2208 sctp_apply_peer_addr_params(&params, trans, asoc, sp,
2209 hb_change, pmtud_change,
2210 sackdelay_change);
2211 }
2037 } 2212 }
2038 2213
2039 /* spp_pathmaxrxt contains the maximum number of retransmissions 2214 return 0;
2040 * before this address shall be considered unreachable. 2215}
2041 */ 2216
2042 if (params.spp_pathmaxrxt) 2217/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
2043 trans->max_retrans = params.spp_pathmaxrxt; 2218 *
2219 * This option will get or set the delayed ack timer. The time is set
2220 * in milliseconds. If the assoc_id is 0, then this sets or gets the
2221 * endpoint's default delayed ack timer value. If the assoc_id field is
2222 * non-zero, then the set or get affects the specified association.
2223 *
2224 * struct sctp_assoc_value {
2225 * sctp_assoc_t assoc_id;
2226 * uint32_t assoc_value;
2227 * };
2228 *
2229 * assoc_id - This parameter indicates which association the
2230 * user is performing an action upon. Note that if
2231 * this field's value is zero then the endpoint's
2232 * default value is changed (affecting future
2233 * associations only).
2234 *
2235 * assoc_value - This parameter contains the number of milliseconds
2236 * that the user is requesting the delayed ACK timer
2237 * be set to. Note that this value is defined in
2238 * the standard to be between 200 and 500 milliseconds.
2239 *
2240 * Note: a value of zero will leave the value alone,
2241 * but disable SACK delay. A non-zero value will also
2242 * enable SACK delay.
2243 */
2044 2244
2245static int sctp_setsockopt_delayed_ack_time(struct sock *sk,
2246 char __user *optval, int optlen)
2247{
2248 struct sctp_assoc_value params;
2249 struct sctp_transport *trans = NULL;
2250 struct sctp_association *asoc = NULL;
2251 struct sctp_sock *sp = sctp_sk(sk);
2252
2253 if (optlen != sizeof(struct sctp_assoc_value))
2254 return - EINVAL;
2255
2256 if (copy_from_user(&params, optval, optlen))
2257 return -EFAULT;
2258
2259 /* Validate value parameter. */
2260 if (params.assoc_value > 500)
2261 return -EINVAL;
2262
2263 /* Get association, if assoc_id != 0 and the socket is a one
2264 * to many style socket, and an association was not found, then
2265 * the id was invalid.
2266 */
2267 asoc = sctp_id2assoc(sk, params.assoc_id);
2268 if (!asoc && params.assoc_id && sctp_style(sk, UDP))
2269 return -EINVAL;
2270
2271 if (params.assoc_value) {
2272 if (asoc) {
2273 asoc->sackdelay =
2274 msecs_to_jiffies(params.assoc_value);
2275 asoc->param_flags =
2276 (asoc->param_flags & ~SPP_SACKDELAY) |
2277 SPP_SACKDELAY_ENABLE;
2278 } else {
2279 sp->sackdelay = params.assoc_value;
2280 sp->param_flags =
2281 (sp->param_flags & ~SPP_SACKDELAY) |
2282 SPP_SACKDELAY_ENABLE;
2283 }
2284 } else {
2285 if (asoc) {
2286 asoc->param_flags =
2287 (asoc->param_flags & ~SPP_SACKDELAY) |
2288 SPP_SACKDELAY_DISABLE;
2289 } else {
2290 sp->param_flags =
2291 (sp->param_flags & ~SPP_SACKDELAY) |
2292 SPP_SACKDELAY_DISABLE;
2293 }
2294 }
2295
2296 /* If change is for association, also apply to each transport. */
2297 if (asoc) {
2298 struct list_head *pos;
2299
2300 list_for_each(pos, &asoc->peer.transport_addr_list) {
2301 trans = list_entry(pos, struct sctp_transport,
2302 transports);
2303 if (params.assoc_value) {
2304 trans->sackdelay =
2305 msecs_to_jiffies(params.assoc_value);
2306 trans->param_flags =
2307 (trans->param_flags & ~SPP_SACKDELAY) |
2308 SPP_SACKDELAY_ENABLE;
2309 } else {
2310 trans->param_flags =
2311 (trans->param_flags & ~SPP_SACKDELAY) |
2312 SPP_SACKDELAY_DISABLE;
2313 }
2314 }
2315 }
2316
2045 return 0; 2317 return 0;
2046} 2318}
2047 2319
@@ -2334,7 +2606,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl
2334 /* Update the frag_point of the existing associations. */ 2606 /* Update the frag_point of the existing associations. */
2335 list_for_each(pos, &(sp->ep->asocs)) { 2607 list_for_each(pos, &(sp->ep->asocs)) {
2336 asoc = list_entry(pos, struct sctp_association, asocs); 2608 asoc = list_entry(pos, struct sctp_association, asocs);
2337 asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); 2609 asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
2338 } 2610 }
2339 2611
2340 return 0; 2612 return 0;
@@ -2491,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
2491 retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); 2763 retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
2492 break; 2764 break;
2493 2765
2766 case SCTP_DELAYED_ACK_TIME:
2767 retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen);
2768 break;
2769
2494 case SCTP_INITMSG: 2770 case SCTP_INITMSG:
2495 retval = sctp_setsockopt_initmsg(sk, optval, optlen); 2771 retval = sctp_setsockopt_initmsg(sk, optval, optlen);
2496 break; 2772 break;
@@ -2715,8 +2991,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
2715 /* Default Peer Address Parameters. These defaults can 2991 /* Default Peer Address Parameters. These defaults can
2716 * be modified via SCTP_PEER_ADDR_PARAMS 2992 * be modified via SCTP_PEER_ADDR_PARAMS
2717 */ 2993 */
2718 sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval); 2994 sp->hbinterval = jiffies_to_msecs(sctp_hb_interval);
2719 sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path; 2995 sp->pathmaxrxt = sctp_max_retrans_path;
2996 sp->pathmtu = 0; // allow default discovery
2997 sp->sackdelay = sctp_sack_timeout;
2998 sp->param_flags = SPP_HB_ENABLE |
2999 SPP_PMTUD_ENABLE |
3000 SPP_SACKDELAY_ENABLE;
2720 3001
2721 /* If enabled no SCTP message fragmentation will be performed. 3002 /* If enabled no SCTP message fragmentation will be performed.
2722 * Configure through SCTP_DISABLE_FRAGMENTS socket option. 3003 * Configure through SCTP_DISABLE_FRAGMENTS socket option.
@@ -2865,7 +3146,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
2865 status.sstat_primary.spinfo_cwnd = transport->cwnd; 3146 status.sstat_primary.spinfo_cwnd = transport->cwnd;
2866 status.sstat_primary.spinfo_srtt = transport->srtt; 3147 status.sstat_primary.spinfo_srtt = transport->srtt;
2867 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); 3148 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
2868 status.sstat_primary.spinfo_mtu = transport->pmtu; 3149 status.sstat_primary.spinfo_mtu = transport->pathmtu;
2869 3150
2870 if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) 3151 if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
2871 status.sstat_primary.spinfo_state = SCTP_ACTIVE; 3152 status.sstat_primary.spinfo_state = SCTP_ACTIVE;
@@ -2924,7 +3205,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
2924 pinfo.spinfo_cwnd = transport->cwnd; 3205 pinfo.spinfo_cwnd = transport->cwnd;
2925 pinfo.spinfo_srtt = transport->srtt; 3206 pinfo.spinfo_srtt = transport->srtt;
2926 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); 3207 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
2927 pinfo.spinfo_mtu = transport->pmtu; 3208 pinfo.spinfo_mtu = transport->pathmtu;
2928 3209
2929 if (pinfo.spinfo_state == SCTP_UNKNOWN) 3210 if (pinfo.spinfo_state == SCTP_UNKNOWN)
2930 pinfo.spinfo_state = SCTP_ACTIVE; 3211 pinfo.spinfo_state = SCTP_ACTIVE;
@@ -3086,69 +3367,227 @@ out:
3086 * address's parameters: 3367 * address's parameters:
3087 * 3368 *
3088 * struct sctp_paddrparams { 3369 * struct sctp_paddrparams {
3089 * sctp_assoc_t spp_assoc_id; 3370 * sctp_assoc_t spp_assoc_id;
3090 * struct sockaddr_storage spp_address; 3371 * struct sockaddr_storage spp_address;
3091 * uint32_t spp_hbinterval; 3372 * uint32_t spp_hbinterval;
3092 * uint16_t spp_pathmaxrxt; 3373 * uint16_t spp_pathmaxrxt;
3093 * }; 3374 * uint32_t spp_pathmtu;
3094 * 3375 * uint32_t spp_sackdelay;
3095 * spp_assoc_id - (UDP style socket) This is filled in the application, 3376 * uint32_t spp_flags;
3096 * and identifies the association for this query. 3377 * };
3378 *
3379 * spp_assoc_id - (one-to-many style socket) This is filled in the
3380 * application, and identifies the association for
3381 * this query.
3097 * spp_address - This specifies which address is of interest. 3382 * spp_address - This specifies which address is of interest.
3098 * spp_hbinterval - This contains the value of the heartbeat interval, 3383 * spp_hbinterval - This contains the value of the heartbeat interval,
3099 * in milliseconds. A value of 0, when modifying the 3384 * in milliseconds. If a value of zero
3100 * parameter, specifies that the heartbeat on this 3385 * is present in this field then no changes are to
3101 * address should be disabled. A value of UINT32_MAX 3386 * be made to this parameter.
3102 * (4294967295), when modifying the parameter,
3103 * specifies that a heartbeat should be sent
3104 * immediately to the peer address, and the current
3105 * interval should remain unchanged.
3106 * spp_pathmaxrxt - This contains the maximum number of 3387 * spp_pathmaxrxt - This contains the maximum number of
3107 * retransmissions before this address shall be 3388 * retransmissions before this address shall be
3108 * considered unreachable. 3389 * considered unreachable. If a value of zero
3390 * is present in this field then no changes are to
3391 * be made to this parameter.
3392 * spp_pathmtu - When Path MTU discovery is disabled the value
3393 * specified here will be the "fixed" path mtu.
3394 * Note that if the spp_address field is empty
3395 * then all associations on this address will
3396 * have this fixed path mtu set upon them.
3397 *
3398 * spp_sackdelay - When delayed sack is enabled, this value specifies
3399 * the number of milliseconds that sacks will be delayed
3400 * for. This value will apply to all addresses of an
3401 * association if the spp_address field is empty. Note
3402 * also, that if delayed sack is enabled and this
3403 * value is set to 0, no change is made to the last
3404 * recorded delayed sack timer value.
3405 *
3406 * spp_flags - These flags are used to control various features
3407 * on an association. The flag field may contain
3408 * zero or more of the following options.
3409 *
3410 * SPP_HB_ENABLE - Enable heartbeats on the
3411 * specified address. Note that if the address
3412 * field is empty all addresses for the association
3413 * have heartbeats enabled upon them.
3414 *
3415 * SPP_HB_DISABLE - Disable heartbeats on the
3416 * specified address. Note that if the address
3417 * field is empty all addresses for the association
3418 * will have their heartbeats disabled. Note also
3419 * that SPP_HB_ENABLE and SPP_HB_DISABLE are
3420 * mutually exclusive, only one of these two should
3421 * be specified. Enabling both fields will have
3422 * undetermined results.
3423 *
3424 * SPP_HB_DEMAND - Request a user initiated heartbeat
3425 * to be made immediately.
3426 *
3427 * SPP_PMTUD_ENABLE - This field will enable PMTU
3428 * discovery upon the specified address. Note that
3429 * if the address field is empty then all addresses
3430 * on the association are affected.
3431 *
3432 * SPP_PMTUD_DISABLE - This field will disable PMTU
3433 * discovery upon the specified address. Note that
3434 * if the address field is empty then all addresses
3435 * on the association are affected. Note also that
3436 * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
3437 * exclusive. Enabling both will have undetermined
3438 * results.
3439 *
3440 * SPP_SACKDELAY_ENABLE - Setting this flag turns
3441 * on delayed sack. The time specified in spp_sackdelay
3442 * is used to specify the sack delay for this address. Note
3443 * that if spp_address is empty then all addresses will
3444 * enable delayed sack and take on the sack delay
3445 * value specified in spp_sackdelay.
3446 * SPP_SACKDELAY_DISABLE - Setting this flag turns
3447 * off delayed sack. If the spp_address field is blank then
3448 * delayed sack is disabled for the entire association. Note
3449 * also that this field is mutually exclusive to
3450 * SPP_SACKDELAY_ENABLE, setting both will have undefined
3451 * results.
3109 */ 3452 */
3110static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, 3453static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
3111 char __user *optval, int __user *optlen) 3454 char __user *optval, int __user *optlen)
3112{ 3455{
3113 struct sctp_paddrparams params; 3456 struct sctp_paddrparams params;
3114 struct sctp_transport *trans; 3457 struct sctp_transport *trans = NULL;
3458 struct sctp_association *asoc = NULL;
3459 struct sctp_sock *sp = sctp_sk(sk);
3115 3460
3116 if (len != sizeof(struct sctp_paddrparams)) 3461 if (len != sizeof(struct sctp_paddrparams))
3117 return -EINVAL; 3462 return -EINVAL;
3463
3118 if (copy_from_user(&params, optval, len)) 3464 if (copy_from_user(&params, optval, len))
3119 return -EFAULT; 3465 return -EFAULT;
3120 3466
3121 /* If no association id is specified retrieve the default value 3467 /* If an address other than INADDR_ANY is specified, and
3122 * for the endpoint that will be used for all future associations 3468 * no transport is found, then the request is invalid.
3123 */ 3469 */
3124 if (!params.spp_assoc_id && 3470 if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
3125 sctp_is_any(( union sctp_addr *)&params.spp_address)) { 3471 trans = sctp_addr_id2transport(sk, &params.spp_address,
3126 params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval; 3472 params.spp_assoc_id);
3127 params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt; 3473 if (!trans) {
3128 3474 SCTP_DEBUG_PRINTK("Failed no transport\n");
3129 goto done; 3475 return -EINVAL;
3476 }
3130 } 3477 }
3131 3478
3132 trans = sctp_addr_id2transport(sk, &params.spp_address, 3479 /* Get association, if assoc_id != 0 and the socket is a one
3133 params.spp_assoc_id); 3480 * to many style socket, and an association was not found, then
3134 if (!trans) 3481 * the id was invalid.
3482 */
3483 asoc = sctp_id2assoc(sk, params.spp_assoc_id);
3484 if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) {
3485 SCTP_DEBUG_PRINTK("Failed no association\n");
3135 return -EINVAL; 3486 return -EINVAL;
3487 }
3136 3488
3137 /* The value of the heartbeat interval, in milliseconds. A value of 0, 3489 if (trans) {
3138 * when modifying the parameter, specifies that the heartbeat on this 3490 /* Fetch transport values. */
3139 * address should be disabled. 3491 params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval);
3140 */ 3492 params.spp_pathmtu = trans->pathmtu;
3141 if (!trans->hb_allowed) 3493 params.spp_pathmaxrxt = trans->pathmaxrxt;
3142 params.spp_hbinterval = 0; 3494 params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay);
3143 else 3495
3144 params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval); 3496 /*draft-11 doesn't say what to return in spp_flags*/
3497 params.spp_flags = trans->param_flags;
3498 } else if (asoc) {
3499 /* Fetch association values. */
3500 params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
3501 params.spp_pathmtu = asoc->pathmtu;
3502 params.spp_pathmaxrxt = asoc->pathmaxrxt;
3503 params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay);
3504
3505 /*draft-11 doesn't say what to return in spp_flags*/
3506 params.spp_flags = asoc->param_flags;
3507 } else {
3508 /* Fetch socket values. */
3509 params.spp_hbinterval = sp->hbinterval;
3510 params.spp_pathmtu = sp->pathmtu;
3511 params.spp_sackdelay = sp->sackdelay;
3512 params.spp_pathmaxrxt = sp->pathmaxrxt;
3513
3514 /*draft-11 doesn't say what to return in spp_flags*/
3515 params.spp_flags = sp->param_flags;
3516 }
3145 3517
3146 /* spp_pathmaxrxt contains the maximum number of retransmissions 3518 if (copy_to_user(optval, &params, len))
3147 * before this address shall be considered unreachable. 3519 return -EFAULT;
3148 */ 3520
3149 params.spp_pathmaxrxt = trans->max_retrans; 3521 if (put_user(len, optlen))
3522 return -EFAULT;
3523
3524 return 0;
3525}
3526
3527/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
3528 *
3529 * This option will get or set the delayed ack timer. The time is set
3530 * in milliseconds. If the assoc_id is 0, then this sets or gets the
3531 * endpoint's default delayed ack timer value. If the assoc_id field is
3532 * non-zero, then the set or get affects the specified association.
3533 *
3534 * struct sctp_assoc_value {
3535 * sctp_assoc_t assoc_id;
3536 * uint32_t assoc_value;
3537 * };
3538 *
3539 * assoc_id - This parameter indicates which association the
3540 * user is performing an action upon. Note that if
3541 * this field's value is zero then the endpoint's
3542 * default value is changed (affecting future
3543 * associations only).
3544 *
3545 * assoc_value - This parameter contains the number of milliseconds
3546 * that the user is requesting the delayed ACK timer
3547 * be set to. Note that this value is defined in
3548 * the standard to be between 200 and 500 milliseconds.
3549 *
3550 * Note: a value of zero will leave the value alone,
3551 * but disable SACK delay. A non-zero value will also
3552 * enable SACK delay.
3553 */
3554static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len,
3555 char __user *optval,
3556 int __user *optlen)
3557{
3558 struct sctp_assoc_value params;
3559 struct sctp_association *asoc = NULL;
3560 struct sctp_sock *sp = sctp_sk(sk);
3561
3562 if (len != sizeof(struct sctp_assoc_value))
3563 return - EINVAL;
3564
3565 if (copy_from_user(&params, optval, len))
3566 return -EFAULT;
3567
3568 /* Get association, if assoc_id != 0 and the socket is a one
3569 * to many style socket, and an association was not found, then
3570 * the id was invalid.
3571 */
3572 asoc = sctp_id2assoc(sk, params.assoc_id);
3573 if (!asoc && params.assoc_id && sctp_style(sk, UDP))
3574 return -EINVAL;
3575
3576 if (asoc) {
3577 /* Fetch association values. */
3578 if (asoc->param_flags & SPP_SACKDELAY_ENABLE)
3579 params.assoc_value = jiffies_to_msecs(
3580 asoc->sackdelay);
3581 else
3582 params.assoc_value = 0;
3583 } else {
3584 /* Fetch socket values. */
3585 if (sp->param_flags & SPP_SACKDELAY_ENABLE)
3586 params.assoc_value = sp->sackdelay;
3587 else
3588 params.assoc_value = 0;
3589 }
3150 3590
3151done:
3152 if (copy_to_user(optval, &params, len)) 3591 if (copy_to_user(optval, &params, len))
3153 return -EFAULT; 3592 return -EFAULT;
3154 3593
@@ -4015,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
4015 retval = sctp_getsockopt_peer_addr_params(sk, len, optval, 4454 retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
4016 optlen); 4455 optlen);
4017 break; 4456 break;
4457 case SCTP_DELAYED_ACK_TIME:
4458 retval = sctp_getsockopt_delayed_ack_time(sk, len, optval,
4459 optlen);
4460 break;
4018 case SCTP_INITMSG: 4461 case SCTP_INITMSG:
4019 retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); 4462 retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
4020 break; 4463 break;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 268ddaf2dc0f..68d73e2dd155 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
86 peer->init_sent_count = 0; 86 peer->init_sent_count = 0;
87 87
88 peer->state = SCTP_ACTIVE; 88 peer->state = SCTP_ACTIVE;
89 peer->hb_allowed = 0; 89 peer->param_flags = SPP_HB_DISABLE |
90 SPP_PMTUD_ENABLE |
91 SPP_SACKDELAY_ENABLE;
92 peer->hbinterval = 0;
90 93
91 /* Initialize the default path max_retrans. */ 94 /* Initialize the default path max_retrans. */
92 peer->max_retrans = sctp_max_retrans_path; 95 peer->pathmaxrxt = sctp_max_retrans_path;
93 peer->error_count = 0; 96 peer->error_count = 0;
94 97
95 INIT_LIST_HEAD(&peer->transmitted); 98 INIT_LIST_HEAD(&peer->transmitted);
@@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport)
229 dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); 232 dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL);
230 233
231 if (dst) { 234 if (dst) {
232 transport->pmtu = dst_mtu(dst); 235 transport->pathmtu = dst_mtu(dst);
233 dst_release(dst); 236 dst_release(dst);
234 } else 237 } else
235 transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; 238 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
236} 239}
237 240
238/* Caches the dst entry and source address for a transport's destination 241/* Caches the dst entry and source address for a transport's destination
@@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport,
254 af->get_saddr(asoc, dst, daddr, &transport->saddr); 257 af->get_saddr(asoc, dst, daddr, &transport->saddr);
255 258
256 transport->dst = dst; 259 transport->dst = dst;
260 if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
261 return;
262 }
257 if (dst) { 263 if (dst) {
258 transport->pmtu = dst_mtu(dst); 264 transport->pathmtu = dst_mtu(dst);
259 265
260 /* Initialize sk->sk_rcv_saddr, if the transport is the 266 /* Initialize sk->sk_rcv_saddr, if the transport is the
261 * association's active path for getsockname(). 267 * association's active path for getsockname().
@@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport,
264 opt->pf->af->to_sk_saddr(&transport->saddr, 270 opt->pf->af->to_sk_saddr(&transport->saddr,
265 asoc->base.sk); 271 asoc->base.sk);
266 } else 272 } else
267 transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; 273 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
268} 274}
269 275
270/* Hold a reference to a transport. */ 276/* Hold a reference to a transport. */
@@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
369 375
370 ssthresh = transport->ssthresh; 376 ssthresh = transport->ssthresh;
371 pba = transport->partial_bytes_acked; 377 pba = transport->partial_bytes_acked;
372 pmtu = transport->asoc->pmtu; 378 pmtu = transport->asoc->pathmtu;
373 379
374 if (cwnd <= ssthresh) { 380 if (cwnd <= ssthresh) {
375 /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less 381 /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less
@@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
441 * partial_bytes_acked = 0 447 * partial_bytes_acked = 0
442 */ 448 */
443 transport->ssthresh = max(transport->cwnd/2, 449 transport->ssthresh = max(transport->cwnd/2,
444 4*transport->asoc->pmtu); 450 4*transport->asoc->pathmtu);
445 transport->cwnd = transport->asoc->pmtu; 451 transport->cwnd = transport->asoc->pathmtu;
446 break; 452 break;
447 453
448 case SCTP_LOWER_CWND_FAST_RTX: 454 case SCTP_LOWER_CWND_FAST_RTX:
@@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
459 * partial_bytes_acked = 0 465 * partial_bytes_acked = 0
460 */ 466 */
461 transport->ssthresh = max(transport->cwnd/2, 467 transport->ssthresh = max(transport->cwnd/2,
462 4*transport->asoc->pmtu); 468 4*transport->asoc->pathmtu);
463 transport->cwnd = transport->ssthresh; 469 transport->cwnd = transport->ssthresh;
464 break; 470 break;
465 471
@@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
479 if ((jiffies - transport->last_time_ecne_reduced) > 485 if ((jiffies - transport->last_time_ecne_reduced) >
480 transport->rtt) { 486 transport->rtt) {
481 transport->ssthresh = max(transport->cwnd/2, 487 transport->ssthresh = max(transport->cwnd/2,
482 4*transport->asoc->pmtu); 488 4*transport->asoc->pathmtu);
483 transport->cwnd = transport->ssthresh; 489 transport->cwnd = transport->ssthresh;
484 transport->last_time_ecne_reduced = jiffies; 490 transport->last_time_ecne_reduced = jiffies;
485 } 491 }
@@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
496 */ 502 */
497 if ((jiffies - transport->last_time_used) > transport->rto) 503 if ((jiffies - transport->last_time_used) > transport->rto)
498 transport->cwnd = max(transport->cwnd/2, 504 transport->cwnd = max(transport->cwnd/2,
499 4*transport->asoc->pmtu); 505 4*transport->asoc->pathmtu);
500 break; 506 break;
501 }; 507 };
502 508
@@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
511unsigned long sctp_transport_timeout(struct sctp_transport *t) 517unsigned long sctp_transport_timeout(struct sctp_transport *t)
512{ 518{
513 unsigned long timeout; 519 unsigned long timeout;
514 timeout = t->hb_interval + t->rto + sctp_jitter(t->rto); 520 timeout = t->hbinterval + t->rto + sctp_jitter(t->rto);
515 timeout += jiffies; 521 timeout += jiffies;
516 return timeout; 522 return timeout;
517} 523}
diff --git a/net/socket.c b/net/socket.c
index 3145103cdf54..06fa217f58a9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -640,154 +640,150 @@ static void sock_aio_dtor(struct kiocb *iocb)
640 kfree(iocb->private); 640 kfree(iocb->private);
641} 641}
642 642
643/* 643static ssize_t sock_sendpage(struct file *file, struct page *page,
644 * Read data from a socket. ubuf is a user mode pointer. We make sure the user 644 int offset, size_t size, loff_t *ppos, int more)
645 * area ubuf...ubuf+size-1 is writable before asking the protocol.
646 */
647
648static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
649 size_t size, loff_t pos)
650{ 645{
651 struct sock_iocb *x, siocb;
652 struct socket *sock; 646 struct socket *sock;
653 int flags; 647 int flags;
654 648
655 if (pos != 0) 649 sock = file->private_data;
656 return -ESPIPE;
657 if (size==0) /* Match SYS5 behaviour */
658 return 0;
659 650
660 if (is_sync_kiocb(iocb)) 651 flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
661 x = &siocb; 652 if (more)
662 else { 653 flags |= MSG_MORE;
663 x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); 654
664 if (!x) 655 return sock->ops->sendpage(sock, page, offset, size, flags);
665 return -ENOMEM; 656}
657
658static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
659 char __user *ubuf, size_t size, struct sock_iocb *siocb)
660{
661 if (!is_sync_kiocb(iocb)) {
662 siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
663 if (!siocb)
664 return NULL;
666 iocb->ki_dtor = sock_aio_dtor; 665 iocb->ki_dtor = sock_aio_dtor;
667 } 666 }
668 iocb->private = x;
669 x->kiocb = iocb;
670 sock = iocb->ki_filp->private_data;
671 667
672 x->async_msg.msg_name = NULL; 668 siocb->kiocb = iocb;
673 x->async_msg.msg_namelen = 0; 669 siocb->async_iov.iov_base = ubuf;
674 x->async_msg.msg_iov = &x->async_iov; 670 siocb->async_iov.iov_len = size;
675 x->async_msg.msg_iovlen = 1;
676 x->async_msg.msg_control = NULL;
677 x->async_msg.msg_controllen = 0;
678 x->async_iov.iov_base = ubuf;
679 x->async_iov.iov_len = size;
680 flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
681 671
682 return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags); 672 iocb->private = siocb;
673 return siocb;
683} 674}
684 675
676static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
677 struct file *file, struct iovec *iov, unsigned long nr_segs)
678{
679 struct socket *sock = file->private_data;
680 size_t size = 0;
681 int i;
685 682
686/* 683 for (i = 0 ; i < nr_segs ; i++)
687 * Write data to a socket. We verify that the user area ubuf..ubuf+size-1 684 size += iov[i].iov_len;
688 * is readable by the user process.
689 */
690 685
691static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, 686 msg->msg_name = NULL;
692 size_t size, loff_t pos) 687 msg->msg_namelen = 0;
688 msg->msg_control = NULL;
689 msg->msg_controllen = 0;
690 msg->msg_iov = (struct iovec *) iov;
691 msg->msg_iovlen = nr_segs;
692 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
693
694 return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
695}
696
697static ssize_t sock_readv(struct file *file, const struct iovec *iov,
698 unsigned long nr_segs, loff_t *ppos)
693{ 699{
694 struct sock_iocb *x, siocb; 700 struct kiocb iocb;
695 struct socket *sock; 701 struct sock_iocb siocb;
696 702 struct msghdr msg;
703 int ret;
704
705 init_sync_kiocb(&iocb, NULL);
706 iocb.private = &siocb;
707
708 ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
709 if (-EIOCBQUEUED == ret)
710 ret = wait_on_sync_kiocb(&iocb);
711 return ret;
712}
713
714static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
715 size_t count, loff_t pos)
716{
717 struct sock_iocb siocb, *x;
718
697 if (pos != 0) 719 if (pos != 0)
698 return -ESPIPE; 720 return -ESPIPE;
699 if(size==0) /* Match SYS5 behaviour */ 721 if (count == 0) /* Match SYS5 behaviour */
700 return 0; 722 return 0;
701 723
702 if (is_sync_kiocb(iocb)) 724 x = alloc_sock_iocb(iocb, ubuf, count, &siocb);
703 x = &siocb; 725 if (!x)
704 else { 726 return -ENOMEM;
705 x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); 727 return do_sock_read(&x->async_msg, iocb, iocb->ki_filp,
706 if (!x) 728 &x->async_iov, 1);
707 return -ENOMEM;
708 iocb->ki_dtor = sock_aio_dtor;
709 }
710 iocb->private = x;
711 x->kiocb = iocb;
712 sock = iocb->ki_filp->private_data;
713
714 x->async_msg.msg_name = NULL;
715 x->async_msg.msg_namelen = 0;
716 x->async_msg.msg_iov = &x->async_iov;
717 x->async_msg.msg_iovlen = 1;
718 x->async_msg.msg_control = NULL;
719 x->async_msg.msg_controllen = 0;
720 x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
721 if (sock->type == SOCK_SEQPACKET)
722 x->async_msg.msg_flags |= MSG_EOR;
723 x->async_iov.iov_base = (void __user *)ubuf;
724 x->async_iov.iov_len = size;
725
726 return __sock_sendmsg(iocb, sock, &x->async_msg, size);
727} 729}
728 730
729static ssize_t sock_sendpage(struct file *file, struct page *page, 731static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
730 int offset, size_t size, loff_t *ppos, int more) 732 struct file *file, struct iovec *iov, unsigned long nr_segs)
731{ 733{
732 struct socket *sock; 734 struct socket *sock = file->private_data;
733 int flags; 735 size_t size = 0;
736 int i;
734 737
735 sock = file->private_data; 738 for (i = 0 ; i < nr_segs ; i++)
739 size += iov[i].iov_len;
736 740
737 flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; 741 msg->msg_name = NULL;
738 if (more) 742 msg->msg_namelen = 0;
739 flags |= MSG_MORE; 743 msg->msg_control = NULL;
744 msg->msg_controllen = 0;
745 msg->msg_iov = (struct iovec *) iov;
746 msg->msg_iovlen = nr_segs;
747 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
748 if (sock->type == SOCK_SEQPACKET)
749 msg->msg_flags |= MSG_EOR;
740 750
741 return sock->ops->sendpage(sock, page, offset, size, flags); 751 return __sock_sendmsg(iocb, sock, msg, size);
742} 752}
743 753
744static int sock_readv_writev(int type, 754static ssize_t sock_writev(struct file *file, const struct iovec *iov,
745 struct file * file, const struct iovec * iov, 755 unsigned long nr_segs, loff_t *ppos)
746 long count, size_t size)
747{ 756{
748 struct msghdr msg; 757 struct msghdr msg;
749 struct socket *sock; 758 struct kiocb iocb;
759 struct sock_iocb siocb;
760 int ret;
750 761
751 sock = file->private_data; 762 init_sync_kiocb(&iocb, NULL);
763 iocb.private = &siocb;
752 764
753 msg.msg_name = NULL; 765 ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
754 msg.msg_namelen = 0; 766 if (-EIOCBQUEUED == ret)
755 msg.msg_control = NULL; 767 ret = wait_on_sync_kiocb(&iocb);
756 msg.msg_controllen = 0; 768 return ret;
757 msg.msg_iov = (struct iovec *) iov; 769}
758 msg.msg_iovlen = count;
759 msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
760 770
761 /* read() does a VERIFY_WRITE */ 771static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
762 if (type == VERIFY_WRITE) 772 size_t count, loff_t pos)
763 return sock_recvmsg(sock, &msg, size, msg.msg_flags); 773{
774 struct sock_iocb siocb, *x;
764 775
765 if (sock->type == SOCK_SEQPACKET) 776 if (pos != 0)
766 msg.msg_flags |= MSG_EOR; 777 return -ESPIPE;
778 if (count == 0) /* Match SYS5 behaviour */
779 return 0;
767 780
768 return sock_sendmsg(sock, &msg, size); 781 x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb);
769} 782 if (!x)
783 return -ENOMEM;
770 784
771static ssize_t sock_readv(struct file *file, const struct iovec *vector, 785 return do_sock_write(&x->async_msg, iocb, iocb->ki_filp,
772 unsigned long count, loff_t *ppos) 786 &x->async_iov, 1);
773{
774 size_t tot_len = 0;
775 int i;
776 for (i = 0 ; i < count ; i++)
777 tot_len += vector[i].iov_len;
778 return sock_readv_writev(VERIFY_WRITE,
779 file, vector, count, tot_len);
780}
781
782static ssize_t sock_writev(struct file *file, const struct iovec *vector,
783 unsigned long count, loff_t *ppos)
784{
785 size_t tot_len = 0;
786 int i;
787 for (i = 0 ; i < count ; i++)
788 tot_len += vector[i].iov_len;
789 return sock_readv_writev(VERIFY_READ,
790 file, vector, count, tot_len);
791} 787}
792 788
793 789
@@ -904,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
904 break; 900 break;
905 default: 901 default:
906 err = sock->ops->ioctl(sock, cmd, arg); 902 err = sock->ops->ioctl(sock, cmd, arg);
903
904 /*
905 * If this ioctl is unknown try to hand it down
906 * to the NIC driver.
907 */
908 if (err == -ENOIOCTLCMD)
909 err = dev_ioctl(cmd, argp);
907 break; 910 break;
908 } 911 }
909 return err; 912 return err;
@@ -2036,7 +2039,7 @@ int sock_unregister(int family)
2036 return 0; 2039 return 0;
2037} 2040}
2038 2041
2039void __init sock_init(void) 2042static int __init sock_init(void)
2040{ 2043{
2041 /* 2044 /*
2042 * Initialize sock SLAB cache. 2045 * Initialize sock SLAB cache.
@@ -2044,12 +2047,10 @@ void __init sock_init(void)
2044 2047
2045 sk_init(); 2048 sk_init();
2046 2049
2047#ifdef SLAB_SKB
2048 /* 2050 /*
2049 * Initialize skbuff SLAB cache 2051 * Initialize skbuff SLAB cache
2050 */ 2052 */
2051 skb_init(); 2053 skb_init();
2052#endif
2053 2054
2054 /* 2055 /*
2055 * Initialize the protocols module. 2056 * Initialize the protocols module.
@@ -2058,15 +2059,19 @@ void __init sock_init(void)
2058 init_inodecache(); 2059 init_inodecache();
2059 register_filesystem(&sock_fs_type); 2060 register_filesystem(&sock_fs_type);
2060 sock_mnt = kern_mount(&sock_fs_type); 2061 sock_mnt = kern_mount(&sock_fs_type);
2061 /* The real protocol initialization is performed when 2062
2062 * do_initcalls is run. 2063 /* The real protocol initialization is performed in later initcalls.
2063 */ 2064 */
2064 2065
2065#ifdef CONFIG_NETFILTER 2066#ifdef CONFIG_NETFILTER
2066 netfilter_init(); 2067 netfilter_init();
2067#endif 2068#endif
2069
2070 return 0;
2068} 2071}
2069 2072
2073core_initcall(sock_init); /* early initcall */
2074
2070#ifdef CONFIG_PROC_FS 2075#ifdef CONFIG_PROC_FS
2071void socket_seq_show(struct seq_file *seq) 2076void socket_seq_show(struct seq_file *seq)
2072{ 2077{
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c6a51911e71e..d68eba481291 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk)
758 struct svc_serv *serv = svsk->sk_server; 758 struct svc_serv *serv = svsk->sk_server;
759 struct socket *sock = svsk->sk_sock; 759 struct socket *sock = svsk->sk_sock;
760 struct socket *newsock; 760 struct socket *newsock;
761 struct proto_ops *ops; 761 const struct proto_ops *ops;
762 struct svc_sock *newsvsk; 762 struct svc_sock *newsvsk;
763 int err, slen; 763 int err, slen;
764 764
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index acc73ba8bade..5f6ae79b8b16 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -121,7 +121,7 @@
121int sysctl_unix_max_dgram_qlen = 10; 121int sysctl_unix_max_dgram_qlen = 10;
122 122
123struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; 123struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
124DEFINE_RWLOCK(unix_table_lock); 124DEFINE_SPINLOCK(unix_table_lock);
125static atomic_t unix_nr_socks = ATOMIC_INIT(0); 125static atomic_t unix_nr_socks = ATOMIC_INIT(0);
126 126
127#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) 127#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE])
@@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0);
130 130
131/* 131/*
132 * SMP locking strategy: 132 * SMP locking strategy:
133 * hash table is protected with rwlock unix_table_lock 133 * hash table is protected with spinlock unix_table_lock
134 * each socket state is protected by separate rwlock. 134 * each socket state is protected by separate rwlock.
135 */ 135 */
136 136
@@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
214 214
215static inline void unix_remove_socket(struct sock *sk) 215static inline void unix_remove_socket(struct sock *sk)
216{ 216{
217 write_lock(&unix_table_lock); 217 spin_lock(&unix_table_lock);
218 __unix_remove_socket(sk); 218 __unix_remove_socket(sk);
219 write_unlock(&unix_table_lock); 219 spin_unlock(&unix_table_lock);
220} 220}
221 221
222static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) 222static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
223{ 223{
224 write_lock(&unix_table_lock); 224 spin_lock(&unix_table_lock);
225 __unix_insert_socket(list, sk); 225 __unix_insert_socket(list, sk);
226 write_unlock(&unix_table_lock); 226 spin_unlock(&unix_table_lock);
227} 227}
228 228
229static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, 229static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
@@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
250{ 250{
251 struct sock *s; 251 struct sock *s;
252 252
253 read_lock(&unix_table_lock); 253 spin_lock(&unix_table_lock);
254 s = __unix_find_socket_byname(sunname, len, type, hash); 254 s = __unix_find_socket_byname(sunname, len, type, hash);
255 if (s) 255 if (s)
256 sock_hold(s); 256 sock_hold(s);
257 read_unlock(&unix_table_lock); 257 spin_unlock(&unix_table_lock);
258 return s; 258 return s;
259} 259}
260 260
@@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
263 struct sock *s; 263 struct sock *s;
264 struct hlist_node *node; 264 struct hlist_node *node;
265 265
266 read_lock(&unix_table_lock); 266 spin_lock(&unix_table_lock);
267 sk_for_each(s, node, 267 sk_for_each(s, node,
268 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { 268 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
269 struct dentry *dentry = unix_sk(s)->dentry; 269 struct dentry *dentry = unix_sk(s)->dentry;
@@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
276 } 276 }
277 s = NULL; 277 s = NULL;
278found: 278found:
279 read_unlock(&unix_table_lock); 279 spin_unlock(&unix_table_lock);
280 return s; 280 return s;
281} 281}
282 282
@@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *,
473static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, 473static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
474 struct msghdr *, size_t); 474 struct msghdr *, size_t);
475 475
476static struct proto_ops unix_stream_ops = { 476static const struct proto_ops unix_stream_ops = {
477 .family = PF_UNIX, 477 .family = PF_UNIX,
478 .owner = THIS_MODULE, 478 .owner = THIS_MODULE,
479 .release = unix_release, 479 .release = unix_release,
@@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = {
494 .sendpage = sock_no_sendpage, 494 .sendpage = sock_no_sendpage,
495}; 495};
496 496
497static struct proto_ops unix_dgram_ops = { 497static const struct proto_ops unix_dgram_ops = {
498 .family = PF_UNIX, 498 .family = PF_UNIX,
499 .owner = THIS_MODULE, 499 .owner = THIS_MODULE,
500 .release = unix_release, 500 .release = unix_release,
@@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = {
515 .sendpage = sock_no_sendpage, 515 .sendpage = sock_no_sendpage,
516}; 516};
517 517
518static struct proto_ops unix_seqpacket_ops = { 518static const struct proto_ops unix_seqpacket_ops = {
519 .family = PF_UNIX, 519 .family = PF_UNIX,
520 .owner = THIS_MODULE, 520 .owner = THIS_MODULE,
521 .release = unix_release, 521 .release = unix_release,
@@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock)
564 u = unix_sk(sk); 564 u = unix_sk(sk);
565 u->dentry = NULL; 565 u->dentry = NULL;
566 u->mnt = NULL; 566 u->mnt = NULL;
567 rwlock_init(&u->lock); 567 spin_lock_init(&u->lock);
568 atomic_set(&u->inflight, sock ? 0 : -1); 568 atomic_set(&u->inflight, sock ? 0 : -1);
569 init_MUTEX(&u->readsem); /* single task reading lock */ 569 init_MUTEX(&u->readsem); /* single task reading lock */
570 init_waitqueue_head(&u->peer_wait); 570 init_waitqueue_head(&u->peer_wait);
@@ -642,12 +642,12 @@ retry:
642 addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); 642 addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
643 addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); 643 addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
644 644
645 write_lock(&unix_table_lock); 645 spin_lock(&unix_table_lock);
646 ordernum = (ordernum+1)&0xFFFFF; 646 ordernum = (ordernum+1)&0xFFFFF;
647 647
648 if (__unix_find_socket_byname(addr->name, addr->len, sock->type, 648 if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
649 addr->hash)) { 649 addr->hash)) {
650 write_unlock(&unix_table_lock); 650 spin_unlock(&unix_table_lock);
651 /* Sanity yield. It is unusual case, but yet... */ 651 /* Sanity yield. It is unusual case, but yet... */
652 if (!(ordernum&0xFF)) 652 if (!(ordernum&0xFF))
653 yield(); 653 yield();
@@ -658,7 +658,7 @@ retry:
658 __unix_remove_socket(sk); 658 __unix_remove_socket(sk);
659 u->addr = addr; 659 u->addr = addr;
660 __unix_insert_socket(&unix_socket_table[addr->hash], sk); 660 __unix_insert_socket(&unix_socket_table[addr->hash], sk);
661 write_unlock(&unix_table_lock); 661 spin_unlock(&unix_table_lock);
662 err = 0; 662 err = 0;
663 663
664out: up(&u->readsem); 664out: up(&u->readsem);
@@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
791 addr->hash = UNIX_HASH_SIZE; 791 addr->hash = UNIX_HASH_SIZE;
792 } 792 }
793 793
794 write_lock(&unix_table_lock); 794 spin_lock(&unix_table_lock);
795 795
796 if (!sunaddr->sun_path[0]) { 796 if (!sunaddr->sun_path[0]) {
797 err = -EADDRINUSE; 797 err = -EADDRINUSE;
@@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
814 __unix_insert_socket(list, sk); 814 __unix_insert_socket(list, sk);
815 815
816out_unlock: 816out_unlock:
817 write_unlock(&unix_table_lock); 817 spin_unlock(&unix_table_lock);
818out_up: 818out_up:
819 up(&u->readsem); 819 up(&u->readsem);
820out: 820out:
@@ -1063,10 +1063,12 @@ restart:
1063 /* Set credentials */ 1063 /* Set credentials */
1064 sk->sk_peercred = other->sk_peercred; 1064 sk->sk_peercred = other->sk_peercred;
1065 1065
1066 sock_hold(newsk);
1067 unix_peer(sk) = newsk;
1068 sock->state = SS_CONNECTED; 1066 sock->state = SS_CONNECTED;
1069 sk->sk_state = TCP_ESTABLISHED; 1067 sk->sk_state = TCP_ESTABLISHED;
1068 sock_hold(newsk);
1069
1070 smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */
1071 unix_peer(sk) = newsk;
1070 1072
1071 unix_state_wunlock(sk); 1073 unix_state_wunlock(sk);
1072 1074
@@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1414 } else { 1416 } else {
1415 sunaddr = NULL; 1417 sunaddr = NULL;
1416 err = -ENOTCONN; 1418 err = -ENOTCONN;
1417 other = unix_peer_get(sk); 1419 other = unix_peer(sk);
1418 if (!other) 1420 if (!other)
1419 goto out_err; 1421 goto out_err;
1420 } 1422 }
@@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1476 other->sk_data_ready(other, size); 1478 other->sk_data_ready(other, size);
1477 sent+=size; 1479 sent+=size;
1478 } 1480 }
1479 sock_put(other);
1480 1481
1481 scm_destroy(siocb->scm); 1482 scm_destroy(siocb->scm);
1482 siocb->scm = NULL; 1483 siocb->scm = NULL;
@@ -1491,8 +1492,6 @@ pipe_err:
1491 send_sig(SIGPIPE,current,0); 1492 send_sig(SIGPIPE,current,0);
1492 err = -EPIPE; 1493 err = -EPIPE;
1493out_err: 1494out_err:
1494 if (other)
1495 sock_put(other);
1496 scm_destroy(siocb->scm); 1495 scm_destroy(siocb->scm);
1497 siocb->scm = NULL; 1496 siocb->scm = NULL;
1498 return sent ? : err; 1497 return sent ? : err;
@@ -1860,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1860 } 1859 }
1861 1860
1862 default: 1861 default:
1863 err = dev_ioctl(cmd, (void __user *)arg); 1862 err = -ENOIOCTLCMD;
1864 break; 1863 break;
1865 } 1864 }
1866 return err; 1865 return err;
@@ -1917,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos)
1917 1916
1918static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 1917static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1919{ 1918{
1920 read_lock(&unix_table_lock); 1919 spin_lock(&unix_table_lock);
1921 return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); 1920 return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1922} 1921}
1923 1922
@@ -1932,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1932 1931
1933static void unix_seq_stop(struct seq_file *seq, void *v) 1932static void unix_seq_stop(struct seq_file *seq, void *v)
1934{ 1933{
1935 read_unlock(&unix_table_lock); 1934 spin_unlock(&unix_table_lock);
1936} 1935}
1937 1936
1938static int unix_seq_show(struct seq_file *seq, void *v) 1937static int unix_seq_show(struct seq_file *seq, void *v)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6ffc64e1712d..411802bd4d37 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -182,7 +182,7 @@ void unix_gc(void)
182 if (down_trylock(&unix_gc_sem)) 182 if (down_trylock(&unix_gc_sem))
183 return; 183 return;
184 184
185 read_lock(&unix_table_lock); 185 spin_lock(&unix_table_lock);
186 186
187 forall_unix_sockets(i, s) 187 forall_unix_sockets(i, s)
188 { 188 {
@@ -301,7 +301,7 @@ void unix_gc(void)
301 } 301 }
302 u->gc_tree = GC_ORPHAN; 302 u->gc_tree = GC_ORPHAN;
303 } 303 }
304 read_unlock(&unix_table_lock); 304 spin_unlock(&unix_table_lock);
305 305
306 /* 306 /*
307 * Here we are. Hitlist is filled. Die. 307 * Here we are. Hitlist is filled. Die.
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index 59fec59b2132..7a43ae4721ed 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -181,7 +181,7 @@ struct wanpipe_opt
181#endif 181#endif
182 182
183static int sk_count; 183static int sk_count;
184extern struct proto_ops wanpipe_ops; 184extern const struct proto_ops wanpipe_ops;
185static unsigned long find_free_critical; 185static unsigned long find_free_critical;
186 186
187static void wanpipe_unlink_driver(struct sock *sk); 187static void wanpipe_unlink_driver(struct sock *sk);
@@ -1839,7 +1839,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar
1839#endif 1839#endif
1840 1840
1841 default: 1841 default:
1842 return dev_ioctl(cmd,(void __user *) arg); 1842 return -ENOIOCTLCMD;
1843 } 1843 }
1844 /*NOTREACHED*/ 1844 /*NOTREACHED*/
1845} 1845}
@@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr
2546 return 0; 2546 return 0;
2547} 2547}
2548 2548
2549struct proto_ops wanpipe_ops = { 2549const struct proto_ops wanpipe_ops = {
2550 .family = PF_WANPIPE, 2550 .family = PF_WANPIPE,
2551 .owner = THIS_MODULE, 2551 .owner = THIS_MODULE,
2552 .release = wanpipe_release, 2552 .release = wanpipe_release,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 020d73cc8414..16459c7f54b2 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2;
64HLIST_HEAD(x25_list); 64HLIST_HEAD(x25_list);
65DEFINE_RWLOCK(x25_list_lock); 65DEFINE_RWLOCK(x25_list_lock);
66 66
67static struct proto_ops x25_proto_ops; 67static const struct proto_ops x25_proto_ops;
68 68
69static struct x25_address null_x25_address = {" "}; 69static struct x25_address null_x25_address = {" "};
70 70
@@ -1378,7 +1378,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1378 } 1378 }
1379 1379
1380 default: 1380 default:
1381 rc = dev_ioctl(cmd, argp); 1381 rc = -ENOIOCTLCMD;
1382 break; 1382 break;
1383 } 1383 }
1384 1384
@@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = {
1391 .owner = THIS_MODULE, 1391 .owner = THIS_MODULE,
1392}; 1392};
1393 1393
1394static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { 1394static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
1395 .family = AF_X25, 1395 .family = AF_X25,
1396 .owner = THIS_MODULE, 1396 .owner = THIS_MODULE,
1397 .release = x25_release, 1397 .release = x25_release,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d19e274b9c4a..64a447375fdb 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -10,7 +10,7 @@
10 * YOSHIFUJI Hideaki 10 * YOSHIFUJI Hideaki
11 * Split up af-specific portion 11 * Split up af-specific portion
12 * Derek Atkins <derek@ihtfp.com> Add the post_input processor 12 * Derek Atkins <derek@ihtfp.com> Add the post_input processor
13 * 13 *
14 */ 14 */
15 15
16#include <asm/bug.h> 16#include <asm/bug.h>
@@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy)
256 if (del_timer(&policy->timer)) 256 if (del_timer(&policy->timer))
257 BUG(); 257 BUG();
258 258
259 security_xfrm_policy_free(policy);
259 kfree(policy); 260 kfree(policy);
260} 261}
261EXPORT_SYMBOL(__xfrm_policy_destroy); 262EXPORT_SYMBOL(__xfrm_policy_destroy);
@@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
350 351
351 write_lock_bh(&xfrm_policy_lock); 352 write_lock_bh(&xfrm_policy_lock);
352 for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { 353 for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
353 if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) { 354 if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 &&
355 xfrm_sec_ctx_match(pol->security, policy->security)) {
354 if (excl) { 356 if (excl) {
355 write_unlock_bh(&xfrm_policy_lock); 357 write_unlock_bh(&xfrm_policy_lock);
356 return -EEXIST; 358 return -EEXIST;
@@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
416} 418}
417EXPORT_SYMBOL(xfrm_policy_insert); 419EXPORT_SYMBOL(xfrm_policy_insert);
418 420
419struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, 421struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
420 int delete) 422 struct xfrm_sec_ctx *ctx, int delete)
421{ 423{
422 struct xfrm_policy *pol, **p; 424 struct xfrm_policy *pol, **p;
423 425
424 write_lock_bh(&xfrm_policy_lock); 426 write_lock_bh(&xfrm_policy_lock);
425 for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { 427 for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
426 if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) { 428 if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) &&
429 (xfrm_sec_ctx_match(ctx, pol->security))) {
427 xfrm_pol_hold(pol); 430 xfrm_pol_hold(pol);
428 if (delete) 431 if (delete)
429 *p = pol->next; 432 *p = pol->next;
@@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
438 } 441 }
439 return pol; 442 return pol;
440} 443}
441EXPORT_SYMBOL(xfrm_policy_bysel); 444EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
442 445
443struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete) 446struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
444{ 447{
@@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk);
519 522
520/* Find policy to apply to this flow. */ 523/* Find policy to apply to this flow. */
521 524
522static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, 525static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir,
523 void **objp, atomic_t **obj_refp) 526 void **objp, atomic_t **obj_refp)
524{ 527{
525 struct xfrm_policy *pol; 528 struct xfrm_policy *pol;
@@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
533 continue; 536 continue;
534 537
535 match = xfrm_selector_match(sel, fl, family); 538 match = xfrm_selector_match(sel, fl, family);
539
536 if (match) { 540 if (match) {
537 xfrm_pol_hold(pol); 541 if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) {
538 break; 542 xfrm_pol_hold(pol);
543 break;
544 }
539 } 545 }
540 } 546 }
541 read_unlock_bh(&xfrm_policy_lock); 547 read_unlock_bh(&xfrm_policy_lock);
@@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
543 *obj_refp = &pol->refcnt; 549 *obj_refp = &pol->refcnt;
544} 550}
545 551
546static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) 552static inline int policy_to_flow_dir(int dir)
553{
554 if (XFRM_POLICY_IN == FLOW_DIR_IN &&
555 XFRM_POLICY_OUT == FLOW_DIR_OUT &&
556 XFRM_POLICY_FWD == FLOW_DIR_FWD)
557 return dir;
558 switch (dir) {
559 default:
560 case XFRM_POLICY_IN:
561 return FLOW_DIR_IN;
562 case XFRM_POLICY_OUT:
563 return FLOW_DIR_OUT;
564 case XFRM_POLICY_FWD:
565 return FLOW_DIR_FWD;
566 };
567}
568
569static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid)
547{ 570{
548 struct xfrm_policy *pol; 571 struct xfrm_policy *pol;
549 572
550 read_lock_bh(&xfrm_policy_lock); 573 read_lock_bh(&xfrm_policy_lock);
551 if ((pol = sk->sk_policy[dir]) != NULL) { 574 if ((pol = sk->sk_policy[dir]) != NULL) {
552 int match = xfrm_selector_match(&pol->selector, fl, 575 int match = xfrm_selector_match(&pol->selector, fl,
553 sk->sk_family); 576 sk->sk_family);
577 int err = 0;
578
554 if (match) 579 if (match)
580 err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir));
581
582 if (match && !err)
555 xfrm_pol_hold(pol); 583 xfrm_pol_hold(pol);
556 else 584 else
557 pol = NULL; 585 pol = NULL;
@@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
624 652
625 if (newp) { 653 if (newp) {
626 newp->selector = old->selector; 654 newp->selector = old->selector;
655 if (security_xfrm_policy_clone(old, newp)) {
656 kfree(newp);
657 return NULL; /* ENOMEM */
658 }
627 newp->lft = old->lft; 659 newp->lft = old->lft;
628 newp->curlft = old->curlft; 660 newp->curlft = old->curlft;
629 newp->action = old->action; 661 newp->action = old->action;
@@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
735 return err; 767 return err;
736} 768}
737 769
738static inline int policy_to_flow_dir(int dir)
739{
740 if (XFRM_POLICY_IN == FLOW_DIR_IN &&
741 XFRM_POLICY_OUT == FLOW_DIR_OUT &&
742 XFRM_POLICY_FWD == FLOW_DIR_FWD)
743 return dir;
744 switch (dir) {
745 default:
746 case XFRM_POLICY_IN:
747 return FLOW_DIR_IN;
748 case XFRM_POLICY_OUT:
749 return FLOW_DIR_OUT;
750 case XFRM_POLICY_FWD:
751 return FLOW_DIR_FWD;
752 };
753}
754 770
755static int stale_bundle(struct dst_entry *dst); 771static int stale_bundle(struct dst_entry *dst);
756 772
@@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
769 int err; 785 int err;
770 u32 genid; 786 u32 genid;
771 u16 family = dst_orig->ops->family; 787 u16 family = dst_orig->ops->family;
788 u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
789 u32 sk_sid = security_sk_sid(sk, fl, dir);
772restart: 790restart:
773 genid = atomic_read(&flow_cache_genid); 791 genid = atomic_read(&flow_cache_genid);
774 policy = NULL; 792 policy = NULL;
775 if (sk && sk->sk_policy[1]) 793 if (sk && sk->sk_policy[1])
776 policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); 794 policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid);
777 795
778 if (!policy) { 796 if (!policy) {
779 /* To accelerate a bit... */ 797 /* To accelerate a bit... */
780 if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT]) 798 if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
781 return 0; 799 return 0;
782 800
783 policy = flow_cache_lookup(fl, family, 801 policy = flow_cache_lookup(fl, sk_sid, family, dir,
784 policy_to_flow_dir(XFRM_POLICY_OUT),
785 xfrm_policy_lookup); 802 xfrm_policy_lookup);
786 } 803 }
787 804
@@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
962{ 979{
963 struct xfrm_policy *pol; 980 struct xfrm_policy *pol;
964 struct flowi fl; 981 struct flowi fl;
982 u8 fl_dir = policy_to_flow_dir(dir);
983 u32 sk_sid;
965 984
966 if (_decode_session(skb, &fl, family) < 0) 985 if (_decode_session(skb, &fl, family) < 0)
967 return 0; 986 return 0;
968 987
988 sk_sid = security_sk_sid(sk, &fl, fl_dir);
989
969 /* First, check used SA against their selectors. */ 990 /* First, check used SA against their selectors. */
970 if (skb->sp) { 991 if (skb->sp) {
971 int i; 992 int i;
972 993
973 for (i=skb->sp->len-1; i>=0; i--) { 994 for (i=skb->sp->len-1; i>=0; i--) {
974 struct sec_decap_state *xvec = &(skb->sp->x[i]); 995 struct sec_decap_state *xvec = &(skb->sp->x[i]);
975 if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family)) 996 if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
976 return 0; 997 return 0;
977 998
@@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
986 1007
987 pol = NULL; 1008 pol = NULL;
988 if (sk && sk->sk_policy[dir]) 1009 if (sk && sk->sk_policy[dir])
989 pol = xfrm_sk_policy_lookup(sk, dir, &fl); 1010 pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid);
990 1011
991 if (!pol) 1012 if (!pol)
992 pol = flow_cache_lookup(&fl, family, 1013 pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir,
993 policy_to_flow_dir(dir),
994 xfrm_policy_lookup); 1014 xfrm_policy_lookup);
995 1015
996 if (!pol) 1016 if (!pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 479effc97666..e12d0be5f976 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -10,7 +10,7 @@
10 * Split up af-specific functions 10 * Split up af-specific functions
11 * Derek Atkins <derek@ihtfp.com> 11 * Derek Atkins <derek@ihtfp.com>
12 * Add UDP Encapsulation 12 * Add UDP Encapsulation
13 * 13 *
14 */ 14 */
15 15
16#include <linux/workqueue.h> 16#include <linux/workqueue.h>
@@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
70 x->type->destructor(x); 70 x->type->destructor(x);
71 xfrm_put_type(x->type); 71 xfrm_put_type(x->type);
72 } 72 }
73 security_xfrm_state_free(x);
73 kfree(x); 74 kfree(x);
74} 75}
75 76
@@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
343 selector. 344 selector.
344 */ 345 */
345 if (x->km.state == XFRM_STATE_VALID) { 346 if (x->km.state == XFRM_STATE_VALID) {
346 if (!xfrm_selector_match(&x->sel, fl, family)) 347 if (!xfrm_selector_match(&x->sel, fl, family) ||
348 !xfrm_sec_ctx_match(pol->security, x->security))
347 continue; 349 continue;
348 if (!best || 350 if (!best ||
349 best->km.dying > x->km.dying || 351 best->km.dying > x->km.dying ||
@@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
354 acquire_in_progress = 1; 356 acquire_in_progress = 1;
355 } else if (x->km.state == XFRM_STATE_ERROR || 357 } else if (x->km.state == XFRM_STATE_ERROR ||
356 x->km.state == XFRM_STATE_EXPIRED) { 358 x->km.state == XFRM_STATE_EXPIRED) {
357 if (xfrm_selector_match(&x->sel, fl, family)) 359 if (xfrm_selector_match(&x->sel, fl, family) &&
360 xfrm_sec_ctx_match(pol->security, x->security))
358 error = -ESRCH; 361 error = -ESRCH;
359 } 362 }
360 } 363 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0cdd9a07e043..92e2b804c606 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -7,7 +7,7 @@
7 * Kazunori MIYAZAWA @USAGI 7 * Kazunori MIYAZAWA @USAGI
8 * Kunihiro Ishiguro <kunihiro@ipinfusion.com> 8 * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
9 * IPv6 support 9 * IPv6 support
10 * 10 *
11 */ 11 */
12 12
13#include <linux/module.h> 13#include <linux/module.h>
@@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma)
88 return 0; 88 return 0;
89} 89}
90 90
91
92static inline int verify_sec_ctx_len(struct rtattr **xfrma)
93{
94 struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1];
95 struct xfrm_user_sec_ctx *uctx;
96 int len = 0;
97
98 if (!rt)
99 return 0;
100
101 if (rt->rta_len < sizeof(*uctx))
102 return -EINVAL;
103
104 uctx = RTA_DATA(rt);
105
106 if (uctx->ctx_len > PAGE_SIZE)
107 return -EINVAL;
108
109 len += sizeof(struct xfrm_user_sec_ctx);
110 len += uctx->ctx_len;
111
112 if (uctx->len != len)
113 return -EINVAL;
114
115 return 0;
116}
117
118
91static int verify_newsa_info(struct xfrm_usersa_info *p, 119static int verify_newsa_info(struct xfrm_usersa_info *p,
92 struct rtattr **xfrma) 120 struct rtattr **xfrma)
93{ 121{
@@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
145 goto out; 173 goto out;
146 if ((err = verify_encap_tmpl(xfrma))) 174 if ((err = verify_encap_tmpl(xfrma)))
147 goto out; 175 goto out;
176 if ((err = verify_sec_ctx_len(xfrma)))
177 goto out;
148 178
149 err = -EINVAL; 179 err = -EINVAL;
150 switch (p->mode) { 180 switch (p->mode) {
@@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a
209 return 0; 239 return 0;
210} 240}
211 241
242
243static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp)
244{
245 struct xfrm_sec_ctx *xfrm_ctx = xp->security;
246 int len = 0;
247
248 if (xfrm_ctx) {
249 len += sizeof(struct xfrm_user_sec_ctx);
250 len += xfrm_ctx->ctx_len;
251 }
252 return len;
253}
254
255static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg)
256{
257 struct xfrm_user_sec_ctx *uctx;
258
259 if (!u_arg)
260 return 0;
261
262 uctx = RTA_DATA(u_arg);
263 return security_xfrm_state_alloc(x, uctx);
264}
265
212static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) 266static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
213{ 267{
214 memcpy(&x->id, &p->id, sizeof(x->id)); 268 memcpy(&x->id, &p->id, sizeof(x->id));
@@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
253 if (err) 307 if (err)
254 goto error; 308 goto error;
255 309
310 if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1])))
311 goto error;
312
256 x->km.seq = p->seq; 313 x->km.seq = p->seq;
257 314
258 return x; 315 return x;
@@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
272 int err; 329 int err;
273 struct km_event c; 330 struct km_event c;
274 331
275 err = verify_newsa_info(p, (struct rtattr **) xfrma); 332 err = verify_newsa_info(p, (struct rtattr **)xfrma);
276 if (err) 333 if (err)
277 return err; 334 return err;
278 335
279 x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err); 336 x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err);
280 if (!x) 337 if (!x)
281 return err; 338 return err;
282 339
@@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
390 if (x->encap) 447 if (x->encap)
391 RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); 448 RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
392 449
450 if (x->security) {
451 int ctx_size = sizeof(struct xfrm_sec_ctx) +
452 x->security->ctx_len;
453 struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
454 struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
455
456 uctx->exttype = XFRMA_SEC_CTX;
457 uctx->len = ctx_size;
458 uctx->ctx_doi = x->security->ctx_doi;
459 uctx->ctx_alg = x->security->ctx_alg;
460 uctx->ctx_len = x->security->ctx_len;
461 memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len);
462 }
393 nlh->nlmsg_len = skb->tail - b; 463 nlh->nlmsg_len = skb->tail - b;
394out: 464out:
395 sp->this_idx++; 465 sp->this_idx++;
@@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
603 return verify_policy_dir(p->dir); 673 return verify_policy_dir(p->dir);
604} 674}
605 675
676static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma)
677{
678 struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1];
679 struct xfrm_user_sec_ctx *uctx;
680
681 if (!rt)
682 return 0;
683
684 uctx = RTA_DATA(rt);
685 return security_xfrm_policy_alloc(pol, uctx);
686}
687
606static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, 688static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
607 int nr) 689 int nr)
608{ 690{
@@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p,
681 } 763 }
682 764
683 copy_from_user_policy(xp, p); 765 copy_from_user_policy(xp, p);
684 err = copy_from_user_tmpl(xp, xfrma); 766
767 if (!(err = copy_from_user_tmpl(xp, xfrma)))
768 err = copy_from_user_sec_ctx(xp, xfrma);
769
685 if (err) { 770 if (err) {
686 *errp = err; 771 *errp = err;
687 kfree(xp); 772 kfree(xp);
@@ -702,8 +787,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
702 err = verify_newpolicy_info(p); 787 err = verify_newpolicy_info(p);
703 if (err) 788 if (err)
704 return err; 789 return err;
790 err = verify_sec_ctx_len((struct rtattr **)xfrma);
791 if (err)
792 return err;
705 793
706 xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err); 794 xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err);
707 if (!xp) 795 if (!xp)
708 return err; 796 return err;
709 797
@@ -761,6 +849,27 @@ rtattr_failure:
761 return -1; 849 return -1;
762} 850}
763 851
852static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
853{
854 if (xp->security) {
855 int ctx_size = sizeof(struct xfrm_sec_ctx) +
856 xp->security->ctx_len;
857 struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
858 struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
859
860 uctx->exttype = XFRMA_SEC_CTX;
861 uctx->len = ctx_size;
862 uctx->ctx_doi = xp->security->ctx_doi;
863 uctx->ctx_alg = xp->security->ctx_alg;
864 uctx->ctx_len = xp->security->ctx_len;
865 memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len);
866 }
867 return 0;
868
869 rtattr_failure:
870 return -1;
871}
872
764static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) 873static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
765{ 874{
766 struct xfrm_dump_info *sp = ptr; 875 struct xfrm_dump_info *sp = ptr;
@@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
782 copy_to_user_policy(xp, p, dir); 891 copy_to_user_policy(xp, p, dir);
783 if (copy_to_user_tmpl(xp, skb) < 0) 892 if (copy_to_user_tmpl(xp, skb) < 0)
784 goto nlmsg_failure; 893 goto nlmsg_failure;
894 if (copy_to_user_sec_ctx(xp, skb))
895 goto nlmsg_failure;
785 896
786 nlh->nlmsg_len = skb->tail - b; 897 nlh->nlmsg_len = skb->tail - b;
787out: 898out:
@@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
852 963
853 if (p->index) 964 if (p->index)
854 xp = xfrm_policy_byid(p->dir, p->index, delete); 965 xp = xfrm_policy_byid(p->dir, p->index, delete);
855 else 966 else {
856 xp = xfrm_policy_bysel(p->dir, &p->sel, delete); 967 struct rtattr **rtattrs = (struct rtattr **)xfrma;
968 struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1];
969 struct xfrm_policy tmp;
970
971 err = verify_sec_ctx_len(rtattrs);
972 if (err)
973 return err;
974
975 memset(&tmp, 0, sizeof(struct xfrm_policy));
976 if (rt) {
977 struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
978
979 if ((err = security_xfrm_policy_alloc(&tmp, uctx)))
980 return err;
981 }
982 xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete);
983 security_xfrm_policy_free(&tmp);
984 }
857 if (xp == NULL) 985 if (xp == NULL)
858 return -ENOENT; 986 return -ENOENT;
859 987
@@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
1224 1352
1225 if (copy_to_user_tmpl(xp, skb) < 0) 1353 if (copy_to_user_tmpl(xp, skb) < 0)
1226 goto nlmsg_failure; 1354 goto nlmsg_failure;
1355 if (copy_to_user_sec_ctx(xp, skb))
1356 goto nlmsg_failure;
1227 1357
1228 nlh->nlmsg_len = skb->tail - b; 1358 nlh->nlmsg_len = skb->tail - b;
1229 return skb->len; 1359 return skb->len;
@@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
1241 1371
1242 len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); 1372 len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
1243 len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire)); 1373 len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
1374 len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
1244 skb = alloc_skb(len, GFP_ATOMIC); 1375 skb = alloc_skb(len, GFP_ATOMIC);
1245 if (skb == NULL) 1376 if (skb == NULL)
1246 return -ENOMEM; 1377 return -ENOMEM;
@@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
1324 copy_to_user_policy(xp, &upe->pol, dir); 1455 copy_to_user_policy(xp, &upe->pol, dir);
1325 if (copy_to_user_tmpl(xp, skb) < 0) 1456 if (copy_to_user_tmpl(xp, skb) < 0)
1326 goto nlmsg_failure; 1457 goto nlmsg_failure;
1458 if (copy_to_user_sec_ctx(xp, skb))
1459 goto nlmsg_failure;
1327 upe->hard = !!hard; 1460 upe->hard = !!hard;
1328 1461
1329 nlh->nlmsg_len = skb->tail - b; 1462 nlh->nlmsg_len = skb->tail - b;
@@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
1341 1474
1342 len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); 1475 len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
1343 len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire)); 1476 len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire));
1477 len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
1344 skb = alloc_skb(len, GFP_ATOMIC); 1478 skb = alloc_skb(len, GFP_ATOMIC);
1345 if (skb == NULL) 1479 if (skb == NULL)
1346 return -ENOMEM; 1480 return -ENOMEM;
diff --git a/security/Kconfig b/security/Kconfig
index 64d3f1e9ca85..34f593410d57 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,19 @@ config SECURITY_NETWORK
54 implement socket and networking access controls. 54 implement socket and networking access controls.
55 If you are unsure how to answer this question, answer N. 55 If you are unsure how to answer this question, answer N.
56 56
57config SECURITY_NETWORK_XFRM
58 bool "XFRM (IPSec) Networking Security Hooks"
59 depends on XFRM && SECURITY_NETWORK
60 help
61 This enables the XFRM (IPSec) networking security hooks.
62 If enabled, a security module can use these hooks to
63 implement per-packet access controls based on labels
64 derived from IPSec policy. Non-IPSec communications are
65 designated as unlabelled, and only sockets authorized
66 to communicate unlabelled data can send without using
67 IPSec.
68 If you are unsure how to answer this question, answer N.
69
57config SECURITY_CAPABILITIES 70config SECURITY_CAPABILITIES
58 tristate "Default Linux Capabilities" 71 tristate "Default Linux Capabilities"
59 depends on SECURITY 72 depends on SECURITY
diff --git a/security/dummy.c b/security/dummy.c
index 3ca5f2b828a0..a15c54709fde 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -776,8 +776,42 @@ static inline int dummy_sk_alloc_security (struct sock *sk, int family, gfp_t pr
776static inline void dummy_sk_free_security (struct sock *sk) 776static inline void dummy_sk_free_security (struct sock *sk)
777{ 777{
778} 778}
779
780static unsigned int dummy_sk_getsid(struct sock *sk, struct flowi *fl, u8 dir)
781{
782 return 0;
783}
779#endif /* CONFIG_SECURITY_NETWORK */ 784#endif /* CONFIG_SECURITY_NETWORK */
780 785
786#ifdef CONFIG_SECURITY_NETWORK_XFRM
/* Default xfrm_policy_alloc_security hook: attaches nothing and succeeds. */
static int dummy_xfrm_policy_alloc_security(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
{
	return 0;
}
791
/* Default xfrm_policy_clone_security hook: nothing to copy, always succeeds. */
static inline int dummy_xfrm_policy_clone_security(struct xfrm_policy *old, struct xfrm_policy *new)
{
	return 0;
}
796
/* Default xfrm_policy_free_security hook: no blob was attached, nothing to release. */
static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp)
{
}
800
/* Default xfrm_state_alloc_security hook: attaches nothing and succeeds. */
static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
{
	return 0;
}
805
/* Default xfrm_state_free_security hook: no blob was attached, nothing to release. */
static void dummy_xfrm_state_free_security(struct xfrm_state *x)
{
}
809
810static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
811{
812 return 0;
813}
814#endif /* CONFIG_SECURITY_NETWORK_XFRM */
781static int dummy_register_security (const char *name, struct security_operations *ops) 815static int dummy_register_security (const char *name, struct security_operations *ops)
782{ 816{
783 return -EINVAL; 817 return -EINVAL;
@@ -970,7 +1004,16 @@ void security_fixup_ops (struct security_operations *ops)
970 set_to_dummy_if_null(ops, socket_getpeersec); 1004 set_to_dummy_if_null(ops, socket_getpeersec);
971 set_to_dummy_if_null(ops, sk_alloc_security); 1005 set_to_dummy_if_null(ops, sk_alloc_security);
972 set_to_dummy_if_null(ops, sk_free_security); 1006 set_to_dummy_if_null(ops, sk_free_security);
973#endif /* CONFIG_SECURITY_NETWORK */ 1007 set_to_dummy_if_null(ops, sk_getsid);
1008 #endif /* CONFIG_SECURITY_NETWORK */
1009#ifdef CONFIG_SECURITY_NETWORK_XFRM
1010 set_to_dummy_if_null(ops, xfrm_policy_alloc_security);
1011 set_to_dummy_if_null(ops, xfrm_policy_clone_security);
1012 set_to_dummy_if_null(ops, xfrm_policy_free_security);
1013 set_to_dummy_if_null(ops, xfrm_state_alloc_security);
1014 set_to_dummy_if_null(ops, xfrm_state_free_security);
1015 set_to_dummy_if_null(ops, xfrm_policy_lookup);
1016#endif /* CONFIG_SECURITY_NETWORK_XFRM */
974#ifdef CONFIG_KEYS 1017#ifdef CONFIG_KEYS
975 set_to_dummy_if_null(ops, key_alloc); 1018 set_to_dummy_if_null(ops, key_alloc);
976 set_to_dummy_if_null(ops, key_free); 1019 set_to_dummy_if_null(ops, key_free);
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index b038cd0fae2e..06d54d9d20a5 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o
8 8
9selinux-$(CONFIG_SECURITY_NETWORK) += netif.o 9selinux-$(CONFIG_SECURITY_NETWORK) += netif.o
10 10
11selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
12
11EXTRA_CFLAGS += -Isecurity/selinux/include 13EXTRA_CFLAGS += -Isecurity/selinux/include
12 14
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index fc774436a264..3d496eae1b47 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -73,6 +73,7 @@
73#include "avc.h" 73#include "avc.h"
74#include "objsec.h" 74#include "objsec.h"
75#include "netif.h" 75#include "netif.h"
76#include "xfrm.h"
76 77
77#define XATTR_SELINUX_SUFFIX "selinux" 78#define XATTR_SELINUX_SUFFIX "selinux"
78#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX 79#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
@@ -3349,6 +3350,10 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
3349 err = avc_has_perm(sock_sid, port_sid, 3350 err = avc_has_perm(sock_sid, port_sid,
3350 sock_class, recv_perm, &ad); 3351 sock_class, recv_perm, &ad);
3351 } 3352 }
3353
3354 if (!err)
3355 err = selinux_xfrm_sock_rcv_skb(sock_sid, skb);
3356
3352out: 3357out:
3353 return err; 3358 return err;
3354} 3359}
@@ -3401,6 +3406,24 @@ static void selinux_sk_free_security(struct sock *sk)
3401 sk_free_security(sk); 3406 sk_free_security(sk);
3402} 3407}
3403 3408
3409static unsigned int selinux_sk_getsid_security(struct sock *sk, struct flowi *fl, u8 dir)
3410{
3411 struct inode_security_struct *isec;
3412 u32 sock_sid = SECINITSID_ANY_SOCKET;
3413
3414 if (!sk)
3415 return selinux_no_sk_sid(fl);
3416
3417 read_lock_bh(&sk->sk_callback_lock);
3418 isec = get_sock_isec(sk);
3419
3420 if (isec)
3421 sock_sid = isec->sid;
3422
3423 read_unlock_bh(&sk->sk_callback_lock);
3424 return sock_sid;
3425}
3426
3404static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb) 3427static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb)
3405{ 3428{
3406 int err = 0; 3429 int err = 0;
@@ -3536,6 +3559,11 @@ static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
3536 send_perm, &ad) ? NF_DROP : NF_ACCEPT; 3559 send_perm, &ad) ? NF_DROP : NF_ACCEPT;
3537 } 3560 }
3538 3561
3562 if (err != NF_ACCEPT)
3563 goto out;
3564
3565 err = selinux_xfrm_postroute_last(isec->sid, skb);
3566
3539out: 3567out:
3540 return err; 3568 return err;
3541} 3569}
@@ -4380,6 +4408,16 @@ static struct security_operations selinux_ops = {
4380 .socket_getpeersec = selinux_socket_getpeersec, 4408 .socket_getpeersec = selinux_socket_getpeersec,
4381 .sk_alloc_security = selinux_sk_alloc_security, 4409 .sk_alloc_security = selinux_sk_alloc_security,
4382 .sk_free_security = selinux_sk_free_security, 4410 .sk_free_security = selinux_sk_free_security,
4411 .sk_getsid = selinux_sk_getsid_security,
4412#endif
4413
4414#ifdef CONFIG_SECURITY_NETWORK_XFRM
4415 .xfrm_policy_alloc_security = selinux_xfrm_policy_alloc,
4416 .xfrm_policy_clone_security = selinux_xfrm_policy_clone,
4417 .xfrm_policy_free_security = selinux_xfrm_policy_free,
4418 .xfrm_state_alloc_security = selinux_xfrm_state_alloc,
4419 .xfrm_state_free_security = selinux_xfrm_state_free,
4420 .xfrm_policy_lookup = selinux_xfrm_policy_lookup,
4383#endif 4421#endif
4384}; 4422};
4385 4423
@@ -4491,6 +4529,7 @@ static int __init selinux_nf_ip_init(void)
4491 panic("SELinux: nf_register_hook for IPv6: error %d\n", err); 4529 panic("SELinux: nf_register_hook for IPv6: error %d\n", err);
4492 4530
4493#endif /* IPV6 */ 4531#endif /* IPV6 */
4532
4494out: 4533out:
4495 return err; 4534 return err;
4496} 4535}
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index 1deb59e1b762..71aeb12f07c8 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -238,3 +238,5 @@
238 S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost") 238 S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost")
239 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto") 239 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto")
240 S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom") 240 S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom")
241 S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELFROM, "relabelfrom")
242 S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELTO, "relabelto")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index a78b5d59c9fc..d1d0996049e3 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -908,6 +908,8 @@
908 908
909#define ASSOCIATION__SENDTO 0x00000001UL 909#define ASSOCIATION__SENDTO 0x00000001UL
910#define ASSOCIATION__RECVFROM 0x00000002UL 910#define ASSOCIATION__RECVFROM 0x00000002UL
911#define ASSOCIATION__RELABELFROM 0x00000004UL
912#define ASSOCIATION__RELABELTO 0x00000008UL
911 913
912#define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL 914#define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL
913#define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL 915#define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
new file mode 100644
index 000000000000..8e87996c6dd5
--- /dev/null
+++ b/security/selinux/include/xfrm.h
@@ -0,0 +1,54 @@
1/*
2 * SELinux support for the XFRM LSM hooks
3 *
4 * Author : Trent Jaeger, <jaegert@us.ibm.com>
5 */
6#ifndef _SELINUX_XFRM_H_
7#define _SELINUX_XFRM_H_
8
9int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
10int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new);
11void selinux_xfrm_policy_free(struct xfrm_policy *xp);
12int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
13void selinux_xfrm_state_free(struct xfrm_state *x);
14int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
15
16/*
17 * Extract the security blob from the sock (it's actually on the socket)
18 */
19static inline struct inode_security_struct *get_sock_isec(struct sock *sk)
20{
21 if (!sk->sk_socket)
22 return NULL;
23
24 return SOCK_INODE(sk->sk_socket)->i_security;
25}
26
27
28static inline u32 selinux_no_sk_sid(struct flowi *fl)
29{
30 /* NOTE: no sock occurs on ICMP reply, forwards, ... */
31 /* icmp_reply: authorize as kernel packet */
32 if (fl && fl->proto == IPPROTO_ICMP) {
33 return SECINITSID_KERNEL;
34 }
35
36 return SECINITSID_ANY_SOCKET;
37}
38
39#ifdef CONFIG_SECURITY_NETWORK_XFRM
40int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb);
41int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb);
42#else
43static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
44{
45 return 0;
46}
47
48static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
49{
50 return NF_ACCEPT;
51}
52#endif
53
54#endif /* _SELINUX_XFRM_H_ */
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
new file mode 100644
index 000000000000..c4d87d4dca7b
--- /dev/null
+++ b/security/selinux/xfrm.c
@@ -0,0 +1,311 @@
1/*
2 * NSA Security-Enhanced Linux (SELinux) security module
3 *
4 * This file contains the SELinux XFRM hook function implementations.
5 *
6 * Authors: Serge Hallyn <sergeh@us.ibm.com>
7 * Trent Jaeger <jaegert@us.ibm.com>
8 *
9 * Copyright (C) 2005 International Business Machines Corporation
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2,
13 * as published by the Free Software Foundation.
14 */
15
16/*
17 * USAGE:
18 * NOTES:
19 * 1. Make sure to enable the following options in your kernel config:
20 * CONFIG_SECURITY=y
21 * CONFIG_SECURITY_NETWORK=y
22 * CONFIG_SECURITY_NETWORK_XFRM=y
23 * CONFIG_SECURITY_SELINUX=m/y
24 * ISSUES:
25 * 1. Caching packets, so they are not dropped during negotiation
26 * 2. Emulating a reasonable SO_PEERSEC across machines
27 * 3. Testing addition of sk_policy's with security context via setsockopt
28 */
29#include <linux/config.h>
30#include <linux/module.h>
31#include <linux/kernel.h>
32#include <linux/init.h>
33#include <linux/security.h>
34#include <linux/types.h>
35#include <linux/netfilter.h>
36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv6.h>
38#include <linux/ip.h>
39#include <linux/tcp.h>
40#include <linux/skbuff.h>
41#include <linux/xfrm.h>
42#include <net/xfrm.h>
43#include <net/checksum.h>
44#include <net/udp.h>
45#include <asm/semaphore.h>
46
47#include "avc.h"
48#include "objsec.h"
49#include "xfrm.h"
50
51
52/*
53 * Returns true if an LSM/SELinux context
54 */
55static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx)
56{
57 return (ctx &&
58 (ctx->ctx_doi == XFRM_SC_DOI_LSM) &&
59 (ctx->ctx_alg == XFRM_SC_ALG_SELINUX));
60}
61
62/*
63 * Returns true if the xfrm contains a security blob for SELinux
64 */
65static inline int selinux_authorizable_xfrm(struct xfrm_state *x)
66{
67 return selinux_authorizable_ctx(x->security);
68}
69
70/*
71 * LSM hook implementation that authorizes that a socket can be used
72 * with the corresponding xfrm_sec_ctx and direction.
73 */
74int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
75{
76 int rc = 0;
77 u32 sel_sid = SECINITSID_UNLABELED;
78 struct xfrm_sec_ctx *ctx;
79
80 /* Context sid is either set to label or ANY_ASSOC */
81 if ((ctx = xp->security)) {
82 if (!selinux_authorizable_ctx(ctx))
83 return -EINVAL;
84
85 sel_sid = ctx->ctx_sid;
86 }
87
88 rc = avc_has_perm(sk_sid, sel_sid, SECCLASS_ASSOCIATION,
89 ((dir == FLOW_DIR_IN) ? ASSOCIATION__RECVFROM :
90 ((dir == FLOW_DIR_OUT) ? ASSOCIATION__SENDTO :
91 (ASSOCIATION__SENDTO | ASSOCIATION__RECVFROM))),
92 NULL);
93
94 return rc;
95}
96
97/*
98 * Security blob allocation for xfrm_policy and xfrm_state
99 * CTX does not have a meaningful value on input
100 */
101static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx)
102{
103 int rc = 0;
104 struct task_security_struct *tsec = current->security;
105 struct xfrm_sec_ctx *ctx;
106
107 BUG_ON(!uctx);
108 BUG_ON(uctx->ctx_doi != XFRM_SC_ALG_SELINUX);
109
110 if (uctx->ctx_len >= PAGE_SIZE)
111 return -ENOMEM;
112
113 *ctxp = ctx = kmalloc(sizeof(*ctx) +
114 uctx->ctx_len,
115 GFP_KERNEL);
116
117 if (!ctx)
118 return -ENOMEM;
119
120 ctx->ctx_doi = uctx->ctx_doi;
121 ctx->ctx_len = uctx->ctx_len;
122 ctx->ctx_alg = uctx->ctx_alg;
123
124 memcpy(ctx->ctx_str,
125 uctx+1,
126 ctx->ctx_len);
127 rc = security_context_to_sid(ctx->ctx_str,
128 ctx->ctx_len,
129 &ctx->ctx_sid);
130
131 if (rc)
132 goto out;
133
134 /*
135 * Does the subject have permission to set security or permission to
136 * do the relabel?
137 * Must be permitted to relabel from default socket type (process type)
138 * to specified context
139 */
140 rc = avc_has_perm(tsec->sid, tsec->sid,
141 SECCLASS_ASSOCIATION,
142 ASSOCIATION__RELABELFROM, NULL);
143 if (rc)
144 goto out;
145
146 rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
147 SECCLASS_ASSOCIATION,
148 ASSOCIATION__RELABELTO, NULL);
149 if (rc)
150 goto out;
151
152 return rc;
153
154out:
155 *ctxp = 0;
156 kfree(ctx);
157 return rc;
158}
159
160/*
161 * LSM hook implementation that allocs and transfers uctx spec to
162 * xfrm_policy.
163 */
164int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *uctx)
165{
166 int err;
167
168 BUG_ON(!xp);
169
170 err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx);
171 return err;
172}
173
174
175/*
176 * LSM hook implementation that copies security data structure from old to
177 * new for policy cloning.
178 */
179int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
180{
181 struct xfrm_sec_ctx *old_ctx, *new_ctx;
182
183 old_ctx = old->security;
184
185 if (old_ctx) {
186 new_ctx = new->security = kmalloc(sizeof(*new_ctx) +
187 old_ctx->ctx_len,
188 GFP_KERNEL);
189
190 if (!new_ctx)
191 return -ENOMEM;
192
193 memcpy(new_ctx, old_ctx, sizeof(*new_ctx));
194 memcpy(new_ctx->ctx_str, old_ctx->ctx_str, new_ctx->ctx_len);
195 }
196 return 0;
197}
198
199/*
200 * LSM hook implementation that frees xfrm_policy security information.
201 */
202void selinux_xfrm_policy_free(struct xfrm_policy *xp)
203{
204 struct xfrm_sec_ctx *ctx = xp->security;
205 if (ctx)
206 kfree(ctx);
207}
208
209/*
210 * LSM hook implementation that allocs and transfers sec_ctx spec to
211 * xfrm_state.
212 */
213int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx)
214{
215 int err;
216
217 BUG_ON(!x);
218
219 err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx);
220 return err;
221}
222
223/*
224 * LSM hook implementation that frees xfrm_state security information.
225 */
226void selinux_xfrm_state_free(struct xfrm_state *x)
227{
228 struct xfrm_sec_ctx *ctx = x->security;
229 if (ctx)
230 kfree(ctx);
231}
232
233/*
234 * LSM hook that controls access to unlabelled packets. If
235 * a xfrm_state is authorizable (defined by macro) then it was
236 * already authorized by the IPSec process. If not, then
237 * we need to check for unlabelled access since this may not have
238 * gone thru the IPSec process.
239 */
240int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
241{
242 int i, rc = 0;
243 struct sec_path *sp;
244
245 sp = skb->sp;
246
247 if (sp) {
248 /*
249 * __xfrm_policy_check does not approve unless xfrm_policy_ok
250 * says that spi's match for policy and the socket.
251 *
252 * Only need to verify the existence of an authorizable sp.
253 */
254 for (i = 0; i < sp->len; i++) {
255 struct xfrm_state *x = sp->x[i].xvec;
256
257 if (x && selinux_authorizable_xfrm(x))
258 goto accept;
259 }
260 }
261
262 /* check SELinux sock for unlabelled access */
263 rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
264 ASSOCIATION__RECVFROM, NULL);
265 if (rc)
266 goto drop;
267
268accept:
269 return 0;
270
271drop:
272 return rc;
273}
274
275/*
276 * POSTROUTE_LAST hook's XFRM processing:
277 * If we have no security association, then we need to determine
278 * whether the socket is allowed to send to an unlabelled destination.
279 * If we do have a authorizable security association, then it has already been
280 * checked in xfrm_policy_lookup hook.
281 */
282int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
283{
284 struct dst_entry *dst;
285 int rc = 0;
286
287 dst = skb->dst;
288
289 if (dst) {
290 struct dst_entry *dst_test;
291
292 for (dst_test = dst; dst_test != 0;
293 dst_test = dst_test->child) {
294 struct xfrm_state *x = dst_test->xfrm;
295
296 if (x && selinux_authorizable_xfrm(x))
297 goto accept;
298 }
299 }
300
301 rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
302 ASSOCIATION__SENDTO, NULL);
303 if (rc)
304 goto drop;
305
306accept:
307 return NF_ACCEPT;
308
309drop:
310 return NF_DROP;
311}