Diffstat (limited to 'net')
-rw-r--r--  net/appletalk/ddp.c | 23
-rw-r--r--  net/atm/pvc.c | 2
-rw-r--r--  net/atm/svc.c | 2
-rw-r--r--  net/ax25/af_ax25.c | 6
-rw-r--r--  net/bluetooth/af_bluetooth.c | 5
-rw-r--r--  net/bluetooth/bnep/sock.c | 2
-rw-r--r--  net/bluetooth/cmtp/sock.c | 2
-rw-r--r--  net/bluetooth/hci_sock.c | 2
-rw-r--r--  net/bluetooth/hidp/sock.c | 2
-rw-r--r--  net/bluetooth/l2cap.c | 9
-rw-r--r--  net/bluetooth/rfcomm/sock.c | 4
-rw-r--r--  net/bluetooth/sco.c | 9
-rw-r--r--  net/bridge/br.c | 1
-rw-r--r--  net/bridge/br_device.c | 91
-rw-r--r--  net/bridge/br_if.c | 55
-rw-r--r--  net/bridge/br_input.c | 11
-rw-r--r--  net/bridge/br_netfilter.c | 4
-rw-r--r--  net/bridge/br_notify.c | 14
-rw-r--r--  net/bridge/br_private.h | 7
-rw-r--r--  net/bridge/br_stp_if.c | 5
-rw-r--r--  net/bridge/netfilter/Kconfig | 6
-rw-r--r--  net/bridge/netfilter/ebt_log.c | 73
-rw-r--r--  net/bridge/netfilter/ebt_ulog.c | 53
-rw-r--r--  net/core/datagram.c | 36
-rw-r--r--  net/core/dev.c | 1
-rw-r--r--  net/core/filter.c | 112
-rw-r--r--  net/core/flow.c | 8
-rw-r--r--  net/core/netpoll.c | 1
-rw-r--r--  net/core/pktgen.c | 6
-rw-r--r--  net/core/skbuff.c | 27
-rw-r--r--  net/core/sock.c | 21
-rw-r--r--  net/core/stream.c | 10
-rw-r--r--  net/dccp/Makefile | 4
-rw-r--r--  net/dccp/ackvec.c | 33
-rw-r--r--  net/dccp/ackvec.h | 12
-rw-r--r--  net/dccp/ccid.h | 2
-rw-r--r--  net/dccp/dccp.h | 24
-rw-r--r--  net/dccp/diag.c | 2
-rw-r--r--  net/dccp/input.c | 79
-rw-r--r--  net/dccp/ipv4.c | 305
-rw-r--r--  net/dccp/ipv6.c | 1261
-rw-r--r--  net/dccp/ipv6.h | 37
-rw-r--r--  net/dccp/minisocks.c | 23
-rw-r--r--  net/dccp/output.c | 47
-rw-r--r--  net/dccp/proto.c | 56
-rw-r--r--  net/decnet/af_decnet.c | 6
-rw-r--r--  net/decnet/dn_neigh.c | 13
-rw-r--r--  net/decnet/dn_nsp_in.c | 17
-rw-r--r--  net/econet/af_econet.c | 9
-rw-r--r--  net/ieee80211/ieee80211_rx.c | 5
-rw-r--r--  net/ipv4/Kconfig | 8
-rw-r--r--  net/ipv4/Makefile | 1
-rw-r--r--  net/ipv4/af_inet.c | 19
-rw-r--r--  net/ipv4/ah4.c | 1
-rw-r--r--  net/ipv4/arp.c | 1
-rw-r--r--  net/ipv4/devinet.c | 1
-rw-r--r--  net/ipv4/esp4.c | 1
-rw-r--r--  net/ipv4/fib_frontend.c | 1
-rw-r--r--  net/ipv4/fib_hash.c | 1
-rw-r--r--  net/ipv4/fib_rules.c | 1
-rw-r--r--  net/ipv4/fib_semantics.c | 2
-rw-r--r--  net/ipv4/fib_trie.c | 8
-rw-r--r--  net/ipv4/icmp.c | 1
-rw-r--r--  net/ipv4/igmp.c | 2
-rw-r--r--  net/ipv4/inet_connection_sock.c | 25
-rw-r--r--  net/ipv4/inet_diag.c | 14
-rw-r--r--  net/ipv4/inet_hashtables.c | 178
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 5
-rw-r--r--  net/ipv4/inetpeer.c | 1
-rw-r--r--  net/ipv4/ip_fragment.c | 68
-rw-r--r--  net/ipv4/ip_input.c | 1
-rw-r--r--  net/ipv4/ip_options.c | 1
-rw-r--r--  net/ipv4/ip_output.c | 1
-rw-r--r--  net/ipv4/ip_sockglue.c | 14
-rw-r--r--  net/ipv4/ipcomp.c | 1
-rw-r--r--  net/ipv4/ipconfig.c | 2
-rw-r--r--  net/ipv4/ipmr.c | 1
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c | 28
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c | 21
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 10
-rw-r--r--  net/ipv4/ipvs/ip_vs_dh.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c | 3
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c | 29
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c | 29
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_esp.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c | 24
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c | 3
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c | 2
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c | 2
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 175
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_amanda.c | 2
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_gre.c | 1
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_udp.c | 1
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c | 1
-rw-r--r--  net/ipv4/netfilter/ip_nat_snmp_basic.c | 2
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 199
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_physdev.c | 1
-rw-r--r--  net/ipv4/proc.c | 1
-rw-r--r--  net/ipv4/syncookies.c | 4
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 11
-rw-r--r--  net/ipv4/tcp.c | 10
-rw-r--r--  net/ipv4/tcp_bic.c | 85
-rw-r--r--  net/ipv4/tcp_cong.c | 28
-rw-r--r--  net/ipv4/tcp_cubic.c | 411
-rw-r--r--  net/ipv4/tcp_input.c | 99
-rw-r--r--  net/ipv4/tcp_ipv4.c | 269
-rw-r--r--  net/ipv4/tcp_minisocks.c | 16
-rw-r--r--  net/ipv4/tcp_output.c | 118
-rw-r--r--  net/ipv4/tcp_vegas.c | 4
-rw-r--r--  net/ipv4/udp.c | 22
-rw-r--r--  net/ipv6/Makefile | 3
-rw-r--r--  net/ipv6/addrconf.c | 2
-rw-r--r--  net/ipv6/af_inet6.c | 90
-rw-r--r--  net/ipv6/ah6.c | 1
-rw-r--r--  net/ipv6/esp6.c | 1
-rw-r--r--  net/ipv6/exthdrs.c | 4
-rw-r--r--  net/ipv6/inet6_connection_sock.c | 199
-rw-r--r--  net/ipv6/inet6_hashtables.c | 183
-rw-r--r--  net/ipv6/ip6_flowlabel.c | 2
-rw-r--r--  net/ipv6/ip6_output.c | 2
-rw-r--r--  net/ipv6/ipcomp6.c | 1
-rw-r--r--  net/ipv6/ipv6_sockglue.c | 24
-rw-r--r--  net/ipv6/mcast.c | 2
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c | 191
-rw-r--r--  net/ipv6/netfilter/ip6t_LOG.c | 1
-rw-r--r--  net/ipv6/netfilter/ip6t_ah.c | 1
-rw-r--r--  net/ipv6/netfilter/ip6t_esp.c | 1
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_reasm.c | 4
-rw-r--r--  net/ipv6/raw.c | 16
-rw-r--r--  net/ipv6/tcp_ipv6.c | 639
-rw-r--r--  net/ipv6/udp.c | 16
-rw-r--r--  net/ipx/af_ipx.c | 6
-rw-r--r--  net/irda/af_irda.c | 23
-rw-r--r--  net/key/af_key.c | 201
-rw-r--r--  net/llc/af_llc.c | 11
-rw-r--r--  net/netfilter/nfnetlink_log.c | 2
-rw-r--r--  net/netfilter/nfnetlink_queue.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 4
-rw-r--r--  net/netlink/genetlink.c | 2
-rw-r--r--  net/netrom/af_netrom.c | 16
-rw-r--r--  net/nonet.c | 5
-rw-r--r--  net/packet/af_packet.c | 10
-rw-r--r--  net/rose/af_rose.c | 2
-rw-r--r--  net/sched/sch_netem.c | 49
-rw-r--r--  net/sched/sch_teql.c | 1
-rw-r--r--  net/sctp/associola.c | 81
-rw-r--r--  net/sctp/input.c | 36
-rw-r--r--  net/sctp/ipv6.c | 2
-rw-r--r--  net/sctp/output.c | 17
-rw-r--r--  net/sctp/protocol.c | 3
-rw-r--r--  net/sctp/sm_sideeffect.c | 26
-rw-r--r--  net/sctp/sm_statefuns.c | 20
-rw-r--r--  net/sctp/socket.c | 691
-rw-r--r--  net/sctp/transport.c | 32
-rw-r--r--  net/socket.c | 243
-rw-r--r--  net/sunrpc/svcsock.c | 2
-rw-r--r--  net/unix/af_unix.c | 55
-rw-r--r--  net/unix/garbage.c | 4
-rw-r--r--  net/wanrouter/af_wanpipe.c | 6
-rw-r--r--  net/x25/af_x25.c | 6
-rw-r--r--  net/xfrm/xfrm_policy.c | 88
-rw-r--r--  net/xfrm/xfrm_state.c | 9
-rw-r--r--  net/xfrm/xfrm_user.c | 148
166 files changed, 5404 insertions(+), 2308 deletions(-)
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 7982656b9c83..a5144e43aae1 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -63,7 +63,7 @@
 #include <linux/atalk.h>
 
 struct datalink_proto *ddp_dl, *aarp_dl;
-static struct proto_ops atalk_dgram_ops;
+static const struct proto_ops atalk_dgram_ops;
 
 /**************************************************************************\
 *                                                                          *
@@ -1763,7 +1763,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
  */
 static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
-	int rc = -EINVAL;
+	int rc = -ENOIOCTLCMD;
 	struct sock *sk = sock->sk;
 	void __user *argp = (void __user *)arg;
 
@@ -1813,23 +1813,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		rc = atif_ioctl(cmd, argp);
 		rtnl_unlock();
 		break;
-		/* Physical layer ioctl calls */
-	case SIOCSIFLINK:
-	case SIOCGIFHWADDR:
-	case SIOCSIFHWADDR:
-	case SIOCGIFFLAGS:
-	case SIOCSIFFLAGS:
-	case SIOCGIFTXQLEN:
-	case SIOCSIFTXQLEN:
-	case SIOCGIFMTU:
-	case SIOCGIFCONF:
-	case SIOCADDMULTI:
-	case SIOCDELMULTI:
-	case SIOCGIFCOUNT:
-	case SIOCGIFINDEX:
-	case SIOCGIFNAME:
-		rc = dev_ioctl(cmd, argp);
-		break;
 	}
 
 	return rc;
@@ -1841,7 +1824,7 @@ static struct net_proto_family atalk_family_ops = {
 	.owner = THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
 	.family = PF_APPLETALK,
 	.owner = THIS_MODULE,
 	.release = atalk_release,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2684a92da22b..f2c541774dcd 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr,
 }
 
 
-static struct proto_ops pvc_proto_ops = {
+static const struct proto_ops pvc_proto_ops = {
 	.family = PF_ATMPVC,
 	.owner = THIS_MODULE,
 
diff --git a/net/atm/svc.c b/net/atm/svc.c
index d7b266136bf6..3a180cfd7b48 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return error;
 }
 
-static struct proto_ops svc_proto_ops = {
+static const struct proto_ops svc_proto_ops = {
 	.family = PF_ATMSVC,
 	.owner = THIS_MODULE,
 
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 1b683f302657..e8753c7fcad1 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -54,7 +54,7 @@
 HLIST_HEAD(ax25_list);
 DEFINE_SPINLOCK(ax25_list_lock);
 
-static struct proto_ops ax25_proto_ops;
+static const struct proto_ops ax25_proto_ops;
 
 static void ax25_free_sock(struct sock *sk)
 {
@@ -1827,7 +1827,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		break;
 
 	default:
-		res = dev_ioctl(cmd, argp);
+		res = -ENOIOCTLCMD;
 		break;
 	}
 	release_sock(sk);
@@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = {
 	.owner = THIS_MODULE,
 };
 
-static struct proto_ops ax25_proto_ops = {
+static const struct proto_ops ax25_proto_ops = {
 	.family = PF_AX25,
 	.owner = THIS_MODULE,
 	.release = ax25_release,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index ea616e3fc98e..fb031fe9be9e 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
 		timeo = schedule_timeout(timeo);
 		lock_sock(sk);
 
-		if (sk->sk_err) {
-			err = sock_error(sk);
+		err = sock_error(sk);
+		if (err)
 			break;
-		}
 	}
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(sk->sk_sleep, &wait);
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 9778c6acd53b..ccbaf69afc5b 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return 0;
 }
 
-static struct proto_ops bnep_sock_ops = {
+static const struct proto_ops bnep_sock_ops = {
 	.family = PF_BLUETOOTH,
 	.owner = THIS_MODULE,
 	.release = bnep_sock_release,
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index beb045bf5714..5e22343b6090 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return -EINVAL;
 }
 
-static struct proto_ops cmtp_sock_ops = {
+static const struct proto_ops cmtp_sock_ops = {
 	.family = PF_BLUETOOTH,
 	.owner = THIS_MODULE,
 	.release = cmtp_sock_release,
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1d6d0a15c099..84e6c93a044a 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char
 	return 0;
 }
 
-static struct proto_ops hci_sock_ops = {
+static const struct proto_ops hci_sock_ops = {
 	.family = PF_BLUETOOTH,
 	.owner = THIS_MODULE,
 	.release = hci_sock_release,
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index f8986f881431..8f8dd931b294 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return -EINVAL;
 }
 
-static struct proto_ops hidp_sock_ops = {
+static const struct proto_ops hidp_sock_ops = {
 	.family = PF_BLUETOOTH,
 	.owner = THIS_MODULE,
 	.release = hidp_sock_release,
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index e3bb11ca4235..7f0781e4326f 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -57,7 +57,7 @@
 
 #define VERSION "2.8"
 
-static struct proto_ops l2cap_sock_ops;
+static const struct proto_ops l2cap_sock_ops;
 
 static struct bt_sock_list l2cap_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
-	if (sk->sk_err)
-		return sock_error(sk);
+	err = sock_error(sk);
+	if (err)
+		return err;
 
 	if (msg->msg_flags & MSG_OOB)
 		return -EOPNOTSUPP;
@@ -2160,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
 
-static struct proto_ops l2cap_sock_ops = {
+static const struct proto_ops l2cap_sock_ops = {
 	.family = PF_BLUETOOTH,
 	.owner = THIS_MODULE,
 	.release = l2cap_sock_release,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 6c34261b232e..757d2dd3b02f 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -58,7 +58,7 @@
 #define BT_DBG(D...)
 #endif
 
-static struct proto_ops rfcomm_sock_ops;
+static const struct proto_ops rfcomm_sock_ops;
 
 static struct bt_sock_list rfcomm_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
 
-static struct proto_ops rfcomm_sock_ops = {
+static const struct proto_ops rfcomm_sock_ops = {
 	.family = PF_BLUETOOTH,
 	.owner = THIS_MODULE,
 	.release = rfcomm_sock_release,
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 9cb00dc6c08c..6b61323ce23c 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -56,7 +56,7 @@
 
 #define VERSION "0.5"
 
-static struct proto_ops sco_sock_ops;
+static const struct proto_ops sco_sock_ops;
 
 static struct bt_sock_list sco_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
-	if (sk->sk_err)
-		return sock_error(sk);
+	err = sock_error(sk);
+	if (err)
+		return err;
 
 	if (msg->msg_flags & MSG_OOB)
 		return -EOPNOTSUPP;
@@ -913,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
 
-static struct proto_ops sco_sock_ops = {
+static const struct proto_ops sco_sock_ops = {
 	.family = PF_BLUETOOTH,
 	.owner = THIS_MODULE,
 	.release = sco_sock_release,
diff --git a/net/bridge/br.c b/net/bridge/br.c
index f8f184942aaf..188cc1ac49eb 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook);
 module_init(br_init)
 module_exit(br_deinit)
 MODULE_LICENSE("GPL");
+MODULE_VERSION(BR_VERSION);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f564ee99782d..0b33a7b3a00c 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -15,7 +15,9 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
-#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+
 #include <asm/uaccess.h>
 #include "br_private.h"
 
@@ -82,6 +84,87 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+/* Allow setting mac address of pseudo-bridge to be same as
+ * any of the bound interfaces
+ */
+static int br_set_mac_address(struct net_device *dev, void *p)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct sockaddr *addr = p;
+	struct net_bridge_port *port;
+	int err = -EADDRNOTAVAIL;
+
+	spin_lock_bh(&br->lock);
+	list_for_each_entry(port, &br->port_list, list) {
+		if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) {
+			br_stp_change_bridge_id(br, addr->sa_data);
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock_bh(&br->lock);
+
+	return err;
+}
+
+static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, "bridge");
+	strcpy(info->version, BR_VERSION);
+	strcpy(info->fw_version, "N/A");
+	strcpy(info->bus_info, "N/A");
+}
+
+static int br_set_sg(struct net_device *dev, u32 data)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	if (data)
+		br->feature_mask |= NETIF_F_SG;
+	else
+		br->feature_mask &= ~NETIF_F_SG;
+
+	br_features_recompute(br);
+	return 0;
+}
+
+static int br_set_tso(struct net_device *dev, u32 data)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	if (data)
+		br->feature_mask |= NETIF_F_TSO;
+	else
+		br->feature_mask &= ~NETIF_F_TSO;
+
+	br_features_recompute(br);
+	return 0;
+}
+
+static int br_set_tx_csum(struct net_device *dev, u32 data)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	if (data)
+		br->feature_mask |= NETIF_F_IP_CSUM;
+	else
+		br->feature_mask &= ~NETIF_F_IP_CSUM;
+
+	br_features_recompute(br);
+	return 0;
+}
+
+static struct ethtool_ops br_ethtool_ops = {
+	.get_drvinfo = br_getinfo,
+	.get_link = ethtool_op_get_link,
+	.get_sg = ethtool_op_get_sg,
+	.set_sg = br_set_sg,
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = br_set_tx_csum,
+	.get_tso = ethtool_op_get_tso,
+	.set_tso = br_set_tso,
+};
+
 void br_dev_setup(struct net_device *dev)
 {
 	memset(dev->dev_addr, 0, ETH_ALEN);
@@ -96,8 +179,12 @@ void br_dev_setup(struct net_device *dev)
 	dev->change_mtu = br_change_mtu;
 	dev->destructor = free_netdev;
 	SET_MODULE_OWNER(dev);
+	SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
 	dev->stop = br_dev_stop;
 	dev->tx_queue_len = 0;
-	dev->set_mac_address = NULL;
+	dev->set_mac_address = br_set_mac_address;
 	dev->priv_flags = IFF_EBRIDGE;
+
+	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
+		| NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 975abe254b7a..11321197338e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -32,9 +32,8 @@
32 * ethtool, use ethtool_ops. Also, since driver might sleep need to 32 * ethtool, use ethtool_ops. Also, since driver might sleep need to
33 * not be holding any locks. 33 * not be holding any locks.
34 */ 34 */
35static int br_initial_port_cost(struct net_device *dev) 35static int port_cost(struct net_device *dev)
36{ 36{
37
38 struct ethtool_cmd ecmd = { ETHTOOL_GSET }; 37 struct ethtool_cmd ecmd = { ETHTOOL_GSET };
39 struct ifreq ifr; 38 struct ifreq ifr;
40 mm_segment_t old_fs; 39 mm_segment_t old_fs;
@@ -58,10 +57,6 @@ static int br_initial_port_cost(struct net_device *dev)
 			return 2;
 		case SPEED_10:
 			return 100;
-		default:
-			pr_info("bridge: can't decode speed from %s: %d\n",
-				dev->name, ecmd.speed);
-			return 100;
 		}
 	}
 
@@ -75,6 +70,35 @@ static int br_initial_port_cost(struct net_device *dev)
 	return 100;	/* assume old 10Mbps */
 }
 
+
+/*
+ * Check for port carrier transistions.
+ * Called from work queue to allow for calling functions that
+ * might sleep (such as speed check), and to debounce.
+ */
+static void port_carrier_check(void *arg)
+{
+	struct net_bridge_port *p = arg;
+
+	rtnl_lock();
+	if (netif_carrier_ok(p->dev)) {
+		u32 cost = port_cost(p->dev);
+
+		spin_lock_bh(&p->br->lock);
+		if (p->state == BR_STATE_DISABLED) {
+			p->path_cost = cost;
+			br_stp_enable_port(p);
+		}
+		spin_unlock_bh(&p->br->lock);
+	} else {
+		spin_lock_bh(&p->br->lock);
+		if (p->state != BR_STATE_DISABLED)
+			br_stp_disable_port(p);
+		spin_unlock_bh(&p->br->lock);
+	}
+	rtnl_unlock();
+}
+
 static void destroy_nbp(struct net_bridge_port *p)
 {
 	struct net_device *dev = p->dev;
@@ -102,6 +126,9 @@ static void del_nbp(struct net_bridge_port *p)
 	dev->br_port = NULL;
 	dev_set_promiscuity(dev, -1);
 
+	cancel_delayed_work(&p->carrier_check);
+	flush_scheduled_work();
+
 	spin_lock_bh(&br->lock);
 	br_stp_disable_port(p);
 	spin_unlock_bh(&br->lock);
@@ -155,6 +182,7 @@ static struct net_device *new_bridge_dev(const char *name)
 	br->bridge_id.prio[1] = 0x00;
 	memset(br->bridge_id.addr, 0, ETH_ALEN);
 
+	br->feature_mask = dev->features;
 	br->stp_enabled = 0;
 	br->designated_root = br->bridge_id;
 	br->root_path_cost = 0;
@@ -195,10 +223,9 @@ static int find_portno(struct net_bridge *br)
 	return (index >= BR_MAX_PORTS) ? -EXFULL : index;
 }
 
-/* called with RTNL */
+/* called with RTNL but without bridge lock */
 static struct net_bridge_port *new_nbp(struct net_bridge *br,
-				       struct net_device *dev,
-				       unsigned long cost)
+				       struct net_device *dev)
 {
 	int index;
 	struct net_bridge_port *p;
@@ -215,12 +242,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	p->br = br;
 	dev_hold(dev);
 	p->dev = dev;
-	p->path_cost = cost;
+	p->path_cost = port_cost(dev);
 	p->priority = 0x8000 >> BR_PORT_BITS;
 	dev->br_port = p;
 	p->port_no = index;
 	br_init_port(p);
 	p->state = BR_STATE_DISABLED;
+	INIT_WORK(&p->carrier_check, port_carrier_check, p);
 	kobject_init(&p->kobj);
 
 	return p;
@@ -322,9 +350,8 @@ void br_features_recompute(struct net_bridge *br)
 	struct net_bridge_port *p;
 	unsigned long features, checksum;
 
-	features = NETIF_F_SG | NETIF_F_FRAGLIST
-		| NETIF_F_HIGHDMA | NETIF_F_TSO;
-	checksum = NETIF_F_IP_CSUM;	/* least commmon subset */
+	features = br->feature_mask &~ NETIF_F_IP_CSUM;
+	checksum = br->feature_mask & NETIF_F_IP_CSUM;
 
 	list_for_each_entry(p, &br->port_list, list) {
 		if (!(p->dev->features
@@ -351,7 +378,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (dev->br_port != NULL)
 		return -EBUSY;
 
-	if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev))))
+	if (IS_ERR(p = new_nbp(br, dev)))
 		return PTR_ERR(p);
 
 	if ((err = br_fdb_insert(br, p, dev->dev_addr)))
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index b88220a64cd8..c387852f753a 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	/* insert into forwarding database after filtering to avoid spoofing */
 	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
 
+	if (p->state == BR_STATE_LEARNING) {
+		kfree_skb(skb);
+		goto out;
+	}
+
 	if (br->dev->flags & IFF_PROMISC) {
 		struct sk_buff *skb2;
 
@@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
 		goto err;
 
-	if (p->state == BR_STATE_LEARNING)
-		br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
-
 	if (p->br->stp_enabled &&
 	    !memcmp(dest, bridge_ula, 5) &&
 	    !(dest[5] & 0xF0)) {
@@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
 				NULL, br_stp_handle_bpdu);
 			return 1;
 		}
+		goto err;
 	}
 
-	else if (p->state == BR_STATE_FORWARDING) {
+	if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) {
 		if (br_should_route_hook) {
 			if (br_should_route_hook(pskb))
 				return 0;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 23422bd53a5e..223f8270daee 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -26,6 +26,7 @@
 #include <linux/ip.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/netfilter_bridge.h>
@@ -33,8 +34,11 @@
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_arp.h>
 #include <linux/in_route.h>
+
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/route.h>
+
 #include <asm/uaccess.h>
 #include <asm/checksum.h>
 #include "br_private.h"
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 917311c6828b..a43a9c1d50d7 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 		br_stp_recalculate_bridge_id(br);
 		break;
 
-	case NETDEV_CHANGE:	/* device is up but carrier changed */
-		if (!(br->dev->flags & IFF_UP))
-			break;
-
-		if (netif_carrier_ok(dev)) {
-			if (p->state == BR_STATE_DISABLED)
-				br_stp_enable_port(p);
-		} else {
-			if (p->state != BR_STATE_DISABLED)
-				br_stp_disable_port(p);
-		}
+	case NETDEV_CHANGE:
+		if (br->dev->flags & IFF_UP)
+			schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE);
 		break;
 
 	case NETDEV_FEAT_CHANGE:
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index bdf95a74d8cd..c5bd631ffcd5 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -27,6 +27,10 @@
 #define BR_PORT_BITS	10
 #define BR_MAX_PORTS	(1<<BR_PORT_BITS)
 
+#define BR_PORT_DEBOUNCE (HZ/10)
+
+#define BR_VERSION	"2.1"
+
 typedef struct bridge_id bridge_id;
 typedef struct mac_addr mac_addr;
 typedef __u16 port_id;
@@ -78,6 +82,7 @@ struct net_bridge_port
 	struct timer_list		hold_timer;
 	struct timer_list		message_age_timer;
 	struct kobject			kobj;
+	struct work_struct		carrier_check;
 	struct rcu_head			rcu;
 };
 
@@ -90,6 +95,7 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 	struct list_head		age_list;
+	unsigned long			feature_mask;
 
 	/* STP */
 	bridge_id			designated_root;
@@ -201,6 +207,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br);
 extern void br_stp_enable_port(struct net_bridge_port *p);
 extern void br_stp_disable_port(struct net_bridge_port *p);
 extern void br_stp_recalculate_bridge_id(struct net_bridge *br);
+extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
 extern void br_stp_set_bridge_priority(struct net_bridge *br,
 				       u16 newprio);
 extern void br_stp_set_port_priority(struct net_bridge_port *p,
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index ac09b6a23523..cc047f7fb6ef 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p)
 }
 
 /* called under bridge lock */
-static void br_stp_change_bridge_id(struct net_bridge *br,
-				    const unsigned char *addr)
+void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
 {
 	unsigned char oldaddr[6];
 	struct net_bridge_port *p;
@@ -158,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br)
 
 	list_for_each_entry(p, &br->port_list, list) {
 		if (addr == br_mac_zero ||
-		    compare_ether_addr(p->dev->dev_addr, addr) < 0)
+		    memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
 			addr = p->dev->dev_addr;
 
 	}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index c70b3be23026..b84fc6075fe1 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config BRIDGE_EBT_ULOG
-	tristate "ebt: ulog support"
+	tristate "ebt: ulog support (OBSOLETE)"
 	depends on BRIDGE_NF_EBTABLES
 	help
+	  This option enables the old bridge-specific "ebt_ulog" implementation
+	  which has been obsoleted by the new "nfnetlink_log" code (see
+	  CONFIG_NETFILTER_NETLINK_LOG).
+
 	  This option adds the ulog watcher, that you can use in any rule
 	  in any ebtables table. The packet is passed to a userspace
 	  logging daemon using netlink multicast sockets. This differs
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 662975be3d1d..9f6e0193ae10 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -3,13 +3,16 @@
 *
 * Authors:
 * Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
 *
 *  April, 2002
 *
 */
 
+#include <linux/in.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_log.h>
+#include <linux/netfilter.h>
 #include <linux/module.h>
 #include <linux/ip.h>
 #include <linux/if_arp.h>
@@ -55,27 +58,30 @@ static void print_MAC(unsigned char *p)
 }
 
 #define myNIPQUAD(a) a[0], a[1], a[2], a[3]
-static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static void
+ebt_log_packet(unsigned int pf, unsigned int hooknum,
+   const struct sk_buff *skb, const struct net_device *in,
+   const struct net_device *out, const struct nf_loginfo *loginfo,
+   const char *prefix)
 {
-	struct ebt_log_info *info = (struct ebt_log_info *)data;
-	char level_string[4] = "< >";
+	unsigned int bitmask;
 
-	level_string[1] = '0' + info->loglevel;
 	spin_lock_bh(&ebt_log_lock);
-	printk(level_string);
-	printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "",
-	   out ? out->name : "");
+	printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level,
+	       prefix, in ? in->name : "", out ? out->name : "");
 
-	printk("MAC source = ");
 	print_MAC(eth_hdr(skb)->h_source);
 	printk("MAC dest = ");
 	print_MAC(eth_hdr(skb)->h_dest);
 
 	printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto));
 
-	if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
+	if (loginfo->type == NF_LOG_TYPE_LOG)
+		bitmask = loginfo->u.log.logflags;
+	else
+		bitmask = NF_LOG_MASK;
+
+	if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
 	   htons(ETH_P_IP)){
 		struct iphdr _iph, *ih;
 
@@ -84,10 +90,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 			printk(" INCOMPLETE IP header");
 			goto out;
 		}
-		printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
-		   NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
-		printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
-		   ih->protocol);
+		printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP "
+		       "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr),
+		       NIPQUAD(ih->daddr), ih->tos, ih->protocol);
 		if (ih->protocol == IPPROTO_TCP ||
 		    ih->protocol == IPPROTO_UDP) {
 			struct tcpudphdr _ports, *pptr;
@@ -104,7 +109,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 			goto out;
 		}
 
-	if ((info->bitmask & EBT_LOG_ARP) &&
+	if ((bitmask & EBT_LOG_ARP) &&
 	    ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) ||
 	     (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) {
 		struct arphdr _arph, *ah;
@@ -144,6 +149,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 out:
 	printk("\n");
 	spin_unlock_bh(&ebt_log_lock);
+
+}
+
+static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
+   const struct net_device *in, const struct net_device *out,
+   const void *data, unsigned int datalen)
+{
+	struct ebt_log_info *info = (struct ebt_log_info *)data;
+	struct nf_loginfo li;
+
+	li.type = NF_LOG_TYPE_LOG;
+	li.u.log.level = info->loglevel;
+	li.u.log.logflags = info->bitmask;
+
+	nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix);
 }
 
 static struct ebt_watcher log =
@@ -154,13 +174,32 @@ static struct ebt_watcher log =
 	.me = THIS_MODULE,
 };
 
+static struct nf_logger ebt_log_logger = {
+	.name = "ebt_log",
+	.logfn = &ebt_log_packet,
+	.me = THIS_MODULE,
+};
+
 static int __init init(void)
 {
-	return ebt_register_watcher(&log);
+	int ret;
+
+	ret = ebt_register_watcher(&log);
+	if (ret < 0)
+		return ret;
+	if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) {
+		printk(KERN_WARNING "ebt_log: not logging via system console "
+		       "since somebody else already registered for PF_INET\n");
+		/* we cannot make module load fail here, since otherwise
+		 * ebtables userspace would abort */
+	}
+
+	return 0;
 }
 
 static void __exit fini(void)
 {
+	nf_log_unregister_logger(&ebt_log_logger);
 	ebt_unregister_watcher(&log);
 }
 
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index aae26ae2e61f..ce617b3dbbb8 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -3,6 +3,7 @@
 *
 * Authors:
 * Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
 *
 *  November, 2004
 *
@@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
 	return skb;
 }
 
-static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
+   const struct net_device *in, const struct net_device *out,
+   const struct ebt_ulog_info *uloginfo, const char *prefix)
 {
 	ebt_ulog_packet_msg_t *pm;
 	size_t size, copy_len;
 	struct nlmsghdr *nlh;
-	struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
 	unsigned int group = uloginfo->nlgroup;
 	ebt_ulog_buff_t *ub = &ulog_buffers[group];
 	spinlock_t *lock = &ub->lock;
@@ -216,6 +216,39 @@ alloc_failure:
 	goto unlock;
 }
 
+/* this function is registered with the netfilter core */
+static void ebt_log_packet(unsigned int pf, unsigned int hooknum,
+   const struct sk_buff *skb, const struct net_device *in,
+   const struct net_device *out, const struct nf_loginfo *li,
+   const char *prefix)
+{
+	struct ebt_ulog_info loginfo;
+
+	if (!li || li->type != NF_LOG_TYPE_ULOG) {
+		loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP;
+		loginfo.cprange = 0;
+		loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD;
+		loginfo.prefix[0] = '\0';
+	} else {
+		loginfo.nlgroup = li->u.ulog.group;
+		loginfo.cprange = li->u.ulog.copy_len;
+		loginfo.qthreshold = li->u.ulog.qthreshold;
+		strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+	}
+
+	ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+}
+
+static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
+   const struct net_device *in, const struct net_device *out,
+   const void *data, unsigned int datalen)
+{
+	struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
+
+	ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
+}
+
+
 static int ebt_ulog_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
@@ -240,6 +273,12 @@ static struct ebt_watcher ulog = {
 	.me = THIS_MODULE,
 };
 
+static struct nf_logger ebt_ulog_logger = {
+	.name = EBT_ULOG_WATCHER,
+	.logfn = &ebt_log_packet,
+	.me = THIS_MODULE,
+};
+
 static int __init init(void)
 {
 	int i, ret = 0;
@@ -265,6 +304,13 @@ static int __init init(void)
 	else if ((ret = ebt_register_watcher(&ulog)))
 		sock_release(ebtulognl->sk_socket);
 
+	if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) {
+		printk(KERN_WARNING "ebt_ulog: not logging via ulog "
+		       "since somebody else already registered for PF_BRIDGE\n");
+		/* we cannot make module load fail here, since otherwise
+		 * ebtables userspace would abort */
+	}
+
 	return ret;
 }
 
270 316
@@ -273,6 +319,7 @@ static void __exit fini(void)
273 ebt_ulog_buff_t *ub; 319 ebt_ulog_buff_t *ub;
274 int i; 320 int i;
275 321
322 nf_log_unregister_logger(&ebt_ulog_logger);
276 ebt_unregister_watcher(&ulog); 323 ebt_unregister_watcher(&ulog);
277 for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { 324 for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
278 ub = &ulog_buffers[i]; 325 ub = &ulog_buffers[i];
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1bcfef51ac58..f8d322e1ea92 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -47,6 +47,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/poll.h>
 #include <linux/highmem.h>
+#include <linux/spinlock.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -200,6 +201,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 }
 
 /**
+ *	skb_kill_datagram - Free a datagram skbuff forcibly
+ *	@sk: socket
+ *	@skb: datagram skbuff
+ *	@flags: MSG_ flags
+ *
+ *	This function frees a datagram skbuff that was received by
+ *	skb_recv_datagram.  The flags argument must match the one
+ *	used for skb_recv_datagram.
+ *
+ *	If the MSG_PEEK flag is set, and the packet is still on the
+ *	receive queue of the socket, it will be taken off the queue
+ *	before it is freed.
+ *
+ *	This function currently only disables BH when acquiring the
+ *	sk_receive_queue lock.  Therefore it must not be used in a
+ *	context where that lock is acquired in an IRQ context.
+ */
+
+void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+	if (flags & MSG_PEEK) {
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		if (skb == skb_peek(&sk->sk_receive_queue)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			atomic_dec(&skb->users);
+		}
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+	}
+
+	kfree_skb(skb);
+}
+
+EXPORT_SYMBOL(skb_kill_datagram);
+
+/**
  *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from
diff --git a/net/core/dev.c b/net/core/dev.c
index a5efc9ae010b..29ba109d3e54 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3276,7 +3276,6 @@ EXPORT_SYMBOL(dev_close);
 EXPORT_SYMBOL(dev_get_by_flags);
 EXPORT_SYMBOL(dev_get_by_index);
 EXPORT_SYMBOL(dev_get_by_name);
-EXPORT_SYMBOL(dev_ioctl);
 EXPORT_SYMBOL(dev_open);
 EXPORT_SYMBOL(dev_queue_xmit);
 EXPORT_SYMBOL(dev_remove_pack);
diff --git a/net/core/filter.c b/net/core/filter.c
index 3a10e0bc90e8..8964d3445588 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -13,6 +13,7 @@
  * 2 of the License, or (at your option) any later version.
  *
  * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in sk_chk_filter()
  */
 
 #include <linux/module.h>
@@ -250,7 +251,7 @@ load_b:
 			mem[fentry->k] = X;
 			continue;
 		default:
-			/* Invalid instruction counts as RET */
+			WARN_ON(1);
 			return 0;
 		}
 
@@ -283,8 +284,8 @@ load_b:
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
- * no references or jumps that are out of range, no illegal instructions
- * and no backward jumps. It must end with a RET instruction
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
 *
 * Returns 0 if the rule set is legal or a negative errno code if not.
 */
@@ -300,38 +301,85 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 	for (pc = 0; pc < flen; pc++) {
 		/* all jumps are forward as they are not signed */
 		ftest = &filter[pc];
-		if (BPF_CLASS(ftest->code) == BPF_JMP) {
-			/* but they mustn't jump off the end */
-			if (BPF_OP(ftest->code) == BPF_JA) {
-				/*
-				 * Note, the large ftest->k might cause loops.
-				 * Compare this with conditional jumps below,
-				 * where offsets are limited. --ANK (981016)
-				 */
-				if (ftest->k >= (unsigned)(flen-pc-1))
-					return -EINVAL;
-			} else {
-				/* for conditionals both must be safe */
-				if (pc + ftest->jt +1 >= flen ||
-				    pc + ftest->jf +1 >= flen)
-					return -EINVAL;
-			}
-		}
 
-		/* check for division by zero  -Kris Katterjohn 2005-10-30 */
-		if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0)
-			return -EINVAL;
+		/* Only allow valid instructions */
+		switch (ftest->code) {
+		case BPF_ALU|BPF_ADD|BPF_K:
+		case BPF_ALU|BPF_ADD|BPF_X:
+		case BPF_ALU|BPF_SUB|BPF_K:
+		case BPF_ALU|BPF_SUB|BPF_X:
+		case BPF_ALU|BPF_MUL|BPF_K:
+		case BPF_ALU|BPF_MUL|BPF_X:
+		case BPF_ALU|BPF_DIV|BPF_X:
+		case BPF_ALU|BPF_AND|BPF_K:
+		case BPF_ALU|BPF_AND|BPF_X:
+		case BPF_ALU|BPF_OR|BPF_K:
+		case BPF_ALU|BPF_OR|BPF_X:
+		case BPF_ALU|BPF_LSH|BPF_K:
+		case BPF_ALU|BPF_LSH|BPF_X:
+		case BPF_ALU|BPF_RSH|BPF_K:
+		case BPF_ALU|BPF_RSH|BPF_X:
+		case BPF_ALU|BPF_NEG:
+		case BPF_LD|BPF_W|BPF_ABS:
+		case BPF_LD|BPF_H|BPF_ABS:
+		case BPF_LD|BPF_B|BPF_ABS:
+		case BPF_LD|BPF_W|BPF_LEN:
+		case BPF_LD|BPF_W|BPF_IND:
+		case BPF_LD|BPF_H|BPF_IND:
+		case BPF_LD|BPF_B|BPF_IND:
+		case BPF_LD|BPF_IMM:
+		case BPF_LDX|BPF_W|BPF_LEN:
+		case BPF_LDX|BPF_B|BPF_MSH:
+		case BPF_LDX|BPF_IMM:
+		case BPF_MISC|BPF_TAX:
+		case BPF_MISC|BPF_TXA:
+		case BPF_RET|BPF_K:
+		case BPF_RET|BPF_A:
+			break;
+
+		/* Some instructions need special checks */
 
-		/* check that memory operations use valid addresses. */
-		if (ftest->k >= BPF_MEMWORDS) {
-			/* but it might not be a memory operation... */
-			switch (ftest->code) {
-			case BPF_ST:
-			case BPF_STX:
-			case BPF_LD|BPF_MEM:
-			case BPF_LDX|BPF_MEM:
+		case BPF_ALU|BPF_DIV|BPF_K:
+			/* check for division by zero */
+			if (ftest->k == 0)
 				return -EINVAL;
-			}
+			break;
+
+		case BPF_LD|BPF_MEM:
+		case BPF_LDX|BPF_MEM:
+		case BPF_ST:
+		case BPF_STX:
+			/* check for invalid memory addresses */
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			break;
+
+		case BPF_JMP|BPF_JA:
+			/*
+			 * Note, the large ftest->k might cause loops.
+			 * Compare this with conditional jumps below,
+			 * where offsets are limited. --ANK (981016)
+			 */
+			if (ftest->k >= (unsigned)(flen-pc-1))
+				return -EINVAL;
+			break;
+
+		case BPF_JMP|BPF_JEQ|BPF_K:
+		case BPF_JMP|BPF_JEQ|BPF_X:
+		case BPF_JMP|BPF_JGE|BPF_K:
+		case BPF_JMP|BPF_JGE|BPF_X:
+		case BPF_JMP|BPF_JGT|BPF_K:
+		case BPF_JMP|BPF_JGT|BPF_X:
+		case BPF_JMP|BPF_JSET|BPF_K:
+		case BPF_JMP|BPF_JSET|BPF_X:
+			/* for conditionals both must be safe */
+			if (pc + ftest->jt + 1 >= flen ||
+			    pc + ftest->jf + 1 >= flen)
+				return -EINVAL;
+			break;
+
+		default:
+			return -EINVAL;
 		}
 	}
 
diff --git a/net/core/flow.c b/net/core/flow.c
index 7e95b39de9fd..c4f25385029f 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -23,6 +23,7 @@
23#include <net/flow.h> 23#include <net/flow.h>
24#include <asm/atomic.h> 24#include <asm/atomic.h>
25#include <asm/semaphore.h> 25#include <asm/semaphore.h>
26#include <linux/security.h>
26 27
27struct flow_cache_entry { 28struct flow_cache_entry {
28 struct flow_cache_entry *next; 29 struct flow_cache_entry *next;
@@ -30,6 +31,7 @@ struct flow_cache_entry {
 	u8 dir;
 	struct flowi key;
 	u32 genid;
+	u32 sk_sid;
 	void *object;
 	atomic_t *object_ref;
 };
@@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
+void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
 			flow_resolve_t resolver)
 {
 	struct flow_cache_entry *fle, **head;
@@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
+		    fle->sk_sid == sk_sid &&
 		    flow_key_compare(key, &fle->key) == 0) {
 			if (fle->genid == atomic_read(&flow_cache_genid)) {
 				void *ret = fle->object;
@@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
+			fle->sk_sid = sk_sid;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->object = NULL;
 			flow_count(cpu)++;
@@ -221,7 +225,7 @@ nocache:
 		void *obj;
 		atomic_t *obj_ref;
 
-		resolver(key, family, dir, &obj, &obj_ref);
+		resolver(key, sk_sid, family, dir, &obj, &obj_ref);
 
 		if (fle) {
 			fle->genid = atomic_read(&flow_cache_genid);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 49424a42a2c0..281a632fa6a6 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -13,6 +13,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/string.h>
+#include <linux/if_arp.h>
 #include <linux/inetdevice.h>
 #include <linux/inet.h>
 #include <linux/interrupt.h>
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 7fc3e9e28c34..06cad2d63e8a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -487,9 +487,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]);
 
 /* Module parameters, defaults. */
 static int pg_count_d = 1000; /* 1000 pkts by default */
-static int pg_delay_d = 0;
-static int pg_clone_skb_d = 0;
-static int debug = 0;
+static int pg_delay_d;
+static int pg_clone_skb_d;
+static int debug;
 
 static DECLARE_MUTEX(pktgen_sem);
 static struct pktgen_thread *pktgen_threads = NULL;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83fee37de38e..070f91cfde59 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 			    int fclone)
 {
+	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
 
 	/* Get the HEAD */
-	if (fclone)
-		skb = kmem_cache_alloc(skbuff_fclone_cache,
-				       gfp_mask & ~__GFP_DMA);
-	else
-		skb = kmem_cache_alloc(skbuff_head_cache,
-				       gfp_mask & ~__GFP_DMA);
-
+	skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache,
+			       gfp_mask & ~__GFP_DMA);
 	if (!skb)
 		goto out;
 
@@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	skb->data = data;
 	skb->tail = data;
 	skb->end  = data + size;
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	atomic_set(&shinfo->dataref, 1);
+	shinfo->nr_frags = 0;
+	shinfo->tso_size = 0;
+	shinfo->tso_segs = 0;
+	shinfo->ufo_size = 0;
+	shinfo->ip6_frag_id = 0;
+	shinfo->frag_list = NULL;
+
 	if (fclone) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
171 177
172 child->fclone = SKB_FCLONE_UNAVAILABLE; 178 child->fclone = SKB_FCLONE_UNAVAILABLE;
173 } 179 }
174 atomic_set(&(skb_shinfo(skb)->dataref), 1);
175 skb_shinfo(skb)->nr_frags = 0;
176 skb_shinfo(skb)->tso_size = 0;
177 skb_shinfo(skb)->tso_segs = 0;
178 skb_shinfo(skb)->frag_list = NULL;
179 skb_shinfo(skb)->ufo_size = 0;
180 skb_shinfo(skb)->ip6_frag_id = 0;
181out: 180out:
182 return skb; 181 return skb;
183nodata: 182nodata:
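
The __alloc_skb rework does two things: it folds the fclone branch into a single kmem_cache_alloc call with a ternary cache argument, and it initializes skb_shared_info field by field through one cached pointer, in declaration order, right after the head fields are set, so the stores stream sequentially into the freshly allocated memory. A standalone sketch of the initialization pattern (the struct is a trimmed stand-in for that era's skb_shared_info):

    #include <stddef.h>
    #include <stdatomic.h>

    /* trimmed stand-in for the relevant skb_shared_info fields */
    struct shared_info {
            atomic_int      dataref;
            unsigned short  nr_frags;
            unsigned short  tso_size;
            unsigned short  tso_segs;
            unsigned short  ufo_size;
            unsigned int    ip6_frag_id;
            void            *frag_list;
    };

    /* write the fields in layout order through one pointer; the
     * compiler can keep `si` in a register and the stores hit the
     * just-allocated area sequentially */
    static void shinfo_init(struct shared_info *si)
    {
            atomic_store(&si->dataref, 1);
            si->nr_frags = 0;
            si->tso_size = 0;
            si->tso_segs = 0;
            si->ufo_size = 0;
            si->ip6_frag_id = 0;
            si->frag_list = NULL;
    }
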
diff --git a/net/core/sock.c b/net/core/sock.c
index 13cc3be4f056..6465b0e4c8cb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab)
1488 } 1488 }
1489 } 1489 }
1490 1490
1491 if (prot->twsk_obj_size) { 1491 if (prot->twsk_prot != NULL) {
1492 static const char mask[] = "tw_sock_%s"; 1492 static const char mask[] = "tw_sock_%s";
1493 1493
1494 timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); 1494 timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
@@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab)
1497 goto out_free_request_sock_slab; 1497 goto out_free_request_sock_slab;
1498 1498
1499 sprintf(timewait_sock_slab_name, mask, prot->name); 1499 sprintf(timewait_sock_slab_name, mask, prot->name);
1500 prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, 1500 prot->twsk_prot->twsk_slab =
1501 prot->twsk_obj_size, 1501 kmem_cache_create(timewait_sock_slab_name,
1502 0, SLAB_HWCACHE_ALIGN, 1502 prot->twsk_prot->twsk_obj_size,
1503 NULL, NULL); 1503 0, SLAB_HWCACHE_ALIGN,
1504 if (prot->twsk_slab == NULL) 1504 NULL, NULL);
1505 if (prot->twsk_prot->twsk_slab == NULL)
1505 goto out_free_timewait_sock_slab_name; 1506 goto out_free_timewait_sock_slab_name;
1506 } 1507 }
1507 } 1508 }
@@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot)
1548 prot->rsk_prot->slab = NULL; 1549 prot->rsk_prot->slab = NULL;
1549 } 1550 }
1550 1551
1551 if (prot->twsk_slab != NULL) { 1552 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1552 const char *name = kmem_cache_name(prot->twsk_slab); 1553 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1553 1554
1554 kmem_cache_destroy(prot->twsk_slab); 1555 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1555 kfree(name); 1556 kfree(name);
1556 prot->twsk_slab = NULL; 1557 prot->twsk_prot->twsk_slab = NULL;
1557 } 1558 }
1558} 1559}
1559 1560
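
proto_register/proto_unregister now reach the timewait object size through prot->twsk_prot, a pointer to a struct timewait_sock_ops, instead of a bare twsk_obj_size member, and both functions tolerate protocols that carry no TIME-WAIT state by checking the pointer for NULL first. A simplified standalone sketch of the indirection (types reduced to what the hunk exercises):

    #include <stdio.h>
    #include <stddef.h>

    struct timewait_sock_ops {
            void            *twsk_slab;     /* a kmem_cache in the kernel */
            unsigned int    twsk_obj_size;
    };

    struct proto {
            const char               *name;
            struct timewait_sock_ops *twsk_prot;   /* NULL: no TIME-WAIT socks */
    };

    static void register_twsk_slab(struct proto *prot)
    {
            if (prot->twsk_prot == NULL)
                    return;         /* protocol has no timewait sockets */
            /* the kernel calls kmem_cache_create("tw_sock_%s", ...) here */
            printf("tw_sock_%s: obj size %u\n",
                   prot->name, prot->twsk_prot->twsk_obj_size);
    }
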
diff --git a/net/core/stream.c b/net/core/stream.c
index 15bfd03e8024..35e25259fd95 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
55 int done; 55 int done;
56 56
57 do { 57 do {
58 if (sk->sk_err) 58 int err = sock_error(sk);
59 return sock_error(sk); 59 if (err)
60 return err;
60 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) 61 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
61 return -EPIPE; 62 return -EPIPE;
62 if (!*timeo_p) 63 if (!*timeo_p)
@@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
67 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 68 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
68 sk->sk_write_pending++; 69 sk->sk_write_pending++;
69 done = sk_wait_event(sk, timeo_p, 70 done = sk_wait_event(sk, timeo_p,
71 !sk->sk_err &&
70 !((1 << sk->sk_state) & 72 !((1 << sk->sk_state) &
71 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); 73 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
72 finish_wait(sk->sk_sleep, &wait); 74 finish_wait(sk->sk_sleep, &wait);
@@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
137 139
138 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 140 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
139 sk->sk_write_pending++; 141 sk->sk_write_pending++;
140 sk_wait_event(sk, &current_timeo, sk_stream_memory_free(sk) && 142 sk_wait_event(sk, &current_timeo, !sk->sk_err &&
143 !(sk->sk_shutdown & SEND_SHUTDOWN) &&
144 sk_stream_memory_free(sk) &&
141 vm_wait); 145 vm_wait);
142 sk->sk_write_pending--; 146 sk->sk_write_pending--;
143 147
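
The sk_stream_wait_connect fix closes a small race: the old code tested sk->sk_err and then called sock_error(), which reads and clears the error with an xchg, so if another path consumed the error between the test and the call, the function returned 0 and the caller looped without reporting anything. Sampling the error exactly once removes the window; the extra !sk->sk_err and SEND_SHUTDOWN terms in the sk_wait_event conditions make the sleepers wake as soon as an error or shutdown arrives. A standalone sketch of the race, assuming sock_error()'s read-and-clear semantics:

    #include <stdatomic.h>

    static atomic_int sk_err;       /* stand-in for sk->sk_err */

    /* read-and-clear, like the kernel's sock_error() xchg */
    static int sock_error_sketch(void)
    {
            return atomic_exchange(&sk_err, 0);
    }

    static int wait_connect_check(void)
    {
            /*
             * Racy form:
             *      if (atomic_load(&sk_err))
             *              return sock_error_sketch();
             * another thread may consume the error between the load
             * and the exchange, returning 0 despite the test firing.
             */
            int err = sock_error_sketch();  /* sample exactly once */

            return err;     /* nonzero: report it; zero: keep waiting */
    }
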
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 344a8da153fc..87b27fff6e3b 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,3 +1,7 @@
1obj-$(CONFIG_IPV6) += dccp_ipv6.o
2
3dccp_ipv6-y := ipv6.o
4
1obj-$(CONFIG_IP_DCCP) += dccp.o 5obj-$(CONFIG_IP_DCCP) += dccp.o
2 6
3dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ 7dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index c9a62cca22fc..ce9cb77c5c29 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
55 from = av->dccpav_buf + av->dccpav_buf_head; 55 from = av->dccpav_buf + av->dccpav_buf_head;
56 56
57 /* Check if buf_head wraps */ 57 /* Check if buf_head wraps */
58 if (av->dccpav_buf_head + len > av->dccpav_vec_len) { 58 if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) {
59 const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head); 59 const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head;
60 60
61 memcpy(to, from, tailsize); 61 memcpy(to, from, tailsize);
62 to += tailsize; 62 to += tailsize;
@@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
93struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len, 93struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len,
94 const gfp_t priority) 94 const gfp_t priority)
95{ 95{
96 struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority); 96 struct dccp_ackvec *av;
97 97
98 BUG_ON(len == 0);
99
100 if (len > DCCP_MAX_ACKVEC_LEN)
101 return NULL;
102
103 av = kmalloc(sizeof(*av) + len, priority);
98 if (av != NULL) { 104 if (av != NULL) {
99 av->dccpav_buf_len = len; 105 av->dccpav_buf_len = len;
100 av->dccpav_buf_head = 106 av->dccpav_buf_head =
@@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
117} 123}
118 124
119static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, 125static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
120 const unsigned int index) 126 const u8 index)
121{ 127{
122 return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK; 128 return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK;
123} 129}
124 130
125static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, 131static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
126 const unsigned int index) 132 const u8 index)
127{ 133{
128 return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK; 134 return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK;
129} 135}
@@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
135 */ 141 */
136static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, 142static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
137 const unsigned int packets, 143 const unsigned int packets,
138 const unsigned char state) 144 const unsigned char state)
139{ 145{
140 unsigned int gap; 146 unsigned int gap;
141 signed long new_head; 147 signed long new_head;
@@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
223 * could reduce the complexity of this scan.) 229 * could reduce the complexity of this scan.)
224 */ 230 */
225 u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno); 231 u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno);
226 unsigned int index = av->dccpav_buf_head; 232 u8 index = av->dccpav_buf_head;
227 233
228 while (1) { 234 while (1) {
229 const u8 len = dccp_ackvec_len(av, index); 235 const u8 len = dccp_ackvec_len(av, index);
@@ -291,7 +297,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av)
291} 297}
292#endif 298#endif
293 299
294static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) 300static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av)
295{ 301{
296 /* 302 /*
297 * As we're keeping track of the ack vector size (dccpav_vec_len) and 303 * As we're keeping track of the ack vector size (dccpav_vec_len) and
@@ -301,9 +307,10 @@ static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av)
301 * draft-ietf-dccp-spec-11.txt Appendix A. -acme 307 * draft-ietf-dccp-spec-11.txt Appendix A. -acme
302 */ 308 */
303#if 0 309#if 0
304 av->dccpav_buf_tail = av->dccpav_ack_ptr + 1; 310 u32 new_buf_tail = av->dccpav_ack_ptr + 1;
305 if (av->dccpav_buf_tail >= av->dccpav_vec_len) 311 if (new_buf_tail >= av->dccpav_vec_len)
306 av->dccpav_buf_tail -= av->dccpav_vec_len; 312 new_buf_tail -= av->dccpav_vec_len;
313 av->dccpav_buf_tail = new_buf_tail;
307#endif 314#endif
308 av->dccpav_vec_len -= av->dccpav_sent_len; 315 av->dccpav_vec_len -= av->dccpav_sent_len;
309} 316}
@@ -326,7 +333,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
326 debug_prefix, 1, 333 debug_prefix, 1,
327 (unsigned long long)av->dccpav_ack_seqno, 334 (unsigned long long)av->dccpav_ack_seqno,
328 (unsigned long long)av->dccpav_ack_ackno); 335 (unsigned long long)av->dccpav_ack_ackno);
329 dccp_ackvec_trow_away_ack_record(av); 336 dccp_ackvec_throw_away_ack_record(av);
330 av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1; 337 av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
331 } 338 }
332} 339}
@@ -389,7 +396,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
389 av->dccpav_ack_seqno, 396 av->dccpav_ack_seqno,
390 (unsigned long long) 397 (unsigned long long)
391 av->dccpav_ack_ackno); 398 av->dccpav_ack_ackno);
392 dccp_ackvec_trow_away_ack_record(av); 399 dccp_ackvec_throw_away_ack_record(av);
393 } 400 }
394 /* 401 /*
395 * If dccpav_ack_seqno was not received, no problem 402 * If dccpav_ack_seqno was not received, no problem
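
The ackvec.c changes tighten the circular-buffer arithmetic: the copy in dccp_insert_option_ackvec splits in two when the live region wraps past the end of the buffer, and dccp_ackvec_alloc now rejects len == 0 and anything above DCCP_MAX_ACKVEC_LEN, which is what makes the narrower u8 indices in the next hunk safe. A standalone sketch of the wrap-splitting copy (generic names, same shape as the hunk):

    #include <stdint.h>
    #include <string.h>

    /* copy `len` bytes starting at offset `head` out of a circular
     * buffer of `buf_len` bytes, splitting when the region wraps */
    static void circ_copy(uint8_t *to, const uint8_t *buf,
                          unsigned int buf_len, unsigned int head,
                          unsigned int len)
    {
            if (head + len > buf_len) {
                    const unsigned int tail = buf_len - head;

                    memcpy(to, buf + head, tail);       /* up to the end */
                    memcpy(to + tail, buf, len - tail); /* wrapped part */
            } else {
                    memcpy(to, buf + head, len);
            }
    }
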
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index d0fd6c60c574..f7dfb5f67b87 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -54,16 +54,16 @@
54 * @dccpav_buf - circular buffer of acknowledgeable packets 54 * @dccpav_buf - circular buffer of acknowledgeable packets
55 */ 55 */
56struct dccp_ackvec { 56struct dccp_ackvec {
57 unsigned int dccpav_buf_head;
58 unsigned int dccpav_buf_tail;
59 u64 dccpav_buf_ackno; 57 u64 dccpav_buf_ackno;
60 u64 dccpav_ack_seqno; 58 u64 dccpav_ack_seqno;
61 u64 dccpav_ack_ackno; 59 u64 dccpav_ack_ackno;
62 unsigned int dccpav_ack_ptr;
63 unsigned int dccpav_sent_len;
64 unsigned int dccpav_vec_len;
65 unsigned int dccpav_buf_len;
66 struct timeval dccpav_time; 60 struct timeval dccpav_time;
61 u8 dccpav_buf_head;
62 u8 dccpav_buf_tail;
63 u8 dccpav_ack_ptr;
64 u8 dccpav_sent_len;
65 u8 dccpav_vec_len;
66 u8 dccpav_buf_len;
67 u8 dccpav_buf_nonce; 67 u8 dccpav_buf_nonce;
68 u8 dccpav_ack_nonce; 68 u8 dccpav_ack_nonce;
69 u8 dccpav_buf[0]; 69 u8 dccpav_buf[0];
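
The ackvec.h hunk shrinks the bookkeeping fields from unsigned int to u8 and groups them after the 64-bit members; every one of them indexes into a buffer bounded by DCCP_MAX_ACKVEC_LEN, so a byte suffices, and grouping the narrow fields eliminates the internal padding the old interleaved layout forced. A standalone before/after sketch of the padding effect:

    #include <stdint.h>
    #include <stdio.h>

    struct before {                 /* wide fields interleaved */
            unsigned int    head, tail;
            uint64_t        ackno;
            unsigned int    ptr, sent, vec, buf;
            uint8_t         nonce;
    };

    struct after {                  /* u64 first, bytes grouped last */
            uint64_t        ackno;
            uint8_t         head, tail, ptr, sent, vec, buf, nonce;
    };

    int main(void)
    {
            /* typically 40 vs 16 bytes on LP64; exact numbers are
             * ABI-dependent, the point is the padding disappears */
            printf("%zu %zu\n", sizeof(struct before), sizeof(struct after));
            return 0;
    }
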
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index c37eeeaf5c6e..de681c6ad081 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -21,6 +21,8 @@
21 21
22#define CCID_MAX 255 22#define CCID_MAX 255
23 23
24struct tcp_info;
25
24struct ccid { 26struct ccid {
25 unsigned char ccid_id; 27 unsigned char ccid_id;
26 const char *ccid_name; 28 const char *ccid_name;
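
The ccid.h hunk adds a forward declaration rather than an include: struct tcp_info is only ever named through a pointer in the ccid operation signatures, and C allows pointers to incomplete types, so declaring the tag is enough and keeps the header dependency-free. A minimal illustration (the ops struct name is hypothetical):

    struct tcp_info;        /* incomplete type: tag only */

    /* fine: the header only passes pointers, never dereferences */
    struct ccid_ops_sketch {
            void (*get_info)(struct tcp_info *info);
    };
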
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index f97b85d55ad8..93f26dd6e6cb 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
59 59
60#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ 60#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
61 61
62extern struct proto dccp_v4_prot; 62extern struct proto dccp_prot;
63 63
64/* is seq1 < seq2 ? */ 64/* is seq1 < seq2 ? */
65static inline int before48(const u64 seq1, const u64 seq2) 65static inline int before48(const u64 seq1, const u64 seq2)
@@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
228extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 228extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
229 const struct dccp_hdr *dh, const unsigned len); 229 const struct dccp_hdr *dh, const unsigned len);
230 230
231extern int dccp_v4_init_sock(struct sock *sk);
232extern int dccp_v4_destroy_sock(struct sock *sk);
233
231extern void dccp_close(struct sock *sk, long timeout); 234extern void dccp_close(struct sock *sk, long timeout);
232extern struct sk_buff *dccp_make_response(struct sock *sk, 235extern struct sk_buff *dccp_make_response(struct sock *sk,
233 struct dst_entry *dst, 236 struct dst_entry *dst,
@@ -238,6 +241,7 @@ extern struct sk_buff *dccp_make_reset(struct sock *sk,
238 241
239extern int dccp_connect(struct sock *sk); 242extern int dccp_connect(struct sock *sk);
240extern int dccp_disconnect(struct sock *sk, int flags); 243extern int dccp_disconnect(struct sock *sk, int flags);
244extern void dccp_unhash(struct sock *sk);
241extern int dccp_getsockopt(struct sock *sk, int level, int optname, 245extern int dccp_getsockopt(struct sock *sk, int level, int optname,
242 char __user *optval, int __user *optlen); 246 char __user *optval, int __user *optlen);
243extern int dccp_setsockopt(struct sock *sk, int level, int optname, 247extern int dccp_setsockopt(struct sock *sk, int level, int optname,
@@ -249,6 +253,13 @@ extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
249 struct msghdr *msg, size_t len, int nonblock, 253 struct msghdr *msg, size_t len, int nonblock,
250 int flags, int *addr_len); 254 int flags, int *addr_len);
251extern void dccp_shutdown(struct sock *sk, int how); 255extern void dccp_shutdown(struct sock *sk, int how);
256extern int inet_dccp_listen(struct socket *sock, int backlog);
257extern unsigned int dccp_poll(struct file *file, struct socket *sock,
258 poll_table *wait);
259extern void dccp_v4_send_check(struct sock *sk, int len,
260 struct sk_buff *skb);
261extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
262 int addr_len);
252 263
253extern int dccp_v4_checksum(const struct sk_buff *skb, 264extern int dccp_v4_checksum(const struct sk_buff *skb,
254 const u32 saddr, const u32 daddr); 265 const u32 saddr, const u32 daddr);
@@ -256,6 +267,17 @@ extern int dccp_v4_checksum(const struct sk_buff *skb,
256extern int dccp_v4_send_reset(struct sock *sk, 267extern int dccp_v4_send_reset(struct sock *sk,
257 enum dccp_reset_codes code); 268 enum dccp_reset_codes code);
258extern void dccp_send_close(struct sock *sk, const int active); 269extern void dccp_send_close(struct sock *sk, const int active);
270extern int dccp_invalid_packet(struct sk_buff *skb);
271
272static inline int dccp_bad_service_code(const struct sock *sk,
273 const __u32 service)
274{
275 const struct dccp_sock *dp = dccp_sk(sk);
276
277 if (dp->dccps_service == service)
278 return 0;
279 return !dccp_list_has_service(dp->dccps_service_list, service);
280}
259 281
260struct dccp_skb_cb { 282struct dccp_skb_cb {
261 __u8 dccpd_type:4; 283 __u8 dccpd_type:4;
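
Besides renaming dccp_v4_prot to the now family-neutral dccp_prot, the dccp.h hunk hoists dccp_bad_service_code() into the header as an inline so both the IPv4 and the new IPv6 conn_request paths can share it: the check passes (returns 0) when the requested service code equals dccps_service, and otherwise rejects unless the code appears in the socket's service list. Both callers use it the same way; this fragment mirrors the conn_request hunks:

    const __u32 service = dccp_hdr_request(skb)->dccph_req_service;

    if (dccp_bad_service_code(sk, service)) {
            reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
            goto drop;
    }
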
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index f675d8e642d3..3f78c00e3822 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
28 info->tcpi_retransmits = icsk->icsk_retransmits; 28 info->tcpi_retransmits = icsk->icsk_retransmits;
29 info->tcpi_probes = icsk->icsk_probes_out; 29 info->tcpi_probes = icsk->icsk_probes_out;
30 info->tcpi_backoff = icsk->icsk_backoff; 30 info->tcpi_backoff = icsk->icsk_backoff;
31 info->tcpi_pmtu = dp->dccps_pmtu_cookie; 31 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
32 32
33 if (dp->dccps_options.dccpo_send_ack_vector) 33 if (dp->dccps_options.dccpo_send_ack_vector)
34 info->tcpi_options |= TCPI_OPT_SACK; 34 info->tcpi_options |= TCPI_OPT_SACK;
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 3454d5941900..b6cba72b44e8 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
151 return 0; 151 return 0;
152} 152}
153 153
154int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 154static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
155 const struct dccp_hdr *dh, const unsigned len) 155 const struct dccp_hdr *dh,
156 const unsigned len)
156{ 157{
157 struct dccp_sock *dp = dccp_sk(sk); 158 struct dccp_sock *dp = dccp_sk(sk);
158 159
159 if (dccp_check_seqno(sk, skb))
160 goto discard;
161
162 if (dccp_parse_options(sk, skb))
163 goto discard;
164
165 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
166 dccp_event_ack_recv(sk, skb);
167
168 if (dp->dccps_options.dccpo_send_ack_vector &&
169 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
170 DCCP_SKB_CB(skb)->dccpd_seq,
171 DCCP_ACKVEC_STATE_RECEIVED))
172 goto discard;
173
174 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
175 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
176
177 switch (dccp_hdr(skb)->dccph_type) { 160 switch (dccp_hdr(skb)->dccph_type) {
178 case DCCP_PKT_DATAACK: 161 case DCCP_PKT_DATAACK:
179 case DCCP_PKT_DATA: 162 case DCCP_PKT_DATA:
@@ -250,6 +233,37 @@ discard:
250 return 0; 233 return 0;
251} 234}
252 235
236int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
237 const struct dccp_hdr *dh, const unsigned len)
238{
239 struct dccp_sock *dp = dccp_sk(sk);
240
241 if (dccp_check_seqno(sk, skb))
242 goto discard;
243
244 if (dccp_parse_options(sk, skb))
245 goto discard;
246
247 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
248 dccp_event_ack_recv(sk, skb);
249
250 if (dp->dccps_options.dccpo_send_ack_vector &&
251 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
252 DCCP_SKB_CB(skb)->dccpd_seq,
253 DCCP_ACKVEC_STATE_RECEIVED))
254 goto discard;
255
256 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
257 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
258
259 return __dccp_rcv_established(sk, skb, dh, len);
260discard:
261 __kfree_skb(skb);
262 return 0;
263}
264
265EXPORT_SYMBOL_GPL(dccp_rcv_established);
266
253static int dccp_rcv_request_sent_state_process(struct sock *sk, 267static int dccp_rcv_request_sent_state_process(struct sock *sk,
254 struct sk_buff *skb, 268 struct sk_buff *skb,
255 const struct dccp_hdr *dh, 269 const struct dccp_hdr *dh,
@@ -286,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
286 goto out_invalid_packet; 300 goto out_invalid_packet;
287 } 301 }
288 302
303 if (dp->dccps_options.dccpo_send_ack_vector &&
304 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
305 DCCP_SKB_CB(skb)->dccpd_seq,
306 DCCP_ACKVEC_STATE_RECEIVED))
307 goto out_invalid_packet; /* FIXME: change error code */
308
289 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; 309 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
290 dccp_update_gsr(sk, dp->dccps_isr); 310 dccp_update_gsr(sk, dp->dccps_isr);
291 /* 311 /*
@@ -309,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
309 goto out_invalid_packet; 329 goto out_invalid_packet;
310 } 330 }
311 331
312 dccp_sync_mss(sk, dp->dccps_pmtu_cookie); 332 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
313 333
314 /* 334 /*
315 * Step 10: Process REQUEST state (second part) 335 * Step 10: Process REQUEST state (second part)
@@ -329,7 +349,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
329 dccp_set_state(sk, DCCP_PARTOPEN); 349 dccp_set_state(sk, DCCP_PARTOPEN);
330 350
331 /* Make sure socket is routed, for correct metrics. */ 351 /* Make sure socket is routed, for correct metrics. */
332 inet_sk_rebuild_header(sk); 352 icsk->icsk_af_ops->rebuild_header(sk);
333 353
334 if (!sock_flag(sk, SOCK_DEAD)) { 354 if (!sock_flag(sk, SOCK_DEAD)) {
335 sk->sk_state_change(sk); 355 sk->sk_state_change(sk);
@@ -398,9 +418,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
398 418
399 if (dh->dccph_type == DCCP_PKT_DATAACK || 419 if (dh->dccph_type == DCCP_PKT_DATAACK ||
400 dh->dccph_type == DCCP_PKT_DATA) { 420 dh->dccph_type == DCCP_PKT_DATA) {
401 dccp_rcv_established(sk, skb, dh, len); 421 __dccp_rcv_established(sk, skb, dh, len);
402 queued = 1; /* packet was queued 422 queued = 1; /* packet was queued
403 (by dccp_rcv_established) */ 423 (by __dccp_rcv_established) */
404 } 424 }
405 break; 425 break;
406 } 426 }
@@ -444,7 +464,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
444 */ 464 */
445 if (sk->sk_state == DCCP_LISTEN) { 465 if (sk->sk_state == DCCP_LISTEN) {
446 if (dh->dccph_type == DCCP_PKT_REQUEST) { 466 if (dh->dccph_type == DCCP_PKT_REQUEST) {
447 if (dccp_v4_conn_request(sk, skb) < 0) 467 if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
468 skb) < 0)
448 return 1; 469 return 1;
449 470
450 /* FIXME: do congestion control initialization */ 471 /* FIXME: do congestion control initialization */
@@ -471,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
471 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 492 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
472 dccp_event_ack_recv(sk, skb); 493 dccp_event_ack_recv(sk, skb);
473 494
474 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
475 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
476
477 if (dp->dccps_options.dccpo_send_ack_vector && 495 if (dp->dccps_options.dccpo_send_ack_vector &&
478 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, 496 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
479 DCCP_SKB_CB(skb)->dccpd_seq, 497 DCCP_SKB_CB(skb)->dccpd_seq,
480 DCCP_ACKVEC_STATE_RECEIVED)) 498 DCCP_ACKVEC_STATE_RECEIVED))
481 goto discard; 499 goto discard;
500
501 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
502 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
482 } 503 }
483 504
484 /* 505 /*
@@ -566,3 +587,5 @@ discard:
566 } 587 }
567 return 0; 588 return 0;
568} 589}
590
591EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
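
The input.c refactor splits the established-state receive into a wrapper plus a core so the PARTOPEN/RESPOND path can queue DATA and DATAACK packets through __dccp_rcv_established() without repeating the sequence-number and option validation the fast path performs; the hunks also move the ack-vector add ahead of the CCID packet_recv calls, so a failed add discards the packet before the CCIDs ever see it. The shape of the split, sketched with the names from the hunks (kernel context assumed, not standalone):

    /* core: only the per-packet-type handling */
    static int __rcv_established(struct sock *sk, struct sk_buff *skb,
                                 const struct dccp_hdr *dh,
                                 const unsigned len);

    /* wrapper: validation and bookkeeping, then the shared core;
     * callers already past validation call the core directly */
    int rcv_established(struct sock *sk, struct sk_buff *skb,
                        const struct dccp_hdr *dh, const unsigned len)
    {
            if (dccp_check_seqno(sk, skb) || dccp_parse_options(sk, skb))
                    goto discard;
            /* ack event, ack-vector add, CCID rx/tx hooks ... */
            return __rcv_established(sk, skb, dh, len);
    discard:
            __kfree_skb(skb);
            return 0;
    }
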
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 656e13e38cfb..3f244670764a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -19,7 +19,9 @@
19 19
20#include <net/icmp.h> 20#include <net/icmp.h>
21#include <net/inet_hashtables.h> 21#include <net/inet_hashtables.h>
22#include <net/inet_sock.h>
22#include <net/sock.h> 23#include <net/sock.h>
24#include <net/timewait_sock.h>
23#include <net/tcp_states.h> 25#include <net/tcp_states.h>
24#include <net/xfrm.h> 26#include <net/xfrm.h>
25 27
@@ -37,7 +39,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
37 39
38static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) 40static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
39{ 41{
40 return inet_csk_get_port(&dccp_hashinfo, sk, snum); 42 return inet_csk_get_port(&dccp_hashinfo, sk, snum,
43 inet_csk_bind_conflict);
41} 44}
42 45
43static void dccp_v4_hash(struct sock *sk) 46static void dccp_v4_hash(struct sock *sk)
@@ -45,171 +48,14 @@ static void dccp_v4_hash(struct sock *sk)
45 inet_hash(&dccp_hashinfo, sk); 48 inet_hash(&dccp_hashinfo, sk);
46} 49}
47 50
48static void dccp_v4_unhash(struct sock *sk) 51void dccp_unhash(struct sock *sk)
49{ 52{
50 inet_unhash(&dccp_hashinfo, sk); 53 inet_unhash(&dccp_hashinfo, sk);
51} 54}
52 55
53/* called with local bh disabled */ 56EXPORT_SYMBOL_GPL(dccp_unhash);
54static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
55 struct inet_timewait_sock **twp)
56{
57 struct inet_sock *inet = inet_sk(sk);
58 const u32 daddr = inet->rcv_saddr;
59 const u32 saddr = inet->daddr;
60 const int dif = sk->sk_bound_dev_if;
61 INET_ADDR_COOKIE(acookie, saddr, daddr)
62 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
63 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
64 struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
65 const struct sock *sk2;
66 const struct hlist_node *node;
67 struct inet_timewait_sock *tw;
68
69 prefetch(head->chain.first);
70 write_lock(&head->lock);
71
72 /* Check TIME-WAIT sockets first. */
73 sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
74 tw = inet_twsk(sk2);
75
76 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
77 goto not_unique;
78 }
79 tw = NULL;
80
81 /* And established part... */
82 sk_for_each(sk2, node, &head->chain) {
83 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
84 goto not_unique;
85 }
86 57
87 /* Must record num and sport now. Otherwise we will see 58int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
88 * in hash table socket with a funny identity. */
89 inet->num = lport;
90 inet->sport = htons(lport);
91 sk->sk_hash = hash;
92 BUG_TRAP(sk_unhashed(sk));
93 __sk_add_node(sk, &head->chain);
94 sock_prot_inc_use(sk->sk_prot);
95 write_unlock(&head->lock);
96
97 if (twp != NULL) {
98 *twp = tw;
99 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
100 } else if (tw != NULL) {
101 /* Silly. Should hash-dance instead... */
102 inet_twsk_deschedule(tw, &dccp_death_row);
103 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
104
105 inet_twsk_put(tw);
106 }
107
108 return 0;
109
110not_unique:
111 write_unlock(&head->lock);
112 return -EADDRNOTAVAIL;
113}
114
115/*
116 * Bind a port for a connect operation and hash it.
117 */
118static int dccp_v4_hash_connect(struct sock *sk)
119{
120 const unsigned short snum = inet_sk(sk)->num;
121 struct inet_bind_hashbucket *head;
122 struct inet_bind_bucket *tb;
123 int ret;
124
125 if (snum == 0) {
126 int low = sysctl_local_port_range[0];
127 int high = sysctl_local_port_range[1];
128 int remaining = (high - low) + 1;
129 int rover = net_random() % (high - low) + low;
130 struct hlist_node *node;
131 struct inet_timewait_sock *tw = NULL;
132
133 local_bh_disable();
134 do {
135 head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
136 dccp_hashinfo.bhash_size)];
137 spin_lock(&head->lock);
138
139 /* Does not bother with rcv_saddr checks,
140 * because the established check is already
141 * unique enough.
142 */
143 inet_bind_bucket_for_each(tb, node, &head->chain) {
144 if (tb->port == rover) {
145 BUG_TRAP(!hlist_empty(&tb->owners));
146 if (tb->fastreuse >= 0)
147 goto next_port;
148 if (!__dccp_v4_check_established(sk,
149 rover,
150 &tw))
151 goto ok;
152 goto next_port;
153 }
154 }
155
156 tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
157 head, rover);
158 if (tb == NULL) {
159 spin_unlock(&head->lock);
160 break;
161 }
162 tb->fastreuse = -1;
163 goto ok;
164
165 next_port:
166 spin_unlock(&head->lock);
167 if (++rover > high)
168 rover = low;
169 } while (--remaining > 0);
170
171 local_bh_enable();
172
173 return -EADDRNOTAVAIL;
174
175ok:
176 /* All locks still held and bhs disabled */
177 inet_bind_hash(sk, tb, rover);
178 if (sk_unhashed(sk)) {
179 inet_sk(sk)->sport = htons(rover);
180 __inet_hash(&dccp_hashinfo, sk, 0);
181 }
182 spin_unlock(&head->lock);
183
184 if (tw != NULL) {
185 inet_twsk_deschedule(tw, &dccp_death_row);
186 inet_twsk_put(tw);
187 }
188
189 ret = 0;
190 goto out;
191 }
192
193 head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
194 dccp_hashinfo.bhash_size)];
195 tb = inet_csk(sk)->icsk_bind_hash;
196 spin_lock_bh(&head->lock);
197 if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
198 __inet_hash(&dccp_hashinfo, sk, 0);
199 spin_unlock_bh(&head->lock);
200 return 0;
201 } else {
202 spin_unlock(&head->lock);
203 /* No definite answer... Walk to established hash table */
204 ret = __dccp_v4_check_established(sk, snum, NULL);
205out:
206 local_bh_enable();
207 return ret;
208 }
209}
210
211static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
212 int addr_len)
213{ 59{
214 struct inet_sock *inet = inet_sk(sk); 60 struct inet_sock *inet = inet_sk(sk);
215 struct dccp_sock *dp = dccp_sk(sk); 61 struct dccp_sock *dp = dccp_sk(sk);
@@ -259,9 +105,9 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
259 inet->dport = usin->sin_port; 105 inet->dport = usin->sin_port;
260 inet->daddr = daddr; 106 inet->daddr = daddr;
261 107
262 dp->dccps_ext_header_len = 0; 108 inet_csk(sk)->icsk_ext_hdr_len = 0;
263 if (inet->opt != NULL) 109 if (inet->opt != NULL)
264 dp->dccps_ext_header_len = inet->opt->optlen; 110 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
265 /* 111 /*
266 * Socket identity is still unknown (sport may be zero). 112 * Socket identity is still unknown (sport may be zero).
267 * However we set state to DCCP_REQUESTING and not releasing socket 113 * However we set state to DCCP_REQUESTING and not releasing socket
@@ -269,7 +115,7 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
269 * complete initialization after this. 115 * complete initialization after this.
270 */ 116 */
271 dccp_set_state(sk, DCCP_REQUESTING); 117 dccp_set_state(sk, DCCP_REQUESTING);
272 err = dccp_v4_hash_connect(sk); 118 err = inet_hash_connect(&dccp_death_row, sk);
273 if (err != 0) 119 if (err != 0)
274 goto failure; 120 goto failure;
275 121
@@ -287,16 +133,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
287 usin->sin_port); 133 usin->sin_port);
288 dccp_update_gss(sk, dp->dccps_iss); 134 dccp_update_gss(sk, dp->dccps_iss);
289 135
290 /*
291 * SWL and AWL are initially adjusted so that they are not less than
292 * the initial Sequence Numbers received and sent, respectively:
293 * SWL := max(GSR + 1 - floor(W/4), ISR),
294 * AWL := max(GSS - W' + 1, ISS).
295 * These adjustments MUST be applied only at the beginning of the
296 * connection.
297 */
298 dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
299
300 inet->id = dp->dccps_iss ^ jiffies; 136 inet->id = dp->dccps_iss ^ jiffies;
301 137
302 err = dccp_connect(sk); 138 err = dccp_connect(sk);
@@ -316,6 +152,8 @@ failure:
316 goto out; 152 goto out;
317} 153}
318 154
155EXPORT_SYMBOL_GPL(dccp_v4_connect);
156
319/* 157/*
320 * This routine does path mtu discovery as defined in RFC1191. 158 * This routine does path mtu discovery as defined in RFC1191.
321 */ 159 */
@@ -354,7 +192,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
354 mtu = dst_mtu(dst); 192 mtu = dst_mtu(dst);
355 193
356 if (inet->pmtudisc != IP_PMTUDISC_DONT && 194 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
357 dp->dccps_pmtu_cookie > mtu) { 195 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
358 dccp_sync_mss(sk, mtu); 196 dccp_sync_mss(sk, mtu);
359 197
360 /* 198 /*
@@ -606,6 +444,17 @@ out:
606 sock_put(sk); 444 sock_put(sk);
607} 445}
608 446
447/* This routine computes an IPv4 DCCP checksum. */
448void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
449{
450 const struct inet_sock *inet = inet_sk(sk);
451 struct dccp_hdr *dh = dccp_hdr(skb);
452
453 dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr);
454}
455
456EXPORT_SYMBOL_GPL(dccp_v4_send_check);
457
609int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) 458int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
610{ 459{
611 struct sk_buff *skb; 460 struct sk_buff *skb;
@@ -641,16 +490,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk,
641 dccp_hdr(skb)->dccph_sport); 490 dccp_hdr(skb)->dccph_sport);
642} 491}
643 492
644static inline int dccp_bad_service_code(const struct sock *sk,
645 const __u32 service)
646{
647 const struct dccp_sock *dp = dccp_sk(sk);
648
649 if (dp->dccps_service == service)
650 return 0;
651 return !dccp_list_has_service(dp->dccps_service_list, service);
652}
653
654int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 493int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
655{ 494{
656 struct inet_request_sock *ireq; 495 struct inet_request_sock *ireq;
@@ -662,7 +501,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
662 const __u32 service = dccp_hdr_request(skb)->dccph_req_service; 501 const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
663 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 502 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
664 __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; 503 __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
665 struct dst_entry *dst = NULL;
666 504
667 /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ 505 /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
668 if (((struct rtable *)skb->dst)->rt_flags & 506 if (((struct rtable *)skb->dst)->rt_flags &
@@ -703,7 +541,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
703 ireq = inet_rsk(req); 541 ireq = inet_rsk(req);
704 ireq->loc_addr = daddr; 542 ireq->loc_addr = daddr;
705 ireq->rmt_addr = saddr; 543 ireq->rmt_addr = saddr;
706 /* FIXME: Merge Aristeu's option parsing code when ready */
707 req->rcv_wnd = 100; /* Fake, option parsing will get the 544 req->rcv_wnd = 100; /* Fake, option parsing will get the
708 right value */ 545 right value */
709 ireq->opt = NULL; 546 ireq->opt = NULL;
@@ -721,23 +558,22 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
721 dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); 558 dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
722 dreq->dreq_service = service; 559 dreq->dreq_service = service;
723 560
724 if (dccp_v4_send_response(sk, req, dst)) 561 if (dccp_v4_send_response(sk, req, NULL))
725 goto drop_and_free; 562 goto drop_and_free;
726 563
727 inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); 564 inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
728 return 0; 565 return 0;
729 566
730drop_and_free: 567drop_and_free:
731 /* 568 reqsk_free(req);
732 * FIXME: should be reqsk_free after implementing req->rsk_ops
733 */
734 __reqsk_free(req);
735drop: 569drop:
736 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); 570 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
737 dcb->dccpd_reset_code = reset_code; 571 dcb->dccpd_reset_code = reset_code;
738 return -1; 572 return -1;
739} 573}
740 574
575EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
576
741/* 577/*
742 * The three way handshake has completed - we got a valid ACK or DATAACK - 578 * The three way handshake has completed - we got a valid ACK or DATAACK -
743 * now create the new socket. 579 * now create the new socket.
@@ -792,6 +628,8 @@ exit:
792 return NULL; 628 return NULL;
793} 629}
794 630
631EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
632
795static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) 633static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
796{ 634{
797 const struct dccp_hdr *dh = dccp_hdr(skb); 635 const struct dccp_hdr *dh = dccp_hdr(skb);
@@ -1011,7 +849,9 @@ discard:
1011 return 0; 849 return 0;
1012} 850}
1013 851
1014static inline int dccp_invalid_packet(struct sk_buff *skb) 852EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
853
854int dccp_invalid_packet(struct sk_buff *skb)
1015{ 855{
1016 const struct dccp_hdr *dh; 856 const struct dccp_hdr *dh;
1017 857
@@ -1065,29 +905,30 @@ static inline int dccp_invalid_packet(struct sk_buff *skb)
1065 return 1; 905 return 1;
1066 } 906 }
1067 907
1068 /* If the header checksum is incorrect, drop packet and return */
1069 if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
1070 skb->nh.iph->daddr) < 0) {
1071 LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
1072 "incorrect\n");
1073 return 1;
1074 }
1075
1076 return 0; 908 return 0;
1077} 909}
1078 910
911EXPORT_SYMBOL_GPL(dccp_invalid_packet);
912
1079/* this is called when real data arrives */ 913/* this is called when real data arrives */
1080int dccp_v4_rcv(struct sk_buff *skb) 914int dccp_v4_rcv(struct sk_buff *skb)
1081{ 915{
1082 const struct dccp_hdr *dh; 916 const struct dccp_hdr *dh;
1083 struct sock *sk; 917 struct sock *sk;
1084 int rc;
1085 918
1086 /* Step 1: Check header basics: */ 919 /* Step 1: Check header basics: */
1087 920
1088 if (dccp_invalid_packet(skb)) 921 if (dccp_invalid_packet(skb))
1089 goto discard_it; 922 goto discard_it;
1090 923
924 /* If the header checksum is incorrect, drop packet and return */
925 if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
926 skb->nh.iph->daddr) < 0) {
927 LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n",
928 __FUNCTION__);
929 goto discard_it;
930 }
931
1091 dh = dccp_hdr(skb); 932 dh = dccp_hdr(skb);
1092 933
1093 DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); 934 DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
@@ -1143,28 +984,10 @@ int dccp_v4_rcv(struct sk_buff *skb)
1143 goto do_time_wait; 984 goto do_time_wait;
1144 } 985 }
1145 986
1146 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 987 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1147 dccp_pr_debug("xfrm4_policy_check failed\n");
1148 goto discard_and_relse; 988 goto discard_and_relse;
1149 }
1150
1151 if (sk_filter(sk, skb, 0)) {
1152 dccp_pr_debug("sk_filter failed\n");
1153 goto discard_and_relse;
1154 }
1155
1156 skb->dev = NULL;
1157
1158 bh_lock_sock(sk);
1159 rc = 0;
1160 if (!sock_owned_by_user(sk))
1161 rc = dccp_v4_do_rcv(sk, skb);
1162 else
1163 sk_add_backlog(sk, skb);
1164 bh_unlock_sock(sk);
1165 989
1166 sock_put(sk); 990 return sk_receive_skb(sk, skb);
1167 return rc;
1168 991
1169no_dccp_socket: 992no_dccp_socket:
1170 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 993 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -1194,9 +1017,23 @@ do_time_wait:
1194 goto no_dccp_socket; 1017 goto no_dccp_socket;
1195} 1018}
1196 1019
1197static int dccp_v4_init_sock(struct sock *sk) 1020struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
1021 .queue_xmit = ip_queue_xmit,
1022 .send_check = dccp_v4_send_check,
1023 .rebuild_header = inet_sk_rebuild_header,
1024 .conn_request = dccp_v4_conn_request,
1025 .syn_recv_sock = dccp_v4_request_recv_sock,
1026 .net_header_len = sizeof(struct iphdr),
1027 .setsockopt = ip_setsockopt,
1028 .getsockopt = ip_getsockopt,
1029 .addr2sockaddr = inet_csk_addr2sockaddr,
1030 .sockaddr_len = sizeof(struct sockaddr_in),
1031};
1032
1033int dccp_v4_init_sock(struct sock *sk)
1198{ 1034{
1199 struct dccp_sock *dp = dccp_sk(sk); 1035 struct dccp_sock *dp = dccp_sk(sk);
1036 struct inet_connection_sock *icsk = inet_csk(sk);
1200 static int dccp_ctl_socket_init = 1; 1037 static int dccp_ctl_socket_init = 1;
1201 1038
1202 dccp_options_init(&dp->dccps_options); 1039 dccp_options_init(&dp->dccps_options);
@@ -1236,9 +1073,11 @@ static int dccp_v4_init_sock(struct sock *sk)
1236 dccp_ctl_socket_init = 0; 1073 dccp_ctl_socket_init = 0;
1237 1074
1238 dccp_init_xmit_timers(sk); 1075 dccp_init_xmit_timers(sk);
1239 inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; 1076 icsk->icsk_rto = DCCP_TIMEOUT_INIT;
1240 sk->sk_state = DCCP_CLOSED; 1077 sk->sk_state = DCCP_CLOSED;
1241 sk->sk_write_space = dccp_write_space; 1078 sk->sk_write_space = dccp_write_space;
1079 icsk->icsk_af_ops = &dccp_ipv4_af_ops;
1080 icsk->icsk_sync_mss = dccp_sync_mss;
1242 dp->dccps_mss_cache = 536; 1081 dp->dccps_mss_cache = 536;
1243 dp->dccps_role = DCCP_ROLE_UNDEFINED; 1082 dp->dccps_role = DCCP_ROLE_UNDEFINED;
1244 dp->dccps_service = DCCP_SERVICE_INVALID_VALUE; 1083 dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
@@ -1246,7 +1085,9 @@ static int dccp_v4_init_sock(struct sock *sk)
1246 return 0; 1085 return 0;
1247} 1086}
1248 1087
1249static int dccp_v4_destroy_sock(struct sock *sk) 1088EXPORT_SYMBOL_GPL(dccp_v4_init_sock);
1089
1090int dccp_v4_destroy_sock(struct sock *sk)
1250{ 1091{
1251 struct dccp_sock *dp = dccp_sk(sk); 1092 struct dccp_sock *dp = dccp_sk(sk);
1252 1093
@@ -1279,6 +1120,8 @@ static int dccp_v4_destroy_sock(struct sock *sk)
1279 return 0; 1120 return 0;
1280} 1121}
1281 1122
1123EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock);
1124
1282static void dccp_v4_reqsk_destructor(struct request_sock *req) 1125static void dccp_v4_reqsk_destructor(struct request_sock *req)
1283{ 1126{
1284 kfree(inet_rsk(req)->opt); 1127 kfree(inet_rsk(req)->opt);
@@ -1293,7 +1136,11 @@ static struct request_sock_ops dccp_request_sock_ops = {
1293 .send_reset = dccp_v4_ctl_send_reset, 1136 .send_reset = dccp_v4_ctl_send_reset,
1294}; 1137};
1295 1138
1296struct proto dccp_v4_prot = { 1139static struct timewait_sock_ops dccp_timewait_sock_ops = {
1140 .twsk_obj_size = sizeof(struct inet_timewait_sock),
1141};
1142
1143struct proto dccp_prot = {
1297 .name = "DCCP", 1144 .name = "DCCP",
1298 .owner = THIS_MODULE, 1145 .owner = THIS_MODULE,
1299 .close = dccp_close, 1146 .close = dccp_close,
@@ -1307,7 +1154,7 @@ struct proto dccp_v4_prot = {
1307 .recvmsg = dccp_recvmsg, 1154 .recvmsg = dccp_recvmsg,
1308 .backlog_rcv = dccp_v4_do_rcv, 1155 .backlog_rcv = dccp_v4_do_rcv,
1309 .hash = dccp_v4_hash, 1156 .hash = dccp_v4_hash,
1310 .unhash = dccp_v4_unhash, 1157 .unhash = dccp_unhash,
1311 .accept = inet_csk_accept, 1158 .accept = inet_csk_accept,
1312 .get_port = dccp_v4_get_port, 1159 .get_port = dccp_v4_get_port,
1313 .shutdown = dccp_shutdown, 1160 .shutdown = dccp_shutdown,
@@ -1316,5 +1163,7 @@ struct proto dccp_v4_prot = {
1316 .max_header = MAX_DCCP_HEADER, 1163 .max_header = MAX_DCCP_HEADER,
1317 .obj_size = sizeof(struct dccp_sock), 1164 .obj_size = sizeof(struct dccp_sock),
1318 .rsk_prot = &dccp_request_sock_ops, 1165 .rsk_prot = &dccp_request_sock_ops,
1319 .twsk_obj_size = sizeof(struct inet_timewait_sock), 1166 .twsk_prot = &dccp_timewait_sock_ops,
1320}; 1167};
1168
1169EXPORT_SYMBOL_GPL(dccp_prot);
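
The net effect of the ipv4.c rewrite is that everything address-family specific moves behind the inet_connection_sock_af_ops table installed in dccp_v4_init_sock: the core state machine calls icsk_af_ops->conn_request() and ->rebuild_header() instead of hard-wiring the v4 functions, the hand-rolled bind and established-hash walk is replaced by the generic inet_hash_connect(), and the receive tail collapses into sk_receive_skb(). A trimmed sketch of the dispatch shape (kernel context assumed; fields reduced to the ones the hunks exercise, struct name hypothetical):

    struct af_ops_sketch {
            int  (*conn_request)(struct sock *sk, struct sk_buff *skb);
            int  (*rebuild_header)(struct sock *sk);
            void (*send_check)(struct sock *sk, int len, struct sk_buff *skb);
    };

    static const struct af_ops_sketch v4_ops = {
            .conn_request   = dccp_v4_conn_request,
            .rebuild_header = inet_sk_rebuild_header,
            .send_check     = dccp_v4_send_check,
    };

    /* family-agnostic core code only ever touches the ops table,
     * so the same state machine can drive IPv4 and IPv6 sockets */
    static int handle_request(struct sock *sk, struct sk_buff *skb,
                              const struct af_ops_sketch *ops)
    {
            return ops->conn_request(sk, skb);
    }
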
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
new file mode 100644
index 000000000000..c609dc78f487
--- /dev/null
+++ b/net/dccp/ipv6.c
@@ -0,0 +1,1261 @@
1/*
2 * DCCP over IPv6
3 * Linux INET6 implementation
4 *
5 * Based on net/dccp6/ipv6.c
6 *
7 * Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/random.h>
18#include <linux/xfrm.h>
19
20#include <net/addrconf.h>
21#include <net/inet_common.h>
22#include <net/inet_hashtables.h>
23#include <net/inet_sock.h>
24#include <net/inet6_connection_sock.h>
25#include <net/inet6_hashtables.h>
26#include <net/ip6_route.h>
27#include <net/ipv6.h>
28#include <net/protocol.h>
29#include <net/transp_v6.h>
30#include <net/xfrm.h>
31
32#include "dccp.h"
33#include "ipv6.h"
34
35static void dccp_v6_ctl_send_reset(struct sk_buff *skb);
36static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
37 struct request_sock *req);
38static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb);
39
40static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
41
42static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
43static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
44
45static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
46{
47 return inet_csk_get_port(&dccp_hashinfo, sk, snum,
48 inet6_csk_bind_conflict);
49}
50
51static void dccp_v6_hash(struct sock *sk)
52{
53 if (sk->sk_state != DCCP_CLOSED) {
54 if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
55 dccp_prot.hash(sk);
56 return;
57 }
58 local_bh_disable();
59 __inet6_hash(&dccp_hashinfo, sk);
60 local_bh_enable();
61 }
62}
63
64static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len,
65 struct in6_addr *saddr,
66 struct in6_addr *daddr,
67 unsigned long base)
68{
69 return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base);
70}
71
72static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
73{
74 const struct dccp_hdr *dh = dccp_hdr(skb);
75
76 if (skb->protocol == htons(ETH_P_IPV6))
77 return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32,
78 skb->nh.ipv6h->saddr.s6_addr32,
79 dh->dccph_dport,
80 dh->dccph_sport);
81 else
82 return secure_dccp_sequence_number(skb->nh.iph->daddr,
83 skb->nh.iph->saddr,
84 dh->dccph_dport,
85 dh->dccph_sport);
86}
87
88static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
89 int addr_len)
90{
91 struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
92 struct inet_connection_sock *icsk = inet_csk(sk);
93 struct inet_sock *inet = inet_sk(sk);
94 struct ipv6_pinfo *np = inet6_sk(sk);
95 struct dccp_sock *dp = dccp_sk(sk);
96 struct in6_addr *saddr = NULL, *final_p = NULL, final;
97 struct flowi fl;
98 struct dst_entry *dst;
99 int addr_type;
100 int err;
101
102 dp->dccps_role = DCCP_ROLE_CLIENT;
103
104 if (addr_len < SIN6_LEN_RFC2133)
105 return -EINVAL;
106
107 if (usin->sin6_family != AF_INET6)
108 return -EAFNOSUPPORT;
109
110 memset(&fl, 0, sizeof(fl));
111
112 if (np->sndflow) {
113 fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
114 IP6_ECN_flow_init(fl.fl6_flowlabel);
115 if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
116 struct ip6_flowlabel *flowlabel;
117 flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
118 if (flowlabel == NULL)
119 return -EINVAL;
120 ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
121 fl6_sock_release(flowlabel);
122 }
123 }
124
125 /*
126 * connect() to INADDR_ANY means loopback (BSD'ism).
127 */
128
129 if (ipv6_addr_any(&usin->sin6_addr))
130 usin->sin6_addr.s6_addr[15] = 0x1;
131
132 addr_type = ipv6_addr_type(&usin->sin6_addr);
133
134 if(addr_type & IPV6_ADDR_MULTICAST)
135 return -ENETUNREACH;
136
137 if (addr_type & IPV6_ADDR_LINKLOCAL) {
138 if (addr_len >= sizeof(struct sockaddr_in6) &&
139 usin->sin6_scope_id) {
140 /* If interface is set while binding, indices
141 * must coincide.
142 */
143 if (sk->sk_bound_dev_if &&
144 sk->sk_bound_dev_if != usin->sin6_scope_id)
145 return -EINVAL;
146
147 sk->sk_bound_dev_if = usin->sin6_scope_id;
148 }
149
150 /* Connect to link-local address requires an interface */
151 if (!sk->sk_bound_dev_if)
152 return -EINVAL;
153 }
154
155 ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
156 np->flow_label = fl.fl6_flowlabel;
157
158 /*
159 * DCCP over IPv4
160 */
161
162 if (addr_type == IPV6_ADDR_MAPPED) {
163 u32 exthdrlen = icsk->icsk_ext_hdr_len;
164 struct sockaddr_in sin;
165
166 SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
167
168 if (__ipv6_only_sock(sk))
169 return -ENETUNREACH;
170
171 sin.sin_family = AF_INET;
172 sin.sin_port = usin->sin6_port;
173 sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
174
175 icsk->icsk_af_ops = &dccp_ipv6_mapped;
176 sk->sk_backlog_rcv = dccp_v4_do_rcv;
177
178 err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
179
180 if (err) {
181 icsk->icsk_ext_hdr_len = exthdrlen;
182 icsk->icsk_af_ops = &dccp_ipv6_af_ops;
183 sk->sk_backlog_rcv = dccp_v6_do_rcv;
184 goto failure;
185 } else {
186 ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
187 inet->saddr);
188 ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
189 inet->rcv_saddr);
190 }
191
192 return err;
193 }
194
195 if (!ipv6_addr_any(&np->rcv_saddr))
196 saddr = &np->rcv_saddr;
197
198 fl.proto = IPPROTO_DCCP;
199 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
200 ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
201 fl.oif = sk->sk_bound_dev_if;
202 fl.fl_ip_dport = usin->sin6_port;
203 fl.fl_ip_sport = inet->sport;
204
205 if (np->opt && np->opt->srcrt) {
206 struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
207 ipv6_addr_copy(&final, &fl.fl6_dst);
208 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
209 final_p = &final;
210 }
211
212 err = ip6_dst_lookup(sk, &dst, &fl);
213 if (err)
214 goto failure;
215 if (final_p)
216 ipv6_addr_copy(&fl.fl6_dst, final_p);
217
218 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
219 goto failure;
220
221 if (saddr == NULL) {
222 saddr = &fl.fl6_src;
223 ipv6_addr_copy(&np->rcv_saddr, saddr);
224 }
225
226 /* set the source address */
227 ipv6_addr_copy(&np->saddr, saddr);
228 inet->rcv_saddr = LOOPBACK4_IPV6;
229
230 ip6_dst_store(sk, dst, NULL);
231
232 icsk->icsk_ext_hdr_len = 0;
233 if (np->opt)
234 icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
235 np->opt->opt_nflen);
236
237 inet->dport = usin->sin6_port;
238
239 dccp_set_state(sk, DCCP_REQUESTING);
240 err = inet6_hash_connect(&dccp_death_row, sk);
241 if (err)
242 goto late_failure;
243 /* FIXME */
244#if 0
245 dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32,
246 np->daddr.s6_addr32,
247 inet->sport,
248 inet->dport);
249#endif
250 err = dccp_connect(sk);
251 if (err)
252 goto late_failure;
253
254 return 0;
255
256late_failure:
257 dccp_set_state(sk, DCCP_CLOSED);
258 __sk_dst_reset(sk);
259failure:
260 inet->dport = 0;
261 sk->sk_route_caps = 0;
262 return err;
263}
264
265static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
266 int type, int code, int offset, __u32 info)
267{
268 struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
269 const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
270 struct ipv6_pinfo *np;
271 struct sock *sk;
272 int err;
273 __u64 seq;
274
275 sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport,
276 &hdr->saddr, dh->dccph_sport, skb->dev->ifindex);
277
278 if (sk == NULL) {
279 ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
280 return;
281 }
282
283 if (sk->sk_state == DCCP_TIME_WAIT) {
284 inet_twsk_put((struct inet_timewait_sock *)sk);
285 return;
286 }
287
288 bh_lock_sock(sk);
289 if (sock_owned_by_user(sk))
290 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
291
292 if (sk->sk_state == DCCP_CLOSED)
293 goto out;
294
295 np = inet6_sk(sk);
296
297 if (type == ICMPV6_PKT_TOOBIG) {
298 struct dst_entry *dst = NULL;
299
300 if (sock_owned_by_user(sk))
301 goto out;
302 if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
303 goto out;
304
305 /* icmp should have updated the destination cache entry */
306 dst = __sk_dst_check(sk, np->dst_cookie);
307
308 if (dst == NULL) {
309 struct inet_sock *inet = inet_sk(sk);
310 struct flowi fl;
311
312 /* BUGGG_FUTURE: Again, it is not clear how
313 to handle rthdr case. Ignore this complexity
314 for now.
315 */
316 memset(&fl, 0, sizeof(fl));
317 fl.proto = IPPROTO_DCCP;
318 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
319 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
320 fl.oif = sk->sk_bound_dev_if;
321 fl.fl_ip_dport = inet->dport;
322 fl.fl_ip_sport = inet->sport;
323
324 if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
325 sk->sk_err_soft = -err;
326 goto out;
327 }
328
329 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
330 sk->sk_err_soft = -err;
331 goto out;
332 }
333
334 } else
335 dst_hold(dst);
336
337 if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
338 dccp_sync_mss(sk, dst_mtu(dst));
339 } /* else let the usual retransmit timer handle it */
340 dst_release(dst);
341 goto out;
342 }
343
344 icmpv6_err_convert(type, code, &err);
345
346 seq = DCCP_SKB_CB(skb)->dccpd_seq;
347 /* Might be for an request_sock */
348 switch (sk->sk_state) {
349 struct request_sock *req, **prev;
350 case DCCP_LISTEN:
351 if (sock_owned_by_user(sk))
352 goto out;
353
354 req = inet6_csk_search_req(sk, &prev, dh->dccph_dport,
355 &hdr->daddr, &hdr->saddr,
356 inet6_iif(skb));
357 if (!req)
358 goto out;
359
360 /* ICMPs are not backlogged, hence we cannot get
361 * an established socket here.
362 */
363 BUG_TRAP(req->sk == NULL);
364
365 if (seq != dccp_rsk(req)->dreq_iss) {
366 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
367 goto out;
368 }
369
370 inet_csk_reqsk_queue_drop(sk, req, prev);
371 goto out;
372
373 case DCCP_REQUESTING:
374 case DCCP_RESPOND: /* Cannot happen.
375 It can, it SYNs are crossed. --ANK */
376 if (!sock_owned_by_user(sk)) {
377 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
378 sk->sk_err = err;
379 /*
380 * Wake people up to see the error
381 * (see connect in sock.c)
382 */
383 sk->sk_error_report(sk);
384
385 dccp_done(sk);
386 } else
387 sk->sk_err_soft = err;
388 goto out;
389 }
390
391 if (!sock_owned_by_user(sk) && np->recverr) {
392 sk->sk_err = err;
393 sk->sk_error_report(sk);
394 } else
395 sk->sk_err_soft = err;
396
397out:
398 bh_unlock_sock(sk);
399 sock_put(sk);
400}
401
402
403static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
404 struct dst_entry *dst)
405{
406 struct inet6_request_sock *ireq6 = inet6_rsk(req);
407 struct ipv6_pinfo *np = inet6_sk(sk);
408 struct sk_buff *skb;
409 struct ipv6_txoptions *opt = NULL;
410 struct in6_addr *final_p = NULL, final;
411 struct flowi fl;
412 int err = -1;
413
414 memset(&fl, 0, sizeof(fl));
415 fl.proto = IPPROTO_DCCP;
416 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
417 ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
418 fl.fl6_flowlabel = 0;
419 fl.oif = ireq6->iif;
420 fl.fl_ip_dport = inet_rsk(req)->rmt_port;
421 fl.fl_ip_sport = inet_sk(sk)->sport;
422
423 if (dst == NULL) {
424 opt = np->opt;
425 if (opt == NULL &&
426 np->rxopt.bits.osrcrt == 2 &&
427 ireq6->pktopts) {
428 struct sk_buff *pktopts = ireq6->pktopts;
429 struct inet6_skb_parm *rxopt = IP6CB(pktopts);
430 if (rxopt->srcrt)
431 opt = ipv6_invert_rthdr(sk,
432 (struct ipv6_rt_hdr *)(pktopts->nh.raw +
433 rxopt->srcrt));
434 }
435
436 if (opt && opt->srcrt) {
437 struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
438 ipv6_addr_copy(&final, &fl.fl6_dst);
439 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
440 final_p = &final;
441 }
442
443 err = ip6_dst_lookup(sk, &dst, &fl);
444 if (err)
445 goto done;
446 if (final_p)
447 ipv6_addr_copy(&fl.fl6_dst, final_p);
448 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
449 goto done;
450 }
451
452 skb = dccp_make_response(sk, dst, req);
453 if (skb != NULL) {
454 struct dccp_hdr *dh = dccp_hdr(skb);
455 dh->dccph_checksum = dccp_v6_check(dh, skb->len,
456 &ireq6->loc_addr,
457 &ireq6->rmt_addr,
458 csum_partial((char *)dh,
459 skb->len,
460 skb->csum));
461 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
462 err = ip6_xmit(sk, skb, &fl, opt, 0);
463 if (err == NET_XMIT_CN)
464 err = 0;
465 }
466
467done:
468 if (opt && opt != np->opt)
469 sock_kfree_s(sk, opt, opt->tot_len);
470 return err;
471}
472
473static void dccp_v6_reqsk_destructor(struct request_sock *req)
474{
475 if (inet6_rsk(req)->pktopts != NULL)
476 kfree_skb(inet6_rsk(req)->pktopts);
477}
478
479static struct request_sock_ops dccp6_request_sock_ops = {
480 .family = AF_INET6,
481 .obj_size = sizeof(struct dccp6_request_sock),
482 .rtx_syn_ack = dccp_v6_send_response,
483 .send_ack = dccp_v6_reqsk_send_ack,
484 .destructor = dccp_v6_reqsk_destructor,
485 .send_reset = dccp_v6_ctl_send_reset,
486};
487
488static struct timewait_sock_ops dccp6_timewait_sock_ops = {
489 .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
490};
491
492static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
493{
494 struct ipv6_pinfo *np = inet6_sk(sk);
495 struct dccp_hdr *dh = dccp_hdr(skb);
496
497 dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr,
498 len, IPPROTO_DCCP,
499 csum_partial((char *)dh,
500 dh->dccph_doff << 2,
501 skb->csum));
502}
503
504static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb)
505{
506 struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
507 const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
508 sizeof(struct dccp_hdr_ext) +
509 sizeof(struct dccp_hdr_reset);
510 struct sk_buff *skb;
511 struct flowi fl;
512 u64 seqno;
513
514 if (rxdh->dccph_type == DCCP_PKT_RESET)
515 return;
516
517 if (!ipv6_unicast_destination(rxskb))
518 return;
519
520 /*
521 * We need to grab some memory, and put together an RST,
522 * and then put it into the queue to be sent.
523 */
524
525 skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
526 dccp_hdr_reset_len, GFP_ATOMIC);
527 if (skb == NULL)
528 return;
529
530 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
531 dccp_hdr_reset_len);
532
533 skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
534 dh = dccp_hdr(skb);
535 memset(dh, 0, dccp_hdr_reset_len);
536
537 /* Swap the send and the receive. */
538 dh->dccph_type = DCCP_PKT_RESET;
539 dh->dccph_sport = rxdh->dccph_dport;
540 dh->dccph_dport = rxdh->dccph_sport;
541 dh->dccph_doff = dccp_hdr_reset_len / 4;
542 dh->dccph_x = 1;
543 dccp_hdr_reset(skb)->dccph_reset_code =
544 DCCP_SKB_CB(rxskb)->dccpd_reset_code;
545
546 /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
547 seqno = 0;
548 if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
549 dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
550
551 dccp_hdr_set_seq(dh, seqno);
552 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
553 DCCP_SKB_CB(rxskb)->dccpd_seq);
554
555 memset(&fl, 0, sizeof(fl));
556 ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
557 ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
558 dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
559 sizeof(*dh), IPPROTO_DCCP,
560 skb->csum);
561 fl.proto = IPPROTO_DCCP;
562 fl.oif = inet6_iif(rxskb);
563 fl.fl_ip_dport = dh->dccph_dport;
564 fl.fl_ip_sport = dh->dccph_sport;
565
566 /* sk = NULL, but it is safe for now. RST socket required. */
567 if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
568 if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
569 ip6_xmit(NULL, skb, &fl, NULL, 0);
570 DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
571 DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
572 return;
573 }
574 }
575
576 kfree_skb(skb);
577}
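The sequence number chosen for the Reset follows the spec rule cited above: when the offending packet carried an Acknowledgement Number, the Reset uses P.ackno + 1; otherwise the sequence number is zero. As a standalone sketch (illustrative name; 48-bit arithmetic because dccph_x = 1 selects extended sequence numbers):

	/* Reset seqno per "8.3.1. Abnormal Termination" (sketch, not kernel code). */
	static uint64_t dccp_reset_seqno_sketch(int has_ackno, uint64_t rx_ackno)
	{
		const uint64_t mask = (1ULL << 48) - 1;	/* 48-bit sequence space */

		return has_ackno ? (rx_ackno + 1) & mask : 0;
	}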
578
579static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb)
580{
581 struct flowi fl;
582 struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
583 const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
584 sizeof(struct dccp_hdr_ext) +
585 sizeof(struct dccp_hdr_ack_bits);
586 struct sk_buff *skb;
587
588 skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
589 dccp_hdr_ack_len, GFP_ATOMIC);
590 if (skb == NULL)
591 return;
592
593 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
594 dccp_hdr_ack_len);
595
596 skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
597 dh = dccp_hdr(skb);
598 memset(dh, 0, dccp_hdr_ack_len);
599
600 /* Build DCCP header and checksum it. */
601 dh->dccph_type = DCCP_PKT_ACK;
602 dh->dccph_sport = rxdh->dccph_dport;
603 dh->dccph_dport = rxdh->dccph_sport;
604 dh->dccph_doff = dccp_hdr_ack_len / 4;
605 dh->dccph_x = 1;
606
607 dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
608 dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
609 DCCP_SKB_CB(rxskb)->dccpd_seq);
610
611 memset(&fl, 0, sizeof(fl));
612 ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
613 ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
614
615	/* FIXME: calculate the checksum; the IPv4 side should, too... */
616
617 fl.proto = IPPROTO_DCCP;
618 fl.oif = inet6_iif(rxskb);
619 fl.fl_ip_dport = dh->dccph_dport;
620 fl.fl_ip_sport = dh->dccph_sport;
621
622 if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
623 if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
624 ip6_xmit(NULL, skb, &fl, NULL, 0);
625 DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
626 return;
627 }
628 }
629
630 kfree_skb(skb);
631}
632
633static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
634 struct request_sock *req)
635{
636 dccp_v6_ctl_send_ack(skb);
637}
638
639static struct sock *dccp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
640{
641 const struct dccp_hdr *dh = dccp_hdr(skb);
642 const struct ipv6hdr *iph = skb->nh.ipv6h;
643 struct sock *nsk;
644 struct request_sock **prev;
645 /* Find possible connection requests. */
646 struct request_sock *req = inet6_csk_search_req(sk, &prev,
647 dh->dccph_sport,
648 &iph->saddr,
649 &iph->daddr,
650 inet6_iif(skb));
651 if (req != NULL)
652 return dccp_check_req(sk, skb, req, prev);
653
654 nsk = __inet6_lookup_established(&dccp_hashinfo,
655 &iph->saddr, dh->dccph_sport,
656 &iph->daddr, ntohs(dh->dccph_dport),
657 inet6_iif(skb));
658
659 if (nsk != NULL) {
660 if (nsk->sk_state != DCCP_TIME_WAIT) {
661 bh_lock_sock(nsk);
662 return nsk;
663 }
664 inet_twsk_put((struct inet_timewait_sock *)nsk);
665 return NULL;
666 }
667
668 return sk;
669}
670
671static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
672{
673 struct inet_request_sock *ireq;
674 struct dccp_sock dp;
675 struct request_sock *req;
676 struct dccp_request_sock *dreq;
677 struct inet6_request_sock *ireq6;
678 struct ipv6_pinfo *np = inet6_sk(sk);
679 const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
680 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
681 __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
682
683 if (skb->protocol == htons(ETH_P_IP))
684 return dccp_v4_conn_request(sk, skb);
685
686 if (!ipv6_unicast_destination(skb))
687 goto drop;
688
689 if (dccp_bad_service_code(sk, service)) {
690 reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
691 goto drop;
692 }
693 /*
694 * There are no SYN attacks on IPv6, yet...
695 */
696 if (inet_csk_reqsk_queue_is_full(sk))
697 goto drop;
698
699 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
700 goto drop;
701
702 req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot);
703 if (req == NULL)
704 goto drop;
705
706 /* FIXME: process options */
707
708 dccp_openreq_init(req, &dp, skb);
709
710 ireq6 = inet6_rsk(req);
711 ireq = inet_rsk(req);
712 ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr);
713 ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr);
714 req->rcv_wnd = 100; /* Fake, option parsing will get the
715 right value */
716 ireq6->pktopts = NULL;
717
718 if (ipv6_opt_accepted(sk, skb) ||
719 np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
720 np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
721 atomic_inc(&skb->users);
722 ireq6->pktopts = skb;
723 }
724 ireq6->iif = sk->sk_bound_dev_if;
725
726 /* So that link locals have meaning */
727 if (!sk->sk_bound_dev_if &&
728 ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
729 ireq6->iif = inet6_iif(skb);
730
731 /*
732 * Step 3: Process LISTEN state
733 *
734 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
735 *
736 * In fact we defer setting S.GSR, S.SWL, S.SWH to
737 * dccp_create_openreq_child.
738 */
739 dreq = dccp_rsk(req);
740 dreq->dreq_isr = dcb->dccpd_seq;
741 dreq->dreq_iss = dccp_v6_init_sequence(sk, skb);
742 dreq->dreq_service = service;
743
744 if (dccp_v6_send_response(sk, req, NULL))
745 goto drop_and_free;
746
747 inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
748 return 0;
749
750drop_and_free:
751 reqsk_free(req);
752drop:
753 DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
754 dcb->dccpd_reset_code = reset_code;
755 return -1;
756}
757
758static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
759 struct sk_buff *skb,
760 struct request_sock *req,
761 struct dst_entry *dst)
762{
763 struct inet6_request_sock *ireq6 = inet6_rsk(req);
764 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
765 struct inet_sock *newinet;
766 struct dccp_sock *newdp;
767 struct dccp6_sock *newdp6;
768 struct sock *newsk;
769 struct ipv6_txoptions *opt;
770
771 if (skb->protocol == htons(ETH_P_IP)) {
772 /*
773 * v6 mapped
774 */
775
776 newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
777 if (newsk == NULL)
778 return NULL;
779
780 newdp6 = (struct dccp6_sock *)newsk;
781 newdp = dccp_sk(newsk);
782 newinet = inet_sk(newsk);
783 newinet->pinet6 = &newdp6->inet6;
784 newnp = inet6_sk(newsk);
785
786 memcpy(newnp, np, sizeof(struct ipv6_pinfo));
787
788 ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
789 newinet->daddr);
790
791 ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
792 newinet->saddr);
793
794 ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
795
796 inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
797 newsk->sk_backlog_rcv = dccp_v4_do_rcv;
798 newnp->pktoptions = NULL;
799 newnp->opt = NULL;
800 newnp->mcast_oif = inet6_iif(skb);
801 newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
802
803 /*
804 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
805 * here, dccp_create_openreq_child now does this for us, see the comment in
806 * that function for the gory details. -acme
807 */
808
809		/* It is a tricky place. Until this moment the IPv4 code
810		   worked with the IPv6 icsk.icsk_af_ops.
811		   Sync it now.
812		 */
813 dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
814
815 return newsk;
816 }
817
818 opt = np->opt;
819
820 if (sk_acceptq_is_full(sk))
821 goto out_overflow;
822
823 if (np->rxopt.bits.osrcrt == 2 &&
824 opt == NULL && ireq6->pktopts) {
825 struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts);
826 if (rxopt->srcrt)
827 opt = ipv6_invert_rthdr(sk,
828 (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw +
829 rxopt->srcrt));
830 }
831
832 if (dst == NULL) {
833 struct in6_addr *final_p = NULL, final;
834 struct flowi fl;
835
836 memset(&fl, 0, sizeof(fl));
837 fl.proto = IPPROTO_DCCP;
838 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
839 if (opt && opt->srcrt) {
840 struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
841 ipv6_addr_copy(&final, &fl.fl6_dst);
842 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
843 final_p = &final;
844 }
845 ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
846 fl.oif = sk->sk_bound_dev_if;
847 fl.fl_ip_dport = inet_rsk(req)->rmt_port;
848 fl.fl_ip_sport = inet_sk(sk)->sport;
849
850 if (ip6_dst_lookup(sk, &dst, &fl))
851 goto out;
852
853 if (final_p)
854 ipv6_addr_copy(&fl.fl6_dst, final_p);
855
856 if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
857 goto out;
858 }
859
860 newsk = dccp_create_openreq_child(sk, req, skb);
861 if (newsk == NULL)
862 goto out;
863
864 /*
865 * No need to charge this sock to the relevant IPv6 refcnt debug socks
866 * count here, dccp_create_openreq_child now does this for us, see the
867 * comment in that function for the gory details. -acme
868 */
869
870 ip6_dst_store(newsk, dst, NULL);
871 newsk->sk_route_caps = dst->dev->features &
872 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
873
874 newdp6 = (struct dccp6_sock *)newsk;
875 newinet = inet_sk(newsk);
876 newinet->pinet6 = &newdp6->inet6;
877 newdp = dccp_sk(newsk);
878 newnp = inet6_sk(newsk);
879
880 memcpy(newnp, np, sizeof(struct ipv6_pinfo));
881
882 ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
883 ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
884 ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
885 newsk->sk_bound_dev_if = ireq6->iif;
886
887 /* Now IPv6 options...
888
889 First: no IPv4 options.
890 */
891 newinet->opt = NULL;
892
893 /* Clone RX bits */
894 newnp->rxopt.all = np->rxopt.all;
895
896 /* Clone pktoptions received with SYN */
897 newnp->pktoptions = NULL;
898 if (ireq6->pktopts != NULL) {
899 newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
900 kfree_skb(ireq6->pktopts);
901 ireq6->pktopts = NULL;
902 if (newnp->pktoptions)
903 skb_set_owner_r(newnp->pktoptions, newsk);
904 }
905 newnp->opt = NULL;
906 newnp->mcast_oif = inet6_iif(skb);
907 newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
908
909	/* Clone native IPv6 options from the listening socket (if any)
910
911	   Yes, keeping a reference count would be much more clever,
912	   but we do one more thing here: reattach optmem
913	   to newsk.
914 */
915 if (opt) {
916 newnp->opt = ipv6_dup_options(newsk, opt);
917 if (opt != np->opt)
918 sock_kfree_s(sk, opt, opt->tot_len);
919 }
920
921 inet_csk(newsk)->icsk_ext_hdr_len = 0;
922 if (newnp->opt)
923 inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
924 newnp->opt->opt_flen);
925
926 dccp_sync_mss(newsk, dst_mtu(dst));
927
928 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
929
930 __inet6_hash(&dccp_hashinfo, newsk);
931 inet_inherit_port(&dccp_hashinfo, sk, newsk);
932
933 return newsk;
934
935out_overflow:
936 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
937out:
938 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
939 if (opt && opt != np->opt)
940 sock_kfree_s(sk, opt, opt->tot_len);
941 dst_release(dst);
942 return NULL;
943}
944
945/* The socket must have its spinlock held when we get
946 * here.
947 *
948 * We have a potential double-lock case here, so even when
949 * doing backlog processing we use the BH locking scheme.
950 * This is because we cannot sleep with the original spinlock
951 * held.
952 */
953static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
954{
955 struct ipv6_pinfo *np = inet6_sk(sk);
956 struct sk_buff *opt_skb = NULL;
957
958	/* Imagine: the socket is IPv6. An IPv4 packet arrives,
959	   goes to the IPv4 receive handler and is backlogged.
960	   From the backlog it always comes here. Kerboom...
961	   Fortunately, dccp_rcv_established and rcv_established
962	   handle them correctly, but that is not the case with
963	   dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK
964 */
965
966 if (skb->protocol == htons(ETH_P_IP))
967 return dccp_v4_do_rcv(sk, skb);
968
969 if (sk_filter(sk, skb, 0))
970 goto discard;
971
972 /*
973 * socket locking is here for SMP purposes as backlog rcv
974 * is currently called with bh processing disabled.
975 */
976
977	/* Do Stevens' IPV6_PKTOPTIONS.
978
979	   Yes, guys, it is the only place in our code where we
980	   can do this without affecting IPv4.
981	   The rest of the code is protocol independent,
982	   and I do not like the idea of uglifying IPv4.
983
984	   Actually, the whole idea behind IPV6_PKTOPTIONS
985	   does not look very well thought out. For now we latch
986	   the options received in the last packet enqueued
987	   by tcp. Feel free to propose a better solution.
988	   --ANK (980728)
989 */
990 if (np->rxopt.all)
991 opt_skb = skb_clone(skb, GFP_ATOMIC);
992
993 if (sk->sk_state == DCCP_OPEN) { /* Fast path */
994 if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
995 goto reset;
996 return 0;
997 }
998
999 if (sk->sk_state == DCCP_LISTEN) {
1000 struct sock *nsk = dccp_v6_hnd_req(sk, skb);
1001 if (!nsk)
1002 goto discard;
1003
1004 /*
1005 * Queue it on the new socket if the new socket is active,
1006 * otherwise we just shortcircuit this and continue with
1007		 * the new socket.
1008 */
1009		if (nsk != sk) {
1010 if (dccp_child_process(sk, nsk, skb))
1011 goto reset;
1012 if (opt_skb)
1013 __kfree_skb(opt_skb);
1014 return 0;
1015 }
1016 }
1017
1018 if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
1019 goto reset;
1020 return 0;
1021
1022reset:
1023 dccp_v6_ctl_send_reset(skb);
1024discard:
1025 if (opt_skb)
1026 __kfree_skb(opt_skb);
1027 kfree_skb(skb);
1028 return 0;
1029}
1030
1031static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
1032{
1033 const struct dccp_hdr *dh;
1034 struct sk_buff *skb = *pskb;
1035 struct sock *sk;
1036
1037 /* Step 1: Check header basics: */
1038
1039 if (dccp_invalid_packet(skb))
1040 goto discard_it;
1041
1042 dh = dccp_hdr(skb);
1043
1044 DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
1045 DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
1046
1047 if (dccp_packet_without_ack(skb))
1048 DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
1049 else
1050 DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
1051
1052 /* Step 2:
1053 * Look up flow ID in table and get corresponding socket */
1054 sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr,
1055 dh->dccph_sport,
1056 &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport),
1057 inet6_iif(skb));
1058 /*
1059 * Step 2:
1060 * If no socket ...
1061 * Generate Reset(No Connection) unless P.type == Reset
1062 * Drop packet and return
1063 */
1064 if (sk == NULL)
1065 goto no_dccp_socket;
1066
1067 /*
1068 * Step 2:
1069 * ... or S.state == TIMEWAIT,
1070 * Generate Reset(No Connection) unless P.type == Reset
1071 * Drop packet and return
1072 */
1073
1074 if (sk->sk_state == DCCP_TIME_WAIT)
1075 goto do_time_wait;
1076
1077 if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
1078 goto discard_and_relse;
1079
1080 return sk_receive_skb(sk, skb) ? -1 : 0;
1081
1082no_dccp_socket:
1083 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
1084 goto discard_it;
1085 /*
1086 * Step 2:
1087 * Generate Reset(No Connection) unless P.type == Reset
1088 * Drop packet and return
1089 */
1090 if (dh->dccph_type != DCCP_PKT_RESET) {
1091 DCCP_SKB_CB(skb)->dccpd_reset_code =
1092 DCCP_RESET_CODE_NO_CONNECTION;
1093 dccp_v6_ctl_send_reset(skb);
1094 }
1095discard_it:
1096
1097 /*
1098 * Discard frame
1099 */
1100
1101 kfree_skb(skb);
1102 return 0;
1103
1104discard_and_relse:
1105 sock_put(sk);
1106 goto discard_it;
1107
1108do_time_wait:
1109 inet_twsk_put((struct inet_timewait_sock *)sk);
1110 goto no_dccp_socket;
1111}
1112
1113static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
1114 .queue_xmit = inet6_csk_xmit,
1115 .send_check = dccp_v6_send_check,
1116 .rebuild_header = inet6_sk_rebuild_header,
1117 .conn_request = dccp_v6_conn_request,
1118 .syn_recv_sock = dccp_v6_request_recv_sock,
1119 .net_header_len = sizeof(struct ipv6hdr),
1120 .setsockopt = ipv6_setsockopt,
1121 .getsockopt = ipv6_getsockopt,
1122 .addr2sockaddr = inet6_csk_addr2sockaddr,
1123 .sockaddr_len = sizeof(struct sockaddr_in6)
1124};
1125
1126/*
1127 * DCCP over IPv4 via INET6 API
1128 */
1129static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
1130 .queue_xmit = ip_queue_xmit,
1131 .send_check = dccp_v4_send_check,
1132 .rebuild_header = inet_sk_rebuild_header,
1133 .conn_request = dccp_v6_conn_request,
1134 .syn_recv_sock = dccp_v6_request_recv_sock,
1135 .net_header_len = sizeof(struct iphdr),
1136 .setsockopt = ipv6_setsockopt,
1137 .getsockopt = ipv6_getsockopt,
1138 .addr2sockaddr = inet6_csk_addr2sockaddr,
1139 .sockaddr_len = sizeof(struct sockaddr_in6)
1140};
1141
1142/* NOTE: A lot of things are set to zero explicitly by the call to
1143 *       sk_alloc(), so they need not be done here.
1144 */
1145static int dccp_v6_init_sock(struct sock *sk)
1146{
1147 int err = dccp_v4_init_sock(sk);
1148
1149 if (err == 0)
1150 inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
1151
1152 return err;
1153}
1154
1155static int dccp_v6_destroy_sock(struct sock *sk)
1156{
1157 dccp_v4_destroy_sock(sk);
1158 return inet6_destroy_sock(sk);
1159}
1160
1161static struct proto dccp_v6_prot = {
1162 .name = "DCCPv6",
1163 .owner = THIS_MODULE,
1164 .close = dccp_close,
1165 .connect = dccp_v6_connect,
1166 .disconnect = dccp_disconnect,
1167 .ioctl = dccp_ioctl,
1168 .init = dccp_v6_init_sock,
1169 .setsockopt = dccp_setsockopt,
1170 .getsockopt = dccp_getsockopt,
1171 .sendmsg = dccp_sendmsg,
1172 .recvmsg = dccp_recvmsg,
1173 .backlog_rcv = dccp_v6_do_rcv,
1174 .hash = dccp_v6_hash,
1175 .unhash = dccp_unhash,
1176 .accept = inet_csk_accept,
1177 .get_port = dccp_v6_get_port,
1178 .shutdown = dccp_shutdown,
1179 .destroy = dccp_v6_destroy_sock,
1180 .orphan_count = &dccp_orphan_count,
1181 .max_header = MAX_DCCP_HEADER,
1182 .obj_size = sizeof(struct dccp6_sock),
1183 .rsk_prot = &dccp6_request_sock_ops,
1184 .twsk_prot = &dccp6_timewait_sock_ops,
1185};
1186
1187static struct inet6_protocol dccp_v6_protocol = {
1188 .handler = dccp_v6_rcv,
1189 .err_handler = dccp_v6_err,
1190 .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
1191};
1192
1193static struct proto_ops inet6_dccp_ops = {
1194 .family = PF_INET6,
1195 .owner = THIS_MODULE,
1196 .release = inet6_release,
1197 .bind = inet6_bind,
1198 .connect = inet_stream_connect,
1199 .socketpair = sock_no_socketpair,
1200 .accept = inet_accept,
1201 .getname = inet6_getname,
1202 .poll = dccp_poll,
1203 .ioctl = inet6_ioctl,
1204 .listen = inet_dccp_listen,
1205 .shutdown = inet_shutdown,
1206 .setsockopt = sock_common_setsockopt,
1207 .getsockopt = sock_common_getsockopt,
1208 .sendmsg = inet_sendmsg,
1209 .recvmsg = sock_common_recvmsg,
1210 .mmap = sock_no_mmap,
1211 .sendpage = sock_no_sendpage,
1212};
1213
1214static struct inet_protosw dccp_v6_protosw = {
1215 .type = SOCK_DCCP,
1216 .protocol = IPPROTO_DCCP,
1217 .prot = &dccp_v6_prot,
1218 .ops = &inet6_dccp_ops,
1219 .capability = -1,
1220 .flags = INET_PROTOSW_ICSK,
1221};
1222
1223static int __init dccp_v6_init(void)
1224{
1225 int err = proto_register(&dccp_v6_prot, 1);
1226
1227 if (err != 0)
1228 goto out;
1229
1230 err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
1231 if (err != 0)
1232 goto out_unregister_proto;
1233
1234 inet6_register_protosw(&dccp_v6_protosw);
1235out:
1236 return err;
1237out_unregister_proto:
1238 proto_unregister(&dccp_v6_prot);
1239 goto out;
1240}
1241
1242static void __exit dccp_v6_exit(void)
1243{
1244 inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
1245 inet6_unregister_protosw(&dccp_v6_protosw);
1246 proto_unregister(&dccp_v6_prot);
1247}
1248
1249module_init(dccp_v6_init);
1250module_exit(dccp_v6_exit);
1251
1252/*
1253 * __stringify doesn't like enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
1254 * values directly. Also cover the case where the protocol is not specified,
1255 * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
1256 */
1257MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6");
1258MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6");
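For reference, these aliases are what module autoloading matches when a socket is created before the protocol is registered; the fallback request composed by the AF_INET6 socket-creation path presumably looks something like the sketch below (not a verbatim call site):

	/*
	 * Sketch of the autoload request (illustrative):
	 *
	 *	request_module("net-pf-%d-proto-%d-type-%d",
	 *		       PF_INET6, protocol, sock->type);
	 *
	 * For socket(PF_INET6, SOCK_DCCP, IPPROTO_DCCP) this expands to
	 * "net-pf-10-proto-33-type-6", matching the first alias above.
	 */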
1259MODULE_LICENSE("GPL");
1260MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
1261MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
new file mode 100644
index 000000000000..e4d4e9309270
--- /dev/null
+++ b/net/dccp/ipv6.h
@@ -0,0 +1,37 @@
1#ifndef _DCCP_IPV6_H
2#define _DCCP_IPV6_H
3/*
4 * net/dccp/ipv6.h
5 *
6 * An implementation of the DCCP protocol
7 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/config.h>
15#include <linux/dccp.h>
16#include <linux/ipv6.h>
17
18struct dccp6_sock {
19 struct dccp_sock dccp;
20 /*
21 * ipv6_pinfo has to be the last member of dccp6_sock,
22 * see inet6_sk_generic.
23 */
24 struct ipv6_pinfo inet6;
25};
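The "last member" requirement exists because the generic IPv6 code locates the ipv6_pinfo at a fixed offset from the end of the protocol's socket object, using the obj_size registered in struct proto. The helper is roughly the following (a sketch of inet6_sk_generic; consult net/ipv6/af_inet6.c for the authoritative version):

	static inline struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
	{
		/* obj_size is sizeof(struct dccp6_sock) for DCCPv6 sockets,
		 * so the ipv6_pinfo occupies the trailing bytes of the
		 * object. */
		const int offset = sk->sk_prot->obj_size -
				   sizeof(struct ipv6_pinfo);

		return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
	}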
26
27struct dccp6_request_sock {
28 struct dccp_request_sock dccp;
29 struct inet6_request_sock inet6;
30};
31
32struct dccp6_timewait_sock {
33 struct inet_timewait_sock inet;
34 struct inet6_timewait_sock tw6;
35};
36
37#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 1393461898bb..29261fc198e7 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = {
40 (unsigned long)&dccp_death_row), 40 (unsigned long)&dccp_death_row),
41}; 41};
42 42
43EXPORT_SYMBOL_GPL(dccp_death_row);
44
43void dccp_time_wait(struct sock *sk, int state, int timeo) 45void dccp_time_wait(struct sock *sk, int state, int timeo)
44{ 46{
45 struct inet_timewait_sock *tw = NULL; 47 struct inet_timewait_sock *tw = NULL;
@@ -50,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
50 if (tw != NULL) { 52 if (tw != NULL) {
51 const struct inet_connection_sock *icsk = inet_csk(sk); 53 const struct inet_connection_sock *icsk = inet_csk(sk);
52 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 54 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
53 55#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
56 if (tw->tw_family == PF_INET6) {
57 const struct ipv6_pinfo *np = inet6_sk(sk);
58 struct inet6_timewait_sock *tw6;
59
60 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
61 tw6 = inet6_twsk((struct sock *)tw);
62 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
63 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
64 tw->tw_ipv6only = np->ipv6only;
65 }
66#endif
54 /* Linkage updates. */ 67 /* Linkage updates. */
55 __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); 68 __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
56 69
@@ -170,6 +183,8 @@ out_free:
170 return newsk; 183 return newsk;
171} 184}
172 185
186EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
187
173/* 188/*
174 * Process an incoming packet for RESPOND sockets represented 189 * Process an incoming packet for RESPOND sockets represented
175 * as an request_sock. 190 * as an request_sock.
@@ -214,7 +229,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
214 goto drop; 229 goto drop;
215 } 230 }
216 231
217 child = dccp_v4_request_recv_sock(sk, skb, req, NULL); 232 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
218 if (child == NULL) 233 if (child == NULL)
219 goto listen_overflow; 234 goto listen_overflow;
220 235
@@ -236,6 +251,8 @@ drop:
236 goto out; 251 goto out;
237} 252}
238 253
254EXPORT_SYMBOL_GPL(dccp_check_req);
255
239/* 256/*
240 * Queue segment on the new socket if the new socket is active, 257 * Queue segment on the new socket if the new socket is active,
241 * otherwise we just shortcircuit this and continue with 258 * otherwise we just shortcircuit this and continue with
@@ -266,3 +283,5 @@ int dccp_child_process(struct sock *parent, struct sock *child,
266 sock_put(child); 283 sock_put(child);
267 return ret; 284 return ret;
268} 285}
286
287EXPORT_SYMBOL_GPL(dccp_child_process);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 74ff87025878..efd7ffb903a1 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -15,6 +15,7 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17 17
18#include <net/inet_sock.h>
18#include <net/sock.h> 19#include <net/sock.h>
19 20
20#include "ackvec.h" 21#include "ackvec.h"
@@ -43,6 +44,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
43{ 44{
44 if (likely(skb != NULL)) { 45 if (likely(skb != NULL)) {
45 const struct inet_sock *inet = inet_sk(sk); 46 const struct inet_sock *inet = inet_sk(sk);
47 const struct inet_connection_sock *icsk = inet_csk(sk);
46 struct dccp_sock *dp = dccp_sk(sk); 48 struct dccp_sock *dp = dccp_sk(sk);
47 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 49 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
48 struct dccp_hdr *dh; 50 struct dccp_hdr *dh;
@@ -108,8 +110,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
108 break; 110 break;
109 } 111 }
110 112
111 dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, 113 icsk->icsk_af_ops->send_check(sk, skb->len, skb);
112 inet->daddr);
113 114
114 if (set_ack) 115 if (set_ack)
115 dccp_event_ack_sent(sk); 116 dccp_event_ack_sent(sk);
@@ -117,7 +118,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
117 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 118 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
118 119
119 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 120 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
120 err = ip_queue_xmit(skb, 0); 121 err = icsk->icsk_af_ops->queue_xmit(skb, 0);
121 if (err <= 0) 122 if (err <= 0)
122 return err; 123 return err;
123 124
@@ -134,20 +135,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
134 135
135unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) 136unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
136{ 137{
138 struct inet_connection_sock *icsk = inet_csk(sk);
137 struct dccp_sock *dp = dccp_sk(sk); 139 struct dccp_sock *dp = dccp_sk(sk);
138 int mss_now; 140 int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
139 141 sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
140 /*
141 * FIXME: we really should be using the af_specific thing to support
142 * IPv6.
143 * mss_now = pmtu - tp->af_specific->net_header_len -
144 * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
145 */
146 mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
147 sizeof(struct dccp_hdr_ext);
148 142
149 /* Now subtract optional transport overhead */ 143 /* Now subtract optional transport overhead */
150 mss_now -= dp->dccps_ext_header_len; 144 mss_now -= icsk->icsk_ext_hdr_len;
151 145
152 /* 146 /*
153 * FIXME: this should come from the CCID infrastructure, where, say, 147 * FIXME: this should come from the CCID infrastructure, where, say,
@@ -160,12 +154,14 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
160 mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; 154 mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
161 155
162 /* And store cached results */ 156 /* And store cached results */
163 dp->dccps_pmtu_cookie = pmtu; 157 icsk->icsk_pmtu_cookie = pmtu;
164 dp->dccps_mss_cache = mss_now; 158 dp->dccps_mss_cache = mss_now;
165 159
166 return mss_now; 160 return mss_now;
167} 161}
168 162
163EXPORT_SYMBOL_GPL(dccp_sync_mss);
164
169void dccp_write_space(struct sock *sk) 165void dccp_write_space(struct sock *sk)
170{ 166{
171 read_lock(&sk->sk_callback_lock); 167 read_lock(&sk->sk_callback_lock);
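Worked example of the now family-independent MSS formula (assumptions: a 1500-byte path MTU, the usual 12-byte struct dccp_hdr plus 4-byte struct dccp_hdr_ext, icsk_ext_hdr_len of 0, and the fixed option reserve of ((5+6+10+6+6+6+3)/4)*4 = 40 bytes):

	IPv6:  mss = 1500 - 40 (net_header_len) - 12 - 4 - 0 - 40 = 1404
	IPv4:  mss = 1500 - 20 (net_header_len) - 12 - 4 - 0 - 40 = 1424

Before this change the IPv4 header size was hardcoded, so the IPv6 case would have overestimated the MSS by 20 bytes.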
@@ -266,7 +262,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
266 262
267int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 263int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
268{ 264{
269 if (inet_sk_rebuild_header(sk) != 0) 265 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
270 return -EHOSTUNREACH; /* Routing failure or similar. */ 266 return -EHOSTUNREACH; /* Routing failure or similar. */
271 267
272 return dccp_transmit_skb(sk, (skb_cloned(skb) ? 268 return dccp_transmit_skb(sk, (skb_cloned(skb) ?
@@ -321,6 +317,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
321 return skb; 317 return skb;
322} 318}
323 319
320EXPORT_SYMBOL_GPL(dccp_make_response);
321
324struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, 322struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
325 const enum dccp_reset_codes code) 323 const enum dccp_reset_codes code)
326 324
@@ -377,6 +375,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
377 */ 375 */
378static inline void dccp_connect_init(struct sock *sk) 376static inline void dccp_connect_init(struct sock *sk)
379{ 377{
378 struct dccp_sock *dp = dccp_sk(sk);
380 struct dst_entry *dst = __sk_dst_get(sk); 379 struct dst_entry *dst = __sk_dst_get(sk);
381 struct inet_connection_sock *icsk = inet_csk(sk); 380 struct inet_connection_sock *icsk = inet_csk(sk);
382 381
@@ -385,10 +384,16 @@ static inline void dccp_connect_init(struct sock *sk)
385 384
386 dccp_sync_mss(sk, dst_mtu(dst)); 385 dccp_sync_mss(sk, dst_mtu(dst));
387 386
388 /* 387 dccp_update_gss(sk, dp->dccps_iss);
389 * FIXME: set dp->{dccps_swh,dccps_swl}, with 388 /*
390 * something like dccp_inc_seq 389 * SWL and AWL are initially adjusted so that they are not less than
391 */ 390 * the initial Sequence Numbers received and sent, respectively:
391 * SWL := max(GSR + 1 - floor(W/4), ISR),
392 * AWL := max(GSS - W' + 1, ISS).
393 * These adjustments MUST be applied only at the beginning of the
394 * connection.
395 */
396 dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
392 397
393 icsk->icsk_retransmits = 0; 398 icsk->icsk_retransmits = 0;
394} 399}
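A worked instance of the AWL rule above, with assumed values ISS = 1000 and W' = 100. At connect time GSS = ISS, so

	AWL = max(GSS - W' + 1, ISS)
	    = max(1000 - 100 + 1, 1000) = 1000 = ISS

which is exactly what the max48(dp->dccps_awl, dp->dccps_iss) adjustment enforces; only once GSS has advanced past ISS + W' - 1 does the left-hand term take over.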
@@ -420,6 +425,8 @@ int dccp_connect(struct sock *sk)
420 return 0; 425 return 0;
421} 426}
422 427
428EXPORT_SYMBOL_GPL(dccp_connect);
429
423void dccp_send_ack(struct sock *sk) 430void dccp_send_ack(struct sock *sk)
424{ 431{
425 /* If we have been reset, we may not send again. */ 432 /* If we have been reset, we may not send again. */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8a6b2a9e4581..65b11ea90d85 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,7 +24,7 @@
24#include <net/checksum.h> 24#include <net/checksum.h>
25 25
26#include <net/inet_common.h> 26#include <net/inet_common.h>
27#include <net/ip.h> 27#include <net/inet_sock.h>
28#include <net/protocol.h> 28#include <net/protocol.h>
29#include <net/sock.h> 29#include <net/sock.h>
30#include <net/xfrm.h> 30#include <net/xfrm.h>
@@ -34,15 +34,18 @@
34#include <linux/timer.h> 34#include <linux/timer.h>
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/poll.h> 36#include <linux/poll.h>
37#include <linux/dccp.h>
38 37
39#include "ccid.h" 38#include "ccid.h"
40#include "dccp.h" 39#include "dccp.h"
41 40
42DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; 41DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
43 42
43EXPORT_SYMBOL_GPL(dccp_statistics);
44
44atomic_t dccp_orphan_count = ATOMIC_INIT(0); 45atomic_t dccp_orphan_count = ATOMIC_INIT(0);
45 46
47EXPORT_SYMBOL_GPL(dccp_orphan_count);
48
46static struct net_protocol dccp_protocol = { 49static struct net_protocol dccp_protocol = {
47 .handler = dccp_v4_rcv, 50 .handler = dccp_v4_rcv,
48 .err_handler = dccp_v4_err, 51 .err_handler = dccp_v4_err,
@@ -149,6 +152,8 @@ int dccp_disconnect(struct sock *sk, int flags)
149 return err; 152 return err;
150} 153}
151 154
155EXPORT_SYMBOL_GPL(dccp_disconnect);
156
152/* 157/*
153 * Wait for a DCCP event. 158 * Wait for a DCCP event.
154 * 159 *
@@ -156,8 +161,8 @@ int dccp_disconnect(struct sock *sk, int flags)
156 * take care of normal races (between the test and the event) and we don't 161 * take care of normal races (between the test and the event) and we don't
157 * go look at any of the socket buffers directly. 162 * go look at any of the socket buffers directly.
158 */ 163 */
159static unsigned int dccp_poll(struct file *file, struct socket *sock, 164unsigned int dccp_poll(struct file *file, struct socket *sock,
160 poll_table *wait) 165 poll_table *wait)
161{ 166{
162 unsigned int mask; 167 unsigned int mask;
163 struct sock *sk = sock->sk; 168 struct sock *sk = sock->sk;
@@ -205,12 +210,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock,
205 return mask; 210 return mask;
206} 211}
207 212
213EXPORT_SYMBOL_GPL(dccp_poll);
214
208int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) 215int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
209{ 216{
210 dccp_pr_debug("entry\n"); 217 dccp_pr_debug("entry\n");
211 return -ENOIOCTLCMD; 218 return -ENOIOCTLCMD;
212} 219}
213 220
221EXPORT_SYMBOL_GPL(dccp_ioctl);
222
214static int dccp_setsockopt_service(struct sock *sk, const u32 service, 223static int dccp_setsockopt_service(struct sock *sk, const u32 service,
215 char __user *optval, int optlen) 224 char __user *optval, int optlen)
216{ 225{
@@ -254,7 +263,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
254 int val; 263 int val;
255 264
256 if (level != SOL_DCCP) 265 if (level != SOL_DCCP)
257 return ip_setsockopt(sk, level, optname, optval, optlen); 266 return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
267 optname, optval,
268 optlen);
258 269
259 if (optlen < sizeof(int)) 270 if (optlen < sizeof(int))
260 return -EINVAL; 271 return -EINVAL;
@@ -282,6 +293,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
282 return err; 293 return err;
283} 294}
284 295
296EXPORT_SYMBOL_GPL(dccp_setsockopt);
297
285static int dccp_getsockopt_service(struct sock *sk, int len, 298static int dccp_getsockopt_service(struct sock *sk, int len,
286 u32 __user *optval, 299 u32 __user *optval,
287 int __user *optlen) 300 int __user *optlen)
@@ -320,8 +333,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
320 int val, len; 333 int val, len;
321 334
322 if (level != SOL_DCCP) 335 if (level != SOL_DCCP)
323 return ip_getsockopt(sk, level, optname, optval, optlen); 336 return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
324 337 optname, optval,
338 optlen);
325 if (get_user(len, optlen)) 339 if (get_user(len, optlen))
326 return -EFAULT; 340 return -EFAULT;
327 341
@@ -354,6 +368,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
354 return 0; 368 return 0;
355} 369}
356 370
371EXPORT_SYMBOL_GPL(dccp_getsockopt);
372
357int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 373int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
358 size_t len) 374 size_t len)
359{ 375{
@@ -410,6 +426,8 @@ out_discard:
410 goto out_release; 426 goto out_release;
411} 427}
412 428
429EXPORT_SYMBOL_GPL(dccp_sendmsg);
430
413int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 431int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
414 size_t len, int nonblock, int flags, int *addr_len) 432 size_t len, int nonblock, int flags, int *addr_len)
415{ 433{
@@ -507,7 +525,9 @@ out:
507 return len; 525 return len;
508} 526}
509 527
510static int inet_dccp_listen(struct socket *sock, int backlog) 528EXPORT_SYMBOL_GPL(dccp_recvmsg);
529
530int inet_dccp_listen(struct socket *sock, int backlog)
511{ 531{
512 struct sock *sk = sock->sk; 532 struct sock *sk = sock->sk;
513 unsigned char old_state; 533 unsigned char old_state;
@@ -543,6 +563,8 @@ out:
543 return err; 563 return err;
544} 564}
545 565
566EXPORT_SYMBOL_GPL(inet_dccp_listen);
567
546static const unsigned char dccp_new_state[] = { 568static const unsigned char dccp_new_state[] = {
547 /* current state: new state: action: */ 569 /* current state: new state: action: */
548 [0] = DCCP_CLOSED, 570 [0] = DCCP_CLOSED,
@@ -648,12 +670,16 @@ adjudge_to_death:
648 sock_put(sk); 670 sock_put(sk);
649} 671}
650 672
673EXPORT_SYMBOL_GPL(dccp_close);
674
651void dccp_shutdown(struct sock *sk, int how) 675void dccp_shutdown(struct sock *sk, int how)
652{ 676{
653 dccp_pr_debug("entry\n"); 677 dccp_pr_debug("entry\n");
654} 678}
655 679
656static struct proto_ops inet_dccp_ops = { 680EXPORT_SYMBOL_GPL(dccp_shutdown);
681
682static const struct proto_ops inet_dccp_ops = {
657 .family = PF_INET, 683 .family = PF_INET,
658 .owner = THIS_MODULE, 684 .owner = THIS_MODULE,
659 .release = inet_release, 685 .release = inet_release,
@@ -681,11 +707,11 @@ extern struct net_proto_family inet_family_ops;
681static struct inet_protosw dccp_v4_protosw = { 707static struct inet_protosw dccp_v4_protosw = {
682 .type = SOCK_DCCP, 708 .type = SOCK_DCCP,
683 .protocol = IPPROTO_DCCP, 709 .protocol = IPPROTO_DCCP,
684 .prot = &dccp_v4_prot, 710 .prot = &dccp_prot,
685 .ops = &inet_dccp_ops, 711 .ops = &inet_dccp_ops,
686 .capability = -1, 712 .capability = -1,
687 .no_check = 0, 713 .no_check = 0,
688 .flags = 0, 714 .flags = INET_PROTOSW_ICSK,
689}; 715};
690 716
691/* 717/*
@@ -760,13 +786,15 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
760int dccp_debug; 786int dccp_debug;
761module_param(dccp_debug, int, 0444); 787module_param(dccp_debug, int, 0444);
762MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); 788MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
789
790EXPORT_SYMBOL_GPL(dccp_debug);
763#endif 791#endif
764 792
765static int __init dccp_init(void) 793static int __init dccp_init(void)
766{ 794{
767 unsigned long goal; 795 unsigned long goal;
768 int ehash_order, bhash_order, i; 796 int ehash_order, bhash_order, i;
769 int rc = proto_register(&dccp_v4_prot, 1); 797 int rc = proto_register(&dccp_prot, 1);
770 798
771 if (rc) 799 if (rc)
772 goto out; 800 goto out;
@@ -869,7 +897,7 @@ out_free_bind_bucket_cachep:
869 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); 897 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
870 dccp_hashinfo.bind_bucket_cachep = NULL; 898 dccp_hashinfo.bind_bucket_cachep = NULL;
871out_proto_unregister: 899out_proto_unregister:
872 proto_unregister(&dccp_v4_prot); 900 proto_unregister(&dccp_prot);
873 goto out; 901 goto out;
874} 902}
875 903
@@ -892,7 +920,7 @@ static void __exit dccp_fini(void)
892 get_order(dccp_hashinfo.ehash_size * 920 get_order(dccp_hashinfo.ehash_size *
893 sizeof(struct inet_ehash_bucket))); 921 sizeof(struct inet_ehash_bucket)));
894 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); 922 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
895 proto_unregister(&dccp_v4_prot); 923 proto_unregister(&dccp_prot);
896} 924}
897 925
898module_init(dccp_init); 926module_init(dccp_init);
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index d402e9020c68..78ec5344be86 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk);
149#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) 149#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
150 150
151 151
152static struct proto_ops dn_proto_ops; 152static const struct proto_ops dn_proto_ops;
153static DEFINE_RWLOCK(dn_hash_lock); 153static DEFINE_RWLOCK(dn_hash_lock);
154static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; 154static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
155static struct hlist_head dn_wild_sk; 155static struct hlist_head dn_wild_sk;
@@ -1252,7 +1252,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1252 break; 1252 break;
1253 1253
1254 default: 1254 default:
1255 err = dev_ioctl(cmd, (void __user *)arg); 1255 err = -ENOIOCTLCMD;
1256 break; 1256 break;
1257 } 1257 }
1258 1258
@@ -2342,7 +2342,7 @@ static struct net_proto_family dn_family_ops = {
2342 .owner = THIS_MODULE, 2342 .owner = THIS_MODULE,
2343}; 2343};
2344 2344
2345static struct proto_ops dn_proto_ops = { 2345static const struct proto_ops dn_proto_ops = {
2346 .family = AF_DECnet, 2346 .family = AF_DECnet,
2347 .owner = THIS_MODULE, 2347 .owner = THIS_MODULE,
2348 .release = dn_release, 2348 .release = dn_release,
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 8d0cc3cf3e49..33ab256cfd4a 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb)
408 } 408 }
409 } 409 }
410 410
411 if (!dn_db->router) { 411 /* Only use routers in our area */
412 dn_db->router = neigh_clone(neigh); 412 if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) {
413 } else { 413 if (!dn_db->router) {
414 if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) 414 dn_db->router = neigh_clone(neigh);
415 neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); 415 } else {
416 if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
417 neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
418 }
416 } 419 }
417 write_unlock(&neigh->lock); 420 write_unlock(&neigh->lock);
418 neigh_release(neigh); 421 neigh_release(neigh);
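The new area test relies on DECnet Phase IV addressing: a 16-bit address packs a 6-bit area in the top bits and a 10-bit node number below it, so a right shift by 10 isolates the area. A quick illustration with made-up values:

	/* DECnet "1.100" = area 1, node 100 (illustrative values): */
	unsigned short addr = (1 << 10) | 100;	/* 0x0464 */
	unsigned short area = addr >> 10;	/* 1 */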
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 369f25b60f3f..44bda85e678f 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb)
793got_it: 793got_it:
794 if (sk != NULL) { 794 if (sk != NULL) {
795 struct dn_scp *scp = DN_SK(sk); 795 struct dn_scp *scp = DN_SK(sk);
796 int ret;
797 796
798 /* Reset backoff */ 797 /* Reset backoff */
799 scp->nsp_rxtshift = 0; 798 scp->nsp_rxtshift = 0;
@@ -807,21 +806,7 @@ got_it:
807 goto free_out; 806 goto free_out;
808 } 807 }
809 808
810 bh_lock_sock(sk); 809 return sk_receive_skb(sk, skb);
811 ret = NET_RX_SUCCESS;
812 if (decnet_debug_level & 8)
813 printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n",
814 (int)cb->rt_flags, (int)cb->nsp_flags,
815 (int)cb->src_port, (int)cb->dst_port,
816 !!sock_owned_by_user(sk));
817 if (!sock_owned_by_user(sk))
818 ret = dn_nsp_backlog_rcv(sk, skb);
819 else
820 sk_add_backlog(sk, skb);
821 bh_unlock_sock(sk);
822 sock_put(sk);
823
824 return ret;
825 } 810 }
826 811
827 return dn_nsp_no_socket(skb, reason); 812 return dn_nsp_no_socket(skb, reason);
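The deleted lock/backlog boilerplate moved into the new generic sk_receive_skb() helper in net/core/sock.c; its core is roughly the following (a sketch that omits the socket-filter check and its error path):

	int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
	{
		int rc = NET_RX_SUCCESS;

		bh_lock_sock(sk);
		if (!sock_owned_by_user(sk))
			rc = sk->sk_backlog_rcv(sk, skb); /* deliver now */
		else
			sk_add_backlog(sk, skb); /* owner drains on release */
		bh_unlock_sock(sk);
		sock_put(sk);		/* drop the lookup reference */
		return rc;
	}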
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 34fdac51df96..c792994d7952 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -31,6 +31,7 @@
31#include <linux/if_arp.h> 31#include <linux/if_arp.h>
32#include <linux/wireless.h> 32#include <linux/wireless.h>
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <linux/udp.h>
34#include <net/sock.h> 35#include <net/sock.h>
35#include <net/inet_common.h> 36#include <net/inet_common.h>
36#include <linux/stat.h> 37#include <linux/stat.h>
@@ -45,7 +46,7 @@
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46#include <asm/system.h> 47#include <asm/system.h>
47 48
48static struct proto_ops econet_ops; 49static const struct proto_ops econet_ops;
49static struct hlist_head econet_sklist; 50static struct hlist_head econet_sklist;
50static DEFINE_RWLOCK(econet_lock); 51static DEFINE_RWLOCK(econet_lock);
51 52
@@ -56,7 +57,7 @@ static struct net_device *net2dev_map[256];
56#define EC_PORT_IP 0xd2 57#define EC_PORT_IP 0xd2
57 58
58#ifdef CONFIG_ECONET_AUNUDP 59#ifdef CONFIG_ECONET_AUNUDP
59static spinlock_t aun_queue_lock; 60static DEFINE_SPINLOCK(aun_queue_lock);
60static struct socket *udpsock; 61static struct socket *udpsock;
61#define AUN_PORT 0x8000 62#define AUN_PORT 0x8000
62 63
@@ -686,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg
686 break; 687 break;
687 688
688 default: 689 default:
689 return dev_ioctl(cmd, argp); 690 return -ENOIOCTLCMD;
690 } 691 }
691 /*NOTREACHED*/ 692 /*NOTREACHED*/
692 return 0; 693 return 0;
@@ -698,7 +699,7 @@ static struct net_proto_family econet_family_ops = {
698 .owner = THIS_MODULE, 699 .owner = THIS_MODULE,
699}; 700};
700 701
701static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { 702static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
702 .family = PF_ECONET, 703 .family = PF_ECONET,
703 .owner = THIS_MODULE, 704 .owner = THIS_MODULE,
704 .release = econet_release, 705 .release = econet_release,
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index 03efaacbdb73..4cc6f41c6930 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
410 return 1; 410 return 1;
411 } 411 }
412 412
413 if ((is_multicast_ether_addr(hdr->addr1) || 413 if (is_multicast_ether_addr(hdr->addr1)
414 is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt : 414 ? ieee->host_mc_decrypt : ieee->host_decrypt) {
415 ieee->host_decrypt) {
416 int idx = 0; 415 int idx = 0;
417 if (skb->len >= hdrlen + 3) 416 if (skb->len >= hdrlen + 3)
418 idx = skb->data[hdrlen + 3] >> 6; 417 idx = skb->data[hdrlen + 3] >> 6;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e55136ae09f4..011cca7ae02b 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -456,6 +456,14 @@ config TCP_CONG_BIC
456 increase provides TCP friendliness. 456 increase provides TCP friendliness.
457 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ 457 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
458 458
459config TCP_CONG_CUBIC
460 tristate "CUBIC TCP"
461 default m
462 ---help---
463 This is version 2.0 of BIC-TCP which uses a cubic growth function
464 among other techniques.
465 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
466
459config TCP_CONG_WESTWOOD 467config TCP_CONG_WESTWOOD
460 tristate "TCP Westwood+" 468 tristate "TCP Westwood+"
461 default m 469 default m
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f0435d00db6b..c54edd76de09 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
34obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 34obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
35obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 35obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
36obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 36obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
37obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
37obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o 38obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
38obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o 39obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
39obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o 40obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d368cf249000..966a071a408c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -93,6 +93,7 @@
93#include <linux/smp_lock.h> 93#include <linux/smp_lock.h>
94#include <linux/inet.h> 94#include <linux/inet.h>
95#include <linux/igmp.h> 95#include <linux/igmp.h>
96#include <linux/inetdevice.h>
96#include <linux/netdevice.h> 97#include <linux/netdevice.h>
97#include <net/ip.h> 98#include <net/ip.h>
98#include <net/protocol.h> 99#include <net/protocol.h>
@@ -302,6 +303,7 @@ lookup_protocol:
302 sk->sk_reuse = 1; 303 sk->sk_reuse = 1;
303 304
304 inet = inet_sk(sk); 305 inet = inet_sk(sk);
306 inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
305 307
306 if (SOCK_RAW == sock->type) { 308 if (SOCK_RAW == sock->type) {
307 inet->num = protocol; 309 inet->num = protocol;
@@ -775,16 +777,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
775 err = devinet_ioctl(cmd, (void __user *)arg); 777 err = devinet_ioctl(cmd, (void __user *)arg);
776 break; 778 break;
777 default: 779 default:
778 if (!sk->sk_prot->ioctl || 780 if (sk->sk_prot->ioctl)
779 (err = sk->sk_prot->ioctl(sk, cmd, arg)) == 781 err = sk->sk_prot->ioctl(sk, cmd, arg);
780 -ENOIOCTLCMD) 782 else
781 err = dev_ioctl(cmd, (void __user *)arg); 783 err = -ENOIOCTLCMD;
782 break; 784 break;
783 } 785 }
784 return err; 786 return err;
785} 787}
786 788
787struct proto_ops inet_stream_ops = { 789const struct proto_ops inet_stream_ops = {
788 .family = PF_INET, 790 .family = PF_INET,
789 .owner = THIS_MODULE, 791 .owner = THIS_MODULE,
790 .release = inet_release, 792 .release = inet_release,
@@ -805,7 +807,7 @@ struct proto_ops inet_stream_ops = {
805 .sendpage = tcp_sendpage 807 .sendpage = tcp_sendpage
806}; 808};
807 809
808struct proto_ops inet_dgram_ops = { 810const struct proto_ops inet_dgram_ops = {
809 .family = PF_INET, 811 .family = PF_INET,
810 .owner = THIS_MODULE, 812 .owner = THIS_MODULE,
811 .release = inet_release, 813 .release = inet_release,
@@ -830,7 +832,7 @@ struct proto_ops inet_dgram_ops = {
830 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without 832 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
831 * udp_poll 833 * udp_poll
832 */ 834 */
833static struct proto_ops inet_sockraw_ops = { 835static const struct proto_ops inet_sockraw_ops = {
834 .family = PF_INET, 836 .family = PF_INET,
835 .owner = THIS_MODULE, 837 .owner = THIS_MODULE,
836 .release = inet_release, 838 .release = inet_release,
@@ -869,7 +871,8 @@ static struct inet_protosw inetsw_array[] =
869 .ops = &inet_stream_ops, 871 .ops = &inet_stream_ops,
870 .capability = -1, 872 .capability = -1,
871 .no_check = 0, 873 .no_check = 0,
872 .flags = INET_PROTOSW_PERMANENT, 874 .flags = INET_PROTOSW_PERMANENT |
875 INET_PROTOSW_ICSK,
873 }, 876 },
874 877
875 { 878 {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 035ad2c9e1ba..aed537fa2c88 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -6,6 +6,7 @@
6#include <linux/crypto.h> 6#include <linux/crypto.h>
7#include <linux/pfkeyv2.h> 7#include <linux/pfkeyv2.h>
8#include <net/icmp.h> 8#include <net/icmp.h>
9#include <net/protocol.h>
9#include <asm/scatterlist.h> 10#include <asm/scatterlist.h>
10 11
11 12
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index b425748f02d7..37432088fe6d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -86,6 +86,7 @@
86#include <linux/in.h> 86#include <linux/in.h>
87#include <linux/mm.h> 87#include <linux/mm.h>
88#include <linux/inet.h> 88#include <linux/inet.h>
89#include <linux/inetdevice.h>
89#include <linux/netdevice.h> 90#include <linux/netdevice.h>
90#include <linux/etherdevice.h> 91#include <linux/etherdevice.h>
91#include <linux/fddidevice.h> 92#include <linux/fddidevice.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 04a6fe3e95a2..7b9bb28e2ee9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -58,6 +58,7 @@
58#endif 58#endif
59#include <linux/kmod.h> 59#include <linux/kmod.h>
60 60
61#include <net/arp.h>
61#include <net/ip.h> 62#include <net/ip.h>
62#include <net/route.h> 63#include <net/route.h>
63#include <net/ip_fib.h> 64#include <net/ip_fib.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1b18ce66e7b7..73bfcae8af9c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -9,6 +9,7 @@
9#include <linux/pfkeyv2.h> 9#include <linux/pfkeyv2.h>
10#include <linux/random.h> 10#include <linux/random.h>
11#include <net/icmp.h> 11#include <net/icmp.h>
12#include <net/protocol.h>
12#include <net/udp.h> 13#include <net/udp.h>
13 14
14/* decapsulation data for use when post-processing */ 15/* decapsulation data for use when post-processing */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 19b1b984d687..18f5e509281a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -30,6 +30,7 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/in.h> 31#include <linux/in.h>
32#include <linux/inet.h> 32#include <linux/inet.h>
33#include <linux/inetdevice.h>
33#include <linux/netdevice.h> 34#include <linux/netdevice.h>
34#include <linux/if_arp.h> 35#include <linux/if_arp.h>
35#include <linux/skbuff.h> 36#include <linux/skbuff.h>
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 7ea0209cb169..e2890ec8159e 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -29,6 +29,7 @@
29#include <linux/errno.h> 29#include <linux/errno.h>
30#include <linux/in.h> 30#include <linux/in.h>
31#include <linux/inet.h> 31#include <linux/inet.h>
32#include <linux/inetdevice.h>
32#include <linux/netdevice.h> 33#include <linux/netdevice.h>
33#include <linux/if_arp.h> 34#include <linux/if_arp.h>
34#include <linux/proc_fs.h> 35#include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 0b298bbc1518..0dd4d06e456d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -33,6 +33,7 @@
33#include <linux/errno.h> 33#include <linux/errno.h>
34#include <linux/in.h> 34#include <linux/in.h>
35#include <linux/inet.h> 35#include <linux/inet.h>
36#include <linux/inetdevice.h>
36#include <linux/netdevice.h> 37#include <linux/netdevice.h>
37#include <linux/if_arp.h> 38#include <linux/if_arp.h>
38#include <linux/proc_fs.h> 39#include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6d2a6ac070e3..ef4724de7350 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -29,6 +29,7 @@
29#include <linux/errno.h> 29#include <linux/errno.h>
30#include <linux/in.h> 30#include <linux/in.h>
31#include <linux/inet.h> 31#include <linux/inet.h>
32#include <linux/inetdevice.h>
32#include <linux/netdevice.h> 33#include <linux/netdevice.h>
33#include <linux/if_arp.h> 34#include <linux/if_arp.h>
34#include <linux/proc_fs.h> 35#include <linux/proc_fs.h>
@@ -36,6 +37,7 @@
36#include <linux/netlink.h> 37#include <linux/netlink.h>
37#include <linux/init.h> 38#include <linux/init.h>
38 39
40#include <net/arp.h>
39#include <net/ip.h> 41#include <net/ip.h>
40#include <net/protocol.h> 42#include <net/protocol.h>
41#include <net/route.h> 43#include <net/route.h>
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 705e3ce86df9..e320b32373e5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -41,6 +41,13 @@
41 * modify it under the terms of the GNU General Public License 41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version 42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
44 */ 51 */
45 52
46#define VERSION "0.404" 53#define VERSION "0.404"
@@ -59,6 +66,7 @@
59#include <linux/errno.h> 66#include <linux/errno.h>
60#include <linux/in.h> 67#include <linux/in.h>
61#include <linux/inet.h> 68#include <linux/inet.h>
69#include <linux/inetdevice.h>
62#include <linux/netdevice.h> 70#include <linux/netdevice.h>
63#include <linux/if_arp.h> 71#include <linux/if_arp.h>
64#include <linux/proc_fs.h> 72#include <linux/proc_fs.h>
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 92e23b2ad4d2..be5a519cd2f8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -73,6 +73,7 @@
73#include <linux/socket.h> 73#include <linux/socket.h>
74#include <linux/in.h> 74#include <linux/in.h>
75#include <linux/inet.h> 75#include <linux/inet.h>
76#include <linux/inetdevice.h>
76#include <linux/netdevice.h> 77#include <linux/netdevice.h>
77#include <linux/string.h> 78#include <linux/string.h>
78#include <linux/netfilter_ipv4.h> 79#include <linux/netfilter_ipv4.h>
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4a195c724f01..34758118c10c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -91,6 +91,8 @@
91#include <linux/if_arp.h> 91#include <linux/if_arp.h>
92#include <linux/rtnetlink.h> 92#include <linux/rtnetlink.h>
93#include <linux/times.h> 93#include <linux/times.h>
94
95#include <net/arp.h>
94#include <net/ip.h> 96#include <net/ip.h>
95#include <net/protocol.h> 97#include <net/protocol.h>
96#include <net/route.h> 98#include <net/route.h>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3fe021f1a566..ae20281d8deb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
37 */ 37 */
38int sysctl_local_port_range[2] = { 1024, 4999 }; 38int sysctl_local_port_range[2] = { 1024, 4999 };
39 39
40static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) 40int inet_csk_bind_conflict(const struct sock *sk,
41 const struct inet_bind_bucket *tb)
41{ 42{
42 const u32 sk_rcv_saddr = inet_rcv_saddr(sk); 43 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
43 struct sock *sk2; 44 struct sock *sk2;
@@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke
62 return node != NULL; 63 return node != NULL;
63} 64}
64 65
66EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
67
65/* Obtain a reference to a local port for the given sock, 68/* Obtain a reference to a local port for the given sock,
66 * if snum is zero it means select any available local port. 69 * if snum is zero it means select any available local port.
67 */ 70 */
68int inet_csk_get_port(struct inet_hashinfo *hashinfo, 71int inet_csk_get_port(struct inet_hashinfo *hashinfo,
69 struct sock *sk, unsigned short snum) 72 struct sock *sk, unsigned short snum,
73 int (*bind_conflict)(const struct sock *sk,
74 const struct inet_bind_bucket *tb))
70{ 75{
71 struct inet_bind_hashbucket *head; 76 struct inet_bind_hashbucket *head;
72 struct hlist_node *node; 77 struct hlist_node *node;
@@ -125,7 +130,7 @@ tb_found:
125 goto success; 130 goto success;
126 } else { 131 } else {
127 ret = 1; 132 ret = 1;
128 if (inet_csk_bind_conflict(sk, tb)) 133 if (bind_conflict(sk, tb))
129 goto fail_unlock; 134 goto fail_unlock;
130 } 135 }
131 } 136 }
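
With this hunk the conflict test becomes a parameter: inet_csk_get_port() no longer hard-codes the IPv4 rule, and inet_csk_bind_conflict() is exported so address families can pass it (or a variant of their own) in. A minimal sketch of a caller after the change; the wrapper name is illustrative and not part of this patch, and tcp_hashinfo is assumed to be the usual TCP hash table:

	/* Hypothetical wrapper showing the new calling convention. */
	static int example_v4_get_port(struct sock *sk, unsigned short snum)
	{
		return inet_csk_get_port(&tcp_hashinfo, sk, snum,
					 inet_csk_bind_conflict);
	}
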
@@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
380EXPORT_SYMBOL_GPL(inet_csk_search_req); 385EXPORT_SYMBOL_GPL(inet_csk_search_req);
381 386
382void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, 387void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
383 const unsigned timeout) 388 unsigned long timeout)
384{ 389{
385 struct inet_connection_sock *icsk = inet_csk(sk); 390 struct inet_connection_sock *icsk = inet_csk(sk);
386 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; 391 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
@@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk)
631} 636}
632 637
633EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 638EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
639
640void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
641{
642 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
643 const struct inet_sock *inet = inet_sk(sk);
644
645 sin->sin_family = AF_INET;
646 sin->sin_addr.s_addr = inet->daddr;
647 sin->sin_port = inet->dport;
648}
649
650EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
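
inet_csk_addr2sockaddr() factors out the AF_INET peer-name fill (family, daddr, dport) so connection-oriented protocols stop open-coding it. A hedged sketch of a getname-style caller; the surrounding function is illustrative, only the helper comes from this patch:

	static int example_getname(struct socket *sock, struct sockaddr *uaddr,
				   int *uaddr_len, int peer)
	{
		if (peer)
			/* fills sin_family, sin_addr (daddr), sin_port (dport) */
			inet_csk_addr2sockaddr(sock->sk, uaddr);
		*uaddr_len = sizeof(struct sockaddr_in);
		return 0;
	}
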
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 39061ed53cfd..c49908192047 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
112 r->idiag_inode = 0; 112 r->idiag_inode = 0;
113#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 113#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
114 if (r->idiag_family == AF_INET6) { 114 if (r->idiag_family == AF_INET6) {
115 const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); 115 const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
116 116
117 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, 117 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
118 &tcp6tw->tw_v6_rcv_saddr); 118 &tw6->tw_v6_rcv_saddr);
119 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, 119 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
120 &tcp6tw->tw_v6_daddr); 120 &tw6->tw_v6_daddr);
121 } 121 }
122#endif 122#endif
123 nlh->nlmsg_len = skb->tail - b; 123 nlh->nlmsg_len = skb->tail - b;
@@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
489#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 489#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
490 if (r->idiag_family == AF_INET6) { 490 if (r->idiag_family == AF_INET6) {
491 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, 491 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
492 &tcp6_rsk(req)->loc_addr); 492 &inet6_rsk(req)->loc_addr);
493 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, 493 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
494 &tcp6_rsk(req)->rmt_addr); 494 &inet6_rsk(req)->rmt_addr);
495 } 495 }
496#endif 496#endif
497 nlh->nlmsg_len = skb->tail - b; 497 nlh->nlmsg_len = skb->tail - b;
@@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
553 entry.saddr = 553 entry.saddr =
554#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 554#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
555 (entry.family == AF_INET6) ? 555 (entry.family == AF_INET6) ?
556 tcp6_rsk(req)->loc_addr.s6_addr32 : 556 inet6_rsk(req)->loc_addr.s6_addr32 :
557#endif 557#endif
558 &ireq->loc_addr; 558 &ireq->loc_addr;
559 entry.daddr = 559 entry.daddr =
560#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 560#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
561 (entry.family == AF_INET6) ? 561 (entry.family == AF_INET6) ?
562 tcp6_rsk(req)->rmt_addr.s6_addr32 : 562 inet6_rsk(req)->rmt_addr.s6_addr32 :
563#endif 563#endif
564 &ireq->rmt_addr; 564 &ireq->rmt_addr;
565 entry.dport = ntohs(ireq->rmt_port); 565 entry.dport = ntohs(ireq->rmt_port);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e8d29fe736d2..33228115cda4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -15,12 +15,14 @@
15 15
16#include <linux/config.h> 16#include <linux/config.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/random.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20#include <linux/wait.h> 21#include <linux/wait.h>
21 22
22#include <net/inet_connection_sock.h> 23#include <net/inet_connection_sock.h>
23#include <net/inet_hashtables.h> 24#include <net/inet_hashtables.h>
25#include <net/ip.h>
24 26
25/* 27/*
26 * Allocate and initialize a new local port bind bucket. 28 * Allocate and initialize a new local port bind bucket.
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad
163} 165}
164 166
165EXPORT_SYMBOL_GPL(__inet_lookup_listener); 167EXPORT_SYMBOL_GPL(__inet_lookup_listener);
168
169/* called with local bh disabled */
170static int __inet_check_established(struct inet_timewait_death_row *death_row,
171 struct sock *sk, __u16 lport,
172 struct inet_timewait_sock **twp)
173{
174 struct inet_hashinfo *hinfo = death_row->hashinfo;
175 struct inet_sock *inet = inet_sk(sk);
176 u32 daddr = inet->rcv_saddr;
177 u32 saddr = inet->daddr;
178 int dif = sk->sk_bound_dev_if;
179 INET_ADDR_COOKIE(acookie, saddr, daddr)
180 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
181 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
182 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
183 struct sock *sk2;
184 const struct hlist_node *node;
185 struct inet_timewait_sock *tw;
186
187 prefetch(head->chain.first);
188 write_lock(&head->lock);
189
190 /* Check TIME-WAIT sockets first. */
191 sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
192 tw = inet_twsk(sk2);
193
194 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
195 if (twsk_unique(sk, sk2, twp))
196 goto unique;
197 else
198 goto not_unique;
199 }
200 }
201 tw = NULL;
202
203 /* And established part... */
204 sk_for_each(sk2, node, &head->chain) {
205 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
206 goto not_unique;
207 }
208
209unique:
210 /* Must record num and sport now. Otherwise we will see
 211 * in the hash table a socket with a funny identity. */
212 inet->num = lport;
213 inet->sport = htons(lport);
214 sk->sk_hash = hash;
215 BUG_TRAP(sk_unhashed(sk));
216 __sk_add_node(sk, &head->chain);
217 sock_prot_inc_use(sk->sk_prot);
218 write_unlock(&head->lock);
219
220 if (twp) {
221 *twp = tw;
222 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
223 } else if (tw) {
224 /* Silly. Should hash-dance instead... */
225 inet_twsk_deschedule(tw, death_row);
226 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
227
228 inet_twsk_put(tw);
229 }
230
231 return 0;
232
233not_unique:
234 write_unlock(&head->lock);
235 return -EADDRNOTAVAIL;
236}
237
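
__inet_check_established() has a two-sided contract around twp: with a non-NULL twp the colliding TIME-WAIT socket is handed back to the caller, who owns the deschedule and the final put; with a NULL twp it is descheduled on the spot. A sketch of the caller side, mirroring what inet_hash_connect() below actually does:

	struct inet_timewait_sock *tw = NULL;

	if (__inet_check_established(death_row, sk, port, &tw) == 0) {
		/* Port pair is unique; a recycled TIME-WAIT socket, if
		 * any, is returned with a reference we must drop. */
		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			inet_twsk_put(tw);
		}
	}
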
238static inline u32 inet_sk_port_offset(const struct sock *sk)
239{
240 const struct inet_sock *inet = inet_sk(sk);
241 return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
242 inet->dport);
243}
244
245/*
246 * Bind a port for a connect operation and hash it.
247 */
248int inet_hash_connect(struct inet_timewait_death_row *death_row,
249 struct sock *sk)
250{
251 struct inet_hashinfo *hinfo = death_row->hashinfo;
252 const unsigned short snum = inet_sk(sk)->num;
253 struct inet_bind_hashbucket *head;
254 struct inet_bind_bucket *tb;
255 int ret;
256
257 if (!snum) {
258 int low = sysctl_local_port_range[0];
259 int high = sysctl_local_port_range[1];
260 int range = high - low;
261 int i;
262 int port;
263 static u32 hint;
264 u32 offset = hint + inet_sk_port_offset(sk);
265 struct hlist_node *node;
266 struct inet_timewait_sock *tw = NULL;
267
268 local_bh_disable();
269 for (i = 1; i <= range; i++) {
270 port = low + (i + offset) % range;
271 head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
272 spin_lock(&head->lock);
273
274 /* Does not bother with rcv_saddr checks,
275 * because the established check is already
276 * unique enough.
277 */
278 inet_bind_bucket_for_each(tb, node, &head->chain) {
279 if (tb->port == port) {
280 BUG_TRAP(!hlist_empty(&tb->owners));
281 if (tb->fastreuse >= 0)
282 goto next_port;
283 if (!__inet_check_established(death_row,
284 sk, port,
285 &tw))
286 goto ok;
287 goto next_port;
288 }
289 }
290
291 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
292 if (!tb) {
293 spin_unlock(&head->lock);
294 break;
295 }
296 tb->fastreuse = -1;
297 goto ok;
298
299 next_port:
300 spin_unlock(&head->lock);
301 }
302 local_bh_enable();
303
304 return -EADDRNOTAVAIL;
305
306ok:
307 hint += i;
308
309 /* Head lock still held and bh's disabled */
310 inet_bind_hash(sk, tb, port);
311 if (sk_unhashed(sk)) {
312 inet_sk(sk)->sport = htons(port);
313 __inet_hash(hinfo, sk, 0);
314 }
315 spin_unlock(&head->lock);
316
317 if (tw) {
 318 inet_twsk_deschedule(tw, death_row);
319 inet_twsk_put(tw);
320 }
321
322 ret = 0;
323 goto out;
324 }
325
326 head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
327 tb = inet_csk(sk)->icsk_bind_hash;
328 spin_lock_bh(&head->lock);
329 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
330 __inet_hash(hinfo, sk, 0);
331 spin_unlock_bh(&head->lock);
332 return 0;
333 } else {
334 spin_unlock(&head->lock);
 335 /* No definite answer... Walk the established hash table */
336 ret = __inet_check_established(death_row, sk, snum, NULL);
337out:
338 local_bh_enable();
339 return ret;
340 }
341}
342
343EXPORT_SYMBOL_GPL(inet_hash_connect);
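
The ephemeral search above probes low + (i + offset) % range, with offset = hint + a secure per-destination hash, so repeated connects start from different points while each walk still covers every port exactly once. A stand-alone sketch of the arithmetic (userspace C; the offset value is invented and stands in for secure_ipv4_port_ephemeral()):

	#include <stdio.h>

	int main(void)
	{
		unsigned int low = 1024, high = 4999; /* sysctl_local_port_range */
		unsigned int range = high - low;
		unsigned int offset = 12345u;	/* hint + per-destination hash */
		unsigned int i;

		for (i = 1; i <= 5; i++)	/* first few candidate ports */
			printf("%u\n", low + (i + offset) % range);
		return 0;
	}
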
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a010e9a68811..417f126c749e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
90 90
91struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) 91struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
92{ 92{
93 struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, 93 struct inet_timewait_sock *tw =
94 SLAB_ATOMIC); 94 kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
95 SLAB_ATOMIC);
95 if (tw != NULL) { 96 if (tw != NULL) {
96 const struct inet_sock *inet = inet_sk(sk); 97 const struct inet_sock *inet = inet_sk(sk);
97 98
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924f..ce5fe3f74a3d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
401 return NULL; 401 return NULL;
402 n->v4daddr = daddr; 402 n->v4daddr = daddr;
403 atomic_set(&n->refcnt, 1); 403 atomic_set(&n->refcnt, 1);
404 atomic_set(&n->rid, 0);
404 n->ip_id_count = secure_ip_id(daddr); 405 n->ip_id_count = secure_ip_id(daddr);
405 n->tcp_ts_stamp = 0; 406 n->tcp_ts_stamp = 0;
406 407
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48e..ce2b70ce4018 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
22 * Patrick McHardy : LRU queue of frag heads for evictor. 22 * Patrick McHardy : LRU queue of frag heads for evictor.
23 */ 23 */
24 24
25#include <linux/compiler.h>
25#include <linux/config.h> 26#include <linux/config.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/types.h> 28#include <linux/types.h>
@@ -38,6 +39,7 @@
38#include <net/ip.h> 39#include <net/ip.h>
39#include <net/icmp.h> 40#include <net/icmp.h>
40#include <net/checksum.h> 41#include <net/checksum.h>
42#include <net/inetpeer.h>
41#include <linux/tcp.h> 43#include <linux/tcp.h>
42#include <linux/udp.h> 44#include <linux/udp.h>
43#include <linux/inet.h> 45#include <linux/inet.h>
@@ -56,6 +58,8 @@
56int sysctl_ipfrag_high_thresh = 256*1024; 58int sysctl_ipfrag_high_thresh = 256*1024;
57int sysctl_ipfrag_low_thresh = 192*1024; 59int sysctl_ipfrag_low_thresh = 192*1024;
58 60
61int sysctl_ipfrag_max_dist = 64;
62
59/* Important NOTE! Fragment queue must be destroyed before MSL expires. 63/* Important NOTE! Fragment queue must be destroyed before MSL expires.
60 * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. 64 * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
61 */ 65 */
@@ -89,8 +93,10 @@ struct ipq {
89 spinlock_t lock; 93 spinlock_t lock;
90 atomic_t refcnt; 94 atomic_t refcnt;
91 struct timer_list timer; /* when will this queue expire? */ 95 struct timer_list timer; /* when will this queue expire? */
92 int iif;
93 struct timeval stamp; 96 struct timeval stamp;
97 int iif;
98 unsigned int rid;
99 struct inet_peer *peer;
94}; 100};
95 101
96/* Hash table. */ 102/* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
195 BUG_TRAP(qp->last_in&COMPLETE); 201 BUG_TRAP(qp->last_in&COMPLETE);
196 BUG_TRAP(del_timer(&qp->timer) == 0); 202 BUG_TRAP(del_timer(&qp->timer) == 0);
197 203
204 if (qp->peer)
205 inet_putpeer(qp->peer);
206
198 /* Release all fragment data. */ 207 /* Release all fragment data. */
199 fp = qp->fragments; 208 fp = qp->fragments;
200 while (fp) { 209 while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
353 qp->meat = 0; 362 qp->meat = 0;
354 qp->fragments = NULL; 363 qp->fragments = NULL;
355 qp->iif = 0; 364 qp->iif = 0;
365 qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
356 366
357 /* Initialize a timer for this entry. */ 367 /* Initialize a timer for this entry. */
358 init_timer(&qp->timer); 368 init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
398 return ip_frag_create(hash, iph, user); 408 return ip_frag_create(hash, iph, user);
399} 409}
400 410
411/* Is the fragment too far ahead to be part of ipq? */
412static inline int ip_frag_too_far(struct ipq *qp)
413{
414 struct inet_peer *peer = qp->peer;
415 unsigned int max = sysctl_ipfrag_max_dist;
416 unsigned int start, end;
417
418 int rc;
419
420 if (!peer || !max)
421 return 0;
422
423 start = qp->rid;
424 end = atomic_inc_return(&peer->rid);
425 qp->rid = end;
426
427 rc = qp->fragments && (end - start) > max;
428
429 if (rc) {
430 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
431 }
432
433 return rc;
434}
435
436static int ip_frag_reinit(struct ipq *qp)
437{
438 struct sk_buff *fp;
439
440 if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
441 atomic_inc(&qp->refcnt);
442 return -ETIMEDOUT;
443 }
444
445 fp = qp->fragments;
446 do {
447 struct sk_buff *xp = fp->next;
448 frag_kfree_skb(fp, NULL);
449 fp = xp;
450 } while (fp);
451
452 qp->last_in = 0;
453 qp->len = 0;
454 qp->meat = 0;
455 qp->fragments = NULL;
456 qp->iif = 0;
457
458 return 0;
459}
460
401/* Add new segment to existing queue. */ 461/* Add new segment to existing queue. */
402static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) 462static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
403{ 463{
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
408 if (qp->last_in & COMPLETE) 468 if (qp->last_in & COMPLETE)
409 goto err; 469 goto err;
410 470
471 if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
472 unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
473 ipq_kill(qp);
474 goto err;
475 }
476
411 offset = ntohs(skb->nh.iph->frag_off); 477 offset = ntohs(skb->nh.iph->frag_off);
412 flags = offset & ~IP_OFFSET; 478 flags = offset & ~IP_OFFSET;
413 offset &= IP_OFFSET; 479 offset &= IP_OFFSET;
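
ip_frag_too_far() leans on unsigned arithmetic: end - start is well defined modulo 2^32, so the distance test stays correct even when the peer's rid counter wraps. A small demonstration of that property (userspace C, values invented):

	#include <stdio.h>

	int main(void)
	{
		unsigned int start = 0xfffffff0u; /* rid recorded in the queue */
		unsigned int end   = 0x00000010u; /* rid after the counter wraps */
		unsigned int max   = 64;	  /* sysctl_ipfrag_max_dist */

		/* Prints "distance 32, too far 0": the wrap does not
		 * inflate the distance. */
		printf("distance %u, too far %d\n",
		       end - start, (end - start) > max);
		return 0;
	}
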
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 473d0f2b2e0d..e45846ae570b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -128,6 +128,7 @@
128#include <linux/sockios.h> 128#include <linux/sockios.h>
129#include <linux/in.h> 129#include <linux/in.h>
130#include <linux/inet.h> 130#include <linux/inet.h>
131#include <linux/inetdevice.h>
131#include <linux/netdevice.h> 132#include <linux/netdevice.h>
132#include <linux/etherdevice.h> 133#include <linux/etherdevice.h>
133 134
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index dbe12da8d8b3..d3f6c468faf4 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -22,6 +22,7 @@
22#include <net/sock.h> 22#include <net/sock.h>
23#include <net/ip.h> 23#include <net/ip.h>
24#include <net/icmp.h> 24#include <net/icmp.h>
25#include <net/route.h>
25 26
26/* 27/*
27 * Write options to IP header, record destination address to 28 * Write options to IP header, record destination address to
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd397..2a830de3a699 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
445 445
446 hlen = iph->ihl * 4; 446 hlen = iph->ihl * 4;
447 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ 447 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
448 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
448 449
449 /* When frag_list is given, use it. First, check its validity: 450 /* When frag_list is given, use it. First, check its validity:
450 * some transformers could create wrong frag_list or break existing 451 * some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4f2d87257309..6986e11d65cc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -25,12 +25,12 @@
25#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <linux/ip.h> 26#include <linux/ip.h>
27#include <linux/icmp.h> 27#include <linux/icmp.h>
28#include <linux/inetdevice.h>
28#include <linux/netdevice.h> 29#include <linux/netdevice.h>
29#include <net/sock.h> 30#include <net/sock.h>
30#include <net/ip.h> 31#include <net/ip.h>
31#include <net/icmp.h> 32#include <net/icmp.h>
32#include <net/tcp.h> 33#include <net/tcp_states.h>
33#include <linux/tcp.h>
34#include <linux/udp.h> 34#include <linux/udp.h>
35#include <linux/igmp.h> 35#include <linux/igmp.h>
36#include <linux/netfilter.h> 36#include <linux/netfilter.h>
@@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
427 err = ip_options_get_from_user(&opt, optval, optlen); 427 err = ip_options_get_from_user(&opt, optval, optlen);
428 if (err) 428 if (err)
429 break; 429 break;
430 if (sk->sk_type == SOCK_STREAM) { 430 if (inet->is_icsk) {
431 struct tcp_sock *tp = tcp_sk(sk); 431 struct inet_connection_sock *icsk = inet_csk(sk);
432#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 432#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
433 if (sk->sk_family == PF_INET || 433 if (sk->sk_family == PF_INET ||
434 (!((1 << sk->sk_state) & 434 (!((1 << sk->sk_state) &
@@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
436 inet->daddr != LOOPBACK4_IPV6)) { 436 inet->daddr != LOOPBACK4_IPV6)) {
437#endif 437#endif
438 if (inet->opt) 438 if (inet->opt)
439 tp->ext_header_len -= inet->opt->optlen; 439 icsk->icsk_ext_hdr_len -= inet->opt->optlen;
440 if (opt) 440 if (opt)
441 tp->ext_header_len += opt->optlen; 441 icsk->icsk_ext_hdr_len += opt->optlen;
442 tcp_sync_mss(sk, tp->pmtu_cookie); 442 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
443#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 443#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
444 } 444 }
445#endif 445#endif
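
The switch from tcp-specific fields to icsk_ext_hdr_len and icsk_sync_mss works because every byte of IP options shrinks the payload a segment can carry, independent of the transport. A back-of-the-envelope sketch of why the resync matters (numbers purely illustrative):

	#include <stdio.h>

	int main(void)
	{
		unsigned int pmtu = 1500, ip_hdr = 20, tcp_hdr = 20;
		unsigned int ext_hdr_len = 8;	/* e.g. 8 bytes of IP options */

		/* 1460 without options, 1452 with them: tcp_sync_mss()
		 * must rerun whenever icsk_ext_hdr_len changes. */
		printf("mss = %u\n", pmtu - ip_hdr - tcp_hdr - ext_hdr_len);
		return 0;
	}
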
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index fc718df17b40..d64e2ec8da7b 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -28,6 +28,7 @@
28#include <net/xfrm.h> 28#include <net/xfrm.h>
29#include <net/icmp.h> 29#include <net/icmp.h>
30#include <net/ipcomp.h> 30#include <net/ipcomp.h>
31#include <net/protocol.h>
31 32
32struct ipcomp_tfms { 33struct ipcomp_tfms {
33 struct list_head list; 34 struct list_head list;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e8674baaa8d9..bb3613ec448c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -42,6 +42,7 @@
42#include <linux/in.h> 42#include <linux/in.h>
43#include <linux/if.h> 43#include <linux/if.h>
44#include <linux/inet.h> 44#include <linux/inet.h>
45#include <linux/inetdevice.h>
45#include <linux/netdevice.h> 46#include <linux/netdevice.h>
46#include <linux/if_arp.h> 47#include <linux/if_arp.h>
47#include <linux/skbuff.h> 48#include <linux/skbuff.h>
@@ -58,6 +59,7 @@
58#include <net/arp.h> 59#include <net/arp.h>
59#include <net/ip.h> 60#include <net/ip.h>
60#include <net/ipconfig.h> 61#include <net/ipconfig.h>
62#include <net/route.h>
61 63
62#include <asm/uaccess.h> 64#include <asm/uaccess.h>
63#include <net/checksum.h> 65#include <net/checksum.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 302b7eb507c9..caa3b7d2e48a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -52,6 +52,7 @@
52#include <net/ip.h> 52#include <net/ip.h>
53#include <net/protocol.h> 53#include <net/protocol.h>
54#include <linux/skbuff.h> 54#include <linux/skbuff.h>
55#include <net/route.h>
55#include <net/sock.h> 56#include <net/sock.h>
56#include <net/icmp.h> 57#include <net/icmp.h>
57#include <net/udp.h> 58#include <net/udp.h>
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d7eb680101c2..9b176a942ac5 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
224} 224}
225 225
226 226
227#if 0000
228/*
229 * Get reference to app by name (called from user context)
230 */
231struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
232{
233 struct ip_vs_app *app, *a = NULL;
234
235 down(&__ip_vs_app_mutex);
236
237 list_for_each_entry(ent, &ip_vs_app_list, a_list) {
238 if (strcmp(app->name, appname))
239 continue;
240
241 /* softirq may call ip_vs_app_get too, so the caller
242 must disable softirq on the current CPU */
243 if (ip_vs_app_get(app))
244 a = app;
245 break;
246 }
247
248 up(&__ip_vs_app_mutex);
249
250 return a;
251}
252#endif
253
254
255/* 227/*
256 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) 228 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
257 */ 229 */
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 2a3a8c59c655..81d90354c928 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -24,7 +24,10 @@
24 * 24 *
25 */ 25 */
26 26
27#include <linux/in.h>
28#include <linux/net.h>
27#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/module.h>
28#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
29#include <linux/proc_fs.h> /* for proc_net_* */ 32#include <linux/proc_fs.h> /* for proc_net_* */
30#include <linux/seq_file.h> 33#include <linux/seq_file.h>
@@ -219,7 +222,7 @@ struct ip_vs_conn *ip_vs_conn_in_get
219 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 222 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
220 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); 223 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
221 224
222 IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 225 IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
223 ip_vs_proto_name(protocol), 226 ip_vs_proto_name(protocol),
224 NIPQUAD(s_addr), ntohs(s_port), 227 NIPQUAD(s_addr), ntohs(s_port),
225 NIPQUAD(d_addr), ntohs(d_port), 228 NIPQUAD(d_addr), ntohs(d_port),
@@ -254,7 +257,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
254 out: 257 out:
255 ct_read_unlock(hash); 258 ct_read_unlock(hash);
256 259
257 IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 260 IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
258 ip_vs_proto_name(protocol), 261 ip_vs_proto_name(protocol),
259 NIPQUAD(s_addr), ntohs(s_port), 262 NIPQUAD(s_addr), ntohs(s_port),
260 NIPQUAD(d_addr), ntohs(d_port), 263 NIPQUAD(d_addr), ntohs(d_port),
@@ -295,7 +298,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
295 298
296 ct_read_unlock(hash); 299 ct_read_unlock(hash);
297 300
298 IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 301 IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
299 ip_vs_proto_name(protocol), 302 ip_vs_proto_name(protocol),
300 NIPQUAD(s_addr), ntohs(s_port), 303 NIPQUAD(s_addr), ntohs(s_port),
301 NIPQUAD(d_addr), ntohs(d_port), 304 NIPQUAD(d_addr), ntohs(d_port),
@@ -391,8 +394,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
391 cp->flags |= atomic_read(&dest->conn_flags); 394 cp->flags |= atomic_read(&dest->conn_flags);
392 cp->dest = dest; 395 cp->dest = dest;
393 396
394 IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 397 IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
395 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", 398 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
399 "dest->refcnt:%d\n",
396 ip_vs_proto_name(cp->protocol), 400 ip_vs_proto_name(cp->protocol),
397 NIPQUAD(cp->caddr), ntohs(cp->cport), 401 NIPQUAD(cp->caddr), ntohs(cp->cport),
398 NIPQUAD(cp->vaddr), ntohs(cp->vport), 402 NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -430,8 +434,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
430 if (!dest) 434 if (!dest)
431 return; 435 return;
432 436
433 IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 437 IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
434 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", 438 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
439 "dest->refcnt:%d\n",
435 ip_vs_proto_name(cp->protocol), 440 ip_vs_proto_name(cp->protocol),
436 NIPQUAD(cp->caddr), ntohs(cp->cport), 441 NIPQUAD(cp->caddr), ntohs(cp->cport),
437 NIPQUAD(cp->vaddr), ntohs(cp->vport), 442 NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -571,7 +576,7 @@ static void ip_vs_conn_expire(unsigned long data)
571 ip_vs_conn_hash(cp); 576 ip_vs_conn_hash(cp);
572 577
573 expire_later: 578 expire_later:
574 IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", 579 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
575 atomic_read(&cp->refcnt)-1, 580 atomic_read(&cp->refcnt)-1,
576 atomic_read(&cp->n_control)); 581 atomic_read(&cp->n_control));
577 582
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 1a0843cd58a9..1aca94a9fd8b 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
426 return NULL; 426 return NULL;
427 427
428 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " 428 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
429 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", 429 "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
430 ip_vs_fwd_tag(cp), 430 ip_vs_fwd_tag(cp),
431 NIPQUAD(cp->caddr), ntohs(cp->cport), 431 NIPQUAD(cp->caddr), ntohs(cp->cport),
432 NIPQUAD(cp->vaddr), ntohs(cp->vport), 432 NIPQUAD(cp->vaddr), ntohs(cp->vport),
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 9bdcf31b760e..c935c5086d33 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,7 @@
35#include <linux/netfilter_ipv4.h> 35#include <linux/netfilter_ipv4.h>
36 36
37#include <net/ip.h> 37#include <net/ip.h>
38#include <net/route.h>
38#include <net/sock.h> 39#include <net/sock.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
@@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
447 out: 448 out:
448 read_unlock(&__ip_vs_svc_lock); 449 read_unlock(&__ip_vs_svc_lock);
449 450
450 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", 451 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
451 fwmark, ip_vs_proto_name(protocol), 452 fwmark, ip_vs_proto_name(protocol),
452 NIPQUAD(vaddr), ntohs(vport), 453 NIPQUAD(vaddr), ntohs(vport),
453 svc?"hit":"not hit"); 454 svc?"hit":"not hit");
@@ -597,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
597 */ 598 */
598 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { 599 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
599 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " 600 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
600 "refcnt=%d\n", 601 "dest->refcnt=%d\n",
601 dest->vfwmark, 602 dest->vfwmark,
602 NIPQUAD(dest->addr), ntohs(dest->port), 603 NIPQUAD(dest->addr), ntohs(dest->port),
603 atomic_read(&dest->refcnt)); 604 atomic_read(&dest->refcnt));
@@ -804,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
804 dest = ip_vs_trash_get_dest(svc, daddr, dport); 805 dest = ip_vs_trash_get_dest(svc, daddr, dport);
805 if (dest != NULL) { 806 if (dest != NULL) {
806 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " 807 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
807 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", 808 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
808 NIPQUAD(daddr), ntohs(dport), 809 NIPQUAD(daddr), ntohs(dport),
809 atomic_read(&dest->refcnt), 810 atomic_read(&dest->refcnt),
810 dest->vfwmark, 811 dest->vfwmark,
@@ -949,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
949 atomic_dec(&dest->svc->refcnt); 950 atomic_dec(&dest->svc->refcnt);
950 kfree(dest); 951 kfree(dest);
951 } else { 952 } else {
952 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", 953 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
954 "dest->refcnt=%d\n",
953 NIPQUAD(dest->addr), ntohs(dest->port), 955 NIPQUAD(dest->addr), ntohs(dest->port),
954 atomic_read(&dest->refcnt)); 956 atomic_read(&dest->refcnt));
955 list_add(&dest->n_list, &ip_vs_dest_trash); 957 list_add(&dest->n_list, &ip_vs_dest_trash);
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index f3bc320dce93..9fee19c4c617 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -37,8 +37,10 @@
37 * 37 *
38 */ 38 */
39 39
40#include <linux/ip.h>
40#include <linux/module.h> 41#include <linux/module.h>
41#include <linux/kernel.h> 42#include <linux/kernel.h>
43#include <linux/skbuff.h>
42 44
43#include <net/ip_vs.h> 45#include <net/ip_vs.h>
44 46
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 67b3e2fc1fa1..e7004741ac73 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -13,7 +13,10 @@
13 * Changes: 13 * Changes:
14 * 14 *
15 */ 15 */
16#include <linux/config.h>
16#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/jiffies.h>
19#include <linux/slab.h>
17#include <linux/types.h> 20#include <linux/types.h>
18 21
19#include <net/ip_vs.h> 22#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 561cda326fa8..6e5cb92a5c83 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -41,8 +41,10 @@
41 * me to write this module. 41 * me to write this module.
42 */ 42 */
43 43
44#include <linux/ip.h>
44#include <linux/module.h> 45#include <linux/module.h>
45#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/skbuff.h>
46 48
47/* for sysctl */ 49/* for sysctl */
48#include <linux/fs.h> 50#include <linux/fs.h>
@@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
228} 230}
229 231
230 232
231#if 0000
232/*
233 * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
234 * returns bool success.
235 */
236static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
237 struct ip_vs_lblc_entry *en)
238{
239 if (list_empty(&en->list)) {
240 IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
241 "called from %p\n", __builtin_return_address(0));
242 return 0;
243 }
244
245 /*
246 * Remove it from the table
247 */
248 write_lock(&tbl->lock);
249 list_del(&en->list);
250 INIT_LIST_HEAD(&en->list);
251 write_unlock(&tbl->lock);
252
253 return 1;
254}
255#endif
256
257
258/* 233/*
259 * Get ip_vs_lblc_entry associated with supplied parameters. 234 * Get ip_vs_lblc_entry associated with supplied parameters.
260 */ 235 */
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index ce456dbf09a5..32ba37ba72d8 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -39,8 +39,10 @@
39 * 39 *
40 */ 40 */
41 41
42#include <linux/ip.h>
42#include <linux/module.h> 43#include <linux/module.h>
43#include <linux/kernel.h> 44#include <linux/kernel.h>
45#include <linux/skbuff.h>
44 46
45/* for sysctl */ 47/* for sysctl */
46#include <linux/fs.h> 48#include <linux/fs.h>
@@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
414} 416}
415 417
416 418
417#if 0000
418/*
419 * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
420 * returns bool success.
421 */
422static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
423 struct ip_vs_lblcr_entry *en)
424{
425 if (list_empty(&en->list)) {
426 IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
427 "called from %p\n", __builtin_return_address(0));
428 return 0;
429 }
430
431 /*
432 * Remove it from the table
433 */
434 write_lock(&tbl->lock);
435 list_del(&en->list);
436 INIT_LIST_HEAD(&en->list);
437 write_unlock(&tbl->lock);
438
439 return 1;
440}
441#endif
442
443
444/* 419/*
445 * Get ip_vs_lblcr_entry associated with supplied parameters. 420 * Get ip_vs_lblcr_entry associated with supplied parameters.
446 */ 421 */
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 453e94a0bbd7..8b0505b09317 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -12,6 +12,8 @@
12 * 12 *
13 */ 13 */
14 14
15#include <linux/in.h>
16#include <linux/ip.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/kernel.h> 18#include <linux/kernel.h>
17#include <linux/netfilter.h> 19#include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
index 478e5c7c7e8e..c36ccf057a19 100644
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -12,6 +12,8 @@
12 * 12 *
13 */ 13 */
14 14
15#include <linux/in.h>
16#include <linux/ip.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/kernel.h> 18#include <linux/kernel.h>
17#include <linux/netfilter.h> 19#include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 0e878fd6215c..bc28b1160a3a 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
275 [IP_VS_TCP_S_LAST] = 2*HZ, 275 [IP_VS_TCP_S_LAST] = 2*HZ,
276}; 276};
277 277
278
279#if 0
280
281/* FIXME: This is going to die */
282
283static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
284 [IP_VS_TCP_S_NONE] = 2*HZ,
285 [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
286 [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
287 [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
288 [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
289 [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
290 [IP_VS_TCP_S_CLOSE] = 10*HZ,
291 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
292 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
293 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
294 [IP_VS_TCP_S_SYNACK] = 100*HZ,
295 [IP_VS_TCP_S_LAST] = 2*HZ,
296};
297
298#endif
299
300static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { 278static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
301 [IP_VS_TCP_S_NONE] = "NONE", 279 [IP_VS_TCP_S_NONE] = "NONE",
302 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", 280 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
@@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
448 struct ip_vs_dest *dest = cp->dest; 426 struct ip_vs_dest *dest = cp->dest;
449 427
450 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" 428 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
451 "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", 429 "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
452 pp->name, 430 pp->name,
453 (state_off==TCP_DIR_OUTPUT)?"output ":"input ", 431 (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
454 th->syn? 'S' : '.', 432 th->syn? 'S' : '.',
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 8ae5f2e0aefa..89d9175d8f28 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -15,8 +15,11 @@
15 * 15 *
16 */ 16 */
17 17
18#include <linux/in.h>
19#include <linux/ip.h>
18#include <linux/kernel.h> 20#include <linux/kernel.h>
19#include <linux/netfilter_ipv4.h> 21#include <linux/netfilter_ipv4.h>
22#include <linux/udp.h>
20 23
21#include <net/ip_vs.h> 24#include <net/ip_vs.h>
22 25
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 6f7c50e44a39..7775e6cc68be 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -34,8 +34,10 @@
34 * 34 *
35 */ 35 */
36 36
37#include <linux/ip.h>
37#include <linux/module.h> 38#include <linux/module.h>
38#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/skbuff.h>
39 41
40#include <net/ip_vs.h> 42#include <net/ip_vs.h>
41 43
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 2e5ced3d8062..1bca714bda3d 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -21,12 +21,14 @@
21 21
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/inetdevice.h>
24#include <linux/net.h> 25#include <linux/net.h>
25#include <linux/completion.h> 26#include <linux/completion.h>
26#include <linux/delay.h> 27#include <linux/delay.h>
27#include <linux/skbuff.h> 28#include <linux/skbuff.h>
28#include <linux/in.h> 29#include <linux/in.h>
29#include <linux/igmp.h> /* for ip_mc_join_group */ 30#include <linux/igmp.h> /* for ip_mc_join_group */
31#include <linux/udp.h>
30 32
31#include <net/ip.h> 33#include <net/ip.h>
32#include <net/sock.h> 34#include <net/sock.h>
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3c2e9639bba6..bba156304695 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -68,19 +68,14 @@ struct arpt_table_info {
68 unsigned int initial_entries; 68 unsigned int initial_entries;
69 unsigned int hook_entry[NF_ARP_NUMHOOKS]; 69 unsigned int hook_entry[NF_ARP_NUMHOOKS];
70 unsigned int underflow[NF_ARP_NUMHOOKS]; 70 unsigned int underflow[NF_ARP_NUMHOOKS];
71 char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); 71 void *entries[NR_CPUS];
72}; 72};
73 73
74static LIST_HEAD(arpt_target); 74static LIST_HEAD(arpt_target);
75static LIST_HEAD(arpt_tables); 75static LIST_HEAD(arpt_tables);
76#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
76#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) 77#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
77 78
78#ifdef CONFIG_SMP
79#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
80#else
81#define TABLE_OFFSET(t,p) 0
82#endif
83
84static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, 79static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
85 char *hdr_addr, int len) 80 char *hdr_addr, int len)
86{ 81{
@@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
269 outdev = out ? out->name : nulldevname; 264 outdev = out ? out->name : nulldevname;
270 265
271 read_lock_bh(&table->lock); 266 read_lock_bh(&table->lock);
272 table_base = (void *)table->private->entries 267 table_base = (void *)table->private->entries[smp_processor_id()];
273 + TABLE_OFFSET(table->private,
274 smp_processor_id());
275 e = get_entry(table_base, table->private->hook_entry[hook]); 268 e = get_entry(table_base, table->private->hook_entry[hook]);
276 back = get_entry(table_base, table->private->underflow[hook]); 269 back = get_entry(table_base, table->private->underflow[hook]);
277 270
@@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp)
462/* Figures out from what hook each rule can be called: returns 0 if 455/* Figures out from what hook each rule can be called: returns 0 if
463 * there are loops. Puts hook bitmask in comefrom. 456 * there are loops. Puts hook bitmask in comefrom.
464 */ 457 */
465static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) 458static int mark_source_chains(struct arpt_table_info *newinfo,
459 unsigned int valid_hooks, void *entry0)
466{ 460{
467 unsigned int hook; 461 unsigned int hook;
468 462
@@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
472 for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { 466 for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
473 unsigned int pos = newinfo->hook_entry[hook]; 467 unsigned int pos = newinfo->hook_entry[hook];
474 struct arpt_entry *e 468 struct arpt_entry *e
475 = (struct arpt_entry *)(newinfo->entries + pos); 469 = (struct arpt_entry *)(entry0 + pos);
476 470
477 if (!(valid_hooks & (1 << hook))) 471 if (!(valid_hooks & (1 << hook)))
478 continue; 472 continue;
@@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
514 goto next; 508 goto next;
515 509
516 e = (struct arpt_entry *) 510 e = (struct arpt_entry *)
517 (newinfo->entries + pos); 511 (entry0 + pos);
518 } while (oldpos == pos + e->next_offset); 512 } while (oldpos == pos + e->next_offset);
519 513
520 /* Move along one */ 514 /* Move along one */
521 size = e->next_offset; 515 size = e->next_offset;
522 e = (struct arpt_entry *) 516 e = (struct arpt_entry *)
523 (newinfo->entries + pos + size); 517 (entry0 + pos + size);
524 e->counters.pcnt = pos; 518 e->counters.pcnt = pos;
525 pos += size; 519 pos += size;
526 } else { 520 } else {
@@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
537 newpos = pos + e->next_offset; 531 newpos = pos + e->next_offset;
538 } 532 }
539 e = (struct arpt_entry *) 533 e = (struct arpt_entry *)
540 (newinfo->entries + newpos); 534 (entry0 + newpos);
541 e->counters.pcnt = pos; 535 e->counters.pcnt = pos;
542 pos = newpos; 536 pos = newpos;
543 } 537 }
@@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
689static int translate_table(const char *name, 683static int translate_table(const char *name,
690 unsigned int valid_hooks, 684 unsigned int valid_hooks,
691 struct arpt_table_info *newinfo, 685 struct arpt_table_info *newinfo,
686 void *entry0,
692 unsigned int size, 687 unsigned int size,
693 unsigned int number, 688 unsigned int number,
694 const unsigned int *hook_entries, 689 const unsigned int *hook_entries,
@@ -710,11 +705,11 @@ static int translate_table(const char *name,
710 i = 0; 705 i = 0;
711 706
712 /* Walk through entries, checking offsets. */ 707 /* Walk through entries, checking offsets. */
713 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 708 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
714 check_entry_size_and_hooks, 709 check_entry_size_and_hooks,
715 newinfo, 710 newinfo,
716 newinfo->entries, 711 entry0,
717 newinfo->entries + size, 712 entry0 + size,
718 hook_entries, underflows, &i); 713 hook_entries, underflows, &i);
719 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); 714 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
720 if (ret != 0) 715 if (ret != 0)
@@ -743,29 +738,26 @@ static int translate_table(const char *name,
743 } 738 }
744 } 739 }
745 740
746 if (!mark_source_chains(newinfo, valid_hooks)) { 741 if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
747 duprintf("Looping hook\n"); 742 duprintf("Looping hook\n");
748 return -ELOOP; 743 return -ELOOP;
749 } 744 }
750 745
751 /* Finally, each sanity check must pass */ 746 /* Finally, each sanity check must pass */
752 i = 0; 747 i = 0;
753 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 748 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
754 check_entry, name, size, &i); 749 check_entry, name, size, &i);
755 750
756 if (ret != 0) { 751 if (ret != 0) {
757 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 752 ARPT_ENTRY_ITERATE(entry0, newinfo->size,
758 cleanup_entry, &i); 753 cleanup_entry, &i);
759 return ret; 754 return ret;
760 } 755 }
761 756
762 /* And one copy for every other CPU */ 757 /* And one copy for every other CPU */
763 for_each_cpu(i) { 758 for_each_cpu(i) {
764 if (i == 0) 759 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
765 continue; 760 memcpy(newinfo->entries[i], entry0, newinfo->size);
766 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
767 newinfo->entries,
768 SMP_ALIGN(newinfo->size));
769 } 761 }
770 762
771 return ret; 763 return ret;
@@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e,
807 return 0; 799 return 0;
808} 800}
809 801
802static inline int set_entry_to_counter(const struct arpt_entry *e,
803 struct arpt_counters total[],
804 unsigned int *i)
805{
806 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
807
808 (*i)++;
809 return 0;
810}
811
810static void get_counters(const struct arpt_table_info *t, 812static void get_counters(const struct arpt_table_info *t,
811 struct arpt_counters counters[]) 813 struct arpt_counters counters[])
812{ 814{
813 unsigned int cpu; 815 unsigned int cpu;
814 unsigned int i; 816 unsigned int i;
817 unsigned int curcpu;
818
819 /* Instead of clearing (by a previous call to memset())
820 * the counters and using adds, we set the counters
 821 * with data used by the 'current' CPU.
 822 * We don't care about preemption here.
823 */
824 curcpu = raw_smp_processor_id();
825
826 i = 0;
827 ARPT_ENTRY_ITERATE(t->entries[curcpu],
828 t->size,
829 set_entry_to_counter,
830 counters,
831 &i);
815 832
816 for_each_cpu(cpu) { 833 for_each_cpu(cpu) {
834 if (cpu == curcpu)
835 continue;
817 i = 0; 836 i = 0;
818 ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), 837 ARPT_ENTRY_ITERATE(t->entries[cpu],
819 t->size, 838 t->size,
820 add_entry_to_counter, 839 add_entry_to_counter,
821 counters, 840 counters,
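
The rewritten get_counters() replaces memset-then-add with set-then-add: the current CPU's copy seeds the totals via SET_COUNTER, and every other copy is accumulated via ADD_COUNTER, saving one full pass over the counter array. A compact model of the scheme (plain C, sizes invented):

	#define NCPUS	4
	#define NRULES	8

	static void sum_counters(unsigned long total[NRULES],
				 unsigned long percpu[NCPUS][NRULES],
				 int curcpu)
	{
		int cpu, i;

		for (i = 0; i < NRULES; i++)		/* SET_COUNTER pass */
			total[i] = percpu[curcpu][i];

		for (cpu = 0; cpu < NCPUS; cpu++) {	/* ADD_COUNTER passes */
			if (cpu == curcpu)
				continue;
			for (i = 0; i < NRULES; i++)
				total[i] += percpu[cpu][i];
		}
	}
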
@@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size,
831 struct arpt_entry *e; 850 struct arpt_entry *e;
832 struct arpt_counters *counters; 851 struct arpt_counters *counters;
833 int ret = 0; 852 int ret = 0;
853 void *loc_cpu_entry;
834 854
835 /* We need atomic snapshot of counters: rest doesn't change 855 /* We need atomic snapshot of counters: rest doesn't change
836 * (other than comefrom, which userspace doesn't care 856 * (other than comefrom, which userspace doesn't care
@@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size,
843 return -ENOMEM; 863 return -ENOMEM;
844 864
845 /* First, sum counters... */ 865 /* First, sum counters... */
846 memset(counters, 0, countersize);
847 write_lock_bh(&table->lock); 866 write_lock_bh(&table->lock);
848 get_counters(table->private, counters); 867 get_counters(table->private, counters);
849 write_unlock_bh(&table->lock); 868 write_unlock_bh(&table->lock);
850 869
851 /* ... then copy entire thing from CPU 0... */ 870 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
852 if (copy_to_user(userptr, table->private->entries, total_size) != 0) { 871 /* ... then copy entire thing ... */
872 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
853 ret = -EFAULT; 873 ret = -EFAULT;
854 goto free_counters; 874 goto free_counters;
855 } 875 }
@@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size,
859 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 879 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
860 struct arpt_entry_target *t; 880 struct arpt_entry_target *t;
861 881
862 e = (struct arpt_entry *)(table->private->entries + off); 882 e = (struct arpt_entry *)(loc_cpu_entry + off);
863 if (copy_to_user(userptr + off 883 if (copy_to_user(userptr + off
864 + offsetof(struct arpt_entry, counters), 884 + offsetof(struct arpt_entry, counters),
865 &counters[num], 885 &counters[num],
@@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries,
911 return ret; 931 return ret;
912} 932}
913 933
934static void free_table_info(struct arpt_table_info *info)
935{
936 int cpu;
937 for_each_cpu(cpu) {
938 if (info->size <= PAGE_SIZE)
939 kfree(info->entries[cpu]);
940 else
941 vfree(info->entries[cpu]);
942 }
943 kfree(info);
944}
945
946static struct arpt_table_info *alloc_table_info(unsigned int size)
947{
948 struct arpt_table_info *newinfo;
949 int cpu;
950
951 newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL);
952 if (!newinfo)
953 return NULL;
954
955 newinfo->size = size;
956
957 for_each_cpu(cpu) {
958 if (size <= PAGE_SIZE)
959 newinfo->entries[cpu] = kmalloc_node(size,
960 GFP_KERNEL,
961 cpu_to_node(cpu));
962 else
963 newinfo->entries[cpu] = vmalloc_node(size,
964 cpu_to_node(cpu));
965
966 if (newinfo->entries[cpu] == NULL) {
967 free_table_info(newinfo);
968 return NULL;
969 }
970 }
971
972 return newinfo;
973}
974
914static int do_replace(void __user *user, unsigned int len) 975static int do_replace(void __user *user, unsigned int len)
915{ 976{
916 int ret; 977 int ret;
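
The table blob stops being one SMP_ALIGN()ed region indexed by TABLE_OFFSET and becomes NR_CPUS separate node-local allocations; translate_table() validates one copy and replicates it to the rest. A compact userspace model of that layout (the size and the allocator are stand-ins for kmalloc_node()/vmalloc_node()):

	#include <stdlib.h>
	#include <string.h>

	#define NCPUS 4

	int main(void)
	{
		size_t size = 256;	/* invented table size */
		void *entries[NCPUS];
		int cpu;

		for (cpu = 0; cpu < NCPUS; cpu++)
			entries[cpu] = malloc(size);	/* per-CPU copy */

		/* As in translate_table(): build and check one copy,
		 * then memcpy it into every other CPU's copy. */
		memset(entries[0], 0, size);
		for (cpu = 1; cpu < NCPUS; cpu++)
			memcpy(entries[cpu], entries[0], size);

		for (cpu = 0; cpu < NCPUS; cpu++)
			free(entries[cpu]);
		return 0;
	}
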
@@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len)
918 struct arpt_table *t; 979 struct arpt_table *t;
919 struct arpt_table_info *newinfo, *oldinfo; 980 struct arpt_table_info *newinfo, *oldinfo;
920 struct arpt_counters *counters; 981 struct arpt_counters *counters;
982 void *loc_cpu_entry, *loc_cpu_old_entry;
921 983
922 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 984 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
923 return -EFAULT; 985 return -EFAULT;
@@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len)
930 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) 992 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
931 return -ENOMEM; 993 return -ENOMEM;
932 994
933 newinfo = vmalloc(sizeof(struct arpt_table_info) 995 newinfo = alloc_table_info(tmp.size);
934 + SMP_ALIGN(tmp.size) *
935 (highest_possible_processor_id()+1));
936 if (!newinfo) 996 if (!newinfo)
937 return -ENOMEM; 997 return -ENOMEM;
938 998
939 if (copy_from_user(newinfo->entries, user + sizeof(tmp), 999 /* choose the copy that is on our node/cpu */
1000 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1001 if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
940 tmp.size) != 0) { 1002 tmp.size) != 0) {
941 ret = -EFAULT; 1003 ret = -EFAULT;
942 goto free_newinfo; 1004 goto free_newinfo;
@@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len)
947 ret = -ENOMEM; 1009 ret = -ENOMEM;
948 goto free_newinfo; 1010 goto free_newinfo;
949 } 1011 }
950 memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
951 1012
952 ret = translate_table(tmp.name, tmp.valid_hooks, 1013 ret = translate_table(tmp.name, tmp.valid_hooks,
953 newinfo, tmp.size, tmp.num_entries, 1014 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
954 tmp.hook_entry, tmp.underflow); 1015 tmp.hook_entry, tmp.underflow);
955 if (ret != 0) 1016 if (ret != 0)
956 goto free_newinfo_counters; 1017 goto free_newinfo_counters;
@@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len)
989 /* Get the old counters. */ 1050 /* Get the old counters. */
990 get_counters(oldinfo, counters); 1051 get_counters(oldinfo, counters);
991 /* Decrease module usage counts and free resource */ 1052 /* Decrease module usage counts and free resource */
992 ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); 1053 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
993 vfree(oldinfo); 1054 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
1055
1056 free_table_info(oldinfo);
994 if (copy_to_user(tmp.counters, counters, 1057 if (copy_to_user(tmp.counters, counters,
995 sizeof(struct arpt_counters) * tmp.num_counters) != 0) 1058 sizeof(struct arpt_counters) * tmp.num_counters) != 0)
996 ret = -EFAULT; 1059 ret = -EFAULT;
@@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len)
1002 module_put(t->me); 1065 module_put(t->me);
1003 up(&arpt_mutex); 1066 up(&arpt_mutex);
1004 free_newinfo_counters_untrans: 1067 free_newinfo_counters_untrans:
1005 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); 1068 ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
1006 free_newinfo_counters: 1069 free_newinfo_counters:
1007 vfree(counters); 1070 vfree(counters);
1008 free_newinfo: 1071 free_newinfo:
1009 vfree(newinfo); 1072 free_table_info(newinfo);
1010 return ret; 1073 return ret;
1011} 1074}
1012 1075
@@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len)
1030 struct arpt_counters_info tmp, *paddc; 1093 struct arpt_counters_info tmp, *paddc;
1031 struct arpt_table *t; 1094 struct arpt_table *t;
1032 int ret = 0; 1095 int ret = 0;
1096 void *loc_cpu_entry;
1033 1097
1034 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1098 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1035 return -EFAULT; 1099 return -EFAULT;
@@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len)
1059 } 1123 }
1060 1124
1061 i = 0; 1125 i = 0;
1062 ARPT_ENTRY_ITERATE(t->private->entries, 1126 /* Choose the copy that is on our node */
1127 loc_cpu_entry = t->private->entries[smp_processor_id()];
1128 ARPT_ENTRY_ITERATE(loc_cpu_entry,
1063 t->private->size, 1129 t->private->size,
1064 add_counter_to_entry, 1130 add_counter_to_entry,
1065 paddc->counters, 1131 paddc->counters,
@@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table,
1220 struct arpt_table_info *newinfo; 1286 struct arpt_table_info *newinfo;
1221 static struct arpt_table_info bootstrap 1287 static struct arpt_table_info bootstrap
1222 = { 0, 0, 0, { 0 }, { 0 }, { } }; 1288 = { 0, 0, 0, { 0 }, { 0 }, { } };
1289 void *loc_cpu_entry;
1223 1290
1224 newinfo = vmalloc(sizeof(struct arpt_table_info) 1291 newinfo = alloc_table_info(repl->size);
1225 + SMP_ALIGN(repl->size) *
1226 (highest_possible_processor_id()+1));
1227 if (!newinfo) { 1292 if (!newinfo) {
1228 ret = -ENOMEM; 1293 ret = -ENOMEM;
1229 return ret; 1294 return ret;
1230 } 1295 }
1231 memcpy(newinfo->entries, repl->entries, repl->size); 1296
1297 /* choose the copy on our node/cpu */
1298 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1299 memcpy(loc_cpu_entry, repl->entries, repl->size);
1232 1300
1233 ret = translate_table(table->name, table->valid_hooks, 1301 ret = translate_table(table->name, table->valid_hooks,
1234 newinfo, repl->size, 1302 newinfo, loc_cpu_entry, repl->size,
1235 repl->num_entries, 1303 repl->num_entries,
1236 repl->hook_entry, 1304 repl->hook_entry,
1237 repl->underflow); 1305 repl->underflow);
1238 duprintf("arpt_register_table: translate table gives %d\n", ret); 1306 duprintf("arpt_register_table: translate table gives %d\n", ret);
1239 if (ret != 0) { 1307 if (ret != 0) {
1240 vfree(newinfo); 1308 free_table_info(newinfo);
1241 return ret; 1309 return ret;
1242 } 1310 }
1243 1311
1244 ret = down_interruptible(&arpt_mutex); 1312 ret = down_interruptible(&arpt_mutex);
1245 if (ret != 0) { 1313 if (ret != 0) {
1246 vfree(newinfo); 1314 free_table_info(newinfo);
1247 return ret; 1315 return ret;
1248 } 1316 }
1249 1317
@@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table,
1272 return ret; 1340 return ret;
1273 1341
1274 free_unlock: 1342 free_unlock:
1275 vfree(newinfo); 1343 free_table_info(newinfo);
1276 goto unlock; 1344 goto unlock;
1277} 1345}
1278 1346
1279void arpt_unregister_table(struct arpt_table *table) 1347void arpt_unregister_table(struct arpt_table *table)
1280{ 1348{
1349 void *loc_cpu_entry;
1350
1281 down(&arpt_mutex); 1351 down(&arpt_mutex);
1282 LIST_DELETE(&arpt_tables, table); 1352 LIST_DELETE(&arpt_tables, table);
1283 up(&arpt_mutex); 1353 up(&arpt_mutex);
1284 1354
1285 /* Decrease module usage counts and free resources */ 1355 /* Decrease module usage counts and free resources */
1286 ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, 1356 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1357 ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
1287 cleanup_entry, NULL); 1358 cleanup_entry, NULL);
1288 vfree(table->private); 1359 free_table_info(table->private);
1289} 1360}
1290 1361
1291/* The built-in targets: standard (NULL) and error. */ 1362/* The built-in targets: standard (NULL) and error. */
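The arp_tables conversion above mirrors the ip_tables one further down: the flat, SMP_ALIGN-padded entry blob becomes an array of per-CPU pointers, so each CPU's rule copy can be allocated on its own NUMA node. A minimal sketch of the two layouts (illustrative names, NR_CPUS shrunk for the example; not the kernel definitions):

    /* Old layout: all copies live in one allocation, found by offset
     * math: the copy for CPU n starts at entries + SMP_ALIGN(size) * n. */
    struct table_info_old {
            unsigned int size;
            char entries[];                 /* copies back to back */
    };

    /* New layout: one pointer per possible CPU; each copy is its own
     * allocation and can be placed on that CPU's memory node. */
    #define NR_CPUS 4                       /* stand-in for the kernel constant */
    struct table_info_new {
            unsigned int size;
            void *entries[NR_CPUS];
    };

On NUMA machines this turns every fast-path rule walk into accesses to node-local memory instead of one shared region that may live on a remote node.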
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index e52847fa10f5..0366eedb4d70 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -18,11 +18,13 @@
18 * 18 *
19 */ 19 */
20 20
21#include <linux/in.h>
21#include <linux/kernel.h> 22#include <linux/kernel.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/netfilter.h> 24#include <linux/netfilter.h>
24#include <linux/ip.h> 25#include <linux/ip.h>
25#include <linux/moduleparam.h> 26#include <linux/moduleparam.h>
27#include <linux/udp.h>
26#include <net/checksum.h> 28#include <net/checksum.h>
27#include <net/udp.h> 29#include <net/udp.h>
28 30
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 744abb9d377a..57956dee60c8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -31,6 +31,7 @@
31#include <linux/ip.h> 31#include <linux/ip.h>
32#include <linux/in.h> 32#include <linux/in.h>
33#include <linux/list.h> 33#include <linux/list.h>
34#include <linux/seq_file.h>
34 35
35static DEFINE_RWLOCK(ip_ct_gre_lock); 36static DEFINE_RWLOCK(ip_ct_gre_lock);
36#define ASSERT_READ_LOCK(x) 37#define ASSERT_READ_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index f2dcac7c7660..46becbe4fe58 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -11,6 +11,7 @@
11#include <linux/timer.h> 11#include <linux/timer.h>
12#include <linux/netfilter.h> 12#include <linux/netfilter.h>
13#include <linux/in.h> 13#include <linux/in.h>
14#include <linux/ip.h>
14#include <linux/udp.h> 15#include <linux/udp.h>
15#include <linux/seq_file.h> 16#include <linux/seq_file.h>
16#include <net/checksum.h> 17#include <net/checksum.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index dd476b191f4b..a88bcc551244 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -27,6 +27,7 @@
27#endif 27#endif
28#include <net/checksum.h> 28#include <net/checksum.h>
29#include <net/ip.h> 29#include <net/ip.h>
30#include <net/route.h>
30 31
31#define ASSERT_READ_LOCK(x) 32#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) 33#define ASSERT_WRITE_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 8acb7ed40b47..4f95d477805c 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -44,6 +44,7 @@
44 * 44 *
45 */ 45 */
46#include <linux/config.h> 46#include <linux/config.h>
47#include <linux/in.h>
47#include <linux/module.h> 48#include <linux/module.h>
48#include <linux/types.h> 49#include <linux/types.h>
49#include <linux/kernel.h> 50#include <linux/kernel.h>
@@ -53,6 +54,7 @@
53#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 54#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
54#include <linux/netfilter_ipv4/ip_nat_helper.h> 55#include <linux/netfilter_ipv4/ip_nat_helper.h>
55#include <linux/ip.h> 56#include <linux/ip.h>
57#include <linux/udp.h>
56#include <net/checksum.h> 58#include <net/checksum.h>
57#include <net/udp.h> 59#include <net/udp.h>
58#include <asm/uaccess.h> 60#include <asm/uaccess.h>
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 45886c8475e8..2a26d167e149 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex);
83 context stops packets coming through and allows user context to read 83 context stops packets coming through and allows user context to read
84 the counters or update the rules. 84 the counters or update the rules.
85 85
86 To be cache friendly on SMP, we arrange them like so:
87 [ n-entries ]
88 ... cache-align padding ...
89 [ n-entries ]
90
91 Hence the start of any table is given by get_table() below. */ 86 Hence the start of any table is given by get_table() below. */
92 87
93/* The table itself */ 88/* The table itself */
@@ -105,20 +100,15 @@ struct ipt_table_info
105 unsigned int underflow[NF_IP_NUMHOOKS]; 100 unsigned int underflow[NF_IP_NUMHOOKS];
106 101
107 /* ipt_entry tables: one per CPU */ 102 /* ipt_entry tables: one per CPU */
108 char entries[0] ____cacheline_aligned; 103 void *entries[NR_CPUS];
109}; 104};
110 105
111static LIST_HEAD(ipt_target); 106static LIST_HEAD(ipt_target);
112static LIST_HEAD(ipt_match); 107static LIST_HEAD(ipt_match);
113static LIST_HEAD(ipt_tables); 108static LIST_HEAD(ipt_tables);
109#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
114#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) 110#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
115 111
116#ifdef CONFIG_SMP
117#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
118#else
119#define TABLE_OFFSET(t,p) 0
120#endif
121
122#if 0 112#if 0
123#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) 113#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
124#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) 114#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb,
290 280
291 read_lock_bh(&table->lock); 281 read_lock_bh(&table->lock);
292 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 282 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
293 table_base = (void *)table->private->entries 283 table_base = (void *)table->private->entries[smp_processor_id()];
294 + TABLE_OFFSET(table->private, smp_processor_id());
295 e = get_entry(table_base, table->private->hook_entry[hook]); 284 e = get_entry(table_base, table->private->hook_entry[hook]);
296 285
297#ifdef CONFIG_NETFILTER_DEBUG 286#ifdef CONFIG_NETFILTER_DEBUG
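With the per-CPU pointers in place, TABLE_OFFSET() and its multiply disappear from the packet fast path; picking the ruleset copy is a single array load. Schematically (a compilable sketch, not the kernel code):

    struct xt_info {                        /* illustrative mirror */
            unsigned int size;
            void *entries[64];              /* one ruleset copy per CPU */
    };

    /* before: base = (char *)flat_entries + SMP_ALIGN(info->size) * cpu; */
    static void *ruleset_base(struct xt_info *info, int cpu)
    {
            return info->entries[cpu];      /* after: one dependent load */
    }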
@@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip)
563/* Figures out from what hook each rule can be called: returns 0 if 552/* Figures out from what hook each rule can be called: returns 0 if
564 there are loops. Puts hook bitmask in comefrom. */ 553 there are loops. Puts hook bitmask in comefrom. */
565static int 554static int
566mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) 555mark_source_chains(struct ipt_table_info *newinfo,
556 unsigned int valid_hooks, void *entry0)
567{ 557{
568 unsigned int hook; 558 unsigned int hook;
569 559
@@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
572 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { 562 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
573 unsigned int pos = newinfo->hook_entry[hook]; 563 unsigned int pos = newinfo->hook_entry[hook];
574 struct ipt_entry *e 564 struct ipt_entry *e
575 = (struct ipt_entry *)(newinfo->entries + pos); 565 = (struct ipt_entry *)(entry0 + pos);
576 566
577 if (!(valid_hooks & (1 << hook))) 567 if (!(valid_hooks & (1 << hook)))
578 continue; 568 continue;
@@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
622 goto next; 612 goto next;
623 613
624 e = (struct ipt_entry *) 614 e = (struct ipt_entry *)
625 (newinfo->entries + pos); 615 (entry0 + pos);
626 } while (oldpos == pos + e->next_offset); 616 } while (oldpos == pos + e->next_offset);
627 617
628 /* Move along one */ 618 /* Move along one */
629 size = e->next_offset; 619 size = e->next_offset;
630 e = (struct ipt_entry *) 620 e = (struct ipt_entry *)
631 (newinfo->entries + pos + size); 621 (entry0 + pos + size);
632 e->counters.pcnt = pos; 622 e->counters.pcnt = pos;
633 pos += size; 623 pos += size;
634 } else { 624 } else {
@@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
645 newpos = pos + e->next_offset; 635 newpos = pos + e->next_offset;
646 } 636 }
647 e = (struct ipt_entry *) 637 e = (struct ipt_entry *)
648 (newinfo->entries + newpos); 638 (entry0 + newpos);
649 e->counters.pcnt = pos; 639 e->counters.pcnt = pos;
650 pos = newpos; 640 pos = newpos;
651 } 641 }
@@ -855,6 +845,7 @@ static int
855translate_table(const char *name, 845translate_table(const char *name,
856 unsigned int valid_hooks, 846 unsigned int valid_hooks,
857 struct ipt_table_info *newinfo, 847 struct ipt_table_info *newinfo,
848 void *entry0,
858 unsigned int size, 849 unsigned int size,
859 unsigned int number, 850 unsigned int number,
860 const unsigned int *hook_entries, 851 const unsigned int *hook_entries,
@@ -875,11 +866,11 @@ translate_table(const char *name,
875 duprintf("translate_table: size %u\n", newinfo->size); 866 duprintf("translate_table: size %u\n", newinfo->size);
876 i = 0; 867 i = 0;
877 /* Walk through entries, checking offsets. */ 868 /* Walk through entries, checking offsets. */
878 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 869 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
879 check_entry_size_and_hooks, 870 check_entry_size_and_hooks,
880 newinfo, 871 newinfo,
881 newinfo->entries, 872 entry0,
882 newinfo->entries + size, 873 entry0 + size,
883 hook_entries, underflows, &i); 874 hook_entries, underflows, &i);
884 if (ret != 0) 875 if (ret != 0)
885 return ret; 876 return ret;
@@ -907,27 +898,24 @@ translate_table(const char *name,
907 } 898 }
908 } 899 }
909 900
910 if (!mark_source_chains(newinfo, valid_hooks)) 901 if (!mark_source_chains(newinfo, valid_hooks, entry0))
911 return -ELOOP; 902 return -ELOOP;
912 903
913 /* Finally, each sanity check must pass */ 904 /* Finally, each sanity check must pass */
914 i = 0; 905 i = 0;
915 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 906 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
916 check_entry, name, size, &i); 907 check_entry, name, size, &i);
917 908
918 if (ret != 0) { 909 if (ret != 0) {
919 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 910 IPT_ENTRY_ITERATE(entry0, newinfo->size,
920 cleanup_entry, &i); 911 cleanup_entry, &i);
921 return ret; 912 return ret;
922 } 913 }
923 914
924 /* And one copy for every other CPU */ 915 /* And one copy for every other CPU */
925 for_each_cpu(i) { 916 for_each_cpu(i) {
926 if (i == 0) 917 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
927 continue; 918 memcpy(newinfo->entries[i], entry0, newinfo->size);
928 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
929 newinfo->entries,
930 SMP_ALIGN(newinfo->size));
931 } 919 }
932 920
933 return ret; 921 return ret;
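translate_table() now validates a single template copy (entry0) and then replicates it byte for byte; the pointer-identity test keeps the source copy from being memcpy'd onto itself. A userspace model of the replication step (names local to the sketch):

    #include <string.h>

    #define NR_CPUS 4

    struct table_info { unsigned int size; void *entries[NR_CPUS]; };

    /* Copy the checked template into every other CPU's slot; skip the
     * slot that already holds the template itself. */
    static void replicate(struct table_info *info, void *entry0)
    {
            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    if (info->entries[cpu] && info->entries[cpu] != entry0)
                            memcpy(info->entries[cpu], entry0, info->size);
    }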
@@ -943,15 +931,12 @@ replace_table(struct ipt_table *table,
943 931
944#ifdef CONFIG_NETFILTER_DEBUG 932#ifdef CONFIG_NETFILTER_DEBUG
945 { 933 {
946 struct ipt_entry *table_base; 934 int cpu;
947 unsigned int i;
948 935
949 for_each_cpu(i) { 936 for_each_cpu(cpu) {
950 table_base = 937 struct ipt_entry *table_base = newinfo->entries[cpu];
951 (void *)newinfo->entries 938 if (table_base)
952 + TABLE_OFFSET(newinfo, i); 939 table_base->comefrom = 0xdead57ac;
953
954 table_base->comefrom = 0xdead57ac;
955 } 940 }
956 } 941 }
957#endif 942#endif
@@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e,
986 return 0; 971 return 0;
987} 972}
988 973
974static inline int
975set_entry_to_counter(const struct ipt_entry *e,
976 struct ipt_counters total[],
977 unsigned int *i)
978{
979 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
980
981 (*i)++;
982 return 0;
983}
984
989static void 985static void
990get_counters(const struct ipt_table_info *t, 986get_counters(const struct ipt_table_info *t,
991 struct ipt_counters counters[]) 987 struct ipt_counters counters[])
992{ 988{
993 unsigned int cpu; 989 unsigned int cpu;
994 unsigned int i; 990 unsigned int i;
991 unsigned int curcpu;
992
993 /* Instead of clearing (by a previous call to memset())
994 * the counters and using adds, we set the counters
995 * with data used by the 'current' CPU.
996 * We don't care about preemption here.
997 */
998 curcpu = raw_smp_processor_id();
999
1000 i = 0;
1001 IPT_ENTRY_ITERATE(t->entries[curcpu],
1002 t->size,
1003 set_entry_to_counter,
1004 counters,
1005 &i);
995 1006
996 for_each_cpu(cpu) { 1007 for_each_cpu(cpu) {
1008 if (cpu == curcpu)
1009 continue;
997 i = 0; 1010 i = 0;
998 IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), 1011 IPT_ENTRY_ITERATE(t->entries[cpu],
999 t->size, 1012 t->size,
1000 add_entry_to_counter, 1013 add_entry_to_counter,
1001 counters, 1014 counters,
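get_counters() is also reworked: rather than zeroing the (possibly large) output array and adding every CPU, it seeds the totals from the current CPU's copy with SET_COUNTER and adds only the remaining CPUs, which is what lets the memset() be dropped in the next hunk. The same fold as a plain-C model:

    #define NR_CPUS 4
    #define N_RULES 3

    struct counter { unsigned long bcnt, pcnt; };

    static struct counter percpu[NR_CPUS][N_RULES]; /* toy per-CPU data */

    static void fold(struct counter *out, int curcpu)
    {
            int cpu, i;

            for (i = 0; i < N_RULES; i++)           /* seed, no memset */
                    out[i] = percpu[curcpu][i];

            for (cpu = 0; cpu < NR_CPUS; cpu++) {
                    if (cpu == curcpu)
                            continue;
                    for (i = 0; i < N_RULES; i++) {
                            out[i].bcnt += percpu[cpu][i].bcnt;
                            out[i].pcnt += percpu[cpu][i].pcnt;
                    }
            }
    }

Preemption does not matter for the choice of curcpu: any CPU's copy is an equally valid starting point, which is why raw_smp_processor_id() suffices.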
@@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size,
1012 struct ipt_entry *e; 1025 struct ipt_entry *e;
1013 struct ipt_counters *counters; 1026 struct ipt_counters *counters;
1014 int ret = 0; 1027 int ret = 0;
1028 void *loc_cpu_entry;
1015 1029
1016 /* We need atomic snapshot of counters: rest doesn't change 1030 /* We need atomic snapshot of counters: rest doesn't change
1017 (other than comefrom, which userspace doesn't care 1031 (other than comefrom, which userspace doesn't care
1018 about). */ 1032 about). */
1019 countersize = sizeof(struct ipt_counters) * table->private->number; 1033 countersize = sizeof(struct ipt_counters) * table->private->number;
1020 counters = vmalloc(countersize); 1034 counters = vmalloc_node(countersize, numa_node_id());
1021 1035
1022 if (counters == NULL) 1036 if (counters == NULL)
1023 return -ENOMEM; 1037 return -ENOMEM;
1024 1038
1025 /* First, sum counters... */ 1039 /* First, sum counters... */
1026 memset(counters, 0, countersize);
1027 write_lock_bh(&table->lock); 1040 write_lock_bh(&table->lock);
1028 get_counters(table->private, counters); 1041 get_counters(table->private, counters);
1029 write_unlock_bh(&table->lock); 1042 write_unlock_bh(&table->lock);
1030 1043
1031 /* ... then copy entire thing from CPU 0... */ 1044 /* choose the copy that is on our node/cpu, ...
1032 if (copy_to_user(userptr, table->private->entries, total_size) != 0) { 1045 * This choice is lazy (because current thread is
1046 * allowed to migrate to another cpu)
1047 */
1048 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1049 /* ... then copy entire thing ... */
1050 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
1033 ret = -EFAULT; 1051 ret = -EFAULT;
1034 goto free_counters; 1052 goto free_counters;
1035 } 1053 }
@@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size,
1041 struct ipt_entry_match *m; 1059 struct ipt_entry_match *m;
1042 struct ipt_entry_target *t; 1060 struct ipt_entry_target *t;
1043 1061
1044 e = (struct ipt_entry *)(table->private->entries + off); 1062 e = (struct ipt_entry *)(loc_cpu_entry + off);
1045 if (copy_to_user(userptr + off 1063 if (copy_to_user(userptr + off
1046 + offsetof(struct ipt_entry, counters), 1064 + offsetof(struct ipt_entry, counters),
1047 &counters[num], 1065 &counters[num],
@@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries,
1110 return ret; 1128 return ret;
1111} 1129}
1112 1130
1131static void free_table_info(struct ipt_table_info *info)
1132{
1133 int cpu;
1134 for_each_cpu(cpu) {
1135 if (info->size <= PAGE_SIZE)
1136 kfree(info->entries[cpu]);
1137 else
1138 vfree(info->entries[cpu]);
1139 }
1140 kfree(info);
1141}
1142
1143static struct ipt_table_info *alloc_table_info(unsigned int size)
1144{
1145 struct ipt_table_info *newinfo;
1146 int cpu;
1147
1148 newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
1149 if (!newinfo)
1150 return NULL;
1151
1152 newinfo->size = size;
1153
1154 for_each_cpu(cpu) {
1155 if (size <= PAGE_SIZE)
1156 newinfo->entries[cpu] = kmalloc_node(size,
1157 GFP_KERNEL,
1158 cpu_to_node(cpu));
1159 else
1160 newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu));
1161 if (newinfo->entries[cpu] == 0) {
1162 free_table_info(newinfo);
1163 return NULL;
1164 }
1165 }
1166
1167 return newinfo;
1168}
1169
1113static int 1170static int
1114do_replace(void __user *user, unsigned int len) 1171do_replace(void __user *user, unsigned int len)
1115{ 1172{
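alloc_table_info() picks the allocator per copy, slab for page-sized tables and vmalloc for larger ones, and free_table_info() re-derives the same choice from info->size so that kfree()/vfree() always match the allocation. A runnable userspace model of the pairing and of the error unwind (both pools are plain malloc() here):

    #include <stdlib.h>

    #define NR_CPUS 4

    struct table_info {
            unsigned int size;
            void *entries[NR_CPUS];
    };

    static void free_table_info(struct table_info *info)
    {
            /* free(NULL) is a no-op, as are kfree(NULL)/vfree(NULL),
             * so partially filled tables unwind safely. */
            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    free(info->entries[cpu]);
            free(info);
    }

    static struct table_info *alloc_table_info(unsigned int size)
    {
            struct table_info *info = calloc(1, sizeof(*info)); /* ~kzalloc */
            if (!info)
                    return NULL;
            info->size = size;
            for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                    info->entries[cpu] = malloc(size); /* ~k/vmalloc_node */
                    if (!info->entries[cpu]) {
                            free_table_info(info);
                            return NULL;
                    }
            }
            return info;
    }

Zeroing the struct up front is what makes the mid-loop failure path safe: slots that were never allocated are still NULL and are skipped harmlessly by the free side.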
@@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len)
1118 struct ipt_table *t; 1175 struct ipt_table *t;
1119 struct ipt_table_info *newinfo, *oldinfo; 1176 struct ipt_table_info *newinfo, *oldinfo;
1120 struct ipt_counters *counters; 1177 struct ipt_counters *counters;
1178 void *loc_cpu_entry, *loc_cpu_old_entry;
1121 1179
1122 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1180 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1123 return -EFAULT; 1181 return -EFAULT;
@@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len)
1130 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) 1188 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
1131 return -ENOMEM; 1189 return -ENOMEM;
1132 1190
1133 newinfo = vmalloc(sizeof(struct ipt_table_info) 1191 newinfo = alloc_table_info(tmp.size);
1134 + SMP_ALIGN(tmp.size) *
1135 (highest_possible_processor_id()+1));
1136 if (!newinfo) 1192 if (!newinfo)
1137 return -ENOMEM; 1193 return -ENOMEM;
1138 1194
1139 if (copy_from_user(newinfo->entries, user + sizeof(tmp), 1195 /* choose the copy that is on our node/cpu */
1196 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1197 if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
1140 tmp.size) != 0) { 1198 tmp.size) != 0) {
1141 ret = -EFAULT; 1199 ret = -EFAULT;
1142 goto free_newinfo; 1200 goto free_newinfo;
@@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len)
1147 ret = -ENOMEM; 1205 ret = -ENOMEM;
1148 goto free_newinfo; 1206 goto free_newinfo;
1149 } 1207 }
1150 memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
1151 1208
1152 ret = translate_table(tmp.name, tmp.valid_hooks, 1209 ret = translate_table(tmp.name, tmp.valid_hooks,
1153 newinfo, tmp.size, tmp.num_entries, 1210 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1154 tmp.hook_entry, tmp.underflow); 1211 tmp.hook_entry, tmp.underflow);
1155 if (ret != 0) 1212 if (ret != 0)
1156 goto free_newinfo_counters; 1213 goto free_newinfo_counters;
@@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len)
1189 /* Get the old counters. */ 1246 /* Get the old counters. */
1190 get_counters(oldinfo, counters); 1247 get_counters(oldinfo, counters);
1191 /* Decrease module usage counts and free resource */ 1248 /* Decrease module usage counts and free resource */
1192 IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); 1249 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1193 vfree(oldinfo); 1250 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
1251 free_table_info(oldinfo);
1194 if (copy_to_user(tmp.counters, counters, 1252 if (copy_to_user(tmp.counters, counters,
1195 sizeof(struct ipt_counters) * tmp.num_counters) != 0) 1253 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
1196 ret = -EFAULT; 1254 ret = -EFAULT;
@@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len)
1202 module_put(t->me); 1260 module_put(t->me);
1203 up(&ipt_mutex); 1261 up(&ipt_mutex);
1204 free_newinfo_counters_untrans: 1262 free_newinfo_counters_untrans:
1205 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); 1263 IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
1206 free_newinfo_counters: 1264 free_newinfo_counters:
1207 vfree(counters); 1265 vfree(counters);
1208 free_newinfo: 1266 free_newinfo:
1209 vfree(newinfo); 1267 free_table_info(newinfo);
1210 return ret; 1268 return ret;
1211} 1269}
1212 1270
@@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len)
1239 struct ipt_counters_info tmp, *paddc; 1297 struct ipt_counters_info tmp, *paddc;
1240 struct ipt_table *t; 1298 struct ipt_table *t;
1241 int ret = 0; 1299 int ret = 0;
1300 void *loc_cpu_entry;
1242 1301
1243 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1302 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1244 return -EFAULT; 1303 return -EFAULT;
@@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len)
1246 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) 1305 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
1247 return -EINVAL; 1306 return -EINVAL;
1248 1307
1249 paddc = vmalloc(len); 1308 paddc = vmalloc_node(len, numa_node_id());
1250 if (!paddc) 1309 if (!paddc)
1251 return -ENOMEM; 1310 return -ENOMEM;
1252 1311
@@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len)
1268 } 1327 }
1269 1328
1270 i = 0; 1329 i = 0;
1271 IPT_ENTRY_ITERATE(t->private->entries, 1330 /* Choose the copy that is on our node */
1331 loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
1332 IPT_ENTRY_ITERATE(loc_cpu_entry,
1272 t->private->size, 1333 t->private->size,
1273 add_counter_to_entry, 1334 add_counter_to_entry,
1274 paddc->counters, 1335 paddc->counters,
@@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
1460 struct ipt_table_info *newinfo; 1521 struct ipt_table_info *newinfo;
1461 static struct ipt_table_info bootstrap 1522 static struct ipt_table_info bootstrap
1462 = { 0, 0, 0, { 0 }, { 0 }, { } }; 1523 = { 0, 0, 0, { 0 }, { 0 }, { } };
1524 void *loc_cpu_entry;
1463 1525
1464 newinfo = vmalloc(sizeof(struct ipt_table_info) 1526 newinfo = alloc_table_info(repl->size);
1465 + SMP_ALIGN(repl->size) *
1466 (highest_possible_processor_id()+1));
1467 if (!newinfo) 1527 if (!newinfo)
1468 return -ENOMEM; 1528 return -ENOMEM;
1469 1529
1470 memcpy(newinfo->entries, repl->entries, repl->size); 1530 /* choose the copy on our node/cpu
1531 * but don't care about preemption
1532 */
1533 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1534 memcpy(loc_cpu_entry, repl->entries, repl->size);
1471 1535
1472 ret = translate_table(table->name, table->valid_hooks, 1536 ret = translate_table(table->name, table->valid_hooks,
1473 newinfo, repl->size, 1537 newinfo, loc_cpu_entry, repl->size,
1474 repl->num_entries, 1538 repl->num_entries,
1475 repl->hook_entry, 1539 repl->hook_entry,
1476 repl->underflow); 1540 repl->underflow);
1477 if (ret != 0) { 1541 if (ret != 0) {
1478 vfree(newinfo); 1542 free_table_info(newinfo);
1479 return ret; 1543 return ret;
1480 } 1544 }
1481 1545
1482 ret = down_interruptible(&ipt_mutex); 1546 ret = down_interruptible(&ipt_mutex);
1483 if (ret != 0) { 1547 if (ret != 0) {
1484 vfree(newinfo); 1548 free_table_info(newinfo);
1485 return ret; 1549 return ret;
1486 } 1550 }
1487 1551
@@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
1510 return ret; 1574 return ret;
1511 1575
1512 free_unlock: 1576 free_unlock:
1513 vfree(newinfo); 1577 free_table_info(newinfo);
1514 goto unlock; 1578 goto unlock;
1515} 1579}
1516 1580
1517void ipt_unregister_table(struct ipt_table *table) 1581void ipt_unregister_table(struct ipt_table *table)
1518{ 1582{
1583 void *loc_cpu_entry;
1584
1519 down(&ipt_mutex); 1585 down(&ipt_mutex);
1520 LIST_DELETE(&ipt_tables, table); 1586 LIST_DELETE(&ipt_tables, table);
1521 up(&ipt_mutex); 1587 up(&ipt_mutex);
1522 1588
1523 /* Decrease module usage counts and free resources */ 1589 /* Decrease module usage counts and free resources */
1524 IPT_ENTRY_ITERATE(table->private->entries, table->private->size, 1590 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1591 IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
1525 cleanup_entry, NULL); 1592 cleanup_entry, NULL);
1526 vfree(table->private); 1593 free_table_info(table->private);
1527} 1594}
1528 1595
1529/* Returns 1 if the port is matched by the range, 0 otherwise */ 1596/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 275a174c6fe6..27860510ca6d 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/inetdevice.h>
14#include <linux/ip.h> 15#include <linux/ip.h>
15#include <linux/timer.h> 16#include <linux/timer.h>
16#include <linux/module.h> 17#include <linux/module.h>
@@ -18,6 +19,7 @@
18#include <net/protocol.h> 19#include <net/protocol.h>
19#include <net/ip.h> 20#include <net/ip.h>
20#include <net/checksum.h> 21#include <net/checksum.h>
22#include <net/route.h>
21#include <linux/netfilter_ipv4.h> 23#include <linux/netfilter_ipv4.h>
22#include <linux/netfilter_ipv4/ip_nat_rule.h> 24#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#include <linux/netfilter_ipv4/ip_tables.h> 25#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
index 1a53924041fc..03f554857a4d 100644
--- a/net/ipv4/netfilter/ipt_physdev.c
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/netdevice.h>
12#include <linux/skbuff.h> 13#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ipt_physdev.h> 14#include <linux/netfilter_ipv4/ipt_physdev.h>
14#include <linux/netfilter_ipv4/ip_tables.h> 15#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0d7dc668db46..39d49dc333a7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -38,6 +38,7 @@
38#include <net/protocol.h> 38#include <net/protocol.h>
39#include <net/tcp.h> 39#include <net/tcp.h>
40#include <net/udp.h> 40#include <net/udp.h>
41#include <linux/inetdevice.h>
41#include <linux/proc_fs.h> 42#include <linux/proc_fs.h>
42#include <linux/seq_file.h> 43#include <linux/seq_file.h>
43#include <net/sock.h> 44#include <net/sock.h>
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a34e60ea48a1..e20be3331f67 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
173 struct request_sock *req, 173 struct request_sock *req,
174 struct dst_entry *dst) 174 struct dst_entry *dst)
175{ 175{
176 struct tcp_sock *tp = tcp_sk(sk); 176 struct inet_connection_sock *icsk = inet_csk(sk);
177 struct sock *child; 177 struct sock *child;
178 178
179 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); 179 child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
180 if (child) 180 if (child)
181 inet_csk_reqsk_queue_add(sk, req, child); 181 inet_csk_reqsk_queue_add(sk, req, child);
182 else 182 else
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48b..16984d4a8a06 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
12#include <linux/sysctl.h> 12#include <linux/sysctl.h>
13#include <linux/config.h> 13#include <linux/config.h>
14#include <linux/igmp.h> 14#include <linux/igmp.h>
15#include <linux/inetdevice.h>
15#include <net/snmp.h> 16#include <net/snmp.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -22,6 +23,7 @@
22extern int sysctl_ip_nonlocal_bind; 23extern int sysctl_ip_nonlocal_bind;
23 24
24#ifdef CONFIG_SYSCTL 25#ifdef CONFIG_SYSCTL
26static int zero;
25static int tcp_retr1_max = 255; 27static int tcp_retr1_max = 255;
26static int ip_local_port_range_min[] = { 1, 1 }; 28static int ip_local_port_range_min[] = { 1, 1 };
27static int ip_local_port_range_max[] = { 65535, 65535 }; 29static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -614,6 +616,15 @@ ctl_table ipv4_table[] = {
614 .strategy = &sysctl_jiffies 616 .strategy = &sysctl_jiffies
615 }, 617 },
616 { 618 {
619 .ctl_name = NET_IPV4_IPFRAG_MAX_DIST,
620 .procname = "ipfrag_max_dist",
621 .data = &sysctl_ipfrag_max_dist,
622 .maxlen = sizeof(int),
623 .mode = 0644,
624 .proc_handler = &proc_dointvec_minmax,
625 .extra1 = &zero
626 },
627 {
617 .ctl_name = NET_TCP_NO_METRICS_SAVE, 628 .ctl_name = NET_TCP_NO_METRICS_SAVE,
618 .procname = "tcp_no_metrics_save", 629 .procname = "tcp_no_metrics_save",
619 .data = &sysctl_tcp_nometrics_save, 630 .data = &sysctl_tcp_nometrics_save,
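The new ipfrag_max_dist entry uses proc_dointvec_minmax with .extra1 pointing at zero, so negative writes are rejected at the sysctl layer. Once registered it appears under /proc/sys/net/ipv4/; a small hedged example of setting it from userspace (the value 64 is only an illustration):

    #include <stdio.h>

    int main(void)
    {
            /* Path follows the procname registered above. */
            FILE *f = fopen("/proc/sys/net/ipv4/ipfrag_max_dist", "w");
            if (!f) {
                    perror("ipfrag_max_dist");
                    return 1;
            }
            fprintf(f, "%d\n", 64);
            return fclose(f) ? 1 : 0;
    }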
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14ac56d..00aa80e93243 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1696 int err = 0; 1696 int err = 0;
1697 1697
1698 if (level != SOL_TCP) 1698 if (level != SOL_TCP)
1699 return tp->af_specific->setsockopt(sk, level, optname, 1699 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1700 optval, optlen); 1700 optval, optlen);
1701 1701
1702 /* This is a string value all the others are int's */ 1702 /* This is a string value all the others are int's */
1703 if (optname == TCP_CONGESTION) { 1703 if (optname == TCP_CONGESTION) {
@@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
1914 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); 1914 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1915 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 1915 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1916 1916
1917 info->tcpi_pmtu = tp->pmtu_cookie; 1917 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
1918 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; 1918 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1919 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; 1919 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1920 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; 1920 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
@@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1939 int val, len; 1939 int val, len;
1940 1940
1941 if (level != SOL_TCP) 1941 if (level != SOL_TCP)
1942 return tp->af_specific->getsockopt(sk, level, optname, 1942 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
1943 optval, optlen); 1943 optval, optlen);
1944 1944
1945 if (get_user(len, optlen)) 1945 if (get_user(len, optlen))
1946 return -EFAULT; 1946 return -EFAULT;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 1d0cd86621b1..035f2092d73a 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -30,8 +30,6 @@ static int fast_convergence = 1;
30static int max_increment = 16; 30static int max_increment = 16;
31static int low_window = 14; 31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ 32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100; 33static int initial_ssthresh = 100;
36static int smooth_part = 20; 34static int smooth_part = 20;
37 35
@@ -43,10 +41,6 @@ module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); 41MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644); 42module_param(beta, int, 0644);
45MODULE_PARM_DESC(beta, "beta for multiplicative increase"); 43MODULE_PARM_DESC(beta, "beta for multiplicative increase");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
49MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)");
50module_param(initial_ssthresh, int, 0644); 44module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); 45MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644); 46module_param(smooth_part, int, 0644);
@@ -60,11 +54,6 @@ struct bictcp {
60 u32 loss_cwnd; /* congestion window at last loss */ 54 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */ 55 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */ 56 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */ 57 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4 58#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ 59 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
@@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca)
77 ca->loss_cwnd = 0; 66 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0; 67 ca->last_cwnd = 0;
79 ca->last_time = 0; 68 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0; 69 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT; 70 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87} 71}
@@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
143 } 127 }
144 128
145 /* if in slow start or link utilization is very low */ 129 /* if in slow start or link utilization is very low */
146 if ( ca->loss_cwnd == 0 || 130 if (ca->loss_cwnd == 0) {
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */ 131 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20; 132 ca->cnt = 20;
150 } 133 }
@@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
154 ca->cnt = 1; 137 ca->cnt = 1;
155} 138}
156 139
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct sock *sk, int flag)
160{
161 const struct tcp_sock *tp = tcp_sk(sk);
162 struct bictcp *ca = inet_csk_ca(sk);
163 u32 dist, delay;
164
165 /* No time stamp */
166 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
167 /* Discard delay samples right after fast recovery */
168 tcp_time_stamp < ca->epoch_start + HZ ||
169 /* this delay samples may not be accurate */
170 flag == 0) {
171 ca->last_delay = 0;
172 goto notlow;
173 }
174
175 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
176 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
177 if (delay == 0) /* no previous delay sample */
178 goto notlow;
179
180 /* first time call or link delay decreases */
181 if (ca->delay_min == 0 || ca->delay_min > delay) {
182 ca->delay_min = ca->delay_max = delay;
183 goto notlow;
184 }
185
186 if (ca->delay_max < delay)
187 ca->delay_max = delay;
188
189 /* utilization is low, if avg delay < dist*threshold
190 for checking_period time */
191 dist = ca->delay_max - ca->delay_min;
192 if (dist <= ca->delay_min>>6 ||
193 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
194 goto notlow;
195
196 if (ca->low_utilization_start == 0) {
197 ca->low_utilization = 0;
198 ca->low_utilization_start = tcp_time_stamp;
199 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
200 > low_utilization_period*HZ) {
201 ca->low_utilization = 1;
202 }
203
204 return;
205
206 notlow:
207 ca->low_utilization = 0;
208 ca->low_utilization_start = 0;
209
210}
211
212static void bictcp_cong_avoid(struct sock *sk, u32 ack, 140static void bictcp_cong_avoid(struct sock *sk, u32 ack,
213 u32 seq_rtt, u32 in_flight, int data_acked) 141 u32 seq_rtt, u32 in_flight, int data_acked)
214{ 142{
215 struct tcp_sock *tp = tcp_sk(sk); 143 struct tcp_sock *tp = tcp_sk(sk);
216 struct bictcp *ca = inet_csk_ca(sk); 144 struct bictcp *ca = inet_csk_ca(sk);
217 145
218 bictcp_low_utilization(sk, data_acked);
219
220 if (!tcp_is_cwnd_limited(sk, in_flight)) 146 if (!tcp_is_cwnd_limited(sk, in_flight))
221 return; 147 return;
222 148
@@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
249 175
250 ca->epoch_start = 0; /* end of epoch */ 176 ca->epoch_start = 0; /* end of epoch */
251 177
252 /* in case of wrong delay_max*/
253 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
254 ca->delay_max = ca->delay_min
255 + ((ca->delay_max - ca->delay_min)* 90) / 100;
256
257 /* Wmax and fast convergence */ 178 /* Wmax and fast convergence */
258 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) 179 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
259 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) 180 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
@@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state)
289 bictcp_reset(inet_csk_ca(sk)); 210 bictcp_reset(inet_csk_ca(sk));
290} 211}
291 212
292/* Track delayed acknowledgement ratio using sliding window 213/* Track delayed acknowledgment ratio using sliding window
293 * ratio = (15*ratio + sample) / 16 214 * ratio = (15*ratio + sample) / 16
294 */ 215 */
295static void bictcp_acked(struct sock *sk, u32 cnt) 216static void bictcp_acked(struct sock *sk, u32 cnt)
296{ 217{
297 const struct inet_connection_sock *icsk = inet_csk(sk); 218 const struct inet_connection_sock *icsk = inet_csk(sk);
298 219
299 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { 220 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
300 struct bictcp *ca = inet_csk_ca(sk); 221 struct bictcp *ca = inet_csk_ca(sk);
301 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 222 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
302 ca->delayed_ack += cnt; 223 ca->delayed_ack += cnt;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c7cc62c8dc12..e688c687d62d 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
174 return err; 174 return err;
175} 175}
176 176
177
178/*
179 * Linear increase during slow start
180 */
181void tcp_slow_start(struct tcp_sock *tp)
182{
183 if (sysctl_tcp_abc) {
184 /* RFC3465: Slow Start
185 * TCP sender SHOULD increase cwnd by the number of
186 * previously unacknowledged bytes ACKed by each incoming
187 * acknowledgment, provided the increase is not more than L
188 */
189 if (tp->bytes_acked < tp->mss_cache)
190 return;
191
192 /* We MAY increase by 2 if discovered delayed ack */
193 if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
194 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
195 tp->snd_cwnd++;
196 }
197 }
198 tp->bytes_acked = 0;
199
200 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
201 tp->snd_cwnd++;
202}
203EXPORT_SYMBOL_GPL(tcp_slow_start);
204
177/* 205/*
178 * TCP Reno congestion control 206 * TCP Reno congestion control
179 * This is a special case used for fallback as well. 207
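tcp_slow_start() centralizes RFC 3465 Appropriate Byte Counting: with sysctl_tcp_abc set, cwnd is only raised after a full MSS of new data has been cumulatively ACKed, with one extra segment allowed when a single ACK covers more than two MSS (a delayed ACK). A standalone model of the same control flow (all names local to the sketch):

    #include <stdio.h>

    struct sim { unsigned cwnd, clamp, mss, bytes_acked; };

    static void slow_start(struct sim *tp, int abc)
    {
            if (abc) {
                    if (tp->bytes_acked < tp->mss)
                            return;                 /* not a full MSS yet */
                    if (abc > 1 && tp->bytes_acked > 2 * tp->mss &&
                        tp->cwnd < tp->clamp)
                            tp->cwnd++;             /* delayed-ACK bonus */
            }
            tp->bytes_acked = 0;
            if (tp->cwnd < tp->clamp)
                    tp->cwnd++;
    }

    int main(void)
    {
            struct sim tp = { .cwnd = 2, .clamp = 100, .mss = 1460 };

            tp.bytes_acked = 3 * tp.mss;            /* one stretch ACK */
            slow_start(&tp, 2);
            printf("cwnd = %u\n", tp.cwnd);         /* prints cwnd = 4 */
            return 0;
    }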
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 000000000000..31a4986dfbf7
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,411 @@
1/*
2 * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
3 *
4 * This is from the implementation of CUBIC TCP in
5 * Injong Rhee, Lisong Xu.
6 * "CUBIC: A New TCP-Friendly High-Speed TCP Variant
7 * in PFLDnet 2005
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
10 *
11 * Unless CUBIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19#include <asm/div64.h>
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
29
30static int fast_convergence = 1;
31static int max_increment = 16;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int initial_ssthresh = 100;
34static int bic_scale = 41;
35static int tcp_friendliness = 1;
36
37static u32 cube_rtt_scale;
38static u32 beta_scale;
39static u64 cube_factor;
40
41/* Note parameters that are used for precomputing scale factors are read-only */
42module_param(fast_convergence, int, 0644);
43MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
44module_param(max_increment, int, 0644);
45MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
46module_param(beta, int, 0444);
47MODULE_PARM_DESC(beta, "beta for multiplicative increase");
48module_param(initial_ssthresh, int, 0644);
49MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
50module_param(bic_scale, int, 0444);
51MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
52module_param(tcp_friendliness, int, 0644);
53MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
54
55
56
57/* BIC TCP Parameters */
58struct bictcp {
59 u32 cnt; /* increase cwnd by 1 after ACKs */
60 u32 last_max_cwnd; /* last maximum snd_cwnd */
61 u32 loss_cwnd; /* congestion window at last loss */
62 u32 last_cwnd; /* the last snd_cwnd */
63 u32 last_time; /* time when updated last_cwnd */
64 u32 bic_origin_point;/* origin point of bic function */
65 u32 bic_K; /* time to origin point from the beginning of the current epoch */
66 u32 delay_min; /* min delay */
67 u32 epoch_start; /* beginning of an epoch */
68 u32 ack_cnt; /* number of acks */
69 u32 tcp_cwnd; /* estimated tcp cwnd */
70#define ACK_RATIO_SHIFT 4
71 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
72};
73
74static inline void bictcp_reset(struct bictcp *ca)
75{
76 ca->cnt = 0;
77 ca->last_max_cwnd = 0;
78 ca->loss_cwnd = 0;
79 ca->last_cwnd = 0;
80 ca->last_time = 0;
81 ca->bic_origin_point = 0;
82 ca->bic_K = 0;
83 ca->delay_min = 0;
84 ca->epoch_start = 0;
85 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
86 ca->ack_cnt = 0;
87 ca->tcp_cwnd = 0;
88}
89
90static void bictcp_init(struct sock *sk)
91{
92 bictcp_reset(inet_csk_ca(sk));
93 if (initial_ssthresh)
94 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
95}
96
97/* 64bit divisor, dividend and result. dynamic precision */
98static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
99{
100 u_int32_t d = divisor;
101
102 if (divisor > 0xffffffffULL) {
103 unsigned int shift = fls(divisor >> 32);
104
105 d = divisor >> shift;
106 dividend >>= shift;
107 }
108
109 /* avoid 64 bit division if possible */
110 if (dividend >> 32)
111 do_div(dividend, d);
112 else
113 dividend = (uint32_t) dividend / d;
114
115 return dividend;
116}
117
118/*
119 * calculate the cubic root of x using Newton-Raphson
120 */
121static u32 cubic_root(u64 a)
122{
123 u32 x, x1;
124
125 /* Initial estimate is based on:
126 * cbrt(x) = exp(log(x) / 3)
127 */
128 x = 1u << (fls64(a)/3);
129
130 /*
131 * Iteration based on:
132 *
133 * x_{k+1} = ( 2 * x_k + a / x_k^2 ) / 3
134 *
135 */
136 do {
137 x1 = x;
138 x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
139 } while (abs(x1 - x) > 1);
140
141 return x;
142}
143
144/*
145 * Compute congestion window to use.
146 */
147static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
148{
149 u64 offs;
150 u32 delta, t, bic_target, min_cnt, max_cnt;
151
152 ca->ack_cnt++; /* count the number of ACKs */
153
154 if (ca->last_cwnd == cwnd &&
155 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
156 return;
157
158 ca->last_cwnd = cwnd;
159 ca->last_time = tcp_time_stamp;
160
161 if (ca->epoch_start == 0) {
162 ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
163 ca->ack_cnt = 1; /* start counting */
164 ca->tcp_cwnd = cwnd; /* sync with cubic */
165
166 if (ca->last_max_cwnd <= cwnd) {
167 ca->bic_K = 0;
168 ca->bic_origin_point = cwnd;
169 } else {
170 /* Compute new K based on
171 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
172 */
173 ca->bic_K = cubic_root(cube_factor
174 * (ca->last_max_cwnd - cwnd));
175 ca->bic_origin_point = ca->last_max_cwnd;
176 }
177 }
178
179 /* cubic function - calc*/
180 /* calculate c * time^3 / rtt,
181 * while considering overflow in calculation of time^3
182 * (so time^3 is done by using 64 bit)
183 * and without the support of division of 64bit numbers
184 * (so all divisions are done by using 32 bit)
185 * also NOTE the unit of those variables
186 * time = (t - K) / 2^bictcp_HZ
187 * c = bic_scale >> 10
188 * rtt = (srtt >> 3) / HZ
189 * !!! The following code does not have overflow problems,
190 * if the cwnd < 1 million packets !!!
191 */
192
193 /* change the unit from HZ to bictcp_HZ */
194 t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
195 << BICTCP_HZ) / HZ;
196
197 if (t < ca->bic_K) /* t - K */
198 offs = ca->bic_K - t;
199 else
200 offs = t - ca->bic_K;
201
202 /* c/rtt * (t-K)^3 */
203 delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
204 if (t < ca->bic_K) /* below origin*/
205 bic_target = ca->bic_origin_point - delta;
206 else /* above origin*/
207 bic_target = ca->bic_origin_point + delta;
208
209 /* cubic function - calc bictcp_cnt*/
210 if (bic_target > cwnd) {
211 ca->cnt = cwnd / (bic_target - cwnd);
212 } else {
213 ca->cnt = 100 * cwnd; /* very small increment*/
214 }
215
216 if (ca->delay_min > 0) {
217 /* max increment = Smax * rtt / 0.1 */
218 min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
219 if (ca->cnt < min_cnt)
220 ca->cnt = min_cnt;
221 }
222
223 /* slow start and low utilization */
224 if (ca->loss_cwnd == 0) /* could be aggressive in slow start */
225 ca->cnt = 50;
226
227 /* TCP Friendly */
228 if (tcp_friendliness) {
229 u32 scale = beta_scale;
230 delta = (cwnd * scale) >> 3;
231 while (ca->ack_cnt > delta) { /* update tcp cwnd */
232 ca->ack_cnt -= delta;
233 ca->tcp_cwnd++;
234 }
235
236 if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
237 delta = ca->tcp_cwnd - cwnd;
238 max_cnt = cwnd / delta;
239 if (ca->cnt > max_cnt)
240 ca->cnt = max_cnt;
241 }
242 }
243
244 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
245 if (ca->cnt == 0) /* cannot be zero */
246 ca->cnt = 1;
247}
248
249
250/* Keep track of minimum rtt */
251static inline void measure_delay(struct sock *sk)
252{
253 const struct tcp_sock *tp = tcp_sk(sk);
254 struct bictcp *ca = inet_csk_ca(sk);
255 u32 delay;
256
257 /* No time stamp */
258 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
259 /* Discard delay samples right after fast recovery */
260 (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
261 return;
262
263 delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
264 if (delay == 0)
265 delay = 1;
266
267 /* first time call or link delay decreases */
268 if (ca->delay_min == 0 || ca->delay_min > delay)
269 ca->delay_min = delay;
270}
271
272static void bictcp_cong_avoid(struct sock *sk, u32 ack,
273 u32 seq_rtt, u32 in_flight, int data_acked)
274{
275 struct tcp_sock *tp = tcp_sk(sk);
276 struct bictcp *ca = inet_csk_ca(sk);
277
278 if (data_acked)
279 measure_delay(sk);
280
281 if (!tcp_is_cwnd_limited(sk, in_flight))
282 return;
283
284 if (tp->snd_cwnd <= tp->snd_ssthresh)
285 tcp_slow_start(tp);
286 else {
287 bictcp_update(ca, tp->snd_cwnd);
288
289 /* In dangerous area, increase slowly.
290 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
291 */
292 if (tp->snd_cwnd_cnt >= ca->cnt) {
293 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
294 tp->snd_cwnd++;
295 tp->snd_cwnd_cnt = 0;
296 } else
297 tp->snd_cwnd_cnt++;
298 }
299
300}
301
302static u32 bictcp_recalc_ssthresh(struct sock *sk)
303{
304 const struct tcp_sock *tp = tcp_sk(sk);
305 struct bictcp *ca = inet_csk_ca(sk);
306
307 ca->epoch_start = 0; /* end of epoch */
308
309 /* Wmax and fast convergence */
310 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
311 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
312 / (2 * BICTCP_BETA_SCALE);
313 else
314 ca->last_max_cwnd = tp->snd_cwnd;
315
316 ca->loss_cwnd = tp->snd_cwnd;
317
318 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
319}
320
321static u32 bictcp_undo_cwnd(struct sock *sk)
322{
323 struct bictcp *ca = inet_csk_ca(sk);
324
325 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
326}
327
328static u32 bictcp_min_cwnd(struct sock *sk)
329{
330 return tcp_sk(sk)->snd_ssthresh;
331}
332
333static void bictcp_state(struct sock *sk, u8 new_state)
334{
335 if (new_state == TCP_CA_Loss)
336 bictcp_reset(inet_csk_ca(sk));
337}
338
339/* Track delayed acknowledgment ratio using sliding window
340 * ratio = (15*ratio + sample) / 16
341 */
342static void bictcp_acked(struct sock *sk, u32 cnt)
343{
344 const struct inet_connection_sock *icsk = inet_csk(sk);
345
346 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
347 struct bictcp *ca = inet_csk_ca(sk);
348 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
349 ca->delayed_ack += cnt;
350 }
351}
352
353
354static struct tcp_congestion_ops cubictcp = {
355 .init = bictcp_init,
356 .ssthresh = bictcp_recalc_ssthresh,
357 .cong_avoid = bictcp_cong_avoid,
358 .set_state = bictcp_state,
359 .undo_cwnd = bictcp_undo_cwnd,
360 .min_cwnd = bictcp_min_cwnd,
361 .pkts_acked = bictcp_acked,
362 .owner = THIS_MODULE,
363 .name = "cubic",
364};
365
366static int __init cubictcp_register(void)
367{
368 BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
369
370 /* Precompute a bunch of the scaling factors that are used per-packet
371 * based on SRTT of 100ms
372 */
373
374 beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
375
376 cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */
377
378 /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
379 * so K = cubic_root( (wmax-cwnd)*rtt/c )
380 * the unit of K is bictcp_HZ=2^10, not HZ
381 *
382 * c = bic_scale >> 10
383 * rtt = 100ms
384 *
385 * the following code has been designed and tested for
386 * cwnd < 1 million packets
387 * RTT < 100 seconds
388 * HZ < 1,000,000 (corresponding to a 1 microsecond tick)
389 */
390
391 /* 1/c * 2^(3*bictcp_HZ) * srtt */
392 cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
393
394 /* divide by bic_scale and by constant Srtt (100ms) */
395 do_div(cube_factor, bic_scale * 10);
396
397 return tcp_register_congestion_control(&cubictcp);
398}
399
400static void __exit cubictcp_unregister(void)
401{
402 tcp_unregister_congestion_control(&cubictcp);
403}
404
405module_init(cubictcp_register);
406module_exit(cubictcp_unregister);
407
408MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
409MODULE_LICENSE("GPL");
410MODULE_DESCRIPTION("CUBIC TCP");
411MODULE_VERSION("2.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e23086bce..0a461232329f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1;
115/* Adapt the MSS value used to make delayed ack decision to the 115/* Adapt the MSS value used to make delayed ack decision to the
116 * real world. 116 * real world.
117 */ 117 */
118static inline void tcp_measure_rcv_mss(struct sock *sk, 118static void tcp_measure_rcv_mss(struct sock *sk,
119 const struct sk_buff *skb) 119 const struct sk_buff *skb)
120{ 120{
121 struct inet_connection_sock *icsk = inet_csk(sk); 121 struct inet_connection_sock *icsk = inet_csk(sk);
122 const unsigned int lss = icsk->icsk_ack.last_seg_size; 122 const unsigned int lss = icsk->icsk_ack.last_seg_size;
@@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
246 return 0; 246 return 0;
247} 247}
248 248
249static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, 249static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
250 struct sk_buff *skb) 250 struct sk_buff *skb)
251{ 251{
252 /* Check #1 */ 252 /* Check #1 */
253 if (tp->rcv_ssthresh < tp->window_clamp && 253 if (tp->rcv_ssthresh < tp->window_clamp &&
@@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
341 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); 341 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
342} 342}
343 343
344
345/* Initialize RCV_MSS value.
346 * RCV_MSS is our guess about the MSS used by the peer.
347 * We have no direct information about the MSS.
348 * It's better to underestimate RCV_MSS rather than overestimate it.
349 * Overestimation makes us ACK less frequently than needed.
350 * Underestimation is easier to detect and fix by tcp_measure_rcv_mss().
351 */
352void tcp_initialize_rcv_mss(struct sock *sk)
353{
354 struct tcp_sock *tp = tcp_sk(sk);
355 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
356
357 hint = min(hint, tp->rcv_wnd/2);
358 hint = min(hint, TCP_MIN_RCVMSS);
359 hint = max(hint, TCP_MIN_MSS);
360
361 inet_csk(sk)->icsk_ack.rcv_mss = hint;
362}
363
344/* Receiver "autotuning" code. 364/* Receiver "autotuning" code.
345 * 365 *
346 * The algorithm for RTT estimation w/o timestamps is based on 366 * The algorithm for RTT estimation w/o timestamps is based on
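tcp_initialize_rcv_mss() deliberately starts low: the guess is the smaller of advmss and mss_cache, capped at half the receive window and at TCP_MIN_RCVMSS, with TCP_MIN_MSS as the floor. A worked userspace version (the two TCP_MIN_* values below are the conventional kernel constants, taken here as assumptions):

    #include <stdio.h>

    #define TCP_MIN_MSS    88u      /* assumed kernel constant */
    #define TCP_MIN_RCVMSS 536u     /* assumed kernel constant */

    static unsigned int umin(unsigned int a, unsigned int b) { return a < b ? a : b; }
    static unsigned int umax(unsigned int a, unsigned int b) { return a > b ? a : b; }

    static unsigned int initial_rcv_mss(unsigned int advmss,
                                        unsigned int mss_cache,
                                        unsigned int rcv_wnd)
    {
            unsigned int hint = umin(advmss, mss_cache);

            hint = umin(hint, rcv_wnd / 2);     /* at most half the window */
            hint = umin(hint, TCP_MIN_RCVMSS);  /* deliberate underestimate */
            return umax(hint, TCP_MIN_MSS);     /* sane floor */
    }

    int main(void)
    {
            /* Ethernet-sized peer: the guess saturates at 536. */
            printf("%u\n", initial_rcv_mss(1460, 1460, 65535));
            return 0;
    }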
@@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
735 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 755 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
736} 756}
737 757
758/* Set slow start threshold and cwnd not falling to slow start */
759void tcp_enter_cwr(struct sock *sk)
760{
761 struct tcp_sock *tp = tcp_sk(sk);
762
763 tp->prior_ssthresh = 0;
764 tp->bytes_acked = 0;
765 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
766 tp->undo_marker = 0;
767 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
768 tp->snd_cwnd = min(tp->snd_cwnd,
769 tcp_packets_in_flight(tp) + 1U);
770 tp->snd_cwnd_cnt = 0;
771 tp->high_seq = tp->snd_nxt;
772 tp->snd_cwnd_stamp = tcp_time_stamp;
773 TCP_ECN_queue_cwr(tp);
774
775 tcp_set_ca_state(sk, TCP_CA_CWR);
776 }
777}
778
738/* Initialize metrics on socket. */ 779/* Initialize metrics on socket. */
739 780
740static void tcp_init_metrics(struct sock *sk) 781static void tcp_init_metrics(struct sock *sk)
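tcp_enter_cwr() pulls cwnd down to at most one segment above what is actually in flight, so the window stops growing while the ACK clock keeps ticking, and records high_seq so the CWR episode ends once that sequence is cumulatively ACKed. The window adjustment in isolation (a toy model, not the kernel function):

    #include <stdio.h>

    static unsigned int cwr_cwnd(unsigned int snd_cwnd, unsigned int in_flight)
    {
            /* keep what is outstanding, plus one segment for the ACK clock */
            return snd_cwnd < in_flight + 1 ? snd_cwnd : in_flight + 1;
    }

    int main(void)
    {
            printf("%u\n", cwr_cwnd(40, 12));       /* prints 13 */
            return 0;
    }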
@@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
2070 tcp_ack_no_tstamp(sk, seq_rtt, flag); 2111 tcp_ack_no_tstamp(sk, seq_rtt, flag);
2071} 2112}
2072 2113
2073static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, 2114static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
2074 u32 in_flight, int good) 2115 u32 in_flight, int good)
2075{ 2116{
2076 const struct inet_connection_sock *icsk = inet_csk(sk); 2117 const struct inet_connection_sock *icsk = inet_csk(sk);
2077 icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); 2118 icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
@@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
2082 * RFC2988 recommends to restart timer to now+rto. 2123 * RFC2988 recommends to restart timer to now+rto.
2083 */ 2124 */
2084 2125
2085static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) 2126static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2086{ 2127{
2087 if (!tp->packets_out) { 2128 if (!tp->packets_out) {
2088 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 2129 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
@@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2147 return acked; 2188 return acked;
2148} 2189}
2149 2190
2150static inline u32 tcp_usrtt(const struct sk_buff *skb) 2191static u32 tcp_usrtt(const struct sk_buff *skb)
2151{ 2192{
2152 struct timeval tv, now; 2193 struct timeval tv, now;
2153 2194
@@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
2342 2383
2343 if (nwin > tp->max_window) { 2384 if (nwin > tp->max_window) {
2344 tp->max_window = nwin; 2385 tp->max_window = nwin;
2345 tcp_sync_mss(sk, tp->pmtu_cookie); 2386 tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
2346 } 2387 }
2347 } 2388 }
2348 } 2389 }
@@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2583/* Fast parse options. This hopes to only see timestamps. 2624/* Fast parse options. This hopes to only see timestamps.
2584 * If it is wrong it falls back on tcp_parse_options(). 2625 * If it is wrong it falls back on tcp_parse_options().
2585 */ 2626 */
2586static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, 2627static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
2587 struct tcp_sock *tp) 2628 struct tcp_sock *tp)
2588{ 2629{
2589 if (th->doff == sizeof(struct tcphdr)>>2) { 2630 if (th->doff == sizeof(struct tcphdr)>>2) {
2590 tp->rx_opt.saw_tstamp = 0; 2631 tp->rx_opt.saw_tstamp = 0;
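
The fast path in tcp_fast_parse_options() keys off th->doff, the TCP header length counted in 32-bit words: a bare 20-byte header gives doff 5 (no options at all), while a header carrying only the aligned timestamp option (NOP, NOP, 10-byte timestamp = 12 bytes) gives doff 8, the predicted case. The arithmetic as a standalone sketch, with the constants written out rather than pulled from kernel headers:

    #include <stdio.h>

    #define TCPHDR_LEN              20 /* sizeof(struct tcphdr) */
    #define TCPOLEN_TSTAMP_ALIGNED  12 /* NOP + NOP + 10-byte timestamp */

    int main(void)
    {
        /* doff counts 32-bit words, so byte lengths are divided by 4. */
        unsigned int doff_plain  = TCPHDR_LEN >> 2;                            /* 5 */
        unsigned int doff_tstamp = (TCPHDR_LEN + TCPOLEN_TSTAMP_ALIGNED) >> 2; /* 8 */

        printf("no options: doff=%u, timestamps only: doff=%u\n",
               doff_plain, doff_tstamp);
        return 0;
    }
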
@@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
2804 } 2845 }
2805} 2846}
2806 2847
2807static __inline__ int 2848static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2808tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2809{ 2849{
2810 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { 2850 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
2811 if (before(seq, sp->start_seq)) 2851 if (before(seq, sp->start_seq))
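
tcp_sack_extend() leans on the wrap-safe sequence comparisons before() and after(), which work by signed 32-bit distance. A self-contained sketch of both plus the extend rule; the struct below is a stand-in for tcp_sack_block:

    #include <stdio.h>
    #include <stdint.h>

    struct sack_block { uint32_t start_seq, end_seq; };

    /* Wrap-safe: a is "before" b if the signed distance is negative. */
    static int before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
    static int after(uint32_t a, uint32_t b)  { return before(b, a); }

    /* Grow sp to cover [seq, end_seq] if the ranges touch or overlap. */
    static int sack_extend(struct sack_block *sp, uint32_t seq, uint32_t end_seq)
    {
        if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
            if (before(seq, sp->start_seq))
                sp->start_seq = seq;
            if (after(end_seq, sp->end_seq))
                sp->end_seq = end_seq;
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        /* Works across the 2^32 wrap point, too. */
        struct sack_block sp = { 0xfffffff0u, 0x00000010u };

        printf("merged=%d [%#x, %#x]\n",
               sack_extend(&sp, 0x00000008u, 0x00000020u),
               sp.start_seq, sp.end_seq); /* merged=1 [0xfffffff0, 0x20] */
        return 0;
    }
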
@@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2817 return 0; 2857 return 0;
2818} 2858}
2819 2859
2820static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) 2860static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
2821{ 2861{
2822 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { 2862 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
2823 if (before(seq, tp->rcv_nxt)) 2863 if (before(seq, tp->rcv_nxt))
@@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
2832 } 2872 }
2833} 2873}
2834 2874
2835static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) 2875static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
2836{ 2876{
2837 if (!tp->rx_opt.dsack) 2877 if (!tp->rx_opt.dsack)
2838 tcp_dsack_set(tp, seq, end_seq); 2878 tcp_dsack_set(tp, seq, end_seq);
@@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
2890 } 2930 }
2891} 2931}
2892 2932
2893static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) 2933static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
2894{ 2934{
2895 __u32 tmp; 2935 __u32 tmp;
2896 2936
@@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
3455 tp->snd_cwnd_stamp = tcp_time_stamp; 3495 tp->snd_cwnd_stamp = tcp_time_stamp;
3456} 3496}
3457 3497
3458static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) 3498static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3459{ 3499{
3460 /* If the user specified a specific send buffer setting, do 3500 /* If the user specified a specific send buffer setting, do
3461 * not modify it. 3501 * not modify it.
@@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk)
3502 sk->sk_write_space(sk); 3542 sk->sk_write_space(sk);
3503} 3543}
3504 3544
3505static inline void tcp_check_space(struct sock *sk) 3545static void tcp_check_space(struct sock *sk)
3506{ 3546{
3507 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { 3547 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3508 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); 3548 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
@@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk)
3512 } 3552 }
3513} 3553}
3514 3554
3515static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) 3555static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3516{ 3556{
3517 tcp_push_pending_frames(sk, tp); 3557 tcp_push_pending_frames(sk, tp);
3518 tcp_check_space(sk); 3558 tcp_check_space(sk);
@@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3544 } 3584 }
3545} 3585}
3546 3586
3547static __inline__ void tcp_ack_snd_check(struct sock *sk) 3587static inline void tcp_ack_snd_check(struct sock *sk)
3548{ 3588{
3549 if (!inet_csk_ack_scheduled(sk)) { 3589 if (!inet_csk_ack_scheduled(sk)) {
3550 /* We sent a data segment already. */ 3590 /* We sent a data segment already. */
@@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3692 return result; 3732 return result;
3693} 3733}
3694 3734
3695static __inline__ int 3735static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3696tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3697{ 3736{
3698 return skb->ip_summed != CHECKSUM_UNNECESSARY && 3737 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
3699 __tcp_checksum_complete_user(sk, skb); 3738 __tcp_checksum_complete_user(sk, skb);
@@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3967 struct tcphdr *th, unsigned len) 4006 struct tcphdr *th, unsigned len)
3968{ 4007{
3969 struct tcp_sock *tp = tcp_sk(sk); 4008 struct tcp_sock *tp = tcp_sk(sk);
4009 struct inet_connection_sock *icsk = inet_csk(sk);
3970 int saved_clamp = tp->rx_opt.mss_clamp; 4010 int saved_clamp = tp->rx_opt.mss_clamp;
3971 4011
3972 tcp_parse_options(skb, &tp->rx_opt, 0); 4012 tcp_parse_options(skb, &tp->rx_opt, 0);
3973 4013
3974 if (th->ack) { 4014 if (th->ack) {
3975 struct inet_connection_sock *icsk;
3976 /* rfc793: 4015 /* rfc793:
3977 * "If the state is SYN-SENT then 4016 * "If the state is SYN-SENT then
3978 * first check the ACK bit 4017 * first check the ACK bit
@@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4061 if (tp->rx_opt.sack_ok && sysctl_tcp_fack) 4100 if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
4062 tp->rx_opt.sack_ok |= 2; 4101 tp->rx_opt.sack_ok |= 2;
4063 4102
4064 tcp_sync_mss(sk, tp->pmtu_cookie); 4103 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
4065 tcp_initialize_rcv_mss(sk); 4104 tcp_initialize_rcv_mss(sk);
4066 4105
4067 /* Remember, tcp_poll() does not lock socket! 4106 /* Remember, tcp_poll() does not lock socket!
@@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4072 tcp_set_state(sk, TCP_ESTABLISHED); 4111 tcp_set_state(sk, TCP_ESTABLISHED);
4073 4112
4074 /* Make sure socket is routed, for correct metrics. */ 4113 /* Make sure socket is routed, for correct metrics. */
4075 tp->af_specific->rebuild_header(sk); 4114 icsk->icsk_af_ops->rebuild_header(sk);
4076 4115
4077 tcp_init_metrics(sk); 4116 tcp_init_metrics(sk);
4078 4117
@@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4098 sk_wake_async(sk, 0, POLL_OUT); 4137 sk_wake_async(sk, 0, POLL_OUT);
4099 } 4138 }
4100 4139
4101 icsk = inet_csk(sk);
4102
4103 if (sk->sk_write_pending || 4140 if (sk->sk_write_pending ||
4104 icsk->icsk_accept_queue.rskq_defer_accept || 4141 icsk->icsk_accept_queue.rskq_defer_accept ||
4105 icsk->icsk_ack.pingpong) { 4142 icsk->icsk_ack.pingpong) {
@@ -4173,7 +4210,7 @@ discard:
4173 if (tp->ecn_flags&TCP_ECN_OK) 4210 if (tp->ecn_flags&TCP_ECN_OK)
4174 sock_set_flag(sk, SOCK_NO_LARGESEND); 4211 sock_set_flag(sk, SOCK_NO_LARGESEND);
4175 4212
4176 tcp_sync_mss(sk, tp->pmtu_cookie); 4213 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
4177 tcp_initialize_rcv_mss(sk); 4214 tcp_initialize_rcv_mss(sk);
4178 4215
4179 4216
@@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4220 struct tcphdr *th, unsigned len) 4257 struct tcphdr *th, unsigned len)
4221{ 4258{
4222 struct tcp_sock *tp = tcp_sk(sk); 4259 struct tcp_sock *tp = tcp_sk(sk);
4260 struct inet_connection_sock *icsk = inet_csk(sk);
4223 int queued = 0; 4261 int queued = 0;
4224 4262
4225 tp->rx_opt.saw_tstamp = 0; 4263 tp->rx_opt.saw_tstamp = 0;
@@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4236 goto discard; 4274 goto discard;
4237 4275
4238 if(th->syn) { 4276 if(th->syn) {
4239 if(tp->af_specific->conn_request(sk, skb) < 0) 4277 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
4240 return 1; 4278 return 1;
4241 4279
4242 /* Now we have several options: In theory there is 4280 /* Now we have several options: In theory there is
@@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4349 /* Make sure socket is routed, for 4387 /* Make sure socket is routed, for
4350 * correct metrics. 4388 * correct metrics.
4351 */ 4389 */
4352 tp->af_specific->rebuild_header(sk); 4390 icsk->icsk_af_ops->rebuild_header(sk);
4353 4391
4354 tcp_init_metrics(sk); 4392 tcp_init_metrics(sk);
4355 4393
@@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc);
4475EXPORT_SYMBOL(tcp_parse_options); 4513EXPORT_SYMBOL(tcp_parse_options);
4476EXPORT_SYMBOL(tcp_rcv_established); 4514EXPORT_SYMBOL(tcp_rcv_established);
4477EXPORT_SYMBOL(tcp_rcv_state_process); 4515EXPORT_SYMBOL(tcp_rcv_state_process);
4516EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e1929b..e9f83e5b28ce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -69,6 +69,7 @@
69#include <net/transp_v6.h> 69#include <net/transp_v6.h>
70#include <net/ipv6.h> 70#include <net/ipv6.h>
71#include <net/inet_common.h> 71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
72#include <net/xfrm.h> 73#include <net/xfrm.h>
73 74
74#include <linux/inet.h> 75#include <linux/inet.h>
@@ -86,8 +87,7 @@ int sysctl_tcp_low_latency;
86/* Socket used for sending RSTs */ 87/* Socket used for sending RSTs */
87static struct socket *tcp_socket; 88static struct socket *tcp_socket;
88 89
89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 90void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
90 struct sk_buff *skb);
91 91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { 92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED, 93 .lhash_lock = RW_LOCK_UNLOCKED,
@@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
97 97
98static int tcp_v4_get_port(struct sock *sk, unsigned short snum) 98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99{ 99{
100 return inet_csk_get_port(&tcp_hashinfo, sk, snum); 100 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101 inet_csk_bind_conflict);
101} 102}
102 103
103static void tcp_v4_hash(struct sock *sk) 104static void tcp_v4_hash(struct sock *sk)
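
The one-line change to tcp_v4_get_port() is the visible edge of the refactor: inet_csk_get_port() now takes the conflict check as a callback, so IPv4 and IPv6 can share one port allocator while each supplies its own address-comparison policy (inet_csk_bind_conflict here, inet6_csk_bind_conflict in the new IPv6 file below). A rough sketch of the callback pattern, with hypothetical stand-in types:

    #include <stdio.h>

    struct fake_sock { int family; unsigned int addr; };

    /* Caller-supplied policy: does sk clash with an existing owner? */
    typedef int (*bind_conflict_fn)(const struct fake_sock *sk,
                                    const struct fake_sock *owner);

    static int v4_conflict(const struct fake_sock *sk,
                           const struct fake_sock *owner)
    {
        return sk->addr == owner->addr;
    }

    /* Shared allocator: mechanism here, policy via the callback. */
    static int get_port(const struct fake_sock *sk,
                        const struct fake_sock *owners, int n,
                        bind_conflict_fn conflict)
    {
        for (int i = 0; i < n; i++)
            if (conflict(sk, &owners[i]))
                return -1; /* roughly EADDRINUSE */
        return 0;
    }

    int main(void)
    {
        struct fake_sock owners[] = { { 2, 0x7f000001 } };
        struct fake_sock sk = { 2, 0x7f000002 };

        printf("bind: %d\n", get_port(&sk, owners, 1, v4_conflict)); /* 0 */
        return 0;
    }
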
@@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
118 skb->h.th->source); 119 skb->h.th->source);
119} 120}
120 121
121/* called with local bh disabled */ 122int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
122static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
123 struct inet_timewait_sock **twp)
124{ 123{
125 struct inet_sock *inet = inet_sk(sk); 124 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
126 u32 daddr = inet->rcv_saddr; 125 struct tcp_sock *tp = tcp_sk(sk);
127 u32 saddr = inet->daddr;
128 int dif = sk->sk_bound_dev_if;
129 INET_ADDR_COOKIE(acookie, saddr, daddr)
130 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
131 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
132 struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
133 struct sock *sk2;
134 const struct hlist_node *node;
135 struct inet_timewait_sock *tw;
136
137 prefetch(head->chain.first);
138 write_lock(&head->lock);
139
140 /* Check TIME-WAIT sockets first. */
141 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
142 tw = inet_twsk(sk2);
143
144 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
145 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
146 struct tcp_sock *tp = tcp_sk(sk);
147
148 /* With PAWS, it is safe from the viewpoint
149 of data integrity. Even without PAWS it
150 is safe provided sequence spaces do not
151 overlap i.e. at data rates <= 80Mbit/sec.
152
153 Actually, the idea is close to VJ's one,
154 only timestamp cache is held not per host,
155 but per port pair and TW bucket is used
156 as state holder.
157 126
158 If TW bucket has been already destroyed we 127 /* With PAWS, it is safe from the viewpoint
159 fall back to VJ's scheme and use initial 128 of data integrity. Even without PAWS it is safe provided sequence
160 timestamp retrieved from peer table. 129 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
161 */
162 if (tcptw->tw_ts_recent_stamp &&
163 (!twp || (sysctl_tcp_tw_reuse &&
164 xtime.tv_sec -
165 tcptw->tw_ts_recent_stamp > 1))) {
166 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
167 if (tp->write_seq == 0)
168 tp->write_seq = 1;
169 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
170 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
171 sock_hold(sk2);
172 goto unique;
173 } else
174 goto not_unique;
175 }
176 }
177 tw = NULL;
178 130
179 /* And established part... */ 131 Actually, the idea is close to VJ's one, only timestamp cache is
180 sk_for_each(sk2, node, &head->chain) { 132 held not per host, but per port pair and TW bucket is used as state
181 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) 133 holder.
182 goto not_unique;
183 }
184 134
185unique: 135 If TW bucket has been already destroyed we fall back to VJ's scheme
186 /* Must record num and sport now. Otherwise we will see 136 and use initial timestamp retrieved from peer table.
187 * in hash table socket with a funny identity. */ 137 */
188 inet->num = lport; 138 if (tcptw->tw_ts_recent_stamp &&
189 inet->sport = htons(lport); 139 (twp == NULL || (sysctl_tcp_tw_reuse &&
190 sk->sk_hash = hash; 140 xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
191 BUG_TRAP(sk_unhashed(sk)); 141 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
192 __sk_add_node(sk, &head->chain); 142 if (tp->write_seq == 0)
193 sock_prot_inc_use(sk->sk_prot); 143 tp->write_seq = 1;
194 write_unlock(&head->lock); 144 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
195 145 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
196 if (twp) { 146 sock_hold(sktw);
197 *twp = tw; 147 return 1;
198 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
199 } else if (tw) {
200 /* Silly. Should hash-dance instead... */
201 inet_twsk_deschedule(tw, &tcp_death_row);
202 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
203
204 inet_twsk_put(tw);
205 } 148 }
206 149
207 return 0; 150 return 0;
208
209not_unique:
210 write_unlock(&head->lock);
211 return -EADDRNOTAVAIL;
212} 151}
213 152
214static inline u32 connect_port_offset(const struct sock *sk) 153EXPORT_SYMBOL_GPL(tcp_twsk_unique);
215{
216 const struct inet_sock *inet = inet_sk(sk);
217
218 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
219 inet->dport);
220}
221
222/*
223 * Bind a port for a connect operation and hash it.
224 */
225static inline int tcp_v4_hash_connect(struct sock *sk)
226{
227 const unsigned short snum = inet_sk(sk)->num;
228 struct inet_bind_hashbucket *head;
229 struct inet_bind_bucket *tb;
230 int ret;
231
232 if (!snum) {
233 int low = sysctl_local_port_range[0];
234 int high = sysctl_local_port_range[1];
235 int range = high - low;
236 int i;
237 int port;
238 static u32 hint;
239 u32 offset = hint + connect_port_offset(sk);
240 struct hlist_node *node;
241 struct inet_timewait_sock *tw = NULL;
242
243 local_bh_disable();
244 for (i = 1; i <= range; i++) {
245 port = low + (i + offset) % range;
246 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
247 spin_lock(&head->lock);
248
249 /* Does not bother with rcv_saddr checks,
250 * because the established check is already
251 * unique enough.
252 */
253 inet_bind_bucket_for_each(tb, node, &head->chain) {
254 if (tb->port == port) {
255 BUG_TRAP(!hlist_empty(&tb->owners));
256 if (tb->fastreuse >= 0)
257 goto next_port;
258 if (!__tcp_v4_check_established(sk,
259 port,
260 &tw))
261 goto ok;
262 goto next_port;
263 }
264 }
265
266 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
267 if (!tb) {
268 spin_unlock(&head->lock);
269 break;
270 }
271 tb->fastreuse = -1;
272 goto ok;
273
274 next_port:
275 spin_unlock(&head->lock);
276 }
277 local_bh_enable();
278
279 return -EADDRNOTAVAIL;
280
281ok:
282 hint += i;
283
284 /* Head lock still held and bh's disabled */
285 inet_bind_hash(sk, tb, port);
286 if (sk_unhashed(sk)) {
287 inet_sk(sk)->sport = htons(port);
288 __inet_hash(&tcp_hashinfo, sk, 0);
289 }
290 spin_unlock(&head->lock);
291
292 if (tw) {
293 inet_twsk_deschedule(tw, &tcp_death_row);;
294 inet_twsk_put(tw);
295 }
296
297 ret = 0;
298 goto out;
299 }
300
301 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
302 tb = inet_csk(sk)->icsk_bind_hash;
303 spin_lock_bh(&head->lock);
304 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
305 __inet_hash(&tcp_hashinfo, sk, 0);
306 spin_unlock_bh(&head->lock);
307 return 0;
308 } else {
309 spin_unlock(&head->lock);
310 /* No definite answer... Walk to established hash table */
311 ret = __tcp_v4_check_established(sk, snum, NULL);
312out:
313 local_bh_enable();
314 return ret;
315 }
316}
317 154
318/* This will initiate an outgoing connection. */ 155/* This will initiate an outgoing connection. */
319int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 156int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
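
The core of the extracted tcp_twsk_unique() is the initial-sequence rule for recycling a TIME-WAIT identity: the new connection must start past anything the old incarnation could still have in flight, hence tw_snd_nxt + 65535 + 2 (a full maximum window plus slack), with 0 stepped over because write_seq == 0 doubles as "unset". Just that computation, as a standalone sketch:

    #include <stdio.h>
    #include <stdint.h>

    /* Pick a safe first sequence number when recycling a TIME-WAIT pair. */
    static uint32_t reuse_write_seq(uint32_t tw_snd_nxt)
    {
        uint32_t seq = tw_snd_nxt + 65535 + 2; /* past any old in-flight data */

        if (seq == 0) /* 0 doubles as "not set", so step over it */
            seq = 1;
        return seq;
    }

    int main(void)
    {
        printf("%u\n", reuse_write_seq(1000));                /* 66537 */
        printf("%u\n", reuse_write_seq(0xffffffffu - 65536)); /* wraps to 0 -> 1 */
        return 0;
    }
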
@@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
383 inet->dport = usin->sin_port; 220 inet->dport = usin->sin_port;
384 inet->daddr = daddr; 221 inet->daddr = daddr;
385 222
386 tp->ext_header_len = 0; 223 inet_csk(sk)->icsk_ext_hdr_len = 0;
387 if (inet->opt) 224 if (inet->opt)
388 tp->ext_header_len = inet->opt->optlen; 225 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
389 226
390 tp->rx_opt.mss_clamp = 536; 227 tp->rx_opt.mss_clamp = 536;
391 228
@@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
395 * complete initialization after this. 232 * complete initialization after this.
396 */ 233 */
397 tcp_set_state(sk, TCP_SYN_SENT); 234 tcp_set_state(sk, TCP_SYN_SENT);
398 err = tcp_v4_hash_connect(sk); 235 err = inet_hash_connect(&tcp_death_row, sk);
399 if (err) 236 if (err)
400 goto failure; 237 goto failure;
401 238
@@ -433,12 +270,10 @@ failure:
433/* 270/*
434 * This routine does path mtu discovery as defined in RFC1191. 271 * This routine does path mtu discovery as defined in RFC1191.
435 */ 272 */
436static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, 273static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
437 u32 mtu)
438{ 274{
439 struct dst_entry *dst; 275 struct dst_entry *dst;
440 struct inet_sock *inet = inet_sk(sk); 276 struct inet_sock *inet = inet_sk(sk);
441 struct tcp_sock *tp = tcp_sk(sk);
442 277
443 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs 278 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 444 * sent out by Linux are always < 576 bytes so they should go through 279 * sent out by Linux are always < 576 bytes so they should go through
@@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
467 mtu = dst_mtu(dst); 302 mtu = dst_mtu(dst);
468 303
469 if (inet->pmtudisc != IP_PMTUDISC_DONT && 304 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
470 tp->pmtu_cookie > mtu) { 305 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
471 tcp_sync_mss(sk, mtu); 306 tcp_sync_mss(sk, mtu);
472 307
473 /* Resend the TCP packet because it's 308 /* Resend the TCP packet because it's
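
do_pmtu_discovery() only resynchronizes the MSS when the cached path MTU is now too large for the route. A sketch of that guard with stand-in fields (the pmtudisc modes are reduced to a single flag here):

    #include <stdio.h>

    struct pmtu_state {
        int pmtu_discovery_on;    /* stand-in for pmtudisc != IP_PMTUDISC_DONT */
        unsigned int pmtu_cookie; /* last path MTU this socket synced to */
    };

    /* Returns 1 when the caller should resync MSS (and retransmit). */
    static int needs_mss_resync(const struct pmtu_state *s, unsigned int new_mtu)
    {
        return s->pmtu_discovery_on && s->pmtu_cookie > new_mtu;
    }

    int main(void)
    {
        struct pmtu_state s = { 1, 1500 };

        printf("%d %d\n", needs_mss_resync(&s, 1400),  /* 1: path shrank */
                          needs_mss_resync(&s, 1500)); /* 0: unchanged */
        return 0;
    }
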
@@ -644,10 +479,10 @@ out:
644} 479}
645 480
646/* This routine computes an IPv4 TCP checksum. */ 481/* This routine computes an IPv4 TCP checksum. */
647void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 482void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
648 struct sk_buff *skb)
649{ 483{
650 struct inet_sock *inet = inet_sk(sk); 484 struct inet_sock *inet = inet_sk(sk);
485 struct tcphdr *th = skb->h.th;
651 486
652 if (skb->ip_summed == CHECKSUM_HW) { 487 if (skb->ip_summed == CHECKSUM_HW) {
653 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); 488 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
@@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
826 kfree(inet_rsk(req)->opt); 661 kfree(inet_rsk(req)->opt);
827} 662}
828 663
829static inline void syn_flood_warning(struct sk_buff *skb) 664#ifdef CONFIG_SYN_COOKIES
665static void syn_flood_warning(struct sk_buff *skb)
830{ 666{
831 static unsigned long warntime; 667 static unsigned long warntime;
832 668
@@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb)
837 ntohs(skb->h.th->dest)); 673 ntohs(skb->h.th->dest));
838 } 674 }
839} 675}
676#endif
840 677
841/* 678/*
842 * Save and compile IPv4 options into the request_sock if needed. 679 * Save and compile IPv4 options into the request_sock if needed.
843 */ 680 */
844static inline struct ip_options *tcp_v4_save_options(struct sock *sk, 681static struct ip_options *tcp_v4_save_options(struct sock *sk,
845 struct sk_buff *skb) 682 struct sk_buff *skb)
846{ 683{
847 struct ip_options *opt = &(IPCB(skb)->opt); 684 struct ip_options *opt = &(IPCB(skb)->opt);
848 struct ip_options *dopt = NULL; 685 struct ip_options *dopt = NULL;
@@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = {
869 .send_reset = tcp_v4_send_reset, 706 .send_reset = tcp_v4_send_reset,
870}; 707};
871 708
709static struct timewait_sock_ops tcp_timewait_sock_ops = {
710 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
711 .twsk_unique = tcp_twsk_unique,
712};
713
872int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 714int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
873{ 715{
874 struct inet_request_sock *ireq; 716 struct inet_request_sock *ireq;
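
tcp_timewait_sock_ops replaces the bare twsk_obj_size field in struct proto (see the tcp_prot hunk further down): rather than a single size, each protocol now hands the core a small ops table, so TCPv6 and DCCP can plug in their own object sizes and uniqueness checks. The shape of the pattern, sketched with hypothetical types:

    #include <stdio.h>
    #include <stddef.h>

    struct tw_sock_v4 { unsigned int snd_nxt; };

    /* Per-protocol TIME-WAIT hooks; mirrors the new timewait_sock_ops idea. */
    struct tw_ops {
        size_t obj_size;
        int (*unique)(const void *tw);
    };

    static int v4_unique(const void *tw)
    {
        (void)tw;
        return 1; /* placeholder policy */
    }

    static const struct tw_ops tcp_tw_ops = {
        .obj_size = sizeof(struct tw_sock_v4),
        .unique   = v4_unique,
    };

    int main(void)
    {
        printf("obj_size=%zu unique=%d\n",
               tcp_tw_ops.obj_size, tcp_tw_ops.unique(NULL));
        return 0;
    }
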
@@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1053 ireq->opt = NULL; 895 ireq->opt = NULL;
1054 newinet->mc_index = inet_iif(skb); 896 newinet->mc_index = inet_iif(skb);
1055 newinet->mc_ttl = skb->nh.iph->ttl; 897 newinet->mc_ttl = skb->nh.iph->ttl;
1056 newtp->ext_header_len = 0; 898 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1057 if (newinet->opt) 899 if (newinet->opt)
1058 newtp->ext_header_len = newinet->opt->optlen; 900 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1059 newinet->id = newtp->write_seq ^ jiffies; 901 newinet->id = newtp->write_seq ^ jiffies;
1060 902
1061 tcp_sync_mss(newsk, dst_mtu(dst)); 903 tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1314,16 +1156,6 @@ do_time_wait:
1314 goto discard_it; 1156 goto discard_it;
1315} 1157}
1316 1158
1317static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1318{
1319 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1320 struct inet_sock *inet = inet_sk(sk);
1321
1322 sin->sin_family = AF_INET;
1323 sin->sin_addr.s_addr = inet->daddr;
1324 sin->sin_port = inet->dport;
1325}
1326
1327/* VJ's idea. Save last timestamp seen from this destination 1159/* VJ's idea. Save last timestamp seen from this destination
1328 * and hold it at least for normal timewait interval to use for duplicate 1160 * and hold it at least for normal timewait interval to use for duplicate
1329 * segment detection in subsequent connections, before they enter synchronized 1161 * segment detection in subsequent connections, before they enter synchronized
@@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1382 return 0; 1214 return 0;
1383} 1215}
1384 1216
1385struct tcp_func ipv4_specific = { 1217struct inet_connection_sock_af_ops ipv4_specific = {
1386 .queue_xmit = ip_queue_xmit, 1218 .queue_xmit = ip_queue_xmit,
1387 .send_check = tcp_v4_send_check, 1219 .send_check = tcp_v4_send_check,
1388 .rebuild_header = inet_sk_rebuild_header, 1220 .rebuild_header = inet_sk_rebuild_header,
@@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = {
1392 .net_header_len = sizeof(struct iphdr), 1224 .net_header_len = sizeof(struct iphdr),
1393 .setsockopt = ip_setsockopt, 1225 .setsockopt = ip_setsockopt,
1394 .getsockopt = ip_getsockopt, 1226 .getsockopt = ip_getsockopt,
1395 .addr2sockaddr = v4_addr2sockaddr, 1227 .addr2sockaddr = inet_csk_addr2sockaddr,
1396 .sockaddr_len = sizeof(struct sockaddr_in), 1228 .sockaddr_len = sizeof(struct sockaddr_in),
1397}; 1229};
1398 1230
@@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk)
1433 sk->sk_write_space = sk_stream_write_space; 1265 sk->sk_write_space = sk_stream_write_space;
1434 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 1266 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1435 1267
1436 tp->af_specific = &ipv4_specific; 1268 icsk->icsk_af_ops = &ipv4_specific;
1269 icsk->icsk_sync_mss = tcp_sync_mss;
1437 1270
1438 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 1271 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1439 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1272 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
@@ -1989,7 +1822,7 @@ struct proto tcp_prot = {
1989 .sysctl_rmem = sysctl_tcp_rmem, 1822 .sysctl_rmem = sysctl_tcp_rmem,
1990 .max_header = MAX_TCP_HEADER, 1823 .max_header = MAX_TCP_HEADER,
1991 .obj_size = sizeof(struct tcp_sock), 1824 .obj_size = sizeof(struct tcp_sock),
1992 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1825 .twsk_prot = &tcp_timewait_sock_ops,
1993 .rsk_prot = &tcp_request_sock_ops, 1826 .rsk_prot = &tcp_request_sock_ops,
1994}; 1827};
1995 1828
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1b66a2ac4321..2b9b7f6c7f7c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,18 +274,18 @@ kill:
274void tcp_time_wait(struct sock *sk, int state, int timeo) 274void tcp_time_wait(struct sock *sk, int state, int timeo)
275{ 275{
276 struct inet_timewait_sock *tw = NULL; 276 struct inet_timewait_sock *tw = NULL;
277 const struct inet_connection_sock *icsk = inet_csk(sk);
277 const struct tcp_sock *tp = tcp_sk(sk); 278 const struct tcp_sock *tp = tcp_sk(sk);
278 int recycle_ok = 0; 279 int recycle_ok = 0;
279 280
280 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 281 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
281 recycle_ok = tp->af_specific->remember_stamp(sk); 282 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
282 283
283 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 284 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
284 tw = inet_twsk_alloc(sk, state); 285 tw = inet_twsk_alloc(sk, state);
285 286
286 if (tw != NULL) { 287 if (tw != NULL) {
287 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 288 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
288 const struct inet_connection_sock *icsk = inet_csk(sk);
289 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 289 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
290 290
291 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 291 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
298#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 298#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
299 if (tw->tw_family == PF_INET6) { 299 if (tw->tw_family == PF_INET6) {
300 struct ipv6_pinfo *np = inet6_sk(sk); 300 struct ipv6_pinfo *np = inet6_sk(sk);
301 struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); 301 struct inet6_timewait_sock *tw6;
302 302
303 ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); 303 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
304 ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); 304 tw6 = inet6_twsk((struct sock *)tw);
305 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
306 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
305 tw->tw_ipv6only = np->ipv6only; 307 tw->tw_ipv6only = np->ipv6only;
306 } 308 }
307#endif 309#endif
@@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
456 struct request_sock **prev) 458 struct request_sock **prev)
457{ 459{
458 struct tcphdr *th = skb->h.th; 460 struct tcphdr *th = skb->h.th;
459 struct tcp_sock *tp = tcp_sk(sk);
460 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 461 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
461 int paws_reject = 0; 462 int paws_reject = 0;
462 struct tcp_options_received tmp_opt; 463 struct tcp_options_received tmp_opt;
@@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
613 * ESTABLISHED STATE. If it will be dropped after 614 * ESTABLISHED STATE. If it will be dropped after
614 * socket is created, wait for troubles. 615 * socket is created, wait for troubles.
615 */ 616 */
616 child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); 617 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
618 req, NULL);
617 if (child == NULL) 619 if (child == NULL)
618 goto listen_overflow; 620 goto listen_overflow;
619 621
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b7325e0b406a..a7623ead39a8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1;
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 3; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
56{ 56{
57 sk->sk_send_head = skb->next; 57 sk->sk_send_head = skb->next;
58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) 58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
@@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
124 tp->snd_cwnd_used = 0; 124 tp->snd_cwnd_used = 0;
125} 125}
126 126
127static inline void tcp_event_data_sent(struct tcp_sock *tp, 127static void tcp_event_data_sent(struct tcp_sock *tp,
128 struct sk_buff *skb, struct sock *sk) 128 struct sk_buff *skb, struct sock *sk)
129{ 129{
130 struct inet_connection_sock *icsk = inet_csk(sk); 130 struct inet_connection_sock *icsk = inet_csk(sk);
131 const u32 now = tcp_time_stamp; 131 const u32 now = tcp_time_stamp;
@@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
142 icsk->icsk_ack.pingpong = 1; 142 icsk->icsk_ack.pingpong = 1;
143} 143}
144 144
145static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 145static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
146{ 146{
147 tcp_dec_quickack_mode(sk, pkts); 147 tcp_dec_quickack_mode(sk, pkts);
148 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 148 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
@@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
212 * value can be stuffed directly into th->window for an outgoing 212 * value can be stuffed directly into th->window for an outgoing
213 * frame. 213 * frame.
214 */ 214 */
215static __inline__ u16 tcp_select_window(struct sock *sk) 215static u16 tcp_select_window(struct sock *sk)
216{ 216{
217 struct tcp_sock *tp = tcp_sk(sk); 217 struct tcp_sock *tp = tcp_sk(sk);
218 u32 cur_win = tcp_receive_window(tp); 218 u32 cur_win = tcp_receive_window(tp);
@@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
250 return new_win; 250 return new_win;
251} 251}
252 252
253static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp,
254 __u32 tstamp)
255{
256 if (tp->rx_opt.tstamp_ok) {
257 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
258 (TCPOPT_NOP << 16) |
259 (TCPOPT_TIMESTAMP << 8) |
260 TCPOLEN_TIMESTAMP);
261 *ptr++ = htonl(tstamp);
262 *ptr++ = htonl(tp->rx_opt.ts_recent);
263 }
264 if (tp->rx_opt.eff_sacks) {
265 struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
266 int this_sack;
267
268 *ptr++ = htonl((TCPOPT_NOP << 24) |
269 (TCPOPT_NOP << 16) |
270 (TCPOPT_SACK << 8) |
271 (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
272 TCPOLEN_SACK_PERBLOCK)));
273 for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
274 *ptr++ = htonl(sp[this_sack].start_seq);
275 *ptr++ = htonl(sp[this_sack].end_seq);
276 }
277 if (tp->rx_opt.dsack) {
278 tp->rx_opt.dsack = 0;
279 tp->rx_opt.eff_sacks--;
280 }
281 }
282}
283
284/* Construct a tcp options header for a SYN or SYN_ACK packet.
 285 * If this is ever changed, make sure to change the definition of
286 * MAX_SYN_SIZE to match the new maximum number of options that you
287 * can generate.
288 */
289static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
290 int offer_wscale, int wscale, __u32 tstamp,
291 __u32 ts_recent)
292{
293 /* We always get an MSS option.
294 * The option bytes which will be seen in normal data
295 * packets should timestamps be used, must be in the MSS
296 * advertised. But we subtract them from tp->mss_cache so
297 * that calculations in tcp_sendmsg are simpler etc.
298 * So account for this fact here if necessary. If we
299 * don't do this correctly, as a receiver we won't
300 * recognize data packets as being full sized when we
301 * should, and thus we won't abide by the delayed ACK
302 * rules correctly.
303 * SACKs don't matter, we never delay an ACK when we
304 * have any of those going out.
305 */
306 *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
307 if (ts) {
308 if(sack)
309 *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
310 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
311 else
312 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
313 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
314 *ptr++ = htonl(tstamp); /* TSVAL */
315 *ptr++ = htonl(ts_recent); /* TSECR */
316 } else if(sack)
317 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
318 (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
319 if (offer_wscale)
320 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
321}
253 322
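
tcp_build_and_update_options() packs options as big-endian 32-bit words; the timestamp header word, for instance, is NOP(1), NOP(1), kind 8, length 10, i.e. 01 01 08 0a on the wire. A small endian-aware sketch that builds and dumps that layout, using only the option kinds and lengths from the TCP specification:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h> /* htonl */

    #define TCPOPT_NOP        1
    #define TCPOPT_TIMESTAMP  8
    #define TCPOLEN_TIMESTAMP 10

    int main(void)
    {
        uint32_t words[3];

        /* NOP, NOP, kind, length -- keeps the two 4-byte values aligned. */
        words[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                         (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
        words[1] = htonl(0x12345678u); /* TSVAL (example value) */
        words[2] = htonl(0x9abcdef0u); /* TSECR (example value) */

        const uint8_t *p = (const uint8_t *)words;
        for (int i = 0; i < 12; i++)
            printf("%02x%s", p[i], (i % 4 == 3) ? "\n" : " ");
        return 0; /* first line prints: 01 01 08 0a */
    }
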
254/* This routine actually transmits TCP packets queued in by 323/* This routine actually transmits TCP packets queued in by
255 * tcp_do_sendmsg(). This is used by both the initial 324 * tcp_do_sendmsg(). This is used by both the initial
@@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
371 TCP_ECN_send(sk, tp, skb, tcp_header_size); 440 TCP_ECN_send(sk, tp, skb, tcp_header_size);
372 } 441 }
373 442
374 tp->af_specific->send_check(sk, th, skb->len, skb); 443 icsk->icsk_af_ops->send_check(sk, skb->len, skb);
375 444
376 if (likely(tcb->flags & TCPCB_FLAG_ACK)) 445 if (likely(tcb->flags & TCPCB_FLAG_ACK))
377 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 446 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
@@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
381 450
382 TCP_INC_STATS(TCP_MIB_OUTSEGS); 451 TCP_INC_STATS(TCP_MIB_OUTSEGS);
383 452
384 err = tp->af_specific->queue_xmit(skb, 0); 453 err = icsk->icsk_af_ops->queue_xmit(skb, 0);
385 if (unlikely(err <= 0)) 454 if (unlikely(err <= 0))
386 return err; 455 return err;
387 456
@@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 621 It is the minimum of user_mss and the mss received with SYN. 690 It is the minimum of user_mss and the mss received with SYN.
622 It also does not include TCP options. 691 It also does not include TCP options.
623 692
 624 tp->pmtu_cookie is the last pmtu seen by this function. 693 inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.
625 694
626 tp->mss_cache is current effective sending mss, including 695 tp->mss_cache is current effective sending mss, including
627 all tcp options except for SACKs. It is evaluated, 696 all tcp options except for SACKs. It is evaluated,
@@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
631 NOTE1. rfc1122 clearly states that advertised MSS 700 NOTE1. rfc1122 clearly states that advertised MSS
632 DOES NOT include either tcp or ip options. 701 DOES NOT include either tcp or ip options.
633 702
634 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside 703 NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
635 this function. --ANK (980731) 704 are READ ONLY outside this function. --ANK (980731)
636 */ 705 */
637 706
638unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) 707unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
639{ 708{
640 struct tcp_sock *tp = tcp_sk(sk); 709 struct tcp_sock *tp = tcp_sk(sk);
641 int mss_now; 710 struct inet_connection_sock *icsk = inet_csk(sk);
642
643 /* Calculate base mss without TCP options: 711 /* Calculate base mss without TCP options:
644 It is MMS_S - sizeof(tcphdr) of rfc1122 712 It is MMS_S - sizeof(tcphdr) of rfc1122
645 */ 713 */
646 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); 714 int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
715 sizeof(struct tcphdr));
647 716
648 /* Clamp it (mss_clamp does not include tcp options) */ 717 /* Clamp it (mss_clamp does not include tcp options) */
649 if (mss_now > tp->rx_opt.mss_clamp) 718 if (mss_now > tp->rx_opt.mss_clamp)
650 mss_now = tp->rx_opt.mss_clamp; 719 mss_now = tp->rx_opt.mss_clamp;
651 720
652 /* Now subtract optional transport overhead */ 721 /* Now subtract optional transport overhead */
653 mss_now -= tp->ext_header_len; 722 mss_now -= icsk->icsk_ext_hdr_len;
654 723
655 /* Then reserve room for full set of TCP options and 8 bytes of data */ 724 /* Then reserve room for full set of TCP options and 8 bytes of data */
656 if (mss_now < 48) 725 if (mss_now < 48)
@@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
664 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); 733 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
665 734
666 /* And store cached results */ 735 /* And store cached results */
667 tp->pmtu_cookie = pmtu; 736 icsk->icsk_pmtu_cookie = pmtu;
668 tp->mss_cache = mss_now; 737 tp->mss_cache = mss_now;
669 738
670 return mss_now; 739 return mss_now;
@@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
694 763
695 if (dst) { 764 if (dst) {
696 u32 mtu = dst_mtu(dst); 765 u32 mtu = dst_mtu(dst);
697 if (mtu != tp->pmtu_cookie) 766 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
698 mss_now = tcp_sync_mss(sk, mtu); 767 mss_now = tcp_sync_mss(sk, mtu);
699 } 768 }
700 769
@@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
705 xmit_size_goal = mss_now; 774 xmit_size_goal = mss_now;
706 775
707 if (doing_tso) { 776 if (doing_tso) {
708 xmit_size_goal = 65535 - 777 xmit_size_goal = (65535 -
709 tp->af_specific->net_header_len - 778 inet_csk(sk)->icsk_af_ops->net_header_len -
710 tp->ext_header_len - tp->tcp_header_len; 779 inet_csk(sk)->icsk_ext_hdr_len -
780 tp->tcp_header_len);
711 781
712 if (tp->max_window && 782 if (tp->max_window &&
713 (xmit_size_goal > (tp->max_window >> 1))) 783 (xmit_size_goal > (tp->max_window >> 1)))
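
The TSO branch above caps the super-packet build size at what the 16-bit IP total-length field can carry once every header is paid for; for plain IPv4 with timestamps that is 65535 - 20 - 0 - 32 = 65483 bytes. The one-liner as a sketch:

    #include <stdio.h>

    int main(void)
    {
        unsigned int net_hdr = 20, ext_hdr = 0, tcp_hdr = 32; /* IPv4 + tstamps */
        unsigned int goal = 65535 - net_hdr - ext_hdr - tcp_hdr;

        printf("xmit_size_goal = %u\n", goal); /* 65483 */
        return 0;
    }
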
@@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
723 793
724/* Congestion window validation. (RFC2861) */ 794/* Congestion window validation. (RFC2861) */
725 795
726static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) 796static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
727{ 797{
728 __u32 packets_out = tp->packets_out; 798 __u32 packets_out = tp->packets_out;
729 799
@@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
772/* This must be invoked the first time we consider transmitting 842/* This must be invoked the first time we consider transmitting
773 * SKB onto the wire. 843 * SKB onto the wire.
774 */ 844 */
775static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) 845static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
776{ 846{
777 int tso_segs = tcp_skb_pcount(skb); 847 int tso_segs = tcp_skb_pcount(skb);
778 848
@@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1422 (sysctl_tcp_retrans_collapse != 0)) 1492 (sysctl_tcp_retrans_collapse != 0))
1423 tcp_retrans_try_collapse(sk, skb, cur_mss); 1493 tcp_retrans_try_collapse(sk, skb, cur_mss);
1424 1494
1425 if(tp->af_specific->rebuild_header(sk)) 1495 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
1426 return -EHOSTUNREACH; /* Routing failure or similar. */ 1496 return -EHOSTUNREACH; /* Routing failure or similar. */
1427 1497
1428 /* Some Solaris stacks overoptimize and ignore the FIN on a 1498 /* Some Solaris stacks overoptimize and ignore the FIN on a
@@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1793/* 1863/*
1794 * Do all connect socket setups that can be done AF independent. 1864 * Do all connect socket setups that can be done AF independent.
1795 */ 1865 */
1796static inline void tcp_connect_init(struct sock *sk) 1866static void tcp_connect_init(struct sock *sk)
1797{ 1867{
1798 struct dst_entry *dst = __sk_dst_get(sk); 1868 struct dst_entry *dst = __sk_dst_get(sk);
1799 struct tcp_sock *tp = tcp_sk(sk); 1869 struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13e7e6e8df16..3b7403495052 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
330 vegas->cntRTT = 0; 330 vegas->cntRTT = 0;
331 vegas->minRTT = 0x7fffffff; 331 vegas->minRTT = 0x7fffffff;
332 } 332 }
333 /* Use normal slow start */
334 else if (tp->snd_cwnd <= tp->snd_ssthresh)
335 tcp_slow_start(tp);
336
333} 337}
334 338
335/* Extract info for Tcp socket info provided via netlink. */ 339/* Extract info for Tcp socket info provided via netlink. */
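
The Vegas fix restores classic slow start whenever cwnd is at or below ssthresh; previously Vegas simply left cwnd untouched between its once-per-RTT adjustments. A toy model of the resulting exponential-then-linear growth, with Reno-style stand-ins for tcp_slow_start() and linear congestion avoidance, compressed to one update per RTT:

    #include <stdio.h>

    int main(void)
    {
        unsigned int cwnd = 2, ssthresh = 16;

        for (int rtt = 0; rtt < 8; rtt++) {
            if (cwnd < ssthresh) {
                cwnd *= 2;           /* slow start: double per RTT */
                if (cwnd > ssthresh)
                    cwnd = ssthresh; /* do not overshoot the threshold */
            } else {
                cwnd += 1;           /* congestion avoidance: +1 per RTT */
            }
            printf("rtt %d: cwnd=%u\n", rtt, cwnd);
        }
        return 0; /* 4, 8, 16, 17, 18, ... */
    }
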
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2422a5f7195d..223abaa72bc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -86,6 +86,7 @@
86#include <linux/module.h> 86#include <linux/module.h>
87#include <linux/socket.h> 87#include <linux/socket.h>
88#include <linux/sockios.h> 88#include <linux/sockios.h>
89#include <linux/igmp.h>
89#include <linux/in.h> 90#include <linux/in.h>
90#include <linux/errno.h> 91#include <linux/errno.h>
91#include <linux/timer.h> 92#include <linux/timer.h>
@@ -846,20 +847,7 @@ out:
846csum_copy_err: 847csum_copy_err:
847 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 848 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
848 849
849 /* Clear queue. */ 850 skb_kill_datagram(sk, skb, flags);
850 if (flags&MSG_PEEK) {
851 int clear = 0;
852 spin_lock_bh(&sk->sk_receive_queue.lock);
853 if (skb == skb_peek(&sk->sk_receive_queue)) {
854 __skb_unlink(skb, &sk->sk_receive_queue);
855 clear = 1;
856 }
857 spin_unlock_bh(&sk->sk_receive_queue.lock);
858 if (clear)
859 kfree_skb(skb);
860 }
861
862 skb_free_datagram(sk, skb);
863 851
864 if (noblock) 852 if (noblock)
865 return -EAGAIN; 853 return -EAGAIN;
@@ -1094,7 +1082,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
 1094 * Otherwise, csum completion requires checksumming packet body, 1082 * Otherwise, csum completion requires checksumming packet body,
1095 * including udp header and folding it to skb->csum. 1083 * including udp header and folding it to skb->csum.
1096 */ 1084 */
1097static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, 1085static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1098 unsigned short ulen, u32 saddr, u32 daddr) 1086 unsigned short ulen, u32 saddr, u32 daddr)
1099{ 1087{
1100 if (uh->check == 0) { 1088 if (uh->check == 0) {
@@ -1108,7 +1096,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1108 /* Probably, we should checksum udp header (it should be in cache 1096 /* Probably, we should checksum udp header (it should be in cache
1109 * in any case) and data in tiny packets (< rx copybreak). 1097 * in any case) and data in tiny packets (< rx copybreak).
1110 */ 1098 */
1111 return 0;
1112} 1099}
1113 1100
1114/* 1101/*
@@ -1141,8 +1128,7 @@ int udp_rcv(struct sk_buff *skb)
1141 if (pskb_trim_rcsum(skb, ulen)) 1128 if (pskb_trim_rcsum(skb, ulen))
1142 goto short_packet; 1129 goto short_packet;
1143 1130
1144 if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) 1131 udp_checksum_init(skb, uh, ulen, saddr, daddr);
1145 goto csum_error;
1146 1132
1147 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) 1133 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1148 return udp_v4_mcast_deliver(skb, uh, saddr, daddr); 1134 return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 6460eec834b7..9601fd7f9d66 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
8 route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ 8 route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
9 protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ 9 protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
10 exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ 10 exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
11 ip6_flowlabel.o ipv6_syms.o netfilter.o 11 ip6_flowlabel.o ipv6_syms.o netfilter.o \
12 inet6_connection_sock.o
12 13
13ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ 14ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
14 xfrm6_output.o 15 xfrm6_output.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a60585fd85ad..704fb73e6c5f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *
1195int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) 1195int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
1196{ 1196{
1197 const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; 1197 const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
1198 const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); 1198 const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
1199 u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; 1199 u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
1200 u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 1200 u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
1201 int sk_ipv6only = ipv6_only_sock(sk); 1201 int sk_ipv6only = ipv6_only_sock(sk);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d9546380fa04..68afc53be662 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -167,6 +167,7 @@ lookup_protocol:
167 sk->sk_reuse = 1; 167 sk->sk_reuse = 1;
168 168
169 inet = inet_sk(sk); 169 inet = inet_sk(sk);
170 inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
170 171
171 if (SOCK_RAW == sock->type) { 172 if (SOCK_RAW == sock->type) {
172 inet->num = protocol; 173 inet->num = protocol;
@@ -389,6 +390,8 @@ int inet6_destroy_sock(struct sock *sk)
389 return 0; 390 return 0;
390} 391}
391 392
393EXPORT_SYMBOL_GPL(inet6_destroy_sock);
394
392/* 395/*
393 * This does both peername and sockname. 396 * This does both peername and sockname.
394 */ 397 */
@@ -431,7 +434,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
431int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 434int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
432{ 435{
433 struct sock *sk = sock->sk; 436 struct sock *sk = sock->sk;
434 int err = -EINVAL;
435 437
436 switch(cmd) 438 switch(cmd)
437 { 439 {
@@ -450,16 +452,15 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
450 case SIOCSIFDSTADDR: 452 case SIOCSIFDSTADDR:
451 return addrconf_set_dstaddr((void __user *) arg); 453 return addrconf_set_dstaddr((void __user *) arg);
452 default: 454 default:
453 if (!sk->sk_prot->ioctl || 455 if (!sk->sk_prot->ioctl)
454 (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD) 456 return -ENOIOCTLCMD;
455 return(dev_ioctl(cmd,(void __user *) arg)); 457 return sk->sk_prot->ioctl(sk, cmd, arg);
456 return err;
457 } 458 }
458 /*NOTREACHED*/ 459 /*NOTREACHED*/
459 return(0); 460 return(0);
460} 461}
461 462
462struct proto_ops inet6_stream_ops = { 463const struct proto_ops inet6_stream_ops = {
463 .family = PF_INET6, 464 .family = PF_INET6,
464 .owner = THIS_MODULE, 465 .owner = THIS_MODULE,
465 .release = inet6_release, 466 .release = inet6_release,
@@ -480,7 +481,7 @@ struct proto_ops inet6_stream_ops = {
480 .sendpage = tcp_sendpage 481 .sendpage = tcp_sendpage
481}; 482};
482 483
483struct proto_ops inet6_dgram_ops = { 484const struct proto_ops inet6_dgram_ops = {
484 .family = PF_INET6, 485 .family = PF_INET6,
485 .owner = THIS_MODULE, 486 .owner = THIS_MODULE,
486 .release = inet6_release, 487 .release = inet6_release,
@@ -508,7 +509,7 @@ static struct net_proto_family inet6_family_ops = {
508}; 509};
509 510
510/* Same as inet6_dgram_ops, sans udp_poll. */ 511/* Same as inet6_dgram_ops, sans udp_poll. */
511static struct proto_ops inet6_sockraw_ops = { 512static const struct proto_ops inet6_sockraw_ops = {
512 .family = PF_INET6, 513 .family = PF_INET6,
513 .owner = THIS_MODULE, 514 .owner = THIS_MODULE,
514 .release = inet6_release, 515 .release = inet6_release,
@@ -609,6 +610,79 @@ inet6_unregister_protosw(struct inet_protosw *p)
609 } 610 }
610} 611}
611 612
613int inet6_sk_rebuild_header(struct sock *sk)
614{
615 int err;
616 struct dst_entry *dst;
617 struct ipv6_pinfo *np = inet6_sk(sk);
618
619 dst = __sk_dst_check(sk, np->dst_cookie);
620
621 if (dst == NULL) {
622 struct inet_sock *inet = inet_sk(sk);
623 struct in6_addr *final_p = NULL, final;
624 struct flowi fl;
625
626 memset(&fl, 0, sizeof(fl));
627 fl.proto = sk->sk_protocol;
628 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
629 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
630 fl.fl6_flowlabel = np->flow_label;
631 fl.oif = sk->sk_bound_dev_if;
632 fl.fl_ip_dport = inet->dport;
633 fl.fl_ip_sport = inet->sport;
634
635 if (np->opt && np->opt->srcrt) {
636 struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
637 ipv6_addr_copy(&final, &fl.fl6_dst);
638 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
639 final_p = &final;
640 }
641
642 err = ip6_dst_lookup(sk, &dst, &fl);
643 if (err) {
644 sk->sk_route_caps = 0;
645 return err;
646 }
647 if (final_p)
648 ipv6_addr_copy(&fl.fl6_dst, final_p);
649
650 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
651 sk->sk_err_soft = -err;
652 return err;
653 }
654
655 ip6_dst_store(sk, dst, NULL);
656 sk->sk_route_caps = dst->dev->features &
657 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
658 }
659
660 return 0;
661}
662
663EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
664
665int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
666{
667 struct ipv6_pinfo *np = inet6_sk(sk);
668 struct inet6_skb_parm *opt = IP6CB(skb);
669
670 if (np->rxopt.all) {
671 if ((opt->hop && (np->rxopt.bits.hopopts ||
672 np->rxopt.bits.ohopopts)) ||
673 ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) &&
674 np->rxopt.bits.rxflow) ||
675 (opt->srcrt && (np->rxopt.bits.srcrt ||
676 np->rxopt.bits.osrcrt)) ||
677 ((opt->dst1 || opt->dst0) &&
678 (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
679 return 1;
680 }
681 return 0;
682}
683
684EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
685
612int 686int
613snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) 687snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
614{ 688{
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index f3629730eb15..13cc7f895583 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <net/icmp.h> 34#include <net/icmp.h>
35#include <net/ipv6.h> 35#include <net/ipv6.h>
36#include <net/protocol.h>
36#include <net/xfrm.h> 37#include <net/xfrm.h>
37#include <asm/scatterlist.h> 38#include <asm/scatterlist.h>
38 39
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 8bfbe9970793..6de8ee1a5ad9 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -36,6 +36,7 @@
36#include <linux/random.h> 36#include <linux/random.h>
37#include <net/icmp.h> 37#include <net/icmp.h>
38#include <net/ipv6.h> 38#include <net/ipv6.h>
39#include <net/protocol.h>
39#include <linux/icmpv6.h> 40#include <linux/icmpv6.h>
40 41
41static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) 42static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index be6faf311387..113374dc342c 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
413 return opt; 413 return opt;
414} 414}
415 415
416EXPORT_SYMBOL_GPL(ipv6_invert_rthdr);
417
416/********************************** 418/**********************************
417 Hop-by-hop options. 419 Hop-by-hop options.
418 **********************************/ 420 **********************************/
@@ -579,6 +581,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
579 return opt2; 581 return opt2;
580} 582}
581 583
584EXPORT_SYMBOL_GPL(ipv6_dup_options);
585
582static int ipv6_renew_option(void *ohdr, 586static int ipv6_renew_option(void *ohdr,
583 struct ipv6_opt_hdr __user *newopt, int newoptlen, 587 struct ipv6_opt_hdr __user *newopt, int newoptlen,
584 int inherit, 588 int inherit,
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
new file mode 100644
index 000000000000..792f90f0f9ec
--- /dev/null
+++ b/net/ipv6/inet6_connection_sock.c
@@ -0,0 +1,199 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Support for INET6 connection oriented protocols.
7 *
8 * Authors: See the TCPv6 sources
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or(at your option) any later version.
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/in6.h>
19#include <linux/ipv6.h>
20#include <linux/jhash.h>
21
22#include <net/addrconf.h>
23#include <net/inet_connection_sock.h>
24#include <net/inet_ecn.h>
25#include <net/inet_hashtables.h>
26#include <net/ip6_route.h>
27#include <net/sock.h>
28
29int inet6_csk_bind_conflict(const struct sock *sk,
30 const struct inet_bind_bucket *tb)
31{
32 const struct sock *sk2;
33 const struct hlist_node *node;
34
35 /* We must walk the whole port owner list in this case. -DaveM */
36 sk_for_each_bound(sk2, node, &tb->owners) {
37 if (sk != sk2 &&
38 (!sk->sk_bound_dev_if ||
39 !sk2->sk_bound_dev_if ||
40 sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
41 (!sk->sk_reuse || !sk2->sk_reuse ||
42 sk2->sk_state == TCP_LISTEN) &&
43 ipv6_rcv_saddr_equal(sk, sk2))
44 break;
45 }
46
47 return node != NULL;
48}
49
50EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
51
52/*
53 * request_sock (formerly open request) hash tables.
54 */
55static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport,
56 const u32 rnd, const u16 synq_hsize)
57{
58 u32 a = raddr->s6_addr32[0];
59 u32 b = raddr->s6_addr32[1];
60 u32 c = raddr->s6_addr32[2];
61
62 a += JHASH_GOLDEN_RATIO;
63 b += JHASH_GOLDEN_RATIO;
64 c += rnd;
65 __jhash_mix(a, b, c);
66
67 a += raddr->s6_addr32[3];
68 b += (u32)rport;
69 __jhash_mix(a, b, c);
70
71 return c & (synq_hsize - 1);
72}
73
74struct request_sock *inet6_csk_search_req(const struct sock *sk,
75 struct request_sock ***prevp,
76 const __u16 rport,
77 const struct in6_addr *raddr,
78 const struct in6_addr *laddr,
79 const int iif)
80{
81 const struct inet_connection_sock *icsk = inet_csk(sk);
82 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
83 struct request_sock *req, **prev;
84
85 for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport,
86 lopt->hash_rnd,
87 lopt->nr_table_entries)];
88 (req = *prev) != NULL;
89 prev = &req->dl_next) {
90 const struct inet6_request_sock *treq = inet6_rsk(req);
91
92 if (inet_rsk(req)->rmt_port == rport &&
93 req->rsk_ops->family == AF_INET6 &&
94 ipv6_addr_equal(&treq->rmt_addr, raddr) &&
95 ipv6_addr_equal(&treq->loc_addr, laddr) &&
96 (!treq->iif || treq->iif == iif)) {
97 BUG_TRAP(req->sk == NULL);
98 *prevp = prev;
99 return req;
100 }
101 }
102
103 return NULL;
104}
105
106EXPORT_SYMBOL_GPL(inet6_csk_search_req);
107
108void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
109 struct request_sock *req,
110 const unsigned long timeout)
111{
112 struct inet_connection_sock *icsk = inet_csk(sk);
113 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
114 const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
115 inet_rsk(req)->rmt_port,
116 lopt->hash_rnd, lopt->nr_table_entries);
117
118 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
119 inet_csk_reqsk_queue_added(sk, timeout);
120}
121
122EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
123
124void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
125{
126 struct ipv6_pinfo *np = inet6_sk(sk);
127 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
128
129 sin6->sin6_family = AF_INET6;
130 ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
131 sin6->sin6_port = inet_sk(sk)->dport;
132 /* We do not store received flowlabel for TCP */
133 sin6->sin6_flowinfo = 0;
134 sin6->sin6_scope_id = 0;
135 if (sk->sk_bound_dev_if &&
136 ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
137 sin6->sin6_scope_id = sk->sk_bound_dev_if;
138}
139
140EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
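
The scope id is only meaningful for link-local peers, which is why it is cleared first and set only behind the address-type check. The same rule in a hedged user-space helper; the names and parameters here are stand-ins for the socket state:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>

/* Fill a sockaddr_in6 the way inet6_csk_addr2sockaddr() does. */
static void fill_sin6(struct sockaddr_in6 *sin6, const struct in6_addr *daddr,
		      in_port_t dport /* network byte order */, int bound_dev_if)
{
	memset(sin6, 0, sizeof(*sin6));
	sin6->sin6_family = AF_INET6;
	sin6->sin6_addr = *daddr;
	sin6->sin6_port = dport;
	sin6->sin6_flowinfo = 0;	/* received flow label is not kept */
	sin6->sin6_scope_id = 0;
	if (bound_dev_if && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
		sin6->sin6_scope_id = bound_dev_if;
}

int main(void)
{
	struct sockaddr_in6 sin6;
	struct in6_addr ll;

	memset(&ll, 0, sizeof(ll));
	ll.s6_addr[0] = 0xfe;
	ll.s6_addr[1] = 0x80;
	ll.s6_addr[15] = 1;	/* fe80::1 */
	fill_sin6(&sin6, &ll, htons(80), 2);
	return sin6.sin6_scope_id == 2 ? 0 : 1;
}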
141
142int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
143{
144 struct sock *sk = skb->sk;
145 struct inet_sock *inet = inet_sk(sk);
146 struct ipv6_pinfo *np = inet6_sk(sk);
147 struct flowi fl;
148 struct dst_entry *dst;
149 struct in6_addr *final_p = NULL, final;
150
151 memset(&fl, 0, sizeof(fl));
152 fl.proto = sk->sk_protocol;
153 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
154 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
155 fl.fl6_flowlabel = np->flow_label;
156 IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
157 fl.oif = sk->sk_bound_dev_if;
158 fl.fl_ip_sport = inet->sport;
159 fl.fl_ip_dport = inet->dport;
160
161 if (np->opt && np->opt->srcrt) {
162 struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
163 ipv6_addr_copy(&final, &fl.fl6_dst);
164 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
165 final_p = &final;
166 }
167
168 dst = __sk_dst_check(sk, np->dst_cookie);
169
170 if (dst == NULL) {
171 int err = ip6_dst_lookup(sk, &dst, &fl);
172
173 if (err) {
174 sk->sk_err_soft = -err;
175 return err;
176 }
177
178 if (final_p)
179 ipv6_addr_copy(&fl.fl6_dst, final_p);
180
181 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
182 sk->sk_route_caps = 0;
183 return err;
184 }
185
186 ip6_dst_store(sk, dst, NULL);
187 sk->sk_route_caps = dst->dev->features &
188 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
189 }
190
191 skb->dst = dst_clone(dst);
192
 193 /* Restore the final destination after routing is done */
194 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
195
196 return ip6_xmit(sk, skb, &fl, np->opt, 0);
197}
198
199EXPORT_SYMBOL_GPL(inet6_csk_xmit);
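
With a type-0 routing header the flow must be routed towards the first intermediate hop while the packet itself still carries the real destination, hence the save, route, restore sequence around the lookup above. The pattern in isolation, with stub types standing in for flowi and dst_entry:

#include <netinet/in.h>
#include <stdbool.h>
#include <string.h>

struct route { struct in6_addr via; };

/* Stub: pretend to route towards 'dst'. */
static struct route lookup_route(const struct in6_addr *dst)
{
	struct route r = { .via = *dst };
	return r;
}

/* Route via the routing header's first hop, then restore the real
 * destination before header construction, as inet6_csk_xmit() does. */
static struct route route_with_srcrt(struct in6_addr *fl_dst,
				     const struct in6_addr *first_hop,
				     bool have_srcrt)
{
	struct in6_addr final = *fl_dst;	/* save the real destination */
	struct route rt;

	if (have_srcrt)
		*fl_dst = *first_hop;		/* route towards the first hop */

	rt = lookup_route(fl_dst);

	*fl_dst = final;			/* restore before building headers */
	return rt;
}

int main(void)
{
	struct in6_addr dst, hop;

	memset(&dst, 0, sizeof(dst)); dst.s6_addr[15] = 1;
	memset(&hop, 0, sizeof(hop)); hop.s6_addr[15] = 2;
	route_with_srcrt(&dst, &hop, true);
	return dst.s6_addr[15] == 1 ? 0 : 1;	/* destination restored */
}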
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 01d5f46d4e40..4154f3a8b6cf 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -5,7 +5,8 @@
5 * 5 *
6 * Generic INET6 transport hashtables 6 * Generic INET6 transport hashtables
7 * 7 *
8 * Authors: Lotsa people, from code originally in tcp 8 * Authors: Lotsa people, from code originally in tcp, generalised here
9 * by Arnaldo Carvalho de Melo <acme@mandriva.com>
9 * 10 *
10 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
@@ -14,12 +15,13 @@
14 */ 15 */
15 16
16#include <linux/config.h> 17#include <linux/config.h>
17
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/random.h>
19 20
20#include <net/inet_connection_sock.h> 21#include <net/inet_connection_sock.h>
21#include <net/inet_hashtables.h> 22#include <net/inet_hashtables.h>
22#include <net/inet6_hashtables.h> 23#include <net/inet6_hashtables.h>
24#include <net/ip.h>
23 25
24struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, 26struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
25 const struct in6_addr *daddr, 27 const struct in6_addr *daddr,
@@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
79} 81}
80 82
81EXPORT_SYMBOL_GPL(inet6_lookup); 83EXPORT_SYMBOL_GPL(inet6_lookup);
84
85static int __inet6_check_established(struct inet_timewait_death_row *death_row,
86 struct sock *sk, const __u16 lport,
87 struct inet_timewait_sock **twp)
88{
89 struct inet_hashinfo *hinfo = death_row->hashinfo;
90 const struct inet_sock *inet = inet_sk(sk);
91 const struct ipv6_pinfo *np = inet6_sk(sk);
92 const struct in6_addr *daddr = &np->rcv_saddr;
93 const struct in6_addr *saddr = &np->daddr;
94 const int dif = sk->sk_bound_dev_if;
95 const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
96 const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr,
97 inet->dport);
98 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
99 struct sock *sk2;
100 const struct hlist_node *node;
101 struct inet_timewait_sock *tw;
102
103 prefetch(head->chain.first);
104 write_lock(&head->lock);
105
106 /* Check TIME-WAIT sockets first. */
107 sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
108 const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
109
110 tw = inet_twsk(sk2);
111
112 if(*((__u32 *)&(tw->tw_dport)) == ports &&
113 sk2->sk_family == PF_INET6 &&
114 ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
115 ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
116 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
117 if (twsk_unique(sk, sk2, twp))
118 goto unique;
119 else
120 goto not_unique;
121 }
122 }
123 tw = NULL;
124
125 /* And established part... */
126 sk_for_each(sk2, node, &head->chain) {
127 if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
128 goto not_unique;
129 }
130
131unique:
132 BUG_TRAP(sk_unhashed(sk));
133 __sk_add_node(sk, &head->chain);
134 sk->sk_hash = hash;
135 sock_prot_inc_use(sk->sk_prot);
136 write_unlock(&head->lock);
137
138 if (twp != NULL) {
139 *twp = tw;
140 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
141 } else if (tw != NULL) {
142 /* Silly. Should hash-dance instead... */
143 inet_twsk_deschedule(tw, death_row);
144 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
145
146 inet_twsk_put(tw);
147 }
148 return 0;
149
150not_unique:
151 write_unlock(&head->lock);
152 return -EADDRNOTAVAIL;
153}
154
155static inline u32 inet6_sk_port_offset(const struct sock *sk)
156{
157 const struct inet_sock *inet = inet_sk(sk);
158 const struct ipv6_pinfo *np = inet6_sk(sk);
159 return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
160 np->daddr.s6_addr32,
161 inet->dport);
162}
163
164int inet6_hash_connect(struct inet_timewait_death_row *death_row,
165 struct sock *sk)
166{
167 struct inet_hashinfo *hinfo = death_row->hashinfo;
168 const unsigned short snum = inet_sk(sk)->num;
169 struct inet_bind_hashbucket *head;
170 struct inet_bind_bucket *tb;
171 int ret;
172
173 if (snum == 0) {
174 const int low = sysctl_local_port_range[0];
175 const int high = sysctl_local_port_range[1];
176 const int range = high - low;
177 int i, port;
178 static u32 hint;
179 const u32 offset = hint + inet6_sk_port_offset(sk);
180 struct hlist_node *node;
181 struct inet_timewait_sock *tw = NULL;
182
183 local_bh_disable();
184 for (i = 1; i <= range; i++) {
185 port = low + (i + offset) % range;
186 head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
187 spin_lock(&head->lock);
188
189 /* Does not bother with rcv_saddr checks,
190 * because the established check is already
191 * unique enough.
192 */
193 inet_bind_bucket_for_each(tb, node, &head->chain) {
194 if (tb->port == port) {
195 BUG_TRAP(!hlist_empty(&tb->owners));
196 if (tb->fastreuse >= 0)
197 goto next_port;
198 if (!__inet6_check_established(death_row,
199 sk, port,
200 &tw))
201 goto ok;
202 goto next_port;
203 }
204 }
205
206 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
207 head, port);
208 if (!tb) {
209 spin_unlock(&head->lock);
210 break;
211 }
212 tb->fastreuse = -1;
213 goto ok;
214
215 next_port:
216 spin_unlock(&head->lock);
217 }
218 local_bh_enable();
219
220 return -EADDRNOTAVAIL;
221
222ok:
223 hint += i;
224
225 /* Head lock still held and bh's disabled */
226 inet_bind_hash(sk, tb, port);
227 if (sk_unhashed(sk)) {
228 inet_sk(sk)->sport = htons(port);
229 __inet6_hash(hinfo, sk);
230 }
231 spin_unlock(&head->lock);
232
233 if (tw) {
234 inet_twsk_deschedule(tw, death_row);
235 inet_twsk_put(tw);
236 }
237
238 ret = 0;
239 goto out;
240 }
241
242 head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
243 tb = inet_csk(sk)->icsk_bind_hash;
244 spin_lock_bh(&head->lock);
245
246 if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
247 __inet6_hash(hinfo, sk);
248 spin_unlock_bh(&head->lock);
249 return 0;
250 } else {
251 spin_unlock(&head->lock);
252 /* No definite answer... Walk to established hash table */
253 ret = __inet6_check_established(death_row, sk, snum, NULL);
254out:
255 local_bh_enable();
256 return ret;
257 }
258}
259
260EXPORT_SYMBOL_GPL(inet6_hash_connect);
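
The ephemeral search is driven by a static hint plus a per-destination offset from secure_ipv6_port_ephemeral(), so connects to the same peer resume where the last attempt stopped while different peers probe the range in unrelated orders. A runnable sketch of the walk; port_in_use() stands in for the bind-bucket and established-hash checks:

#include <stdint.h>
#include <stdio.h>

static int port_in_use(int port)	/* stand-in for the real checks */
{
	return port == 32768 || port == 32769;
}

static int pick_ephemeral_port(int low, int high, uint32_t offset)
{
	const int range = high - low;

	for (int i = 1; i <= range; i++) {
		int port = low + (int)((i + offset) % range);

		if (!port_in_use(port))
			return port;	/* the caller would also bump 'hint' by i */
	}
	return -1;			/* -EADDRNOTAVAIL */
}

int main(void)
{
	printf("picked %d\n", pick_ephemeral_port(32768, 61000, 0));
	return 0;
}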
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 1cf02765fb5c..89d12b4817a9 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -200,6 +200,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label)
200 return NULL; 200 return NULL;
201} 201}
202 202
203EXPORT_SYMBOL_GPL(fl6_sock_lookup);
204
203void fl6_free_socklist(struct sock *sk) 205void fl6_free_socklist(struct sock *sk)
204{ 206{
205 struct ipv6_pinfo *np = inet6_sk(sk); 207 struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8523c76ebf76..b4c4beba0ede 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -775,6 +775,8 @@ out_err_release:
775 return err; 775 return err;
776} 776}
777 777
778EXPORT_SYMBOL_GPL(ip6_dst_lookup);
779
778static inline int ip6_ufo_append_data(struct sock *sk, 780static inline int ip6_ufo_append_data(struct sock *sk,
779 int getfrag(void *from, char *to, int offset, int len, 781 int getfrag(void *from, char *to, int offset, int len,
780 int odd, struct sk_buff *skb), 782 int odd, struct sk_buff *skb),
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 55917fb17094..626dd39685f2 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -47,6 +47,7 @@
47#include <linux/rtnetlink.h> 47#include <linux/rtnetlink.h>
48#include <net/icmp.h> 48#include <net/icmp.h>
49#include <net/ipv6.h> 49#include <net/ipv6.h>
50#include <net/protocol.h>
50#include <linux/ipv6.h> 51#include <linux/ipv6.h>
51#include <linux/icmpv6.h> 52#include <linux/icmpv6.h>
52 53
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3620718defe6..c63868dd2ca2 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
163 sk_refcnt_debug_dec(sk); 163 sk_refcnt_debug_dec(sk);
164 164
165 if (sk->sk_protocol == IPPROTO_TCP) { 165 if (sk->sk_protocol == IPPROTO_TCP) {
166 struct tcp_sock *tp = tcp_sk(sk); 166 struct inet_connection_sock *icsk = inet_csk(sk);
167 167
168 local_bh_disable(); 168 local_bh_disable();
169 sock_prot_dec_use(sk->sk_prot); 169 sock_prot_dec_use(sk->sk_prot);
170 sock_prot_inc_use(&tcp_prot); 170 sock_prot_inc_use(&tcp_prot);
171 local_bh_enable(); 171 local_bh_enable();
172 sk->sk_prot = &tcp_prot; 172 sk->sk_prot = &tcp_prot;
173 tp->af_specific = &ipv4_specific; 173 icsk->icsk_af_ops = &ipv4_specific;
174 sk->sk_socket->ops = &inet_stream_ops; 174 sk->sk_socket->ops = &inet_stream_ops;
175 sk->sk_family = PF_INET; 175 sk->sk_family = PF_INET;
176 tcp_sync_mss(sk, tp->pmtu_cookie); 176 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
177 } else { 177 } else {
178 local_bh_disable(); 178 local_bh_disable();
179 sock_prot_dec_use(sk->sk_prot); 179 sock_prot_dec_use(sk->sk_prot);
@@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
317 } 317 }
318 318
319 retv = 0; 319 retv = 0;
320 if (sk->sk_type == SOCK_STREAM) { 320 if (inet_sk(sk)->is_icsk) {
321 if (opt) { 321 if (opt) {
322 struct tcp_sock *tp = tcp_sk(sk); 322 struct inet_connection_sock *icsk = inet_csk(sk);
323 if (!((1 << sk->sk_state) & 323 if (!((1 << sk->sk_state) &
324 (TCPF_LISTEN | TCPF_CLOSE)) 324 (TCPF_LISTEN | TCPF_CLOSE))
325 && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { 325 && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
326 tp->ext_header_len = opt->opt_flen + opt->opt_nflen; 326 icsk->icsk_ext_hdr_len =
327 tcp_sync_mss(sk, tp->pmtu_cookie); 327 opt->opt_flen + opt->opt_nflen;
328 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
328 } 329 }
329 } 330 }
330 opt = xchg(&np->opt, opt); 331 opt = xchg(&np->opt, opt);
@@ -380,14 +381,15 @@ sticky_done:
380 goto done; 381 goto done;
381update: 382update:
382 retv = 0; 383 retv = 0;
383 if (sk->sk_type == SOCK_STREAM) { 384 if (inet_sk(sk)->is_icsk) {
384 if (opt) { 385 if (opt) {
385 struct tcp_sock *tp = tcp_sk(sk); 386 struct inet_connection_sock *icsk = inet_csk(sk);
386 if (!((1 << sk->sk_state) & 387 if (!((1 << sk->sk_state) &
387 (TCPF_LISTEN | TCPF_CLOSE)) 388 (TCPF_LISTEN | TCPF_CLOSE))
388 && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { 389 && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
389 tp->ext_header_len = opt->opt_flen + opt->opt_nflen; 390 icsk->icsk_ext_hdr_len =
390 tcp_sync_mss(sk, tp->pmtu_cookie); 391 opt->opt_flen + opt->opt_nflen;
392 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
391 } 393 }
392 } 394 }
393 opt = xchg(&np->opt, opt); 395 opt = xchg(&np->opt, opt);
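
Growing np->opt adds opt_flen + opt_nflen bytes of extension headers to every segment, so the MSS must shrink by the same amount; recomputing that is all icsk_sync_mss() is asked to do in the two hunks above. The arithmetic in isolation, assuming the fixed 40-byte IPv6 and 20-byte TCP headers:

#include <stdio.h>

/* Rough MSS arithmetic: what fits in the path MTU after the fixed
 * headers, with option headers summarised in ext_hdr_len. */
static int mss_for(int pmtu, int ext_hdr_len)
{
	return pmtu - 40 /* ipv6 hdr */ - 20 /* tcp hdr */ - ext_hdr_len;
}

int main(void)
{
	printf("mss without options: %d\n", mss_for(1500, 0));	/* 1440 */
	printf("mss with 24B srcrt:  %d\n", mss_for(1500, 24));	/* 1416 */
	return 0;
}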
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f829a4ad3ccc..1cf305a9f8dd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
224 224
225 mc_lst->ifindex = dev->ifindex; 225 mc_lst->ifindex = dev->ifindex;
226 mc_lst->sfmode = MCAST_EXCLUDE; 226 mc_lst->sfmode = MCAST_EXCLUDE;
227 mc_lst->sflock = RW_LOCK_UNLOCKED; 227 rwlock_init(&mc_lst->sflock);
228 mc_lst->sflist = NULL; 228 mc_lst->sflist = NULL;
229 229
230 /* 230 /*
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 95d469271c4d..ea43ef1d94a7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -15,6 +15,7 @@
15 * - new extension header parser code 15 * - new extension header parser code
16 */ 16 */
17#include <linux/config.h> 17#include <linux/config.h>
18#include <linux/in.h>
18#include <linux/skbuff.h> 19#include <linux/skbuff.h>
19#include <linux/kmod.h> 20#include <linux/kmod.h>
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
@@ -86,11 +87,6 @@ static DECLARE_MUTEX(ip6t_mutex);
86 context stops packets coming through and allows user context to read 87 context stops packets coming through and allows user context to read
87 the counters or update the rules. 88 the counters or update the rules.
88 89
89 To be cache friendly on SMP, we arrange them like so:
90 [ n-entries ]
91 ... cache-align padding ...
92 [ n-entries ]
93
94 Hence the start of any table is given by get_table() below. */ 90 Hence the start of any table is given by get_table() below. */
95 91
96/* The table itself */ 92/* The table itself */
@@ -108,20 +104,15 @@ struct ip6t_table_info
108 unsigned int underflow[NF_IP6_NUMHOOKS]; 104 unsigned int underflow[NF_IP6_NUMHOOKS];
109 105
110 /* ip6t_entry tables: one per CPU */ 106 /* ip6t_entry tables: one per CPU */
111 char entries[0] ____cacheline_aligned; 107 void *entries[NR_CPUS];
112}; 108};
113 109
114static LIST_HEAD(ip6t_target); 110static LIST_HEAD(ip6t_target);
115static LIST_HEAD(ip6t_match); 111static LIST_HEAD(ip6t_match);
116static LIST_HEAD(ip6t_tables); 112static LIST_HEAD(ip6t_tables);
113#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
117#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) 114#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
118 115
119#ifdef CONFIG_SMP
120#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
121#else
122#define TABLE_OFFSET(t,p) 0
123#endif
124
125#if 0 116#if 0
126#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) 117#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
127#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) 118#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -376,8 +367,7 @@ ip6t_do_table(struct sk_buff **pskb,
376 367
377 read_lock_bh(&table->lock); 368 read_lock_bh(&table->lock);
378 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 369 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
379 table_base = (void *)table->private->entries 370 table_base = (void *)table->private->entries[smp_processor_id()];
380 + TABLE_OFFSET(table->private, smp_processor_id());
381 e = get_entry(table_base, table->private->hook_entry[hook]); 371 e = get_entry(table_base, table->private->hook_entry[hook]);
382 372
383#ifdef CONFIG_NETFILTER_DEBUG 373#ifdef CONFIG_NETFILTER_DEBUG
@@ -649,7 +639,8 @@ unconditional(const struct ip6t_ip6 *ipv6)
649/* Figures out from what hook each rule can be called: returns 0 if 639/* Figures out from what hook each rule can be called: returns 0 if
650 there are loops. Puts hook bitmask in comefrom. */ 640 there are loops. Puts hook bitmask in comefrom. */
651static int 641static int
652mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) 642mark_source_chains(struct ip6t_table_info *newinfo,
643 unsigned int valid_hooks, void *entry0)
653{ 644{
654 unsigned int hook; 645 unsigned int hook;
655 646
@@ -658,7 +649,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
658 for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { 649 for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) {
659 unsigned int pos = newinfo->hook_entry[hook]; 650 unsigned int pos = newinfo->hook_entry[hook];
660 struct ip6t_entry *e 651 struct ip6t_entry *e
661 = (struct ip6t_entry *)(newinfo->entries + pos); 652 = (struct ip6t_entry *)(entry0 + pos);
662 653
663 if (!(valid_hooks & (1 << hook))) 654 if (!(valid_hooks & (1 << hook)))
664 continue; 655 continue;
@@ -708,13 +699,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
708 goto next; 699 goto next;
709 700
710 e = (struct ip6t_entry *) 701 e = (struct ip6t_entry *)
711 (newinfo->entries + pos); 702 (entry0 + pos);
712 } while (oldpos == pos + e->next_offset); 703 } while (oldpos == pos + e->next_offset);
713 704
714 /* Move along one */ 705 /* Move along one */
715 size = e->next_offset; 706 size = e->next_offset;
716 e = (struct ip6t_entry *) 707 e = (struct ip6t_entry *)
717 (newinfo->entries + pos + size); 708 (entry0 + pos + size);
718 e->counters.pcnt = pos; 709 e->counters.pcnt = pos;
719 pos += size; 710 pos += size;
720 } else { 711 } else {
@@ -731,7 +722,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
731 newpos = pos + e->next_offset; 722 newpos = pos + e->next_offset;
732 } 723 }
733 e = (struct ip6t_entry *) 724 e = (struct ip6t_entry *)
734 (newinfo->entries + newpos); 725 (entry0 + newpos);
735 e->counters.pcnt = pos; 726 e->counters.pcnt = pos;
736 pos = newpos; 727 pos = newpos;
737 } 728 }
@@ -941,6 +932,7 @@ static int
941translate_table(const char *name, 932translate_table(const char *name,
942 unsigned int valid_hooks, 933 unsigned int valid_hooks,
943 struct ip6t_table_info *newinfo, 934 struct ip6t_table_info *newinfo,
935 void *entry0,
944 unsigned int size, 936 unsigned int size,
945 unsigned int number, 937 unsigned int number,
946 const unsigned int *hook_entries, 938 const unsigned int *hook_entries,
@@ -961,11 +953,11 @@ translate_table(const char *name,
961 duprintf("translate_table: size %u\n", newinfo->size); 953 duprintf("translate_table: size %u\n", newinfo->size);
962 i = 0; 954 i = 0;
963 /* Walk through entries, checking offsets. */ 955 /* Walk through entries, checking offsets. */
964 ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, 956 ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
965 check_entry_size_and_hooks, 957 check_entry_size_and_hooks,
966 newinfo, 958 newinfo,
967 newinfo->entries, 959 entry0,
968 newinfo->entries + size, 960 entry0 + size,
969 hook_entries, underflows, &i); 961 hook_entries, underflows, &i);
970 if (ret != 0) 962 if (ret != 0)
971 return ret; 963 return ret;
@@ -993,27 +985,24 @@ translate_table(const char *name,
993 } 985 }
994 } 986 }
995 987
996 if (!mark_source_chains(newinfo, valid_hooks)) 988 if (!mark_source_chains(newinfo, valid_hooks, entry0))
997 return -ELOOP; 989 return -ELOOP;
998 990
999 /* Finally, each sanity check must pass */ 991 /* Finally, each sanity check must pass */
1000 i = 0; 992 i = 0;
1001 ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, 993 ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
1002 check_entry, name, size, &i); 994 check_entry, name, size, &i);
1003 995
1004 if (ret != 0) { 996 if (ret != 0) {
1005 IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, 997 IP6T_ENTRY_ITERATE(entry0, newinfo->size,
1006 cleanup_entry, &i); 998 cleanup_entry, &i);
1007 return ret; 999 return ret;
1008 } 1000 }
1009 1001
1010 /* And one copy for every other CPU */ 1002 /* And one copy for every other CPU */
1011 for_each_cpu(i) { 1003 for_each_cpu(i) {
1012 if (i == 0) 1004 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
1013 continue; 1005 memcpy(newinfo->entries[i], entry0, newinfo->size);
1014 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
1015 newinfo->entries,
1016 SMP_ALIGN(newinfo->size));
1017 } 1006 }
1018 1007
1019 return ret; 1008 return ret;
@@ -1029,15 +1018,12 @@ replace_table(struct ip6t_table *table,
1029 1018
1030#ifdef CONFIG_NETFILTER_DEBUG 1019#ifdef CONFIG_NETFILTER_DEBUG
1031 { 1020 {
1032 struct ip6t_entry *table_base; 1021 int cpu;
1033 unsigned int i;
1034 1022
1035 for_each_cpu(i) { 1023 for_each_cpu(cpu) {
1036 table_base = 1024 struct ip6t_entry *table_base = newinfo->entries[cpu];
1037 (void *)newinfo->entries 1025 if (table_base)
1038 + TABLE_OFFSET(newinfo, i); 1026 table_base->comefrom = 0xdead57ac;
1039
1040 table_base->comefrom = 0xdead57ac;
1041 } 1027 }
1042 } 1028 }
1043#endif 1029#endif
@@ -1072,16 +1058,44 @@ add_entry_to_counter(const struct ip6t_entry *e,
1072 return 0; 1058 return 0;
1073} 1059}
1074 1060
1061static inline int
1062set_entry_to_counter(const struct ip6t_entry *e,
1063 struct ip6t_counters total[],
1064 unsigned int *i)
1065{
1066 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
1067
1068 (*i)++;
1069 return 0;
1070}
1071
1075static void 1072static void
1076get_counters(const struct ip6t_table_info *t, 1073get_counters(const struct ip6t_table_info *t,
1077 struct ip6t_counters counters[]) 1074 struct ip6t_counters counters[])
1078{ 1075{
1079 unsigned int cpu; 1076 unsigned int cpu;
1080 unsigned int i; 1077 unsigned int i;
1078 unsigned int curcpu;
1079
1080 /* Instead of clearing (by a previous call to memset())
1081 * the counters and using adds, we set the counters
 1082 * with data used by 'current' CPU.
 1083 * We don't care about preemption here.
1084 */
1085 curcpu = raw_smp_processor_id();
1086
1087 i = 0;
1088 IP6T_ENTRY_ITERATE(t->entries[curcpu],
1089 t->size,
1090 set_entry_to_counter,
1091 counters,
1092 &i);
1081 1093
1082 for_each_cpu(cpu) { 1094 for_each_cpu(cpu) {
1095 if (cpu == curcpu)
1096 continue;
1083 i = 0; 1097 i = 0;
1084 IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), 1098 IP6T_ENTRY_ITERATE(t->entries[cpu],
1085 t->size, 1099 t->size,
1086 add_entry_to_counter, 1100 add_entry_to_counter,
1087 counters, 1101 counters,
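
The new SET_COUNTER seeding means get_counters() first copies the current CPU's counters into the totals and only then adds the other CPUs' copies, which is what lets the next hunk drop the memset() over the whole counter array. In miniature:

#include <stdint.h>
#include <stdio.h>

#define NCPUS    4
#define NENTRIES 3

struct counter { uint64_t bcnt, pcnt; };

static void get_counters(struct counter percpu[NCPUS][NENTRIES],
			 struct counter total[NENTRIES], int curcpu)
{
	/* SET from the current CPU's copy: no memset() needed ... */
	for (int i = 0; i < NENTRIES; i++)
		total[i] = percpu[curcpu][i];

	/* ... then ADD every other CPU's copy. */
	for (int cpu = 0; cpu < NCPUS; cpu++) {
		if (cpu == curcpu)
			continue;
		for (int i = 0; i < NENTRIES; i++) {
			total[i].bcnt += percpu[cpu][i].bcnt;
			total[i].pcnt += percpu[cpu][i].pcnt;
		}
	}
}

int main(void)
{
	struct counter percpu[NCPUS][NENTRIES] = { { { 100, 1 } } };
	struct counter total[NENTRIES];

	get_counters(percpu, total, 0);
	printf("entry 0: %llu bytes, %llu packets\n",
	       (unsigned long long)total[0].bcnt,
	       (unsigned long long)total[0].pcnt);
	return 0;
}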
@@ -1098,6 +1112,7 @@ copy_entries_to_user(unsigned int total_size,
1098 struct ip6t_entry *e; 1112 struct ip6t_entry *e;
1099 struct ip6t_counters *counters; 1113 struct ip6t_counters *counters;
1100 int ret = 0; 1114 int ret = 0;
1115 void *loc_cpu_entry;
1101 1116
1102 /* We need atomic snapshot of counters: rest doesn't change 1117 /* We need atomic snapshot of counters: rest doesn't change
1103 (other than comefrom, which userspace doesn't care 1118 (other than comefrom, which userspace doesn't care
@@ -1109,13 +1124,13 @@ copy_entries_to_user(unsigned int total_size,
1109 return -ENOMEM; 1124 return -ENOMEM;
1110 1125
1111 /* First, sum counters... */ 1126 /* First, sum counters... */
1112 memset(counters, 0, countersize);
1113 write_lock_bh(&table->lock); 1127 write_lock_bh(&table->lock);
1114 get_counters(table->private, counters); 1128 get_counters(table->private, counters);
1115 write_unlock_bh(&table->lock); 1129 write_unlock_bh(&table->lock);
1116 1130
 1117 /* ... then copy entire thing from CPU 0... */ 1131 /* choose the copy that is on our node/cpu */
1118 if (copy_to_user(userptr, table->private->entries, total_size) != 0) { 1132 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1133 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
1119 ret = -EFAULT; 1134 ret = -EFAULT;
1120 goto free_counters; 1135 goto free_counters;
1121 } 1136 }
@@ -1127,7 +1142,7 @@ copy_entries_to_user(unsigned int total_size,
1127 struct ip6t_entry_match *m; 1142 struct ip6t_entry_match *m;
1128 struct ip6t_entry_target *t; 1143 struct ip6t_entry_target *t;
1129 1144
1130 e = (struct ip6t_entry *)(table->private->entries + off); 1145 e = (struct ip6t_entry *)(loc_cpu_entry + off);
1131 if (copy_to_user(userptr + off 1146 if (copy_to_user(userptr + off
1132 + offsetof(struct ip6t_entry, counters), 1147 + offsetof(struct ip6t_entry, counters),
1133 &counters[num], 1148 &counters[num],
@@ -1196,6 +1211,46 @@ get_entries(const struct ip6t_get_entries *entries,
1196 return ret; 1211 return ret;
1197} 1212}
1198 1213
1214static void free_table_info(struct ip6t_table_info *info)
1215{
1216 int cpu;
1217 for_each_cpu(cpu) {
1218 if (info->size <= PAGE_SIZE)
1219 kfree(info->entries[cpu]);
1220 else
1221 vfree(info->entries[cpu]);
1222 }
1223 kfree(info);
1224}
1225
1226static struct ip6t_table_info *alloc_table_info(unsigned int size)
1227{
1228 struct ip6t_table_info *newinfo;
1229 int cpu;
1230
1231 newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL);
1232 if (!newinfo)
1233 return NULL;
1234
1235 newinfo->size = size;
1236
1237 for_each_cpu(cpu) {
1238 if (size <= PAGE_SIZE)
1239 newinfo->entries[cpu] = kmalloc_node(size,
1240 GFP_KERNEL,
1241 cpu_to_node(cpu));
1242 else
1243 newinfo->entries[cpu] = vmalloc_node(size,
1244 cpu_to_node(cpu));
1245 if (newinfo->entries[cpu] == NULL) {
1246 free_table_info(newinfo);
1247 return NULL;
1248 }
1249 }
1250
1251 return newinfo;
1252}
1253
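alloc_table_info() replaces the single SMP_ALIGN()-padded allocation with one node-local copy per possible CPU, kmalloc for sub-page tables and vmalloc otherwise, and free_table_info() unwinds whatever was allocated. The allocate-all-or-free-everything shape in user-space miniature, with malloc standing in for both kernel allocators:

#include <stdlib.h>

#define NCPUS 4

struct table_info {
	unsigned int size;
	void *entries[NCPUS];
};

static void free_table_info(struct table_info *info)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		free(info->entries[cpu]);	/* free(NULL) is a no-op */
	free(info);
}

static struct table_info *alloc_table_info(unsigned int size)
{
	struct table_info *info = calloc(1, sizeof(*info));

	if (!info)
		return NULL;
	info->size = size;
	for (int cpu = 0; cpu < NCPUS; cpu++) {
		/* kernel: kmalloc_node() if size <= PAGE_SIZE, else vmalloc_node() */
		info->entries[cpu] = malloc(size);
		if (!info->entries[cpu]) {
			free_table_info(info);	/* unwind the partial allocation */
			return NULL;
		}
	}
	return info;
}

int main(void)
{
	struct table_info *t = alloc_table_info(128);

	if (t)
		free_table_info(t);
	return 0;
}
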
1199static int 1254static int
1200do_replace(void __user *user, unsigned int len) 1255do_replace(void __user *user, unsigned int len)
1201{ 1256{
@@ -1204,6 +1259,7 @@ do_replace(void __user *user, unsigned int len)
1204 struct ip6t_table *t; 1259 struct ip6t_table *t;
1205 struct ip6t_table_info *newinfo, *oldinfo; 1260 struct ip6t_table_info *newinfo, *oldinfo;
1206 struct ip6t_counters *counters; 1261 struct ip6t_counters *counters;
1262 void *loc_cpu_entry, *loc_cpu_old_entry;
1207 1263
1208 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1264 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1209 return -EFAULT; 1265 return -EFAULT;
@@ -1212,13 +1268,13 @@ do_replace(void __user *user, unsigned int len)
1212 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) 1268 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
1213 return -ENOMEM; 1269 return -ENOMEM;
1214 1270
1215 newinfo = vmalloc(sizeof(struct ip6t_table_info) 1271 newinfo = alloc_table_info(tmp.size);
1216 + SMP_ALIGN(tmp.size) *
1217 (highest_possible_processor_id()+1));
1218 if (!newinfo) 1272 if (!newinfo)
1219 return -ENOMEM; 1273 return -ENOMEM;
1220 1274
1221 if (copy_from_user(newinfo->entries, user + sizeof(tmp), 1275 /* choose the copy that is on our node/cpu */
1276 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1277 if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
1222 tmp.size) != 0) { 1278 tmp.size) != 0) {
1223 ret = -EFAULT; 1279 ret = -EFAULT;
1224 goto free_newinfo; 1280 goto free_newinfo;
@@ -1229,10 +1285,9 @@ do_replace(void __user *user, unsigned int len)
1229 ret = -ENOMEM; 1285 ret = -ENOMEM;
1230 goto free_newinfo; 1286 goto free_newinfo;
1231 } 1287 }
1232 memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters));
1233 1288
1234 ret = translate_table(tmp.name, tmp.valid_hooks, 1289 ret = translate_table(tmp.name, tmp.valid_hooks,
1235 newinfo, tmp.size, tmp.num_entries, 1290 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1236 tmp.hook_entry, tmp.underflow); 1291 tmp.hook_entry, tmp.underflow);
1237 if (ret != 0) 1292 if (ret != 0)
1238 goto free_newinfo_counters; 1293 goto free_newinfo_counters;
@@ -1271,8 +1326,9 @@ do_replace(void __user *user, unsigned int len)
1271 /* Get the old counters. */ 1326 /* Get the old counters. */
1272 get_counters(oldinfo, counters); 1327 get_counters(oldinfo, counters);
1273 /* Decrease module usage counts and free resource */ 1328 /* Decrease module usage counts and free resource */
1274 IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); 1329 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1275 vfree(oldinfo); 1330 IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
1331 free_table_info(oldinfo);
1276 if (copy_to_user(tmp.counters, counters, 1332 if (copy_to_user(tmp.counters, counters,
1277 sizeof(struct ip6t_counters) * tmp.num_counters) != 0) 1333 sizeof(struct ip6t_counters) * tmp.num_counters) != 0)
1278 ret = -EFAULT; 1334 ret = -EFAULT;
@@ -1284,11 +1340,11 @@ do_replace(void __user *user, unsigned int len)
1284 module_put(t->me); 1340 module_put(t->me);
1285 up(&ip6t_mutex); 1341 up(&ip6t_mutex);
1286 free_newinfo_counters_untrans: 1342 free_newinfo_counters_untrans:
1287 IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); 1343 IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
1288 free_newinfo_counters: 1344 free_newinfo_counters:
1289 vfree(counters); 1345 vfree(counters);
1290 free_newinfo: 1346 free_newinfo:
1291 vfree(newinfo); 1347 free_table_info(newinfo);
1292 return ret; 1348 return ret;
1293} 1349}
1294 1350
@@ -1321,6 +1377,7 @@ do_add_counters(void __user *user, unsigned int len)
1321 struct ip6t_counters_info tmp, *paddc; 1377 struct ip6t_counters_info tmp, *paddc;
1322 struct ip6t_table *t; 1378 struct ip6t_table *t;
1323 int ret = 0; 1379 int ret = 0;
1380 void *loc_cpu_entry;
1324 1381
1325 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1382 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1326 return -EFAULT; 1383 return -EFAULT;
@@ -1350,7 +1407,9 @@ do_add_counters(void __user *user, unsigned int len)
1350 } 1407 }
1351 1408
1352 i = 0; 1409 i = 0;
1353 IP6T_ENTRY_ITERATE(t->private->entries, 1410 /* Choose the copy that is on our node */
1411 loc_cpu_entry = t->private->entries[smp_processor_id()];
1412 IP6T_ENTRY_ITERATE(loc_cpu_entry,
1354 t->private->size, 1413 t->private->size,
1355 add_counter_to_entry, 1414 add_counter_to_entry,
1356 paddc->counters, 1415 paddc->counters,
@@ -1543,28 +1602,29 @@ int ip6t_register_table(struct ip6t_table *table,
1543 struct ip6t_table_info *newinfo; 1602 struct ip6t_table_info *newinfo;
1544 static struct ip6t_table_info bootstrap 1603 static struct ip6t_table_info bootstrap
1545 = { 0, 0, 0, { 0 }, { 0 }, { } }; 1604 = { 0, 0, 0, { 0 }, { 0 }, { } };
1605 void *loc_cpu_entry;
1546 1606
1547 newinfo = vmalloc(sizeof(struct ip6t_table_info) 1607 newinfo = alloc_table_info(repl->size);
1548 + SMP_ALIGN(repl->size) *
1549 (highest_possible_processor_id()+1));
1550 if (!newinfo) 1608 if (!newinfo)
1551 return -ENOMEM; 1609 return -ENOMEM;
1552 1610
1553 memcpy(newinfo->entries, repl->entries, repl->size); 1611 /* choose the copy on our node/cpu */
1612 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1613 memcpy(loc_cpu_entry, repl->entries, repl->size);
1554 1614
1555 ret = translate_table(table->name, table->valid_hooks, 1615 ret = translate_table(table->name, table->valid_hooks,
1556 newinfo, repl->size, 1616 newinfo, loc_cpu_entry, repl->size,
1557 repl->num_entries, 1617 repl->num_entries,
1558 repl->hook_entry, 1618 repl->hook_entry,
1559 repl->underflow); 1619 repl->underflow);
1560 if (ret != 0) { 1620 if (ret != 0) {
1561 vfree(newinfo); 1621 free_table_info(newinfo);
1562 return ret; 1622 return ret;
1563 } 1623 }
1564 1624
1565 ret = down_interruptible(&ip6t_mutex); 1625 ret = down_interruptible(&ip6t_mutex);
1566 if (ret != 0) { 1626 if (ret != 0) {
1567 vfree(newinfo); 1627 free_table_info(newinfo);
1568 return ret; 1628 return ret;
1569 } 1629 }
1570 1630
@@ -1593,20 +1653,23 @@ int ip6t_register_table(struct ip6t_table *table,
1593 return ret; 1653 return ret;
1594 1654
1595 free_unlock: 1655 free_unlock:
1596 vfree(newinfo); 1656 free_table_info(newinfo);
1597 goto unlock; 1657 goto unlock;
1598} 1658}
1599 1659
1600void ip6t_unregister_table(struct ip6t_table *table) 1660void ip6t_unregister_table(struct ip6t_table *table)
1601{ 1661{
1662 void *loc_cpu_entry;
1663
1602 down(&ip6t_mutex); 1664 down(&ip6t_mutex);
1603 LIST_DELETE(&ip6t_tables, table); 1665 LIST_DELETE(&ip6t_tables, table);
1604 up(&ip6t_mutex); 1666 up(&ip6t_mutex);
1605 1667
1606 /* Decrease module usage counts and free resources */ 1668 /* Decrease module usage counts and free resources */
1607 IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, 1669 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1670 IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
1608 cleanup_entry, NULL); 1671 cleanup_entry, NULL);
1609 vfree(table->private); 1672 free_table_info(table->private);
1610} 1673}
1611 1674
1612/* Returns 1 if the port is matched by the range, 0 otherwise */ 1675/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0cd1d1bd9033..ae4653bfd654 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/if_arp.h>
16#include <linux/ip.h> 17#include <linux/ip.h>
17#include <linux/spinlock.h> 18#include <linux/spinlock.h>
18#include <linux/icmpv6.h> 19#include <linux/icmpv6.h>
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index dde37793d20b..268918d5deea 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/ip.h>
12#include <linux/ipv6.h> 13#include <linux/ipv6.h>
13#include <linux/types.h> 14#include <linux/types.h>
14#include <net/checksum.h> 15#include <net/checksum.h>
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index 24bc0cde43a1..65937de1b58c 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/ip.h>
12#include <linux/ipv6.h> 13#include <linux/ipv6.h>
13#include <linux/types.h> 14#include <linux/types.h>
14#include <net/checksum.h> 15#include <net/checksum.h>
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c2c52af9e560..f3e5ffbd592f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -98,7 +98,7 @@ struct nf_ct_frag6_queue
98#define FRAG6Q_HASHSZ 64 98#define FRAG6Q_HASHSZ 64
99 99
100static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; 100static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ];
101static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; 101static DEFINE_RWLOCK(nf_ct_frag6_lock);
102static u32 nf_ct_frag6_hash_rnd; 102static u32 nf_ct_frag6_hash_rnd;
103static LIST_HEAD(nf_ct_frag6_lru_list); 103static LIST_HEAD(nf_ct_frag6_lru_list);
104int nf_ct_frag6_nqueues = 0; 104int nf_ct_frag6_nqueues = 0;
@@ -371,7 +371,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct
371 init_timer(&fq->timer); 371 init_timer(&fq->timer);
372 fq->timer.function = nf_ct_frag6_expire; 372 fq->timer.function = nf_ct_frag6_expire;
373 fq->timer.data = (long) fq; 373 fq->timer.data = (long) fq;
374 fq->lock = SPIN_LOCK_UNLOCKED; 374 spin_lock_init(&fq->lock);
375 atomic_set(&fq->refcnt, 1); 375 atomic_set(&fq->refcnt, 1);
376 376
377 return nf_ct_frag6_intern(hash, fq); 377 return nf_ct_frag6_intern(hash, fq);
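
This hunk and the mcast.c one above retire the open-coded RW_LOCK_UNLOCKED / SPIN_LOCK_UNLOCKED initializers: DEFINE_RWLOCK() for locks with static storage, rwlock_init() / spin_lock_init() for locks embedded in runtime-allocated objects. pthreads draws the same static-versus-runtime line, which gives a runnable analogue:

#include <pthread.h>
#include <stdlib.h>

/* Static storage: an initializer macro is fine (DEFINE_RWLOCK analogue). */
static pthread_rwlock_t frag_lock = PTHREAD_RWLOCK_INITIALIZER;

struct frag_queue {
	pthread_spinlock_t lock;	/* embedded in a heap object */
};

static struct frag_queue *frag_create(void)
{
	struct frag_queue *fq = malloc(sizeof(*fq));

	if (fq)
		/* Runtime init, like spin_lock_init(&fq->lock). */
		pthread_spin_init(&fq->lock, PTHREAD_PROCESS_PRIVATE);
	return fq;
}

int main(void)
{
	struct frag_queue *fq = frag_create();

	pthread_rwlock_rdlock(&frag_lock);
	pthread_rwlock_unlock(&frag_lock);
	if (fq) {
		pthread_spin_destroy(&fq->lock);
		free(fq);
	}
	return 0;
}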
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a66900cda2af..66f1d12ea578 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -32,6 +32,7 @@
32#include <linux/icmpv6.h> 32#include <linux/icmpv6.h>
33#include <linux/netfilter.h> 33#include <linux/netfilter.h>
34#include <linux/netfilter_ipv6.h> 34#include <linux/netfilter_ipv6.h>
35#include <linux/skbuff.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/ioctls.h> 37#include <asm/ioctls.h>
37#include <asm/bug.h> 38#include <asm/bug.h>
@@ -433,25 +434,14 @@ out:
433 return err; 434 return err;
434 435
435csum_copy_err: 436csum_copy_err:
436 /* Clear queue. */ 437 skb_kill_datagram(sk, skb, flags);
437 if (flags&MSG_PEEK) {
438 int clear = 0;
439 spin_lock_bh(&sk->sk_receive_queue.lock);
440 if (skb == skb_peek(&sk->sk_receive_queue)) {
441 __skb_unlink(skb, &sk->sk_receive_queue);
442 clear = 1;
443 }
444 spin_unlock_bh(&sk->sk_receive_queue.lock);
445 if (clear)
446 kfree_skb(skb);
447 }
448 438
449 /* Error for blocking case is chosen to masquerade 439 /* Error for blocking case is chosen to masquerade
450 as some normal condition. 440 as some normal condition.
451 */ 441 */
452 err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; 442 err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
453 /* FIXME: increment a raw6 drops counter here */ 443 /* FIXME: increment a raw6 drops counter here */
454 goto out_free; 444 goto out;
455} 445}
456 446
457static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, 447static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
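
skb_kill_datagram() absorbs exactly the dance the old error path open-coded: under the receive-queue lock, unlink the peeked skb only if it is still the queue head, then free it. A generic restatement over a plain list; the queue lock is elided and the non-peek path assumes the packet was already unlinked by the receive path:

#include <stdbool.h>
#include <stdlib.h>

struct pkt { struct pkt *next; };

struct queue { struct pkt *head; /* lock elided in this sketch */ };

/* What skb_kill_datagram() boils down to for this error path. */
static void kill_datagram(struct queue *q, struct pkt *p, bool peeked)
{
	if (peeked) {
		if (q->head == p) {	/* still unconsumed: unlink and free */
			q->head = p->next;
			free(p);
		}
		/* otherwise someone else consumed it first: nothing to do */
	} else {
		free(p);	/* already unlinked by the receive path */
	}
}

int main(void)
{
	struct pkt *p = malloc(sizeof(*p));
	struct queue q = { .head = p };

	p->next = NULL;
	kill_datagram(&q, p, true);
	return q.head == NULL ? 0 : 1;
}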
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8827389abaf7..2947bc56d8a0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -48,6 +48,7 @@
48#include <net/tcp.h> 48#include <net/tcp.h>
49#include <net/ndisc.h> 49#include <net/ndisc.h>
50#include <net/inet6_hashtables.h> 50#include <net/inet6_hashtables.h>
51#include <net/inet6_connection_sock.h>
51#include <net/ipv6.h> 52#include <net/ipv6.h>
52#include <net/transp_v6.h> 53#include <net/transp_v6.h>
53#include <net/addrconf.h> 54#include <net/addrconf.h>
@@ -59,6 +60,7 @@
59#include <net/addrconf.h> 60#include <net/addrconf.h>
60#include <net/snmp.h> 61#include <net/snmp.h>
61#include <net/dsfield.h> 62#include <net/dsfield.h>
63#include <net/timewait_sock.h>
62 64
63#include <asm/uaccess.h> 65#include <asm/uaccess.h>
64 66
@@ -67,224 +69,33 @@
67 69
68static void tcp_v6_send_reset(struct sk_buff *skb); 70static void tcp_v6_send_reset(struct sk_buff *skb);
69static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); 71static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
70static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 72static void tcp_v6_send_check(struct sock *sk, int len,
71 struct sk_buff *skb); 73 struct sk_buff *skb);
72 74
73static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); 75static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
74static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
75 76
76static struct tcp_func ipv6_mapped; 77static struct inet_connection_sock_af_ops ipv6_mapped;
77static struct tcp_func ipv6_specific; 78static struct inet_connection_sock_af_ops ipv6_specific;
78 79
79static inline int tcp_v6_bind_conflict(const struct sock *sk,
80 const struct inet_bind_bucket *tb)
81{
82 const struct sock *sk2;
83 const struct hlist_node *node;
84
85 /* We must walk the whole port owner list in this case. -DaveM */
86 sk_for_each_bound(sk2, node, &tb->owners) {
87 if (sk != sk2 &&
88 (!sk->sk_bound_dev_if ||
89 !sk2->sk_bound_dev_if ||
90 sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
91 (!sk->sk_reuse || !sk2->sk_reuse ||
92 sk2->sk_state == TCP_LISTEN) &&
93 ipv6_rcv_saddr_equal(sk, sk2))
94 break;
95 }
96
97 return node != NULL;
98}
99
100/* Grrr, addr_type already calculated by caller, but I don't want
101 * to add some silly "cookie" argument to this method just for that.
102 * But it doesn't matter, the recalculation is in the rarest path
103 * this function ever takes.
104 */
105static int tcp_v6_get_port(struct sock *sk, unsigned short snum) 80static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
106{ 81{
107 struct inet_bind_hashbucket *head; 82 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
108 struct inet_bind_bucket *tb; 83 inet6_csk_bind_conflict);
109 struct hlist_node *node;
110 int ret;
111
112 local_bh_disable();
113 if (snum == 0) {
114 int low = sysctl_local_port_range[0];
115 int high = sysctl_local_port_range[1];
116 int remaining = (high - low) + 1;
117 int rover = net_random() % (high - low) + low;
118
119 do {
120 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
121 spin_lock(&head->lock);
122 inet_bind_bucket_for_each(tb, node, &head->chain)
123 if (tb->port == rover)
124 goto next;
125 break;
126 next:
127 spin_unlock(&head->lock);
128 if (++rover > high)
129 rover = low;
130 } while (--remaining > 0);
131
132 /* Exhausted local port range during search? It is not
133 * possible for us to be holding one of the bind hash
134 * locks if this test triggers, because if 'remaining'
135 * drops to zero, we broke out of the do/while loop at
136 * the top level, not from the 'break;' statement.
137 */
138 ret = 1;
139 if (unlikely(remaining <= 0))
140 goto fail;
141
142 /* OK, here is the one we will use. */
143 snum = rover;
144 } else {
145 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
146 spin_lock(&head->lock);
147 inet_bind_bucket_for_each(tb, node, &head->chain)
148 if (tb->port == snum)
149 goto tb_found;
150 }
151 tb = NULL;
152 goto tb_not_found;
153tb_found:
154 if (tb && !hlist_empty(&tb->owners)) {
155 if (tb->fastreuse > 0 && sk->sk_reuse &&
156 sk->sk_state != TCP_LISTEN) {
157 goto success;
158 } else {
159 ret = 1;
160 if (tcp_v6_bind_conflict(sk, tb))
161 goto fail_unlock;
162 }
163 }
164tb_not_found:
165 ret = 1;
166 if (tb == NULL) {
167 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
168 if (tb == NULL)
169 goto fail_unlock;
170 }
171 if (hlist_empty(&tb->owners)) {
172 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
173 tb->fastreuse = 1;
174 else
175 tb->fastreuse = 0;
176 } else if (tb->fastreuse &&
177 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
178 tb->fastreuse = 0;
179
180success:
181 if (!inet_csk(sk)->icsk_bind_hash)
182 inet_bind_hash(sk, tb, snum);
183 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
184 ret = 0;
185
186fail_unlock:
187 spin_unlock(&head->lock);
188fail:
189 local_bh_enable();
190 return ret;
191}
192
193static __inline__ void __tcp_v6_hash(struct sock *sk)
194{
195 struct hlist_head *list;
196 rwlock_t *lock;
197
198 BUG_TRAP(sk_unhashed(sk));
199
200 if (sk->sk_state == TCP_LISTEN) {
201 list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
202 lock = &tcp_hashinfo.lhash_lock;
203 inet_listen_wlock(&tcp_hashinfo);
204 } else {
205 unsigned int hash;
206 sk->sk_hash = hash = inet6_sk_ehashfn(sk);
207 hash &= (tcp_hashinfo.ehash_size - 1);
208 list = &tcp_hashinfo.ehash[hash].chain;
209 lock = &tcp_hashinfo.ehash[hash].lock;
210 write_lock(lock);
211 }
212
213 __sk_add_node(sk, list);
214 sock_prot_inc_use(sk->sk_prot);
215 write_unlock(lock);
216} 84}
217 85
218
219static void tcp_v6_hash(struct sock *sk) 86static void tcp_v6_hash(struct sock *sk)
220{ 87{
221 if (sk->sk_state != TCP_CLOSE) { 88 if (sk->sk_state != TCP_CLOSE) {
222 struct tcp_sock *tp = tcp_sk(sk); 89 if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
223
224 if (tp->af_specific == &ipv6_mapped) {
225 tcp_prot.hash(sk); 90 tcp_prot.hash(sk);
226 return; 91 return;
227 } 92 }
228 local_bh_disable(); 93 local_bh_disable();
229 __tcp_v6_hash(sk); 94 __inet6_hash(&tcp_hashinfo, sk);
230 local_bh_enable(); 95 local_bh_enable();
231 } 96 }
232} 97}
233 98
234/*
235 * Open request hash tables.
236 */
237
238static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
239{
240 u32 a, b, c;
241
242 a = raddr->s6_addr32[0];
243 b = raddr->s6_addr32[1];
244 c = raddr->s6_addr32[2];
245
246 a += JHASH_GOLDEN_RATIO;
247 b += JHASH_GOLDEN_RATIO;
248 c += rnd;
249 __jhash_mix(a, b, c);
250
251 a += raddr->s6_addr32[3];
252 b += (u32) rport;
253 __jhash_mix(a, b, c);
254
255 return c & (TCP_SYNQ_HSIZE - 1);
256}
257
258static struct request_sock *tcp_v6_search_req(const struct sock *sk,
259 struct request_sock ***prevp,
260 __u16 rport,
261 struct in6_addr *raddr,
262 struct in6_addr *laddr,
263 int iif)
264{
265 const struct inet_connection_sock *icsk = inet_csk(sk);
266 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
267 struct request_sock *req, **prev;
268
269 for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
270 (req = *prev) != NULL;
271 prev = &req->dl_next) {
272 const struct tcp6_request_sock *treq = tcp6_rsk(req);
273
274 if (inet_rsk(req)->rmt_port == rport &&
275 req->rsk_ops->family == AF_INET6 &&
276 ipv6_addr_equal(&treq->rmt_addr, raddr) &&
277 ipv6_addr_equal(&treq->loc_addr, laddr) &&
278 (!treq->iif || treq->iif == iif)) {
279 BUG_TRAP(req->sk == NULL);
280 *prevp = prev;
281 return req;
282 }
283 }
284
285 return NULL;
286}
287
288static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, 99static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len,
289 struct in6_addr *saddr, 100 struct in6_addr *saddr,
290 struct in6_addr *daddr, 101 struct in6_addr *daddr,
@@ -308,195 +119,12 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
308 } 119 }
309} 120}
310 121
311static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
312 struct inet_timewait_sock **twp)
313{
314 struct inet_sock *inet = inet_sk(sk);
315 const struct ipv6_pinfo *np = inet6_sk(sk);
316 const struct in6_addr *daddr = &np->rcv_saddr;
317 const struct in6_addr *saddr = &np->daddr;
318 const int dif = sk->sk_bound_dev_if;
319 const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
320 unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport);
321 struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
322 struct sock *sk2;
323 const struct hlist_node *node;
324 struct inet_timewait_sock *tw;
325
326 prefetch(head->chain.first);
327 write_lock(&head->lock);
328
329 /* Check TIME-WAIT sockets first. */
330 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
331 const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
332
333 tw = inet_twsk(sk2);
334
335 if(*((__u32 *)&(tw->tw_dport)) == ports &&
336 sk2->sk_family == PF_INET6 &&
337 ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) &&
338 ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) &&
339 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
340 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
341 struct tcp_sock *tp = tcp_sk(sk);
342
343 if (tcptw->tw_ts_recent_stamp &&
344 (!twp ||
345 (sysctl_tcp_tw_reuse &&
346 xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
347 /* See comment in tcp_ipv4.c */
348 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
349 if (!tp->write_seq)
350 tp->write_seq = 1;
351 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
352 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
353 sock_hold(sk2);
354 goto unique;
355 } else
356 goto not_unique;
357 }
358 }
359 tw = NULL;
360
361 /* And established part... */
362 sk_for_each(sk2, node, &head->chain) {
363 if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
364 goto not_unique;
365 }
366
367unique:
368 BUG_TRAP(sk_unhashed(sk));
369 __sk_add_node(sk, &head->chain);
370 sk->sk_hash = hash;
371 sock_prot_inc_use(sk->sk_prot);
372 write_unlock(&head->lock);
373
374 if (twp) {
375 *twp = tw;
376 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
377 } else if (tw) {
378 /* Silly. Should hash-dance instead... */
379 inet_twsk_deschedule(tw, &tcp_death_row);
380 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
381
382 inet_twsk_put(tw);
383 }
384 return 0;
385
386not_unique:
387 write_unlock(&head->lock);
388 return -EADDRNOTAVAIL;
389}
390
391static inline u32 tcpv6_port_offset(const struct sock *sk)
392{
393 const struct inet_sock *inet = inet_sk(sk);
394 const struct ipv6_pinfo *np = inet6_sk(sk);
395
396 return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
397 np->daddr.s6_addr32,
398 inet->dport);
399}
400
401static int tcp_v6_hash_connect(struct sock *sk)
402{
403 unsigned short snum = inet_sk(sk)->num;
404 struct inet_bind_hashbucket *head;
405 struct inet_bind_bucket *tb;
406 int ret;
407
408 if (!snum) {
409 int low = sysctl_local_port_range[0];
410 int high = sysctl_local_port_range[1];
411 int range = high - low;
412 int i;
413 int port;
414 static u32 hint;
415 u32 offset = hint + tcpv6_port_offset(sk);
416 struct hlist_node *node;
417 struct inet_timewait_sock *tw = NULL;
418
419 local_bh_disable();
420 for (i = 1; i <= range; i++) {
421 port = low + (i + offset) % range;
422 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
423 spin_lock(&head->lock);
424
425 /* Does not bother with rcv_saddr checks,
426 * because the established check is already
427 * unique enough.
428 */
429 inet_bind_bucket_for_each(tb, node, &head->chain) {
430 if (tb->port == port) {
431 BUG_TRAP(!hlist_empty(&tb->owners));
432 if (tb->fastreuse >= 0)
433 goto next_port;
434 if (!__tcp_v6_check_established(sk,
435 port,
436 &tw))
437 goto ok;
438 goto next_port;
439 }
440 }
441
442 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
443 if (!tb) {
444 spin_unlock(&head->lock);
445 break;
446 }
447 tb->fastreuse = -1;
448 goto ok;
449
450 next_port:
451 spin_unlock(&head->lock);
452 }
453 local_bh_enable();
454
455 return -EADDRNOTAVAIL;
456
457ok:
458 hint += i;
459
460 /* Head lock still held and bh's disabled */
461 inet_bind_hash(sk, tb, port);
462 if (sk_unhashed(sk)) {
463 inet_sk(sk)->sport = htons(port);
464 __tcp_v6_hash(sk);
465 }
466 spin_unlock(&head->lock);
467
468 if (tw) {
469 inet_twsk_deschedule(tw, &tcp_death_row);
470 inet_twsk_put(tw);
471 }
472
473 ret = 0;
474 goto out;
475 }
476
477 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
478 tb = inet_csk(sk)->icsk_bind_hash;
479 spin_lock_bh(&head->lock);
480
481 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
482 __tcp_v6_hash(sk);
483 spin_unlock_bh(&head->lock);
484 return 0;
485 } else {
486 spin_unlock(&head->lock);
487 /* No definite answer... Walk to established hash table */
488 ret = __tcp_v6_check_established(sk, snum, NULL);
489out:
490 local_bh_enable();
491 return ret;
492 }
493}
494
495static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 122static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
496 int addr_len) 123 int addr_len)
497{ 124{
498 struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; 125 struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
499 struct inet_sock *inet = inet_sk(sk); 126 struct inet_sock *inet = inet_sk(sk);
127 struct inet_connection_sock *icsk = inet_csk(sk);
500 struct ipv6_pinfo *np = inet6_sk(sk); 128 struct ipv6_pinfo *np = inet6_sk(sk);
501 struct tcp_sock *tp = tcp_sk(sk); 129 struct tcp_sock *tp = tcp_sk(sk);
502 struct in6_addr *saddr = NULL, *final_p = NULL, final; 130 struct in6_addr *saddr = NULL, *final_p = NULL, final;
@@ -571,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
571 */ 199 */
572 200
573 if (addr_type == IPV6_ADDR_MAPPED) { 201 if (addr_type == IPV6_ADDR_MAPPED) {
574 u32 exthdrlen = tp->ext_header_len; 202 u32 exthdrlen = icsk->icsk_ext_hdr_len;
575 struct sockaddr_in sin; 203 struct sockaddr_in sin;
576 204
577 SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); 205 SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -583,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
583 sin.sin_port = usin->sin6_port; 211 sin.sin_port = usin->sin6_port;
584 sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; 212 sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
585 213
586 tp->af_specific = &ipv6_mapped; 214 icsk->icsk_af_ops = &ipv6_mapped;
587 sk->sk_backlog_rcv = tcp_v4_do_rcv; 215 sk->sk_backlog_rcv = tcp_v4_do_rcv;
588 216
589 err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); 217 err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
590 218
591 if (err) { 219 if (err) {
592 tp->ext_header_len = exthdrlen; 220 icsk->icsk_ext_hdr_len = exthdrlen;
593 tp->af_specific = &ipv6_specific; 221 icsk->icsk_af_ops = &ipv6_specific;
594 sk->sk_backlog_rcv = tcp_v6_do_rcv; 222 sk->sk_backlog_rcv = tcp_v6_do_rcv;
595 goto failure; 223 goto failure;
596 } else { 224 } else {
@@ -643,16 +271,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
643 sk->sk_route_caps = dst->dev->features & 271 sk->sk_route_caps = dst->dev->features &
644 ~(NETIF_F_IP_CSUM | NETIF_F_TSO); 272 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
645 273
646 tp->ext_header_len = 0; 274 icsk->icsk_ext_hdr_len = 0;
647 if (np->opt) 275 if (np->opt)
648 tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; 276 icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
277 np->opt->opt_nflen);
649 278
650 tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); 279 tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
651 280
652 inet->dport = usin->sin6_port; 281 inet->dport = usin->sin6_port;
653 282
654 tcp_set_state(sk, TCP_SYN_SENT); 283 tcp_set_state(sk, TCP_SYN_SENT);
655 err = tcp_v6_hash_connect(sk); 284 err = inet6_hash_connect(&tcp_death_row, sk);
656 if (err) 285 if (err)
657 goto late_failure; 286 goto late_failure;
658 287
@@ -758,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
758 } else 387 } else
759 dst_hold(dst); 388 dst_hold(dst);
760 389
761 if (tp->pmtu_cookie > dst_mtu(dst)) { 390 if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
762 tcp_sync_mss(sk, dst_mtu(dst)); 391 tcp_sync_mss(sk, dst_mtu(dst));
763 tcp_simple_retransmit(sk); 392 tcp_simple_retransmit(sk);
764 } /* else let the usual retransmit timer handle it */ 393 } /* else let the usual retransmit timer handle it */
@@ -775,8 +404,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
775 if (sock_owned_by_user(sk)) 404 if (sock_owned_by_user(sk))
776 goto out; 405 goto out;
777 406
778 req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, 407 req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
779 &hdr->saddr, inet6_iif(skb)); 408 &hdr->saddr, inet6_iif(skb));
780 if (!req) 409 if (!req)
781 goto out; 410 goto out;
782 411
@@ -822,7 +451,7 @@ out:
822static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, 451static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
823 struct dst_entry *dst) 452 struct dst_entry *dst)
824{ 453{
825 struct tcp6_request_sock *treq = tcp6_rsk(req); 454 struct inet6_request_sock *treq = inet6_rsk(req);
826 struct ipv6_pinfo *np = inet6_sk(sk); 455 struct ipv6_pinfo *np = inet6_sk(sk);
827 struct sk_buff * skb; 456 struct sk_buff * skb;
828 struct ipv6_txoptions *opt = NULL; 457 struct ipv6_txoptions *opt = NULL;
@@ -888,8 +517,8 @@ done:
888 517
889static void tcp_v6_reqsk_destructor(struct request_sock *req) 518static void tcp_v6_reqsk_destructor(struct request_sock *req)
890{ 519{
891 if (tcp6_rsk(req)->pktopts) 520 if (inet6_rsk(req)->pktopts)
892 kfree_skb(tcp6_rsk(req)->pktopts); 521 kfree_skb(inet6_rsk(req)->pktopts);
893} 522}
894 523
895static struct request_sock_ops tcp6_request_sock_ops = { 524static struct request_sock_ops tcp6_request_sock_ops = {
@@ -901,26 +530,15 @@ static struct request_sock_ops tcp6_request_sock_ops = {
901 .send_reset = tcp_v6_send_reset 530 .send_reset = tcp_v6_send_reset
902}; 531};
903 532
904static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) 533static struct timewait_sock_ops tcp6_timewait_sock_ops = {
905{ 534 .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
906 struct ipv6_pinfo *np = inet6_sk(sk); 535 .twsk_unique = tcp_twsk_unique,
907 struct inet6_skb_parm *opt = IP6CB(skb); 536};
908
909 if (np->rxopt.all) {
910 if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
911 ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) ||
912 (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) ||
913 ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
914 return 1;
915 }
916 return 0;
917}
918
919 537
920static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 538static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
921 struct sk_buff *skb)
922{ 539{
923 struct ipv6_pinfo *np = inet6_sk(sk); 540 struct ipv6_pinfo *np = inet6_sk(sk);
541 struct tcphdr *th = skb->h.th;
924 542
925 if (skb->ip_summed == CHECKSUM_HW) { 543 if (skb->ip_summed == CHECKSUM_HW) {
926 th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); 544 th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0);
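Note: tcp6_timewait_sock_ops (introduced above) bundles the per-protocol TIME-WAIT parameters that previously sat directly in struct proto. Judging from the two fields set here, the ops structure looks roughly like the sketch below; twsk_unique() lets a connecting socket decide whether an existing TIME-WAIT slot for the same 4-tuple may be recycled:

    /* Approximate shape, inferred from the initializer above: */
    struct timewait_sock_ops {
            unsigned int    twsk_obj_size;                  /* kmem size of tw socks */
            int             (*twsk_unique)(struct sock *sk, /* may we reuse sktw? */
                                           struct sock *sktw, void *twp);
    };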
@@ -1091,8 +709,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
1091 struct sock *nsk; 709 struct sock *nsk;
1092 710
1093 /* Find possible connection requests. */ 711 /* Find possible connection requests. */
1094 req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, 712 req = inet6_csk_search_req(sk, &prev, th->source,
1095 &skb->nh.ipv6h->daddr, inet6_iif(skb)); 713 &skb->nh.ipv6h->saddr,
714 &skb->nh.ipv6h->daddr, inet6_iif(skb));
1096 if (req) 715 if (req)
1097 return tcp_check_req(sk, skb, req, prev); 716 return tcp_check_req(sk, skb, req, prev);
1098 717
@@ -1116,23 +735,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
1116 return sk; 735 return sk;
1117} 736}
1118 737
1119static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
1120{
1121 struct inet_connection_sock *icsk = inet_csk(sk);
1122 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
1123 const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
1124
1125 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
1126 inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
1127}
1128
1129
1130/* FIXME: this is substantially similar to the ipv4 code. 738/* FIXME: this is substantially similar to the ipv4 code.
1131 * Can some kind of merge be done? -- erics 739 * Can some kind of merge be done? -- erics
1132 */ 740 */
1133static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) 741static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1134{ 742{
1135 struct tcp6_request_sock *treq; 743 struct inet6_request_sock *treq;
1136 struct ipv6_pinfo *np = inet6_sk(sk); 744 struct ipv6_pinfo *np = inet6_sk(sk);
1137 struct tcp_options_received tmp_opt; 745 struct tcp_options_received tmp_opt;
1138 struct tcp_sock *tp = tcp_sk(sk); 746 struct tcp_sock *tp = tcp_sk(sk);
@@ -1157,7 +765,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1157 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) 765 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1158 goto drop; 766 goto drop;
1159 767
1160 req = reqsk_alloc(&tcp6_request_sock_ops); 768 req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
1161 if (req == NULL) 769 if (req == NULL)
1162 goto drop; 770 goto drop;
1163 771
@@ -1170,7 +778,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1170 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 778 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1171 tcp_openreq_init(req, &tmp_opt, skb); 779 tcp_openreq_init(req, &tmp_opt, skb);
1172 780
1173 treq = tcp6_rsk(req); 781 treq = inet6_rsk(req);
1174 ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr); 782 ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
1175 ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr); 783 ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
1176 TCP_ECN_create_request(req, skb->h.th); 784 TCP_ECN_create_request(req, skb->h.th);
@@ -1196,8 +804,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
1196 if (tcp_v6_send_synack(sk, req, NULL)) 804 if (tcp_v6_send_synack(sk, req, NULL))
1197 goto drop; 805 goto drop;
1198 806
1199 tcp_v6_synq_add(sk, req); 807 inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1200
1201 return 0; 808 return 0;
1202 809
1203drop: 810drop:
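Note: the deleted tcp_v6_synq_add() (see the removed lines earlier in this file) survives as the generic inet6_csk_reqsk_queue_hash_add(). A sketch mirroring the deleted body, with the hash helper's name assumed:

    /* Sketch, following the removed tcp_v6_synq_add(); the real helper
     * lives in net/ipv6/inet6_connection_sock.c. */
    void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
                                        struct request_sock *req,
                                        unsigned long timeout)
    {
            struct inet_connection_sock *icsk = inet_csk(sk);
            struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
            const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
                                          inet_rsk(req)->rmt_port,
                                          lopt->hash_rnd);  /* name assumed */

            reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
            inet_csk_reqsk_queue_added(sk, timeout);
    }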
@@ -1212,7 +819,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1212 struct request_sock *req, 819 struct request_sock *req,
1213 struct dst_entry *dst) 820 struct dst_entry *dst)
1214{ 821{
1215 struct tcp6_request_sock *treq = tcp6_rsk(req); 822 struct inet6_request_sock *treq = inet6_rsk(req);
1216 struct ipv6_pinfo *newnp, *np = inet6_sk(sk); 823 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
1217 struct tcp6_sock *newtcp6sk; 824 struct tcp6_sock *newtcp6sk;
1218 struct inet_sock *newinet; 825 struct inet_sock *newinet;
@@ -1247,7 +854,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1247 854
1248 ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); 855 ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
1249 856
1250 newtp->af_specific = &ipv6_mapped; 857 inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
1251 newsk->sk_backlog_rcv = tcp_v4_do_rcv; 858 newsk->sk_backlog_rcv = tcp_v4_do_rcv;
1252 newnp->pktoptions = NULL; 859 newnp->pktoptions = NULL;
1253 newnp->opt = NULL; 860 newnp->opt = NULL;
@@ -1261,10 +868,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1261 */ 868 */
1262 869
1263 /* This is a tricky place. Until this moment the IPv4 TCP code 870
1264 worked with IPv6 af_tcp.af_specific. 871 worked with IPv6 icsk.icsk_af_ops.
1265 Sync it now. 872 Sync it now.
1266 */ 873 */
1267 tcp_sync_mss(newsk, newtp->pmtu_cookie); 874 tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
1268 875
1269 return newsk; 876 return newsk;
1270 } 877 }
@@ -1371,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1371 sock_kfree_s(sk, opt, opt->tot_len); 978 sock_kfree_s(sk, opt, opt->tot_len);
1372 } 979 }
1373 980
1374 newtp->ext_header_len = 0; 981 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1375 if (newnp->opt) 982 if (newnp->opt)
1376 newtp->ext_header_len = newnp->opt->opt_nflen + 983 inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
1377 newnp->opt->opt_flen; 984 newnp->opt->opt_flen);
1378 985
1379 tcp_sync_mss(newsk, dst_mtu(dst)); 986 tcp_sync_mss(newsk, dst_mtu(dst));
1380 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 987 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
@@ -1382,7 +989,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1382 989
1383 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; 990 newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
1384 991
1385 __tcp_v6_hash(newsk); 992 __inet6_hash(&tcp_hashinfo, newsk);
1386 inet_inherit_port(&tcp_hashinfo, sk, newsk); 993 inet_inherit_port(&tcp_hashinfo, sk, newsk);
1387 994
1388 return newsk; 995 return newsk;
@@ -1679,139 +1286,16 @@ do_time_wait:
1679 goto discard_it; 1286 goto discard_it;
1680} 1287}
1681 1288
1682static int tcp_v6_rebuild_header(struct sock *sk)
1683{
1684 int err;
1685 struct dst_entry *dst;
1686 struct ipv6_pinfo *np = inet6_sk(sk);
1687
1688 dst = __sk_dst_check(sk, np->dst_cookie);
1689
1690 if (dst == NULL) {
1691 struct inet_sock *inet = inet_sk(sk);
1692 struct in6_addr *final_p = NULL, final;
1693 struct flowi fl;
1694
1695 memset(&fl, 0, sizeof(fl));
1696 fl.proto = IPPROTO_TCP;
1697 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
1698 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
1699 fl.fl6_flowlabel = np->flow_label;
1700 fl.oif = sk->sk_bound_dev_if;
1701 fl.fl_ip_dport = inet->dport;
1702 fl.fl_ip_sport = inet->sport;
1703
1704 if (np->opt && np->opt->srcrt) {
1705 struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
1706 ipv6_addr_copy(&final, &fl.fl6_dst);
1707 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
1708 final_p = &final;
1709 }
1710
1711 err = ip6_dst_lookup(sk, &dst, &fl);
1712 if (err) {
1713 sk->sk_route_caps = 0;
1714 return err;
1715 }
1716 if (final_p)
1717 ipv6_addr_copy(&fl.fl6_dst, final_p);
1718
1719 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
1720 sk->sk_err_soft = -err;
1721 return err;
1722 }
1723
1724 ip6_dst_store(sk, dst, NULL);
1725 sk->sk_route_caps = dst->dev->features &
1726 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
1727 }
1728
1729 return 0;
1730}
1731
1732static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok)
1733{
1734 struct sock *sk = skb->sk;
1735 struct inet_sock *inet = inet_sk(sk);
1736 struct ipv6_pinfo *np = inet6_sk(sk);
1737 struct flowi fl;
1738 struct dst_entry *dst;
1739 struct in6_addr *final_p = NULL, final;
1740
1741 memset(&fl, 0, sizeof(fl));
1742 fl.proto = IPPROTO_TCP;
1743 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
1744 ipv6_addr_copy(&fl.fl6_src, &np->saddr);
1745 fl.fl6_flowlabel = np->flow_label;
1746 IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
1747 fl.oif = sk->sk_bound_dev_if;
1748 fl.fl_ip_sport = inet->sport;
1749 fl.fl_ip_dport = inet->dport;
1750
1751 if (np->opt && np->opt->srcrt) {
1752 struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
1753 ipv6_addr_copy(&final, &fl.fl6_dst);
1754 ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
1755 final_p = &final;
1756 }
1757
1758 dst = __sk_dst_check(sk, np->dst_cookie);
1759
1760 if (dst == NULL) {
1761 int err = ip6_dst_lookup(sk, &dst, &fl);
1762
1763 if (err) {
1764 sk->sk_err_soft = -err;
1765 return err;
1766 }
1767
1768 if (final_p)
1769 ipv6_addr_copy(&fl.fl6_dst, final_p);
1770
1771 if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
1772 sk->sk_route_caps = 0;
1773 return err;
1774 }
1775
1776 ip6_dst_store(sk, dst, NULL);
1777 sk->sk_route_caps = dst->dev->features &
1778 ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
1779 }
1780
1781 skb->dst = dst_clone(dst);
1782
1783 /* Restore final destination back after routing done */
1784 ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
1785
1786 return ip6_xmit(sk, skb, &fl, np->opt, 0);
1787}
1788
1789static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1790{
1791 struct ipv6_pinfo *np = inet6_sk(sk);
1792 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
1793
1794 sin6->sin6_family = AF_INET6;
1795 ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
1796 sin6->sin6_port = inet_sk(sk)->dport;
1797 /* We do not store received flowlabel for TCP */
1798 sin6->sin6_flowinfo = 0;
1799 sin6->sin6_scope_id = 0;
1800 if (sk->sk_bound_dev_if &&
1801 ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
1802 sin6->sin6_scope_id = sk->sk_bound_dev_if;
1803}
1804
1805static int tcp_v6_remember_stamp(struct sock *sk) 1289static int tcp_v6_remember_stamp(struct sock *sk)
1806{ 1290{
1807 /* Alas, not yet... */ 1291 /* Alas, not yet... */
1808 return 0; 1292 return 0;
1809} 1293}
1810 1294
1811static struct tcp_func ipv6_specific = { 1295static struct inet_connection_sock_af_ops ipv6_specific = {
1812 .queue_xmit = tcp_v6_xmit, 1296 .queue_xmit = inet6_csk_xmit,
1813 .send_check = tcp_v6_send_check, 1297 .send_check = tcp_v6_send_check,
1814 .rebuild_header = tcp_v6_rebuild_header, 1298 .rebuild_header = inet6_sk_rebuild_header,
1815 .conn_request = tcp_v6_conn_request, 1299 .conn_request = tcp_v6_conn_request,
1816 .syn_recv_sock = tcp_v6_syn_recv_sock, 1300 .syn_recv_sock = tcp_v6_syn_recv_sock,
1817 .remember_stamp = tcp_v6_remember_stamp, 1301 .remember_stamp = tcp_v6_remember_stamp,
@@ -1819,7 +1303,7 @@ static struct tcp_func ipv6_specific = {
1819 1303
1820 .setsockopt = ipv6_setsockopt, 1304 .setsockopt = ipv6_setsockopt,
1821 .getsockopt = ipv6_getsockopt, 1305 .getsockopt = ipv6_getsockopt,
1822 .addr2sockaddr = v6_addr2sockaddr, 1306 .addr2sockaddr = inet6_csk_addr2sockaddr,
1823 .sockaddr_len = sizeof(struct sockaddr_in6) 1307 .sockaddr_len = sizeof(struct sockaddr_in6)
1824}; 1308};
1825 1309
@@ -1827,7 +1311,7 @@ static struct tcp_func ipv6_specific = {
1827 * TCP over IPv4 via INET6 API 1311 * TCP over IPv4 via INET6 API
1828 */ 1312 */
1829 1313
1830static struct tcp_func ipv6_mapped = { 1314static struct inet_connection_sock_af_ops ipv6_mapped = {
1831 .queue_xmit = ip_queue_xmit, 1315 .queue_xmit = ip_queue_xmit,
1832 .send_check = tcp_v4_send_check, 1316 .send_check = tcp_v4_send_check,
1833 .rebuild_header = inet_sk_rebuild_header, 1317 .rebuild_header = inet_sk_rebuild_header,
@@ -1838,7 +1322,7 @@ static struct tcp_func ipv6_mapped = {
1838 1322
1839 .setsockopt = ipv6_setsockopt, 1323 .setsockopt = ipv6_setsockopt,
1840 .getsockopt = ipv6_getsockopt, 1324 .getsockopt = ipv6_getsockopt,
1841 .addr2sockaddr = v6_addr2sockaddr, 1325 .addr2sockaddr = inet6_csk_addr2sockaddr,
1842 .sockaddr_len = sizeof(struct sockaddr_in6) 1326 .sockaddr_len = sizeof(struct sockaddr_in6)
1843}; 1327};
1844 1328
@@ -1877,8 +1361,9 @@ static int tcp_v6_init_sock(struct sock *sk)
1877 1361
1878 sk->sk_state = TCP_CLOSE; 1362 sk->sk_state = TCP_CLOSE;
1879 1363
1880 tp->af_specific = &ipv6_specific; 1364 icsk->icsk_af_ops = &ipv6_specific;
1881 icsk->icsk_ca_ops = &tcp_init_congestion_ops; 1365 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1366 icsk->icsk_sync_mss = tcp_sync_mss;
1882 sk->sk_write_space = sk_stream_write_space; 1367 sk->sk_write_space = sk_stream_write_space;
1883 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 1368 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1884 1369
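Note: with af_specific moved to icsk->icsk_af_ops and MSS syncing exposed as icsk->icsk_sync_mss, protocol-neutral code can resynchronize the MSS without referencing TCP at all. A minimal sketch (csk_sync_mss is hypothetical, for illustration only):

    /* Hypothetical caller: generic ICSK code reacting to a new path MTU. */
    static inline void csk_sync_mss(struct sock *sk, u32 pmtu)
    {
            struct inet_connection_sock *icsk = inet_csk(sk);

            icsk->icsk_sync_mss(sk, pmtu);  /* tcp_sync_mss for TCP sockets */
    }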
@@ -1900,14 +1385,13 @@ static int tcp_v6_destroy_sock(struct sock *sk)
1900static void get_openreq6(struct seq_file *seq, 1385static void get_openreq6(struct seq_file *seq,
1901 struct sock *sk, struct request_sock *req, int i, int uid) 1386 struct sock *sk, struct request_sock *req, int i, int uid)
1902{ 1387{
1903 struct in6_addr *dest, *src;
1904 int ttd = req->expires - jiffies; 1388 int ttd = req->expires - jiffies;
1389 struct in6_addr *src = &inet6_rsk(req)->loc_addr;
1390 struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
1905 1391
1906 if (ttd < 0) 1392 if (ttd < 0)
1907 ttd = 0; 1393 ttd = 0;
1908 1394
1909 src = &tcp6_rsk(req)->loc_addr;
1910 dest = &tcp6_rsk(req)->rmt_addr;
1911 seq_printf(seq, 1395 seq_printf(seq,
1912 "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " 1396 "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
1913 "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", 1397 "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
@@ -1988,14 +1472,14 @@ static void get_timewait6_sock(struct seq_file *seq,
1988{ 1472{
1989 struct in6_addr *dest, *src; 1473 struct in6_addr *dest, *src;
1990 __u16 destp, srcp; 1474 __u16 destp, srcp;
1991 struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); 1475 struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
1992 int ttd = tw->tw_ttd - jiffies; 1476 int ttd = tw->tw_ttd - jiffies;
1993 1477
1994 if (ttd < 0) 1478 if (ttd < 0)
1995 ttd = 0; 1479 ttd = 0;
1996 1480
1997 dest = &tcp6tw->tw_v6_daddr; 1481 dest = &tw6->tw_v6_daddr;
1998 src = &tcp6tw->tw_v6_rcv_saddr; 1482 src = &tw6->tw_v6_rcv_saddr;
1999 destp = ntohs(tw->tw_dport); 1483 destp = ntohs(tw->tw_dport);
2000 srcp = ntohs(tw->tw_sport); 1484 srcp = ntohs(tw->tw_sport);
2001 1485
@@ -2093,7 +1577,7 @@ struct proto tcpv6_prot = {
2093 .sysctl_rmem = sysctl_tcp_rmem, 1577 .sysctl_rmem = sysctl_tcp_rmem,
2094 .max_header = MAX_TCP_HEADER, 1578 .max_header = MAX_TCP_HEADER,
2095 .obj_size = sizeof(struct tcp6_sock), 1579 .obj_size = sizeof(struct tcp6_sock),
2096 .twsk_obj_size = sizeof(struct tcp6_timewait_sock), 1580 .twsk_prot = &tcp6_timewait_sock_ops,
2097 .rsk_prot = &tcp6_request_sock_ops, 1581 .rsk_prot = &tcp6_request_sock_ops,
2098}; 1582};
2099 1583
@@ -2110,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = {
2110 .ops = &inet6_stream_ops, 1594 .ops = &inet6_stream_ops,
2111 .capability = -1, 1595 .capability = -1,
2112 .no_check = 0, 1596 .no_check = 0,
2113 .flags = INET_PROTOSW_PERMANENT, 1597 .flags = INET_PROTOSW_PERMANENT |
1598 INET_PROTOSW_ICSK,
2114}; 1599};
2115 1600
2116void __init tcpv6_init(void) 1601void __init tcpv6_init(void)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5cc8731eb55b..d8538dcea813 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -36,6 +36,7 @@
36#include <linux/ipv6.h> 36#include <linux/ipv6.h>
37#include <linux/icmpv6.h> 37#include <linux/icmpv6.h>
38#include <linux/init.h> 38#include <linux/init.h>
39#include <linux/skbuff.h>
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
41#include <net/sock.h> 42#include <net/sock.h>
@@ -300,20 +301,7 @@ out:
300 return err; 301 return err;
301 302
302csum_copy_err: 303csum_copy_err:
303 /* Clear queue. */ 304 skb_kill_datagram(sk, skb, flags);
304 if (flags&MSG_PEEK) {
305 int clear = 0;
306 spin_lock_bh(&sk->sk_receive_queue.lock);
307 if (skb == skb_peek(&sk->sk_receive_queue)) {
308 __skb_unlink(skb, &sk->sk_receive_queue);
309 clear = 1;
310 }
311 spin_unlock_bh(&sk->sk_receive_queue.lock);
312 if (clear)
313 kfree_skb(skb);
314 }
315
316 skb_free_datagram(sk, skb);
317 305
318 if (flags & MSG_DONTWAIT) { 306 if (flags & MSG_DONTWAIT) {
319 UDP6_INC_STATS_USER(UDP_MIB_INERRORS); 307 UDP6_INC_STATS_USER(UDP_MIB_INERRORS);
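Note: the open-coded unlink-and-free sequence deleted above is centralized in skb_kill_datagram(). A sketch of the consolidated helper, following the code it replaces (the canonical version is in net/core/datagram.c):

    /* Sketch: drop a datagram that failed checksum-and-copy. Under
     * MSG_PEEK the skb is still queued, so unlink it first if it is
     * still at the head of the receive queue. */
    void skb_kill_datagram(struct sock *sk, struct sk_buff *skb,
                           unsigned int flags)
    {
            if (flags & MSG_PEEK) {
                    spin_lock_bh(&sk->sk_receive_queue.lock);
                    if (skb == skb_peek(&sk->sk_receive_queue)) {
                            __skb_unlink(skb, &sk->sk_receive_queue);
                            atomic_dec(&skb->users);  /* drop the queue's ref */
                    }
                    spin_unlock_bh(&sk->sk_receive_queue.lock);
            }
            skb_free_datagram(sk, skb);
    }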
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 34b3bb868409..0dc519b40404 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink;
75static struct datalink_proto *p8023_datalink; 75static struct datalink_proto *p8023_datalink;
76static struct datalink_proto *pSNAP_datalink; 76static struct datalink_proto *pSNAP_datalink;
77 77
78static struct proto_ops ipx_dgram_ops; 78static const struct proto_ops ipx_dgram_ops;
79 79
80LIST_HEAD(ipx_interfaces); 80LIST_HEAD(ipx_interfaces);
81DEFINE_SPINLOCK(ipx_interfaces_lock); 81DEFINE_SPINLOCK(ipx_interfaces_lock);
@@ -1884,7 +1884,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1884 rc = -EINVAL; 1884 rc = -EINVAL;
1885 break; 1885 break;
1886 default: 1886 default:
1887 rc = dev_ioctl(cmd, argp); 1887 rc = -ENOIOCTLCMD;
1888 break; 1888 break;
1889 } 1889 }
1890 1890
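Note: returning -ENOIOCTLCMD (here and in the irda, llc, netrom, packet, and rose handlers below) hands unknown ioctls back to the socket core, which performs the device-ioctl fallback in one place instead of every protocol calling dev_ioctl() itself. The dispatcher plausibly looks like:

    /* Assumed shape of the fallback in net/socket.c: */
    err = sock->ops->ioctl(sock, cmd, arg);
    if (err == -ENOIOCTLCMD)
            err = dev_ioctl(cmd, (void __user *)arg);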
@@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = {
1901 .owner = THIS_MODULE, 1901 .owner = THIS_MODULE,
1902}; 1902};
1903 1903
1904static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { 1904static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
1905 .family = PF_IPX, 1905 .family = PF_IPX,
1906 .owner = THIS_MODULE, 1906 .owner = THIS_MODULE,
1907 .release = ipx_release, 1907 .release = ipx_release,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 6f92f9c62990..fbfa96754417 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -62,12 +62,12 @@
62 62
63static int irda_create(struct socket *sock, int protocol); 63static int irda_create(struct socket *sock, int protocol);
64 64
65static struct proto_ops irda_stream_ops; 65static const struct proto_ops irda_stream_ops;
66static struct proto_ops irda_seqpacket_ops; 66static const struct proto_ops irda_seqpacket_ops;
67static struct proto_ops irda_dgram_ops; 67static const struct proto_ops irda_dgram_ops;
68 68
69#ifdef CONFIG_IRDA_ULTRA 69#ifdef CONFIG_IRDA_ULTRA
70static struct proto_ops irda_ultra_ops; 70static const struct proto_ops irda_ultra_ops;
71#define ULTRA_MAX_DATA 382 71#define ULTRA_MAX_DATA 382
72#endif /* CONFIG_IRDA_ULTRA */ 72#endif /* CONFIG_IRDA_ULTRA */
73 73
@@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
1438 /* 1438 /*
1439 * POSIX 1003.1g mandates this order. 1439 * POSIX 1003.1g mandates this order.
1440 */ 1440 */
1441 if (sk->sk_err) 1441 ret = sock_error(sk);
1442 ret = sock_error(sk); 1442 if (ret)
1443 break;
1443 else if (sk->sk_shutdown & RCV_SHUTDOWN) 1444 else if (sk->sk_shutdown & RCV_SHUTDOWN)
1444 ; 1445 ;
1445 else if (noblock) 1446 else if (noblock)
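Note: the reordered check works because sock_error() already returns 0 when no error is pending; testing sk->sk_err first was redundant and raced with the atomic read-and-clear. Roughly what the helper does in this era (per include/net/sock.h, shown as a sketch):

    static inline int sock_error(struct sock *sk)
    {
            int err = xchg(&sk->sk_err, 0);  /* read and clear atomically */
            return -err;                     /* 0 if nothing was pending */
    }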
@@ -1821,7 +1822,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1821 return -EINVAL; 1822 return -EINVAL;
1822 default: 1823 default:
1823 IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__); 1824 IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__);
1824 return dev_ioctl(cmd, (void __user *) arg); 1825 return -ENOIOCTLCMD;
1825 } 1826 }
1826 1827
1827 /*NOTREACHED*/ 1828 /*NOTREACHED*/
@@ -2463,7 +2464,7 @@ static struct net_proto_family irda_family_ops = {
2463 .owner = THIS_MODULE, 2464 .owner = THIS_MODULE,
2464}; 2465};
2465 2466
2466static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { 2467static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
2467 .family = PF_IRDA, 2468 .family = PF_IRDA,
2468 .owner = THIS_MODULE, 2469 .owner = THIS_MODULE,
2469 .release = irda_release, 2470 .release = irda_release,
@@ -2484,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
2484 .sendpage = sock_no_sendpage, 2485 .sendpage = sock_no_sendpage,
2485}; 2486};
2486 2487
2487static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { 2488static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
2488 .family = PF_IRDA, 2489 .family = PF_IRDA,
2489 .owner = THIS_MODULE, 2490 .owner = THIS_MODULE,
2490 .release = irda_release, 2491 .release = irda_release,
@@ -2505,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
2505 .sendpage = sock_no_sendpage, 2506 .sendpage = sock_no_sendpage,
2506}; 2507};
2507 2508
2508static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { 2509static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
2509 .family = PF_IRDA, 2510 .family = PF_IRDA,
2510 .owner = THIS_MODULE, 2511 .owner = THIS_MODULE,
2511 .release = irda_release, 2512 .release = irda_release,
@@ -2527,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
2527}; 2528};
2528 2529
2529#ifdef CONFIG_IRDA_ULTRA 2530#ifdef CONFIG_IRDA_ULTRA
2530static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { 2531static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
2531 .family = PF_IRDA, 2532 .family = PF_IRDA,
2532 .owner = THIS_MODULE, 2533 .owner = THIS_MODULE,
2533 .release = irda_release, 2534 .release = irda_release,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 39031684b65c..52efd04cbedb 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void)
113} 113}
114 114
115 115
116static struct proto_ops pfkey_ops; 116static const struct proto_ops pfkey_ops;
117 117
118static void pfkey_insert(struct sock *sk) 118static void pfkey_insert(struct sock *sk)
119{ 119{
@@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = {
336 [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), 336 [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
337 [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), 337 [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
338 [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), 338 [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address),
339 [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx),
339}; 340};
340 341
341/* Verify sadb_address_{len,prefixlen} against sa_family. */ 342/* Verify sadb_address_{len,prefixlen} against sa_family. */
@@ -383,6 +384,55 @@ static int verify_address_len(void *p)
383 return 0; 384 return 0;
384} 385}
385 386
387static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx)
388{
389 int len = 0;
390
391 len += sizeof(struct sadb_x_sec_ctx);
392 len += sec_ctx->sadb_x_ctx_len;
393 len += sizeof(uint64_t) - 1;
394 len /= sizeof(uint64_t);
395
396 return len;
397}
398
399static inline int verify_sec_ctx_len(void *p)
400{
401 struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p;
402 int len;
403
404 if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE)
405 return -EINVAL;
406
407 len = pfkey_sec_ctx_len(sec_ctx);
408
409 if (sec_ctx->sadb_x_sec_len != len)
410 return -EINVAL;
411
412 return 0;
413}
414
415static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx)
416{
417 struct xfrm_user_sec_ctx *uctx = NULL;
418 int ctx_size = sec_ctx->sadb_x_ctx_len;
419
420 uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL);
421
422 if (!uctx)
423 return NULL;
424
425 uctx->len = pfkey_sec_ctx_len(sec_ctx);
426 uctx->exttype = sec_ctx->sadb_x_sec_exttype;
427 uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi;
428 uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg;
429 uctx->ctx_len = sec_ctx->sadb_x_ctx_len;
430 memcpy(uctx + 1, sec_ctx + 1,
431 uctx->ctx_len);
432
433 return uctx;
434}
435
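Note: PF_KEY expresses extension lengths in 64-bit words, so pfkey_sec_ctx_len() rounds the fixed header plus the variable context string up to the next 8-byte boundary. A worked example, assuming the usual 8-byte struct sadb_x_sec_ctx:

    /* ctx_len = 10 bytes of context string:
     * (8 + 10 + 8 - 1) / 8 = 3 64-bit words.
     * verify_sec_ctx_len() then rejects any message whose
     * sadb_x_sec_len field disagrees with this value. */
    int words = (sizeof(struct sadb_x_sec_ctx) + 10 + sizeof(uint64_t) - 1) /
                sizeof(uint64_t);            /* == 3 */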
386static int present_and_same_family(struct sadb_address *src, 436static int present_and_same_family(struct sadb_address *src,
387 struct sadb_address *dst) 437 struct sadb_address *dst)
388{ 438{
@@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h
438 if (verify_address_len(p)) 488 if (verify_address_len(p))
439 return -EINVAL; 489 return -EINVAL;
440 } 490 }
491 if (ext_type == SADB_X_EXT_SEC_CTX) {
492 if (verify_sec_ctx_len(p))
493 return -EINVAL;
494 }
441 ext_hdrs[ext_type-1] = p; 495 ext_hdrs[ext_type-1] = p;
442 } 496 }
443 p += ext_len; 497 p += ext_len;
@@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
586 struct sadb_key *key; 640 struct sadb_key *key;
587 struct sadb_x_sa2 *sa2; 641 struct sadb_x_sa2 *sa2;
588 struct sockaddr_in *sin; 642 struct sockaddr_in *sin;
643 struct sadb_x_sec_ctx *sec_ctx;
644 struct xfrm_sec_ctx *xfrm_ctx;
645 int ctx_size = 0;
589#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 646#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
590 struct sockaddr_in6 *sin6; 647 struct sockaddr_in6 *sin6;
591#endif 648#endif
@@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
609 sizeof(struct sadb_address)*2 + 666 sizeof(struct sadb_address)*2 +
610 sockaddr_size*2 + 667 sockaddr_size*2 +
611 sizeof(struct sadb_x_sa2); 668 sizeof(struct sadb_x_sa2);
669
670 if ((xfrm_ctx = x->security)) {
671 ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
672 size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
673 }
674
612 /* identity & sensitivity */ 675 /* identity & sensitivity */
613 676
614 if ((x->props.family == AF_INET && 677 if ((x->props.family == AF_INET &&
@@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
899 n_port->sadb_x_nat_t_port_reserved = 0; 962 n_port->sadb_x_nat_t_port_reserved = 0;
900 } 963 }
901 964
965 /* security context */
966 if (xfrm_ctx) {
967 sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
968 sizeof(struct sadb_x_sec_ctx) + ctx_size);
969 sec_ctx->sadb_x_sec_len =
970 (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
971 sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
972 sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
973 sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
974 sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
975 memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
976 xfrm_ctx->ctx_len);
977 }
978
902 return skb; 979 return skb;
903} 980}
904 981
@@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
909 struct sadb_lifetime *lifetime; 986 struct sadb_lifetime *lifetime;
910 struct sadb_sa *sa; 987 struct sadb_sa *sa;
911 struct sadb_key *key; 988 struct sadb_key *key;
989 struct sadb_x_sec_ctx *sec_ctx;
912 uint16_t proto; 990 uint16_t proto;
913 int err; 991 int err;
914 992
@@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
993 x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; 1071 x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
994 x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; 1072 x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
995 } 1073 }
1074
1075 sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
1076 if (sec_ctx != NULL) {
1077 struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
1078
1079 if (!uctx)
1080 goto out;
1081
1082 err = security_xfrm_state_alloc(x, uctx);
1083 kfree(uctx);
1084
1085 if (err)
1086 goto out;
1087 }
1088
996 key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; 1089 key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
997 if (sa->sadb_sa_auth) { 1090 if (sa->sadb_sa_auth) {
998 int keysize = 0; 1091 int keysize = 0;
@@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
1720 return 0; 1813 return 0;
1721} 1814}
1722 1815
1816static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp)
1817{
1818 struct xfrm_sec_ctx *xfrm_ctx = xp->security;
1819
1820 if (xfrm_ctx) {
1821 int len = sizeof(struct sadb_x_sec_ctx);
1822 len += xfrm_ctx->ctx_len;
1823 return PFKEY_ALIGN8(len);
1824 }
1825 return 0;
1826}
1827
1723static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) 1828static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
1724{ 1829{
1725 int sockaddr_size = pfkey_sockaddr_size(xp->family); 1830 int sockaddr_size = pfkey_sockaddr_size(xp->family);
@@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
1733 (sockaddr_size * 2) + 1838 (sockaddr_size * 2) +
1734 sizeof(struct sadb_x_policy) + 1839 sizeof(struct sadb_x_policy) +
1735 (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) + 1840 (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) +
1736 (socklen * 2))); 1841 (socklen * 2))) +
1842 pfkey_xfrm_policy2sec_ctx_size(xp);
1737} 1843}
1738 1844
1739static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp) 1845static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
@@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
1757 struct sadb_lifetime *lifetime; 1863 struct sadb_lifetime *lifetime;
1758 struct sadb_x_policy *pol; 1864 struct sadb_x_policy *pol;
1759 struct sockaddr_in *sin; 1865 struct sockaddr_in *sin;
1866 struct sadb_x_sec_ctx *sec_ctx;
1867 struct xfrm_sec_ctx *xfrm_ctx;
1760#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 1868#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1761 struct sockaddr_in6 *sin6; 1869 struct sockaddr_in6 *sin6;
1762#endif 1870#endif
@@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
1941 } 2049 }
1942 } 2050 }
1943 } 2051 }
2052
2053 /* security context */
2054 if ((xfrm_ctx = xp->security)) {
2055 int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp);
2056
2057 sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size);
2058 sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t);
2059 sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
2060 sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
2061 sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
2062 sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
2063 memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
2064 xfrm_ctx->ctx_len);
2065 }
2066
1944 hdr->sadb_msg_len = size / sizeof(uint64_t); 2067 hdr->sadb_msg_len = size / sizeof(uint64_t);
1945 hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); 2068 hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
1946} 2069}
@@ -1976,12 +2099,13 @@ out:
1976 2099
1977static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) 2100static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
1978{ 2101{
1979 int err; 2102 int err = 0;
1980 struct sadb_lifetime *lifetime; 2103 struct sadb_lifetime *lifetime;
1981 struct sadb_address *sa; 2104 struct sadb_address *sa;
1982 struct sadb_x_policy *pol; 2105 struct sadb_x_policy *pol;
1983 struct xfrm_policy *xp; 2106 struct xfrm_policy *xp;
1984 struct km_event c; 2107 struct km_event c;
2108 struct sadb_x_sec_ctx *sec_ctx;
1985 2109
1986 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 2110 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
1987 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || 2111 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2028 if (xp->selector.dport) 2152 if (xp->selector.dport)
2029 xp->selector.dport_mask = ~0; 2153 xp->selector.dport_mask = ~0;
2030 2154
2155 sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
2156 if (sec_ctx != NULL) {
2157 struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
2158
2159 if (!uctx) {
2160 err = -ENOBUFS;
2161 goto out;
2162 }
2163
2164 err = security_xfrm_policy_alloc(xp, uctx);
2165 kfree(uctx);
2166
2167 if (err)
2168 goto out;
2169 }
2170
2031 xp->lft.soft_byte_limit = XFRM_INF; 2171 xp->lft.soft_byte_limit = XFRM_INF;
2032 xp->lft.hard_byte_limit = XFRM_INF; 2172 xp->lft.hard_byte_limit = XFRM_INF;
2033 xp->lft.soft_packet_limit = XFRM_INF; 2173 xp->lft.soft_packet_limit = XFRM_INF;
@@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2051 2191
2052 err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, 2192 err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
2053 hdr->sadb_msg_type != SADB_X_SPDUPDATE); 2193 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
2054 if (err) { 2194
2055 kfree(xp); 2195 if (err)
2056 return err; 2196 goto out;
2057 }
2058 2197
2059 if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) 2198 if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
2060 c.event = XFRM_MSG_UPDPOLICY; 2199 c.event = XFRM_MSG_UPDPOLICY;
@@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
2069 return 0; 2208 return 0;
2070 2209
2071out: 2210out:
2211 security_xfrm_policy_free(xp);
2072 kfree(xp); 2212 kfree(xp);
2073 return err; 2213 return err;
2074} 2214}
@@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
2078 int err; 2218 int err;
2079 struct sadb_address *sa; 2219 struct sadb_address *sa;
2080 struct sadb_x_policy *pol; 2220 struct sadb_x_policy *pol;
2081 struct xfrm_policy *xp; 2221 struct xfrm_policy *xp, tmp;
2082 struct xfrm_selector sel; 2222 struct xfrm_selector sel;
2083 struct km_event c; 2223 struct km_event c;
2224 struct sadb_x_sec_ctx *sec_ctx;
2084 2225
2085 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], 2226 if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
2086 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || 2227 ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
2109 if (sel.dport) 2250 if (sel.dport)
2110 sel.dport_mask = ~0; 2251 sel.dport_mask = ~0;
2111 2252
2112 xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1); 2253 sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
2254 memset(&tmp, 0, sizeof(struct xfrm_policy));
2255
2256 if (sec_ctx != NULL) {
2257 struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
2258
2259 if (!uctx)
2260 return -ENOMEM;
2261
2262 err = security_xfrm_policy_alloc(&tmp, uctx);
2263 kfree(uctx);
2264
2265 if (err)
2266 return err;
2267 }
2268
2269 xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1);
2270 security_xfrm_policy_free(&tmp);
2113 if (xp == NULL) 2271 if (xp == NULL)
2114 return -ENOENT; 2272 return -ENOENT;
2115 2273
@@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
2660{ 2818{
2661 struct xfrm_policy *xp; 2819 struct xfrm_policy *xp;
2662 struct sadb_x_policy *pol = (struct sadb_x_policy*)data; 2820 struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
2821 struct sadb_x_sec_ctx *sec_ctx;
2663 2822
2664 switch (family) { 2823 switch (family) {
2665 case AF_INET: 2824 case AF_INET:
@@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
2709 (*dir = parse_ipsecrequests(xp, pol)) < 0) 2868 (*dir = parse_ipsecrequests(xp, pol)) < 0)
2710 goto out; 2869 goto out;
2711 2870
2871 /* security context too */
2872 if (len >= (pol->sadb_x_policy_len*8 +
2873 sizeof(struct sadb_x_sec_ctx))) {
2874 char *p = (char *)pol;
2875 struct xfrm_user_sec_ctx *uctx;
2876
2877 p += pol->sadb_x_policy_len*8;
2878 sec_ctx = (struct sadb_x_sec_ctx *)p;
2879 if (len < pol->sadb_x_policy_len*8 +
2880 sec_ctx->sadb_x_sec_len)
2881 goto out;
2882 if ((*dir = verify_sec_ctx_len(p)))
2883 goto out;
2884 uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
2885 *dir = security_xfrm_policy_alloc(xp, uctx);
2886 kfree(uctx);
2887
2888 if (*dir)
2889 goto out;
2890 }
2891
2712 *dir = pol->sadb_x_policy_dir-1; 2892 *dir = pol->sadb_x_policy_dir-1;
2713 return xp; 2893 return xp;
2714 2894
2715out: 2895out:
2896 security_xfrm_policy_free(xp);
2716 kfree(xp); 2897 kfree(xp);
2717 return NULL; 2898 return NULL;
2718} 2899}
@@ -2946,7 +3127,7 @@ out:
2946 return err; 3127 return err;
2947} 3128}
2948 3129
2949static struct proto_ops pfkey_ops = { 3130static const struct proto_ops pfkey_ops = {
2950 .family = PF_KEY, 3131 .family = PF_KEY,
2951 .owner = THIS_MODULE, 3132 .owner = THIS_MODULE,
2952 /* Operations that make no sense on pfkey sockets. */ 3133 /* Operations that make no sense on pfkey sockets. */
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index c3f0b0783453..8171c53bc0ed 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -36,7 +36,7 @@
36static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; 36static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
37static u16 llc_ui_sap_link_no_max[256]; 37static u16 llc_ui_sap_link_no_max[256];
38static struct sockaddr_llc llc_ui_addrnull; 38static struct sockaddr_llc llc_ui_addrnull;
39static struct proto_ops llc_ui_ops; 39static const struct proto_ops llc_ui_ops;
40 40
41static int llc_ui_wait_for_conn(struct sock *sk, long timeout); 41static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
42static int llc_ui_wait_for_disc(struct sock *sk, long timeout); 42static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
@@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo)
566 /* 566 /*
567 * POSIX 1003.1g mandates this order. 567 * POSIX 1003.1g mandates this order.
568 */ 568 */
569 if (sk->sk_err) { 569 rc = sock_error(sk);
570 rc = sock_error(sk); 570 if (rc)
571 break; 571 break;
572 }
573 rc = 0; 572 rc = 0;
574 if (sk->sk_shutdown & RCV_SHUTDOWN) 573 if (sk->sk_shutdown & RCV_SHUTDOWN)
575 break; 574 break;
@@ -960,7 +959,7 @@ out:
960static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, 959static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
961 unsigned long arg) 960 unsigned long arg)
962{ 961{
963 return dev_ioctl(cmd, (void __user *)arg); 962 return -ENOIOCTLCMD;
964} 963}
965 964
966/** 965/**
@@ -1099,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = {
1099 .owner = THIS_MODULE, 1098 .owner = THIS_MODULE,
1100}; 1099};
1101 1100
1102static struct proto_ops llc_ui_ops = { 1101static const struct proto_ops llc_ui_ops = {
1103 .family = PF_LLC, 1102 .family = PF_LLC,
1104 .owner = THIS_MODULE, 1103 .owner = THIS_MODULE,
1105 .release = llc_ui_release, 1104 .release = llc_ui_release,
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cba63729313d..e10512e229b6 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid)
151 goto out_unlock; 151 goto out_unlock;
152 152
153 INIT_HLIST_NODE(&inst->hlist); 153 INIT_HLIST_NODE(&inst->hlist);
154 inst->lock = SPIN_LOCK_UNLOCKED; 154 spin_lock_init(&inst->lock);
155 /* needs to be two, since we _put() after creation */ 155 /* needs to be two, since we _put() after creation */
156 atomic_set(&inst->use, 2); 156 atomic_set(&inst->use, 2);
157 157
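Note: SPIN_LOCK_UNLOCKED is a static initializer; assigning it into a dynamically allocated instance skips the runtime setup that the debugging spinlock variants require, so heap-embedded locks must go through spin_lock_init(). The corrected pattern in instance_create(), in sketch form:

    struct nfulnl_instance *inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
    if (inst)
            spin_lock_init(&inst->lock);  /* not: inst->lock = SPIN_LOCK_UNLOCKED */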
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f28460b61e47..55afdda3d940 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid)
148 atomic_set(&inst->id_sequence, 0); 148 atomic_set(&inst->id_sequence, 0);
149 /* needs to be two, since we _put() after creation */ 149 /* needs to be two, since we _put() after creation */
150 atomic_set(&inst->use, 2); 150 atomic_set(&inst->use, 2);
151 inst->lock = SPIN_LOCK_UNLOCKED; 151 spin_lock_init(&inst->lock);
152 INIT_LIST_HEAD(&inst->queue_list); 152 INIT_LIST_HEAD(&inst->queue_list);
153 153
154 if (!try_module_get(THIS_MODULE)) 154 if (!try_module_get(THIS_MODULE))
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 96020d7087e8..7849cac14d3a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len)
293 return 0; 293 return 0;
294} 294}
295 295
296static struct proto_ops netlink_ops; 296static const struct proto_ops netlink_ops;
297 297
298static int netlink_insert(struct sock *sk, u32 pid) 298static int netlink_insert(struct sock *sk, u32 pid)
299{ 299{
@@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb)
1656 return notifier_chain_unregister(&netlink_chain, nb); 1656 return notifier_chain_unregister(&netlink_chain, nb);
1657} 1657}
1658 1658
1659static struct proto_ops netlink_ops = { 1659static const struct proto_ops netlink_ops = {
1660 .family = PF_NETLINK, 1660 .family = PF_NETLINK,
1661 .owner = THIS_MODULE, 1661 .owner = THIS_MODULE,
1662 .release = netlink_release, 1662 .release = netlink_release,
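Note: the proto_ops tables changed throughout this patch are pure function-pointer dispatch and are never written after initialization; qualifying them const lets the compiler place them in read-only memory, so a stray write through the pointer faults instead of silently rerouting socket calls. In miniature (PF_EXAMPLE and example_release are placeholders):

    static const struct proto_ops example_ops = {  /* emitted into .rodata */
            .family  = PF_EXAMPLE,
            .owner   = THIS_MODULE,
            .release = example_release,
    };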
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 287cfcc56951..3b1378498d50 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -441,7 +441,7 @@ errout:
441} 441}
442 442
443static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, 443static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
444 int seq, int cmd) 444 int seq, u8 cmd)
445{ 445{
446 struct sk_buff *skb; 446 struct sk_buff *skb;
447 int err; 447 int err;
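Note: the cmd parameter is a single byte on the wire, so taking it as int invited silent truncation; u8 matches the header layout. The generic netlink message header, for reference:

    /* include/linux/genetlink.h (stable layout): */
    struct genlmsghdr {
            __u8    cmd;      /* family-private command, as passed here */
            __u8    version;
            __u16   reserved;
    };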
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index e5d82d711cae..63b0e4afeb33 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -63,7 +63,7 @@ static unsigned short circuit = 0x101;
63static HLIST_HEAD(nr_list); 63static HLIST_HEAD(nr_list);
64static DEFINE_SPINLOCK(nr_list_lock); 64static DEFINE_SPINLOCK(nr_list_lock);
65 65
66static struct proto_ops nr_proto_ops; 66static const struct proto_ops nr_proto_ops;
67 67
68/* 68/*
69 * Socket removal during an interrupt is now safe. 69 * Socket removal during an interrupt is now safe.
@@ -1166,10 +1166,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1166 void __user *argp = (void __user *)arg; 1166 void __user *argp = (void __user *)arg;
1167 int ret; 1167 int ret;
1168 1168
1169 lock_sock(sk);
1170 switch (cmd) { 1169 switch (cmd) {
1171 case TIOCOUTQ: { 1170 case TIOCOUTQ: {
1172 long amount; 1171 long amount;
1172
1173 lock_sock(sk);
1173 amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); 1174 amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
1174 if (amount < 0) 1175 if (amount < 0)
1175 amount = 0; 1176 amount = 0;
@@ -1180,6 +1181,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1180 case TIOCINQ: { 1181 case TIOCINQ: {
1181 struct sk_buff *skb; 1182 struct sk_buff *skb;
1182 long amount = 0L; 1183 long amount = 0L;
1184
1185 lock_sock(sk);
1183 /* These two are safe on a single CPU system as only user tasks fiddle here */ 1186 /* These two are safe on a single CPU system as only user tasks fiddle here */
1184 if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) 1187 if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
1185 amount = skb->len; 1188 amount = skb->len;
@@ -1188,6 +1191,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1188 } 1191 }
1189 1192
1190 case SIOCGSTAMP: 1193 case SIOCGSTAMP:
1194 lock_sock(sk);
1191 ret = sock_get_timestamp(sk, argp); 1195 ret = sock_get_timestamp(sk, argp);
1192 release_sock(sk); 1196 release_sock(sk);
1193 return ret; 1197 return ret;
@@ -1202,21 +1206,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1202 case SIOCSIFNETMASK: 1206 case SIOCSIFNETMASK:
1203 case SIOCGIFMETRIC: 1207 case SIOCGIFMETRIC:
1204 case SIOCSIFMETRIC: 1208 case SIOCSIFMETRIC:
1205 release_sock(sk);
1206 return -EINVAL; 1209 return -EINVAL;
1207 1210
1208 case SIOCADDRT: 1211 case SIOCADDRT:
1209 case SIOCDELRT: 1212 case SIOCDELRT:
1210 case SIOCNRDECOBS: 1213 case SIOCNRDECOBS:
1211 release_sock(sk);
1212 if (!capable(CAP_NET_ADMIN)) return -EPERM; 1214 if (!capable(CAP_NET_ADMIN)) return -EPERM;
1213 return nr_rt_ioctl(cmd, argp); 1215 return nr_rt_ioctl(cmd, argp);
1214 1216
1215 default: 1217 default:
1216 release_sock(sk); 1218 return -ENOIOCTLCMD;
1217 return dev_ioctl(cmd, argp);
1218 } 1219 }
1219 release_sock(sk);
1220 1220
1221 return 0; 1221 return 0;
1222} 1222}
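Note: lock_sock() is now taken per command rather than around the whole switch, so the route ioctls, the capability check, and the -ENOIOCTLCMD fallback never hold (or have to remember to release) the socket lock. The resulting per-case pattern, in sketch form:

    case TIOCOUTQ: {
            long amount;

            lock_sock(sk);                /* only where socket state is read */
            amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
            release_sock(sk);
            return put_user(amount < 0 ? 0 : amount, (int __user *)argp);
    }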
@@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = {
1337 .owner = THIS_MODULE, 1337 .owner = THIS_MODULE,
1338}; 1338};
1339 1339
1340static struct proto_ops nr_proto_ops = { 1340static const struct proto_ops nr_proto_ops = {
1341 .family = PF_NETROM, 1341 .family = PF_NETROM,
1342 .owner = THIS_MODULE, 1342 .owner = THIS_MODULE,
1343 .release = nr_release, 1343 .release = nr_release,
diff --git a/net/nonet.c b/net/nonet.c
index e5241dceaa57..1230f0ae832e 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -14,11 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16 16
17void __init sock_init(void)
18{
19 printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n");
20}
21
22static int sock_no_open(struct inode *irrelevant, struct file *dontcare) 17static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
23{ 18{
24 return -ENXIO; 19 return -ENXIO;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3e2462760413..f69e5ed9bd06 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk)
251} 251}
252 252
253 253
254static struct proto_ops packet_ops; 254static const struct proto_ops packet_ops;
255 255
256#ifdef CONFIG_SOCK_PACKET 256#ifdef CONFIG_SOCK_PACKET
257static struct proto_ops packet_ops_spkt; 257static const struct proto_ops packet_ops_spkt;
258 258
259static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) 259static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
260{ 260{
@@ -1521,7 +1521,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
1521#endif 1521#endif
1522 1522
1523 default: 1523 default:
1524 return dev_ioctl(cmd, (void __user *)arg); 1524 return -ENOIOCTLCMD;
1525 } 1525 }
1526 return 0; 1526 return 0;
1527} 1527}
@@ -1784,7 +1784,7 @@ out:
1784 1784
1785 1785
1786#ifdef CONFIG_SOCK_PACKET 1786#ifdef CONFIG_SOCK_PACKET
1787static struct proto_ops packet_ops_spkt = { 1787static const struct proto_ops packet_ops_spkt = {
1788 .family = PF_PACKET, 1788 .family = PF_PACKET,
1789 .owner = THIS_MODULE, 1789 .owner = THIS_MODULE,
1790 .release = packet_release, 1790 .release = packet_release,
@@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = {
1806}; 1806};
1807#endif 1807#endif
1808 1808
1809static struct proto_ops packet_ops = { 1809static const struct proto_ops packet_ops = {
1810 .family = PF_PACKET, 1810 .family = PF_PACKET,
1811 .owner = THIS_MODULE, 1811 .owner = THIS_MODULE,
1812 .release = packet_release, 1812 .release = packet_release,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 829fdbc4400b..63090be2315a 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1320,7 +1320,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1320 return 0; 1320 return 0;
1321 1321
1322 default: 1322 default:
1323 return dev_ioctl(cmd, argp); 1323 return -ENOIOCTLCMD;
1324 } 1324 }
1325 1325
1326 return 0; 1326 return 0;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 82fb07aa06a5..ba5283204837 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,7 +25,7 @@
25 25
26#include <net/pkt_sched.h> 26#include <net/pkt_sched.h>
27 27
28#define VERSION "1.1" 28#define VERSION "1.2"
29 29
30/* Network Emulation Queuing algorithm. 30/* Network Emulation Queuing algorithm.
31 ==================================== 31 ====================================
@@ -65,11 +65,12 @@ struct netem_sched_data {
65 u32 jitter; 65 u32 jitter;
66 u32 duplicate; 66 u32 duplicate;
67 u32 reorder; 67 u32 reorder;
68 u32 corrupt;
68 69
69 struct crndstate { 70 struct crndstate {
70 unsigned long last; 71 unsigned long last;
71 unsigned long rho; 72 unsigned long rho;
72 } delay_cor, loss_cor, dup_cor, reorder_cor; 73 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
73 74
74 struct disttable { 75 struct disttable {
75 u32 size; 76 u32 size;
@@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
183 q->duplicate = dupsave; 184 q->duplicate = dupsave;
184 } 185 }
185 186
187 /*
188 * Randomized packet corruption.
 189 * Make a copy if needed, since we are modifying the data.
 190 * If the packet is going to be hardware checksummed,
 191 * do the checksum now in software before we mangle it.
192 */
193 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
194 if (!(skb = skb_unshare(skb, GFP_ATOMIC))
195 || (skb->ip_summed == CHECKSUM_HW
196 && skb_checksum_help(skb, 0))) {
197 sch->qstats.drops++;
198 return NET_XMIT_DROP;
199 }
200
201 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
202 }
203
186 if (q->gap == 0 /* not doing reordering */ 204 if (q->gap == 0 /* not doing reordering */
187 || q->counter < q->gap /* inside last reordering gap */ 205 || q->counter < q->gap /* inside last reordering gap */
188 || q->reorder < get_crandom(&q->reorder_cor)) { 206 || q->reorder < get_crandom(&q->reorder_cor)) {
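Note: the corruption step above deliberately flips exactly one random bit in one random byte of the linear head; the skb is unshared first (writes must not leak into clones) and any pending hardware checksum is resolved in software so the mangled bits actually reach the wire. The core of it:

    /* One random bit of one random byte within the linear data: */
    skb->data[net_random() % skb_headlen(skb)] ^= 1 << (net_random() % 8);

With an iproute2 that understands TCA_NETEM_CORRUPT, this would be driven by something like tc qdisc change dev eth0 root netem corrupt 0.1% (syntax assumed from the attribute added here).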
@@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
382 return 0; 400 return 0;
383} 401}
384 402
403static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
404{
405 struct netem_sched_data *q = qdisc_priv(sch);
406 const struct tc_netem_corrupt *r = RTA_DATA(attr);
407
408 if (RTA_PAYLOAD(attr) != sizeof(*r))
409 return -EINVAL;
410
411 q->corrupt = r->probability;
412 init_crandom(&q->corrupt_cor, r->correlation);
413 return 0;
414}
415
416/* Parse netlink message to set options */
385static int netem_change(struct Qdisc *sch, struct rtattr *opt) 417static int netem_change(struct Qdisc *sch, struct rtattr *opt)
386{ 418{
387 struct netem_sched_data *q = qdisc_priv(sch); 419 struct netem_sched_data *q = qdisc_priv(sch);
@@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
432 if (ret) 464 if (ret)
433 return ret; 465 return ret;
434 } 466 }
467
435 if (tb[TCA_NETEM_REORDER-1]) { 468 if (tb[TCA_NETEM_REORDER-1]) {
436 ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); 469 ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
437 if (ret) 470 if (ret)
438 return ret; 471 return ret;
439 } 472 }
440 }
441 473
474 if (tb[TCA_NETEM_CORRUPT-1]) {
475 ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
476 if (ret)
477 return ret;
478 }
479 }
442 480
443 return 0; 481 return 0;
444} 482}
@@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
564 struct tc_netem_qopt qopt; 602 struct tc_netem_qopt qopt;
565 struct tc_netem_corr cor; 603 struct tc_netem_corr cor;
566 struct tc_netem_reorder reorder; 604 struct tc_netem_reorder reorder;
605 struct tc_netem_corrupt corrupt;
567 606
568 qopt.latency = q->latency; 607 qopt.latency = q->latency;
569 qopt.jitter = q->jitter; 608 qopt.jitter = q->jitter;
@@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
582 reorder.correlation = q->reorder_cor.rho; 621 reorder.correlation = q->reorder_cor.rho;
583 RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); 622 RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
584 623
624 corrupt.probability = q->corrupt;
625 corrupt.correlation = q->corrupt_cor.rho;
626 RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
627
585 rta->rta_len = skb->tail - b; 628 rta->rta_len = skb->tail - b;
586 629
587 return skb->len; 630 return skb->len;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 6cf0342706b5..c4a2a8c4c339 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -22,6 +22,7 @@
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/if_arp.h>
25#include <linux/if_ether.h> 26#include <linux/if_ether.h>
26#include <linux/inet.h> 27#include <linux/inet.h>
27#include <linux/netdevice.h> 28#include <linux/netdevice.h>
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index dec68a604773..9d05e13e92f6 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
110 asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; 110 asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000;
111 asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) 111 asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000)
112 * 1000; 112 * 1000;
113 asoc->pmtu = 0;
114 asoc->frag_point = 0; 113 asoc->frag_point = 0;
115 114
116 /* Set the association max_retrans and RTO values from the 115 /* Set the association max_retrans and RTO values from the
@@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
123 122
124 asoc->overall_error_count = 0; 123 asoc->overall_error_count = 0;
125 124
125 /* Initialize the association's heartbeat interval based on the
126 * sock configured value.
127 */
128 asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
129
130 /* Initialize path max retrans value. */
131 asoc->pathmaxrxt = sp->pathmaxrxt;
132
133 /* Initialize default path MTU. */
134 asoc->pathmtu = sp->pathmtu;
135
136 /* Set association default SACK delay */
137 asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
138
139 /* Set the association default flags controlling
140 * Heartbeat, SACK delay, and Path MTU Discovery.
141 */
142 asoc->param_flags = sp->param_flags;
143
 126 /* Initialize the maximum number of new data packets that can be sent 144
127 * in a burst. 145 * in a burst.
128 */ 146 */
@@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
144 = 5 * asoc->rto_max; 162 = 5 * asoc->rto_max;
145 163
146 asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; 164 asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
147 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 165 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
148 SCTP_DEFAULT_TIMEOUT_SACK;
149 asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = 166 asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
150 sp->autoclose * HZ; 167 sp->autoclose * HZ;
151 168
@@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
540 557
541 sctp_transport_set_owner(peer, asoc); 558 sctp_transport_set_owner(peer, asoc);
542 559
560 /* Initialize the peer's heartbeat interval based on the
561 * association configured value.
562 */
563 peer->hbinterval = asoc->hbinterval;
564
565 /* Set the path max_retrans. */
566 peer->pathmaxrxt = asoc->pathmaxrxt;
567
568 /* Initialize the peer's SACK delay timeout based on the
569 * association configured value.
570 */
571 peer->sackdelay = asoc->sackdelay;
572
573 /* Enable/disable heartbeat, SACK delay, and path MTU discovery
574 * based on association setting.
575 */
576 peer->param_flags = asoc->param_flags;
577
543 /* Initialize the pmtu of the transport. */ 578 /* Initialize the pmtu of the transport. */
544 sctp_transport_pmtu(peer); 579 if (peer->param_flags & SPP_PMTUD_ENABLE)
580 sctp_transport_pmtu(peer);
581 else if (asoc->pathmtu)
582 peer->pathmtu = asoc->pathmtu;
583 else
584 peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
545 585
546 /* If this is the first transport addr on this association, 586 /* If this is the first transport addr on this association,
547 * initialize the association PMTU to the peer's PMTU. 587 * initialize the association PMTU to the peer's PMTU.
548 * If not and the current association PMTU is higher than the new 588 * If not and the current association PMTU is higher than the new
549 * peer's PMTU, reset the association PMTU to the new peer's PMTU. 589 * peer's PMTU, reset the association PMTU to the new peer's PMTU.
550 */ 590 */
551 if (asoc->pmtu) 591 if (asoc->pathmtu)
552 asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu); 592 asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
553 else 593 else
554 asoc->pmtu = peer->pmtu; 594 asoc->pathmtu = peer->pathmtu;
555 595
556 SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " 596 SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to "
557 "%d\n", asoc, asoc->pmtu); 597 "%d\n", asoc, asoc->pathmtu);
558 598
559 asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); 599 asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
560 600
561 /* The asoc->peer.port might not be meaningful yet, but 601 /* The asoc->peer.port might not be meaningful yet, but
562 * initialize the packet structure anyway. 602 * initialize the packet structure anyway.
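sctp_assoc_add_peer() now consults SPP_PMTUD_ENABLE before touching the transport's MTU: discover from the route when enabled, else take the association's fixed value, else fall back to SCTP_DEFAULT_MAXSEGMENT. A hedged sketch of just that three-way choice (the flag value and route helper here are illustrative, not the kernel's):

#include <stdint.h>

#define SPP_PMTUD_ENABLE	0x8	/* illustrative bit value */
#define SCTP_DEFAULT_MAXSEGMENT	1500

/* Stand-in for sctp_transport_pmtu()'s routing lookup. */
static uint32_t route_mtu(void) { return 1500; }

static uint32_t seed_peer_pathmtu(uint32_t peer_flags, uint32_t asoc_pathmtu)
{
	if (peer_flags & SPP_PMTUD_ENABLE)
		return route_mtu();		/* discovery stays on */
	if (asoc_pathmtu)
		return asoc_pathmtu;		/* user-fixed MTU */
	return SCTP_DEFAULT_MAXSEGMENT;		/* last-resort default */
}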
@@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
574 * (for example, implementations MAY use the size of the 614 * (for example, implementations MAY use the size of the
575 * receiver advertised window). 615 * receiver advertised window).
576 */ 616 */
577 peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380)); 617 peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
578 618
579 /* At this point, we may not have the receiver's advertised window, 619 /* At this point, we may not have the receiver's advertised window,
580 * so initialize ssthresh to the default value and it will be set 620 * so initialize ssthresh to the default value and it will be set
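The initial cwnd line in the hunk above implements the RFC 2960 section 7.2.1 rule min(4*MTU, max(2*MTU, 4380 bytes)). A small worked example: for a 1500-byte path MTU the max() arm gives 4380 and the min() arm gives 6000, so cwnd starts at 4380; for a 512-byte MTU it starts at 2048.

#include <stdio.h>
#include <stdint.h>

static uint32_t initial_cwnd(uint32_t pmtu)
{
	uint32_t lo = 2 * pmtu > 4380 ? 2 * pmtu : 4380;	/* max(2*MTU, 4380) */
	return 4 * pmtu < lo ? 4 * pmtu : lo;			/* min(4*MTU, ...) */
}

int main(void)
{
	printf("%u\n", initial_cwnd(1500));	/* prints 4380 */
	printf("%u\n", initial_cwnd(512));	/* prints 2048 */
	return 0;
}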
@@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
585 peer->partial_bytes_acked = 0; 625 peer->partial_bytes_acked = 0;
586 peer->flight_size = 0; 626 peer->flight_size = 0;
587 627
588 /* By default, enable heartbeat for peer address. */
589 peer->hb_allowed = 1;
590
591 /* Initialize the peer's heartbeat interval based on the
592 * sock configured value.
593 */
594 peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval);
595
596 /* Set the path max_retrans. */
597 peer->max_retrans = sp->paddrparam.spp_pathmaxrxt;
598
599 /* Set the transport's RTO.initial value */ 628 /* Set the transport's RTO.initial value */
600 peer->rto = asoc->rto_initial; 629 peer->rto = asoc->rto_initial;
601 630
@@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
1155 /* Get the lowest pmtu of all the transports. */ 1184 /* Get the lowest pmtu of all the transports. */
1156 list_for_each(pos, &asoc->peer.transport_addr_list) { 1185 list_for_each(pos, &asoc->peer.transport_addr_list) {
1157 t = list_entry(pos, struct sctp_transport, transports); 1186 t = list_entry(pos, struct sctp_transport, transports);
1158 if (!pmtu || (t->pmtu < pmtu)) 1187 if (!pmtu || (t->pathmtu < pmtu))
1159 pmtu = t->pmtu; 1188 pmtu = t->pathmtu;
1160 } 1189 }
1161 1190
1162 if (pmtu) { 1191 if (pmtu) {
1163 struct sctp_sock *sp = sctp_sk(asoc->base.sk); 1192 struct sctp_sock *sp = sctp_sk(asoc->base.sk);
1164 asoc->pmtu = pmtu; 1193 asoc->pathmtu = pmtu;
1165 asoc->frag_point = sctp_frag_point(sp, pmtu); 1194 asoc->frag_point = sctp_frag_point(sp, pmtu);
1166 } 1195 }
1167 1196
1168 SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", 1197 SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n",
1169 __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point); 1198 __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point);
1170} 1199}
1171 1200
1172/* Should we send a SACK to update our peer? */ 1201/* Should we send a SACK to update our peer? */
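sctp_assoc_sync_pmtu() simply walks the peer's transports, adopts the smallest path MTU, and recomputes the fragmentation point from it. The reduction, as a standalone sketch:

#include <stdint.h>
#include <stddef.h>

/* Smallest path MTU over all transports; 0 only when the list is empty. */
static uint32_t lowest_pathmtu(const uint32_t *mtus, size_t n)
{
	uint32_t pmtu = 0;
	size_t i;

	for (i = 0; i < n; i++)
		if (!pmtu || mtus[i] < pmtu)
			pmtu = mtus[i];
	return pmtu;
}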
@@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc)
1179 case SCTP_STATE_SHUTDOWN_SENT: 1208 case SCTP_STATE_SHUTDOWN_SENT:
1180 if ((asoc->rwnd > asoc->a_rwnd) && 1209 if ((asoc->rwnd > asoc->a_rwnd) &&
1181 ((asoc->rwnd - asoc->a_rwnd) >= 1210 ((asoc->rwnd - asoc->a_rwnd) >=
1182 min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu))) 1211 min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu)))
1183 return 1; 1212 return 1;
1184 break; 1213 break;
1185 default: 1214 default:
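sctp_peer_needs_update() decides whether a grown receive window justifies an immediate window-update SACK: the growth must reach min(sk_rcvbuf/2, pathmtu). A sketch with a worked case: with rwnd 12000, a_rwnd 10000, rcvbuf 65536 and MTU 1500 the threshold is 1500, so the 2000-byte growth triggers a SACK.

#include <stdint.h>

static int needs_window_update(uint32_t rwnd, uint32_t a_rwnd,
			       uint32_t rcvbuf, uint32_t pathmtu)
{
	uint32_t thresh = rcvbuf / 2 < pathmtu ? rcvbuf / 2 : pathmtu;

	return rwnd > a_rwnd && (rwnd - a_rwnd) >= thresh;
}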
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b24ff2c1aef5..238f1bffa684 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
305void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, 305void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
306 struct sctp_transport *t, __u32 pmtu) 306 struct sctp_transport *t, __u32 pmtu)
307{ 307{
308 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { 308 if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu))
309 printk(KERN_WARNING "%s: Reported pmtu %d too low, " 309 return;
310 "using default minimum of %d\n", __FUNCTION__, pmtu,
311 SCTP_DEFAULT_MINSEGMENT);
312 pmtu = SCTP_DEFAULT_MINSEGMENT;
313 }
314 310
315 if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) { 311 if (t->param_flags & SPP_PMTUD_ENABLE) {
316 t->pmtu = pmtu; 312 if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
313 printk(KERN_WARNING "%s: Reported pmtu %d too low, "
314 "using default minimum of %d\n",
315 __FUNCTION__, pmtu,
316 SCTP_DEFAULT_MINSEGMENT);
317 /* Use default minimum segment size and disable
318 * pmtu discovery on this transport.
319 */
320 t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
 321 t->param_flags = (t->param_flags & ~SPP_PMTUD) |
322 SPP_PMTUD_DISABLE;
323 } else {
324 t->pathmtu = pmtu;
325 }
326
327 /* Update association pmtu. */
317 sctp_assoc_sync_pmtu(asoc); 328 sctp_assoc_sync_pmtu(asoc);
318 sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
319 } 329 }
330
331 /* Retransmit with the new pmtu setting.
332 * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
333 * Needed will never be sent, but if a message was sent before
334 * PMTU discovery was disabled that was larger than the PMTU, it
 335 * would not be fragmented, so it must be retransmitted fragmented.
336 */
337 sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
320} 338}
321 339
322/* 340/*
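The reworked sctp_icmp_frag_needed() only honours the ICMP hint while discovery is enabled, clamps absurdly small reports to SCTP_DEFAULT_MINSEGMENT while switching discovery off for that transport, and always retransmits afterwards. Reduced to its decision logic (flag values illustrative; the ~SPP_PMTUD mask matches the corrected hunk above):

#include <stdint.h>

#define SCTP_DEFAULT_MINSEGMENT	512
#define SPP_PMTUD_ENABLE	0x8	/* illustrative bit values */
#define SPP_PMTUD_DISABLE	0x10
#define SPP_PMTUD		(SPP_PMTUD_ENABLE | SPP_PMTUD_DISABLE)

struct transport_sketch {
	uint32_t pathmtu;
	uint32_t param_flags;
};

/* Returns 1 when the caller should resync the association MTU;
 * the retransmit happens either way. */
static int apply_frag_needed(struct transport_sketch *t, uint32_t pmtu)
{
	if (!(t->param_flags & SPP_PMTUD_ENABLE))
		return 0;	/* hint ignored while discovery is off */

	if (pmtu < SCTP_DEFAULT_MINSEGMENT) {
		t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
		t->param_flags = (t->param_flags & ~SPP_PMTUD) |
				 SPP_PMTUD_DISABLE;
	} else {
		t->pathmtu = pmtu;
	}
	return 1;
}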
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fa3be2b8fb5f..15c05165c905 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
866 return 2; 866 return 2;
867} 867}
868 868
869static struct proto_ops inet6_seqpacket_ops = { 869static const struct proto_ops inet6_seqpacket_ops = {
870 .family = PF_INET6, 870 .family = PF_INET6,
871 .owner = THIS_MODULE, 871 .owner = THIS_MODULE,
872 .release = inet6_release, 872 .release = inet6_release,
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 931371633464..a40991ef72c9 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet,
234 goto finish; 234 goto finish;
235 235
236 pmtu = ((packet->transport->asoc) ? 236 pmtu = ((packet->transport->asoc) ?
237 (packet->transport->asoc->pmtu) : 237 (packet->transport->asoc->pathmtu) :
238 (packet->transport->pmtu)); 238 (packet->transport->pathmtu));
239 239
240 too_big = (psize + chunk_len > pmtu); 240 too_big = (psize + chunk_len > pmtu);
241 241
@@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet)
482 if (!dst || (dst->obsolete > 1)) { 482 if (!dst || (dst->obsolete > 1)) {
483 dst_release(dst); 483 dst_release(dst);
484 sctp_transport_route(tp, NULL, sctp_sk(sk)); 484 sctp_transport_route(tp, NULL, sctp_sk(sk));
485 sctp_assoc_sync_pmtu(asoc); 485 if (asoc->param_flags & SPP_PMTUD_ENABLE) {
486 sctp_assoc_sync_pmtu(asoc);
487 }
486 } 488 }
487 489
488 nskb->dst = dst_clone(tp->dst); 490 nskb->dst = dst_clone(tp->dst);
@@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet)
492 SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", 494 SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n",
493 nskb->len); 495 nskb->len);
494 496
495 (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); 497 if (tp->param_flags & SPP_PMTUD_ENABLE)
498 (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok);
499 else
500 (*tp->af_specific->sctp_xmit)(nskb, tp, 1);
496 501
497out: 502out:
498 packet->size = packet->overhead; 503 packet->size = packet->overhead;
@@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
577 * if ((flightsize + Max.Burst * MTU) < cwnd) 582 * if ((flightsize + Max.Burst * MTU) < cwnd)
578 * cwnd = flightsize + Max.Burst * MTU 583 * cwnd = flightsize + Max.Burst * MTU
579 */ 584 */
580 max_burst_bytes = asoc->max_burst * asoc->pmtu; 585 max_burst_bytes = asoc->max_burst * asoc->pathmtu;
581 if ((transport->flight_size + max_burst_bytes) < transport->cwnd) { 586 if ((transport->flight_size + max_burst_bytes) < transport->cwnd) {
582 transport->cwnd = transport->flight_size + max_burst_bytes; 587 transport->cwnd = transport->flight_size + max_burst_bytes;
583 SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: " 588 SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: "
@@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
622 * data will fit or delay in hopes of bundling a full 627 * data will fit or delay in hopes of bundling a full
623 * sized packet. 628 * sized packet.
624 */ 629 */
625 if (len < asoc->pmtu - packet->overhead) { 630 if (len < asoc->pathmtu - packet->overhead) {
626 retval = SCTP_XMIT_NAGLE_DELAY; 631 retval = SCTP_XMIT_NAGLE_DELAY;
627 goto finish; 632 goto finish;
628 } 633 }
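Two output-path consequences follow from the new flags: an association only resyncs its MTU after rerouting when discovery is on, and a transport with discovery off transmits with ipfragok=1 so IP may fragment packets that outgrow the fixed path MTU (with discovery on, the caller's flag is respected and DF behaviour is unchanged). The transmit-flag choice, as a short sketch:

#include <stdint.h>

#define SPP_PMTUD_ENABLE 0x8	/* illustrative bit value */

static int xmit_ipfragok(uint32_t param_flags, int caller_ipfragok)
{
	/* PMTUD off => always allow IP-level fragmentation. */
	return (param_flags & SPP_PMTUD_ENABLE) ? caller_ipfragok : 1;
}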
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f775d78aa59d..de693b43c8ea 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -54,6 +54,7 @@
54#include <net/protocol.h> 54#include <net/protocol.h>
55#include <net/ip.h> 55#include <net/ip.h>
56#include <net/ipv6.h> 56#include <net/ipv6.h>
57#include <net/route.h>
57#include <net/sctp/sctp.h> 58#include <net/sctp/sctp.h>
58#include <net/addrconf.h> 59#include <net/addrconf.h>
59#include <net/inet_common.h> 60#include <net/inet_common.h>
@@ -829,7 +830,7 @@ static struct notifier_block sctp_inetaddr_notifier = {
829}; 830};
830 831
831/* Socket operations. */ 832/* Socket operations. */
832static struct proto_ops inet_seqpacket_ops = { 833static const struct proto_ops inet_seqpacket_ops = {
833 .family = PF_INET, 834 .family = PF_INET,
834 .owner = THIS_MODULE, 835 .owner = THIS_MODULE,
835 .release = inet_release, /* Needs to be wrapped... */ 836 .release = inet_release, /* Needs to be wrapped... */
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 823947170a33..2d7d8a5db2ac 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
157{ 157{
158 __u32 ctsn, max_tsn_seen; 158 __u32 ctsn, max_tsn_seen;
159 struct sctp_chunk *sack; 159 struct sctp_chunk *sack;
160 struct sctp_transport *trans = asoc->peer.last_data_from;
160 int error = 0; 161 int error = 0;
161 162
162 if (force) 163 if (force ||
164 (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) ||
165 (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE)))
163 asoc->peer.sack_needed = 1; 166 asoc->peer.sack_needed = 1;
164 167
165 ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); 168 ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
@@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
189 if (!asoc->peer.sack_needed) { 192 if (!asoc->peer.sack_needed) {
190 /* We will need a SACK for the next packet. */ 193 /* We will need a SACK for the next packet. */
191 asoc->peer.sack_needed = 1; 194 asoc->peer.sack_needed = 1;
192 goto out; 195
196 /* Set the SACK delay timeout based on the
197 * SACK delay for the last transport
198 * data was received from, or the default
199 * for the association.
200 */
201 if (trans)
202 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
203 trans->sackdelay;
204 else
205 asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
206 asoc->sackdelay;
207
208 /* Restart the SACK timer. */
209 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
210 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
193 } else { 211 } else {
194 if (asoc->a_rwnd > asoc->rwnd) 212 if (asoc->a_rwnd > asoc->rwnd)
195 asoc->a_rwnd = asoc->rwnd; 213 asoc->a_rwnd = asoc->rwnd;
@@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
205 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, 223 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
206 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); 224 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
207 } 225 }
208out: 226
209 return error; 227 return error;
210nomem: 228nomem:
211 error = -ENOMEM; 229 error = -ENOMEM;
@@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
415 asoc->overall_error_count++; 433 asoc->overall_error_count++;
416 434
417 if (transport->state != SCTP_INACTIVE && 435 if (transport->state != SCTP_INACTIVE &&
418 (transport->error_count++ >= transport->max_retrans)) { 436 (transport->error_count++ >= transport->pathmaxrxt)) {
419 SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p", 437 SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
420 " transport IP: port:%d failed.\n", 438 " transport IP: port:%d failed.\n",
421 asoc, 439 asoc,
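sctp_gen_sack() now treats a disabled SACK delay exactly like a forced SACK, and when it does delay it arms the timer with the per-transport value, falling back to the association's. The trigger predicate, sketched (flag value illustrative):

#include <stdint.h>

#define SPP_SACKDELAY_DISABLE 0x40	/* illustrative bit value */

/* trans_flags is NULL when the source transport is unknown. */
static int sack_needed_now(int force, uint32_t asoc_flags,
			   const uint32_t *trans_flags)
{
	uint32_t flags = trans_flags ? *trans_flags : asoc_flags;

	return force || (flags & SPP_SACKDELAY_DISABLE);
}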
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 475bfb4972d9..557a7d90b92a 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
900 * HEARTBEAT is sent (see Section 8.3). 900 * HEARTBEAT is sent (see Section 8.3).
901 */ 901 */
902 902
903 if (transport->hb_allowed) { 903 if (transport->param_flags & SPP_HB_ENABLE) {
904 if (SCTP_DISPOSITION_NOMEM == 904 if (SCTP_DISPOSITION_NOMEM ==
905 sctp_sf_heartbeat(ep, asoc, type, arg, 905 sctp_sf_heartbeat(ep, asoc, type, arg,
906 commands)) 906 commands))
@@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
1051 return SCTP_DISPOSITION_DISCARD; 1051 return SCTP_DISPOSITION_DISCARD;
1052 } 1052 }
1053 1053
1054 max_interval = link->hb_interval + link->rto; 1054 max_interval = link->hbinterval + link->rto;
1055 1055
1056 /* Check if the timestamp looks valid. */ 1056 /* Check if the timestamp looks valid. */
1057 if (time_after(hbinfo->sent_at, jiffies) || 1057 if (time_after(hbinfo->sent_at, jiffies) ||
@@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep,
2691 * document allow. However, an SCTP transmitter MUST NOT be 2691 * document allow. However, an SCTP transmitter MUST NOT be
2692 * more aggressive than the following algorithms allow. 2692 * more aggressive than the following algorithms allow.
2693 */ 2693 */
2694 if (chunk->end_of_packet) { 2694 if (chunk->end_of_packet)
2695 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); 2695 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
2696 2696
2697 /* Start the SACK timer. */
2698 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
2699 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
2700 }
2701
2702 return SCTP_DISPOSITION_CONSUME; 2697 return SCTP_DISPOSITION_CONSUME;
2703 2698
2704discard_force: 2699discard_force:
@@ -2721,13 +2716,9 @@ discard_force:
2721 return SCTP_DISPOSITION_DISCARD; 2716 return SCTP_DISPOSITION_DISCARD;
2722 2717
2723discard_noforce: 2718discard_noforce:
2724 if (chunk->end_of_packet) { 2719 if (chunk->end_of_packet)
2725 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); 2720 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
2726 2721
2727 /* Start the SACK timer. */
2728 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
2729 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
2730 }
2731 return SCTP_DISPOSITION_DISCARD; 2722 return SCTP_DISPOSITION_DISCARD;
2732consume: 2723consume:
2733 return SCTP_DISPOSITION_CONSUME; 2724 return SCTP_DISPOSITION_CONSUME;
@@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep,
3442 * send another. 3433 * send another.
3443 */ 3434 */
3444 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); 3435 sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
3445 /* Start the SACK timer. */
3446 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
3447 SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
3448 3436
3449 return SCTP_DISPOSITION_CONSUME; 3437 return SCTP_DISPOSITION_CONSUME;
3450 3438
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9df888e932c5..fc04d185fa33 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1941,107 +1941,379 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
1941 * address's parameters: 1941 * address's parameters:
1942 * 1942 *
1943 * struct sctp_paddrparams { 1943 * struct sctp_paddrparams {
1944 * sctp_assoc_t spp_assoc_id; 1944 * sctp_assoc_t spp_assoc_id;
1945 * struct sockaddr_storage spp_address; 1945 * struct sockaddr_storage spp_address;
1946 * uint32_t spp_hbinterval; 1946 * uint32_t spp_hbinterval;
1947 * uint16_t spp_pathmaxrxt; 1947 * uint16_t spp_pathmaxrxt;
1948 * }; 1948 * uint32_t spp_pathmtu;
1949 * 1949 * uint32_t spp_sackdelay;
1950 * spp_assoc_id - (UDP style socket) This is filled in the application, 1950 * uint32_t spp_flags;
1951 * and identifies the association for this query. 1951 * };
1952 *
1953 * spp_assoc_id - (one-to-many style socket) This is filled in the
1954 * application, and identifies the association for
1955 * this query.
1952 * spp_address - This specifies which address is of interest. 1956 * spp_address - This specifies which address is of interest.
1953 * spp_hbinterval - This contains the value of the heartbeat interval, 1957 * spp_hbinterval - This contains the value of the heartbeat interval,
1954 * in milliseconds. A value of 0, when modifying the 1958 * in milliseconds. If a value of zero
1955 * parameter, specifies that the heartbeat on this 1959 * is present in this field then no changes are to
1956 * address should be disabled. A value of UINT32_MAX 1960 * be made to this parameter.
1957 * (4294967295), when modifying the parameter,
1958 * specifies that a heartbeat should be sent
1959 * immediately to the peer address, and the current
1960 * interval should remain unchanged.
1961 * spp_pathmaxrxt - This contains the maximum number of 1961 * spp_pathmaxrxt - This contains the maximum number of
1962 * retransmissions before this address shall be 1962 * retransmissions before this address shall be
1963 * considered unreachable. 1963 * considered unreachable. If a value of zero
1964 * is present in this field then no changes are to
1965 * be made to this parameter.
1966 * spp_pathmtu - When Path MTU discovery is disabled the value
1967 * specified here will be the "fixed" path mtu.
 1968 * Note that if the spp_address field is empty
 1969 * then all addresses on this association will
 1970 * have this fixed path mtu set upon them.
1971 *
1972 * spp_sackdelay - When delayed sack is enabled, this value specifies
1973 * the number of milliseconds that sacks will be delayed
1974 * for. This value will apply to all addresses of an
1975 * association if the spp_address field is empty. Note
 1976 * also that if delayed sack is enabled and this
1977 * value is set to 0, no change is made to the last
1978 * recorded delayed sack timer value.
1979 *
1980 * spp_flags - These flags are used to control various features
1981 * on an association. The flag field may contain
1982 * zero or more of the following options.
1983 *
1984 * SPP_HB_ENABLE - Enable heartbeats on the
1985 * specified address. Note that if the address
1986 * field is empty all addresses for the association
1987 * have heartbeats enabled upon them.
1988 *
1989 * SPP_HB_DISABLE - Disable heartbeats on the
 1990 * specified address. Note that if the address
1991 * field is empty all addresses for the association
1992 * will have their heartbeats disabled. Note also
1993 * that SPP_HB_ENABLE and SPP_HB_DISABLE are
1994 * mutually exclusive, only one of these two should
1995 * be specified. Enabling both fields will have
1996 * undetermined results.
1997 *
1998 * SPP_HB_DEMAND - Request a user initiated heartbeat
1999 * to be made immediately.
2000 *
2001 * SPP_PMTUD_ENABLE - This field will enable PMTU
2002 * discovery upon the specified address. Note that
 2003 * if the address field is empty then all addresses
 2004 * on the association are affected.
2005 *
2006 * SPP_PMTUD_DISABLE - This field will disable PMTU
2007 * discovery upon the specified address. Note that
 2008 * if the address field is empty then all addresses
 2009 * on the association are affected. Note also that
2010 * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
2011 * exclusive. Enabling both will have undetermined
2012 * results.
2013 *
2014 * SPP_SACKDELAY_ENABLE - Setting this flag turns
2015 * on delayed sack. The time specified in spp_sackdelay
2016 * is used to specify the sack delay for this address. Note
2017 * that if spp_address is empty then all addresses will
2018 * enable delayed sack and take on the sack delay
2019 * value specified in spp_sackdelay.
2020 * SPP_SACKDELAY_DISABLE - Setting this flag turns
2021 * off delayed sack. If the spp_address field is blank then
2022 * delayed sack is disabled for the entire association. Note
 2023 * also that this field is mutually exclusive with
2024 * SPP_SACKDELAY_ENABLE, setting both will have undefined
2025 * results.
1964 */ 2026 */
2027int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
2028 struct sctp_transport *trans,
2029 struct sctp_association *asoc,
2030 struct sctp_sock *sp,
2031 int hb_change,
2032 int pmtud_change,
2033 int sackdelay_change)
2034{
2035 int error;
2036
2037 if (params->spp_flags & SPP_HB_DEMAND && trans) {
 2038 error = sctp_primitive_REQUESTHEARTBEAT(trans->asoc, trans);
2039 if (error)
2040 return error;
2041 }
2042
2043 if (params->spp_hbinterval) {
2044 if (trans) {
2045 trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
2046 } else if (asoc) {
2047 asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
2048 } else {
2049 sp->hbinterval = params->spp_hbinterval;
2050 }
2051 }
2052
2053 if (hb_change) {
2054 if (trans) {
2055 trans->param_flags =
2056 (trans->param_flags & ~SPP_HB) | hb_change;
2057 } else if (asoc) {
2058 asoc->param_flags =
2059 (asoc->param_flags & ~SPP_HB) | hb_change;
2060 } else {
2061 sp->param_flags =
2062 (sp->param_flags & ~SPP_HB) | hb_change;
2063 }
2064 }
2065
2066 if (params->spp_pathmtu) {
2067 if (trans) {
2068 trans->pathmtu = params->spp_pathmtu;
2069 sctp_assoc_sync_pmtu(asoc);
2070 } else if (asoc) {
2071 asoc->pathmtu = params->spp_pathmtu;
 2072 asoc->frag_point = sctp_frag_point(sp, params->spp_pathmtu);
2073 } else {
2074 sp->pathmtu = params->spp_pathmtu;
2075 }
2076 }
2077
2078 if (pmtud_change) {
2079 if (trans) {
2080 int update = (trans->param_flags & SPP_PMTUD_DISABLE) &&
2081 (params->spp_flags & SPP_PMTUD_ENABLE);
2082 trans->param_flags =
2083 (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
2084 if (update) {
2085 sctp_transport_pmtu(trans);
2086 sctp_assoc_sync_pmtu(asoc);
2087 }
2088 } else if (asoc) {
2089 asoc->param_flags =
2090 (asoc->param_flags & ~SPP_PMTUD) | pmtud_change;
2091 } else {
2092 sp->param_flags =
2093 (sp->param_flags & ~SPP_PMTUD) | pmtud_change;
2094 }
2095 }
2096
2097 if (params->spp_sackdelay) {
2098 if (trans) {
2099 trans->sackdelay =
2100 msecs_to_jiffies(params->spp_sackdelay);
2101 } else if (asoc) {
2102 asoc->sackdelay =
2103 msecs_to_jiffies(params->spp_sackdelay);
2104 } else {
2105 sp->sackdelay = params->spp_sackdelay;
2106 }
2107 }
2108
2109 if (sackdelay_change) {
2110 if (trans) {
2111 trans->param_flags =
2112 (trans->param_flags & ~SPP_SACKDELAY) |
2113 sackdelay_change;
2114 } else if (asoc) {
2115 asoc->param_flags =
2116 (asoc->param_flags & ~SPP_SACKDELAY) |
2117 sackdelay_change;
2118 } else {
2119 sp->param_flags =
2120 (sp->param_flags & ~SPP_SACKDELAY) |
2121 sackdelay_change;
2122 }
2123 }
2124
2125 if (params->spp_pathmaxrxt) {
2126 if (trans) {
2127 trans->pathmaxrxt = params->spp_pathmaxrxt;
2128 } else if (asoc) {
2129 asoc->pathmaxrxt = params->spp_pathmaxrxt;
2130 } else {
2131 sp->pathmaxrxt = params->spp_pathmaxrxt;
2132 }
2133 }
2134
2135 return 0;
2136}
2137
1965static int sctp_setsockopt_peer_addr_params(struct sock *sk, 2138static int sctp_setsockopt_peer_addr_params(struct sock *sk,
1966 char __user *optval, int optlen) 2139 char __user *optval, int optlen)
1967{ 2140{
1968 struct sctp_paddrparams params; 2141 struct sctp_paddrparams params;
1969 struct sctp_transport *trans; 2142 struct sctp_transport *trans = NULL;
2143 struct sctp_association *asoc = NULL;
2144 struct sctp_sock *sp = sctp_sk(sk);
1970 int error; 2145 int error;
2146 int hb_change, pmtud_change, sackdelay_change;
1971 2147
1972 if (optlen != sizeof(struct sctp_paddrparams)) 2148 if (optlen != sizeof(struct sctp_paddrparams))
 1973 return -EINVAL; 2149 return -EINVAL;
2150
1974 if (copy_from_user(&params, optval, optlen)) 2151 if (copy_from_user(&params, optval, optlen))
1975 return -EFAULT; 2152 return -EFAULT;
1976 2153
1977 /* 2154 /* Validate flags and value parameters. */
1978 * API 7. Socket Options (setting the default value for the endpoint) 2155 hb_change = params.spp_flags & SPP_HB;
1979 * All options that support specific settings on an association by 2156 pmtud_change = params.spp_flags & SPP_PMTUD;
1980 * filling in either an association id variable or a sockaddr_storage 2157 sackdelay_change = params.spp_flags & SPP_SACKDELAY;
1981 * SHOULD also support setting of the same value for the entire endpoint 2158
1982 * (i.e. future associations). To accomplish this the following logic is 2159 if (hb_change == SPP_HB ||
1983 * used when setting one of these options: 2160 pmtud_change == SPP_PMTUD ||
1984 2161 sackdelay_change == SPP_SACKDELAY ||
1985 * c) If neither the sockaddr_storage or association identification is 2162 params.spp_sackdelay > 500 ||
1986 * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and 2163 (params.spp_pathmtu
1987 * the association identification is 0, the settings are a default 2164 && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT))
1988 * and to be applied to the endpoint (all future associations). 2165 return -EINVAL;
1989 */
1990 2166
1991 /* update default value for endpoint (all future associations) */ 2167 /* If an address other than INADDR_ANY is specified, and
1992 if (!params.spp_assoc_id && 2168 * no transport is found, then the request is invalid.
1993 sctp_is_any(( union sctp_addr *)&params.spp_address)) { 2169 */
1994 /* Manual heartbeat on an endpoint is invalid. */ 2170 if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
1995 if (0xffffffff == params.spp_hbinterval) 2171 trans = sctp_addr_id2transport(sk, &params.spp_address,
2172 params.spp_assoc_id);
2173 if (!trans)
1996 return -EINVAL; 2174 return -EINVAL;
1997 else if (params.spp_hbinterval)
1998 sctp_sk(sk)->paddrparam.spp_hbinterval =
1999 params.spp_hbinterval;
2000 if (params.spp_pathmaxrxt)
2001 sctp_sk(sk)->paddrparam.spp_pathmaxrxt =
2002 params.spp_pathmaxrxt;
2003 return 0;
2004 } 2175 }
2005 2176
2006 trans = sctp_addr_id2transport(sk, &params.spp_address, 2177 /* Get association, if assoc_id != 0 and the socket is a one
2007 params.spp_assoc_id); 2178 * to many style socket, and an association was not found, then
2008 if (!trans) 2179 * the id was invalid.
2180 */
2181 asoc = sctp_id2assoc(sk, params.spp_assoc_id);
2182 if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP))
2009 return -EINVAL; 2183 return -EINVAL;
2010 2184
2011 /* Applications can enable or disable heartbeats for any peer address 2185 /* Heartbeat demand can only be sent on a transport or
2012 * of an association, modify an address's heartbeat interval, force a 2186 * association, but not a socket.
2013 * heartbeat to be sent immediately, and adjust the address's maximum
2014 * number of retransmissions sent before an address is considered
2015 * unreachable.
2016 *
2017 * The value of the heartbeat interval, in milliseconds. A value of
2018 * UINT32_MAX (4294967295), when modifying the parameter, specifies
2019 * that a heartbeat should be sent immediately to the peer address,
2020 * and the current interval should remain unchanged.
2021 */ 2187 */
2022 if (0xffffffff == params.spp_hbinterval) { 2188 if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc)
2023 error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); 2189 return -EINVAL;
2024 if (error) 2190
2025 return error; 2191 /* Process parameters. */
2026 } else { 2192 error = sctp_apply_peer_addr_params(&params, trans, asoc, sp,
2027 /* The value of the heartbeat interval, in milliseconds. A value of 0, 2193 hb_change, pmtud_change,
2028 * when modifying the parameter, specifies that the heartbeat on this 2194 sackdelay_change);
2029 * address should be disabled. 2195
2196 if (error)
2197 return error;
2198
2199 /* If changes are for association, also apply parameters to each
2200 * transport.
2030 */ 2201 */
2031 if (params.spp_hbinterval) { 2202 if (!trans && asoc) {
2032 trans->hb_allowed = 1; 2203 struct list_head *pos;
2033 trans->hb_interval = 2204
2034 msecs_to_jiffies(params.spp_hbinterval); 2205 list_for_each(pos, &asoc->peer.transport_addr_list) {
2035 } else 2206 trans = list_entry(pos, struct sctp_transport,
2036 trans->hb_allowed = 0; 2207 transports);
2208 sctp_apply_peer_addr_params(&params, trans, asoc, sp,
2209 hb_change, pmtud_change,
2210 sackdelay_change);
2211 }
2037 } 2212 }
2038 2213
2039 /* spp_pathmaxrxt contains the maximum number of retransmissions 2214 return 0;
2040 * before this address shall be considered unreachable. 2215}
2041 */ 2216
2042 if (params.spp_pathmaxrxt) 2217/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
2043 trans->max_retrans = params.spp_pathmaxrxt; 2218 *
 2219 * This option will get or set the delayed ack timer. The time is set
2220 * in milliseconds. If the assoc_id is 0, then this sets or gets the
 2221 * endpoint's default delayed ack timer value. If the assoc_id field is
 2222 * non-zero, then the set or get affects the specified association.
2223 *
2224 * struct sctp_assoc_value {
2225 * sctp_assoc_t assoc_id;
2226 * uint32_t assoc_value;
2227 * };
2228 *
 2229 * assoc_id - This parameter indicates which association the
 2230 * user is performing an action upon. Note that if
 2231 * this field's value is zero then the endpoint's
 2232 * default value is changed (affecting future
2233 * associations only).
2234 *
2235 * assoc_value - This parameter contains the number of milliseconds
2236 * that the user is requesting the delayed ACK timer
2237 * be set to. Note that this value is defined in
2238 * the standard to be between 200 and 500 milliseconds.
2239 *
2240 * Note: a value of zero will leave the value alone,
2241 * but disable SACK delay. A non-zero value will also
2242 * enable SACK delay.
2243 */
2044 2244
2245static int sctp_setsockopt_delayed_ack_time(struct sock *sk,
2246 char __user *optval, int optlen)
2247{
2248 struct sctp_assoc_value params;
2249 struct sctp_transport *trans = NULL;
2250 struct sctp_association *asoc = NULL;
2251 struct sctp_sock *sp = sctp_sk(sk);
2252
2253 if (optlen != sizeof(struct sctp_assoc_value))
 2254 return -EINVAL;
2255
2256 if (copy_from_user(&params, optval, optlen))
2257 return -EFAULT;
2258
2259 /* Validate value parameter. */
2260 if (params.assoc_value > 500)
2261 return -EINVAL;
2262
2263 /* Get association, if assoc_id != 0 and the socket is a one
2264 * to many style socket, and an association was not found, then
2265 * the id was invalid.
2266 */
2267 asoc = sctp_id2assoc(sk, params.assoc_id);
2268 if (!asoc && params.assoc_id && sctp_style(sk, UDP))
2269 return -EINVAL;
2270
2271 if (params.assoc_value) {
2272 if (asoc) {
2273 asoc->sackdelay =
2274 msecs_to_jiffies(params.assoc_value);
2275 asoc->param_flags =
2276 (asoc->param_flags & ~SPP_SACKDELAY) |
2277 SPP_SACKDELAY_ENABLE;
2278 } else {
2279 sp->sackdelay = params.assoc_value;
2280 sp->param_flags =
2281 (sp->param_flags & ~SPP_SACKDELAY) |
2282 SPP_SACKDELAY_ENABLE;
2283 }
2284 } else {
2285 if (asoc) {
2286 asoc->param_flags =
2287 (asoc->param_flags & ~SPP_SACKDELAY) |
2288 SPP_SACKDELAY_DISABLE;
2289 } else {
2290 sp->param_flags =
2291 (sp->param_flags & ~SPP_SACKDELAY) |
2292 SPP_SACKDELAY_DISABLE;
2293 }
2294 }
2295
2296 /* If change is for association, also apply to each transport. */
2297 if (asoc) {
2298 struct list_head *pos;
2299
2300 list_for_each(pos, &asoc->peer.transport_addr_list) {
2301 trans = list_entry(pos, struct sctp_transport,
2302 transports);
2303 if (params.assoc_value) {
2304 trans->sackdelay =
2305 msecs_to_jiffies(params.assoc_value);
2306 trans->param_flags =
2307 (trans->param_flags & ~SPP_SACKDELAY) |
2308 SPP_SACKDELAY_ENABLE;
2309 } else {
2310 trans->param_flags =
2311 (trans->param_flags & ~SPP_SACKDELAY) |
2312 SPP_SACKDELAY_DISABLE;
2313 }
2314 }
2315 }
2316
2045 return 0; 2317 return 0;
2046} 2318}
2047 2319
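For reference, a hedged userspace sketch of the extended option as it lands here: leave spp_address empty and set spp_assoc_id to tune the whole association, then cap the delayed ACK at 200 ms. This assumes a <netinet/sctp.h> that already carries the new sctp_paddrparams fields, the SPP_* flags and SCTP_DELAYED_ACK_TIME; values and error handling are kept minimal.

#include <string.h>
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

static int tune_assoc(int sd, sctp_assoc_t id)
{
	struct sctp_paddrparams pp;
	struct sctp_assoc_value av;

	memset(&pp, 0, sizeof(pp));	/* empty spp_address => whole assoc */
	pp.spp_assoc_id   = id;
	pp.spp_hbinterval = 30000;	/* ms */
	pp.spp_pathmtu    = 1400;	/* fixed MTU, so turn discovery off */
	pp.spp_flags      = SPP_HB_ENABLE | SPP_PMTUD_DISABLE;
	if (setsockopt(sd, IPPROTO_SCTP, SCTP_PEER_ADDR_PARAMS,
		       &pp, sizeof(pp)) < 0) {
		perror("SCTP_PEER_ADDR_PARAMS");
		return -1;
	}

	memset(&av, 0, sizeof(av));
	av.assoc_id    = id;
	av.assoc_value = 200;		/* ms; non-zero also enables delay */
	if (setsockopt(sd, IPPROTO_SCTP, SCTP_DELAYED_ACK_TIME,
		       &av, sizeof(av)) < 0) {
		perror("SCTP_DELAYED_ACK_TIME");
		return -1;
	}
	return 0;
}

Note how this lines up with the setter's validation above: an spp_pathmtu below SCTP_DEFAULT_MINSEGMENT, an spp_sackdelay above 500 ms, or setting both the enable and disable bits of one feature are all rejected with -EINVAL.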
@@ -2334,7 +2606,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl
2334 /* Update the frag_point of the existing associations. */ 2606 /* Update the frag_point of the existing associations. */
2335 list_for_each(pos, &(sp->ep->asocs)) { 2607 list_for_each(pos, &(sp->ep->asocs)) {
2336 asoc = list_entry(pos, struct sctp_association, asocs); 2608 asoc = list_entry(pos, struct sctp_association, asocs);
2337 asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); 2609 asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
2338 } 2610 }
2339 2611
2340 return 0; 2612 return 0;
@@ -2491,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
2491 retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); 2763 retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
2492 break; 2764 break;
2493 2765
2766 case SCTP_DELAYED_ACK_TIME:
2767 retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen);
2768 break;
2769
2494 case SCTP_INITMSG: 2770 case SCTP_INITMSG:
2495 retval = sctp_setsockopt_initmsg(sk, optval, optlen); 2771 retval = sctp_setsockopt_initmsg(sk, optval, optlen);
2496 break; 2772 break;
@@ -2715,8 +2991,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
2715 /* Default Peer Address Parameters. These defaults can 2991 /* Default Peer Address Parameters. These defaults can
2716 * be modified via SCTP_PEER_ADDR_PARAMS 2992 * be modified via SCTP_PEER_ADDR_PARAMS
2717 */ 2993 */
2718 sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval); 2994 sp->hbinterval = jiffies_to_msecs(sctp_hb_interval);
2719 sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path; 2995 sp->pathmaxrxt = sctp_max_retrans_path;
 2996 sp->pathmtu = 0; /* allow default discovery */
2997 sp->sackdelay = sctp_sack_timeout;
2998 sp->param_flags = SPP_HB_ENABLE |
2999 SPP_PMTUD_ENABLE |
3000 SPP_SACKDELAY_ENABLE;
2720 3001
2721 /* If enabled no SCTP message fragmentation will be performed. 3002 /* If enabled no SCTP message fragmentation will be performed.
2722 * Configure through SCTP_DISABLE_FRAGMENTS socket option. 3003 * Configure through SCTP_DISABLE_FRAGMENTS socket option.
@@ -2865,7 +3146,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
2865 status.sstat_primary.spinfo_cwnd = transport->cwnd; 3146 status.sstat_primary.spinfo_cwnd = transport->cwnd;
2866 status.sstat_primary.spinfo_srtt = transport->srtt; 3147 status.sstat_primary.spinfo_srtt = transport->srtt;
2867 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); 3148 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
2868 status.sstat_primary.spinfo_mtu = transport->pmtu; 3149 status.sstat_primary.spinfo_mtu = transport->pathmtu;
2869 3150
2870 if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) 3151 if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
2871 status.sstat_primary.spinfo_state = SCTP_ACTIVE; 3152 status.sstat_primary.spinfo_state = SCTP_ACTIVE;
@@ -2924,7 +3205,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
2924 pinfo.spinfo_cwnd = transport->cwnd; 3205 pinfo.spinfo_cwnd = transport->cwnd;
2925 pinfo.spinfo_srtt = transport->srtt; 3206 pinfo.spinfo_srtt = transport->srtt;
2926 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); 3207 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
2927 pinfo.spinfo_mtu = transport->pmtu; 3208 pinfo.spinfo_mtu = transport->pathmtu;
2928 3209
2929 if (pinfo.spinfo_state == SCTP_UNKNOWN) 3210 if (pinfo.spinfo_state == SCTP_UNKNOWN)
2930 pinfo.spinfo_state = SCTP_ACTIVE; 3211 pinfo.spinfo_state = SCTP_ACTIVE;
@@ -3086,69 +3367,227 @@ out:
3086 * address's parameters: 3367 * address's parameters:
3087 * 3368 *
3088 * struct sctp_paddrparams { 3369 * struct sctp_paddrparams {
3089 * sctp_assoc_t spp_assoc_id; 3370 * sctp_assoc_t spp_assoc_id;
3090 * struct sockaddr_storage spp_address; 3371 * struct sockaddr_storage spp_address;
3091 * uint32_t spp_hbinterval; 3372 * uint32_t spp_hbinterval;
3092 * uint16_t spp_pathmaxrxt; 3373 * uint16_t spp_pathmaxrxt;
3093 * }; 3374 * uint32_t spp_pathmtu;
3094 * 3375 * uint32_t spp_sackdelay;
3095 * spp_assoc_id - (UDP style socket) This is filled in the application, 3376 * uint32_t spp_flags;
3096 * and identifies the association for this query. 3377 * };
3378 *
3379 * spp_assoc_id - (one-to-many style socket) This is filled in the
3380 * application, and identifies the association for
3381 * this query.
3097 * spp_address - This specifies which address is of interest. 3382 * spp_address - This specifies which address is of interest.
3098 * spp_hbinterval - This contains the value of the heartbeat interval, 3383 * spp_hbinterval - This contains the value of the heartbeat interval,
3099 * in milliseconds. A value of 0, when modifying the 3384 * in milliseconds. If a value of zero
3100 * parameter, specifies that the heartbeat on this 3385 * is present in this field then no changes are to
3101 * address should be disabled. A value of UINT32_MAX 3386 * be made to this parameter.
3102 * (4294967295), when modifying the parameter,
3103 * specifies that a heartbeat should be sent
3104 * immediately to the peer address, and the current
3105 * interval should remain unchanged.
3106 * spp_pathmaxrxt - This contains the maximum number of 3387 * spp_pathmaxrxt - This contains the maximum number of
3107 * retransmissions before this address shall be 3388 * retransmissions before this address shall be
3108 * considered unreachable. 3389 * considered unreachable. If a value of zero
3390 * is present in this field then no changes are to
3391 * be made to this parameter.
3392 * spp_pathmtu - When Path MTU discovery is disabled the value
3393 * specified here will be the "fixed" path mtu.
 3394 * Note that if the spp_address field is empty
 3395 * then all addresses on this association will
 3396 * have this fixed path mtu set upon them.
3397 *
3398 * spp_sackdelay - When delayed sack is enabled, this value specifies
3399 * the number of milliseconds that sacks will be delayed
3400 * for. This value will apply to all addresses of an
3401 * association if the spp_address field is empty. Note
3402 * also, that if delayed sack is enabled and this
3403 * value is set to 0, no change is made to the last
3404 * recorded delayed sack timer value.
3405 *
3406 * spp_flags - These flags are used to control various features
3407 * on an association. The flag field may contain
3408 * zero or more of the following options.
3409 *
3410 * SPP_HB_ENABLE - Enable heartbeats on the
3411 * specified address. Note that if the address
3412 * field is empty all addresses for the association
3413 * have heartbeats enabled upon them.
3414 *
3415 * SPP_HB_DISABLE - Disable heartbeats on the
 3416 * specified address. Note that if the address
3417 * field is empty all addresses for the association
3418 * will have their heartbeats disabled. Note also
3419 * that SPP_HB_ENABLE and SPP_HB_DISABLE are
3420 * mutually exclusive, only one of these two should
3421 * be specified. Enabling both fields will have
3422 * undetermined results.
3423 *
3424 * SPP_HB_DEMAND - Request a user initiated heartbeat
3425 * to be made immediately.
3426 *
3427 * SPP_PMTUD_ENABLE - This field will enable PMTU
3428 * discovery upon the specified address. Note that
 3429 * if the address field is empty then all addresses
 3430 * on the association are affected.
3431 *
3432 * SPP_PMTUD_DISABLE - This field will disable PMTU
3433 * discovery upon the specified address. Note that
 3434 * if the address field is empty then all addresses
 3435 * on the association are affected. Note also that
3436 * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
3437 * exclusive. Enabling both will have undetermined
3438 * results.
3439 *
3440 * SPP_SACKDELAY_ENABLE - Setting this flag turns
3441 * on delayed sack. The time specified in spp_sackdelay
3442 * is used to specify the sack delay for this address. Note
3443 * that if spp_address is empty then all addresses will
3444 * enable delayed sack and take on the sack delay
3445 * value specified in spp_sackdelay.
3446 * SPP_SACKDELAY_DISABLE - Setting this flag turns
3447 * off delayed sack. If the spp_address field is blank then
3448 * delayed sack is disabled for the entire association. Note
 3449 * also that this field is mutually exclusive with
3450 * SPP_SACKDELAY_ENABLE, setting both will have undefined
3451 * results.
3109 */ 3452 */
3110static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, 3453static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
3111 char __user *optval, int __user *optlen) 3454 char __user *optval, int __user *optlen)
3112{ 3455{
3113 struct sctp_paddrparams params; 3456 struct sctp_paddrparams params;
3114 struct sctp_transport *trans; 3457 struct sctp_transport *trans = NULL;
3458 struct sctp_association *asoc = NULL;
3459 struct sctp_sock *sp = sctp_sk(sk);
3115 3460
3116 if (len != sizeof(struct sctp_paddrparams)) 3461 if (len != sizeof(struct sctp_paddrparams))
3117 return -EINVAL; 3462 return -EINVAL;
3463
3118 if (copy_from_user(&params, optval, len)) 3464 if (copy_from_user(&params, optval, len))
3119 return -EFAULT; 3465 return -EFAULT;
3120 3466
3121 /* If no association id is specified retrieve the default value 3467 /* If an address other than INADDR_ANY is specified, and
3122 * for the endpoint that will be used for all future associations 3468 * no transport is found, then the request is invalid.
3123 */ 3469 */
3124 if (!params.spp_assoc_id && 3470 if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
3125 sctp_is_any(( union sctp_addr *)&params.spp_address)) { 3471 trans = sctp_addr_id2transport(sk, &params.spp_address,
3126 params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval; 3472 params.spp_assoc_id);
3127 params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt; 3473 if (!trans) {
 3128 3474 SCTP_DEBUG_PRINTK("Failed: no transport\n");
3129 goto done; 3475 return -EINVAL;
3476 }
3130 } 3477 }
3131 3478
3132 trans = sctp_addr_id2transport(sk, &params.spp_address, 3479 /* Get association, if assoc_id != 0 and the socket is a one
3133 params.spp_assoc_id); 3480 * to many style socket, and an association was not found, then
3134 if (!trans) 3481 * the id was invalid.
3482 */
3483 asoc = sctp_id2assoc(sk, params.spp_assoc_id);
3484 if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) {
 3485 SCTP_DEBUG_PRINTK("Failed: no association\n");
3135 return -EINVAL; 3486 return -EINVAL;
3487 }
3136 3488
3137 /* The value of the heartbeat interval, in milliseconds. A value of 0, 3489 if (trans) {
3138 * when modifying the parameter, specifies that the heartbeat on this 3490 /* Fetch transport values. */
3139 * address should be disabled. 3491 params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval);
3140 */ 3492 params.spp_pathmtu = trans->pathmtu;
3141 if (!trans->hb_allowed) 3493 params.spp_pathmaxrxt = trans->pathmaxrxt;
3142 params.spp_hbinterval = 0; 3494 params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay);
3143 else 3495
 3144 params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval); 3496 /* draft-11 doesn't say what to return in spp_flags */
3497 params.spp_flags = trans->param_flags;
3498 } else if (asoc) {
3499 /* Fetch association values. */
3500 params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
3501 params.spp_pathmtu = asoc->pathmtu;
3502 params.spp_pathmaxrxt = asoc->pathmaxrxt;
3503 params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay);
3504
 3505 /* draft-11 doesn't say what to return in spp_flags */
3506 params.spp_flags = asoc->param_flags;
3507 } else {
3508 /* Fetch socket values. */
3509 params.spp_hbinterval = sp->hbinterval;
3510 params.spp_pathmtu = sp->pathmtu;
3511 params.spp_sackdelay = sp->sackdelay;
3512 params.spp_pathmaxrxt = sp->pathmaxrxt;
3513
 3514 /* draft-11 doesn't say what to return in spp_flags */
3515 params.spp_flags = sp->param_flags;
3516 }
3145 3517
3146 /* spp_pathmaxrxt contains the maximum number of retransmissions 3518 if (copy_to_user(optval, &params, len))
3147 * before this address shall be considered unreachable. 3519 return -EFAULT;
3148 */ 3520
3149 params.spp_pathmaxrxt = trans->max_retrans; 3521 if (put_user(len, optlen))
3522 return -EFAULT;
3523
3524 return 0;
3525}
3526
3527/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
3528 *
 3529 * This option will get or set the delayed ack timer. The time is set
3530 * in milliseconds. If the assoc_id is 0, then this sets or gets the
 3531 * endpoint's default delayed ack timer value. If the assoc_id field is
 3532 * non-zero, then the set or get affects the specified association.
3533 *
3534 * struct sctp_assoc_value {
3535 * sctp_assoc_t assoc_id;
3536 * uint32_t assoc_value;
3537 * };
3538 *
 3539 * assoc_id - This parameter indicates which association the
 3540 * user is performing an action upon. Note that if
 3541 * this field's value is zero then the endpoint's
 3542 * default value is changed (affecting future
3543 * associations only).
3544 *
3545 * assoc_value - This parameter contains the number of milliseconds
3546 * that the user is requesting the delayed ACK timer
3547 * be set to. Note that this value is defined in
3548 * the standard to be between 200 and 500 milliseconds.
3549 *
3550 * Note: a value of zero will leave the value alone,
3551 * but disable SACK delay. A non-zero value will also
3552 * enable SACK delay.
3553 */
3554static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len,
3555 char __user *optval,
3556 int __user *optlen)
3557{
3558 struct sctp_assoc_value params;
3559 struct sctp_association *asoc = NULL;
3560 struct sctp_sock *sp = sctp_sk(sk);
3561
3562 if (len != sizeof(struct sctp_assoc_value))
 3563 return -EINVAL;
3564
3565 if (copy_from_user(&params, optval, len))
3566 return -EFAULT;
3567
3568 /* Get association, if assoc_id != 0 and the socket is a one
3569 * to many style socket, and an association was not found, then
3570 * the id was invalid.
3571 */
3572 asoc = sctp_id2assoc(sk, params.assoc_id);
3573 if (!asoc && params.assoc_id && sctp_style(sk, UDP))
3574 return -EINVAL;
3575
3576 if (asoc) {
3577 /* Fetch association values. */
3578 if (asoc->param_flags & SPP_SACKDELAY_ENABLE)
3579 params.assoc_value = jiffies_to_msecs(
3580 asoc->sackdelay);
3581 else
3582 params.assoc_value = 0;
3583 } else {
3584 /* Fetch socket values. */
3585 if (sp->param_flags & SPP_SACKDELAY_ENABLE)
3586 params.assoc_value = sp->sackdelay;
3587 else
3588 params.assoc_value = 0;
3589 }
3150 3590
3151done:
3152 if (copy_to_user(optval, &params, len)) 3591 if (copy_to_user(optval, &params, len))
3153 return -EFAULT; 3592 return -EFAULT;
3154 3593
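A companion getter sketch under the same header assumptions: with assoc_id 0 and an empty address the call returns the socket-wide defaults initialised in sctp_init_sock(); pass an address or association id to read transport- or association-level values instead.

#include <string.h>
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

static void dump_peer_addr_defaults(int sd)
{
	struct sctp_paddrparams pp;
	socklen_t len = sizeof(pp);

	memset(&pp, 0, sizeof(pp));	/* assoc_id 0, empty address */
	if (getsockopt(sd, IPPROTO_SCTP, SCTP_PEER_ADDR_PARAMS,
		       &pp, &len) < 0) {
		perror("SCTP_PEER_ADDR_PARAMS");
		return;
	}
	printf("hb=%ums maxrxt=%u pmtu=%u sackdelay=%ums flags=0x%x\n",
	       pp.spp_hbinterval, pp.spp_pathmaxrxt, pp.spp_pathmtu,
	       pp.spp_sackdelay, pp.spp_flags);
}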
@@ -4015,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
4015 retval = sctp_getsockopt_peer_addr_params(sk, len, optval, 4454 retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
4016 optlen); 4455 optlen);
4017 break; 4456 break;
4457 case SCTP_DELAYED_ACK_TIME:
4458 retval = sctp_getsockopt_delayed_ack_time(sk, len, optval,
4459 optlen);
4460 break;
4018 case SCTP_INITMSG: 4461 case SCTP_INITMSG:
4019 retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); 4462 retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
4020 break; 4463 break;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 268ddaf2dc0f..68d73e2dd155 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
86 peer->init_sent_count = 0; 86 peer->init_sent_count = 0;
87 87
88 peer->state = SCTP_ACTIVE; 88 peer->state = SCTP_ACTIVE;
89 peer->hb_allowed = 0; 89 peer->param_flags = SPP_HB_DISABLE |
90 SPP_PMTUD_ENABLE |
91 SPP_SACKDELAY_ENABLE;
92 peer->hbinterval = 0;
90 93
91 /* Initialize the default path max_retrans. */ 94 /* Initialize the default path max_retrans. */
92 peer->max_retrans = sctp_max_retrans_path; 95 peer->pathmaxrxt = sctp_max_retrans_path;
93 peer->error_count = 0; 96 peer->error_count = 0;
94 97
95 INIT_LIST_HEAD(&peer->transmitted); 98 INIT_LIST_HEAD(&peer->transmitted);
@@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport)
229 dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); 232 dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL);
230 233
231 if (dst) { 234 if (dst) {
232 transport->pmtu = dst_mtu(dst); 235 transport->pathmtu = dst_mtu(dst);
233 dst_release(dst); 236 dst_release(dst);
234 } else 237 } else
235 transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; 238 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
236} 239}
237 240
238/* Caches the dst entry and source address for a transport's destination 241/* Caches the dst entry and source address for a transport's destination
@@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport,
254 af->get_saddr(asoc, dst, daddr, &transport->saddr); 257 af->get_saddr(asoc, dst, daddr, &transport->saddr);
255 258
256 transport->dst = dst; 259 transport->dst = dst;
260 if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
261 return;
262 }
257 if (dst) { 263 if (dst) {
258 transport->pmtu = dst_mtu(dst); 264 transport->pathmtu = dst_mtu(dst);
259 265
260 /* Initialize sk->sk_rcv_saddr, if the transport is the 266 /* Initialize sk->sk_rcv_saddr, if the transport is the
261 * association's active path for getsockname(). 267 * association's active path for getsockname().
@@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport,
264 opt->pf->af->to_sk_saddr(&transport->saddr, 270 opt->pf->af->to_sk_saddr(&transport->saddr,
265 asoc->base.sk); 271 asoc->base.sk);
266 } else 272 } else
267 transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; 273 transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
268} 274}
269 275
270/* Hold a reference to a transport. */ 276/* Hold a reference to a transport. */
@@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
369 375
370 ssthresh = transport->ssthresh; 376 ssthresh = transport->ssthresh;
371 pba = transport->partial_bytes_acked; 377 pba = transport->partial_bytes_acked;
372 pmtu = transport->asoc->pmtu; 378 pmtu = transport->asoc->pathmtu;
373 379
374 if (cwnd <= ssthresh) { 380 if (cwnd <= ssthresh) {
375 /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less 381 /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less
@@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
441 * partial_bytes_acked = 0 447 * partial_bytes_acked = 0
442 */ 448 */
443 transport->ssthresh = max(transport->cwnd/2, 449 transport->ssthresh = max(transport->cwnd/2,
444 4*transport->asoc->pmtu); 450 4*transport->asoc->pathmtu);
445 transport->cwnd = transport->asoc->pmtu; 451 transport->cwnd = transport->asoc->pathmtu;
446 break; 452 break;
447 453
448 case SCTP_LOWER_CWND_FAST_RTX: 454 case SCTP_LOWER_CWND_FAST_RTX:
@@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
459 * partial_bytes_acked = 0 465 * partial_bytes_acked = 0
460 */ 466 */
461 transport->ssthresh = max(transport->cwnd/2, 467 transport->ssthresh = max(transport->cwnd/2,
462 4*transport->asoc->pmtu); 468 4*transport->asoc->pathmtu);
463 transport->cwnd = transport->ssthresh; 469 transport->cwnd = transport->ssthresh;
464 break; 470 break;
465 471
@@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
479 if ((jiffies - transport->last_time_ecne_reduced) > 485 if ((jiffies - transport->last_time_ecne_reduced) >
480 transport->rtt) { 486 transport->rtt) {
481 transport->ssthresh = max(transport->cwnd/2, 487 transport->ssthresh = max(transport->cwnd/2,
482 4*transport->asoc->pmtu); 488 4*transport->asoc->pathmtu);
483 transport->cwnd = transport->ssthresh; 489 transport->cwnd = transport->ssthresh;
484 transport->last_time_ecne_reduced = jiffies; 490 transport->last_time_ecne_reduced = jiffies;
485 } 491 }
@@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
496 */ 502 */
497 if ((jiffies - transport->last_time_used) > transport->rto) 503 if ((jiffies - transport->last_time_used) > transport->rto)
498 transport->cwnd = max(transport->cwnd/2, 504 transport->cwnd = max(transport->cwnd/2,
499 4*transport->asoc->pmtu); 505 4*transport->asoc->pathmtu);
500 break; 506 break;
501 }; 507 };
502 508
@@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
511unsigned long sctp_transport_timeout(struct sctp_transport *t) 517unsigned long sctp_transport_timeout(struct sctp_transport *t)
512{ 518{
513 unsigned long timeout; 519 unsigned long timeout;
514 timeout = t->hb_interval + t->rto + sctp_jitter(t->rto); 520 timeout = t->hbinterval + t->rto + sctp_jitter(t->rto);
515 timeout += jiffies; 521 timeout += jiffies;
516 return timeout; 522 return timeout;
517} 523}
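sctp_transport_timeout() above schedules the next heartbeat at now + hbinterval + RTO + jitter, the jitter keeping peers from synchronising their probes. A sketch assuming sctp_jitter() yields a value in roughly [-rto/2, +rto/2] (rand() stands in for the kernel's RNG):

#include <stdlib.h>

static long jitter_sketch(unsigned long rto)
{
	if (!rto)
		return 0;
	return (long)(rand() % rto) - (long)(rto / 2);
}

static unsigned long hb_timeout(unsigned long now, unsigned long hbinterval,
				unsigned long rto)
{
	/* |jitter| < rto, so the sum never lands before `now`. */
	return now + hbinterval + rto + jitter_sketch(rto);
}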
diff --git a/net/socket.c b/net/socket.c
index 3145103cdf54..06fa217f58a9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -640,154 +640,150 @@ static void sock_aio_dtor(struct kiocb *iocb)
640 kfree(iocb->private); 640 kfree(iocb->private);
641} 641}
642 642
643/* 643static ssize_t sock_sendpage(struct file *file, struct page *page,
644 * Read data from a socket. ubuf is a user mode pointer. We make sure the user 644 int offset, size_t size, loff_t *ppos, int more)
645 * area ubuf...ubuf+size-1 is writable before asking the protocol.
646 */
647
648static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
649 size_t size, loff_t pos)
650{ 645{
651 struct sock_iocb *x, siocb;
652 struct socket *sock; 646 struct socket *sock;
653 int flags; 647 int flags;
654 648
655 if (pos != 0) 649 sock = file->private_data;
656 return -ESPIPE;
657 if (size==0) /* Match SYS5 behaviour */
658 return 0;
659 650
660 if (is_sync_kiocb(iocb)) 651 flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
661 x = &siocb; 652 if (more)
662 else { 653 flags |= MSG_MORE;
663 x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); 654
664 if (!x) 655 return sock->ops->sendpage(sock, page, offset, size, flags);
665 return -ENOMEM; 656}
657
658static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
659 char __user *ubuf, size_t size, struct sock_iocb *siocb)
660{
661 if (!is_sync_kiocb(iocb)) {
662 siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
663 if (!siocb)
664 return NULL;
666 iocb->ki_dtor = sock_aio_dtor; 665 iocb->ki_dtor = sock_aio_dtor;
667 } 666 }
668 iocb->private = x;
669 x->kiocb = iocb;
670 sock = iocb->ki_filp->private_data;
671 667
672 x->async_msg.msg_name = NULL; 668 siocb->kiocb = iocb;
673 x->async_msg.msg_namelen = 0; 669 siocb->async_iov.iov_base = ubuf;
674 x->async_msg.msg_iov = &x->async_iov; 670 siocb->async_iov.iov_len = size;
675 x->async_msg.msg_iovlen = 1;
676 x->async_msg.msg_control = NULL;
677 x->async_msg.msg_controllen = 0;
678 x->async_iov.iov_base = ubuf;
679 x->async_iov.iov_len = size;
680 flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
681 671
682 return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags); 672 iocb->private = siocb;
673 return siocb;
683} 674}
684 675
676static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
677 struct file *file, struct iovec *iov, unsigned long nr_segs)
678{
679 struct socket *sock = file->private_data;
680 size_t size = 0;
681 int i;
685 682
686/* 683 for (i = 0 ; i < nr_segs ; i++)
687 * Write data to a socket. We verify that the user area ubuf..ubuf+size-1 684 size += iov[i].iov_len;
688 * is readable by the user process.
689 */
690 685
691static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, 686 msg->msg_name = NULL;
692 size_t size, loff_t pos) 687 msg->msg_namelen = 0;
688 msg->msg_control = NULL;
689 msg->msg_controllen = 0;
690 msg->msg_iov = (struct iovec *) iov;
691 msg->msg_iovlen = nr_segs;
692 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
693
694 return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
695}
696
697static ssize_t sock_readv(struct file *file, const struct iovec *iov,
698 unsigned long nr_segs, loff_t *ppos)
693{ 699{
694 struct sock_iocb *x, siocb; 700 struct kiocb iocb;
695 struct socket *sock; 701 struct sock_iocb siocb;
696 702 struct msghdr msg;
703 int ret;
704
705 init_sync_kiocb(&iocb, NULL);
706 iocb.private = &siocb;
707
708 ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
709 if (-EIOCBQUEUED == ret)
710 ret = wait_on_sync_kiocb(&iocb);
711 return ret;
712}
713
714static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
715 size_t count, loff_t pos)
716{
717 struct sock_iocb siocb, *x;
718
697 if (pos != 0) 719 if (pos != 0)
698 return -ESPIPE; 720 return -ESPIPE;
699 if(size==0) /* Match SYS5 behaviour */ 721 if (count == 0) /* Match SYS5 behaviour */
700 return 0; 722 return 0;
701 723
702 if (is_sync_kiocb(iocb)) 724 x = alloc_sock_iocb(iocb, ubuf, count, &siocb);
703 x = &siocb; 725 if (!x)
704 else { 726 return -ENOMEM;
705 x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); 727 return do_sock_read(&x->async_msg, iocb, iocb->ki_filp,
706 if (!x) 728 &x->async_iov, 1);
707 return -ENOMEM;
708 iocb->ki_dtor = sock_aio_dtor;
709 }
710 iocb->private = x;
711 x->kiocb = iocb;
712 sock = iocb->ki_filp->private_data;
713
714 x->async_msg.msg_name = NULL;
715 x->async_msg.msg_namelen = 0;
716 x->async_msg.msg_iov = &x->async_iov;
717 x->async_msg.msg_iovlen = 1;
718 x->async_msg.msg_control = NULL;
719 x->async_msg.msg_controllen = 0;
720 x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
721 if (sock->type == SOCK_SEQPACKET)
722 x->async_msg.msg_flags |= MSG_EOR;
723 x->async_iov.iov_base = (void __user *)ubuf;
724 x->async_iov.iov_len = size;
725
726 return __sock_sendmsg(iocb, sock, &x->async_msg, size);
727} 729}
728 730
729static ssize_t sock_sendpage(struct file *file, struct page *page, 731static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
730 int offset, size_t size, loff_t *ppos, int more) 732 struct file *file, struct iovec *iov, unsigned long nr_segs)
731{ 733{
732 struct socket *sock; 734 struct socket *sock = file->private_data;
733 int flags; 735 size_t size = 0;
736 int i;
734 737
735 sock = file->private_data; 738 for (i = 0 ; i < nr_segs ; i++)
739 size += iov[i].iov_len;
736 740
737 flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; 741 msg->msg_name = NULL;
738 if (more) 742 msg->msg_namelen = 0;
739 flags |= MSG_MORE; 743 msg->msg_control = NULL;
744 msg->msg_controllen = 0;
745 msg->msg_iov = (struct iovec *) iov;
746 msg->msg_iovlen = nr_segs;
747 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
748 if (sock->type == SOCK_SEQPACKET)
749 msg->msg_flags |= MSG_EOR;
740 750
741 return sock->ops->sendpage(sock, page, offset, size, flags); 751 return __sock_sendmsg(iocb, sock, msg, size);
742} 752}
743 753
744static int sock_readv_writev(int type, 754static ssize_t sock_writev(struct file *file, const struct iovec *iov,
745 struct file * file, const struct iovec * iov, 755 unsigned long nr_segs, loff_t *ppos)
746 long count, size_t size)
747{ 756{
748 struct msghdr msg; 757 struct msghdr msg;
749 struct socket *sock; 758 struct kiocb iocb;
759 struct sock_iocb siocb;
760 int ret;
750 761
751 sock = file->private_data; 762 init_sync_kiocb(&iocb, NULL);
763 iocb.private = &siocb;
752 764
753 msg.msg_name = NULL; 765 ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
754 msg.msg_namelen = 0; 766 if (-EIOCBQUEUED == ret)
755 msg.msg_control = NULL; 767 ret = wait_on_sync_kiocb(&iocb);
756 msg.msg_controllen = 0; 768 return ret;
757 msg.msg_iov = (struct iovec *) iov; 769}
758 msg.msg_iovlen = count;
759 msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
760 770
761 /* read() does a VERIFY_WRITE */ 771static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
762 if (type == VERIFY_WRITE) 772 size_t count, loff_t pos)
763 return sock_recvmsg(sock, &msg, size, msg.msg_flags); 773{
774 struct sock_iocb siocb, *x;
764 775
765 if (sock->type == SOCK_SEQPACKET) 776 if (pos != 0)
766 msg.msg_flags |= MSG_EOR; 777 return -ESPIPE;
778 if (count == 0) /* Match SYS5 behaviour */
779 return 0;
767 780
768 return sock_sendmsg(sock, &msg, size); 781 x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb);
769} 782 if (!x)
783 return -ENOMEM;
770 784
771static ssize_t sock_readv(struct file *file, const struct iovec *vector, 785 return do_sock_write(&x->async_msg, iocb, iocb->ki_filp,
772 unsigned long count, loff_t *ppos) 786 &x->async_iov, 1);
773{
774 size_t tot_len = 0;
775 int i;
776 for (i = 0 ; i < count ; i++)
777 tot_len += vector[i].iov_len;
778 return sock_readv_writev(VERIFY_WRITE,
779 file, vector, count, tot_len);
780}
781
782static ssize_t sock_writev(struct file *file, const struct iovec *vector,
783 unsigned long count, loff_t *ppos)
784{
785 size_t tot_len = 0;
786 int i;
787 for (i = 0 ; i < count ; i++)
788 tot_len += vector[i].iov_len;
789 return sock_readv_writev(VERIFY_READ,
790 file, vector, count, tot_len);
791} 787}
792 788
793 789
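
The rewrite above folds the old sock_readv_writev() into shared do_sock_read()/do_sock_write() helpers, and the synchronous readv/writev entry points now drive the aio path with an on-stack kiocb. A condensed sketch of that sync-over-aio pattern (the wrapper name is hypothetical; the primitives are the ones used in the hunk):

static ssize_t sync_sock_op(struct file *file, const struct iovec *iov,
			    unsigned long nr_segs,
			    ssize_t (*op)(struct msghdr *, struct kiocb *,
					  struct file *, struct iovec *,
					  unsigned long))
{
	struct kiocb iocb;
	struct sock_iocb siocb;
	struct msghdr msg;
	ssize_t ret;

	init_sync_kiocb(&iocb, NULL);	/* mark the kiocb synchronous */
	iocb.private = &siocb;		/* sock layer state lives here */

	ret = op(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
	if (ret == -EIOCBQUEUED)	/* the protocol queued it anyway */
		ret = wait_on_sync_kiocb(&iocb);
	return ret;
}
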
@@ -904,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
904 break; 900 break;
905 default: 901 default:
906 err = sock->ops->ioctl(sock, cmd, arg); 902 err = sock->ops->ioctl(sock, cmd, arg);
903
904 /*
905 * If this ioctl is unknown try to hand it down
906 * to the NIC driver.
907 */
908 if (err == -ENOIOCTLCMD)
909 err = dev_ioctl(cmd, argp);
907 break; 910 break;
908 } 911 }
909 return err; 912 return err;
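
This hunk pairs with the af_unix, wanpipe and x25 hunks later in this patch: per-family ioctl handlers stop calling dev_ioctl() themselves and instead return -ENOIOCTLCMD, so the fallback to the NIC driver now happens in exactly one place. A sketch of the contract a family handler follows from now on (the function name is hypothetical):

static int example_family_ioctl(struct socket *sock, unsigned int cmd,
				unsigned long arg)
{
	switch (cmd) {
	/* family-specific commands would be handled here ... */
	default:
		return -ENOIOCTLCMD;	/* sock_ioctl() tries dev_ioctl() */
	}
}
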
@@ -2036,7 +2039,7 @@ int sock_unregister(int family)
2036 return 0; 2039 return 0;
2037} 2040}
2038 2041
2039void __init sock_init(void) 2042static int __init sock_init(void)
2040{ 2043{
2041 /* 2044 /*
2042 * Initialize sock SLAB cache. 2045 * Initialize sock SLAB cache.
@@ -2044,12 +2047,10 @@ void __init sock_init(void)
2044 2047
2045 sk_init(); 2048 sk_init();
2046 2049
2047#ifdef SLAB_SKB
2048 /* 2050 /*
2049 * Initialize skbuff SLAB cache 2051 * Initialize skbuff SLAB cache
2050 */ 2052 */
2051 skb_init(); 2053 skb_init();
2052#endif
2053 2054
2054 /* 2055 /*
2055 * Initialize the protocols module. 2056 * Initialize the protocols module.
@@ -2058,15 +2059,19 @@ void __init sock_init(void)
2058 init_inodecache(); 2059 init_inodecache();
2059 register_filesystem(&sock_fs_type); 2060 register_filesystem(&sock_fs_type);
2060 sock_mnt = kern_mount(&sock_fs_type); 2061 sock_mnt = kern_mount(&sock_fs_type);
2061 /* The real protocol initialization is performed when 2062
2062 * do_initcalls is run. 2063 /* The real protocol initialization is performed in later initcalls.
2063 */ 2064 */
2064 2065
2065#ifdef CONFIG_NETFILTER 2066#ifdef CONFIG_NETFILTER
2066 netfilter_init(); 2067 netfilter_init();
2067#endif 2068#endif
2069
2070 return 0;
2068} 2071}
2069 2072
2073core_initcall(sock_init); /* early initcall */
2074
2070#ifdef CONFIG_PROC_FS 2075#ifdef CONFIG_PROC_FS
2071void socket_seq_show(struct seq_file *seq) 2076void socket_seq_show(struct seq_file *seq)
2072{ 2077{
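
sock_init() stops being an exported function invoked by hand from generic init code and becomes an ordinary initcall: static, returning int, registered at the core_initcall level so it still runs before the protocol initcalls that depend on the socket layer. The pattern, with a hypothetical subsystem name:

static int __init example_subsys_init(void)
{
	/* ... allocate caches, register filesystems ... */
	return 0;	/* nonzero would mark the initcall as failed */
}
core_initcall(example_subsys_init);	/* early, per the comment above */
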
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c6a51911e71e..d68eba481291 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk)
758 struct svc_serv *serv = svsk->sk_server; 758 struct svc_serv *serv = svsk->sk_server;
759 struct socket *sock = svsk->sk_sock; 759 struct socket *sock = svsk->sk_sock;
760 struct socket *newsock; 760 struct socket *newsock;
761 struct proto_ops *ops; 761 const struct proto_ops *ops;
762 struct svc_sock *newsvsk; 762 struct svc_sock *newsvsk;
763 int err, slen; 763 int err, slen;
764 764
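
The one-line change here belongs to the same sweep as the af_unix, wanpipe and x25 hunks below: proto_ops tables become const so they can live in read-only data, which in turn forces every pointer that stores one to carry the qualifier. A minimal sketch (values illustrative only):

static const struct proto_ops example_ops = {
	.family = PF_UNIX,	/* illustrative only */
	.owner  = THIS_MODULE,
};

/* Every holder needs the qualifier too, as in svc_tcp_accept() above. */
static const struct proto_ops *ops = &example_ops;
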
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index acc73ba8bade..5f6ae79b8b16 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -121,7 +121,7 @@
121int sysctl_unix_max_dgram_qlen = 10; 121int sysctl_unix_max_dgram_qlen = 10;
122 122
123struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; 123struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
124DEFINE_RWLOCK(unix_table_lock); 124DEFINE_SPINLOCK(unix_table_lock);
125static atomic_t unix_nr_socks = ATOMIC_INIT(0); 125static atomic_t unix_nr_socks = ATOMIC_INIT(0);
126 126
127#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) 127#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE])
@@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0);
130 130
131/* 131/*
132 * SMP locking strategy: 132 * SMP locking strategy:
133 * hash table is protected with rwlock unix_table_lock 133 * hash table is protected with spinlock unix_table_lock
134 * each socket state is protected by separate rwlock. 134 * each socket state is protected by separate rwlock.
135 */ 135 */
136 136
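
Every lock site in af_unix from here on follows the same mechanical conversion: unix_table_lock changes from rwlock to spinlock, so the former read_lock() sites and the former write_lock() sites collapse onto a single primitive. The shape of the conversion, sketched against a hypothetical table:

static DEFINE_SPINLOCK(example_table_lock);	/* was DEFINE_RWLOCK() */

static void example_table_walk(void)
{
	spin_lock(&example_table_lock);	/* was read_lock() or write_lock() */
	/* ... look up or modify the hash table ... */
	spin_unlock(&example_table_lock);
}
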
@@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
214 214
215static inline void unix_remove_socket(struct sock *sk) 215static inline void unix_remove_socket(struct sock *sk)
216{ 216{
217 write_lock(&unix_table_lock); 217 spin_lock(&unix_table_lock);
218 __unix_remove_socket(sk); 218 __unix_remove_socket(sk);
219 write_unlock(&unix_table_lock); 219 spin_unlock(&unix_table_lock);
220} 220}
221 221
222static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) 222static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
223{ 223{
224 write_lock(&unix_table_lock); 224 spin_lock(&unix_table_lock);
225 __unix_insert_socket(list, sk); 225 __unix_insert_socket(list, sk);
226 write_unlock(&unix_table_lock); 226 spin_unlock(&unix_table_lock);
227} 227}
228 228
229static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, 229static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
@@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
250{ 250{
251 struct sock *s; 251 struct sock *s;
252 252
253 read_lock(&unix_table_lock); 253 spin_lock(&unix_table_lock);
254 s = __unix_find_socket_byname(sunname, len, type, hash); 254 s = __unix_find_socket_byname(sunname, len, type, hash);
255 if (s) 255 if (s)
256 sock_hold(s); 256 sock_hold(s);
257 read_unlock(&unix_table_lock); 257 spin_unlock(&unix_table_lock);
258 return s; 258 return s;
259} 259}
260 260
@@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
263 struct sock *s; 263 struct sock *s;
264 struct hlist_node *node; 264 struct hlist_node *node;
265 265
266 read_lock(&unix_table_lock); 266 spin_lock(&unix_table_lock);
267 sk_for_each(s, node, 267 sk_for_each(s, node,
268 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { 268 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
269 struct dentry *dentry = unix_sk(s)->dentry; 269 struct dentry *dentry = unix_sk(s)->dentry;
@@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
276 } 276 }
277 s = NULL; 277 s = NULL;
278found: 278found:
279 read_unlock(&unix_table_lock); 279 spin_unlock(&unix_table_lock);
280 return s; 280 return s;
281} 281}
282 282
@@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *,
473static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, 473static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
474 struct msghdr *, size_t); 474 struct msghdr *, size_t);
475 475
476static struct proto_ops unix_stream_ops = { 476static const struct proto_ops unix_stream_ops = {
477 .family = PF_UNIX, 477 .family = PF_UNIX,
478 .owner = THIS_MODULE, 478 .owner = THIS_MODULE,
479 .release = unix_release, 479 .release = unix_release,
@@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = {
494 .sendpage = sock_no_sendpage, 494 .sendpage = sock_no_sendpage,
495}; 495};
496 496
497static struct proto_ops unix_dgram_ops = { 497static const struct proto_ops unix_dgram_ops = {
498 .family = PF_UNIX, 498 .family = PF_UNIX,
499 .owner = THIS_MODULE, 499 .owner = THIS_MODULE,
500 .release = unix_release, 500 .release = unix_release,
@@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = {
515 .sendpage = sock_no_sendpage, 515 .sendpage = sock_no_sendpage,
516}; 516};
517 517
518static struct proto_ops unix_seqpacket_ops = { 518static const struct proto_ops unix_seqpacket_ops = {
519 .family = PF_UNIX, 519 .family = PF_UNIX,
520 .owner = THIS_MODULE, 520 .owner = THIS_MODULE,
521 .release = unix_release, 521 .release = unix_release,
@@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock)
564 u = unix_sk(sk); 564 u = unix_sk(sk);
565 u->dentry = NULL; 565 u->dentry = NULL;
566 u->mnt = NULL; 566 u->mnt = NULL;
567 rwlock_init(&u->lock); 567 spin_lock_init(&u->lock);
568 atomic_set(&u->inflight, sock ? 0 : -1); 568 atomic_set(&u->inflight, sock ? 0 : -1);
569 init_MUTEX(&u->readsem); /* single task reading lock */ 569 init_MUTEX(&u->readsem); /* single task reading lock */
570 init_waitqueue_head(&u->peer_wait); 570 init_waitqueue_head(&u->peer_wait);
@@ -642,12 +642,12 @@ retry:
642 addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); 642 addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
643 addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); 643 addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
644 644
645 write_lock(&unix_table_lock); 645 spin_lock(&unix_table_lock);
646 ordernum = (ordernum+1)&0xFFFFF; 646 ordernum = (ordernum+1)&0xFFFFF;
647 647
648 if (__unix_find_socket_byname(addr->name, addr->len, sock->type, 648 if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
649 addr->hash)) { 649 addr->hash)) {
650 write_unlock(&unix_table_lock); 650 spin_unlock(&unix_table_lock);
651 /* Sanity yield. It is unusual case, but yet... */ 651 /* Sanity yield. It is unusual case, but yet... */
652 if (!(ordernum&0xFF)) 652 if (!(ordernum&0xFF))
653 yield(); 653 yield();
@@ -658,7 +658,7 @@ retry:
658 __unix_remove_socket(sk); 658 __unix_remove_socket(sk);
659 u->addr = addr; 659 u->addr = addr;
660 __unix_insert_socket(&unix_socket_table[addr->hash], sk); 660 __unix_insert_socket(&unix_socket_table[addr->hash], sk);
661 write_unlock(&unix_table_lock); 661 spin_unlock(&unix_table_lock);
662 err = 0; 662 err = 0;
663 663
664out: up(&u->readsem); 664out: up(&u->readsem);
@@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
791 addr->hash = UNIX_HASH_SIZE; 791 addr->hash = UNIX_HASH_SIZE;
792 } 792 }
793 793
794 write_lock(&unix_table_lock); 794 spin_lock(&unix_table_lock);
795 795
796 if (!sunaddr->sun_path[0]) { 796 if (!sunaddr->sun_path[0]) {
797 err = -EADDRINUSE; 797 err = -EADDRINUSE;
@@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
814 __unix_insert_socket(list, sk); 814 __unix_insert_socket(list, sk);
815 815
816out_unlock: 816out_unlock:
817 write_unlock(&unix_table_lock); 817 spin_unlock(&unix_table_lock);
818out_up: 818out_up:
819 up(&u->readsem); 819 up(&u->readsem);
820out: 820out:
@@ -1063,10 +1063,12 @@ restart:
1063 /* Set credentials */ 1063 /* Set credentials */
1064 sk->sk_peercred = other->sk_peercred; 1064 sk->sk_peercred = other->sk_peercred;
1065 1065
1066 sock_hold(newsk);
1067 unix_peer(sk) = newsk;
1068 sock->state = SS_CONNECTED; 1066 sock->state = SS_CONNECTED;
1069 sk->sk_state = TCP_ESTABLISHED; 1067 sk->sk_state = TCP_ESTABLISHED;
1068 sock_hold(newsk);
1069
1070 smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */
1071 unix_peer(sk) = newsk;
1070 1072
1071 unix_state_wunlock(sk); 1073 unix_state_wunlock(sk);
1072 1074
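
The reordering in this hunk looks cosmetic but is not: a reader that observes unix_peer(sk) non-NULL will assume the reference on newsk is already counted, and sock_hold() is only an atomic_inc(), which does not by itself order the store that publishes the pointer. Hence the explicit barrier between the two:

sock_hold(newsk);		/* refcount: an atomic_inc() */
smp_mb__after_atomic_inc();	/* the inc must be visible before ... */
unix_peer(sk) = newsk;		/* ... the pointer is published */
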
@@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1414 } else { 1416 } else {
1415 sunaddr = NULL; 1417 sunaddr = NULL;
1416 err = -ENOTCONN; 1418 err = -ENOTCONN;
1417 other = unix_peer_get(sk); 1419 other = unix_peer(sk);
1418 if (!other) 1420 if (!other)
1419 goto out_err; 1421 goto out_err;
1420 } 1422 }
@@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1476 other->sk_data_ready(other, size); 1478 other->sk_data_ready(other, size);
1477 sent+=size; 1479 sent+=size;
1478 } 1480 }
1479 sock_put(other);
1480 1481
1481 scm_destroy(siocb->scm); 1482 scm_destroy(siocb->scm);
1482 siocb->scm = NULL; 1483 siocb->scm = NULL;
@@ -1491,8 +1492,6 @@ pipe_err:
1491 send_sig(SIGPIPE,current,0); 1492 send_sig(SIGPIPE,current,0);
1492 err = -EPIPE; 1493 err = -EPIPE;
1493out_err: 1494out_err:
1494 if (other)
1495 sock_put(other);
1496 scm_destroy(siocb->scm); 1495 scm_destroy(siocb->scm);
1497 siocb->scm = NULL; 1496 siocb->scm = NULL;
1498 return sent ? : err; 1497 return sent ? : err;
@@ -1860,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1860 } 1859 }
1861 1860
1862 default: 1861 default:
1863 err = dev_ioctl(cmd, (void __user *)arg); 1862 err = -ENOIOCTLCMD;
1864 break; 1863 break;
1865 } 1864 }
1866 return err; 1865 return err;
@@ -1917,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos)
1917 1916
1918static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 1917static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1919{ 1918{
1920 read_lock(&unix_table_lock); 1919 spin_lock(&unix_table_lock);
1921 return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); 1920 return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1922} 1921}
1923 1922
@@ -1932,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1932 1931
1933static void unix_seq_stop(struct seq_file *seq, void *v) 1932static void unix_seq_stop(struct seq_file *seq, void *v)
1934{ 1933{
1935 read_unlock(&unix_table_lock); 1934 spin_unlock(&unix_table_lock);
1936} 1935}
1937 1936
1938static int unix_seq_show(struct seq_file *seq, void *v) 1937static int unix_seq_show(struct seq_file *seq, void *v)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6ffc64e1712d..411802bd4d37 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -182,7 +182,7 @@ void unix_gc(void)
182 if (down_trylock(&unix_gc_sem)) 182 if (down_trylock(&unix_gc_sem))
183 return; 183 return;
184 184
185 read_lock(&unix_table_lock); 185 spin_lock(&unix_table_lock);
186 186
187 forall_unix_sockets(i, s) 187 forall_unix_sockets(i, s)
188 { 188 {
@@ -301,7 +301,7 @@ void unix_gc(void)
301 } 301 }
302 u->gc_tree = GC_ORPHAN; 302 u->gc_tree = GC_ORPHAN;
303 } 303 }
304 read_unlock(&unix_table_lock); 304 spin_unlock(&unix_table_lock);
305 305
306 /* 306 /*
307 * Here we are. Hitlist is filled. Die. 307 * Here we are. Hitlist is filled. Die.
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index 59fec59b2132..7a43ae4721ed 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -181,7 +181,7 @@ struct wanpipe_opt
181#endif 181#endif
182 182
183static int sk_count; 183static int sk_count;
184extern struct proto_ops wanpipe_ops; 184extern const struct proto_ops wanpipe_ops;
185static unsigned long find_free_critical; 185static unsigned long find_free_critical;
186 186
187static void wanpipe_unlink_driver(struct sock *sk); 187static void wanpipe_unlink_driver(struct sock *sk);
@@ -1839,7 +1839,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar
1839#endif 1839#endif
1840 1840
1841 default: 1841 default:
1842 return dev_ioctl(cmd,(void __user *) arg); 1842 return -ENOIOCTLCMD;
1843 } 1843 }
1844 /*NOTREACHED*/ 1844 /*NOTREACHED*/
1845} 1845}
@@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr
2546 return 0; 2546 return 0;
2547} 2547}
2548 2548
2549struct proto_ops wanpipe_ops = { 2549const struct proto_ops wanpipe_ops = {
2550 .family = PF_WANPIPE, 2550 .family = PF_WANPIPE,
2551 .owner = THIS_MODULE, 2551 .owner = THIS_MODULE,
2552 .release = wanpipe_release, 2552 .release = wanpipe_release,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 020d73cc8414..16459c7f54b2 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2;
64HLIST_HEAD(x25_list); 64HLIST_HEAD(x25_list);
65DEFINE_RWLOCK(x25_list_lock); 65DEFINE_RWLOCK(x25_list_lock);
66 66
67static struct proto_ops x25_proto_ops; 67static const struct proto_ops x25_proto_ops;
68 68
69static struct x25_address null_x25_address = {" "}; 69static struct x25_address null_x25_address = {" "};
70 70
@@ -1378,7 +1378,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1378 } 1378 }
1379 1379
1380 default: 1380 default:
1381 rc = dev_ioctl(cmd, argp); 1381 rc = -ENOIOCTLCMD;
1382 break; 1382 break;
1383 } 1383 }
1384 1384
@@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = {
1391 .owner = THIS_MODULE, 1391 .owner = THIS_MODULE,
1392}; 1392};
1393 1393
1394static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { 1394static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
1395 .family = AF_X25, 1395 .family = AF_X25,
1396 .owner = THIS_MODULE, 1396 .owner = THIS_MODULE,
1397 .release = x25_release, 1397 .release = x25_release,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d19e274b9c4a..64a447375fdb 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -10,7 +10,7 @@
10 * YOSHIFUJI Hideaki 10 * YOSHIFUJI Hideaki
11 * Split up af-specific portion 11 * Split up af-specific portion
12 * Derek Atkins <derek@ihtfp.com> Add the post_input processor 12 * Derek Atkins <derek@ihtfp.com> Add the post_input processor
13 * 13 *
14 */ 14 */
15 15
16#include <asm/bug.h> 16#include <asm/bug.h>
@@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy)
256 if (del_timer(&policy->timer)) 256 if (del_timer(&policy->timer))
257 BUG(); 257 BUG();
258 258
259 security_xfrm_policy_free(policy);
259 kfree(policy); 260 kfree(policy);
260} 261}
261EXPORT_SYMBOL(__xfrm_policy_destroy); 262EXPORT_SYMBOL(__xfrm_policy_destroy);
@@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
350 351
351 write_lock_bh(&xfrm_policy_lock); 352 write_lock_bh(&xfrm_policy_lock);
352 for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { 353 for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
353 if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) { 354 if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 &&
355 xfrm_sec_ctx_match(pol->security, policy->security)) {
354 if (excl) { 356 if (excl) {
355 write_unlock_bh(&xfrm_policy_lock); 357 write_unlock_bh(&xfrm_policy_lock);
356 return -EEXIST; 358 return -EEXIST;
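
Policy insertion now treats the security context as part of the lookup key, so two policies with identical selectors but different contexts no longer collide. xfrm_sec_ctx_match() itself is defined in the headers, outside this diff; a plausible sketch of its shape, on the assumption that two absent contexts compare equal and present ones must agree on DOI, algorithm and SID:

/* Sketch only: the real helper lives in include/net/xfrm.h and may
 * differ in detail.
 */
static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1,
				     struct xfrm_sec_ctx *s2)
{
	if (!s1 && !s2)
		return 1;	/* neither side labeled: match */
	return s1 && s2 &&
	       s1->ctx_doi == s2->ctx_doi &&
	       s1->ctx_alg == s2->ctx_alg &&
	       s1->ctx_sid == s2->ctx_sid;
}
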
@@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
416} 418}
417EXPORT_SYMBOL(xfrm_policy_insert); 419EXPORT_SYMBOL(xfrm_policy_insert);
418 420
419struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, 421struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
420 int delete) 422 struct xfrm_sec_ctx *ctx, int delete)
421{ 423{
422 struct xfrm_policy *pol, **p; 424 struct xfrm_policy *pol, **p;
423 425
424 write_lock_bh(&xfrm_policy_lock); 426 write_lock_bh(&xfrm_policy_lock);
425 for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { 427 for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
426 if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) { 428 if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) &&
429 (xfrm_sec_ctx_match(ctx, pol->security))) {
427 xfrm_pol_hold(pol); 430 xfrm_pol_hold(pol);
428 if (delete) 431 if (delete)
429 *p = pol->next; 432 *p = pol->next;
@@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
438 } 441 }
439 return pol; 442 return pol;
440} 443}
441EXPORT_SYMBOL(xfrm_policy_bysel); 444EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
442 445
443struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete) 446struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
444{ 447{
@@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk);
519 522
520/* Find policy to apply to this flow. */ 523/* Find policy to apply to this flow. */
521 524
522static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, 525static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir,
523 void **objp, atomic_t **obj_refp) 526 void **objp, atomic_t **obj_refp)
524{ 527{
525 struct xfrm_policy *pol; 528 struct xfrm_policy *pol;
@@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
533 continue; 536 continue;
534 537
535 match = xfrm_selector_match(sel, fl, family); 538 match = xfrm_selector_match(sel, fl, family);
539
536 if (match) { 540 if (match) {
537 xfrm_pol_hold(pol); 541 if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) {
538 break; 542 xfrm_pol_hold(pol);
543 break;
544 }
539 } 545 }
540 } 546 }
541 read_unlock_bh(&xfrm_policy_lock); 547 read_unlock_bh(&xfrm_policy_lock);
@@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
543 *obj_refp = &pol->refcnt; 549 *obj_refp = &pol->refcnt;
544} 550}
545 551
546static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) 552static inline int policy_to_flow_dir(int dir)
553{
554 if (XFRM_POLICY_IN == FLOW_DIR_IN &&
555 XFRM_POLICY_OUT == FLOW_DIR_OUT &&
556 XFRM_POLICY_FWD == FLOW_DIR_FWD)
557 return dir;
558 switch (dir) {
559 default:
560 case XFRM_POLICY_IN:
561 return FLOW_DIR_IN;
562 case XFRM_POLICY_OUT:
563 return FLOW_DIR_OUT;
564 case XFRM_POLICY_FWD:
565 return FLOW_DIR_FWD;
566 };
567}
568
569static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid)
547{ 570{
548 struct xfrm_policy *pol; 571 struct xfrm_policy *pol;
549 572
550 read_lock_bh(&xfrm_policy_lock); 573 read_lock_bh(&xfrm_policy_lock);
551 if ((pol = sk->sk_policy[dir]) != NULL) { 574 if ((pol = sk->sk_policy[dir]) != NULL) {
552 int match = xfrm_selector_match(&pol->selector, fl, 575 int match = xfrm_selector_match(&pol->selector, fl,
553 sk->sk_family); 576 sk->sk_family);
577 int err = 0;
578
554 if (match) 579 if (match)
580 err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir));
581
582 if (match && !err)
555 xfrm_pol_hold(pol); 583 xfrm_pol_hold(pol);
556 else 584 else
557 pol = NULL; 585 pol = NULL;
@@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
624 652
625 if (newp) { 653 if (newp) {
626 newp->selector = old->selector; 654 newp->selector = old->selector;
655 if (security_xfrm_policy_clone(old, newp)) {
656 kfree(newp);
657 return NULL; /* ENOMEM */
658 }
627 newp->lft = old->lft; 659 newp->lft = old->lft;
628 newp->curlft = old->curlft; 660 newp->curlft = old->curlft;
629 newp->action = old->action; 661 newp->action = old->action;
@@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
735 return err; 767 return err;
736} 768}
737 769
738static inline int policy_to_flow_dir(int dir)
739{
740 if (XFRM_POLICY_IN == FLOW_DIR_IN &&
741 XFRM_POLICY_OUT == FLOW_DIR_OUT &&
742 XFRM_POLICY_FWD == FLOW_DIR_FWD)
743 return dir;
744 switch (dir) {
745 default:
746 case XFRM_POLICY_IN:
747 return FLOW_DIR_IN;
748 case XFRM_POLICY_OUT:
749 return FLOW_DIR_OUT;
750 case XFRM_POLICY_FWD:
751 return FLOW_DIR_FWD;
752 };
753}
754 770
755static int stale_bundle(struct dst_entry *dst); 771static int stale_bundle(struct dst_entry *dst);
756 772
@@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
769 int err; 785 int err;
770 u32 genid; 786 u32 genid;
771 u16 family = dst_orig->ops->family; 787 u16 family = dst_orig->ops->family;
788 u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
789 u32 sk_sid = security_sk_sid(sk, fl, dir);
772restart: 790restart:
773 genid = atomic_read(&flow_cache_genid); 791 genid = atomic_read(&flow_cache_genid);
774 policy = NULL; 792 policy = NULL;
775 if (sk && sk->sk_policy[1]) 793 if (sk && sk->sk_policy[1])
776 policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); 794 policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid);
777 795
778 if (!policy) { 796 if (!policy) {
779 /* To accelerate a bit... */ 797 /* To accelerate a bit... */
780 if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT]) 798 if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
781 return 0; 799 return 0;
782 800
783 policy = flow_cache_lookup(fl, family, 801 policy = flow_cache_lookup(fl, sk_sid, family, dir,
784 policy_to_flow_dir(XFRM_POLICY_OUT),
785 xfrm_policy_lookup); 802 xfrm_policy_lookup);
786 } 803 }
787 804
@@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
962{ 979{
963 struct xfrm_policy *pol; 980 struct xfrm_policy *pol;
964 struct flowi fl; 981 struct flowi fl;
982 u8 fl_dir = policy_to_flow_dir(dir);
983 u32 sk_sid;
965 984
966 if (_decode_session(skb, &fl, family) < 0) 985 if (_decode_session(skb, &fl, family) < 0)
967 return 0; 986 return 0;
968 987
988 sk_sid = security_sk_sid(sk, &fl, fl_dir);
989
969 /* First, check used SA against their selectors. */ 990 /* First, check used SA against their selectors. */
970 if (skb->sp) { 991 if (skb->sp) {
971 int i; 992 int i;
972 993
973 for (i=skb->sp->len-1; i>=0; i--) { 994 for (i=skb->sp->len-1; i>=0; i--) {
974 struct sec_decap_state *xvec = &(skb->sp->x[i]); 995 struct sec_decap_state *xvec = &(skb->sp->x[i]);
975 if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family)) 996 if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
976 return 0; 997 return 0;
977 998
@@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
986 1007
987 pol = NULL; 1008 pol = NULL;
988 if (sk && sk->sk_policy[dir]) 1009 if (sk && sk->sk_policy[dir])
989 pol = xfrm_sk_policy_lookup(sk, dir, &fl); 1010 pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid);
990 1011
991 if (!pol) 1012 if (!pol)
992 pol = flow_cache_lookup(&fl, family, 1013 pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir,
993 policy_to_flow_dir(dir),
994 xfrm_policy_lookup); 1014 xfrm_policy_lookup);
995 1015
996 if (!pol) 1016 if (!pol)
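
Both lookup paths in __xfrm_policy_check() are now keyed by the flow's security id, computed once up front with security_sk_sid() and threaded through the per-socket and flow-cache lookups alike. Condensed from the hunks above:

u8  fl_dir = policy_to_flow_dir(dir);
u32 sk_sid = security_sk_sid(sk, &fl, fl_dir);	/* LSM id for this flow */

pol = NULL;
if (sk && sk->sk_policy[dir])
	pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid);
if (!pol)
	pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir,
				xfrm_policy_lookup);
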
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 479effc97666..e12d0be5f976 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -10,7 +10,7 @@
10 * Split up af-specific functions 10 * Split up af-specific functions
11 * Derek Atkins <derek@ihtfp.com> 11 * Derek Atkins <derek@ihtfp.com>
12 * Add UDP Encapsulation 12 * Add UDP Encapsulation
13 * 13 *
14 */ 14 */
15 15
16#include <linux/workqueue.h> 16#include <linux/workqueue.h>
@@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
70 x->type->destructor(x); 70 x->type->destructor(x);
71 xfrm_put_type(x->type); 71 xfrm_put_type(x->type);
72 } 72 }
73 security_xfrm_state_free(x);
73 kfree(x); 74 kfree(x);
74} 75}
75 76
@@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
343 selector. 344 selector.
344 */ 345 */
345 if (x->km.state == XFRM_STATE_VALID) { 346 if (x->km.state == XFRM_STATE_VALID) {
346 if (!xfrm_selector_match(&x->sel, fl, family)) 347 if (!xfrm_selector_match(&x->sel, fl, family) ||
348 !xfrm_sec_ctx_match(pol->security, x->security))
347 continue; 349 continue;
348 if (!best || 350 if (!best ||
349 best->km.dying > x->km.dying || 351 best->km.dying > x->km.dying ||
@@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
354 acquire_in_progress = 1; 356 acquire_in_progress = 1;
355 } else if (x->km.state == XFRM_STATE_ERROR || 357 } else if (x->km.state == XFRM_STATE_ERROR ||
356 x->km.state == XFRM_STATE_EXPIRED) { 358 x->km.state == XFRM_STATE_EXPIRED) {
357 if (xfrm_selector_match(&x->sel, fl, family)) 359 if (xfrm_selector_match(&x->sel, fl, family) &&
360 xfrm_sec_ctx_match(pol->security, x->security))
358 error = -ESRCH; 361 error = -ESRCH;
359 } 362 }
360 } 363 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0cdd9a07e043..92e2b804c606 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -7,7 +7,7 @@
7 * Kazunori MIYAZAWA @USAGI 7 * Kazunori MIYAZAWA @USAGI
8 * Kunihiro Ishiguro <kunihiro@ipinfusion.com> 8 * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
9 * IPv6 support 9 * IPv6 support
10 * 10 *
11 */ 11 */
12 12
13#include <linux/module.h> 13#include <linux/module.h>
@@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma)
88 return 0; 88 return 0;
89} 89}
90 90
91
92static inline int verify_sec_ctx_len(struct rtattr **xfrma)
93{
94 struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1];
95 struct xfrm_user_sec_ctx *uctx;
96 int len = 0;
97
98 if (!rt)
99 return 0;
100
101 if (rt->rta_len < sizeof(*uctx))
102 return -EINVAL;
103
104 uctx = RTA_DATA(rt);
105
106 if (uctx->ctx_len > PAGE_SIZE)
107 return -EINVAL;
108
109 len += sizeof(struct xfrm_user_sec_ctx);
110 len += uctx->ctx_len;
111
112 if (uctx->len != len)
113 return -EINVAL;
114
115 return 0;
116}
117
118
91static int verify_newsa_info(struct xfrm_usersa_info *p, 119static int verify_newsa_info(struct xfrm_usersa_info *p,
92 struct rtattr **xfrma) 120 struct rtattr **xfrma)
93{ 121{
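
verify_sec_ctx_len() enforces a self-describing layout: the attribute's own len field must equal the fixed header plus ctx_len, and ctx_len is capped at PAGE_SIZE. A hypothetical builder, not part of the patch, showing what a well-formed XFRMA_SEC_CTX payload looks like from the sender's side:

static int build_sec_ctx(struct xfrm_user_sec_ctx *uctx, u8 doi, u8 alg,
			 const char *ctx_str, u16 ctx_len)
{
	if (ctx_len > PAGE_SIZE)
		return -EINVAL;			/* mirrors the verifier */
	uctx->exttype = XFRMA_SEC_CTX;
	uctx->len     = sizeof(*uctx) + ctx_len; /* header + string */
	uctx->ctx_doi = doi;
	uctx->ctx_alg = alg;
	uctx->ctx_len = ctx_len;
	memcpy(uctx + 1, ctx_str, ctx_len);	/* string follows header */
	return 0;
}
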
@@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
145 goto out; 173 goto out;
146 if ((err = verify_encap_tmpl(xfrma))) 174 if ((err = verify_encap_tmpl(xfrma)))
147 goto out; 175 goto out;
176 if ((err = verify_sec_ctx_len(xfrma)))
177 goto out;
148 178
149 err = -EINVAL; 179 err = -EINVAL;
150 switch (p->mode) { 180 switch (p->mode) {
@@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a
209 return 0; 239 return 0;
210} 240}
211 241
242
243static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp)
244{
245 struct xfrm_sec_ctx *xfrm_ctx = xp->security;
246 int len = 0;
247
248 if (xfrm_ctx) {
249 len += sizeof(struct xfrm_user_sec_ctx);
250 len += xfrm_ctx->ctx_len;
251 }
252 return len;
253}
254
255static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg)
256{
257 struct xfrm_user_sec_ctx *uctx;
258
259 if (!u_arg)
260 return 0;
261
262 uctx = RTA_DATA(u_arg);
263 return security_xfrm_state_alloc(x, uctx);
264}
265
212static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) 266static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
213{ 267{
214 memcpy(&x->id, &p->id, sizeof(x->id)); 268 memcpy(&x->id, &p->id, sizeof(x->id));
@@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
253 if (err) 307 if (err)
254 goto error; 308 goto error;
255 309
310 if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1])))
311 goto error;
312
256 x->km.seq = p->seq; 313 x->km.seq = p->seq;
257 314
258 return x; 315 return x;
@@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
272 int err; 329 int err;
273 struct km_event c; 330 struct km_event c;
274 331
275 err = verify_newsa_info(p, (struct rtattr **) xfrma); 332 err = verify_newsa_info(p, (struct rtattr **)xfrma);
276 if (err) 333 if (err)
277 return err; 334 return err;
278 335
279 x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err); 336 x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err);
280 if (!x) 337 if (!x)
281 return err; 338 return err;
282 339
@@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
390 if (x->encap) 447 if (x->encap)
391 RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); 448 RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
392 449
450 if (x->security) {
451 int ctx_size = sizeof(struct xfrm_sec_ctx) +
452 x->security->ctx_len;
453 struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
454 struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
455
456 uctx->exttype = XFRMA_SEC_CTX;
457 uctx->len = ctx_size;
458 uctx->ctx_doi = x->security->ctx_doi;
459 uctx->ctx_alg = x->security->ctx_alg;
460 uctx->ctx_len = x->security->ctx_len;
461 memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len);
462 }
393 nlh->nlmsg_len = skb->tail - b; 463 nlh->nlmsg_len = skb->tail - b;
394out: 464out:
395 sp->this_idx++; 465 sp->this_idx++;
@@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
603 return verify_policy_dir(p->dir); 673 return verify_policy_dir(p->dir);
604} 674}
605 675
676static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma)
677{
678 struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1];
679 struct xfrm_user_sec_ctx *uctx;
680
681 if (!rt)
682 return 0;
683
684 uctx = RTA_DATA(rt);
685 return security_xfrm_policy_alloc(pol, uctx);
686}
687
606static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, 688static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
607 int nr) 689 int nr)
608{ 690{
@@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p,
681 } 763 }
682 764
683 copy_from_user_policy(xp, p); 765 copy_from_user_policy(xp, p);
684 err = copy_from_user_tmpl(xp, xfrma); 766
767 if (!(err = copy_from_user_tmpl(xp, xfrma)))
768 err = copy_from_user_sec_ctx(xp, xfrma);
769
685 if (err) { 770 if (err) {
686 *errp = err; 771 *errp = err;
687 kfree(xp); 772 kfree(xp);
@@ -702,8 +787,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
702 err = verify_newpolicy_info(p); 787 err = verify_newpolicy_info(p);
703 if (err) 788 if (err)
704 return err; 789 return err;
790 err = verify_sec_ctx_len((struct rtattr **)xfrma);
791 if (err)
792 return err;
705 793
706 xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err); 794 xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err);
707 if (!xp) 795 if (!xp)
708 return err; 796 return err;
709 797
@@ -761,6 +849,27 @@ rtattr_failure:
761 return -1; 849 return -1;
762} 850}
763 851
852static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
853{
854 if (xp->security) {
855 int ctx_size = sizeof(struct xfrm_sec_ctx) +
856 xp->security->ctx_len;
857 struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
858 struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
859
860 uctx->exttype = XFRMA_SEC_CTX;
861 uctx->len = ctx_size;
862 uctx->ctx_doi = xp->security->ctx_doi;
863 uctx->ctx_alg = xp->security->ctx_alg;
864 uctx->ctx_len = xp->security->ctx_len;
865 memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len);
866 }
867 return 0;
868
869 rtattr_failure:
870 return -1;
871}
872
764static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) 873static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
765{ 874{
766 struct xfrm_dump_info *sp = ptr; 875 struct xfrm_dump_info *sp = ptr;
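
Emitting a new attribute into the km notifications has a second half, visible in the xfrm_send_acquire() and xfrm_exp_policy_notify() hunks below: the skb allocation must grow by the attribute's worst-case size, or __RTA_PUT() will fail for lack of tailroom. The accounting, with variables as in xfrm_send_acquire():

len  = NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
len += RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));	/* the new XFRMA_SEC_CTX */
skb  = alloc_skb(len, GFP_ATOMIC);
if (skb == NULL)
	return -ENOMEM;
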
@@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
782 copy_to_user_policy(xp, p, dir); 891 copy_to_user_policy(xp, p, dir);
783 if (copy_to_user_tmpl(xp, skb) < 0) 892 if (copy_to_user_tmpl(xp, skb) < 0)
784 goto nlmsg_failure; 893 goto nlmsg_failure;
894 if (copy_to_user_sec_ctx(xp, skb))
895 goto nlmsg_failure;
785 896
786 nlh->nlmsg_len = skb->tail - b; 897 nlh->nlmsg_len = skb->tail - b;
787out: 898out:
@@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
852 963
853 if (p->index) 964 if (p->index)
854 xp = xfrm_policy_byid(p->dir, p->index, delete); 965 xp = xfrm_policy_byid(p->dir, p->index, delete);
855 else 966 else {
856 xp = xfrm_policy_bysel(p->dir, &p->sel, delete); 967 struct rtattr **rtattrs = (struct rtattr **)xfrma;
968 struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1];
969 struct xfrm_policy tmp;
970
971 err = verify_sec_ctx_len(rtattrs);
972 if (err)
973 return err;
974
975 memset(&tmp, 0, sizeof(struct xfrm_policy));
976 if (rt) {
977 struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
978
979 if ((err = security_xfrm_policy_alloc(&tmp, uctx)))
980 return err;
981 }
982 xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete);
983 security_xfrm_policy_free(&tmp);
984 }
857 if (xp == NULL) 985 if (xp == NULL)
858 return -ENOENT; 986 return -ENOENT;
859 987
@@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
1224 1352
1225 if (copy_to_user_tmpl(xp, skb) < 0) 1353 if (copy_to_user_tmpl(xp, skb) < 0)
1226 goto nlmsg_failure; 1354 goto nlmsg_failure;
1355 if (copy_to_user_sec_ctx(xp, skb))
1356 goto nlmsg_failure;
1227 1357
1228 nlh->nlmsg_len = skb->tail - b; 1358 nlh->nlmsg_len = skb->tail - b;
1229 return skb->len; 1359 return skb->len;
@@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
1241 1371
1242 len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); 1372 len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
1243 len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire)); 1373 len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
1374 len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
1244 skb = alloc_skb(len, GFP_ATOMIC); 1375 skb = alloc_skb(len, GFP_ATOMIC);
1245 if (skb == NULL) 1376 if (skb == NULL)
1246 return -ENOMEM; 1377 return -ENOMEM;
@@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
1324 copy_to_user_policy(xp, &upe->pol, dir); 1455 copy_to_user_policy(xp, &upe->pol, dir);
1325 if (copy_to_user_tmpl(xp, skb) < 0) 1456 if (copy_to_user_tmpl(xp, skb) < 0)
1326 goto nlmsg_failure; 1457 goto nlmsg_failure;
1458 if (copy_to_user_sec_ctx(xp, skb))
1459 goto nlmsg_failure;
1327 upe->hard = !!hard; 1460 upe->hard = !!hard;
1328 1461
1329 nlh->nlmsg_len = skb->tail - b; 1462 nlh->nlmsg_len = skb->tail - b;
@@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
1341 1474
1342 len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); 1475 len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
1343 len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire)); 1476 len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire));
1477 len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
1344 skb = alloc_skb(len, GFP_ATOMIC); 1478 skb = alloc_skb(len, GFP_ATOMIC);
1345 if (skb == NULL) 1479 if (skb == NULL)
1346 return -ENOMEM; 1480 return -ENOMEM;