aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c18
-rw-r--r--net/ipv4/arp.c31
-rw-r--r--net/ipv4/devinet.c97
-rw-r--r--net/ipv4/esp4.c32
-rw-r--r--net/ipv4/fib_frontend.c44
-rw-r--r--net/ipv4/fib_semantics.c8
-rw-r--r--net/ipv4/fib_trie.c2
-rw-r--r--net/ipv4/icmp.c31
-rw-r--r--net/ipv4/igmp.c284
-rw-r--r--net/ipv4/inet_connection_sock.c22
-rw-r--r--net/ipv4/inet_hashtables.c3
-rw-r--r--net/ipv4/inetpeer.c167
-rw-r--r--net/ipv4/ip_fragment.c36
-rw-r--r--net/ipv4/ip_gre.c50
-rw-r--r--net/ipv4/ip_output.c28
-rw-r--r--net/ipv4/ipconfig.c32
-rw-r--r--net/ipv4/ipip.c21
-rw-r--r--net/ipv4/ipmr.c18
-rw-r--r--net/ipv4/netfilter.c8
-rw-r--r--net/ipv4/netfilter/Makefile6
-rw-r--r--net/ipv4/netfilter/arp_tables.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c2
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c2
-rw-r--r--net/ipv4/proc.c9
-rw-r--r--net/ipv4/raw.c7
-rw-r--r--net/ipv4/route.c210
-rw-r--r--net/ipv4/syncookies.c15
-rw-r--r--net/ipv4/sysctl_net_ipv4.c18
-rw-r--r--net/ipv4/tcp.c6
-rw-r--r--net/ipv4/tcp_input.c62
-rw-r--r--net/ipv4/tcp_ipv4.c90
-rw-r--r--net/ipv4/tcp_minisocks.c65
-rw-r--r--net/ipv4/tcp_output.c79
-rw-r--r--net/ipv4/tcp_probe.c4
-rw-r--r--net/ipv4/udp.c23
-rw-r--r--net/ipv4/udplite.c1
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c2
-rw-r--r--net/ipv4/xfrm4_policy.c23
39 files changed, 875 insertions, 685 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f581f77d1097..f2b61107df6c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1148,21 +1148,13 @@ int inet_sk_rebuild_header(struct sock *sk)
1148 struct flowi fl = { 1148 struct flowi fl = {
1149 .oif = sk->sk_bound_dev_if, 1149 .oif = sk->sk_bound_dev_if,
1150 .mark = sk->sk_mark, 1150 .mark = sk->sk_mark,
1151 .nl_u = { 1151 .fl4_dst = daddr,
1152 .ip4_u = { 1152 .fl4_src = inet->inet_saddr,
1153 .daddr = daddr, 1153 .fl4_tos = RT_CONN_FLAGS(sk),
1154 .saddr = inet->inet_saddr,
1155 .tos = RT_CONN_FLAGS(sk),
1156 },
1157 },
1158 .proto = sk->sk_protocol, 1154 .proto = sk->sk_protocol,
1159 .flags = inet_sk_flowi_flags(sk), 1155 .flags = inet_sk_flowi_flags(sk),
1160 .uli_u = { 1156 .fl_ip_sport = inet->inet_sport,
1161 .ports = { 1157 .fl_ip_dport = inet->inet_dport,
1162 .sport = inet->inet_sport,
1163 .dport = inet->inet_dport,
1164 },
1165 },
1166 }; 1158 };
1167 1159
1168 security_sk_classify_flow(sk, &fl); 1160 security_sk_classify_flow(sk, &fl);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index d8e540c5b071..a2fc7b961dbc 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -433,8 +433,8 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
433 433
434static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) 434static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
435{ 435{
436 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, 436 struct flowi fl = { .fl4_dst = sip,
437 .saddr = tip } } }; 437 .fl4_src = tip };
438 struct rtable *rt; 438 struct rtable *rt;
439 int flag = 0; 439 int flag = 0;
440 /*unsigned long now; */ 440 /*unsigned long now; */
@@ -883,7 +883,7 @@ static int arp_process(struct sk_buff *skb)
883 883
884 dont_send = arp_ignore(in_dev, sip, tip); 884 dont_send = arp_ignore(in_dev, sip, tip);
885 if (!dont_send && IN_DEV_ARPFILTER(in_dev)) 885 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
886 dont_send |= arp_filter(sip, tip, dev); 886 dont_send = arp_filter(sip, tip, dev);
887 if (!dont_send) { 887 if (!dont_send) {
888 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 888 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
889 if (n) { 889 if (n) {
@@ -1017,13 +1017,14 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
1017 IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; 1017 IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
1018 return 0; 1018 return 0;
1019 } 1019 }
1020 if (__in_dev_get_rtnl(dev)) { 1020 if (__in_dev_get_rcu(dev)) {
1021 IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); 1021 IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on);
1022 return 0; 1022 return 0;
1023 } 1023 }
1024 return -ENXIO; 1024 return -ENXIO;
1025} 1025}
1026 1026
1027/* must be called with rcu_read_lock() */
1027static int arp_req_set_public(struct net *net, struct arpreq *r, 1028static int arp_req_set_public(struct net *net, struct arpreq *r,
1028 struct net_device *dev) 1029 struct net_device *dev)
1029{ 1030{
@@ -1033,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
1033 if (mask && mask != htonl(0xFFFFFFFF)) 1034 if (mask && mask != htonl(0xFFFFFFFF))
1034 return -EINVAL; 1035 return -EINVAL;
1035 if (!dev && (r->arp_flags & ATF_COM)) { 1036 if (!dev && (r->arp_flags & ATF_COM)) {
1036 dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, 1037 dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
1037 r->arp_ha.sa_data); 1038 r->arp_ha.sa_data);
1038 if (!dev) 1039 if (!dev)
1039 return -ENODEV; 1040 return -ENODEV;
@@ -1061,8 +1062,8 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1061 if (r->arp_flags & ATF_PERM) 1062 if (r->arp_flags & ATF_PERM)
1062 r->arp_flags |= ATF_COM; 1063 r->arp_flags |= ATF_COM;
1063 if (dev == NULL) { 1064 if (dev == NULL) {
1064 struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, 1065 struct flowi fl = { .fl4_dst = ip,
1065 .tos = RTO_ONLINK } }; 1066 .fl4_tos = RTO_ONLINK };
1066 struct rtable *rt; 1067 struct rtable *rt;
1067 err = ip_route_output_key(net, &rt, &fl); 1068 err = ip_route_output_key(net, &rt, &fl);
1068 if (err != 0) 1069 if (err != 0)
@@ -1169,8 +1170,8 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
1169 1170
1170 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; 1171 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
1171 if (dev == NULL) { 1172 if (dev == NULL) {
1172 struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, 1173 struct flowi fl = { .fl4_dst = ip,
1173 .tos = RTO_ONLINK } }; 1174 .fl4_tos = RTO_ONLINK };
1174 struct rtable *rt; 1175 struct rtable *rt;
1175 err = ip_route_output_key(net, &rt, &fl); 1176 err = ip_route_output_key(net, &rt, &fl);
1176 if (err != 0) 1177 if (err != 0)
@@ -1225,10 +1226,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1225 if (!(r.arp_flags & ATF_NETMASK)) 1226 if (!(r.arp_flags & ATF_NETMASK))
1226 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = 1227 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
1227 htonl(0xFFFFFFFFUL); 1228 htonl(0xFFFFFFFFUL);
1228 rtnl_lock(); 1229 rcu_read_lock();
1229 if (r.arp_dev[0]) { 1230 if (r.arp_dev[0]) {
1230 err = -ENODEV; 1231 err = -ENODEV;
1231 dev = __dev_get_by_name(net, r.arp_dev); 1232 dev = dev_get_by_name_rcu(net, r.arp_dev);
1232 if (dev == NULL) 1233 if (dev == NULL)
1233 goto out; 1234 goto out;
1234 1235
@@ -1252,12 +1253,12 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1252 break; 1253 break;
1253 case SIOCGARP: 1254 case SIOCGARP:
1254 err = arp_req_get(&r, dev); 1255 err = arp_req_get(&r, dev);
1255 if (!err && copy_to_user(arg, &r, sizeof(r)))
1256 err = -EFAULT;
1257 break; 1256 break;
1258 } 1257 }
1259out: 1258out:
1260 rtnl_unlock(); 1259 rcu_read_unlock();
1260 if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
1261 err = -EFAULT;
1261 return err; 1262 return err;
1262} 1263}
1263 1264
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index dc94b0316b78..748cb5b337bd 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1256,6 +1256,87 @@ errout:
1256 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); 1256 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
1257} 1257}
1258 1258
1259static size_t inet_get_link_af_size(const struct net_device *dev)
1260{
1261 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1262
1263 if (!in_dev)
1264 return 0;
1265
1266 return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
1267}
1268
1269static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
1270{
1271 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1272 struct nlattr *nla;
1273 int i;
1274
1275 if (!in_dev)
1276 return -ENODATA;
1277
1278 nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
1279 if (nla == NULL)
1280 return -EMSGSIZE;
1281
1282 for (i = 0; i < IPV4_DEVCONF_MAX; i++)
1283 ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
1284
1285 return 0;
1286}
1287
1288static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
1289 [IFLA_INET_CONF] = { .type = NLA_NESTED },
1290};
1291
1292static int inet_validate_link_af(const struct net_device *dev,
1293 const struct nlattr *nla)
1294{
1295 struct nlattr *a, *tb[IFLA_INET_MAX+1];
1296 int err, rem;
1297
1298 if (dev && !__in_dev_get_rtnl(dev))
1299 return -EAFNOSUPPORT;
1300
1301 err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
1302 if (err < 0)
1303 return err;
1304
1305 if (tb[IFLA_INET_CONF]) {
1306 nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
1307 int cfgid = nla_type(a);
1308
1309 if (nla_len(a) < 4)
1310 return -EINVAL;
1311
1312 if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
1313 return -EINVAL;
1314 }
1315 }
1316
1317 return 0;
1318}
1319
1320static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
1321{
1322 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1323 struct nlattr *a, *tb[IFLA_INET_MAX+1];
1324 int rem;
1325
1326 if (!in_dev)
1327 return -EAFNOSUPPORT;
1328
1329 if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
1330 BUG();
1331
1332 if (tb[IFLA_INET_CONF]) {
1333 nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
1334 ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
1335 }
1336
1337 return 0;
1338}
1339
1259#ifdef CONFIG_SYSCTL 1340#ifdef CONFIG_SYSCTL
1260 1341
1261static void devinet_copy_dflt_conf(struct net *net, int i) 1342static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1349,9 +1430,9 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
1349 return ret; 1430 return ret;
1350} 1431}
1351 1432
1352int ipv4_doint_and_flush(ctl_table *ctl, int write, 1433static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1353 void __user *buffer, 1434 void __user *buffer,
1354 size_t *lenp, loff_t *ppos) 1435 size_t *lenp, loff_t *ppos)
1355{ 1436{
1356 int *valp = ctl->data; 1437 int *valp = ctl->data;
1357 int val = *valp; 1438 int val = *valp;
@@ -1619,6 +1700,14 @@ static __net_initdata struct pernet_operations devinet_ops = {
1619 .exit = devinet_exit_net, 1700 .exit = devinet_exit_net,
1620}; 1701};
1621 1702
1703static struct rtnl_af_ops inet_af_ops = {
1704 .family = AF_INET,
1705 .fill_link_af = inet_fill_link_af,
1706 .get_link_af_size = inet_get_link_af_size,
1707 .validate_link_af = inet_validate_link_af,
1708 .set_link_af = inet_set_link_af,
1709};
1710
1622void __init devinet_init(void) 1711void __init devinet_init(void)
1623{ 1712{
1624 register_pernet_subsys(&devinet_ops); 1713 register_pernet_subsys(&devinet_ops);
@@ -1626,6 +1715,8 @@ void __init devinet_init(void)
1626 register_gifconf(PF_INET, inet_gifconf); 1715 register_gifconf(PF_INET, inet_gifconf);
1627 register_netdevice_notifier(&ip_netdev_notifier); 1716 register_netdevice_notifier(&ip_netdev_notifier);
1628 1717
1718 rtnl_af_register(&inet_af_ops);
1719
1629 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); 1720 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
1630 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); 1721 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
1631 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); 1722 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 14ca1f1c3fb0..e42a905180f0 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,8 @@ struct esp_skb_cb {
23 23
24#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) 24#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
25 25
26static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
27
26/* 28/*
27 * Allocate an AEAD request structure with extra space for SG and IV. 29 * Allocate an AEAD request structure with extra space for SG and IV.
28 * 30 *
@@ -117,25 +119,35 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
117 int blksize; 119 int blksize;
118 int clen; 120 int clen;
119 int alen; 121 int alen;
122 int plen;
123 int tfclen;
120 int nfrags; 124 int nfrags;
121 125
122 /* skb is pure payload to encrypt */ 126 /* skb is pure payload to encrypt */
123 127
124 err = -ENOMEM; 128 err = -ENOMEM;
125 129
126 /* Round to block size */
127 clen = skb->len;
128
129 esp = x->data; 130 esp = x->data;
130 aead = esp->aead; 131 aead = esp->aead;
131 alen = crypto_aead_authsize(aead); 132 alen = crypto_aead_authsize(aead);
132 133
134 tfclen = 0;
135 if (x->tfcpad) {
136 struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
137 u32 padto;
138
139 padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
140 if (skb->len < padto)
141 tfclen = padto - skb->len;
142 }
133 blksize = ALIGN(crypto_aead_blocksize(aead), 4); 143 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
134 clen = ALIGN(clen + 2, blksize); 144 clen = ALIGN(skb->len + 2 + tfclen, blksize);
135 if (esp->padlen) 145 if (esp->padlen)
136 clen = ALIGN(clen, esp->padlen); 146 clen = ALIGN(clen, esp->padlen);
147 plen = clen - skb->len - tfclen;
137 148
138 if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0) 149 err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
150 if (err < 0)
139 goto error; 151 goto error;
140 nfrags = err; 152 nfrags = err;
141 153
@@ -150,13 +162,17 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
150 162
151 /* Fill padding... */ 163 /* Fill padding... */
152 tail = skb_tail_pointer(trailer); 164 tail = skb_tail_pointer(trailer);
165 if (tfclen) {
166 memset(tail, 0, tfclen);
167 tail += tfclen;
168 }
153 do { 169 do {
154 int i; 170 int i;
155 for (i=0; i<clen-skb->len - 2; i++) 171 for (i = 0; i < plen - 2; i++)
156 tail[i] = i + 1; 172 tail[i] = i + 1;
157 } while (0); 173 } while (0);
158 tail[clen - skb->len - 2] = (clen - skb->len) - 2; 174 tail[plen - 2] = plen - 2;
159 tail[clen - skb->len - 1] = *skb_mac_header(skb); 175 tail[plen - 1] = *skb_mac_header(skb);
160 pskb_put(skb, trailer, clen - skb->len + alen); 176 pskb_put(skb, trailer, clen - skb->len + alen);
161 177
162 skb_push(skb, -skb_network_offset(skb)); 178 skb_push(skb, -skb_network_offset(skb));
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index eb6f69a8f27a..1d2cdd43a878 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -158,18 +158,20 @@ static void fib_flush(struct net *net)
158struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) 158struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
159{ 159{
160 struct flowi fl = { 160 struct flowi fl = {
161 .nl_u = { 161 .fl4_dst = addr,
162 .ip4_u = {
163 .daddr = addr
164 }
165 },
166 .flags = FLOWI_FLAG_MATCH_ANY_IIF
167 }; 162 };
168 struct fib_result res = { 0 }; 163 struct fib_result res = { 0 };
169 struct net_device *dev = NULL; 164 struct net_device *dev = NULL;
165 struct fib_table *local_table;
166
167#ifdef CONFIG_IP_MULTIPLE_TABLES
168 res.r = NULL;
169#endif
170 170
171 rcu_read_lock(); 171 rcu_read_lock();
172 if (fib_lookup(net, &fl, &res)) { 172 local_table = fib_get_table(net, RT_TABLE_LOCAL);
173 if (!local_table ||
174 fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
173 rcu_read_unlock(); 175 rcu_read_unlock();
174 return NULL; 176 return NULL;
175 } 177 }
@@ -193,7 +195,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
193 const struct net_device *dev, 195 const struct net_device *dev,
194 __be32 addr) 196 __be32 addr)
195{ 197{
196 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; 198 struct flowi fl = { .fl4_dst = addr };
197 struct fib_result res; 199 struct fib_result res;
198 unsigned ret = RTN_BROADCAST; 200 unsigned ret = RTN_BROADCAST;
199 struct fib_table *local_table; 201 struct fib_table *local_table;
@@ -247,13 +249,9 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
247{ 249{
248 struct in_device *in_dev; 250 struct in_device *in_dev;
249 struct flowi fl = { 251 struct flowi fl = {
250 .nl_u = { 252 .fl4_dst = src,
251 .ip4_u = { 253 .fl4_src = dst,
252 .daddr = src, 254 .fl4_tos = tos,
253 .saddr = dst,
254 .tos = tos
255 }
256 },
257 .mark = mark, 255 .mark = mark,
258 .iif = oif 256 .iif = oif
259 }; 257 };
@@ -853,13 +851,9 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
853 struct fib_result res; 851 struct fib_result res;
854 struct flowi fl = { 852 struct flowi fl = {
855 .mark = frn->fl_mark, 853 .mark = frn->fl_mark,
856 .nl_u = { 854 .fl4_dst = frn->fl_addr,
857 .ip4_u = { 855 .fl4_tos = frn->fl_tos,
858 .daddr = frn->fl_addr, 856 .fl4_scope = frn->fl_scope,
859 .tos = frn->fl_tos,
860 .scope = frn->fl_scope
861 }
862 }
863 }; 857 };
864 858
865#ifdef CONFIG_IP_MULTIPLE_TABLES 859#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -999,7 +993,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
999 rt_cache_flush(dev_net(dev), 0); 993 rt_cache_flush(dev_net(dev), 0);
1000 break; 994 break;
1001 case NETDEV_UNREGISTER_BATCH: 995 case NETDEV_UNREGISTER_BATCH:
1002 rt_cache_flush_batch(); 996 /* The batch unregister is only called on the first
997 * device in the list of devices being unregistered.
998 * Therefore we should not pass dev_net(dev) in here.
999 */
1000 rt_cache_flush_batch(NULL);
1003 break; 1001 break;
1004 } 1002 }
1005 return NOTIFY_DONE; 1003 return NOTIFY_DONE;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a72c62d03106..9aff11d7278f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -563,12 +563,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
563 rcu_read_lock(); 563 rcu_read_lock();
564 { 564 {
565 struct flowi fl = { 565 struct flowi fl = {
566 .nl_u = { 566 .fl4_dst = nh->nh_gw,
567 .ip4_u = { 567 .fl4_scope = cfg->fc_scope + 1,
568 .daddr = nh->nh_gw,
569 .scope = cfg->fc_scope + 1,
570 },
571 },
572 .oif = nh->nh_oif, 568 .oif = nh->nh_oif,
573 }; 569 };
574 570
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 200eb538fbb3..0f280348e0fd 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -365,7 +365,7 @@ static struct tnode *tnode_alloc(size_t size)
365 if (size <= PAGE_SIZE) 365 if (size <= PAGE_SIZE)
366 return kzalloc(size, GFP_KERNEL); 366 return kzalloc(size, GFP_KERNEL);
367 else 367 else
368 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 368 return vzalloc(size);
369} 369}
370 370
371static void __tnode_vfree(struct work_struct *arg) 371static void __tnode_vfree(struct work_struct *arg)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c6e2affafbd3..4aa1b7f01ea0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -386,10 +386,9 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
386 daddr = icmp_param->replyopts.faddr; 386 daddr = icmp_param->replyopts.faddr;
387 } 387 }
388 { 388 {
389 struct flowi fl = { .nl_u = { .ip4_u = 389 struct flowi fl = { .fl4_dst= daddr,
390 { .daddr = daddr, 390 .fl4_src = rt->rt_spec_dst,
391 .saddr = rt->rt_spec_dst, 391 .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
392 .tos = RT_TOS(ip_hdr(skb)->tos) } },
393 .proto = IPPROTO_ICMP }; 392 .proto = IPPROTO_ICMP };
394 security_skb_classify_flow(skb, &fl); 393 security_skb_classify_flow(skb, &fl);
395 if (ip_route_output_key(net, &rt, &fl)) 394 if (ip_route_output_key(net, &rt, &fl))
@@ -542,22 +541,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
542 541
543 { 542 {
544 struct flowi fl = { 543 struct flowi fl = {
545 .nl_u = { 544 .fl4_dst = icmp_param.replyopts.srr ?
546 .ip4_u = { 545 icmp_param.replyopts.faddr : iph->saddr,
547 .daddr = icmp_param.replyopts.srr ? 546 .fl4_src = saddr,
548 icmp_param.replyopts.faddr : 547 .fl4_tos = RT_TOS(tos),
549 iph->saddr,
550 .saddr = saddr,
551 .tos = RT_TOS(tos)
552 }
553 },
554 .proto = IPPROTO_ICMP, 548 .proto = IPPROTO_ICMP,
555 .uli_u = { 549 .fl_icmp_type = type,
556 .icmpt = { 550 .fl_icmp_code = code,
557 .type = type,
558 .code = code
559 }
560 }
561 }; 551 };
562 int err; 552 int err;
563 struct rtable *rt2; 553 struct rtable *rt2;
@@ -569,6 +559,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
569 /* No need to clone since we're just using its address. */ 559 /* No need to clone since we're just using its address. */
570 rt2 = rt; 560 rt2 = rt;
571 561
562 if (!fl.nl_u.ip4_u.saddr)
563 fl.nl_u.ip4_u.saddr = rt->rt_src;
564
572 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); 565 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
573 switch (err) { 566 switch (err) {
574 case 0: 567 case 0:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 08d0d81ffc15..e0e77e297de3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -149,21 +149,37 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc);
149static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, 149static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
150 int sfcount, __be32 *psfsrc, int delta); 150 int sfcount, __be32 *psfsrc, int delta);
151 151
152
153static void ip_mc_list_reclaim(struct rcu_head *head)
154{
155 kfree(container_of(head, struct ip_mc_list, rcu));
156}
157
152static void ip_ma_put(struct ip_mc_list *im) 158static void ip_ma_put(struct ip_mc_list *im)
153{ 159{
154 if (atomic_dec_and_test(&im->refcnt)) { 160 if (atomic_dec_and_test(&im->refcnt)) {
155 in_dev_put(im->interface); 161 in_dev_put(im->interface);
156 kfree(im); 162 call_rcu(&im->rcu, ip_mc_list_reclaim);
157 } 163 }
158} 164}
159 165
166#define for_each_pmc_rcu(in_dev, pmc) \
167 for (pmc = rcu_dereference(in_dev->mc_list); \
168 pmc != NULL; \
169 pmc = rcu_dereference(pmc->next_rcu))
170
171#define for_each_pmc_rtnl(in_dev, pmc) \
172 for (pmc = rtnl_dereference(in_dev->mc_list); \
173 pmc != NULL; \
174 pmc = rtnl_dereference(pmc->next_rcu))
175
160#ifdef CONFIG_IP_MULTICAST 176#ifdef CONFIG_IP_MULTICAST
161 177
162/* 178/*
163 * Timer management 179 * Timer management
164 */ 180 */
165 181
166static __inline__ void igmp_stop_timer(struct ip_mc_list *im) 182static void igmp_stop_timer(struct ip_mc_list *im)
167{ 183{
168 spin_lock_bh(&im->lock); 184 spin_lock_bh(&im->lock);
169 if (del_timer(&im->timer)) 185 if (del_timer(&im->timer))
@@ -284,6 +300,8 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
284 return scount; 300 return scount;
285} 301}
286 302
303#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
304
287static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) 305static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
288{ 306{
289 struct sk_buff *skb; 307 struct sk_buff *skb;
@@ -292,14 +310,20 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
292 struct igmpv3_report *pig; 310 struct igmpv3_report *pig;
293 struct net *net = dev_net(dev); 311 struct net *net = dev_net(dev);
294 312
295 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); 313 while (1) {
296 if (skb == NULL) 314 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
297 return NULL; 315 GFP_ATOMIC | __GFP_NOWARN);
316 if (skb)
317 break;
318 size >>= 1;
319 if (size < 256)
320 return NULL;
321 }
322 igmp_skb_size(skb) = size;
298 323
299 { 324 {
300 struct flowi fl = { .oif = dev->ifindex, 325 struct flowi fl = { .oif = dev->ifindex,
301 .nl_u = { .ip4_u = { 326 .fl4_dst = IGMPV3_ALL_MCR,
302 .daddr = IGMPV3_ALL_MCR } },
303 .proto = IPPROTO_IGMP }; 327 .proto = IPPROTO_IGMP };
304 if (ip_route_output_key(net, &rt, &fl)) { 328 if (ip_route_output_key(net, &rt, &fl)) {
305 kfree_skb(skb); 329 kfree_skb(skb);
@@ -384,7 +408,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
384 return skb; 408 return skb;
385} 409}
386 410
387#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ 411#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
388 skb_tailroom(skb)) : 0) 412 skb_tailroom(skb)) : 0)
389 413
390static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, 414static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
@@ -502,8 +526,8 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
502 int type; 526 int type;
503 527
504 if (!pmc) { 528 if (!pmc) {
505 read_lock(&in_dev->mc_list_lock); 529 rcu_read_lock();
506 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 530 for_each_pmc_rcu(in_dev, pmc) {
507 if (pmc->multiaddr == IGMP_ALL_HOSTS) 531 if (pmc->multiaddr == IGMP_ALL_HOSTS)
508 continue; 532 continue;
509 spin_lock_bh(&pmc->lock); 533 spin_lock_bh(&pmc->lock);
@@ -514,7 +538,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
514 skb = add_grec(skb, pmc, type, 0, 0); 538 skb = add_grec(skb, pmc, type, 0, 0);
515 spin_unlock_bh(&pmc->lock); 539 spin_unlock_bh(&pmc->lock);
516 } 540 }
517 read_unlock(&in_dev->mc_list_lock); 541 rcu_read_unlock();
518 } else { 542 } else {
519 spin_lock_bh(&pmc->lock); 543 spin_lock_bh(&pmc->lock);
520 if (pmc->sfcount[MCAST_EXCLUDE]) 544 if (pmc->sfcount[MCAST_EXCLUDE])
@@ -556,7 +580,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
556 struct sk_buff *skb = NULL; 580 struct sk_buff *skb = NULL;
557 int type, dtype; 581 int type, dtype;
558 582
559 read_lock(&in_dev->mc_list_lock); 583 rcu_read_lock();
560 spin_lock_bh(&in_dev->mc_tomb_lock); 584 spin_lock_bh(&in_dev->mc_tomb_lock);
561 585
562 /* deleted MCA's */ 586 /* deleted MCA's */
@@ -593,7 +617,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
593 spin_unlock_bh(&in_dev->mc_tomb_lock); 617 spin_unlock_bh(&in_dev->mc_tomb_lock);
594 618
595 /* change recs */ 619 /* change recs */
596 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 620 for_each_pmc_rcu(in_dev, pmc) {
597 spin_lock_bh(&pmc->lock); 621 spin_lock_bh(&pmc->lock);
598 if (pmc->sfcount[MCAST_EXCLUDE]) { 622 if (pmc->sfcount[MCAST_EXCLUDE]) {
599 type = IGMPV3_BLOCK_OLD_SOURCES; 623 type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -616,7 +640,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
616 } 640 }
617 spin_unlock_bh(&pmc->lock); 641 spin_unlock_bh(&pmc->lock);
618 } 642 }
619 read_unlock(&in_dev->mc_list_lock); 643 rcu_read_unlock();
620 644
621 if (!skb) 645 if (!skb)
622 return; 646 return;
@@ -644,7 +668,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
644 668
645 { 669 {
646 struct flowi fl = { .oif = dev->ifindex, 670 struct flowi fl = { .oif = dev->ifindex,
647 .nl_u = { .ip4_u = { .daddr = dst } }, 671 .fl4_dst = dst,
648 .proto = IPPROTO_IGMP }; 672 .proto = IPPROTO_IGMP };
649 if (ip_route_output_key(net, &rt, &fl)) 673 if (ip_route_output_key(net, &rt, &fl))
650 return -1; 674 return -1;
@@ -813,14 +837,14 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
813 if (group == IGMP_ALL_HOSTS) 837 if (group == IGMP_ALL_HOSTS)
814 return; 838 return;
815 839
816 read_lock(&in_dev->mc_list_lock); 840 rcu_read_lock();
817 for (im=in_dev->mc_list; im!=NULL; im=im->next) { 841 for_each_pmc_rcu(in_dev, im) {
818 if (im->multiaddr == group) { 842 if (im->multiaddr == group) {
819 igmp_stop_timer(im); 843 igmp_stop_timer(im);
820 break; 844 break;
821 } 845 }
822 } 846 }
823 read_unlock(&in_dev->mc_list_lock); 847 rcu_read_unlock();
824} 848}
825 849
826static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, 850static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
@@ -906,8 +930,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
906 * - Use the igmp->igmp_code field as the maximum 930 * - Use the igmp->igmp_code field as the maximum
907 * delay possible 931 * delay possible
908 */ 932 */
909 read_lock(&in_dev->mc_list_lock); 933 rcu_read_lock();
910 for (im=in_dev->mc_list; im!=NULL; im=im->next) { 934 for_each_pmc_rcu(in_dev, im) {
911 int changed; 935 int changed;
912 936
913 if (group && group != im->multiaddr) 937 if (group && group != im->multiaddr)
@@ -925,7 +949,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
925 if (changed) 949 if (changed)
926 igmp_mod_timer(im, max_delay); 950 igmp_mod_timer(im, max_delay);
927 } 951 }
928 read_unlock(&in_dev->mc_list_lock); 952 rcu_read_unlock();
929} 953}
930 954
931/* called in rcu_read_lock() section */ 955/* called in rcu_read_lock() section */
@@ -1110,8 +1134,8 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
1110 kfree(pmc); 1134 kfree(pmc);
1111 } 1135 }
1112 /* clear dead sources, too */ 1136 /* clear dead sources, too */
1113 read_lock(&in_dev->mc_list_lock); 1137 rcu_read_lock();
1114 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1138 for_each_pmc_rcu(in_dev, pmc) {
1115 struct ip_sf_list *psf, *psf_next; 1139 struct ip_sf_list *psf, *psf_next;
1116 1140
1117 spin_lock_bh(&pmc->lock); 1141 spin_lock_bh(&pmc->lock);
@@ -1123,7 +1147,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
1123 kfree(psf); 1147 kfree(psf);
1124 } 1148 }
1125 } 1149 }
1126 read_unlock(&in_dev->mc_list_lock); 1150 rcu_read_unlock();
1127} 1151}
1128#endif 1152#endif
1129 1153
@@ -1209,7 +1233,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1209 1233
1210 ASSERT_RTNL(); 1234 ASSERT_RTNL();
1211 1235
1212 for (im=in_dev->mc_list; im; im=im->next) { 1236 for_each_pmc_rtnl(in_dev, im) {
1213 if (im->multiaddr == addr) { 1237 if (im->multiaddr == addr) {
1214 im->users++; 1238 im->users++;
1215 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); 1239 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
@@ -1217,7 +1241,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1217 } 1241 }
1218 } 1242 }
1219 1243
1220 im = kmalloc(sizeof(*im), GFP_KERNEL); 1244 im = kzalloc(sizeof(*im), GFP_KERNEL);
1221 if (!im) 1245 if (!im)
1222 goto out; 1246 goto out;
1223 1247
@@ -1227,26 +1251,18 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1227 im->multiaddr = addr; 1251 im->multiaddr = addr;
1228 /* initial mode is (EX, empty) */ 1252 /* initial mode is (EX, empty) */
1229 im->sfmode = MCAST_EXCLUDE; 1253 im->sfmode = MCAST_EXCLUDE;
1230 im->sfcount[MCAST_INCLUDE] = 0;
1231 im->sfcount[MCAST_EXCLUDE] = 1; 1254 im->sfcount[MCAST_EXCLUDE] = 1;
1232 im->sources = NULL;
1233 im->tomb = NULL;
1234 im->crcount = 0;
1235 atomic_set(&im->refcnt, 1); 1255 atomic_set(&im->refcnt, 1);
1236 spin_lock_init(&im->lock); 1256 spin_lock_init(&im->lock);
1237#ifdef CONFIG_IP_MULTICAST 1257#ifdef CONFIG_IP_MULTICAST
1238 im->tm_running = 0;
1239 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); 1258 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
1240 im->unsolicit_count = IGMP_Unsolicited_Report_Count; 1259 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1241 im->reporter = 0;
1242 im->gsquery = 0;
1243#endif 1260#endif
1244 im->loaded = 0; 1261
1245 write_lock_bh(&in_dev->mc_list_lock); 1262 im->next_rcu = in_dev->mc_list;
1246 im->next = in_dev->mc_list;
1247 in_dev->mc_list = im;
1248 in_dev->mc_count++; 1263 in_dev->mc_count++;
1249 write_unlock_bh(&in_dev->mc_list_lock); 1264 rcu_assign_pointer(in_dev->mc_list, im);
1265
1250#ifdef CONFIG_IP_MULTICAST 1266#ifdef CONFIG_IP_MULTICAST
1251 igmpv3_del_delrec(in_dev, im->multiaddr); 1267 igmpv3_del_delrec(in_dev, im->multiaddr);
1252#endif 1268#endif
@@ -1260,26 +1276,32 @@ EXPORT_SYMBOL(ip_mc_inc_group);
1260 1276
1261/* 1277/*
1262 * Resend IGMP JOIN report; used for bonding. 1278 * Resend IGMP JOIN report; used for bonding.
1279 * Called with rcu_read_lock()
1263 */ 1280 */
1264void ip_mc_rejoin_group(struct ip_mc_list *im) 1281void ip_mc_rejoin_groups(struct in_device *in_dev)
1265{ 1282{
1266#ifdef CONFIG_IP_MULTICAST 1283#ifdef CONFIG_IP_MULTICAST
1267 struct in_device *in_dev = im->interface; 1284 struct ip_mc_list *im;
1285 int type;
1268 1286
1269 if (im->multiaddr == IGMP_ALL_HOSTS) 1287 for_each_pmc_rcu(in_dev, im) {
1270 return; 1288 if (im->multiaddr == IGMP_ALL_HOSTS)
1289 continue;
1271 1290
1272 /* a failover is happening and switches 1291 /* a failover is happening and switches
1273 * must be notified immediately */ 1292 * must be notified immediately
1274 if (IGMP_V1_SEEN(in_dev)) 1293 */
1275 igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); 1294 if (IGMP_V1_SEEN(in_dev))
1276 else if (IGMP_V2_SEEN(in_dev)) 1295 type = IGMP_HOST_MEMBERSHIP_REPORT;
1277 igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); 1296 else if (IGMP_V2_SEEN(in_dev))
1278 else 1297 type = IGMPV2_HOST_MEMBERSHIP_REPORT;
1279 igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); 1298 else
1299 type = IGMPV3_HOST_MEMBERSHIP_REPORT;
1300 igmp_send_report(in_dev, im, type);
1301 }
1280#endif 1302#endif
1281} 1303}
1282EXPORT_SYMBOL(ip_mc_rejoin_group); 1304EXPORT_SYMBOL(ip_mc_rejoin_groups);
1283 1305
1284/* 1306/*
1285 * A socket has left a multicast group on device dev 1307 * A socket has left a multicast group on device dev
@@ -1287,17 +1309,18 @@ EXPORT_SYMBOL(ip_mc_rejoin_group);
1287 1309
1288void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) 1310void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
1289{ 1311{
1290 struct ip_mc_list *i, **ip; 1312 struct ip_mc_list *i;
1313 struct ip_mc_list __rcu **ip;
1291 1314
1292 ASSERT_RTNL(); 1315 ASSERT_RTNL();
1293 1316
1294 for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { 1317 for (ip = &in_dev->mc_list;
1318 (i = rtnl_dereference(*ip)) != NULL;
1319 ip = &i->next_rcu) {
1295 if (i->multiaddr == addr) { 1320 if (i->multiaddr == addr) {
1296 if (--i->users == 0) { 1321 if (--i->users == 0) {
1297 write_lock_bh(&in_dev->mc_list_lock); 1322 *ip = i->next_rcu;
1298 *ip = i->next;
1299 in_dev->mc_count--; 1323 in_dev->mc_count--;
1300 write_unlock_bh(&in_dev->mc_list_lock);
1301 igmp_group_dropped(i); 1324 igmp_group_dropped(i);
1302 1325
1303 if (!in_dev->dead) 1326 if (!in_dev->dead)
@@ -1316,34 +1339,34 @@ EXPORT_SYMBOL(ip_mc_dec_group);
1316 1339
1317void ip_mc_unmap(struct in_device *in_dev) 1340void ip_mc_unmap(struct in_device *in_dev)
1318{ 1341{
1319 struct ip_mc_list *i; 1342 struct ip_mc_list *pmc;
1320 1343
1321 ASSERT_RTNL(); 1344 ASSERT_RTNL();
1322 1345
1323 for (i = in_dev->mc_list; i; i = i->next) 1346 for_each_pmc_rtnl(in_dev, pmc)
1324 igmp_group_dropped(i); 1347 igmp_group_dropped(pmc);
1325} 1348}
1326 1349
1327void ip_mc_remap(struct in_device *in_dev) 1350void ip_mc_remap(struct in_device *in_dev)
1328{ 1351{
1329 struct ip_mc_list *i; 1352 struct ip_mc_list *pmc;
1330 1353
1331 ASSERT_RTNL(); 1354 ASSERT_RTNL();
1332 1355
1333 for (i = in_dev->mc_list; i; i = i->next) 1356 for_each_pmc_rtnl(in_dev, pmc)
1334 igmp_group_added(i); 1357 igmp_group_added(pmc);
1335} 1358}
1336 1359
1337/* Device going down */ 1360/* Device going down */
1338 1361
1339void ip_mc_down(struct in_device *in_dev) 1362void ip_mc_down(struct in_device *in_dev)
1340{ 1363{
1341 struct ip_mc_list *i; 1364 struct ip_mc_list *pmc;
1342 1365
1343 ASSERT_RTNL(); 1366 ASSERT_RTNL();
1344 1367
1345 for (i=in_dev->mc_list; i; i=i->next) 1368 for_each_pmc_rtnl(in_dev, pmc)
1346 igmp_group_dropped(i); 1369 igmp_group_dropped(pmc);
1347 1370
1348#ifdef CONFIG_IP_MULTICAST 1371#ifdef CONFIG_IP_MULTICAST
1349 in_dev->mr_ifc_count = 0; 1372 in_dev->mr_ifc_count = 0;
@@ -1374,7 +1397,6 @@ void ip_mc_init_dev(struct in_device *in_dev)
1374 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; 1397 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
1375#endif 1398#endif
1376 1399
1377 rwlock_init(&in_dev->mc_list_lock);
1378 spin_lock_init(&in_dev->mc_tomb_lock); 1400 spin_lock_init(&in_dev->mc_tomb_lock);
1379} 1401}
1380 1402
@@ -1382,14 +1404,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
1382 1404
1383void ip_mc_up(struct in_device *in_dev) 1405void ip_mc_up(struct in_device *in_dev)
1384{ 1406{
1385 struct ip_mc_list *i; 1407 struct ip_mc_list *pmc;
1386 1408
1387 ASSERT_RTNL(); 1409 ASSERT_RTNL();
1388 1410
1389 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); 1411 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
1390 1412
1391 for (i=in_dev->mc_list; i; i=i->next) 1413 for_each_pmc_rtnl(in_dev, pmc)
1392 igmp_group_added(i); 1414 igmp_group_added(pmc);
1393} 1415}
1394 1416
1395/* 1417/*
@@ -1405,24 +1427,19 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1405 /* Deactivate timers */ 1427 /* Deactivate timers */
1406 ip_mc_down(in_dev); 1428 ip_mc_down(in_dev);
1407 1429
1408 write_lock_bh(&in_dev->mc_list_lock); 1430 while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
1409 while ((i = in_dev->mc_list) != NULL) { 1431 in_dev->mc_list = i->next_rcu;
1410 in_dev->mc_list = i->next;
1411 in_dev->mc_count--; 1432 in_dev->mc_count--;
1412 write_unlock_bh(&in_dev->mc_list_lock); 1433
1413 igmp_group_dropped(i); 1434 igmp_group_dropped(i);
1414 ip_ma_put(i); 1435 ip_ma_put(i);
1415
1416 write_lock_bh(&in_dev->mc_list_lock);
1417 } 1436 }
1418 write_unlock_bh(&in_dev->mc_list_lock);
1419} 1437}
1420 1438
1421/* RTNL is locked */ 1439/* RTNL is locked */
1422static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1440static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1423{ 1441{
1424 struct flowi fl = { .nl_u = { .ip4_u = 1442 struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr };
1425 { .daddr = imr->imr_multiaddr.s_addr } } };
1426 struct rtable *rt; 1443 struct rtable *rt;
1427 struct net_device *dev = NULL; 1444 struct net_device *dev = NULL;
1428 struct in_device *idev = NULL; 1445 struct in_device *idev = NULL;
@@ -1513,18 +1530,18 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1513 1530
1514 if (!in_dev) 1531 if (!in_dev)
1515 return -ENODEV; 1532 return -ENODEV;
1516 read_lock(&in_dev->mc_list_lock); 1533 rcu_read_lock();
1517 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1534 for_each_pmc_rcu(in_dev, pmc) {
1518 if (*pmca == pmc->multiaddr) 1535 if (*pmca == pmc->multiaddr)
1519 break; 1536 break;
1520 } 1537 }
1521 if (!pmc) { 1538 if (!pmc) {
1522 /* MCA not found?? bug */ 1539 /* MCA not found?? bug */
1523 read_unlock(&in_dev->mc_list_lock); 1540 rcu_read_unlock();
1524 return -ESRCH; 1541 return -ESRCH;
1525 } 1542 }
1526 spin_lock_bh(&pmc->lock); 1543 spin_lock_bh(&pmc->lock);
1527 read_unlock(&in_dev->mc_list_lock); 1544 rcu_read_unlock();
1528#ifdef CONFIG_IP_MULTICAST 1545#ifdef CONFIG_IP_MULTICAST
1529 sf_markstate(pmc); 1546 sf_markstate(pmc);
1530#endif 1547#endif
@@ -1685,18 +1702,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1685 1702
1686 if (!in_dev) 1703 if (!in_dev)
1687 return -ENODEV; 1704 return -ENODEV;
1688 read_lock(&in_dev->mc_list_lock); 1705 rcu_read_lock();
1689 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1706 for_each_pmc_rcu(in_dev, pmc) {
1690 if (*pmca == pmc->multiaddr) 1707 if (*pmca == pmc->multiaddr)
1691 break; 1708 break;
1692 } 1709 }
1693 if (!pmc) { 1710 if (!pmc) {
1694 /* MCA not found?? bug */ 1711 /* MCA not found?? bug */
1695 read_unlock(&in_dev->mc_list_lock); 1712 rcu_read_unlock();
1696 return -ESRCH; 1713 return -ESRCH;
1697 } 1714 }
1698 spin_lock_bh(&pmc->lock); 1715 spin_lock_bh(&pmc->lock);
1699 read_unlock(&in_dev->mc_list_lock); 1716 rcu_read_unlock();
1700 1717
1701#ifdef CONFIG_IP_MULTICAST 1718#ifdef CONFIG_IP_MULTICAST
1702 sf_markstate(pmc); 1719 sf_markstate(pmc);
@@ -1793,7 +1810,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1793 1810
1794 err = -EADDRINUSE; 1811 err = -EADDRINUSE;
1795 ifindex = imr->imr_ifindex; 1812 ifindex = imr->imr_ifindex;
1796 for (i = inet->mc_list; i; i = i->next) { 1813 for_each_pmc_rtnl(inet, i) {
1797 if (i->multi.imr_multiaddr.s_addr == addr && 1814 if (i->multi.imr_multiaddr.s_addr == addr &&
1798 i->multi.imr_ifindex == ifindex) 1815 i->multi.imr_ifindex == ifindex)
1799 goto done; 1816 goto done;
@@ -1807,7 +1824,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1807 goto done; 1824 goto done;
1808 1825
1809 memcpy(&iml->multi, imr, sizeof(*imr)); 1826 memcpy(&iml->multi, imr, sizeof(*imr));
1810 iml->next = inet->mc_list; 1827 iml->next_rcu = inet->mc_list;
1811 iml->sflist = NULL; 1828 iml->sflist = NULL;
1812 iml->sfmode = MCAST_EXCLUDE; 1829 iml->sfmode = MCAST_EXCLUDE;
1813 rcu_assign_pointer(inet->mc_list, iml); 1830 rcu_assign_pointer(inet->mc_list, iml);
@@ -1821,17 +1838,14 @@ EXPORT_SYMBOL(ip_mc_join_group);
1821 1838
1822static void ip_sf_socklist_reclaim(struct rcu_head *rp) 1839static void ip_sf_socklist_reclaim(struct rcu_head *rp)
1823{ 1840{
1824 struct ip_sf_socklist *psf; 1841 kfree(container_of(rp, struct ip_sf_socklist, rcu));
1825
1826 psf = container_of(rp, struct ip_sf_socklist, rcu);
1827 /* sk_omem_alloc should have been decreased by the caller*/ 1842 /* sk_omem_alloc should have been decreased by the caller*/
1828 kfree(psf);
1829} 1843}
1830 1844
1831static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, 1845static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1832 struct in_device *in_dev) 1846 struct in_device *in_dev)
1833{ 1847{
1834 struct ip_sf_socklist *psf = iml->sflist; 1848 struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
1835 int err; 1849 int err;
1836 1850
1837 if (psf == NULL) { 1851 if (psf == NULL) {
@@ -1851,11 +1865,8 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1851 1865
1852static void ip_mc_socklist_reclaim(struct rcu_head *rp) 1866static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1853{ 1867{
1854 struct ip_mc_socklist *iml; 1868 kfree(container_of(rp, struct ip_mc_socklist, rcu));
1855
1856 iml = container_of(rp, struct ip_mc_socklist, rcu);
1857 /* sk_omem_alloc should have been decreased by the caller*/ 1869 /* sk_omem_alloc should have been decreased by the caller*/
1858 kfree(iml);
1859} 1870}
1860 1871
1861 1872
@@ -1866,7 +1877,8 @@ static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1866int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) 1877int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1867{ 1878{
1868 struct inet_sock *inet = inet_sk(sk); 1879 struct inet_sock *inet = inet_sk(sk);
1869 struct ip_mc_socklist *iml, **imlp; 1880 struct ip_mc_socklist *iml;
1881 struct ip_mc_socklist __rcu **imlp;
1870 struct in_device *in_dev; 1882 struct in_device *in_dev;
1871 struct net *net = sock_net(sk); 1883 struct net *net = sock_net(sk);
1872 __be32 group = imr->imr_multiaddr.s_addr; 1884 __be32 group = imr->imr_multiaddr.s_addr;
@@ -1876,7 +1888,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1876 rtnl_lock(); 1888 rtnl_lock();
1877 in_dev = ip_mc_find_dev(net, imr); 1889 in_dev = ip_mc_find_dev(net, imr);
1878 ifindex = imr->imr_ifindex; 1890 ifindex = imr->imr_ifindex;
1879 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1891 for (imlp = &inet->mc_list;
1892 (iml = rtnl_dereference(*imlp)) != NULL;
1893 imlp = &iml->next_rcu) {
1880 if (iml->multi.imr_multiaddr.s_addr != group) 1894 if (iml->multi.imr_multiaddr.s_addr != group)
1881 continue; 1895 continue;
1882 if (ifindex) { 1896 if (ifindex) {
@@ -1888,7 +1902,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1888 1902
1889 (void) ip_mc_leave_src(sk, iml, in_dev); 1903 (void) ip_mc_leave_src(sk, iml, in_dev);
1890 1904
1891 rcu_assign_pointer(*imlp, iml->next); 1905 *imlp = iml->next_rcu;
1892 1906
1893 if (in_dev) 1907 if (in_dev)
1894 ip_mc_dec_group(in_dev, group); 1908 ip_mc_dec_group(in_dev, group);
@@ -1934,7 +1948,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1934 } 1948 }
1935 err = -EADDRNOTAVAIL; 1949 err = -EADDRNOTAVAIL;
1936 1950
1937 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1951 for_each_pmc_rtnl(inet, pmc) {
1938 if ((pmc->multi.imr_multiaddr.s_addr == 1952 if ((pmc->multi.imr_multiaddr.s_addr ==
1939 imr.imr_multiaddr.s_addr) && 1953 imr.imr_multiaddr.s_addr) &&
1940 (pmc->multi.imr_ifindex == imr.imr_ifindex)) 1954 (pmc->multi.imr_ifindex == imr.imr_ifindex))
@@ -1958,7 +1972,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1958 pmc->sfmode = omode; 1972 pmc->sfmode = omode;
1959 } 1973 }
1960 1974
1961 psl = pmc->sflist; 1975 psl = rtnl_dereference(pmc->sflist);
1962 if (!add) { 1976 if (!add) {
1963 if (!psl) 1977 if (!psl)
1964 goto done; /* err = -EADDRNOTAVAIL */ 1978 goto done; /* err = -EADDRNOTAVAIL */
@@ -2077,7 +2091,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2077 goto done; 2091 goto done;
2078 } 2092 }
2079 2093
2080 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2094 for_each_pmc_rtnl(inet, pmc) {
2081 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 2095 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
2082 pmc->multi.imr_ifindex == imr.imr_ifindex) 2096 pmc->multi.imr_ifindex == imr.imr_ifindex)
2083 break; 2097 break;
@@ -2107,7 +2121,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2107 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, 2121 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
2108 msf->imsf_fmode, 0, NULL, 0); 2122 msf->imsf_fmode, 0, NULL, 0);
2109 } 2123 }
2110 psl = pmc->sflist; 2124 psl = rtnl_dereference(pmc->sflist);
2111 if (psl) { 2125 if (psl) {
2112 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2126 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2113 psl->sl_count, psl->sl_addr, 0); 2127 psl->sl_count, psl->sl_addr, 0);
@@ -2155,7 +2169,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
2155 } 2169 }
2156 err = -EADDRNOTAVAIL; 2170 err = -EADDRNOTAVAIL;
2157 2171
2158 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2172 for_each_pmc_rtnl(inet, pmc) {
2159 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 2173 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
2160 pmc->multi.imr_ifindex == imr.imr_ifindex) 2174 pmc->multi.imr_ifindex == imr.imr_ifindex)
2161 break; 2175 break;
@@ -2163,7 +2177,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
2163 if (!pmc) /* must have a prior join */ 2177 if (!pmc) /* must have a prior join */
2164 goto done; 2178 goto done;
2165 msf->imsf_fmode = pmc->sfmode; 2179 msf->imsf_fmode = pmc->sfmode;
2166 psl = pmc->sflist; 2180 psl = rtnl_dereference(pmc->sflist);
2167 rtnl_unlock(); 2181 rtnl_unlock();
2168 if (!psl) { 2182 if (!psl) {
2169 len = 0; 2183 len = 0;
@@ -2208,7 +2222,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
2208 2222
2209 err = -EADDRNOTAVAIL; 2223 err = -EADDRNOTAVAIL;
2210 2224
2211 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2225 for_each_pmc_rtnl(inet, pmc) {
2212 if (pmc->multi.imr_multiaddr.s_addr == addr && 2226 if (pmc->multi.imr_multiaddr.s_addr == addr &&
2213 pmc->multi.imr_ifindex == gsf->gf_interface) 2227 pmc->multi.imr_ifindex == gsf->gf_interface)
2214 break; 2228 break;
@@ -2216,7 +2230,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
2216 if (!pmc) /* must have a prior join */ 2230 if (!pmc) /* must have a prior join */
2217 goto done; 2231 goto done;
2218 gsf->gf_fmode = pmc->sfmode; 2232 gsf->gf_fmode = pmc->sfmode;
2219 psl = pmc->sflist; 2233 psl = rtnl_dereference(pmc->sflist);
2220 rtnl_unlock(); 2234 rtnl_unlock();
2221 count = psl ? psl->sl_count : 0; 2235 count = psl ? psl->sl_count : 0;
2222 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; 2236 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
@@ -2257,7 +2271,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2257 goto out; 2271 goto out;
2258 2272
2259 rcu_read_lock(); 2273 rcu_read_lock();
2260 for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { 2274 for_each_pmc_rcu(inet, pmc) {
2261 if (pmc->multi.imr_multiaddr.s_addr == loc_addr && 2275 if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
2262 pmc->multi.imr_ifindex == dif) 2276 pmc->multi.imr_ifindex == dif)
2263 break; 2277 break;
@@ -2265,7 +2279,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2265 ret = inet->mc_all; 2279 ret = inet->mc_all;
2266 if (!pmc) 2280 if (!pmc)
2267 goto unlock; 2281 goto unlock;
2268 psl = pmc->sflist; 2282 psl = rcu_dereference(pmc->sflist);
2269 ret = (pmc->sfmode == MCAST_EXCLUDE); 2283 ret = (pmc->sfmode == MCAST_EXCLUDE);
2270 if (!psl) 2284 if (!psl)
2271 goto unlock; 2285 goto unlock;
@@ -2300,16 +2314,14 @@ void ip_mc_drop_socket(struct sock *sk)
2300 return; 2314 return;
2301 2315
2302 rtnl_lock(); 2316 rtnl_lock();
2303 while ((iml = inet->mc_list) != NULL) { 2317 while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
2304 struct in_device *in_dev; 2318 struct in_device *in_dev;
2305 rcu_assign_pointer(inet->mc_list, iml->next);
2306 2319
2320 inet->mc_list = iml->next_rcu;
2307 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); 2321 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
2308 (void) ip_mc_leave_src(sk, iml, in_dev); 2322 (void) ip_mc_leave_src(sk, iml, in_dev);
2309 if (in_dev != NULL) { 2323 if (in_dev != NULL)
2310 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); 2324 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2311 in_dev_put(in_dev);
2312 }
2313 /* decrease mem now to avoid the memleak warning */ 2325 /* decrease mem now to avoid the memleak warning */
2314 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); 2326 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
2315 call_rcu(&iml->rcu, ip_mc_socklist_reclaim); 2327 call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
@@ -2323,8 +2335,8 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2323 struct ip_sf_list *psf; 2335 struct ip_sf_list *psf;
2324 int rv = 0; 2336 int rv = 0;
2325 2337
2326 read_lock(&in_dev->mc_list_lock); 2338 rcu_read_lock();
2327 for (im=in_dev->mc_list; im; im=im->next) { 2339 for_each_pmc_rcu(in_dev, im) {
2328 if (im->multiaddr == mc_addr) 2340 if (im->multiaddr == mc_addr)
2329 break; 2341 break;
2330 } 2342 }
@@ -2345,7 +2357,7 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2345 } else 2357 } else
2346 rv = 1; /* unspecified source; tentatively allow */ 2358 rv = 1; /* unspecified source; tentatively allow */
2347 } 2359 }
2348 read_unlock(&in_dev->mc_list_lock); 2360 rcu_read_unlock();
2349 return rv; 2361 return rv;
2350} 2362}
2351 2363
@@ -2371,13 +2383,11 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2371 in_dev = __in_dev_get_rcu(state->dev); 2383 in_dev = __in_dev_get_rcu(state->dev);
2372 if (!in_dev) 2384 if (!in_dev)
2373 continue; 2385 continue;
2374 read_lock(&in_dev->mc_list_lock); 2386 im = rcu_dereference(in_dev->mc_list);
2375 im = in_dev->mc_list;
2376 if (im) { 2387 if (im) {
2377 state->in_dev = in_dev; 2388 state->in_dev = in_dev;
2378 break; 2389 break;
2379 } 2390 }
2380 read_unlock(&in_dev->mc_list_lock);
2381 } 2391 }
2382 return im; 2392 return im;
2383} 2393}
@@ -2385,11 +2395,9 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2385static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) 2395static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
2386{ 2396{
2387 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2397 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2388 im = im->next;
2389 while (!im) {
2390 if (likely(state->in_dev != NULL))
2391 read_unlock(&state->in_dev->mc_list_lock);
2392 2398
2399 im = rcu_dereference(im->next_rcu);
2400 while (!im) {
2393 state->dev = next_net_device_rcu(state->dev); 2401 state->dev = next_net_device_rcu(state->dev);
2394 if (!state->dev) { 2402 if (!state->dev) {
2395 state->in_dev = NULL; 2403 state->in_dev = NULL;
@@ -2398,8 +2406,7 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li
2398 state->in_dev = __in_dev_get_rcu(state->dev); 2406 state->in_dev = __in_dev_get_rcu(state->dev);
2399 if (!state->in_dev) 2407 if (!state->in_dev)
2400 continue; 2408 continue;
2401 read_lock(&state->in_dev->mc_list_lock); 2409 im = rcu_dereference(state->in_dev->mc_list);
2402 im = state->in_dev->mc_list;
2403 } 2410 }
2404 return im; 2411 return im;
2405} 2412}
@@ -2435,10 +2442,8 @@ static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
2435 __releases(rcu) 2442 __releases(rcu)
2436{ 2443{
2437 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2444 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2438 if (likely(state->in_dev != NULL)) { 2445
2439 read_unlock(&state->in_dev->mc_list_lock); 2446 state->in_dev = NULL;
2440 state->in_dev = NULL;
2441 }
2442 state->dev = NULL; 2447 state->dev = NULL;
2443 rcu_read_unlock(); 2448 rcu_read_unlock();
2444} 2449}
@@ -2460,7 +2465,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2460 querier = "NONE"; 2465 querier = "NONE";
2461#endif 2466#endif
2462 2467
2463 if (state->in_dev->mc_list == im) { 2468 if (rcu_dereference(state->in_dev->mc_list) == im) {
2464 seq_printf(seq, "%d\t%-10s: %5d %7s\n", 2469 seq_printf(seq, "%d\t%-10s: %5d %7s\n",
2465 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); 2470 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
2466 } 2471 }
@@ -2519,8 +2524,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2519 idev = __in_dev_get_rcu(state->dev); 2524 idev = __in_dev_get_rcu(state->dev);
2520 if (unlikely(idev == NULL)) 2525 if (unlikely(idev == NULL))
2521 continue; 2526 continue;
2522 read_lock(&idev->mc_list_lock); 2527 im = rcu_dereference(idev->mc_list);
2523 im = idev->mc_list;
2524 if (likely(im != NULL)) { 2528 if (likely(im != NULL)) {
2525 spin_lock_bh(&im->lock); 2529 spin_lock_bh(&im->lock);
2526 psf = im->sources; 2530 psf = im->sources;
@@ -2531,7 +2535,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2531 } 2535 }
2532 spin_unlock_bh(&im->lock); 2536 spin_unlock_bh(&im->lock);
2533 } 2537 }
2534 read_unlock(&idev->mc_list_lock);
2535 } 2538 }
2536 return psf; 2539 return psf;
2537} 2540}
@@ -2545,9 +2548,6 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
2545 spin_unlock_bh(&state->im->lock); 2548 spin_unlock_bh(&state->im->lock);
2546 state->im = state->im->next; 2549 state->im = state->im->next;
2547 while (!state->im) { 2550 while (!state->im) {
2548 if (likely(state->idev != NULL))
2549 read_unlock(&state->idev->mc_list_lock);
2550
2551 state->dev = next_net_device_rcu(state->dev); 2551 state->dev = next_net_device_rcu(state->dev);
2552 if (!state->dev) { 2552 if (!state->dev) {
2553 state->idev = NULL; 2553 state->idev = NULL;
@@ -2556,8 +2556,7 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
2556 state->idev = __in_dev_get_rcu(state->dev); 2556 state->idev = __in_dev_get_rcu(state->dev);
2557 if (!state->idev) 2557 if (!state->idev)
2558 continue; 2558 continue;
2559 read_lock(&state->idev->mc_list_lock); 2559 state->im = rcu_dereference(state->idev->mc_list);
2560 state->im = state->idev->mc_list;
2561 } 2560 }
2562 if (!state->im) 2561 if (!state->im)
2563 break; 2562 break;
@@ -2603,10 +2602,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2603 spin_unlock_bh(&state->im->lock); 2602 spin_unlock_bh(&state->im->lock);
2604 state->im = NULL; 2603 state->im = NULL;
2605 } 2604 }
2606 if (likely(state->idev != NULL)) { 2605 state->idev = NULL;
2607 read_unlock(&state->idev->mc_list_lock);
2608 state->idev = NULL;
2609 }
2610 state->dev = NULL; 2606 state->dev = NULL;
2611 rcu_read_unlock(); 2607 rcu_read_unlock();
2612} 2608}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7174370b1195..25e318153f14 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -55,7 +55,6 @@ EXPORT_SYMBOL(inet_get_local_port_range);
55int inet_csk_bind_conflict(const struct sock *sk, 55int inet_csk_bind_conflict(const struct sock *sk,
56 const struct inet_bind_bucket *tb) 56 const struct inet_bind_bucket *tb)
57{ 57{
58 const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
59 struct sock *sk2; 58 struct sock *sk2;
60 struct hlist_node *node; 59 struct hlist_node *node;
61 int reuse = sk->sk_reuse; 60 int reuse = sk->sk_reuse;
@@ -75,9 +74,9 @@ int inet_csk_bind_conflict(const struct sock *sk,
75 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 74 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
76 if (!reuse || !sk2->sk_reuse || 75 if (!reuse || !sk2->sk_reuse ||
77 sk2->sk_state == TCP_LISTEN) { 76 sk2->sk_state == TCP_LISTEN) {
78 const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 77 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
79 if (!sk2_rcv_saddr || !sk_rcv_saddr || 78 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
80 sk2_rcv_saddr == sk_rcv_saddr) 79 sk2_rcv_saddr == sk_rcv_saddr(sk))
81 break; 80 break;
82 } 81 }
83 } 82 }
@@ -358,17 +357,14 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
358 struct ip_options *opt = inet_rsk(req)->opt; 357 struct ip_options *opt = inet_rsk(req)->opt;
359 struct flowi fl = { .oif = sk->sk_bound_dev_if, 358 struct flowi fl = { .oif = sk->sk_bound_dev_if,
360 .mark = sk->sk_mark, 359 .mark = sk->sk_mark,
361 .nl_u = { .ip4_u = 360 .fl4_dst = ((opt && opt->srr) ?
362 { .daddr = ((opt && opt->srr) ? 361 opt->faddr : ireq->rmt_addr),
363 opt->faddr : 362 .fl4_src = ireq->loc_addr,
364 ireq->rmt_addr), 363 .fl4_tos = RT_CONN_FLAGS(sk),
365 .saddr = ireq->loc_addr,
366 .tos = RT_CONN_FLAGS(sk) } },
367 .proto = sk->sk_protocol, 364 .proto = sk->sk_protocol,
368 .flags = inet_sk_flowi_flags(sk), 365 .flags = inet_sk_flowi_flags(sk),
369 .uli_u = { .ports = 366 .fl_ip_sport = inet_sk(sk)->inet_sport,
370 { .sport = inet_sk(sk)->inet_sport, 367 .fl_ip_dport = ireq->rmt_port };
371 .dport = ireq->rmt_port } } };
372 struct net *net = sock_net(sk); 368 struct net *net = sock_net(sk);
373 369
374 security_req_classify_flow(req, &fl); 370 security_req_classify_flow(req, &fl);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 1b344f30b463..3c0369a3a663 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -133,8 +133,7 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
133 } 133 }
134 } 134 }
135 } 135 }
136 sk_add_bind_node(child, &tb->owners); 136 inet_bind_hash(child, tb, port);
137 inet_csk(child)->icsk_bind_hash = tb;
138 spin_unlock(&head->lock); 137 spin_unlock(&head->lock);
139 138
140 return 0; 139 return 0;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9e94d7cf4f8a..d9bc85751c74 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -63,7 +63,7 @@
63 * refcnt: atomically against modifications on other CPU; 63 * refcnt: atomically against modifications on other CPU;
64 * usually under some other lock to prevent node disappearing 64 * usually under some other lock to prevent node disappearing
65 * dtime: unused node list lock 65 * dtime: unused node list lock
66 * v4daddr: unchangeable 66 * daddr: unchangeable
67 * ip_id_count: atomic value (no lock needed) 67 * ip_id_count: atomic value (no lock needed)
68 */ 68 */
69 69
@@ -79,15 +79,24 @@ static const struct inet_peer peer_fake_node = {
79 .avl_height = 0 79 .avl_height = 0
80}; 80};
81 81
82static struct { 82struct inet_peer_base {
83 struct inet_peer __rcu *root; 83 struct inet_peer __rcu *root;
84 spinlock_t lock; 84 spinlock_t lock;
85 int total; 85 int total;
86} peers = { 86};
87
88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu,
90 .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock),
91 .total = 0,
92};
93
94static struct inet_peer_base v6_peers = {
87 .root = peer_avl_empty_rcu, 95 .root = peer_avl_empty_rcu,
88 .lock = __SPIN_LOCK_UNLOCKED(peers.lock), 96 .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock),
89 .total = 0, 97 .total = 0,
90}; 98};
99
91#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 100#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
92 101
93/* Exported for sysctl_net_ipv4. */ 102/* Exported for sysctl_net_ipv4. */
@@ -152,28 +161,45 @@ static void unlink_from_unused(struct inet_peer *p)
152 } 161 }
153} 162}
154 163
164static int addr_compare(const struct inetpeer_addr *a,
165 const struct inetpeer_addr *b)
166{
167 int i, n = (a->family == AF_INET ? 1 : 4);
168
169 for (i = 0; i < n; i++) {
170 if (a->a6[i] == b->a6[i])
171 continue;
172 if (a->a6[i] < b->a6[i])
173 return -1;
174 return 1;
175 }
176
177 return 0;
178}
179
155/* 180/*
156 * Called with local BH disabled and the pool lock held. 181 * Called with local BH disabled and the pool lock held.
157 */ 182 */
158#define lookup(_daddr, _stack) \ 183#define lookup(_daddr, _stack, _base) \
159({ \ 184({ \
160 struct inet_peer *u; \ 185 struct inet_peer *u; \
161 struct inet_peer __rcu **v; \ 186 struct inet_peer __rcu **v; \
162 \ 187 \
163 stackptr = _stack; \ 188 stackptr = _stack; \
164 *stackptr++ = &peers.root; \ 189 *stackptr++ = &_base->root; \
165 for (u = rcu_dereference_protected(peers.root, \ 190 for (u = rcu_dereference_protected(_base->root, \
166 lockdep_is_held(&peers.lock)); \ 191 lockdep_is_held(&_base->lock)); \
167 u != peer_avl_empty; ) { \ 192 u != peer_avl_empty; ) { \
168 if (_daddr == u->v4daddr) \ 193 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \
169 break; \ 195 break; \
170 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 196 if (cmp == -1) \
171 v = &u->avl_left; \ 197 v = &u->avl_left; \
172 else \ 198 else \
173 v = &u->avl_right; \ 199 v = &u->avl_right; \
174 *stackptr++ = v; \ 200 *stackptr++ = v; \
175 u = rcu_dereference_protected(*v, \ 201 u = rcu_dereference_protected(*v, \
176 lockdep_is_held(&peers.lock)); \ 202 lockdep_is_held(&_base->lock)); \
177 } \ 203 } \
178 u; \ 204 u; \
179}) 205})
@@ -185,13 +211,15 @@ static void unlink_from_unused(struct inet_peer *p)
185 * But every pointer we follow is guaranteed to be valid thanks to RCU. 211 * But every pointer we follow is guaranteed to be valid thanks to RCU.
186 * We exit from this function if number of links exceeds PEER_MAXDEPTH 212 * We exit from this function if number of links exceeds PEER_MAXDEPTH
187 */ 213 */
188static struct inet_peer *lookup_rcu_bh(__be32 daddr) 214static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
215 struct inet_peer_base *base)
189{ 216{
190 struct inet_peer *u = rcu_dereference_bh(peers.root); 217 struct inet_peer *u = rcu_dereference_bh(base->root);
191 int count = 0; 218 int count = 0;
192 219
193 while (u != peer_avl_empty) { 220 while (u != peer_avl_empty) {
194 if (daddr == u->v4daddr) { 221 int cmp = addr_compare(daddr, &u->daddr);
222 if (cmp == 0) {
195 /* Before taking a reference, check if this entry was 223 /* Before taking a reference, check if this entry was
196 * deleted, unlink_from_pool() sets refcnt=-1 to make 224 * deleted, unlink_from_pool() sets refcnt=-1 to make
197 * distinction between an unused entry (refcnt=0) and 225 * distinction between an unused entry (refcnt=0) and
@@ -201,7 +229,7 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
201 u = NULL; 229 u = NULL;
202 return u; 230 return u;
203 } 231 }
204 if ((__force __u32)daddr < (__force __u32)u->v4daddr) 232 if (cmp == -1)
205 u = rcu_dereference_bh(u->avl_left); 233 u = rcu_dereference_bh(u->avl_left);
206 else 234 else
207 u = rcu_dereference_bh(u->avl_right); 235 u = rcu_dereference_bh(u->avl_right);
@@ -212,19 +240,19 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
212} 240}
213 241
214/* Called with local BH disabled and the pool lock held. */ 242/* Called with local BH disabled and the pool lock held. */
215#define lookup_rightempty(start) \ 243#define lookup_rightempty(start, base) \
216({ \ 244({ \
217 struct inet_peer *u; \ 245 struct inet_peer *u; \
218 struct inet_peer __rcu **v; \ 246 struct inet_peer __rcu **v; \
219 *stackptr++ = &start->avl_left; \ 247 *stackptr++ = &start->avl_left; \
220 v = &start->avl_left; \ 248 v = &start->avl_left; \
221 for (u = rcu_dereference_protected(*v, \ 249 for (u = rcu_dereference_protected(*v, \
222 lockdep_is_held(&peers.lock)); \ 250 lockdep_is_held(&base->lock)); \
223 u->avl_right != peer_avl_empty_rcu; ) { \ 251 u->avl_right != peer_avl_empty_rcu; ) { \
224 v = &u->avl_right; \ 252 v = &u->avl_right; \
225 *stackptr++ = v; \ 253 *stackptr++ = v; \
226 u = rcu_dereference_protected(*v, \ 254 u = rcu_dereference_protected(*v, \
227 lockdep_is_held(&peers.lock)); \ 255 lockdep_is_held(&base->lock)); \
228 } \ 256 } \
229 u; \ 257 u; \
230}) 258})
@@ -234,7 +262,8 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
234 * Look into mm/map_avl.c for more detail description of the ideas. 262 * Look into mm/map_avl.c for more detail description of the ideas.
235 */ 263 */
236static void peer_avl_rebalance(struct inet_peer __rcu **stack[], 264static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
237 struct inet_peer __rcu ***stackend) 265 struct inet_peer __rcu ***stackend,
266 struct inet_peer_base *base)
238{ 267{
239 struct inet_peer __rcu **nodep; 268 struct inet_peer __rcu **nodep;
240 struct inet_peer *node, *l, *r; 269 struct inet_peer *node, *l, *r;
@@ -243,20 +272,20 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
243 while (stackend > stack) { 272 while (stackend > stack) {
244 nodep = *--stackend; 273 nodep = *--stackend;
245 node = rcu_dereference_protected(*nodep, 274 node = rcu_dereference_protected(*nodep,
246 lockdep_is_held(&peers.lock)); 275 lockdep_is_held(&base->lock));
247 l = rcu_dereference_protected(node->avl_left, 276 l = rcu_dereference_protected(node->avl_left,
248 lockdep_is_held(&peers.lock)); 277 lockdep_is_held(&base->lock));
249 r = rcu_dereference_protected(node->avl_right, 278 r = rcu_dereference_protected(node->avl_right,
250 lockdep_is_held(&peers.lock)); 279 lockdep_is_held(&base->lock));
251 lh = node_height(l); 280 lh = node_height(l);
252 rh = node_height(r); 281 rh = node_height(r);
253 if (lh > rh + 1) { /* l: RH+2 */ 282 if (lh > rh + 1) { /* l: RH+2 */
254 struct inet_peer *ll, *lr, *lrl, *lrr; 283 struct inet_peer *ll, *lr, *lrl, *lrr;
255 int lrh; 284 int lrh;
256 ll = rcu_dereference_protected(l->avl_left, 285 ll = rcu_dereference_protected(l->avl_left,
257 lockdep_is_held(&peers.lock)); 286 lockdep_is_held(&base->lock));
258 lr = rcu_dereference_protected(l->avl_right, 287 lr = rcu_dereference_protected(l->avl_right,
259 lockdep_is_held(&peers.lock)); 288 lockdep_is_held(&base->lock));
260 lrh = node_height(lr); 289 lrh = node_height(lr);
261 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 290 if (lrh <= node_height(ll)) { /* ll: RH+1 */
262 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ 291 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
@@ -268,9 +297,9 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
268 RCU_INIT_POINTER(*nodep, l); 297 RCU_INIT_POINTER(*nodep, l);
269 } else { /* ll: RH, lr: RH+1 */ 298 } else { /* ll: RH, lr: RH+1 */
270 lrl = rcu_dereference_protected(lr->avl_left, 299 lrl = rcu_dereference_protected(lr->avl_left,
271 lockdep_is_held(&peers.lock)); /* lrl: RH or RH-1 */ 300 lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */
272 lrr = rcu_dereference_protected(lr->avl_right, 301 lrr = rcu_dereference_protected(lr->avl_right,
273 lockdep_is_held(&peers.lock)); /* lrr: RH or RH-1 */ 302 lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
274 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ 303 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
275 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ 304 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
276 node->avl_height = rh + 1; /* node: RH+1 */ 305 node->avl_height = rh + 1; /* node: RH+1 */
@@ -286,9 +315,9 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
286 struct inet_peer *rr, *rl, *rlr, *rll; 315 struct inet_peer *rr, *rl, *rlr, *rll;
287 int rlh; 316 int rlh;
288 rr = rcu_dereference_protected(r->avl_right, 317 rr = rcu_dereference_protected(r->avl_right,
289 lockdep_is_held(&peers.lock)); 318 lockdep_is_held(&base->lock));
290 rl = rcu_dereference_protected(r->avl_left, 319 rl = rcu_dereference_protected(r->avl_left,
291 lockdep_is_held(&peers.lock)); 320 lockdep_is_held(&base->lock));
292 rlh = node_height(rl); 321 rlh = node_height(rl);
293 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 322 if (rlh <= node_height(rr)) { /* rr: LH+1 */
294 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ 323 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
@@ -300,9 +329,9 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
300 RCU_INIT_POINTER(*nodep, r); 329 RCU_INIT_POINTER(*nodep, r);
301 } else { /* rr: RH, rl: RH+1 */ 330 } else { /* rr: RH, rl: RH+1 */
302 rlr = rcu_dereference_protected(rl->avl_right, 331 rlr = rcu_dereference_protected(rl->avl_right,
303 lockdep_is_held(&peers.lock)); /* rlr: LH or LH-1 */ 332 lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */
304 rll = rcu_dereference_protected(rl->avl_left, 333 rll = rcu_dereference_protected(rl->avl_left,
305 lockdep_is_held(&peers.lock)); /* rll: LH or LH-1 */ 334 lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
306 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ 335 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
307 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ 336 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
308 node->avl_height = lh + 1; /* node: LH+1 */ 337 node->avl_height = lh + 1; /* node: LH+1 */
@@ -321,14 +350,14 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
321} 350}
322 351
323/* Called with local BH disabled and the pool lock held. */ 352/* Called with local BH disabled and the pool lock held. */
324#define link_to_pool(n) \ 353#define link_to_pool(n, base) \
325do { \ 354do { \
326 n->avl_height = 1; \ 355 n->avl_height = 1; \
327 n->avl_left = peer_avl_empty_rcu; \ 356 n->avl_left = peer_avl_empty_rcu; \
328 n->avl_right = peer_avl_empty_rcu; \ 357 n->avl_right = peer_avl_empty_rcu; \
329 /* lockless readers can catch us now */ \ 358 /* lockless readers can catch us now */ \
330 rcu_assign_pointer(**--stackptr, n); \ 359 rcu_assign_pointer(**--stackptr, n); \
331 peer_avl_rebalance(stack, stackptr); \ 360 peer_avl_rebalance(stack, stackptr, base); \
332} while (0) 361} while (0)
333 362
334static void inetpeer_free_rcu(struct rcu_head *head) 363static void inetpeer_free_rcu(struct rcu_head *head)
@@ -337,13 +366,13 @@ static void inetpeer_free_rcu(struct rcu_head *head)
337} 366}
338 367
339/* May be called with local BH enabled. */ 368/* May be called with local BH enabled. */
340static void unlink_from_pool(struct inet_peer *p) 369static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
341{ 370{
342 int do_free; 371 int do_free;
343 372
344 do_free = 0; 373 do_free = 0;
345 374
346 spin_lock_bh(&peers.lock); 375 spin_lock_bh(&base->lock);
347 /* Check the reference counter. It was artificially incremented by 1 376 /* Check the reference counter. It was artificially incremented by 1
348 * in cleanup() function to prevent sudden disappearing. If we can 377 * in cleanup() function to prevent sudden disappearing. If we can
349 * atomically (because of lockless readers) take this last reference, 378 * atomically (because of lockless readers) take this last reference,
@@ -353,7 +382,7 @@ static void unlink_from_pool(struct inet_peer *p)
353 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { 382 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
354 struct inet_peer __rcu **stack[PEER_MAXDEPTH]; 383 struct inet_peer __rcu **stack[PEER_MAXDEPTH];
355 struct inet_peer __rcu ***stackptr, ***delp; 384 struct inet_peer __rcu ***stackptr, ***delp;
356 if (lookup(p->v4daddr, stack) != p) 385 if (lookup(&p->daddr, stack, base) != p)
357 BUG(); 386 BUG();
358 delp = stackptr - 1; /* *delp[0] == p */ 387 delp = stackptr - 1; /* *delp[0] == p */
359 if (p->avl_left == peer_avl_empty_rcu) { 388 if (p->avl_left == peer_avl_empty_rcu) {
@@ -362,11 +391,11 @@ static void unlink_from_pool(struct inet_peer *p)
362 } else { 391 } else {
363 /* look for a node to insert instead of p */ 392 /* look for a node to insert instead of p */
364 struct inet_peer *t; 393 struct inet_peer *t;
365 t = lookup_rightempty(p); 394 t = lookup_rightempty(p, base);
366 BUG_ON(rcu_dereference_protected(*stackptr[-1], 395 BUG_ON(rcu_dereference_protected(*stackptr[-1],
367 lockdep_is_held(&peers.lock)) != t); 396 lockdep_is_held(&base->lock)) != t);
368 **--stackptr = t->avl_left; 397 **--stackptr = t->avl_left;
369 /* t is removed, t->v4daddr > x->v4daddr for any 398 /* t is removed, t->daddr > x->daddr for any
370 * x in p->avl_left subtree. 399 * x in p->avl_left subtree.
371 * Put t in the old place of p. */ 400 * Put t in the old place of p. */
372 RCU_INIT_POINTER(*delp[0], t); 401 RCU_INIT_POINTER(*delp[0], t);
@@ -376,11 +405,11 @@ static void unlink_from_pool(struct inet_peer *p)
376 BUG_ON(delp[1] != &p->avl_left); 405 BUG_ON(delp[1] != &p->avl_left);
377 delp[1] = &t->avl_left; /* was &p->avl_left */ 406 delp[1] = &t->avl_left; /* was &p->avl_left */
378 } 407 }
379 peer_avl_rebalance(stack, stackptr); 408 peer_avl_rebalance(stack, stackptr, base);
380 peers.total--; 409 base->total--;
381 do_free = 1; 410 do_free = 1;
382 } 411 }
383 spin_unlock_bh(&peers.lock); 412 spin_unlock_bh(&base->lock);
384 413
385 if (do_free) 414 if (do_free)
386 call_rcu_bh(&p->rcu, inetpeer_free_rcu); 415 call_rcu_bh(&p->rcu, inetpeer_free_rcu);
@@ -395,6 +424,16 @@ static void unlink_from_pool(struct inet_peer *p)
395 inet_putpeer(p); 424 inet_putpeer(p);
396} 425}
397 426
427static struct inet_peer_base *family_to_base(int family)
428{
429 return (family == AF_INET ? &v4_peers : &v6_peers);
430}
431
432static struct inet_peer_base *peer_to_base(struct inet_peer *p)
433{
434 return family_to_base(p->daddr.family);
435}
436
398/* May be called with local BH enabled. */ 437/* May be called with local BH enabled. */
399static int cleanup_once(unsigned long ttl) 438static int cleanup_once(unsigned long ttl)
400{ 439{
@@ -428,21 +467,22 @@ static int cleanup_once(unsigned long ttl)
428 * happen because of entry limits in route cache. */ 467 * happen because of entry limits in route cache. */
429 return -1; 468 return -1;
430 469
431 unlink_from_pool(p); 470 unlink_from_pool(p, peer_to_base(p));
432 return 0; 471 return 0;
433} 472}
434 473
435/* Called with or without local BH being disabled. */ 474/* Called with or without local BH being disabled. */
436struct inet_peer *inet_getpeer(__be32 daddr, int create) 475struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
437{ 476{
438 struct inet_peer *p;
439 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
478 struct inet_peer_base *base = family_to_base(AF_INET);
479 struct inet_peer *p;
440 480
441 /* Look up for the address quickly, lockless. 481 /* Look up for the address quickly, lockless.
442 * Because of a concurrent writer, we might not find an existing entry. 482 * Because of a concurrent writer, we might not find an existing entry.
443 */ 483 */
444 rcu_read_lock_bh(); 484 rcu_read_lock_bh();
445 p = lookup_rcu_bh(daddr); 485 p = lookup_rcu_bh(daddr, base);
446 rcu_read_unlock_bh(); 486 rcu_read_unlock_bh();
447 487
448 if (p) { 488 if (p) {
@@ -456,50 +496,57 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
456 /* retry an exact lookup, taking the lock before. 496 /* retry an exact lookup, taking the lock before.
457 * At least, nodes should be hot in our cache. 497 * At least, nodes should be hot in our cache.
458 */ 498 */
459 spin_lock_bh(&peers.lock); 499 spin_lock_bh(&base->lock);
460 p = lookup(daddr, stack); 500 p = lookup(daddr, stack, base);
461 if (p != peer_avl_empty) { 501 if (p != peer_avl_empty) {
462 atomic_inc(&p->refcnt); 502 atomic_inc(&p->refcnt);
463 spin_unlock_bh(&peers.lock); 503 spin_unlock_bh(&base->lock);
464 /* Remove the entry from unused list if it was there. */ 504 /* Remove the entry from unused list if it was there. */
465 unlink_from_unused(p); 505 unlink_from_unused(p);
466 return p; 506 return p;
467 } 507 }
468 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; 508 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
469 if (p) { 509 if (p) {
470 p->v4daddr = daddr; 510 p->daddr = *daddr;
471 atomic_set(&p->refcnt, 1); 511 atomic_set(&p->refcnt, 1);
472 atomic_set(&p->rid, 0); 512 atomic_set(&p->rid, 0);
473 atomic_set(&p->ip_id_count, secure_ip_id(daddr)); 513 atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
474 p->tcp_ts_stamp = 0; 514 p->tcp_ts_stamp = 0;
475 INIT_LIST_HEAD(&p->unused); 515 INIT_LIST_HEAD(&p->unused);
476 516
477 517
478 /* Link the node. */ 518 /* Link the node. */
479 link_to_pool(p); 519 link_to_pool(p, base);
480 peers.total++; 520 base->total++;
481 } 521 }
482 spin_unlock_bh(&peers.lock); 522 spin_unlock_bh(&base->lock);
483 523
484 if (peers.total >= inet_peer_threshold) 524 if (base->total >= inet_peer_threshold)
485 /* Remove one less-recently-used entry. */ 525 /* Remove one less-recently-used entry. */
486 cleanup_once(0); 526 cleanup_once(0);
487 527
488 return p; 528 return p;
489} 529}
490 530
531static int compute_total(void)
532{
533 return v4_peers.total + v6_peers.total;
534}
535EXPORT_SYMBOL_GPL(inet_getpeer);
536
491/* Called with local BH disabled. */ 537/* Called with local BH disabled. */
492static void peer_check_expire(unsigned long dummy) 538static void peer_check_expire(unsigned long dummy)
493{ 539{
494 unsigned long now = jiffies; 540 unsigned long now = jiffies;
495 int ttl; 541 int ttl, total;
496 542
497 if (peers.total >= inet_peer_threshold) 543 total = compute_total();
544 if (total >= inet_peer_threshold)
498 ttl = inet_peer_minttl; 545 ttl = inet_peer_minttl;
499 else 546 else
500 ttl = inet_peer_maxttl 547 ttl = inet_peer_maxttl
501 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 548 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
502 peers.total / inet_peer_threshold * HZ; 549 total / inet_peer_threshold * HZ;
503 while (!cleanup_once(ttl)) { 550 while (!cleanup_once(ttl)) {
504 if (jiffies != now) 551 if (jiffies != now)
505 break; 552 break;
@@ -508,13 +555,14 @@ static void peer_check_expire(unsigned long dummy)
508 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 555 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
509 * interval depending on the total number of entries (more entries, 556 * interval depending on the total number of entries (more entries,
510 * less interval). */ 557 * less interval). */
511 if (peers.total >= inet_peer_threshold) 558 total = compute_total();
559 if (total >= inet_peer_threshold)
512 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; 560 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
513 else 561 else
514 peer_periodic_timer.expires = jiffies 562 peer_periodic_timer.expires = jiffies
515 + inet_peer_gc_maxtime 563 + inet_peer_gc_maxtime
516 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 564 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
517 peers.total / inet_peer_threshold * HZ; 565 total / inet_peer_threshold * HZ;
518 add_timer(&peer_periodic_timer); 566 add_timer(&peer_periodic_timer);
519} 567}
520 568
@@ -530,3 +578,4 @@ void inet_putpeer(struct inet_peer *p)
530 578
531 local_bh_enable(); 579 local_bh_enable();
532} 580}
581EXPORT_SYMBOL_GPL(inet_putpeer);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 168440834ade..a1151b8adf3c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -45,6 +45,7 @@
45#include <linux/udp.h> 45#include <linux/udp.h>
46#include <linux/inet.h> 46#include <linux/inet.h>
47#include <linux/netfilter_ipv4.h> 47#include <linux/netfilter_ipv4.h>
48#include <net/inet_ecn.h>
48 49
49/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 50/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
50 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c 51 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -70,11 +71,28 @@ struct ipq {
70 __be32 daddr; 71 __be32 daddr;
71 __be16 id; 72 __be16 id;
72 u8 protocol; 73 u8 protocol;
74 u8 ecn; /* RFC3168 support */
73 int iif; 75 int iif;
74 unsigned int rid; 76 unsigned int rid;
75 struct inet_peer *peer; 77 struct inet_peer *peer;
76}; 78};
77 79
80#define IPFRAG_ECN_CLEAR 0x01 /* one frag had INET_ECN_NOT_ECT */
81#define IPFRAG_ECN_SET_CE 0x04 /* one frag had INET_ECN_CE */
82
83static inline u8 ip4_frag_ecn(u8 tos)
84{
85 tos = (tos & INET_ECN_MASK) + 1;
86 /*
87 * After the last operation we have (in binary):
88 * INET_ECN_NOT_ECT => 001
89 * INET_ECN_ECT_1 => 010
90 * INET_ECN_ECT_0 => 011
91 * INET_ECN_CE => 100
92 */
93 return (tos & 2) ? 0 : tos;
94}
95
78static struct inet_frags ip4_frags; 96static struct inet_frags ip4_frags;
79 97
80int ip_frag_nqueues(struct net *net) 98int ip_frag_nqueues(struct net *net)
@@ -137,11 +155,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
137 155
138 qp->protocol = arg->iph->protocol; 156 qp->protocol = arg->iph->protocol;
139 qp->id = arg->iph->id; 157 qp->id = arg->iph->id;
158 qp->ecn = ip4_frag_ecn(arg->iph->tos);
140 qp->saddr = arg->iph->saddr; 159 qp->saddr = arg->iph->saddr;
141 qp->daddr = arg->iph->daddr; 160 qp->daddr = arg->iph->daddr;
142 qp->user = arg->user; 161 qp->user = arg->user;
143 qp->peer = sysctl_ipfrag_max_dist ? 162 qp->peer = sysctl_ipfrag_max_dist ?
144 inet_getpeer(arg->iph->saddr, 1) : NULL; 163 inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
145} 164}
146 165
147static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 166static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -316,6 +335,7 @@ static int ip_frag_reinit(struct ipq *qp)
316 qp->q.fragments = NULL; 335 qp->q.fragments = NULL;
317 qp->q.fragments_tail = NULL; 336 qp->q.fragments_tail = NULL;
318 qp->iif = 0; 337 qp->iif = 0;
338 qp->ecn = 0;
319 339
320 return 0; 340 return 0;
321} 341}
@@ -328,6 +348,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
328 int flags, offset; 348 int flags, offset;
329 int ihl, end; 349 int ihl, end;
330 int err = -ENOENT; 350 int err = -ENOENT;
351 u8 ecn;
331 352
332 if (qp->q.last_in & INET_FRAG_COMPLETE) 353 if (qp->q.last_in & INET_FRAG_COMPLETE)
333 goto err; 354 goto err;
@@ -339,6 +360,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
339 goto err; 360 goto err;
340 } 361 }
341 362
363 ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
342 offset = ntohs(ip_hdr(skb)->frag_off); 364 offset = ntohs(ip_hdr(skb)->frag_off);
343 flags = offset & ~IP_OFFSET; 365 flags = offset & ~IP_OFFSET;
344 offset &= IP_OFFSET; 366 offset &= IP_OFFSET;
@@ -472,6 +494,7 @@ found:
472 } 494 }
473 qp->q.stamp = skb->tstamp; 495 qp->q.stamp = skb->tstamp;
474 qp->q.meat += skb->len; 496 qp->q.meat += skb->len;
497 qp->ecn |= ecn;
475 atomic_add(skb->truesize, &qp->q.net->mem); 498 atomic_add(skb->truesize, &qp->q.net->mem);
476 if (offset == 0) 499 if (offset == 0)
477 qp->q.last_in |= INET_FRAG_FIRST_IN; 500 qp->q.last_in |= INET_FRAG_FIRST_IN;
@@ -583,6 +606,17 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
583 iph = ip_hdr(head); 606 iph = ip_hdr(head);
584 iph->frag_off = 0; 607 iph->frag_off = 0;
585 iph->tot_len = htons(len); 608 iph->tot_len = htons(len);
609 /* RFC3168 5.3 Fragmentation support
610 * If one fragment had INET_ECN_NOT_ECT,
611 * reassembled frame also has INET_ECN_NOT_ECT
612 * Elif one fragment had INET_ECN_CE
613 * reassembled frame also has INET_ECN_CE
614 */
615 if (qp->ecn & IPFRAG_ECN_CLEAR)
616 iph->tos &= ~INET_ECN_MASK;
617 else if (qp->ecn & IPFRAG_ECN_SET_CE)
618 iph->tos |= INET_ECN_CE;
619
586 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 620 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
587 qp->q.fragments = NULL; 621 qp->q.fragments = NULL;
588 qp->q.fragments_tail = NULL; 622 qp->q.fragments_tail = NULL;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index cab2057d5430..eb68a0e34e49 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -405,11 +405,11 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
405 if (parms->name[0]) 405 if (parms->name[0])
406 strlcpy(name, parms->name, IFNAMSIZ); 406 strlcpy(name, parms->name, IFNAMSIZ);
407 else 407 else
408 sprintf(name, "gre%%d"); 408 strcpy(name, "gre%d");
409 409
410 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); 410 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
411 if (!dev) 411 if (!dev)
412 return NULL; 412 return NULL;
413 413
414 dev_net_set(dev, net); 414 dev_net_set(dev, net);
415 415
@@ -772,16 +772,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
772 { 772 {
773 struct flowi fl = { 773 struct flowi fl = {
774 .oif = tunnel->parms.link, 774 .oif = tunnel->parms.link,
775 .nl_u = { 775 .fl4_dst = dst,
776 .ip4_u = { 776 .fl4_src = tiph->saddr,
777 .daddr = dst, 777 .fl4_tos = RT_TOS(tos),
778 .saddr = tiph->saddr, 778 .fl_gre_key = tunnel->parms.o_key
779 .tos = RT_TOS(tos) 779 };
780 }
781 },
782 .proto = IPPROTO_GRE
783 }
784;
785 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 780 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
786 dev->stats.tx_carrier_errors++; 781 dev->stats.tx_carrier_errors++;
787 goto tx_error; 782 goto tx_error;
@@ -823,7 +818,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
823 !ipv4_is_multicast(tunnel->parms.iph.daddr)) || 818 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
824 rt6->rt6i_dst.plen == 128) { 819 rt6->rt6i_dst.plen == 128) {
825 rt6->rt6i_flags |= RTF_MODIFIED; 820 rt6->rt6i_flags |= RTF_MODIFIED;
826 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; 821 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
827 } 822 }
828 } 823 }
829 824
@@ -895,7 +890,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
895 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 890 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
896#endif 891#endif
897 else 892 else
898 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); 893 iph->ttl = ip4_dst_hoplimit(&rt->dst);
899 } 894 }
900 895
901 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; 896 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -951,14 +946,11 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
951 if (iph->daddr) { 946 if (iph->daddr) {
952 struct flowi fl = { 947 struct flowi fl = {
953 .oif = tunnel->parms.link, 948 .oif = tunnel->parms.link,
954 .nl_u = { 949 .fl4_dst = iph->daddr,
955 .ip4_u = { 950 .fl4_src = iph->saddr,
956 .daddr = iph->daddr, 951 .fl4_tos = RT_TOS(iph->tos),
957 .saddr = iph->saddr, 952 .proto = IPPROTO_GRE,
958 .tos = RT_TOS(iph->tos) 953 .fl_gre_key = tunnel->parms.o_key
959 }
960 },
961 .proto = IPPROTO_GRE
962 }; 954 };
963 struct rtable *rt; 955 struct rtable *rt;
964 956
@@ -1216,14 +1208,11 @@ static int ipgre_open(struct net_device *dev)
1216 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1208 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1217 struct flowi fl = { 1209 struct flowi fl = {
1218 .oif = t->parms.link, 1210 .oif = t->parms.link,
1219 .nl_u = { 1211 .fl4_dst = t->parms.iph.daddr,
1220 .ip4_u = { 1212 .fl4_src = t->parms.iph.saddr,
1221 .daddr = t->parms.iph.daddr, 1213 .fl4_tos = RT_TOS(t->parms.iph.tos),
1222 .saddr = t->parms.iph.saddr, 1214 .proto = IPPROTO_GRE,
1223 .tos = RT_TOS(t->parms.iph.tos) 1215 .fl_gre_key = t->parms.o_key
1224 }
1225 },
1226 .proto = IPPROTO_GRE
1227 }; 1216 };
1228 struct rtable *rt; 1217 struct rtable *rt;
1229 1218
@@ -1775,3 +1764,4 @@ module_exit(ipgre_fini);
1775MODULE_LICENSE("GPL"); 1764MODULE_LICENSE("GPL");
1776MODULE_ALIAS_RTNL_LINK("gre"); 1765MODULE_ALIAS_RTNL_LINK("gre");
1777MODULE_ALIAS_RTNL_LINK("gretap"); 1766MODULE_ALIAS_RTNL_LINK("gretap");
1767MODULE_ALIAS("gre0");
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 439d2a34ee44..04c7b3ba6b39 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,6 +82,7 @@
82#include <linux/tcp.h> 82#include <linux/tcp.h>
83 83
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
85 86
86/* Generate a checksum for an outgoing IP datagram. */ 87/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph) 88__inline__ void ip_send_check(struct iphdr *iph)
@@ -130,7 +131,7 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 int ttl = inet->uc_ttl; 131 int ttl = inet->uc_ttl;
131 132
132 if (ttl < 0) 133 if (ttl < 0)
133 ttl = dst_metric(dst, RTAX_HOPLIMIT); 134 ttl = ip4_dst_hoplimit(dst);
134 return ttl; 135 return ttl;
135} 136}
136 137
@@ -341,15 +342,13 @@ int ip_queue_xmit(struct sk_buff *skb)
341 { 342 {
342 struct flowi fl = { .oif = sk->sk_bound_dev_if, 343 struct flowi fl = { .oif = sk->sk_bound_dev_if,
343 .mark = sk->sk_mark, 344 .mark = sk->sk_mark,
344 .nl_u = { .ip4_u = 345 .fl4_dst = daddr,
345 { .daddr = daddr, 346 .fl4_src = inet->inet_saddr,
346 .saddr = inet->inet_saddr, 347 .fl4_tos = RT_CONN_FLAGS(sk),
347 .tos = RT_CONN_FLAGS(sk) } },
348 .proto = sk->sk_protocol, 348 .proto = sk->sk_protocol,
349 .flags = inet_sk_flowi_flags(sk), 349 .flags = inet_sk_flowi_flags(sk),
350 .uli_u = { .ports = 350 .fl_ip_sport = inet->inet_sport,
351 { .sport = inet->inet_sport, 351 .fl_ip_dport = inet->inet_dport };
352 .dport = inet->inet_dport } } };
353 352
354 /* If this fails, retransmit mechanism of transport layer will 353 /* If this fails, retransmit mechanism of transport layer will
355 * keep trying until route appears or the connection times 354 * keep trying until route appears or the connection times
@@ -1404,14 +1403,11 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1404 1403
1405 { 1404 {
1406 struct flowi fl = { .oif = arg->bound_dev_if, 1405 struct flowi fl = { .oif = arg->bound_dev_if,
1407 .nl_u = { .ip4_u = 1406 .fl4_dst = daddr,
1408 { .daddr = daddr, 1407 .fl4_src = rt->rt_spec_dst,
1409 .saddr = rt->rt_spec_dst, 1408 .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
1410 .tos = RT_TOS(ip_hdr(skb)->tos) } }, 1409 .fl_ip_sport = tcp_hdr(skb)->dest,
1411 /* Not quite clean, but right. */ 1410 .fl_ip_dport = tcp_hdr(skb)->source,
1412 .uli_u = { .ports =
1413 { .sport = tcp_hdr(skb)->dest,
1414 .dport = tcp_hdr(skb)->source } },
1415 .proto = sk->sk_protocol, 1411 .proto = sk->sk_protocol,
1416 .flags = ip_reply_arg_flowi_flags(arg) }; 1412 .flags = ip_reply_arg_flowi_flags(arg) };
1417 security_skb_classify_flow(skb, &fl); 1413 security_skb_classify_flow(skb, &fl);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 3a6e1ec5e9ae..2b097752426b 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1191,13 +1191,13 @@ static int __init ic_dynamic(void)
1191 (ic_proto_enabled & IC_USE_DHCP) && 1191 (ic_proto_enabled & IC_USE_DHCP) &&
1192 ic_dhcp_msgtype != DHCPACK) { 1192 ic_dhcp_msgtype != DHCPACK) {
1193 ic_got_reply = 0; 1193 ic_got_reply = 0;
1194 printk(","); 1194 printk(KERN_CONT ",");
1195 continue; 1195 continue;
1196 } 1196 }
1197#endif /* IPCONFIG_DHCP */ 1197#endif /* IPCONFIG_DHCP */
1198 1198
1199 if (ic_got_reply) { 1199 if (ic_got_reply) {
1200 printk(" OK\n"); 1200 printk(KERN_CONT " OK\n");
1201 break; 1201 break;
1202 } 1202 }
1203 1203
@@ -1205,7 +1205,7 @@ static int __init ic_dynamic(void)
1205 continue; 1205 continue;
1206 1206
1207 if (! --retries) { 1207 if (! --retries) {
1208 printk(" timed out!\n"); 1208 printk(KERN_CONT " timed out!\n");
1209 break; 1209 break;
1210 } 1210 }
1211 1211
@@ -1215,7 +1215,7 @@ static int __init ic_dynamic(void)
1215 if (timeout > CONF_TIMEOUT_MAX) 1215 if (timeout > CONF_TIMEOUT_MAX)
1216 timeout = CONF_TIMEOUT_MAX; 1216 timeout = CONF_TIMEOUT_MAX;
1217 1217
1218 printk("."); 1218 printk(KERN_CONT ".");
1219 } 1219 }
1220 1220
1221#ifdef IPCONFIG_BOOTP 1221#ifdef IPCONFIG_BOOTP
@@ -1236,7 +1236,7 @@ static int __init ic_dynamic(void)
1236 ((ic_got_reply & IC_RARP) ? "RARP" 1236 ((ic_got_reply & IC_RARP) ? "RARP"
1237 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), 1237 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1238 &ic_servaddr); 1238 &ic_servaddr);
1239 printk("my address is %pI4\n", &ic_myaddr); 1239 printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
1240 1240
1241 return 0; 1241 return 0;
1242} 1242}
@@ -1468,19 +1468,19 @@ static int __init ip_auto_config(void)
1468 /* 1468 /*
1469 * Clue in the operator. 1469 * Clue in the operator.
1470 */ 1470 */
1471 printk("IP-Config: Complete:"); 1471 printk("IP-Config: Complete:\n");
1472 printk("\n device=%s", ic_dev->name); 1472 printk(" device=%s", ic_dev->name);
1473 printk(", addr=%pI4", &ic_myaddr); 1473 printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
1474 printk(", mask=%pI4", &ic_netmask); 1474 printk(KERN_CONT ", mask=%pI4", &ic_netmask);
1475 printk(", gw=%pI4", &ic_gateway); 1475 printk(KERN_CONT ", gw=%pI4", &ic_gateway);
1476 printk(",\n host=%s, domain=%s, nis-domain=%s", 1476 printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
1477 utsname()->nodename, ic_domain, utsname()->domainname); 1477 utsname()->nodename, ic_domain, utsname()->domainname);
1478 printk(",\n bootserver=%pI4", &ic_servaddr); 1478 printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
1479 printk(", rootserver=%pI4", &root_server_addr); 1479 printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
1480 printk(", rootpath=%s", root_server_path); 1480 printk(KERN_CONT ", rootpath=%s", root_server_path);
1481 if (ic_dev_mtu) 1481 if (ic_dev_mtu)
1482 printk(", mtu=%d", ic_dev_mtu); 1482 printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
1483 printk("\n"); 1483 printk(KERN_CONT "\n");
1484#endif /* !SILENT */ 1484#endif /* !SILENT */
1485 1485
1486 return 0; 1486 return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index cd300aaee78f..988f52fba54a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -463,13 +463,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
463 { 463 {
464 struct flowi fl = { 464 struct flowi fl = {
465 .oif = tunnel->parms.link, 465 .oif = tunnel->parms.link,
466 .nl_u = { 466 .fl4_dst = dst,
467 .ip4_u = { 467 .fl4_src= tiph->saddr,
468 .daddr = dst, 468 .fl4_tos = RT_TOS(tos),
469 .saddr = tiph->saddr,
470 .tos = RT_TOS(tos)
471 }
472 },
473 .proto = IPPROTO_IPIP 469 .proto = IPPROTO_IPIP
474 }; 470 };
475 471
@@ -589,13 +585,9 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
589 if (iph->daddr) { 585 if (iph->daddr) {
590 struct flowi fl = { 586 struct flowi fl = {
591 .oif = tunnel->parms.link, 587 .oif = tunnel->parms.link,
592 .nl_u = { 588 .fl4_dst = iph->daddr,
593 .ip4_u = { 589 .fl4_src = iph->saddr,
594 .daddr = iph->daddr, 590 .fl4_tos = RT_TOS(iph->tos),
595 .saddr = iph->saddr,
596 .tos = RT_TOS(iph->tos)
597 }
598 },
599 .proto = IPPROTO_IPIP 591 .proto = IPPROTO_IPIP
600 }; 592 };
601 struct rtable *rt; 593 struct rtable *rt;
@@ -921,3 +913,4 @@ static void __exit ipip_fini(void)
921module_init(ipip_init); 913module_init(ipip_init);
922module_exit(ipip_fini); 914module_exit(ipip_fini);
923MODULE_LICENSE("GPL"); 915MODULE_LICENSE("GPL");
916MODULE_ALIAS("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ef2b0089e0ea..3f3a9afd73e0 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1537,13 +1537,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1537 if (vif->flags & VIFF_TUNNEL) { 1537 if (vif->flags & VIFF_TUNNEL) {
1538 struct flowi fl = { 1538 struct flowi fl = {
1539 .oif = vif->link, 1539 .oif = vif->link,
1540 .nl_u = { 1540 .fl4_dst = vif->remote,
1541 .ip4_u = { 1541 .fl4_src = vif->local,
1542 .daddr = vif->remote, 1542 .fl4_tos = RT_TOS(iph->tos),
1543 .saddr = vif->local,
1544 .tos = RT_TOS(iph->tos)
1545 }
1546 },
1547 .proto = IPPROTO_IPIP 1543 .proto = IPPROTO_IPIP
1548 }; 1544 };
1549 1545
@@ -1553,12 +1549,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1553 } else { 1549 } else {
1554 struct flowi fl = { 1550 struct flowi fl = {
1555 .oif = vif->link, 1551 .oif = vif->link,
1556 .nl_u = { 1552 .fl4_dst = iph->daddr,
1557 .ip4_u = { 1553 .fl4_tos = RT_TOS(iph->tos),
1558 .daddr = iph->daddr,
1559 .tos = RT_TOS(iph->tos)
1560 }
1561 },
1562 .proto = IPPROTO_IPIP 1554 .proto = IPPROTO_IPIP
1563 }; 1555 };
1564 1556
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index d88a46c54fd1..994a1f29ebbc 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -31,10 +31,10 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. 31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
32 */ 32 */
33 if (addr_type == RTN_LOCAL) { 33 if (addr_type == RTN_LOCAL) {
34 fl.nl_u.ip4_u.daddr = iph->daddr; 34 fl.fl4_dst = iph->daddr;
35 if (type == RTN_LOCAL) 35 if (type == RTN_LOCAL)
36 fl.nl_u.ip4_u.saddr = iph->saddr; 36 fl.fl4_src = iph->saddr;
37 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); 37 fl.fl4_tos = RT_TOS(iph->tos);
38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
39 fl.mark = skb->mark; 39 fl.mark = skb->mark;
40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
@@ -47,7 +47,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
47 } else { 47 } else {
48 /* non-local src, find valid iif to satisfy 48 /* non-local src, find valid iif to satisfy
49 * rp-filter when calling ip_route_input. */ 49 * rp-filter when calling ip_route_input. */
50 fl.nl_u.ip4_u.daddr = iph->saddr; 50 fl.fl4_dst = iph->saddr;
51 if (ip_route_output_key(net, &rt, &fl) != 0) 51 if (ip_route_output_key(net, &rt, &fl) != 0)
52 return -1; 52 return -1;
53 53
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 48111594ee9b..19eb59d01037 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,15 +3,15 @@
3# 3#
4 4
5# objects for l3 independent conntrack 5# objects for l3 independent conntrack
6nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o 6nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) 7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
8ifeq ($(CONFIG_PROC_FS),y) 8ifeq ($(CONFIG_PROC_FS),y)
9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o 9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o 13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o 14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15 15
16# connection tracking 16# connection tracking
17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3fac340a28d5..47e5178b998b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -883,6 +883,7 @@ static int compat_table_info(const struct xt_table_info *info,
883 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 883 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
884 newinfo->initial_entries = 0; 884 newinfo->initial_entries = 0;
885 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 885 loc_cpu_entry = info->entries[raw_smp_processor_id()];
886 xt_compat_init_offsets(NFPROTO_ARP, info->number);
886 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 887 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
887 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 888 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
888 if (ret != 0) 889 if (ret != 0)
@@ -1350,6 +1351,7 @@ static int translate_compat_table(const char *name,
1350 duprintf("translate_compat_table: size %u\n", info->size); 1351 duprintf("translate_compat_table: size %u\n", info->size);
1351 j = 0; 1352 j = 0;
1352 xt_compat_lock(NFPROTO_ARP); 1353 xt_compat_lock(NFPROTO_ARP);
1354 xt_compat_init_offsets(NFPROTO_ARP, number);
1353 /* Walk through entries, checking offsets. */ 1355 /* Walk through entries, checking offsets. */
1354 xt_entry_foreach(iter0, entry0, total_size) { 1356 xt_entry_foreach(iter0, entry0, total_size) {
1355 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1357 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d633b3b6..c5a75d70970f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1080,6 +1080,7 @@ static int compat_table_info(const struct xt_table_info *info,
1080 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1080 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1081 newinfo->initial_entries = 0; 1081 newinfo->initial_entries = 0;
1082 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1082 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1083 xt_compat_init_offsets(AF_INET, info->number);
1083 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1084 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1084 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1085 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1085 if (ret != 0) 1086 if (ret != 0)
@@ -1681,6 +1682,7 @@ translate_compat_table(struct net *net,
1681 duprintf("translate_compat_table: size %u\n", info->size); 1682 duprintf("translate_compat_table: size %u\n", info->size);
1682 j = 0; 1683 j = 0;
1683 xt_compat_lock(AF_INET); 1684 xt_compat_lock(AF_INET);
1685 xt_compat_init_offsets(AF_INET, number);
1684 /* Walk through entries, checking offsets. */ 1686 /* Walk through entries, checking offsets. */
1685 xt_entry_foreach(iter0, entry0, total_size) { 1687 xt_entry_foreach(iter0, entry0, total_size) {
1686 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1688 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 43eec80c0e7c..1ff79e557f96 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -116,7 +116,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
116 if (ip_route_me_harder(nskb, addr_type)) 116 if (ip_route_me_harder(nskb, addr_type))
117 goto free_nskb; 117 goto free_nskb;
118 118
119 niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); 119 niph->ttl = ip4_dst_hoplimit(skb_dst(nskb));
120 120
121 /* "Never happens" */ 121 /* "Never happens" */
122 if (nskb->len > dst_mtu(skb_dst(nskb))) 122 if (nskb->len > dst_mtu(skb_dst(nskb)))
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index ab9c05c9734e..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -100,7 +100,7 @@ static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
100 100
101 ret = security_secid_to_secctx(ct->secmark, &secctx, &len); 101 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
102 if (ret) 102 if (ret)
103 return ret; 103 return 0;
104 104
105 ret = seq_printf(s, "secctx=%s ", secctx); 105 ret = seq_printf(s, "secctx=%s ", secctx);
106 106
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4ae1f203f7cb..b14ec7d03b6e 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
59 local_bh_enable(); 59 local_bh_enable();
60 60
61 socket_seq_show(seq); 61 socket_seq_show(seq);
62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
63 sock_prot_inuse_get(net, &tcp_prot), orphans, 63 sock_prot_inuse_get(net, &tcp_prot), orphans,
64 tcp_death_row.tw_count, sockets, 64 tcp_death_row.tw_count, sockets,
65 atomic_read(&tcp_memory_allocated)); 65 atomic_long_read(&tcp_memory_allocated));
66 seq_printf(seq, "UDP: inuse %d mem %d\n", 66 seq_printf(seq, "UDP: inuse %d mem %ld\n",
67 sock_prot_inuse_get(net, &udp_prot), 67 sock_prot_inuse_get(net, &udp_prot),
68 atomic_read(&udp_memory_allocated)); 68 atomic_long_read(&udp_memory_allocated));
69 seq_printf(seq, "UDPLITE: inuse %d\n", 69 seq_printf(seq, "UDPLITE: inuse %d\n",
70 sock_prot_inuse_get(net, &udplite_prot)); 70 sock_prot_inuse_get(net, &udplite_prot));
71 seq_printf(seq, "RAW: inuse %d\n", 71 seq_printf(seq, "RAW: inuse %d\n",
@@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = {
253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), 253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), 254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), 255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
256 SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
256 SNMP_MIB_SENTINEL 257 SNMP_MIB_SENTINEL
257}; 258};
258 259
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1f85ef289895..a3d5ab786e81 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -549,10 +549,9 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
549 { 549 {
550 struct flowi fl = { .oif = ipc.oif, 550 struct flowi fl = { .oif = ipc.oif,
551 .mark = sk->sk_mark, 551 .mark = sk->sk_mark,
552 .nl_u = { .ip4_u = 552 .fl4_dst = daddr,
553 { .daddr = daddr, 553 .fl4_src = saddr,
554 .saddr = saddr, 554 .fl4_tos = tos,
555 .tos = tos } },
556 .proto = inet->hdrincl ? IPPROTO_RAW : 555 .proto = inet->hdrincl ? IPPROTO_RAW :
557 sk->sk_protocol, 556 sk->sk_protocol,
558 }; 557 };
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f70ae1bccb8a..3e5b7cc2db4f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -139,6 +139,8 @@ static unsigned long expires_ljiffies;
139 */ 139 */
140 140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
143static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
142static void ipv4_dst_destroy(struct dst_entry *dst); 144static void ipv4_dst_destroy(struct dst_entry *dst);
143static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144static void ipv4_link_failure(struct sk_buff *skb); 146static void ipv4_link_failure(struct sk_buff *skb);
@@ -155,6 +157,8 @@ static struct dst_ops ipv4_dst_ops = {
155 .protocol = cpu_to_be16(ETH_P_IP), 157 .protocol = cpu_to_be16(ETH_P_IP),
156 .gc = rt_garbage_collect, 158 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check, 159 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss,
161 .default_mtu = ipv4_default_mtu,
158 .destroy = ipv4_dst_destroy, 162 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown, 163 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice, 164 .negative_advice = ipv4_negative_advice,
@@ -383,8 +387,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
383 (__force u32)r->rt_gateway, 387 (__force u32)r->rt_gateway,
384 r->rt_flags, atomic_read(&r->dst.__refcnt), 388 r->rt_flags, atomic_read(&r->dst.__refcnt),
385 r->dst.__use, 0, (__force u32)r->rt_src, 389 r->dst.__use, 0, (__force u32)r->rt_src,
386 (dst_metric(&r->dst, RTAX_ADVMSS) ? 390 dst_metric_advmss(&r->dst) + 40,
387 (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
388 dst_metric(&r->dst, RTAX_WINDOW), 391 dst_metric(&r->dst, RTAX_WINDOW),
389 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 392 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
390 dst_metric(&r->dst, RTAX_RTTVAR)), 393 dst_metric(&r->dst, RTAX_RTTVAR)),
@@ -684,17 +687,17 @@ static inline bool rt_caching(const struct net *net)
684static inline bool compare_hash_inputs(const struct flowi *fl1, 687static inline bool compare_hash_inputs(const struct flowi *fl1,
685 const struct flowi *fl2) 688 const struct flowi *fl2)
686{ 689{
687 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | 690 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
688 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | 691 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
689 (fl1->iif ^ fl2->iif)) == 0); 692 (fl1->iif ^ fl2->iif)) == 0);
690} 693}
691 694
692static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 695static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
693{ 696{
694 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | 697 return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
695 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | 698 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
696 (fl1->mark ^ fl2->mark) | 699 (fl1->mark ^ fl2->mark) |
697 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | 700 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
698 (fl1->oif ^ fl2->oif) | 701 (fl1->oif ^ fl2->oif) |
699 (fl1->iif ^ fl2->iif)) == 0; 702 (fl1->iif ^ fl2->iif)) == 0;
700} 703}
@@ -714,13 +717,15 @@ static inline int rt_is_expired(struct rtable *rth)
714 * Can be called by a softirq or a process. 717 * Can be called by a softirq or a process.
715 * In the later case, we want to be reschedule if necessary 718 * In the later case, we want to be reschedule if necessary
716 */ 719 */
717static void rt_do_flush(int process_context) 720static void rt_do_flush(struct net *net, int process_context)
718{ 721{
719 unsigned int i; 722 unsigned int i;
720 struct rtable *rth, *next; 723 struct rtable *rth, *next;
721 struct rtable * tail;
722 724
723 for (i = 0; i <= rt_hash_mask; i++) { 725 for (i = 0; i <= rt_hash_mask; i++) {
726 struct rtable __rcu **pprev;
727 struct rtable *list;
728
724 if (process_context && need_resched()) 729 if (process_context && need_resched())
725 cond_resched(); 730 cond_resched();
726 rth = rcu_dereference_raw(rt_hash_table[i].chain); 731 rth = rcu_dereference_raw(rt_hash_table[i].chain);
@@ -728,50 +733,32 @@ static void rt_do_flush(int process_context)
728 continue; 733 continue;
729 734
730 spin_lock_bh(rt_hash_lock_addr(i)); 735 spin_lock_bh(rt_hash_lock_addr(i));
731#ifdef CONFIG_NET_NS
732 {
733 struct rtable __rcu **prev;
734 struct rtable *p;
735 736
736 rth = rcu_dereference_protected(rt_hash_table[i].chain, 737 list = NULL;
738 pprev = &rt_hash_table[i].chain;
739 rth = rcu_dereference_protected(*pprev,
737 lockdep_is_held(rt_hash_lock_addr(i))); 740 lockdep_is_held(rt_hash_lock_addr(i)));
738 741
739 /* defer releasing the head of the list after spin_unlock */ 742 while (rth) {
740 for (tail = rth; tail; 743 next = rcu_dereference_protected(rth->dst.rt_next,
741 tail = rcu_dereference_protected(tail->dst.rt_next,
742 lockdep_is_held(rt_hash_lock_addr(i))))
743 if (!rt_is_expired(tail))
744 break;
745 if (rth != tail)
746 rt_hash_table[i].chain = tail;
747
748 /* call rt_free on entries after the tail requiring flush */
749 prev = &rt_hash_table[i].chain;
750 for (p = rcu_dereference_protected(*prev,
751 lockdep_is_held(rt_hash_lock_addr(i))); 744 lockdep_is_held(rt_hash_lock_addr(i)));
752 p != NULL; 745
753 p = next) { 746 if (!net ||
754 next = rcu_dereference_protected(p->dst.rt_next, 747 net_eq(dev_net(rth->dst.dev), net)) {
755 lockdep_is_held(rt_hash_lock_addr(i))); 748 rcu_assign_pointer(*pprev, next);
756 if (!rt_is_expired(p)) { 749 rcu_assign_pointer(rth->dst.rt_next, list);
757 prev = &p->dst.rt_next; 750 list = rth;
758 } else { 751 } else {
759 *prev = next; 752 pprev = &rth->dst.rt_next;
760 rt_free(p);
761 } 753 }
754 rth = next;
762 } 755 }
763 } 756
764#else
765 rth = rcu_dereference_protected(rt_hash_table[i].chain,
766 lockdep_is_held(rt_hash_lock_addr(i)));
767 rcu_assign_pointer(rt_hash_table[i].chain, NULL);
768 tail = NULL;
769#endif
770 spin_unlock_bh(rt_hash_lock_addr(i)); 757 spin_unlock_bh(rt_hash_lock_addr(i));
771 758
772 for (; rth != tail; rth = next) { 759 for (; list; list = next) {
773 next = rcu_dereference_protected(rth->dst.rt_next, 1); 760 next = rcu_dereference_protected(list->dst.rt_next, 1);
774 rt_free(rth); 761 rt_free(list);
775 } 762 }
776 } 763 }
777} 764}
@@ -919,13 +906,13 @@ void rt_cache_flush(struct net *net, int delay)
919{ 906{
920 rt_cache_invalidate(net); 907 rt_cache_invalidate(net);
921 if (delay >= 0) 908 if (delay >= 0)
922 rt_do_flush(!in_softirq()); 909 rt_do_flush(net, !in_softirq());
923} 910}
924 911
925/* Flush previous cache invalidated entries from the cache */ 912/* Flush previous cache invalidated entries from the cache */
926void rt_cache_flush_batch(void) 913void rt_cache_flush_batch(struct net *net)
927{ 914{
928 rt_do_flush(!in_softirq()); 915 rt_do_flush(net, !in_softirq());
929} 916}
930 917
931static void rt_emergency_hash_rebuild(struct net *net) 918static void rt_emergency_hash_rebuild(struct net *net)
@@ -1289,7 +1276,7 @@ void rt_bind_peer(struct rtable *rt, int create)
1289{ 1276{
1290 struct inet_peer *peer; 1277 struct inet_peer *peer;
1291 1278
1292 peer = inet_getpeer(rt->rt_dst, create); 1279 peer = inet_getpeer_v4(rt->rt_dst, create);
1293 1280
1294 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) 1281 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1295 inet_putpeer(peer); 1282 inet_putpeer(peer);
@@ -1686,11 +1673,14 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1686 if (mtu < dst_mtu(&rth->dst)) { 1673 if (mtu < dst_mtu(&rth->dst)) {
1687 dst_confirm(&rth->dst); 1674 dst_confirm(&rth->dst);
1688 if (mtu < ip_rt_min_pmtu) { 1675 if (mtu < ip_rt_min_pmtu) {
1676 u32 lock = dst_metric(&rth->dst,
1677 RTAX_LOCK);
1689 mtu = ip_rt_min_pmtu; 1678 mtu = ip_rt_min_pmtu;
1690 rth->dst.metrics[RTAX_LOCK-1] |= 1679 lock |= (1 << RTAX_MTU);
1691 (1 << RTAX_MTU); 1680 dst_metric_set(&rth->dst, RTAX_LOCK,
1681 lock);
1692 } 1682 }
1693 rth->dst.metrics[RTAX_MTU-1] = mtu; 1683 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1694 dst_set_expires(&rth->dst, 1684 dst_set_expires(&rth->dst,
1695 ip_rt_mtu_expires); 1685 ip_rt_mtu_expires);
1696 } 1686 }
@@ -1708,10 +1698,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1708 if (dst_mtu(dst) > mtu && mtu >= 68 && 1698 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1709 !(dst_metric_locked(dst, RTAX_MTU))) { 1699 !(dst_metric_locked(dst, RTAX_MTU))) {
1710 if (mtu < ip_rt_min_pmtu) { 1700 if (mtu < ip_rt_min_pmtu) {
1701 u32 lock = dst_metric(dst, RTAX_LOCK);
1711 mtu = ip_rt_min_pmtu; 1702 mtu = ip_rt_min_pmtu;
1712 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); 1703 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1713 } 1704 }
1714 dst->metrics[RTAX_MTU-1] = mtu; 1705 dst_metric_set(dst, RTAX_MTU, mtu);
1715 dst_set_expires(dst, ip_rt_mtu_expires); 1706 dst_set_expires(dst, ip_rt_mtu_expires);
1716 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); 1707 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1717 } 1708 }
@@ -1794,38 +1785,55 @@ static void set_class_tag(struct rtable *rt, u32 tag)
1794} 1785}
1795#endif 1786#endif
1796 1787
1788static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1789{
1790 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1791
1792 if (advmss == 0) {
1793 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1794 ip_rt_min_advmss);
1795 if (advmss > 65535 - 40)
1796 advmss = 65535 - 40;
1797 }
1798 return advmss;
1799}
1800
1801static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1802{
1803 unsigned int mtu = dst->dev->mtu;
1804
1805 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1806 const struct rtable *rt = (const struct rtable *) dst;
1807
1808 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1809 mtu = 576;
1810 }
1811
1812 if (mtu > IP_MAX_MTU)
1813 mtu = IP_MAX_MTU;
1814
1815 return mtu;
1816}
1817
1797static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1818static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1798{ 1819{
1820 struct dst_entry *dst = &rt->dst;
1799 struct fib_info *fi = res->fi; 1821 struct fib_info *fi = res->fi;
1800 1822
1801 if (fi) { 1823 if (fi) {
1802 if (FIB_RES_GW(*res) && 1824 if (FIB_RES_GW(*res) &&
1803 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1804 rt->rt_gateway = FIB_RES_GW(*res); 1826 rt->rt_gateway = FIB_RES_GW(*res);
1805 memcpy(rt->dst.metrics, fi->fib_metrics, 1827 dst_import_metrics(dst, fi->fib_metrics);
1806 sizeof(rt->dst.metrics));
1807 if (fi->fib_mtu == 0) {
1808 rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1809 if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1810 rt->rt_gateway != rt->rt_dst &&
1811 rt->dst.dev->mtu > 576)
1812 rt->dst.metrics[RTAX_MTU-1] = 576;
1813 }
1814#ifdef CONFIG_IP_ROUTE_CLASSID 1828#ifdef CONFIG_IP_ROUTE_CLASSID
1815 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1816#endif 1830#endif
1817 } else 1831 }
1818 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; 1832
1819 1833 if (dst_mtu(dst) > IP_MAX_MTU)
1820 if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) 1834 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1821 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; 1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1822 if (dst_mtu(&rt->dst) > IP_MAX_MTU) 1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1823 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1824 if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1825 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1826 ip_rt_min_advmss);
1827 if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1828 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1829 1837
1830#ifdef CONFIG_IP_ROUTE_CLASSID 1838#ifdef CONFIG_IP_ROUTE_CLASSID
1831#ifdef CONFIG_IP_MULTIPLE_TABLES 1839#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -2089,12 +2097,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2089{ 2097{
2090 struct fib_result res; 2098 struct fib_result res;
2091 struct in_device *in_dev = __in_dev_get_rcu(dev); 2099 struct in_device *in_dev = __in_dev_get_rcu(dev);
2092 struct flowi fl = { .nl_u = { .ip4_u = 2100 struct flowi fl = { .fl4_dst = daddr,
2093 { .daddr = daddr, 2101 .fl4_src = saddr,
2094 .saddr = saddr, 2102 .fl4_tos = tos,
2095 .tos = tos, 2103 .fl4_scope = RT_SCOPE_UNIVERSE,
2096 .scope = RT_SCOPE_UNIVERSE,
2097 } },
2098 .mark = skb->mark, 2104 .mark = skb->mark,
2099 .iif = dev->ifindex }; 2105 .iif = dev->ifindex };
2100 unsigned flags = 0; 2106 unsigned flags = 0;
@@ -2480,14 +2486,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2480 const struct flowi *oldflp) 2486 const struct flowi *oldflp)
2481{ 2487{
2482 u32 tos = RT_FL_TOS(oldflp); 2488 u32 tos = RT_FL_TOS(oldflp);
2483 struct flowi fl = { .nl_u = { .ip4_u = 2489 struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2484 { .daddr = oldflp->fl4_dst, 2490 .fl4_src = oldflp->fl4_src,
2485 .saddr = oldflp->fl4_src, 2491 .fl4_tos = tos & IPTOS_RT_MASK,
2486 .tos = tos & IPTOS_RT_MASK, 2492 .fl4_scope = ((tos & RTO_ONLINK) ?
2487 .scope = ((tos & RTO_ONLINK) ? 2493 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2488 RT_SCOPE_LINK :
2489 RT_SCOPE_UNIVERSE),
2490 } },
2491 .mark = oldflp->mark, 2494 .mark = oldflp->mark,
2492 .iif = net->loopback_dev->ifindex, 2495 .iif = net->loopback_dev->ifindex,
2493 .oif = oldflp->oif }; 2496 .oif = oldflp->oif };
@@ -2559,9 +2562,10 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2559 goto out; 2562 goto out;
2560 2563
2561 /* RACE: Check return value of inet_select_addr instead. */ 2564 /* RACE: Check return value of inet_select_addr instead. */
2562 if (rcu_dereference(dev_out->ip_ptr) == NULL) 2565 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2563 goto out; /* Wrong error code */ 2566 err = -ENETUNREACH;
2564 2567 goto out;
2568 }
2565 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2569 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2566 ipv4_is_lbcast(oldflp->fl4_dst)) { 2570 ipv4_is_lbcast(oldflp->fl4_dst)) {
2567 if (!fl.fl4_src) 2571 if (!fl.fl4_src)
@@ -2622,8 +2626,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2622 } 2626 }
2623 2627
2624 if (res.type == RTN_LOCAL) { 2628 if (res.type == RTN_LOCAL) {
2625 if (!fl.fl4_src) 2629 if (!fl.fl4_src) {
2626 fl.fl4_src = fl.fl4_dst; 2630 if (res.fi->fib_prefsrc)
2631 fl.fl4_src = res.fi->fib_prefsrc;
2632 else
2633 fl.fl4_src = fl.fl4_dst;
2634 }
2627 dev_out = net->loopback_dev; 2635 dev_out = net->loopback_dev;
2628 fl.oif = dev_out->ifindex; 2636 fl.oif = dev_out->ifindex;
2629 res.fi = NULL; 2637 res.fi = NULL;
@@ -2725,7 +2733,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2725 new->__use = 1; 2733 new->__use = 1;
2726 new->input = dst_discard; 2734 new->input = dst_discard;
2727 new->output = dst_discard; 2735 new->output = dst_discard;
2728 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); 2736 dst_copy_metrics(new, &ort->dst);
2729 2737
2730 new->dev = ort->dst.dev; 2738 new->dev = ort->dst.dev;
2731 if (new->dev) 2739 if (new->dev)
@@ -2832,7 +2840,7 @@ static int rt_fill_info(struct net *net,
2832 if (rt->rt_dst != rt->rt_gateway) 2840 if (rt->rt_dst != rt->rt_gateway)
2833 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2841 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2834 2842
2835 if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) 2843 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2836 goto nla_put_failure; 2844 goto nla_put_failure;
2837 2845
2838 if (rt->fl.mark) 2846 if (rt->fl.mark)
@@ -2944,13 +2952,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2944 err = -rt->dst.error; 2952 err = -rt->dst.error;
2945 } else { 2953 } else {
2946 struct flowi fl = { 2954 struct flowi fl = {
2947 .nl_u = { 2955 .fl4_dst = dst,
2948 .ip4_u = { 2956 .fl4_src = src,
2949 .daddr = dst, 2957 .fl4_tos = rtm->rtm_tos,
2950 .saddr = src,
2951 .tos = rtm->rtm_tos,
2952 },
2953 },
2954 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, 2958 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2955 .mark = mark, 2959 .mark = mark,
2956 }; 2960 };
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650cace2180d..47519205a014 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -346,17 +346,14 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
346 */ 346 */
347 { 347 {
348 struct flowi fl = { .mark = sk->sk_mark, 348 struct flowi fl = { .mark = sk->sk_mark,
349 .nl_u = { .ip4_u = 349 .fl4_dst = ((opt && opt->srr) ?
350 { .daddr = ((opt && opt->srr) ? 350 opt->faddr : ireq->rmt_addr),
351 opt->faddr : 351 .fl4_src = ireq->loc_addr,
352 ireq->rmt_addr), 352 .fl4_tos = RT_CONN_FLAGS(sk),
353 .saddr = ireq->loc_addr,
354 .tos = RT_CONN_FLAGS(sk) } },
355 .proto = IPPROTO_TCP, 353 .proto = IPPROTO_TCP,
356 .flags = inet_sk_flowi_flags(sk), 354 .flags = inet_sk_flowi_flags(sk),
357 .uli_u = { .ports = 355 .fl_ip_sport = th->dest,
358 { .sport = th->dest, 356 .fl_ip_dport = th->source };
359 .dport = th->source } } };
360 security_req_classify_flow(req, &fl); 357 security_req_classify_flow(req, &fl);
361 if (ip_route_output_key(sock_net(sk), &rt, &fl)) { 358 if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
362 reqsk_free(req); 359 reqsk_free(req);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d96c1da4b17c..1a456652086b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -26,6 +26,10 @@ static int zero;
26static int tcp_retr1_max = 255; 26static int tcp_retr1_max = 255;
27static int ip_local_port_range_min[] = { 1, 1 }; 27static int ip_local_port_range_min[] = { 1, 1 };
28static int ip_local_port_range_max[] = { 65535, 65535 }; 28static int ip_local_port_range_max[] = { 65535, 65535 };
29static int tcp_adv_win_scale_min = -31;
30static int tcp_adv_win_scale_max = 31;
31static int ip_ttl_min = 1;
32static int ip_ttl_max = 255;
29 33
30/* Update system visible IP port range */ 34/* Update system visible IP port range */
31static void set_local_port_range(int range[2]) 35static void set_local_port_range(int range[2])
@@ -153,8 +157,9 @@ static struct ctl_table ipv4_table[] = {
153 .data = &sysctl_ip_default_ttl, 157 .data = &sysctl_ip_default_ttl,
154 .maxlen = sizeof(int), 158 .maxlen = sizeof(int),
155 .mode = 0644, 159 .mode = 0644,
156 .proc_handler = ipv4_doint_and_flush, 160 .proc_handler = proc_dointvec_minmax,
157 .extra2 = &init_net, 161 .extra1 = &ip_ttl_min,
162 .extra2 = &ip_ttl_max,
158 }, 163 },
159 { 164 {
160 .procname = "ip_no_pmtu_disc", 165 .procname = "ip_no_pmtu_disc",
@@ -398,7 +403,7 @@ static struct ctl_table ipv4_table[] = {
398 .data = &sysctl_tcp_mem, 403 .data = &sysctl_tcp_mem,
399 .maxlen = sizeof(sysctl_tcp_mem), 404 .maxlen = sizeof(sysctl_tcp_mem),
400 .mode = 0644, 405 .mode = 0644,
401 .proc_handler = proc_dointvec 406 .proc_handler = proc_doulongvec_minmax
402 }, 407 },
403 { 408 {
404 .procname = "tcp_wmem", 409 .procname = "tcp_wmem",
@@ -426,7 +431,9 @@ static struct ctl_table ipv4_table[] = {
426 .data = &sysctl_tcp_adv_win_scale, 431 .data = &sysctl_tcp_adv_win_scale,
427 .maxlen = sizeof(int), 432 .maxlen = sizeof(int),
428 .mode = 0644, 433 .mode = 0644,
429 .proc_handler = proc_dointvec 434 .proc_handler = proc_dointvec_minmax,
435 .extra1 = &tcp_adv_win_scale_min,
436 .extra2 = &tcp_adv_win_scale_max,
430 }, 437 },
431 { 438 {
432 .procname = "tcp_tw_reuse", 439 .procname = "tcp_tw_reuse",
@@ -602,8 +609,7 @@ static struct ctl_table ipv4_table[] = {
602 .data = &sysctl_udp_mem, 609 .data = &sysctl_udp_mem,
603 .maxlen = sizeof(sysctl_udp_mem), 610 .maxlen = sizeof(sysctl_udp_mem),
604 .mode = 0644, 611 .mode = 0644,
605 .proc_handler = proc_dointvec_minmax, 612 .proc_handler = proc_doulongvec_minmax,
606 .extra1 = &zero
607 }, 613 },
608 { 614 {
609 .procname = "udp_rmem_min", 615 .procname = "udp_rmem_min",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5f738c5c0dc4..6c11eece262c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
282struct percpu_counter tcp_orphan_count; 282struct percpu_counter tcp_orphan_count;
283EXPORT_SYMBOL_GPL(tcp_orphan_count); 283EXPORT_SYMBOL_GPL(tcp_orphan_count);
284 284
285int sysctl_tcp_mem[3] __read_mostly; 285long sysctl_tcp_mem[3] __read_mostly;
286int sysctl_tcp_wmem[3] __read_mostly; 286int sysctl_tcp_wmem[3] __read_mostly;
287int sysctl_tcp_rmem[3] __read_mostly; 287int sysctl_tcp_rmem[3] __read_mostly;
288 288
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
290EXPORT_SYMBOL(sysctl_tcp_rmem); 290EXPORT_SYMBOL(sysctl_tcp_rmem);
291EXPORT_SYMBOL(sysctl_tcp_wmem); 291EXPORT_SYMBOL(sysctl_tcp_wmem);
292 292
293atomic_t tcp_memory_allocated; /* Current allocated memory. */ 293atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
294EXPORT_SYMBOL(tcp_memory_allocated); 294EXPORT_SYMBOL(tcp_memory_allocated);
295 295
296/* 296/*
@@ -2244,7 +2244,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2244 /* Values greater than interface MTU won't take effect. However 2244 /* Values greater than interface MTU won't take effect. However
2245 * at the point when this call is done we typically don't yet 2245 * at the point when this call is done we typically don't yet
2246 * know which interface is going to be used */ 2246 * know which interface is going to be used */
2247 if (val < 8 || val > MAX_TCP_WINDOW) { 2247 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2248 err = -EINVAL; 2248 err = -EINVAL;
2249 break; 2249 break;
2250 } 2250 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3357f69e353d..2549b29b062d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + 259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
260 sizeof(struct sk_buff); 260 sizeof(struct sk_buff);
261 261
262 if (sk->sk_sndbuf < 3 * sndmem) 262 if (sk->sk_sndbuf < 3 * sndmem) {
263 sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); 263 sk->sk_sndbuf = 3 * sndmem;
264 if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
265 sk->sk_sndbuf = sysctl_tcp_wmem[2];
266 }
264} 267}
265 268
266/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) 269/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
396 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 399 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
397 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 400 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
398 !tcp_memory_pressure && 401 !tcp_memory_pressure &&
399 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { 402 atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
400 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 403 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
401 sysctl_tcp_rmem[2]); 404 sysctl_tcp_rmem[2]);
402 } 405 }
@@ -731,7 +734,7 @@ void tcp_update_metrics(struct sock *sk)
731 * Reset our results. 734 * Reset our results.
732 */ 735 */
733 if (!(dst_metric_locked(dst, RTAX_RTT))) 736 if (!(dst_metric_locked(dst, RTAX_RTT)))
734 dst->metrics[RTAX_RTT - 1] = 0; 737 dst_metric_set(dst, RTAX_RTT, 0);
735 return; 738 return;
736 } 739 }
737 740
@@ -773,34 +776,38 @@ void tcp_update_metrics(struct sock *sk)
773 if (dst_metric(dst, RTAX_SSTHRESH) && 776 if (dst_metric(dst, RTAX_SSTHRESH) &&
774 !dst_metric_locked(dst, RTAX_SSTHRESH) && 777 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
775 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) 778 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
776 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; 779 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
777 if (!dst_metric_locked(dst, RTAX_CWND) && 780 if (!dst_metric_locked(dst, RTAX_CWND) &&
778 tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) 781 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
779 dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; 782 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
780 } else if (tp->snd_cwnd > tp->snd_ssthresh && 783 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
781 icsk->icsk_ca_state == TCP_CA_Open) { 784 icsk->icsk_ca_state == TCP_CA_Open) {
782 /* Cong. avoidance phase, cwnd is reliable. */ 785 /* Cong. avoidance phase, cwnd is reliable. */
783 if (!dst_metric_locked(dst, RTAX_SSTHRESH)) 786 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
784 dst->metrics[RTAX_SSTHRESH-1] = 787 dst_metric_set(dst, RTAX_SSTHRESH,
785 max(tp->snd_cwnd >> 1, tp->snd_ssthresh); 788 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
786 if (!dst_metric_locked(dst, RTAX_CWND)) 789 if (!dst_metric_locked(dst, RTAX_CWND))
787 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; 790 dst_metric_set(dst, RTAX_CWND,
791 (dst_metric(dst, RTAX_CWND) +
792 tp->snd_cwnd) >> 1);
788 } else { 793 } else {
789 /* Else slow start did not finish, cwnd is non-sense, 794 /* Else slow start did not finish, cwnd is non-sense,
790 ssthresh may be also invalid. 795 ssthresh may be also invalid.
791 */ 796 */
792 if (!dst_metric_locked(dst, RTAX_CWND)) 797 if (!dst_metric_locked(dst, RTAX_CWND))
793 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; 798 dst_metric_set(dst, RTAX_CWND,
799 (dst_metric(dst, RTAX_CWND) +
800 tp->snd_ssthresh) >> 1);
794 if (dst_metric(dst, RTAX_SSTHRESH) && 801 if (dst_metric(dst, RTAX_SSTHRESH) &&
795 !dst_metric_locked(dst, RTAX_SSTHRESH) && 802 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
796 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) 803 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
797 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; 804 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
798 } 805 }
799 806
800 if (!dst_metric_locked(dst, RTAX_REORDERING)) { 807 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
801 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && 808 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
802 tp->reordering != sysctl_tcp_reordering) 809 tp->reordering != sysctl_tcp_reordering)
803 dst->metrics[RTAX_REORDERING-1] = tp->reordering; 810 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
804 } 811 }
805 } 812 }
806} 813}
@@ -909,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk)
909 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 916 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
910 } 917 }
911 tcp_set_rto(sk); 918 tcp_set_rto(sk);
912 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) 919 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
913 goto reset;
914
915cwnd:
916 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
917 tp->snd_cwnd_stamp = tcp_time_stamp;
918 return;
919
920reset: 920reset:
921 /* Play conservative. If timestamps are not 921 /* Play conservative. If timestamps are not
922 * supported, TCP will fail to recalculate correct 922 * supported, TCP will fail to recalculate correct
923 * rtt, if initial rto is too small. FORGET ALL AND RESET! 923 * rtt, if initial rto is too small. FORGET ALL AND RESET!
924 */ 924 */
925 if (!tp->rx_opt.saw_tstamp && tp->srtt) { 925 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
926 tp->srtt = 0; 926 tp->srtt = 0;
927 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; 927 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
928 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 928 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
929 }
929 } 930 }
930 goto cwnd; 931 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
932 tp->snd_cwnd_stamp = tcp_time_stamp;
931} 933}
932 934
933static void tcp_update_reordering(struct sock *sk, const int metric, 935static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -4861,7 +4863,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
4861 return 0; 4863 return 0;
4862 4864
4863 /* If we are under soft global TCP memory pressure, do not expand. */ 4865 /* If we are under soft global TCP memory pressure, do not expand. */
4864 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) 4866 if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
4865 return 0; 4867 return 0;
4866 4868
4867 /* If we filled the congestion window, do not expand. */ 4869 /* If we filled the congestion window, do not expand. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8f8527d41682..856f68466d49 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -415,6 +415,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
415 !icsk->icsk_backoff) 415 !icsk->icsk_backoff)
416 break; 416 break;
417 417
418 if (sock_owned_by_user(sk))
419 break;
420
418 icsk->icsk_backoff--; 421 icsk->icsk_backoff--;
419 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << 422 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
420 icsk->icsk_backoff; 423 icsk->icsk_backoff;
@@ -429,11 +432,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
429 if (remaining) { 432 if (remaining) {
430 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 433 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
431 remaining, TCP_RTO_MAX); 434 remaining, TCP_RTO_MAX);
432 } else if (sock_owned_by_user(sk)) {
433 /* RTO revert clocked out retransmission,
434 * but socket is locked. Will defer. */
435 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
436 HZ/20, TCP_RTO_MAX);
437 } else { 435 } else {
438 /* RTO revert clocked out retransmission. 436 /* RTO revert clocked out retransmission.
439 * Will retransmit now */ 437 * Will retransmit now */
@@ -1212,12 +1210,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1212}; 1210};
1213#endif 1211#endif
1214 1212
1215static struct timewait_sock_ops tcp_timewait_sock_ops = {
1216 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1217 .twsk_unique = tcp_twsk_unique,
1218 .twsk_destructor= tcp_twsk_destructor,
1219};
1220
1221int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1213int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1222{ 1214{
1223 struct tcp_extend_values tmp_ext; 1215 struct tcp_extend_values tmp_ext;
@@ -1349,7 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1349 tcp_death_row.sysctl_tw_recycle && 1341 tcp_death_row.sysctl_tw_recycle &&
1350 (dst = inet_csk_route_req(sk, req)) != NULL && 1342 (dst = inet_csk_route_req(sk, req)) != NULL &&
1351 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1343 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1352 peer->v4daddr == saddr) { 1344 peer->daddr.a4 == saddr) {
1353 inet_peer_refcheck(peer); 1345 inet_peer_refcheck(peer);
1354 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1346 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1355 (s32)(peer->tcp_ts - req->ts_recent) > 1347 (s32)(peer->tcp_ts - req->ts_recent) >
@@ -1444,7 +1436,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1444 1436
1445 tcp_mtup_init(newsk); 1437 tcp_mtup_init(newsk);
1446 tcp_sync_mss(newsk, dst_mtu(dst)); 1438 tcp_sync_mss(newsk, dst_mtu(dst));
1447 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 1439 newtp->advmss = dst_metric_advmss(dst);
1448 if (tcp_sk(sk)->rx_opt.user_mss && 1440 if (tcp_sk(sk)->rx_opt.user_mss &&
1449 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) 1441 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1450 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1442 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
@@ -1765,64 +1757,40 @@ do_time_wait:
1765 goto discard_it; 1757 goto discard_it;
1766} 1758}
1767 1759
1768/* VJ's idea. Save last timestamp seen from this destination 1760struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1769 * and hold it at least for normal timewait interval to use for duplicate
1770 * segment detection in subsequent connections, before they enter synchronized
1771 * state.
1772 */
1773
1774int tcp_v4_remember_stamp(struct sock *sk)
1775{ 1761{
1762 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1776 struct inet_sock *inet = inet_sk(sk); 1763 struct inet_sock *inet = inet_sk(sk);
1777 struct tcp_sock *tp = tcp_sk(sk); 1764 struct inet_peer *peer;
1778 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1779 struct inet_peer *peer = NULL;
1780 int release_it = 0;
1781 1765
1782 if (!rt || rt->rt_dst != inet->inet_daddr) { 1766 if (!rt || rt->rt_dst != inet->inet_daddr) {
1783 peer = inet_getpeer(inet->inet_daddr, 1); 1767 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1784 release_it = 1; 1768 *release_it = true;
1785 } else { 1769 } else {
1786 if (!rt->peer) 1770 if (!rt->peer)
1787 rt_bind_peer(rt, 1); 1771 rt_bind_peer(rt, 1);
1788 peer = rt->peer; 1772 peer = rt->peer;
1773 *release_it = false;
1789 } 1774 }
1790 1775
1791 if (peer) { 1776 return peer;
1792 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1793 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1794 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1795 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1796 peer->tcp_ts = tp->rx_opt.ts_recent;
1797 }
1798 if (release_it)
1799 inet_putpeer(peer);
1800 return 1;
1801 }
1802
1803 return 0;
1804} 1777}
1805EXPORT_SYMBOL(tcp_v4_remember_stamp); 1778EXPORT_SYMBOL(tcp_v4_get_peer);
1806 1779
1807int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) 1780void *tcp_v4_tw_get_peer(struct sock *sk)
1808{ 1781{
1809 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); 1782 struct inet_timewait_sock *tw = inet_twsk(sk);
1810
1811 if (peer) {
1812 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1813
1814 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1815 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1816 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1817 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1818 peer->tcp_ts = tcptw->tw_ts_recent;
1819 }
1820 inet_putpeer(peer);
1821 return 1;
1822 }
1823 1783
1824 return 0; 1784 return inet_getpeer_v4(tw->tw_daddr, 1);
1825} 1785}
1786EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1787
1788static struct timewait_sock_ops tcp_timewait_sock_ops = {
1789 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1790 .twsk_unique = tcp_twsk_unique,
1791 .twsk_destructor= tcp_twsk_destructor,
1792 .twsk_getpeer = tcp_v4_tw_get_peer,
1793};
1826 1794
1827const struct inet_connection_sock_af_ops ipv4_specific = { 1795const struct inet_connection_sock_af_ops ipv4_specific = {
1828 .queue_xmit = ip_queue_xmit, 1796 .queue_xmit = ip_queue_xmit,
@@ -1830,7 +1798,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1830 .rebuild_header = inet_sk_rebuild_header, 1798 .rebuild_header = inet_sk_rebuild_header,
1831 .conn_request = tcp_v4_conn_request, 1799 .conn_request = tcp_v4_conn_request,
1832 .syn_recv_sock = tcp_v4_syn_recv_sock, 1800 .syn_recv_sock = tcp_v4_syn_recv_sock,
1833 .remember_stamp = tcp_v4_remember_stamp, 1801 .get_peer = tcp_v4_get_peer,
1834 .net_header_len = sizeof(struct iphdr), 1802 .net_header_len = sizeof(struct iphdr),
1835 .setsockopt = ip_setsockopt, 1803 .setsockopt = ip_setsockopt,
1836 .getsockopt = ip_getsockopt, 1804 .getsockopt = ip_getsockopt,
@@ -2032,7 +2000,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2032get_req: 2000get_req:
2033 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; 2001 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2034 } 2002 }
2035 sk = sk_next(st->syn_wait_sk); 2003 sk = sk_nulls_next(st->syn_wait_sk);
2036 st->state = TCP_SEQ_STATE_LISTENING; 2004 st->state = TCP_SEQ_STATE_LISTENING;
2037 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2005 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2038 } else { 2006 } else {
@@ -2041,11 +2009,13 @@ get_req:
2041 if (reqsk_queue_len(&icsk->icsk_accept_queue)) 2009 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2042 goto start_req; 2010 goto start_req;
2043 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2011 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2044 sk = sk_next(sk); 2012 sk = sk_nulls_next(sk);
2045 } 2013 }
2046get_sk: 2014get_sk:
2047 sk_nulls_for_each_from(sk, node) { 2015 sk_nulls_for_each_from(sk, node) {
2048 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { 2016 if (!net_eq(sock_net(sk), net))
2017 continue;
2018 if (sk->sk_family == st->family) {
2049 cur = sk; 2019 cur = sk;
2050 goto out; 2020 goto out;
2051 } 2021 }
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 43cf901d7659..80b1f80759ab 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,6 +49,56 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static int tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return 1;
76 }
77
78 return 0;
79}
80
81static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return 1;
98 }
99 return 0;
100}
101
52static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 102static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
53{ 103{
54 if (seq == s_win) 104 if (seq == s_win)
@@ -149,14 +199,9 @@ kill_with_rst:
149 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 199 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
150 } 200 }
151 201
152 /* I am shamed, but failed to make it more elegant. 202 if (tcp_death_row.sysctl_tw_recycle &&
153 * Yes, it is direct reference to IP, which is impossible 203 tcptw->tw_ts_recent_stamp &&
154 * to generalize to IPv6. Taking into account that IPv6 204 tcp_tw_remember_stamp(tw))
155 * do not understand recycling in any case, it not
156 * a big problem in practice. --ANK */
157 if (tw->tw_family == AF_INET &&
158 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
159 tcp_v4_tw_remember_stamp(tw))
160 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, 205 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
161 TCP_TIMEWAIT_LEN); 206 TCP_TIMEWAIT_LEN);
162 else 207 else
@@ -274,7 +319,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
274 int recycle_ok = 0; 319 int recycle_ok = 0;
275 320
276 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
277 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); 322 recycle_ok = tcp_remember_stamp(sk);
278 323
279 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 324 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
280 tw = inet_twsk_alloc(sk, state); 325 tw = inet_twsk_alloc(sk, state);
@@ -347,7 +392,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
347 * socket up. We've got bigger problems than 392 * socket up. We've got bigger problems than
348 * non-graceful socket closings. 393 * non-graceful socket closings.
349 */ 394 */
350 LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); 395 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
351 } 396 }
352 397
353 tcp_update_metrics(sk); 398 tcp_update_metrics(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 05b1ecf36763..dc7c096ddfef 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -55,7 +55,7 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
55int sysctl_tcp_tso_win_divisor __read_mostly = 3; 55int sysctl_tcp_tso_win_divisor __read_mostly = 3;
56 56
57int sysctl_tcp_mtu_probing __read_mostly = 0; 57int sysctl_tcp_mtu_probing __read_mostly = 0;
58int sysctl_tcp_base_mss __read_mostly = 512; 58int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
59 59
60/* By default, RFC2861 behavior. */ 60/* By default, RFC2861 behavior. */
61int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 61int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
@@ -119,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk)
119 struct dst_entry *dst = __sk_dst_get(sk); 119 struct dst_entry *dst = __sk_dst_get(sk);
120 int mss = tp->advmss; 120 int mss = tp->advmss;
121 121
122 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { 122 if (dst) {
123 mss = dst_metric(dst, RTAX_ADVMSS); 123 unsigned int metric = dst_metric_advmss(dst);
124 tp->advmss = mss; 124
125 if (metric < mss) {
126 mss = metric;
127 tp->advmss = mss;
128 }
125 } 129 }
126 130
127 return (__u16)mss; 131 return (__u16)mss;
@@ -224,18 +228,22 @@ void tcp_select_initial_window(int __space, __u32 mss,
224 } 228 }
225 } 229 }
226 230
227 /* Set initial window to value enough for senders, following RFC5681. */ 231 /* Set initial window to a value enough for senders starting with
232 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
233 * a limit on the initial window when mss is larger than 1460.
234 */
228 if (mss > (1 << *rcv_wscale)) { 235 if (mss > (1 << *rcv_wscale)) {
229 int init_cwnd = rfc3390_bytes_to_packets(mss); 236 int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
230 237 if (mss > 1460)
238 init_cwnd =
239 max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
231 /* when initializing use the value from init_rcv_wnd 240 /* when initializing use the value from init_rcv_wnd
232 * rather than the default from above 241 * rather than the default from above
233 */ 242 */
234 if (init_rcv_wnd && 243 if (init_rcv_wnd)
235 (*rcv_wnd > init_rcv_wnd * mss)) 244 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
236 *rcv_wnd = init_rcv_wnd * mss; 245 else
237 else if (*rcv_wnd > init_cwnd * mss) 246 *rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
238 *rcv_wnd = init_cwnd * mss;
239 } 247 }
240 248
241 /* Set the clamp no higher than max representable value */ 249 /* Set the clamp no higher than max representable value */
@@ -386,27 +394,30 @@ struct tcp_out_options {
386 */ 394 */
387static u8 tcp_cookie_size_check(u8 desired) 395static u8 tcp_cookie_size_check(u8 desired)
388{ 396{
389 if (desired > 0) { 397 int cookie_size;
398
399 if (desired > 0)
390 /* previously specified */ 400 /* previously specified */
391 return desired; 401 return desired;
392 } 402
393 if (sysctl_tcp_cookie_size <= 0) { 403 cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
404 if (cookie_size <= 0)
394 /* no default specified */ 405 /* no default specified */
395 return 0; 406 return 0;
396 } 407
397 if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { 408 if (cookie_size <= TCP_COOKIE_MIN)
398 /* value too small, specify minimum */ 409 /* value too small, specify minimum */
399 return TCP_COOKIE_MIN; 410 return TCP_COOKIE_MIN;
400 } 411
401 if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { 412 if (cookie_size >= TCP_COOKIE_MAX)
402 /* value too large, specify maximum */ 413 /* value too large, specify maximum */
403 return TCP_COOKIE_MAX; 414 return TCP_COOKIE_MAX;
404 } 415
405 if (0x1 & sysctl_tcp_cookie_size) { 416 if (cookie_size & 1)
406 /* 8-bit multiple, illegal, fix it */ 417 /* 8-bit multiple, illegal, fix it */
407 return (u8)(sysctl_tcp_cookie_size + 0x1); 418 cookie_size++;
408 } 419
409 return (u8)sysctl_tcp_cookie_size; 420 return (u8)cookie_size;
410} 421}
411 422
412/* Write previously computed TCP options to the packet. 423/* Write previously computed TCP options to the packet.
@@ -822,8 +833,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
822 &md5); 833 &md5);
823 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); 834 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
824 835
825 if (tcp_packets_in_flight(tp) == 0) 836 if (tcp_packets_in_flight(tp) == 0) {
826 tcp_ca_event(sk, CA_EVENT_TX_START); 837 tcp_ca_event(sk, CA_EVENT_TX_START);
838 skb->ooo_okay = 1;
839 } else
840 skb->ooo_okay = 0;
827 841
828 skb_push(skb, tcp_header_size); 842 skb_push(skb, tcp_header_size);
829 skb_reset_transport_header(skb); 843 skb_reset_transport_header(skb);
@@ -1513,6 +1527,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1513 struct tcp_sock *tp = tcp_sk(sk); 1527 struct tcp_sock *tp = tcp_sk(sk);
1514 const struct inet_connection_sock *icsk = inet_csk(sk); 1528 const struct inet_connection_sock *icsk = inet_csk(sk);
1515 u32 send_win, cong_win, limit, in_flight; 1529 u32 send_win, cong_win, limit, in_flight;
1530 int win_divisor;
1516 1531
1517 if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) 1532 if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
1518 goto send_now; 1533 goto send_now;
@@ -1544,13 +1559,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1544 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) 1559 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1545 goto send_now; 1560 goto send_now;
1546 1561
1547 if (sysctl_tcp_tso_win_divisor) { 1562 win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1563 if (win_divisor) {
1548 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); 1564 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1549 1565
1550 /* If at least some fraction of a window is available, 1566 /* If at least some fraction of a window is available,
1551 * just use it. 1567 * just use it.
1552 */ 1568 */
1553 chunk /= sysctl_tcp_tso_win_divisor; 1569 chunk /= win_divisor;
1554 if (limit >= chunk) 1570 if (limit >= chunk)
1555 goto send_now; 1571 goto send_now;
1556 } else { 1572 } else {
@@ -2415,7 +2431,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2415 2431
2416 skb_dst_set(skb, dst_clone(dst)); 2432 skb_dst_set(skb, dst_clone(dst));
2417 2433
2418 mss = dst_metric(dst, RTAX_ADVMSS); 2434 mss = dst_metric_advmss(dst);
2419 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2435 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2420 mss = tp->rx_opt.user_mss; 2436 mss = tp->rx_opt.user_mss;
2421 2437
@@ -2549,7 +2565,7 @@ static void tcp_connect_init(struct sock *sk)
2549 2565
2550 if (!tp->window_clamp) 2566 if (!tp->window_clamp)
2551 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2567 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2552 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 2568 tp->advmss = dst_metric_advmss(dst);
2553 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) 2569 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2554 tp->advmss = tp->rx_opt.user_mss; 2570 tp->advmss = tp->rx_opt.user_mss;
2555 2571
@@ -2592,6 +2608,7 @@ int tcp_connect(struct sock *sk)
2592{ 2608{
2593 struct tcp_sock *tp = tcp_sk(sk); 2609 struct tcp_sock *tp = tcp_sk(sk);
2594 struct sk_buff *buff; 2610 struct sk_buff *buff;
2611 int err;
2595 2612
2596 tcp_connect_init(sk); 2613 tcp_connect_init(sk);
2597 2614
@@ -2614,7 +2631,9 @@ int tcp_connect(struct sock *sk)
2614 sk->sk_wmem_queued += buff->truesize; 2631 sk->sk_wmem_queued += buff->truesize;
2615 sk_mem_charge(sk, buff->truesize); 2632 sk_mem_charge(sk, buff->truesize);
2616 tp->packets_out += tcp_skb_pcount(buff); 2633 tp->packets_out += tcp_skb_pcount(buff);
2617 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); 2634 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2635 if (err == -ECONNREFUSED)
2636 return err;
2618 2637
2619 /* We change tp->snd_nxt after the tcp_transmit_skb() call 2638 /* We change tp->snd_nxt after the tcp_transmit_skb() call
2620 * in order to make this packet get counted in tcpOutSegs. 2639 * in order to make this packet get counted in tcpOutSegs.
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 6211e2114173..85ee7eb7e38e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -154,7 +154,7 @@ static int tcpprobe_sprint(char *tbuf, int n)
154 struct timespec tv 154 struct timespec tv
155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
156 156
157 return snprintf(tbuf, n, 157 return scnprintf(tbuf, n,
158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", 158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
159 (unsigned long) tv.tv_sec, 159 (unsigned long) tv.tv_sec,
160 (unsigned long) tv.tv_nsec, 160 (unsigned long) tv.tv_nsec,
@@ -174,7 +174,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
174 return -EINVAL; 174 return -EINVAL;
175 175
176 while (cnt < len) { 176 while (cnt < len) {
177 char tbuf[128]; 177 char tbuf[164];
178 int width; 178 int width;
179 179
180 /* Wait for data in buffer */ 180 /* Wait for data in buffer */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 28cb2d733a3c..8157b17959ee 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -110,7 +110,7 @@
110struct udp_table udp_table __read_mostly; 110struct udp_table udp_table __read_mostly;
111EXPORT_SYMBOL(udp_table); 111EXPORT_SYMBOL(udp_table);
112 112
113int sysctl_udp_mem[3] __read_mostly; 113long sysctl_udp_mem[3] __read_mostly;
114EXPORT_SYMBOL(sysctl_udp_mem); 114EXPORT_SYMBOL(sysctl_udp_mem);
115 115
116int sysctl_udp_rmem_min __read_mostly; 116int sysctl_udp_rmem_min __read_mostly;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
119int sysctl_udp_wmem_min __read_mostly; 119int sysctl_udp_wmem_min __read_mostly;
120EXPORT_SYMBOL(sysctl_udp_wmem_min); 120EXPORT_SYMBOL(sysctl_udp_wmem_min);
121 121
122atomic_t udp_memory_allocated; 122atomic_long_t udp_memory_allocated;
123EXPORT_SYMBOL(udp_memory_allocated); 123EXPORT_SYMBOL(udp_memory_allocated);
124 124
125#define MAX_UDP_PORTS 65536 125#define MAX_UDP_PORTS 65536
@@ -430,7 +430,7 @@ begin:
430 430
431 if (result) { 431 if (result) {
432exact_match: 432exact_match:
433 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 433 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
434 result = NULL; 434 result = NULL;
435 else if (unlikely(compute_score2(result, net, saddr, sport, 435 else if (unlikely(compute_score2(result, net, saddr, sport,
436 daddr, hnum, dif) < badness)) { 436 daddr, hnum, dif) < badness)) {
@@ -500,7 +500,7 @@ begin:
500 goto begin; 500 goto begin;
501 501
502 if (result) { 502 if (result) {
503 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 503 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
504 result = NULL; 504 result = NULL;
505 else if (unlikely(compute_score(result, net, saddr, hnum, sport, 505 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
506 daddr, dport, dif) < badness)) { 506 daddr, dport, dif) < badness)) {
@@ -890,15 +890,13 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
890 if (rt == NULL) { 890 if (rt == NULL) {
891 struct flowi fl = { .oif = ipc.oif, 891 struct flowi fl = { .oif = ipc.oif,
892 .mark = sk->sk_mark, 892 .mark = sk->sk_mark,
893 .nl_u = { .ip4_u = 893 .fl4_dst = faddr,
894 { .daddr = faddr, 894 .fl4_src = saddr,
895 .saddr = saddr, 895 .fl4_tos = tos,
896 .tos = tos } },
897 .proto = sk->sk_protocol, 896 .proto = sk->sk_protocol,
898 .flags = inet_sk_flowi_flags(sk), 897 .flags = inet_sk_flowi_flags(sk),
899 .uli_u = { .ports = 898 .fl_ip_sport = inet->inet_sport,
900 { .sport = inet->inet_sport, 899 .fl_ip_dport = dport };
901 .dport = dport } } };
902 struct net *net = sock_net(sk); 900 struct net *net = sock_net(sk);
903 901
904 security_sk_classify_flow(sk, &fl); 902 security_sk_classify_flow(sk, &fl);
@@ -1899,6 +1897,7 @@ struct proto udp_prot = {
1899 .compat_setsockopt = compat_udp_setsockopt, 1897 .compat_setsockopt = compat_udp_setsockopt,
1900 .compat_getsockopt = compat_udp_getsockopt, 1898 .compat_getsockopt = compat_udp_getsockopt,
1901#endif 1899#endif
1900 .clear_sk = sk_prot_clear_portaddr_nulls,
1902}; 1901};
1903EXPORT_SYMBOL(udp_prot); 1902EXPORT_SYMBOL(udp_prot);
1904 1903
@@ -2228,7 +2227,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
2228 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot 2227 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
2229 * do checksum of UDP packets sent as multiple IP fragments. 2228 * do checksum of UDP packets sent as multiple IP fragments.
2230 */ 2229 */
2231 offset = skb->csum_start - skb_headroom(skb); 2230 offset = skb_checksum_start_offset(skb);
2232 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2231 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2233 offset += skb->csum_offset; 2232 offset += skb->csum_offset;
2234 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2233 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ab76aa928fa9..aee9963f7f5a 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -57,6 +57,7 @@ struct proto udplite_prot = {
57 .compat_setsockopt = compat_udp_setsockopt, 57 .compat_setsockopt = compat_udp_setsockopt,
58 .compat_getsockopt = compat_udp_getsockopt, 58 .compat_getsockopt = compat_udp_getsockopt,
59#endif 59#endif
60 .clear_sk = sk_prot_clear_portaddr_nulls,
60}; 61};
61EXPORT_SYMBOL(udplite_prot); 62EXPORT_SYMBOL(udplite_prot);
62 63
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 6f368413eb0e..534972e114ac 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -56,7 +56,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); 56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
57 ip_select_ident(top_iph, dst->child, NULL); 57 ip_select_ident(top_iph, dst->child, NULL);
58 58
59 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); 59 top_iph->ttl = ip4_dst_hoplimit(dst->child);
60 60
61 top_iph->saddr = x->props.saddr.a4; 61 top_iph->saddr = x->props.saddr.a4;
62 top_iph->daddr = x->id.daddr.a4; 62 top_iph->daddr = x->id.daddr.a4;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index dd1fd8c473fc..b057d40addec 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -11,6 +11,7 @@
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/inetdevice.h> 13#include <linux/inetdevice.h>
14#include <linux/if_tunnel.h>
14#include <net/dst.h> 15#include <net/dst.h>
15#include <net/xfrm.h> 16#include <net/xfrm.h>
16#include <net/ip.h> 17#include <net/ip.h>
@@ -22,12 +23,8 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
22 xfrm_address_t *daddr) 23 xfrm_address_t *daddr)
23{ 24{
24 struct flowi fl = { 25 struct flowi fl = {
25 .nl_u = { 26 .fl4_dst = daddr->a4,
26 .ip4_u = { 27 .fl4_tos = tos,
27 .tos = tos,
28 .daddr = daddr->a4,
29 },
30 },
31 }; 28 };
32 struct dst_entry *dst; 29 struct dst_entry *dst;
33 struct rtable *rt; 30 struct rtable *rt;
@@ -154,6 +151,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
154 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); 151 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
155 } 152 }
156 break; 153 break;
154
155 case IPPROTO_GRE:
156 if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
157 __be16 *greflags = (__be16 *)xprth;
158 __be32 *gre_hdr = (__be32 *)xprth;
159
160 if (greflags[0] & GRE_KEY) {
161 if (greflags[0] & GRE_CSUM)
162 gre_hdr++;
163 fl->fl_gre_key = gre_hdr[1];
164 }
165 }
166 break;
167
157 default: 168 default:
158 fl->fl_ipsec_spi = 0; 169 fl->fl_ipsec_spi = 0;
159 break; 170 break;