Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c                18
-rw-r--r--  net/ipv4/arp.c                    31
-rw-r--r--  net/ipv4/devinet.c                97
-rw-r--r--  net/ipv4/esp4.c                   32
-rw-r--r--  net/ipv4/fib_frontend.c           34
-rw-r--r--  net/ipv4/fib_semantics.c           8
-rw-r--r--  net/ipv4/icmp.c                   32
-rw-r--r--  net/ipv4/igmp.c                  282
-rw-r--r--  net/ipv4/inet_connection_sock.c   22
-rw-r--r--  net/ipv4/inetpeer.c              167
-rw-r--r--  net/ipv4/ip_fragment.c             2
-rw-r--r--  net/ipv4/ip_gre.c                 52
-rw-r--r--  net/ipv4/ip_output.c              28
-rw-r--r--  net/ipv4/ipconfig.c               32
-rw-r--r--  net/ipv4/ipip.c                   21
-rw-r--r--  net/ipv4/ipmr.c                   20
-rw-r--r--  net/ipv4/netfilter.c               8
-rw-r--r--  net/ipv4/netfilter/Makefile        6
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c    2
-rw-r--r--  net/ipv4/raw.c                     7
-rw-r--r--  net/ipv4/route.c                 252
-rw-r--r--  net/ipv4/syncookies.c             15
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c         7
-rw-r--r--  net/ipv4/tcp.c                    16
-rw-r--r--  net/ipv4/tcp_input.c              51
-rw-r--r--  net/ipv4/tcp_ipv4.c               74
-rw-r--r--  net/ipv4/tcp_minisocks.c          63
-rw-r--r--  net/ipv4/tcp_output.c             37
-rw-r--r--  net/ipv4/tcp_probe.c               4
-rw-r--r--  net/ipv4/udp.c                    18
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c       2
-rw-r--r--  net/ipv4/xfrm4_policy.c           47
32 files changed, 780 insertions, 707 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f581f77d1097..f2b61107df6c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1148,21 +1148,13 @@ int inet_sk_rebuild_header(struct sock *sk)
 	struct flowi fl = {
 		.oif = sk->sk_bound_dev_if,
 		.mark = sk->sk_mark,
-		.nl_u = {
-			.ip4_u = {
-				.daddr	= daddr,
-				.saddr	= inet->inet_saddr,
-				.tos	= RT_CONN_FLAGS(sk),
-			},
-		},
+		.fl4_dst = daddr,
+		.fl4_src = inet->inet_saddr,
+		.fl4_tos = RT_CONN_FLAGS(sk),
 		.proto = sk->sk_protocol,
 		.flags = inet_sk_flowi_flags(sk),
-		.uli_u = {
-			.ports = {
-				.sport = inet->inet_sport,
-				.dport = inet->inet_dport,
-			},
-		},
+		.fl_ip_sport = inet->inet_sport,
+		.fl_ip_dport = inet->inet_dport,
 	};

 	security_sk_classify_flow(sk, &fl);
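[Note] The af_inet.c hunk above is representative of the whole series: the nested .nl_u.ip4_u / .uli_u.ports initializers are replaced by flat fl4_* and fl_ip_* field names. At this point in the series those names are shorthand for the same underlying flowi members, so the conversion is purely mechanical. A minimal sketch of the two spellings (daddr, saddr, tos and the ports are placeholders, not values from the tree):

	/* old spelling: keys live in nested unions */
	struct flowi fl_old = {
		.nl_u = { .ip4_u = { .daddr = daddr,
				     .saddr = saddr,
				     .tos   = tos } },
		.uli_u = { .ports = { .sport = sport,
				      .dport = dport } },
	};

	/* new spelling: same storage, flat helper names */
	struct flowi fl_new = {
		.fl4_dst     = daddr,
		.fl4_src     = saddr,
		.fl4_tos     = tos,
		.fl_ip_sport = sport,
		.fl_ip_dport = dport,
	};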
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index d8e540c5b071..a2fc7b961dbc 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -433,8 +433,8 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
 
 static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 {
-	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
-						 .saddr = tip } } };
+	struct flowi fl = { .fl4_dst = sip,
+			    .fl4_src = tip };
 	struct rtable *rt;
 	int flag = 0;
 	/*unsigned long now; */
@@ -883,7 +883,7 @@ static int arp_process(struct sk_buff *skb)
 
 	dont_send = arp_ignore(in_dev, sip, tip);
 	if (!dont_send && IN_DEV_ARPFILTER(in_dev))
-		dont_send |= arp_filter(sip, tip, dev);
+		dont_send = arp_filter(sip, tip, dev);
 	if (!dont_send) {
 		n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 		if (n) {
@@ -1017,13 +1017,14 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
 		IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
 		return 0;
 	}
-	if (__in_dev_get_rtnl(dev)) {
-		IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
+	if (__in_dev_get_rcu(dev)) {
+		IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on);
 		return 0;
 	}
 	return -ENXIO;
 }
 
+/* must be called with rcu_read_lock() */
 static int arp_req_set_public(struct net *net, struct arpreq *r,
 		struct net_device *dev)
 {
@@ -1033,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 	if (mask && mask != htonl(0xFFFFFFFF))
 		return -EINVAL;
 	if (!dev && (r->arp_flags & ATF_COM)) {
-		dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
+		dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
 				      r->arp_ha.sa_data);
 		if (!dev)
 			return -ENODEV;
@@ -1061,8 +1062,8 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
-						    .tos = RTO_ONLINK } };
+		struct flowi fl = { .fl4_dst = ip,
+				    .fl4_tos = RTO_ONLINK };
 		struct rtable *rt;
 		err = ip_route_output_key(net, &rt, &fl);
 		if (err != 0)
@@ -1169,8 +1170,8 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
-						    .tos = RTO_ONLINK } };
+		struct flowi fl = { .fl4_dst = ip,
+				    .fl4_tos = RTO_ONLINK };
 		struct rtable *rt;
 		err = ip_route_output_key(net, &rt, &fl);
 		if (err != 0)
@@ -1225,10 +1226,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	if (!(r.arp_flags & ATF_NETMASK))
 		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
 							   htonl(0xFFFFFFFFUL);
-	rtnl_lock();
+	rcu_read_lock();
 	if (r.arp_dev[0]) {
 		err = -ENODEV;
-		dev = __dev_get_by_name(net, r.arp_dev);
+		dev = dev_get_by_name_rcu(net, r.arp_dev);
 		if (dev == NULL)
 			goto out;
 
@@ -1252,12 +1253,12 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		break;
 	case SIOCGARP:
 		err = arp_req_get(&r, dev);
-		if (!err && copy_to_user(arg, &r, sizeof(r)))
-			err = -EFAULT;
 		break;
 	}
 out:
-	rtnl_unlock();
+	rcu_read_unlock();
+	if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
+		err = -EFAULT;
 	return err;
 }
 
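[Note] The arp_ioctl() change swaps the RTNL mutex for rcu_read_lock() and, consequently, moves the SIOCGARP copy_to_user() out of the locked region: copy_to_user() may fault and sleep, which is not permitted inside an RCU read-side critical section. A sketch of the resulting pattern (error handling trimmed, illustrative only):

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, r.arp_dev);	/* no refcount taken */
	err = dev ? arp_req_get(&r, dev) : -ENODEV;
	rcu_read_unlock();

	/* user copy only after the read-side section ends */
	if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
		err = -EFAULT;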
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index dc94b0316b78..748cb5b337bd 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1256,6 +1256,87 @@ errout:
 	rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
 }
 
+static size_t inet_get_link_af_size(const struct net_device *dev)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+	if (!in_dev)
+		return 0;
+
+	return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
+}
+
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct nlattr *nla;
+	int i;
+
+	if (!in_dev)
+		return -ENODATA;
+
+	nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
+	if (nla == NULL)
+		return -EMSGSIZE;
+
+	for (i = 0; i < IPV4_DEVCONF_MAX; i++)
+		((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
+
+	return 0;
+}
+
+static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
+	[IFLA_INET_CONF]	= { .type = NLA_NESTED },
+};
+
+static int inet_validate_link_af(const struct net_device *dev,
+				 const struct nlattr *nla)
+{
+	struct nlattr *a, *tb[IFLA_INET_MAX+1];
+	int err, rem;
+
+	if (dev && !__in_dev_get_rtnl(dev))
+		return -EAFNOSUPPORT;
+
+	err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[IFLA_INET_CONF]) {
+		nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
+			int cfgid = nla_type(a);
+
+			if (nla_len(a) < 4)
+				return -EINVAL;
+
+			if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct nlattr *a, *tb[IFLA_INET_MAX+1];
+	int rem;
+
+	if (!in_dev)
+		return -EAFNOSUPPORT;
+
+	if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
+		BUG();
+
+	if (tb[IFLA_INET_CONF]) {
+		nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
+			ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
+	}
+
+	return 0;
+}
+
 #ifdef CONFIG_SYSCTL
 
 static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1349,9 +1430,9 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 	return ret;
 }
 
-int ipv4_doint_and_flush(ctl_table *ctl, int write,
-			 void __user *buffer,
-			 size_t *lenp, loff_t *ppos)
+static int ipv4_doint_and_flush(ctl_table *ctl, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
@@ -1619,6 +1700,14 @@ static __net_initdata struct pernet_operations devinet_ops = {
 	.exit = devinet_exit_net,
 };
 
+static struct rtnl_af_ops inet_af_ops = {
+	.family		  = AF_INET,
+	.fill_link_af	  = inet_fill_link_af,
+	.get_link_af_size = inet_get_link_af_size,
+	.validate_link_af = inet_validate_link_af,
+	.set_link_af	  = inet_set_link_af,
+};
+
 void __init devinet_init(void)
 {
 	register_pernet_subsys(&devinet_ops);
@@ -1626,6 +1715,8 @@ void __init devinet_init(void)
 	register_gifconf(PF_INET, inet_gifconf);
 	register_netdevice_notifier(&ip_netdev_notifier);
 
+	rtnl_af_register(&inet_af_ops);
+
 	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
 	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
 	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
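[Note] The new rtnl_af_ops hooks expose the per-device IPv4 devconf array over rtnetlink: on output, IFLA_INET_CONF is a flat blob of IPV4_DEVCONF_MAX u32 slots; on input it is parsed as a nested attribute set whose attribute type is the 1-based devconf index (see inet_validate_link_af() above). A hypothetical userspace sketch of a set request using libnl-style helpers; the helper calls and the surrounding ifinfomsg setup are assumptions, not part of this patch:

	struct nlattr *af_spec, *af, *conf;

	/* ... fill struct ifinfomsg for the target device first ... */
	af_spec = nla_nest_start(msg, IFLA_AF_SPEC);
	af      = nla_nest_start(msg, AF_INET);
	conf    = nla_nest_start(msg, IFLA_INET_CONF);
	/* attribute type = devconf index (1..IPV4_DEVCONF_MAX), payload = u32 */
	nla_put_u32(msg, IPV4_DEVCONF_FORWARDING, 1);
	nla_nest_end(msg, conf);
	nla_nest_end(msg, af);
	nla_nest_end(msg, af_spec);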
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 14ca1f1c3fb0..e42a905180f0 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,8 @@ struct esp_skb_cb {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -117,25 +119,35 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int blksize;
 	int clen;
 	int alen;
+	int plen;
+	int tfclen;
 	int nfrags;
 
 	/* skb is pure payload to encrypt */
 
 	err = -ENOMEM;
 
-	/* Round to block size */
-	clen = skb->len;
-
 	esp = x->data;
 	aead = esp->aead;
 	alen = crypto_aead_authsize(aead);
 
+	tfclen = 0;
+	if (x->tfcpad) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 padto;
+
+		padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+		if (skb->len < padto)
+			tfclen = padto - skb->len;
+	}
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	clen = ALIGN(clen + 2, blksize);
+	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	if (esp->padlen)
 		clen = ALIGN(clen, esp->padlen);
+	plen = clen - skb->len - tfclen;
 
-	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
 		goto error;
 	nfrags = err;
 
@@ -150,13 +162,17 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
 	do {
 		int i;
-		for (i=0; i<clen-skb->len - 2; i++)
+		for (i = 0; i < plen - 2; i++)
 			tail[i] = i + 1;
 	} while (0);
-	tail[clen - skb->len - 2] = (clen - skb->len) - 2;
-	tail[clen - skb->len - 1] = *skb_mac_header(skb);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
 	skb_push(skb, -skb_network_offset(skb));
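[Note] Worked example of the new trailer arithmetic in esp_output(), under assumed numbers (skb->len = 100, x->tfcpad = 256, blksize = 16, esp->padlen = 0, and an MTU large enough that padto = 256):

	tfclen = padto - skb->len         = 256 - 100       = 156
	clen   = ALIGN(100 + 2 + 156, 16) = ALIGN(258, 16)  = 272
	plen   = clen - skb->len - tfclen = 272 - 100 - 156 = 16

The trailer then carries 156 zero bytes of TFC filler, plen - 2 = 14 self-describing pad bytes (1, 2, ..., 14), the pad-length byte (14) and the next-header byte, followed by alen bytes of ICV — exactly the tfclen + plen + alen that skb_cow_data() was asked to provide.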
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c19c1f739fba..1d2cdd43a878 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -158,11 +158,7 @@ static void fib_flush(struct net *net)
 struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
 	struct flowi fl = {
-		.nl_u = {
-			.ip4_u = {
-				.daddr = addr
-			}
-		},
+		.fl4_dst = addr,
 	};
 	struct fib_result res = { 0 };
 	struct net_device *dev = NULL;
@@ -199,7 +195,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 					    const struct net_device *dev,
 					    __be32 addr)
 {
-	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+	struct flowi		fl = { .fl4_dst = addr };
 	struct fib_result	res;
 	unsigned ret = RTN_BROADCAST;
 	struct fib_table *local_table;
@@ -253,13 +249,9 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 {
 	struct in_device *in_dev;
 	struct flowi fl = {
-		.nl_u = {
-			.ip4_u = {
-				.daddr = src,
-				.saddr = dst,
-				.tos = tos
-			}
-		},
+		.fl4_dst = src,
+		.fl4_src = dst,
+		.fl4_tos = tos,
 		.mark = mark,
 		.iif = oif
 	};
@@ -859,13 +851,9 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 	struct fib_result res;
 	struct flowi fl = {
 		.mark = frn->fl_mark,
-		.nl_u = {
-			.ip4_u = {
-				.daddr = frn->fl_addr,
-				.tos = frn->fl_tos,
-				.scope = frn->fl_scope
-			}
-		}
+		.fl4_dst = frn->fl_addr,
+		.fl4_tos = frn->fl_tos,
+		.fl4_scope = frn->fl_scope,
 	};
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1005,7 +993,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		rt_cache_flush_batch();
+		/* The batch unregister is only called on the first
+		 * device in the list of devices being unregistered.
+		 * Therefore we should not pass dev_net(dev) in here.
+		 */
+		rt_cache_flush_batch(NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3e0da3ef6116..12d3dc3df1b7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -563,12 +563,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 	rcu_read_lock();
 	{
 		struct flowi fl = {
-			.nl_u = {
-				.ip4_u = {
-					.daddr = nh->nh_gw,
-					.scope = cfg->fc_scope + 1,
-				},
-			},
+			.fl4_dst = nh->nh_gw,
+			.fl4_scope = cfg->fc_scope + 1,
 			.oif = nh->nh_oif,
 		};
 
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e5d1a44bcbdf..4aa1b7f01ea0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -386,10 +386,9 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 		daddr = icmp_param->replyopts.faddr;
 	}
 	{
-		struct flowi fl = { .nl_u = { .ip4_u =
-					      { .daddr = daddr,
-						.saddr = rt->rt_spec_dst,
-						.tos = RT_TOS(ip_hdr(skb)->tos) } },
+		struct flowi fl = { .fl4_dst = daddr,
+				    .fl4_src = rt->rt_spec_dst,
+				    .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
 				    .proto = IPPROTO_ICMP };
 		security_skb_classify_flow(skb, &fl);
 		if (ip_route_output_key(net, &rt, &fl))
@@ -506,8 +505,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		struct net_device *dev = NULL;
 
 		rcu_read_lock();
-		if (rt->fl.iif &&
+		if (rt_is_input_route(rt) &&
 		    net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
 			dev = dev_get_by_index_rcu(net, rt->fl.iif);
 
 		if (dev)
@@ -542,22 +541,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 
 	{
 		struct flowi fl = {
-			.nl_u = {
-				.ip4_u = {
-					.daddr = icmp_param.replyopts.srr ?
-						icmp_param.replyopts.faddr :
-						iph->saddr,
-					.saddr = saddr,
-					.tos = RT_TOS(tos)
-				}
-			},
+			.fl4_dst = icmp_param.replyopts.srr ?
+				   icmp_param.replyopts.faddr : iph->saddr,
+			.fl4_src = saddr,
+			.fl4_tos = RT_TOS(tos),
 			.proto = IPPROTO_ICMP,
-			.uli_u = {
-				.icmpt = {
-					.type = type,
-					.code = code
-				}
-			}
+			.fl_icmp_type = type,
+			.fl_icmp_code = code,
 		};
 		int err;
 		struct rtable *rt2;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 3c53c2d89e3b..e0e77e297de3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -149,21 +149,37 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc);
 static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 			 int sfcount, __be32 *psfsrc, int delta);
 
+
+static void ip_mc_list_reclaim(struct rcu_head *head)
+{
+	kfree(container_of(head, struct ip_mc_list, rcu));
+}
+
 static void ip_ma_put(struct ip_mc_list *im)
 {
 	if (atomic_dec_and_test(&im->refcnt)) {
 		in_dev_put(im->interface);
-		kfree(im);
+		call_rcu(&im->rcu, ip_mc_list_reclaim);
 	}
 }
 
+#define for_each_pmc_rcu(in_dev, pmc)				\
+	for (pmc = rcu_dereference(in_dev->mc_list);		\
+	     pmc != NULL;					\
+	     pmc = rcu_dereference(pmc->next_rcu))
+
+#define for_each_pmc_rtnl(in_dev, pmc)				\
+	for (pmc = rtnl_dereference(in_dev->mc_list);		\
+	     pmc != NULL;					\
+	     pmc = rtnl_dereference(pmc->next_rcu))
+
 #ifdef CONFIG_IP_MULTICAST
 
 /*
  *	Timer management
  */
 
-static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
+static void igmp_stop_timer(struct ip_mc_list *im)
 {
 	spin_lock_bh(&im->lock);
 	if (del_timer(&im->timer))
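[Note] The two iterator macros added above encode the file's locking rule: for_each_pmc_rcu() is for readers inside rcu_read_lock(), for_each_pmc_rtnl() for writers that hold RTNL (where rtnl_dereference() needs no read-side lock). A minimal sketch of each pairing, with illustrative callers:

	/* reader: any context, under rcu_read_lock() */
	rcu_read_lock();
	for_each_pmc_rcu(in_dev, pmc)
		if (pmc->multiaddr == group)
			break;
	rcu_read_unlock();

	/* writer: configuration paths, RTNL held */
	ASSERT_RTNL();
	for_each_pmc_rtnl(in_dev, pmc)
		igmp_group_added(pmc);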
@@ -284,6 +300,8 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
 	return scount;
 }
 
+#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
+
 static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 {
 	struct sk_buff *skb;
@@ -292,14 +310,20 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	struct igmpv3_report *pig;
 	struct net *net = dev_net(dev);
 
-	skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
-	if (skb == NULL)
-		return NULL;
+	while (1) {
+		skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
+				GFP_ATOMIC | __GFP_NOWARN);
+		if (skb)
+			break;
+		size >>= 1;
+		if (size < 256)
+			return NULL;
+	}
+	igmp_skb_size(skb) = size;
 
 	{
 		struct flowi fl = { .oif = dev->ifindex,
-				    .nl_u = { .ip4_u = {
-				    .daddr = IGMPV3_ALL_MCR } },
+				    .fl4_dst = IGMPV3_ALL_MCR,
 				    .proto = IPPROTO_IGMP };
 		if (ip_route_output_key(net, &rt, &fl)) {
 			kfree_skb(skb);
@@ -384,7 +408,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
 	return skb;
 }
 
-#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
+#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
 	skb_tailroom(skb)) : 0)
 
 static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
@@ -502,8 +526,8 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
 	int type;
 
 	if (!pmc) {
-		read_lock(&in_dev->mc_list_lock);
-		for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+		rcu_read_lock();
+		for_each_pmc_rcu(in_dev, pmc) {
 			if (pmc->multiaddr == IGMP_ALL_HOSTS)
 				continue;
 			spin_lock_bh(&pmc->lock);
@@ -514,7 +538,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
 			skb = add_grec(skb, pmc, type, 0, 0);
 			spin_unlock_bh(&pmc->lock);
 		}
-		read_unlock(&in_dev->mc_list_lock);
+		rcu_read_unlock();
 	} else {
 		spin_lock_bh(&pmc->lock);
 		if (pmc->sfcount[MCAST_EXCLUDE])
@@ -556,7 +580,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
 	struct sk_buff *skb = NULL;
 	int type, dtype;
 
-	read_lock(&in_dev->mc_list_lock);
+	rcu_read_lock();
 	spin_lock_bh(&in_dev->mc_tomb_lock);
 
 	/* deleted MCA's */
@@ -593,7 +617,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
 	spin_unlock_bh(&in_dev->mc_tomb_lock);
 
 	/* change recs */
-	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+	for_each_pmc_rcu(in_dev, pmc) {
 		spin_lock_bh(&pmc->lock);
 		if (pmc->sfcount[MCAST_EXCLUDE]) {
 			type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -616,7 +640,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
 		}
 		spin_unlock_bh(&pmc->lock);
 	}
-	read_unlock(&in_dev->mc_list_lock);
+	rcu_read_unlock();
 
 	if (!skb)
 		return;
@@ -644,7 +668,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 
 	{
 		struct flowi fl = { .oif = dev->ifindex,
-				    .nl_u = { .ip4_u = { .daddr = dst } },
+				    .fl4_dst = dst,
 				    .proto = IPPROTO_IGMP };
 		if (ip_route_output_key(net, &rt, &fl))
 			return -1;
@@ -813,14 +837,14 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
 	if (group == IGMP_ALL_HOSTS)
 		return;
 
-	read_lock(&in_dev->mc_list_lock);
-	for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, im) {
 		if (im->multiaddr == group) {
 			igmp_stop_timer(im);
 			break;
 		}
 	}
-	read_unlock(&in_dev->mc_list_lock);
+	rcu_read_unlock();
 }
 
 static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
@@ -906,8 +930,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	 * - Use the igmp->igmp_code field as the maximum
 	 *   delay possible
 	 */
-	read_lock(&in_dev->mc_list_lock);
-	for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, im) {
 		int changed;
 
 		if (group && group != im->multiaddr)
@@ -925,7 +949,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		if (changed)
 			igmp_mod_timer(im, max_delay);
 	}
-	read_unlock(&in_dev->mc_list_lock);
+	rcu_read_unlock();
 }
 
 /* called in rcu_read_lock() section */
@@ -961,7 +985,7 @@ int igmp_rcv(struct sk_buff *skb)
 	case IGMP_HOST_MEMBERSHIP_REPORT:
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
 		/* Is it our report looped back? */
-		if (skb_rtable(skb)->fl.iif == 0)
+		if (rt_is_output_route(skb_rtable(skb)))
 			break;
 		/* don't rely on MC router hearing unicast reports */
 		if (skb->pkt_type == PACKET_MULTICAST ||
@@ -1110,8 +1134,8 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
 		kfree(pmc);
 	}
 	/* clear dead sources, too */
-	read_lock(&in_dev->mc_list_lock);
-	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
 		struct ip_sf_list *psf, *psf_next;
 
 		spin_lock_bh(&pmc->lock);
@@ -1123,7 +1147,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
 			kfree(psf);
 		}
 	}
-	read_unlock(&in_dev->mc_list_lock);
+	rcu_read_unlock();
 }
 #endif
 
@@ -1209,7 +1233,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 
 	ASSERT_RTNL();
 
-	for (im=in_dev->mc_list; im; im=im->next) {
+	for_each_pmc_rtnl(in_dev, im) {
 		if (im->multiaddr == addr) {
 			im->users++;
 			ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
@@ -1217,7 +1241,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 		}
 	}
 
-	im = kmalloc(sizeof(*im), GFP_KERNEL);
+	im = kzalloc(sizeof(*im), GFP_KERNEL);
 	if (!im)
 		goto out;
 
@@ -1227,26 +1251,18 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	im->multiaddr = addr;
 	/* initial mode is (EX, empty) */
 	im->sfmode = MCAST_EXCLUDE;
-	im->sfcount[MCAST_INCLUDE] = 0;
 	im->sfcount[MCAST_EXCLUDE] = 1;
-	im->sources = NULL;
-	im->tomb = NULL;
-	im->crcount = 0;
 	atomic_set(&im->refcnt, 1);
 	spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
-	im->tm_running = 0;
 	setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
 	im->unsolicit_count = IGMP_Unsolicited_Report_Count;
-	im->reporter = 0;
-	im->gsquery = 0;
 #endif
-	im->loaded = 0;
-	write_lock_bh(&in_dev->mc_list_lock);
-	im->next = in_dev->mc_list;
-	in_dev->mc_list = im;
+
+	im->next_rcu = in_dev->mc_list;
 	in_dev->mc_count++;
-	write_unlock_bh(&in_dev->mc_list_lock);
+	rcu_assign_pointer(in_dev->mc_list, im);
+
 #ifdef CONFIG_IP_MULTICAST
 	igmpv3_del_delrec(in_dev, im->multiaddr);
 #endif
@@ -1260,26 +1276,32 @@ EXPORT_SYMBOL(ip_mc_inc_group);
 
 /*
  *	Resend IGMP JOIN report; used for bonding.
+ *	Called with rcu_read_lock()
 */
-void ip_mc_rejoin_group(struct ip_mc_list *im)
+void ip_mc_rejoin_groups(struct in_device *in_dev)
 {
 #ifdef CONFIG_IP_MULTICAST
-	struct in_device *in_dev = im->interface;
+	struct ip_mc_list *im;
+	int type;
 
-	if (im->multiaddr == IGMP_ALL_HOSTS)
-		return;
+	for_each_pmc_rcu(in_dev, im) {
+		if (im->multiaddr == IGMP_ALL_HOSTS)
+			continue;
 
-	/* a failover is happening and switches
-	 * must be notified immediately */
-	if (IGMP_V1_SEEN(in_dev))
-		igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
-	else if (IGMP_V2_SEEN(in_dev))
-		igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
-	else
-		igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
+		/* a failover is happening and switches
+		 * must be notified immediately
+		 */
+		if (IGMP_V1_SEEN(in_dev))
+			type = IGMP_HOST_MEMBERSHIP_REPORT;
+		else if (IGMP_V2_SEEN(in_dev))
+			type = IGMPV2_HOST_MEMBERSHIP_REPORT;
+		else
+			type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+		igmp_send_report(in_dev, im, type);
+	}
 #endif
 }
-EXPORT_SYMBOL(ip_mc_rejoin_group);
+EXPORT_SYMBOL(ip_mc_rejoin_groups);
 
 /*
  *	A socket has left a multicast group on device dev
@@ -1287,17 +1309,18 @@ EXPORT_SYMBOL(ip_mc_rejoin_group);
 
 void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
 {
-	struct ip_mc_list *i, **ip;
+	struct ip_mc_list *i;
+	struct ip_mc_list __rcu **ip;
 
 	ASSERT_RTNL();
 
-	for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
+	for (ip = &in_dev->mc_list;
+	     (i = rtnl_dereference(*ip)) != NULL;
+	     ip = &i->next_rcu) {
 		if (i->multiaddr == addr) {
 			if (--i->users == 0) {
-				write_lock_bh(&in_dev->mc_list_lock);
-				*ip = i->next;
+				*ip = i->next_rcu;
 				in_dev->mc_count--;
-				write_unlock_bh(&in_dev->mc_list_lock);
 				igmp_group_dropped(i);
 
 				if (!in_dev->dead)
@@ -1316,34 +1339,34 @@ EXPORT_SYMBOL(ip_mc_dec_group);
 
 void ip_mc_unmap(struct in_device *in_dev)
 {
-	struct ip_mc_list *i;
+	struct ip_mc_list *pmc;
 
 	ASSERT_RTNL();
 
-	for (i = in_dev->mc_list; i; i = i->next)
-		igmp_group_dropped(i);
+	for_each_pmc_rtnl(in_dev, pmc)
+		igmp_group_dropped(pmc);
 }
 
 void ip_mc_remap(struct in_device *in_dev)
 {
-	struct ip_mc_list *i;
+	struct ip_mc_list *pmc;
 
 	ASSERT_RTNL();
 
-	for (i = in_dev->mc_list; i; i = i->next)
-		igmp_group_added(i);
+	for_each_pmc_rtnl(in_dev, pmc)
+		igmp_group_added(pmc);
 }
 
 /* Device going down */
 
 void ip_mc_down(struct in_device *in_dev)
 {
-	struct ip_mc_list *i;
+	struct ip_mc_list *pmc;
 
 	ASSERT_RTNL();
 
-	for (i=in_dev->mc_list; i; i=i->next)
-		igmp_group_dropped(i);
+	for_each_pmc_rtnl(in_dev, pmc)
+		igmp_group_dropped(pmc);
 
 #ifdef CONFIG_IP_MULTICAST
 	in_dev->mr_ifc_count = 0;
@@ -1374,7 +1397,6 @@ void ip_mc_init_dev(struct in_device *in_dev)
 	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
 #endif
 
-	rwlock_init(&in_dev->mc_list_lock);
 	spin_lock_init(&in_dev->mc_tomb_lock);
 }
 
@@ -1382,14 +1404,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
 
 void ip_mc_up(struct in_device *in_dev)
 {
-	struct ip_mc_list *i;
+	struct ip_mc_list *pmc;
 
 	ASSERT_RTNL();
 
 	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
 
-	for (i=in_dev->mc_list; i; i=i->next)
-		igmp_group_added(i);
+	for_each_pmc_rtnl(in_dev, pmc)
+		igmp_group_added(pmc);
 }
 
 /*
@@ -1405,24 +1427,19 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
 	/* Deactivate timers */
 	ip_mc_down(in_dev);
 
-	write_lock_bh(&in_dev->mc_list_lock);
-	while ((i = in_dev->mc_list) != NULL) {
-		in_dev->mc_list = i->next;
+	while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
+		in_dev->mc_list = i->next_rcu;
 		in_dev->mc_count--;
-		write_unlock_bh(&in_dev->mc_list_lock);
+
 		igmp_group_dropped(i);
 		ip_ma_put(i);
-
-		write_lock_bh(&in_dev->mc_list_lock);
 	}
-	write_unlock_bh(&in_dev->mc_list_lock);
 }
 
 /* RTNL is locked */
 static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 {
-	struct flowi fl = { .nl_u = { .ip4_u =
-			{ .daddr = imr->imr_multiaddr.s_addr } } };
+	struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr };
 	struct rtable *rt;
 	struct net_device *dev = NULL;
 	struct in_device *idev = NULL;
@@ -1513,18 +1530,18 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 
 	if (!in_dev)
 		return -ENODEV;
-	read_lock(&in_dev->mc_list_lock);
-	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
 		if (*pmca == pmc->multiaddr)
 			break;
 	}
 	if (!pmc) {
 		/* MCA not found?? bug */
-		read_unlock(&in_dev->mc_list_lock);
+		rcu_read_unlock();
 		return -ESRCH;
 	}
 	spin_lock_bh(&pmc->lock);
-	read_unlock(&in_dev->mc_list_lock);
+	rcu_read_unlock();
 #ifdef CONFIG_IP_MULTICAST
 	sf_markstate(pmc);
 #endif
@@ -1685,18 +1702,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 
 	if (!in_dev)
 		return -ENODEV;
-	read_lock(&in_dev->mc_list_lock);
-	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
 		if (*pmca == pmc->multiaddr)
 			break;
 	}
 	if (!pmc) {
 		/* MCA not found?? bug */
-		read_unlock(&in_dev->mc_list_lock);
+		rcu_read_unlock();
 		return -ESRCH;
 	}
 	spin_lock_bh(&pmc->lock);
-	read_unlock(&in_dev->mc_list_lock);
+	rcu_read_unlock();
 
 #ifdef CONFIG_IP_MULTICAST
 	sf_markstate(pmc);
@@ -1793,7 +1810,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 
 	err = -EADDRINUSE;
 	ifindex = imr->imr_ifindex;
-	for (i = inet->mc_list; i; i = i->next) {
+	for_each_pmc_rtnl(inet, i) {
 		if (i->multi.imr_multiaddr.s_addr == addr &&
 		    i->multi.imr_ifindex == ifindex)
 			goto done;
@@ -1807,7 +1824,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 		goto done;
 
 	memcpy(&iml->multi, imr, sizeof(*imr));
-	iml->next = inet->mc_list;
+	iml->next_rcu = inet->mc_list;
 	iml->sflist = NULL;
 	iml->sfmode = MCAST_EXCLUDE;
 	rcu_assign_pointer(inet->mc_list, iml);
@@ -1821,17 +1838,14 @@ EXPORT_SYMBOL(ip_mc_join_group);
 
 static void ip_sf_socklist_reclaim(struct rcu_head *rp)
 {
-	struct ip_sf_socklist *psf;
-
-	psf = container_of(rp, struct ip_sf_socklist, rcu);
+	kfree(container_of(rp, struct ip_sf_socklist, rcu));
 	/* sk_omem_alloc should have been decreased by the caller*/
-	kfree(psf);
 }
 
 static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
 			   struct in_device *in_dev)
 {
-	struct ip_sf_socklist *psf = iml->sflist;
+	struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
 	int err;
 
 	if (psf == NULL) {
@@ -1851,11 +1865,8 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
 
 static void ip_mc_socklist_reclaim(struct rcu_head *rp)
 {
-	struct ip_mc_socklist *iml;
-
-	iml = container_of(rp, struct ip_mc_socklist, rcu);
+	kfree(container_of(rp, struct ip_mc_socklist, rcu));
 	/* sk_omem_alloc should have been decreased by the caller*/
-	kfree(iml);
 }
 
 
@@ -1866,7 +1877,8 @@ static void ip_mc_socklist_reclaim(struct rcu_head *rp)
 int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	struct ip_mc_socklist *iml, **imlp;
+	struct ip_mc_socklist *iml;
+	struct ip_mc_socklist __rcu **imlp;
 	struct in_device *in_dev;
 	struct net *net = sock_net(sk);
 	__be32 group = imr->imr_multiaddr.s_addr;
@@ -1876,7 +1888,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 	rtnl_lock();
 	in_dev = ip_mc_find_dev(net, imr);
 	ifindex = imr->imr_ifindex;
-	for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
+	for (imlp = &inet->mc_list;
+	     (iml = rtnl_dereference(*imlp)) != NULL;
+	     imlp = &iml->next_rcu) {
 		if (iml->multi.imr_multiaddr.s_addr != group)
 			continue;
 		if (ifindex) {
@@ -1888,7 +1902,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 
 		(void) ip_mc_leave_src(sk, iml, in_dev);
 
-		rcu_assign_pointer(*imlp, iml->next);
+		*imlp = iml->next_rcu;
 
 		if (in_dev)
 			ip_mc_dec_group(in_dev, group);
@@ -1934,7 +1948,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	}
 	err = -EADDRNOTAVAIL;
 
-	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+	for_each_pmc_rtnl(inet, pmc) {
 		if ((pmc->multi.imr_multiaddr.s_addr ==
 		     imr.imr_multiaddr.s_addr) &&
 		    (pmc->multi.imr_ifindex == imr.imr_ifindex))
@@ -1958,7 +1972,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 		pmc->sfmode = omode;
 	}
 
-	psl = pmc->sflist;
+	psl = rtnl_dereference(pmc->sflist);
 	if (!add) {
 		if (!psl)
 			goto done;	/* err = -EADDRNOTAVAIL */
@@ -2077,7 +2091,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 		goto done;
 	}
 
-	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+	for_each_pmc_rtnl(inet, pmc) {
 		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
 		    pmc->multi.imr_ifindex == imr.imr_ifindex)
 			break;
@@ -2107,7 +2121,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 		(void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
 				     msf->imsf_fmode, 0, NULL, 0);
 	}
-	psl = pmc->sflist;
+	psl = rtnl_dereference(pmc->sflist);
 	if (psl) {
 		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
 			psl->sl_count, psl->sl_addr, 0);
@@ -2155,7 +2169,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 	}
 	err = -EADDRNOTAVAIL;
 
-	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+	for_each_pmc_rtnl(inet, pmc) {
 		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
 		    pmc->multi.imr_ifindex == imr.imr_ifindex)
 			break;
@@ -2163,7 +2177,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 	if (!pmc)		/* must have a prior join */
 		goto done;
 	msf->imsf_fmode = pmc->sfmode;
-	psl = pmc->sflist;
+	psl = rtnl_dereference(pmc->sflist);
 	rtnl_unlock();
 	if (!psl) {
 		len = 0;
@@ -2208,7 +2222,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 
 	err = -EADDRNOTAVAIL;
 
-	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+	for_each_pmc_rtnl(inet, pmc) {
 		if (pmc->multi.imr_multiaddr.s_addr == addr &&
 		    pmc->multi.imr_ifindex == gsf->gf_interface)
 			break;
@@ -2216,7 +2230,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	if (!pmc)		/* must have a prior join */
 		goto done;
 	gsf->gf_fmode = pmc->sfmode;
-	psl = pmc->sflist;
+	psl = rtnl_dereference(pmc->sflist);
 	rtnl_unlock();
 	count = psl ? psl->sl_count : 0;
 	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
@@ -2257,7 +2271,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
 		goto out;
 
 	rcu_read_lock();
-	for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) {
+	for_each_pmc_rcu(inet, pmc) {
 		if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
 		    pmc->multi.imr_ifindex == dif)
 			break;
@@ -2265,7 +2279,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
 	ret = inet->mc_all;
 	if (!pmc)
 		goto unlock;
-	psl = pmc->sflist;
+	psl = rcu_dereference(pmc->sflist);
 	ret = (pmc->sfmode == MCAST_EXCLUDE);
 	if (!psl)
 		goto unlock;
@@ -2300,10 +2314,10 @@ void ip_mc_drop_socket(struct sock *sk)
 		return;
 
 	rtnl_lock();
-	while ((iml = inet->mc_list) != NULL) {
+	while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
 		struct in_device *in_dev;
-		rcu_assign_pointer(inet->mc_list, iml->next);
 
+		inet->mc_list = iml->next_rcu;
 		in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
 		(void) ip_mc_leave_src(sk, iml, in_dev);
 		if (in_dev != NULL)
@@ -2321,8 +2335,8 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
 	struct ip_sf_list *psf;
 	int rv = 0;
 
-	read_lock(&in_dev->mc_list_lock);
-	for (im=in_dev->mc_list; im; im=im->next) {
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, im) {
 		if (im->multiaddr == mc_addr)
 			break;
 	}
@@ -2343,7 +2357,7 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
 		} else
 			rv = 1; /* unspecified source; tentatively allow */
 	}
-	read_unlock(&in_dev->mc_list_lock);
+	rcu_read_unlock();
 	return rv;
 }
 
@@ -2369,13 +2383,11 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
 		in_dev = __in_dev_get_rcu(state->dev);
 		if (!in_dev)
 			continue;
-		read_lock(&in_dev->mc_list_lock);
-		im = in_dev->mc_list;
+		im = rcu_dereference(in_dev->mc_list);
 		if (im) {
 			state->in_dev = in_dev;
 			break;
 		}
-		read_unlock(&in_dev->mc_list_lock);
 	}
 	return im;
 }
@@ -2383,11 +2395,9 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
 static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
 {
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
-	im = im->next;
-	while (!im) {
-		if (likely(state->in_dev != NULL))
-			read_unlock(&state->in_dev->mc_list_lock);
 
+	im = rcu_dereference(im->next_rcu);
+	while (!im) {
 		state->dev = next_net_device_rcu(state->dev);
 		if (!state->dev) {
 			state->in_dev = NULL;
@@ -2396,8 +2406,7 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li
 		state->in_dev = __in_dev_get_rcu(state->dev);
 		if (!state->in_dev)
 			continue;
-		read_lock(&state->in_dev->mc_list_lock);
-		im = state->in_dev->mc_list;
+		im = rcu_dereference(state->in_dev->mc_list);
 	}
 	return im;
 }
@@ -2433,10 +2442,8 @@ static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
 	__releases(rcu)
 {
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
-	if (likely(state->in_dev != NULL)) {
-		read_unlock(&state->in_dev->mc_list_lock);
-		state->in_dev = NULL;
-	}
+
+	state->in_dev = NULL;
 	state->dev = NULL;
 	rcu_read_unlock();
 }
@@ -2458,7 +2465,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 		querier = "NONE";
#endif
 
-	if (state->in_dev->mc_list == im) {
+	if (rcu_dereference(state->in_dev->mc_list) == im) {
 		seq_printf(seq, "%d\t%-10s: %5d %7s\n",
 			   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
 	}
@@ -2517,8 +2524,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
 		idev = __in_dev_get_rcu(state->dev);
 		if (unlikely(idev == NULL))
 			continue;
-		read_lock(&idev->mc_list_lock);
-		im = idev->mc_list;
+		im = rcu_dereference(idev->mc_list);
 		if (likely(im != NULL)) {
 			spin_lock_bh(&im->lock);
 			psf = im->sources;
@@ -2529,7 +2535,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
 			}
 			spin_unlock_bh(&im->lock);
 		}
-		read_unlock(&idev->mc_list_lock);
 	}
 	return psf;
 }
@@ -2543,9 +2548,6 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
 		spin_unlock_bh(&state->im->lock);
 		state->im = state->im->next;
 		while (!state->im) {
-			if (likely(state->idev != NULL))
-				read_unlock(&state->idev->mc_list_lock);
-
 			state->dev = next_net_device_rcu(state->dev);
 			if (!state->dev) {
 				state->idev = NULL;
@@ -2554,8 +2556,7 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
 			state->idev = __in_dev_get_rcu(state->dev);
 			if (!state->idev)
 				continue;
-			read_lock(&state->idev->mc_list_lock);
-			state->im = state->idev->mc_list;
+			state->im = rcu_dereference(state->idev->mc_list);
 		}
 		if (!state->im)
 			break;
@@ -2601,10 +2602,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
 		spin_unlock_bh(&state->im->lock);
 		state->im = NULL;
 	}
-	if (likely(state->idev != NULL)) {
-		read_unlock(&state->idev->mc_list_lock);
-		state->idev = NULL;
-	}
+	state->idev = NULL;
 	state->dev = NULL;
 	rcu_read_unlock();
 }
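[Note] The same discipline is applied to the socket-side lists: inet->mc_list, iml->next_rcu and pmc->sflist become __rcu pointers, and every access above picks the accessor matching its context — rcu_dereference() under rcu_read_lock() in the fast path (ip_mc_sf_allow()), rtnl_dereference() in the setsockopt-style paths that hold RTNL. Sketched:

	/* reader fast path */
	rcu_read_lock();
	psl = rcu_dereference(pmc->sflist);
	/* ... */
	rcu_read_unlock();

	/* writer path, RTNL held: no read lock needed, lockdep-checked */
	ASSERT_RTNL();
	psl = rtnl_dereference(pmc->sflist);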
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7174370b1195..25e318153f14 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -55,7 +55,6 @@ EXPORT_SYMBOL(inet_get_local_port_range);
55int inet_csk_bind_conflict(const struct sock *sk, 55int inet_csk_bind_conflict(const struct sock *sk,
56 const struct inet_bind_bucket *tb) 56 const struct inet_bind_bucket *tb)
57{ 57{
58 const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
59 struct sock *sk2; 58 struct sock *sk2;
60 struct hlist_node *node; 59 struct hlist_node *node;
61 int reuse = sk->sk_reuse; 60 int reuse = sk->sk_reuse;
@@ -75,9 +74,9 @@ int inet_csk_bind_conflict(const struct sock *sk,
75 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 74 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
76 if (!reuse || !sk2->sk_reuse || 75 if (!reuse || !sk2->sk_reuse ||
77 sk2->sk_state == TCP_LISTEN) { 76 sk2->sk_state == TCP_LISTEN) {
78 const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 77 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
79 if (!sk2_rcv_saddr || !sk_rcv_saddr || 78 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
80 sk2_rcv_saddr == sk_rcv_saddr) 79 sk2_rcv_saddr == sk_rcv_saddr(sk))
81 break; 80 break;
82 } 81 }
83 } 82 }
@@ -358,17 +357,14 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
358 struct ip_options *opt = inet_rsk(req)->opt; 357 struct ip_options *opt = inet_rsk(req)->opt;
359 struct flowi fl = { .oif = sk->sk_bound_dev_if, 358 struct flowi fl = { .oif = sk->sk_bound_dev_if,
360 .mark = sk->sk_mark, 359 .mark = sk->sk_mark,
361 .nl_u = { .ip4_u = 360 .fl4_dst = ((opt && opt->srr) ?
362 { .daddr = ((opt && opt->srr) ? 361 opt->faddr : ireq->rmt_addr),
363 opt->faddr : 362 .fl4_src = ireq->loc_addr,
364 ireq->rmt_addr), 363 .fl4_tos = RT_CONN_FLAGS(sk),
365 .saddr = ireq->loc_addr,
366 .tos = RT_CONN_FLAGS(sk) } },
367 .proto = sk->sk_protocol, 364 .proto = sk->sk_protocol,
368 .flags = inet_sk_flowi_flags(sk), 365 .flags = inet_sk_flowi_flags(sk),
369 .uli_u = { .ports = 366 .fl_ip_sport = inet_sk(sk)->inet_sport,
370 { .sport = inet_sk(sk)->inet_sport, 367 .fl_ip_dport = ireq->rmt_port };
371 .dport = ireq->rmt_port } } };
372 struct net *net = sock_net(sk); 368 struct net *net = sock_net(sk);
373 369
374 security_req_classify_flow(req, &fl); 370 security_req_classify_flow(req, &fl);
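
All of the flowi conversions in this patch set are mechanical: the nested nl_u.ip4_u and uli_u.ports unions become flat fl4_* and fl_ip_* members, so a complete IPv4 flow key fits in one designated initializer. A hedged sketch of the flattened idiom for a connected socket (the helper name is illustrative):

	/* Sketch: build a flat flow key and route it, 2.6.38-era API. */
	static struct rtable *route_for_sock(struct sock *sk, __be32 daddr)
	{
		struct inet_sock *inet = inet_sk(sk);
		struct flowi fl = {
			.oif = sk->sk_bound_dev_if,
			.mark = sk->sk_mark,
			.fl4_dst = daddr,		 /* was .nl_u.ip4_u.daddr */
			.fl4_src = inet->inet_saddr,	 /* was .nl_u.ip4_u.saddr */
			.fl4_tos = RT_CONN_FLAGS(sk),
			.proto = sk->sk_protocol,
			.fl_ip_sport = inet->inet_sport, /* was .uli_u.ports.sport */
			.fl_ip_dport = inet->inet_dport,
		};
		struct rtable *rt;

		if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
			return NULL;
		return rt;
	}
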
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9e94d7cf4f8a..d9bc85751c74 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -63,7 +63,7 @@
63 * refcnt: atomically against modifications on other CPU; 63 * refcnt: atomically against modifications on other CPU;
64 * usually under some other lock to prevent node disappearing 64 * usually under some other lock to prevent node disappearing
65 * dtime: unused node list lock 65 * dtime: unused node list lock
66 * v4daddr: unchangeable 66 * daddr: unchangeable
67 * ip_id_count: atomic value (no lock needed) 67 * ip_id_count: atomic value (no lock needed)
68 */ 68 */
69 69
@@ -79,15 +79,24 @@ static const struct inet_peer peer_fake_node = {
79 .avl_height = 0 79 .avl_height = 0
80}; 80};
81 81
82static struct { 82struct inet_peer_base {
83 struct inet_peer __rcu *root; 83 struct inet_peer __rcu *root;
84 spinlock_t lock; 84 spinlock_t lock;
85 int total; 85 int total;
86} peers = { 86};
87
88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu,
90 .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock),
91 .total = 0,
92};
93
94static struct inet_peer_base v6_peers = {
87 .root = peer_avl_empty_rcu, 95 .root = peer_avl_empty_rcu,
88 .lock = __SPIN_LOCK_UNLOCKED(peers.lock), 96 .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock),
89 .total = 0, 97 .total = 0,
90}; 98};
99
91#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 100#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
92 101
93/* Exported for sysctl_net_ipv4. */ 102/* Exported for sysctl_net_ipv4. */
@@ -152,28 +161,45 @@ static void unlink_from_unused(struct inet_peer *p)
152 } 161 }
153} 162}
154 163
164static int addr_compare(const struct inetpeer_addr *a,
165 const struct inetpeer_addr *b)
166{
167 int i, n = (a->family == AF_INET ? 1 : 4);
168
169 for (i = 0; i < n; i++) {
170 if (a->a6[i] == b->a6[i])
171 continue;
172 if (a->a6[i] < b->a6[i])
173 return -1;
174 return 1;
175 }
176
177 return 0;
178}
179
155/* 180/*
156 * Called with local BH disabled and the pool lock held. 181 * Called with local BH disabled and the pool lock held.
157 */ 182 */
158#define lookup(_daddr, _stack) \ 183#define lookup(_daddr, _stack, _base) \
159({ \ 184({ \
160 struct inet_peer *u; \ 185 struct inet_peer *u; \
161 struct inet_peer __rcu **v; \ 186 struct inet_peer __rcu **v; \
162 \ 187 \
163 stackptr = _stack; \ 188 stackptr = _stack; \
164 *stackptr++ = &peers.root; \ 189 *stackptr++ = &_base->root; \
165 for (u = rcu_dereference_protected(peers.root, \ 190 for (u = rcu_dereference_protected(_base->root, \
166 lockdep_is_held(&peers.lock)); \ 191 lockdep_is_held(&_base->lock)); \
167 u != peer_avl_empty; ) { \ 192 u != peer_avl_empty; ) { \
168 if (_daddr == u->v4daddr) \ 193 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \
169 break; \ 195 break; \
170 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 196 if (cmp == -1) \
171 v = &u->avl_left; \ 197 v = &u->avl_left; \
172 else \ 198 else \
173 v = &u->avl_right; \ 199 v = &u->avl_right; \
174 *stackptr++ = v; \ 200 *stackptr++ = v; \
175 u = rcu_dereference_protected(*v, \ 201 u = rcu_dereference_protected(*v, \
176 lockdep_is_held(&peers.lock)); \ 202 lockdep_is_held(&_base->lock)); \
177 } \ 203 } \
178 u; \ 204 u; \
179}) 205})
@@ -185,13 +211,15 @@ static void unlink_from_unused(struct inet_peer *p)
185 * But every pointer we follow is guaranteed to be valid thanks to RCU. 211 * But every pointer we follow is guaranteed to be valid thanks to RCU.
186 * We exit from this function if the number of links exceeds PEER_MAXDEPTH 212 * We exit from this function if the number of links exceeds PEER_MAXDEPTH
187 */ 213 */
188static struct inet_peer *lookup_rcu_bh(__be32 daddr) 214static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
215 struct inet_peer_base *base)
189{ 216{
190 struct inet_peer *u = rcu_dereference_bh(peers.root); 217 struct inet_peer *u = rcu_dereference_bh(base->root);
191 int count = 0; 218 int count = 0;
192 219
193 while (u != peer_avl_empty) { 220 while (u != peer_avl_empty) {
194 if (daddr == u->v4daddr) { 221 int cmp = addr_compare(daddr, &u->daddr);
222 if (cmp == 0) {
195 /* Before taking a reference, check if this entry was 223 /* Before taking a reference, check if this entry was
196 * deleted, unlink_from_pool() sets refcnt=-1 to make 224 * deleted, unlink_from_pool() sets refcnt=-1 to make
197 * distinction between an unused entry (refcnt=0) and 225 * distinction between an unused entry (refcnt=0) and
@@ -201,7 +229,7 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
201 u = NULL; 229 u = NULL;
202 return u; 230 return u;
203 } 231 }
204 if ((__force __u32)daddr < (__force __u32)u->v4daddr) 232 if (cmp == -1)
205 u = rcu_dereference_bh(u->avl_left); 233 u = rcu_dereference_bh(u->avl_left);
206 else 234 else
207 u = rcu_dereference_bh(u->avl_right); 235 u = rcu_dereference_bh(u->avl_right);
@@ -212,19 +240,19 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
212} 240}
213 241
214/* Called with local BH disabled and the pool lock held. */ 242/* Called with local BH disabled and the pool lock held. */
215#define lookup_rightempty(start) \ 243#define lookup_rightempty(start, base) \
216({ \ 244({ \
217 struct inet_peer *u; \ 245 struct inet_peer *u; \
218 struct inet_peer __rcu **v; \ 246 struct inet_peer __rcu **v; \
219 *stackptr++ = &start->avl_left; \ 247 *stackptr++ = &start->avl_left; \
220 v = &start->avl_left; \ 248 v = &start->avl_left; \
221 for (u = rcu_dereference_protected(*v, \ 249 for (u = rcu_dereference_protected(*v, \
222 lockdep_is_held(&peers.lock)); \ 250 lockdep_is_held(&base->lock)); \
223 u->avl_right != peer_avl_empty_rcu; ) { \ 251 u->avl_right != peer_avl_empty_rcu; ) { \
224 v = &u->avl_right; \ 252 v = &u->avl_right; \
225 *stackptr++ = v; \ 253 *stackptr++ = v; \
226 u = rcu_dereference_protected(*v, \ 254 u = rcu_dereference_protected(*v, \
227 lockdep_is_held(&peers.lock)); \ 255 lockdep_is_held(&base->lock)); \
228 } \ 256 } \
229 u; \ 257 u; \
230}) 258})
@@ -234,7 +262,8 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
234 * Look into mm/map_avl.c for more detail description of the ideas. 262 * Look into mm/map_avl.c for more detail description of the ideas.
235 */ 263 */
236static void peer_avl_rebalance(struct inet_peer __rcu **stack[], 264static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
237 struct inet_peer __rcu ***stackend) 265 struct inet_peer __rcu ***stackend,
266 struct inet_peer_base *base)
238{ 267{
239 struct inet_peer __rcu **nodep; 268 struct inet_peer __rcu **nodep;
240 struct inet_peer *node, *l, *r; 269 struct inet_peer *node, *l, *r;
@@ -243,20 +272,20 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
243 while (stackend > stack) { 272 while (stackend > stack) {
244 nodep = *--stackend; 273 nodep = *--stackend;
245 node = rcu_dereference_protected(*nodep, 274 node = rcu_dereference_protected(*nodep,
246 lockdep_is_held(&peers.lock)); 275 lockdep_is_held(&base->lock));
247 l = rcu_dereference_protected(node->avl_left, 276 l = rcu_dereference_protected(node->avl_left,
248 lockdep_is_held(&peers.lock)); 277 lockdep_is_held(&base->lock));
249 r = rcu_dereference_protected(node->avl_right, 278 r = rcu_dereference_protected(node->avl_right,
250 lockdep_is_held(&peers.lock)); 279 lockdep_is_held(&base->lock));
251 lh = node_height(l); 280 lh = node_height(l);
252 rh = node_height(r); 281 rh = node_height(r);
253 if (lh > rh + 1) { /* l: RH+2 */ 282 if (lh > rh + 1) { /* l: RH+2 */
254 struct inet_peer *ll, *lr, *lrl, *lrr; 283 struct inet_peer *ll, *lr, *lrl, *lrr;
255 int lrh; 284 int lrh;
256 ll = rcu_dereference_protected(l->avl_left, 285 ll = rcu_dereference_protected(l->avl_left,
257 lockdep_is_held(&peers.lock)); 286 lockdep_is_held(&base->lock));
258 lr = rcu_dereference_protected(l->avl_right, 287 lr = rcu_dereference_protected(l->avl_right,
259 lockdep_is_held(&peers.lock)); 288 lockdep_is_held(&base->lock));
260 lrh = node_height(lr); 289 lrh = node_height(lr);
261 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 290 if (lrh <= node_height(ll)) { /* ll: RH+1 */
262 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ 291 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
@@ -268,9 +297,9 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
268 RCU_INIT_POINTER(*nodep, l); 297 RCU_INIT_POINTER(*nodep, l);
269 } else { /* ll: RH, lr: RH+1 */ 298 } else { /* ll: RH, lr: RH+1 */
270 lrl = rcu_dereference_protected(lr->avl_left, 299 lrl = rcu_dereference_protected(lr->avl_left,
271 lockdep_is_held(&peers.lock)); /* lrl: RH or RH-1 */ 300 lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */
272 lrr = rcu_dereference_protected(lr->avl_right, 301 lrr = rcu_dereference_protected(lr->avl_right,
273 lockdep_is_held(&peers.lock)); /* lrr: RH or RH-1 */ 302 lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
274 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ 303 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
275 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ 304 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
276 node->avl_height = rh + 1; /* node: RH+1 */ 305 node->avl_height = rh + 1; /* node: RH+1 */
@@ -286,9 +315,9 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
286 struct inet_peer *rr, *rl, *rlr, *rll; 315 struct inet_peer *rr, *rl, *rlr, *rll;
287 int rlh; 316 int rlh;
288 rr = rcu_dereference_protected(r->avl_right, 317 rr = rcu_dereference_protected(r->avl_right,
289 lockdep_is_held(&peers.lock)); 318 lockdep_is_held(&base->lock));
290 rl = rcu_dereference_protected(r->avl_left, 319 rl = rcu_dereference_protected(r->avl_left,
291 lockdep_is_held(&peers.lock)); 320 lockdep_is_held(&base->lock));
292 rlh = node_height(rl); 321 rlh = node_height(rl);
293 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 322 if (rlh <= node_height(rr)) { /* rr: LH+1 */
294 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ 323 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
@@ -300,9 +329,9 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
300 RCU_INIT_POINTER(*nodep, r); 329 RCU_INIT_POINTER(*nodep, r);
301 } else { /* rr: RH, rl: RH+1 */ 330 } else { /* rr: RH, rl: RH+1 */
302 rlr = rcu_dereference_protected(rl->avl_right, 331 rlr = rcu_dereference_protected(rl->avl_right,
303 lockdep_is_held(&peers.lock)); /* rlr: LH or LH-1 */ 332 lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */
304 rll = rcu_dereference_protected(rl->avl_left, 333 rll = rcu_dereference_protected(rl->avl_left,
305 lockdep_is_held(&peers.lock)); /* rll: LH or LH-1 */ 334 lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
306 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ 335 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
307 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ 336 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
308 node->avl_height = lh + 1; /* node: LH+1 */ 337 node->avl_height = lh + 1; /* node: LH+1 */
@@ -321,14 +350,14 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
321} 350}
322 351
323/* Called with local BH disabled and the pool lock held. */ 352/* Called with local BH disabled and the pool lock held. */
324#define link_to_pool(n) \ 353#define link_to_pool(n, base) \
325do { \ 354do { \
326 n->avl_height = 1; \ 355 n->avl_height = 1; \
327 n->avl_left = peer_avl_empty_rcu; \ 356 n->avl_left = peer_avl_empty_rcu; \
328 n->avl_right = peer_avl_empty_rcu; \ 357 n->avl_right = peer_avl_empty_rcu; \
329 /* lockless readers can catch us now */ \ 358 /* lockless readers can catch us now */ \
330 rcu_assign_pointer(**--stackptr, n); \ 359 rcu_assign_pointer(**--stackptr, n); \
331 peer_avl_rebalance(stack, stackptr); \ 360 peer_avl_rebalance(stack, stackptr, base); \
332} while (0) 361} while (0)
333 362
334static void inetpeer_free_rcu(struct rcu_head *head) 363static void inetpeer_free_rcu(struct rcu_head *head)
@@ -337,13 +366,13 @@ static void inetpeer_free_rcu(struct rcu_head *head)
337} 366}
338 367
339/* May be called with local BH enabled. */ 368/* May be called with local BH enabled. */
340static void unlink_from_pool(struct inet_peer *p) 369static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
341{ 370{
342 int do_free; 371 int do_free;
343 372
344 do_free = 0; 373 do_free = 0;
345 374
346 spin_lock_bh(&peers.lock); 375 spin_lock_bh(&base->lock);
347 /* Check the reference counter. It was artificially incremented by 1 376 /* Check the reference counter. It was artificially incremented by 1
348 * in the cleanup() function to prevent it from suddenly disappearing. If we can 377 * in the cleanup() function to prevent it from suddenly disappearing. If we can
349 * atomically (because of lockless readers) take this last reference, 378 * atomically (because of lockless readers) take this last reference,
@@ -353,7 +382,7 @@ static void unlink_from_pool(struct inet_peer *p)
353 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { 382 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
354 struct inet_peer __rcu **stack[PEER_MAXDEPTH]; 383 struct inet_peer __rcu **stack[PEER_MAXDEPTH];
355 struct inet_peer __rcu ***stackptr, ***delp; 384 struct inet_peer __rcu ***stackptr, ***delp;
356 if (lookup(p->v4daddr, stack) != p) 385 if (lookup(&p->daddr, stack, base) != p)
357 BUG(); 386 BUG();
358 delp = stackptr - 1; /* *delp[0] == p */ 387 delp = stackptr - 1; /* *delp[0] == p */
359 if (p->avl_left == peer_avl_empty_rcu) { 388 if (p->avl_left == peer_avl_empty_rcu) {
@@ -362,11 +391,11 @@ static void unlink_from_pool(struct inet_peer *p)
362 } else { 391 } else {
363 /* look for a node to insert instead of p */ 392 /* look for a node to insert instead of p */
364 struct inet_peer *t; 393 struct inet_peer *t;
365 t = lookup_rightempty(p); 394 t = lookup_rightempty(p, base);
366 BUG_ON(rcu_dereference_protected(*stackptr[-1], 395 BUG_ON(rcu_dereference_protected(*stackptr[-1],
367 lockdep_is_held(&peers.lock)) != t); 396 lockdep_is_held(&base->lock)) != t);
368 **--stackptr = t->avl_left; 397 **--stackptr = t->avl_left;
369 /* t is removed, t->v4daddr > x->v4daddr for any 398 /* t is removed, t->daddr > x->daddr for any
370 * x in p->avl_left subtree. 399 * x in p->avl_left subtree.
371 * Put t in the old place of p. */ 400 * Put t in the old place of p. */
372 RCU_INIT_POINTER(*delp[0], t); 401 RCU_INIT_POINTER(*delp[0], t);
@@ -376,11 +405,11 @@ static void unlink_from_pool(struct inet_peer *p)
376 BUG_ON(delp[1] != &p->avl_left); 405 BUG_ON(delp[1] != &p->avl_left);
377 delp[1] = &t->avl_left; /* was &p->avl_left */ 406 delp[1] = &t->avl_left; /* was &p->avl_left */
378 } 407 }
379 peer_avl_rebalance(stack, stackptr); 408 peer_avl_rebalance(stack, stackptr, base);
380 peers.total--; 409 base->total--;
381 do_free = 1; 410 do_free = 1;
382 } 411 }
383 spin_unlock_bh(&peers.lock); 412 spin_unlock_bh(&base->lock);
384 413
385 if (do_free) 414 if (do_free)
386 call_rcu_bh(&p->rcu, inetpeer_free_rcu); 415 call_rcu_bh(&p->rcu, inetpeer_free_rcu);
@@ -395,6 +424,16 @@ static void unlink_from_pool(struct inet_peer *p)
395 inet_putpeer(p); 424 inet_putpeer(p);
396} 425}
397 426
427static struct inet_peer_base *family_to_base(int family)
428{
429 return (family == AF_INET ? &v4_peers : &v6_peers);
430}
431
432static struct inet_peer_base *peer_to_base(struct inet_peer *p)
433{
434 return family_to_base(p->daddr.family);
435}
436
398/* May be called with local BH enabled. */ 437/* May be called with local BH enabled. */
399static int cleanup_once(unsigned long ttl) 438static int cleanup_once(unsigned long ttl)
400{ 439{
@@ -428,21 +467,22 @@ static int cleanup_once(unsigned long ttl)
428 * happen because of entry limits in route cache. */ 467 * happen because of entry limits in route cache. */
429 return -1; 468 return -1;
430 469
431 unlink_from_pool(p); 470 unlink_from_pool(p, peer_to_base(p));
432 return 0; 471 return 0;
433} 472}
434 473
435/* Called with or without local BH being disabled. */ 474/* Called with or without local BH being disabled. */
436struct inet_peer *inet_getpeer(__be32 daddr, int create) 475struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
437{ 476{
438 struct inet_peer *p;
439 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
478 struct inet_peer_base *base = family_to_base(AF_INET);
479 struct inet_peer *p;
440 480
441 /* Look up for the address quickly, lockless. 481 /* Look up for the address quickly, lockless.
442 * Because of a concurrent writer, we might not find an existing entry. 482 * Because of a concurrent writer, we might not find an existing entry.
443 */ 483 */
444 rcu_read_lock_bh(); 484 rcu_read_lock_bh();
445 p = lookup_rcu_bh(daddr); 485 p = lookup_rcu_bh(daddr, base);
446 rcu_read_unlock_bh(); 486 rcu_read_unlock_bh();
447 487
448 if (p) { 488 if (p) {
@@ -456,50 +496,57 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
456 /* retry an exact lookup, taking the lock before. 496 /* retry an exact lookup, taking the lock before.
457 * At least, nodes should be hot in our cache. 497 * At least, nodes should be hot in our cache.
458 */ 498 */
459 spin_lock_bh(&peers.lock); 499 spin_lock_bh(&base->lock);
460 p = lookup(daddr, stack); 500 p = lookup(daddr, stack, base);
461 if (p != peer_avl_empty) { 501 if (p != peer_avl_empty) {
462 atomic_inc(&p->refcnt); 502 atomic_inc(&p->refcnt);
463 spin_unlock_bh(&peers.lock); 503 spin_unlock_bh(&base->lock);
464 /* Remove the entry from unused list if it was there. */ 504 /* Remove the entry from unused list if it was there. */
465 unlink_from_unused(p); 505 unlink_from_unused(p);
466 return p; 506 return p;
467 } 507 }
468 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; 508 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
469 if (p) { 509 if (p) {
470 p->v4daddr = daddr; 510 p->daddr = *daddr;
471 atomic_set(&p->refcnt, 1); 511 atomic_set(&p->refcnt, 1);
472 atomic_set(&p->rid, 0); 512 atomic_set(&p->rid, 0);
473 atomic_set(&p->ip_id_count, secure_ip_id(daddr)); 513 atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
474 p->tcp_ts_stamp = 0; 514 p->tcp_ts_stamp = 0;
475 INIT_LIST_HEAD(&p->unused); 515 INIT_LIST_HEAD(&p->unused);
476 516
477 517
478 /* Link the node. */ 518 /* Link the node. */
479 link_to_pool(p); 519 link_to_pool(p, base);
480 peers.total++; 520 base->total++;
481 } 521 }
482 spin_unlock_bh(&peers.lock); 522 spin_unlock_bh(&base->lock);
483 523
484 if (peers.total >= inet_peer_threshold) 524 if (base->total >= inet_peer_threshold)
485 /* Remove one less-recently-used entry. */ 525 /* Remove one less-recently-used entry. */
486 cleanup_once(0); 526 cleanup_once(0);
487 527
488 return p; 528 return p;
489} 529}
490 530
531static int compute_total(void)
532{
533 return v4_peers.total + v6_peers.total;
534}
535EXPORT_SYMBOL_GPL(inet_getpeer);
536
491/* Called with local BH disabled. */ 537/* Called with local BH disabled. */
492static void peer_check_expire(unsigned long dummy) 538static void peer_check_expire(unsigned long dummy)
493{ 539{
494 unsigned long now = jiffies; 540 unsigned long now = jiffies;
495 int ttl; 541 int ttl, total;
496 542
497 if (peers.total >= inet_peer_threshold) 543 total = compute_total();
544 if (total >= inet_peer_threshold)
498 ttl = inet_peer_minttl; 545 ttl = inet_peer_minttl;
499 else 546 else
500 ttl = inet_peer_maxttl 547 ttl = inet_peer_maxttl
501 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 548 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
502 peers.total / inet_peer_threshold * HZ; 549 total / inet_peer_threshold * HZ;
503 while (!cleanup_once(ttl)) { 550 while (!cleanup_once(ttl)) {
504 if (jiffies != now) 551 if (jiffies != now)
505 break; 552 break;
@@ -508,13 +555,14 @@ static void peer_check_expire(unsigned long dummy)
508 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 555 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
509 * interval depending on the total number of entries (more entries, 556 * interval depending on the total number of entries (more entries,
510 * less interval). */ 557 * less interval). */
511 if (peers.total >= inet_peer_threshold) 558 total = compute_total();
559 if (total >= inet_peer_threshold)
512 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; 560 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
513 else 561 else
514 peer_periodic_timer.expires = jiffies 562 peer_periodic_timer.expires = jiffies
515 + inet_peer_gc_maxtime 563 + inet_peer_gc_maxtime
516 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 564 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
517 peers.total / inet_peer_threshold * HZ; 565 total / inet_peer_threshold * HZ;
518 add_timer(&peer_periodic_timer); 566 add_timer(&peer_periodic_timer);
519} 567}
520 568
@@ -530,3 +578,4 @@ void inet_putpeer(struct inet_peer *p)
530 578
531 local_bh_enable(); 579 local_bh_enable();
532} 580}
581EXPORT_SYMBOL_GPL(inet_putpeer);
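
Since the AVL trees are now keyed on a family-tagged struct inetpeer_addr and compared with addr_compare(), IPv4 callers (see ip_fragment.c and route.c below) go through an inet_getpeer_v4() wrapper. A sketch of what that header helper presumably looks like:

	/* Sketch of the presumed header wrapper: tag a v4 address with
	 * AF_INET so addr_compare() only examines one 32-bit word.
	 */
	static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
	{
		struct inetpeer_addr daddr;

		daddr.a4 = v4daddr;
		daddr.family = AF_INET;
		return inet_getpeer(&daddr, create);
	}
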
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 168440834ade..e6215bdd96c0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -141,7 +141,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
141 qp->daddr = arg->iph->daddr; 141 qp->daddr = arg->iph->daddr;
142 qp->user = arg->user; 142 qp->user = arg->user;
143 qp->peer = sysctl_ipfrag_max_dist ? 143 qp->peer = sysctl_ipfrag_max_dist ?
144 inet_getpeer(arg->iph->saddr, 1) : NULL; 144 inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
145} 145}
146 146
147static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 147static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 70ff77f02eee..eb68a0e34e49 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -405,11 +405,11 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
405 if (parms->name[0]) 405 if (parms->name[0])
406 strlcpy(name, parms->name, IFNAMSIZ); 406 strlcpy(name, parms->name, IFNAMSIZ);
407 else 407 else
408 sprintf(name, "gre%%d"); 408 strcpy(name, "gre%d");
409 409
410 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); 410 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
411 if (!dev) 411 if (!dev)
412 return NULL; 412 return NULL;
413 413
414 dev_net_set(dev, net); 414 dev_net_set(dev, net);
415 415
@@ -634,7 +634,7 @@ static int ipgre_rcv(struct sk_buff *skb)
634#ifdef CONFIG_NET_IPGRE_BROADCAST 634#ifdef CONFIG_NET_IPGRE_BROADCAST
635 if (ipv4_is_multicast(iph->daddr)) { 635 if (ipv4_is_multicast(iph->daddr)) {
636 /* Looped back packet, drop it! */ 636 /* Looped back packet, drop it! */
637 if (skb_rtable(skb)->fl.iif == 0) 637 if (rt_is_output_route(skb_rtable(skb)))
638 goto drop; 638 goto drop;
639 tunnel->dev->stats.multicast++; 639 tunnel->dev->stats.multicast++;
640 skb->pkt_type = PACKET_BROADCAST; 640 skb->pkt_type = PACKET_BROADCAST;
@@ -772,16 +772,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
772 { 772 {
773 struct flowi fl = { 773 struct flowi fl = {
774 .oif = tunnel->parms.link, 774 .oif = tunnel->parms.link,
775 .nl_u = { 775 .fl4_dst = dst,
776 .ip4_u = { 776 .fl4_src = tiph->saddr,
777 .daddr = dst, 777 .fl4_tos = RT_TOS(tos),
778 .saddr = tiph->saddr, 778 .fl_gre_key = tunnel->parms.o_key
779 .tos = RT_TOS(tos) 779 };
780 }
781 },
782 .proto = IPPROTO_GRE
783 }
784;
785 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 780 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
786 dev->stats.tx_carrier_errors++; 781 dev->stats.tx_carrier_errors++;
787 goto tx_error; 782 goto tx_error;
@@ -823,7 +818,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
823 !ipv4_is_multicast(tunnel->parms.iph.daddr)) || 818 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
824 rt6->rt6i_dst.plen == 128) { 819 rt6->rt6i_dst.plen == 128) {
825 rt6->rt6i_flags |= RTF_MODIFIED; 820 rt6->rt6i_flags |= RTF_MODIFIED;
826 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; 821 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
827 } 822 }
828 } 823 }
829 824
@@ -895,7 +890,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
895 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 890 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
896#endif 891#endif
897 else 892 else
898 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); 893 iph->ttl = ip4_dst_hoplimit(&rt->dst);
899 } 894 }
900 895
901 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; 896 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -951,14 +946,11 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
951 if (iph->daddr) { 946 if (iph->daddr) {
952 struct flowi fl = { 947 struct flowi fl = {
953 .oif = tunnel->parms.link, 948 .oif = tunnel->parms.link,
954 .nl_u = { 949 .fl4_dst = iph->daddr,
955 .ip4_u = { 950 .fl4_src = iph->saddr,
956 .daddr = iph->daddr, 951 .fl4_tos = RT_TOS(iph->tos),
957 .saddr = iph->saddr, 952 .proto = IPPROTO_GRE,
958 .tos = RT_TOS(iph->tos) 953 .fl_gre_key = tunnel->parms.o_key
959 }
960 },
961 .proto = IPPROTO_GRE
962 }; 954 };
963 struct rtable *rt; 955 struct rtable *rt;
964 956
@@ -1216,14 +1208,11 @@ static int ipgre_open(struct net_device *dev)
1216 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1208 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1217 struct flowi fl = { 1209 struct flowi fl = {
1218 .oif = t->parms.link, 1210 .oif = t->parms.link,
1219 .nl_u = { 1211 .fl4_dst = t->parms.iph.daddr,
1220 .ip4_u = { 1212 .fl4_src = t->parms.iph.saddr,
1221 .daddr = t->parms.iph.daddr, 1213 .fl4_tos = RT_TOS(t->parms.iph.tos),
1222 .saddr = t->parms.iph.saddr, 1214 .proto = IPPROTO_GRE,
1223 .tos = RT_TOS(t->parms.iph.tos) 1215 .fl_gre_key = t->parms.o_key
1224 }
1225 },
1226 .proto = IPPROTO_GRE
1227 }; 1216 };
1228 struct rtable *rt; 1217 struct rtable *rt;
1229 1218
@@ -1775,3 +1764,4 @@ module_exit(ipgre_fini);
1775MODULE_LICENSE("GPL"); 1764MODULE_LICENSE("GPL");
1776MODULE_ALIAS_RTNL_LINK("gre"); 1765MODULE_ALIAS_RTNL_LINK("gre");
1777MODULE_ALIAS_RTNL_LINK("gretap"); 1766MODULE_ALIAS_RTNL_LINK("gretap");
1767MODULE_ALIAS("gre0");
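
Besides the flowi flattening, the ip_gre.c hunks thread tunnel->parms.o_key into the flow as fl_gre_key, so two GRE tunnels that share endpoints but use different keys can be routed differently. A sketch of the lookup the converted call sites now perform:

	/* Sketch: GRE route lookup keyed by (dst, src, tos, key). */
	static struct rtable *gre_route_lookup(struct net *net,
					       struct ip_tunnel *t,
					       __be32 dst, u8 tos)
	{
		struct flowi fl = {
			.oif = t->parms.link,
			.fl4_dst = dst,
			.fl4_src = t->parms.iph.saddr,
			.fl4_tos = RT_TOS(tos),
			.proto = IPPROTO_GRE,
			.fl_gre_key = t->parms.o_key,	/* new in this series */
		};
		struct rtable *rt;

		if (ip_route_output_key(net, &rt, &fl))
			return NULL;
		return rt;
	}
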
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 439d2a34ee44..04c7b3ba6b39 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,6 +82,7 @@
82#include <linux/tcp.h> 82#include <linux/tcp.h>
83 83
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
85 86
86/* Generate a checksum for an outgoing IP datagram. */ 87/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph) 88__inline__ void ip_send_check(struct iphdr *iph)
@@ -130,7 +131,7 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 int ttl = inet->uc_ttl; 131 int ttl = inet->uc_ttl;
131 132
132 if (ttl < 0) 133 if (ttl < 0)
133 ttl = dst_metric(dst, RTAX_HOPLIMIT); 134 ttl = ip4_dst_hoplimit(dst);
134 return ttl; 135 return ttl;
135} 136}
136 137
@@ -341,15 +342,13 @@ int ip_queue_xmit(struct sk_buff *skb)
341 { 342 {
342 struct flowi fl = { .oif = sk->sk_bound_dev_if, 343 struct flowi fl = { .oif = sk->sk_bound_dev_if,
343 .mark = sk->sk_mark, 344 .mark = sk->sk_mark,
344 .nl_u = { .ip4_u = 345 .fl4_dst = daddr,
345 { .daddr = daddr, 346 .fl4_src = inet->inet_saddr,
346 .saddr = inet->inet_saddr, 347 .fl4_tos = RT_CONN_FLAGS(sk),
347 .tos = RT_CONN_FLAGS(sk) } },
348 .proto = sk->sk_protocol, 348 .proto = sk->sk_protocol,
349 .flags = inet_sk_flowi_flags(sk), 349 .flags = inet_sk_flowi_flags(sk),
350 .uli_u = { .ports = 350 .fl_ip_sport = inet->inet_sport,
351 { .sport = inet->inet_sport, 351 .fl_ip_dport = inet->inet_dport };
352 .dport = inet->inet_dport } } };
353 352
354 /* If this fails, retransmit mechanism of transport layer will 353 /* If this fails, retransmit mechanism of transport layer will
355 * keep trying until route appears or the connection times 354 * keep trying until route appears or the connection times
@@ -1404,14 +1403,11 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1404 1403
1405 { 1404 {
1406 struct flowi fl = { .oif = arg->bound_dev_if, 1405 struct flowi fl = { .oif = arg->bound_dev_if,
1407 .nl_u = { .ip4_u = 1406 .fl4_dst = daddr,
1408 { .daddr = daddr, 1407 .fl4_src = rt->rt_spec_dst,
1409 .saddr = rt->rt_spec_dst, 1408 .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
1410 .tos = RT_TOS(ip_hdr(skb)->tos) } }, 1409 .fl_ip_sport = tcp_hdr(skb)->dest,
1411 /* Not quite clean, but right. */ 1410 .fl_ip_dport = tcp_hdr(skb)->source,
1412 .uli_u = { .ports =
1413 { .sport = tcp_hdr(skb)->dest,
1414 .dport = tcp_hdr(skb)->source } },
1415 .proto = sk->sk_protocol, 1411 .proto = sk->sk_protocol,
1416 .flags = ip_reply_arg_flowi_flags(arg) }; 1412 .flags = ip_reply_arg_flowi_flags(arg) };
1417 security_skb_classify_flow(skb, &fl); 1413 security_skb_classify_flow(skb, &fl);
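
ip_output.c, ip_gre.c and ipt_REJECT.c all stop reading RTAX_HOPLIMIT out of the metrics array; route creation no longer pre-seeds that metric with sysctl_ip_default_ttl (which is why the sysctl is newly exported above). The replacement helper presumably reduces to:

	/* Presumed shape of ip4_dst_hoplimit(): an explicitly configured
	 * RTAX_HOPLIMIT metric wins, otherwise fall back to the sysctl.
	 */
	static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
	{
		int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);

		if (hoplimit == 0)
			hoplimit = sysctl_ip_default_ttl;
		return hoplimit;
	}
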
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 3a6e1ec5e9ae..2b097752426b 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1191,13 +1191,13 @@ static int __init ic_dynamic(void)
1191 (ic_proto_enabled & IC_USE_DHCP) && 1191 (ic_proto_enabled & IC_USE_DHCP) &&
1192 ic_dhcp_msgtype != DHCPACK) { 1192 ic_dhcp_msgtype != DHCPACK) {
1193 ic_got_reply = 0; 1193 ic_got_reply = 0;
1194 printk(","); 1194 printk(KERN_CONT ",");
1195 continue; 1195 continue;
1196 } 1196 }
1197#endif /* IPCONFIG_DHCP */ 1197#endif /* IPCONFIG_DHCP */
1198 1198
1199 if (ic_got_reply) { 1199 if (ic_got_reply) {
1200 printk(" OK\n"); 1200 printk(KERN_CONT " OK\n");
1201 break; 1201 break;
1202 } 1202 }
1203 1203
@@ -1205,7 +1205,7 @@ static int __init ic_dynamic(void)
1205 continue; 1205 continue;
1206 1206
1207 if (! --retries) { 1207 if (! --retries) {
1208 printk(" timed out!\n"); 1208 printk(KERN_CONT " timed out!\n");
1209 break; 1209 break;
1210 } 1210 }
1211 1211
@@ -1215,7 +1215,7 @@ static int __init ic_dynamic(void)
1215 if (timeout > CONF_TIMEOUT_MAX) 1215 if (timeout > CONF_TIMEOUT_MAX)
1216 timeout = CONF_TIMEOUT_MAX; 1216 timeout = CONF_TIMEOUT_MAX;
1217 1217
1218 printk("."); 1218 printk(KERN_CONT ".");
1219 } 1219 }
1220 1220
1221#ifdef IPCONFIG_BOOTP 1221#ifdef IPCONFIG_BOOTP
@@ -1236,7 +1236,7 @@ static int __init ic_dynamic(void)
1236 ((ic_got_reply & IC_RARP) ? "RARP" 1236 ((ic_got_reply & IC_RARP) ? "RARP"
1237 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), 1237 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1238 &ic_servaddr); 1238 &ic_servaddr);
1239 printk("my address is %pI4\n", &ic_myaddr); 1239 printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
1240 1240
1241 return 0; 1241 return 0;
1242} 1242}
@@ -1468,19 +1468,19 @@ static int __init ip_auto_config(void)
1468 /* 1468 /*
1469 * Clue in the operator. 1469 * Clue in the operator.
1470 */ 1470 */
1471 printk("IP-Config: Complete:"); 1471 printk("IP-Config: Complete:\n");
1472 printk("\n device=%s", ic_dev->name); 1472 printk(" device=%s", ic_dev->name);
1473 printk(", addr=%pI4", &ic_myaddr); 1473 printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
1474 printk(", mask=%pI4", &ic_netmask); 1474 printk(KERN_CONT ", mask=%pI4", &ic_netmask);
1475 printk(", gw=%pI4", &ic_gateway); 1475 printk(KERN_CONT ", gw=%pI4", &ic_gateway);
1476 printk(",\n host=%s, domain=%s, nis-domain=%s", 1476 printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
1477 utsname()->nodename, ic_domain, utsname()->domainname); 1477 utsname()->nodename, ic_domain, utsname()->domainname);
1478 printk(",\n bootserver=%pI4", &ic_servaddr); 1478 printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
1479 printk(", rootserver=%pI4", &root_server_addr); 1479 printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
1480 printk(", rootpath=%s", root_server_path); 1480 printk(KERN_CONT ", rootpath=%s", root_server_path);
1481 if (ic_dev_mtu) 1481 if (ic_dev_mtu)
1482 printk(", mtu=%d", ic_dev_mtu); 1482 printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
1483 printk("\n"); 1483 printk(KERN_CONT "\n");
1484#endif /* !SILENT */ 1484#endif /* !SILENT */
1485 1485
1486 return 0; 1486 return 0;
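
The ipconfig.c changes only touch logging: when one logical line is assembled from several printk() calls, every continuation fragment needs KERN_CONT, or each call starts a new message at the default log level. For illustration (variable names are hypothetical):

	/* Illustrative only: fragments after the first carry KERN_CONT. */
	printk(KERN_INFO "IP-Config: got answer from %pI4", &server_addr);
	printk(KERN_CONT ", my address is %pI4\n", &my_addr);
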
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index cd300aaee78f..988f52fba54a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -463,13 +463,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
463 { 463 {
464 struct flowi fl = { 464 struct flowi fl = {
465 .oif = tunnel->parms.link, 465 .oif = tunnel->parms.link,
466 .nl_u = { 466 .fl4_dst = dst,
467 .ip4_u = { 467 .fl4_src = tiph->saddr,
468 .daddr = dst, 468 .fl4_tos = RT_TOS(tos),
469 .saddr = tiph->saddr,
470 .tos = RT_TOS(tos)
471 }
472 },
473 .proto = IPPROTO_IPIP 469 .proto = IPPROTO_IPIP
474 }; 470 };
475 471
@@ -589,13 +585,9 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
589 if (iph->daddr) { 585 if (iph->daddr) {
590 struct flowi fl = { 586 struct flowi fl = {
591 .oif = tunnel->parms.link, 587 .oif = tunnel->parms.link,
592 .nl_u = { 588 .fl4_dst = iph->daddr,
593 .ip4_u = { 589 .fl4_src = iph->saddr,
594 .daddr = iph->daddr, 590 .fl4_tos = RT_TOS(iph->tos),
595 .saddr = iph->saddr,
596 .tos = RT_TOS(iph->tos)
597 }
598 },
599 .proto = IPPROTO_IPIP 591 .proto = IPPROTO_IPIP
600 }; 592 };
601 struct rtable *rt; 593 struct rtable *rt;
@@ -921,3 +913,4 @@ static void __exit ipip_fini(void)
921module_init(ipip_init); 913module_init(ipip_init);
922module_exit(ipip_fini); 914module_exit(ipip_fini);
923MODULE_LICENSE("GPL"); 915MODULE_LICENSE("GPL");
916MODULE_ALIAS("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 86dd5691af46..3f3a9afd73e0 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1537,13 +1537,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1537 if (vif->flags & VIFF_TUNNEL) { 1537 if (vif->flags & VIFF_TUNNEL) {
1538 struct flowi fl = { 1538 struct flowi fl = {
1539 .oif = vif->link, 1539 .oif = vif->link,
1540 .nl_u = { 1540 .fl4_dst = vif->remote,
1541 .ip4_u = { 1541 .fl4_src = vif->local,
1542 .daddr = vif->remote, 1542 .fl4_tos = RT_TOS(iph->tos),
1543 .saddr = vif->local,
1544 .tos = RT_TOS(iph->tos)
1545 }
1546 },
1547 .proto = IPPROTO_IPIP 1543 .proto = IPPROTO_IPIP
1548 }; 1544 };
1549 1545
@@ -1553,12 +1549,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1553 } else { 1549 } else {
1554 struct flowi fl = { 1550 struct flowi fl = {
1555 .oif = vif->link, 1551 .oif = vif->link,
1556 .nl_u = { 1552 .fl4_dst = iph->daddr,
1557 .ip4_u = { 1553 .fl4_tos = RT_TOS(iph->tos),
1558 .daddr = iph->daddr,
1559 .tos = RT_TOS(iph->tos)
1560 }
1561 },
1562 .proto = IPPROTO_IPIP 1554 .proto = IPPROTO_IPIP
1563 }; 1555 };
1564 1556
@@ -1654,7 +1646,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1654 if (mrt->vif_table[vif].dev != skb->dev) { 1646 if (mrt->vif_table[vif].dev != skb->dev) {
1655 int true_vifi; 1647 int true_vifi;
1656 1648
1657 if (skb_rtable(skb)->fl.iif == 0) { 1649 if (rt_is_output_route(skb_rtable(skb))) {
1658 /* It is our own packet, looped back. 1650 /* It is our own packet, looped back.
1659 * Very complicated situation... 1651 * Very complicated situation...
1660 * 1652 *
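
ip_gre.c, ipmr.c and route.c replace open-coded tests of rt->fl.iif against zero with named predicates, which reads better and keeps the encoding in one place. Presumably the helpers are simple wrappers along these lines:

	/* Presumed shape of the new predicates: an input route was made
	 * by ip_route_input() and records the ingress ifindex in fl.iif.
	 */
	static inline bool rt_is_input_route(struct rtable *rt)
	{
		return rt->fl.iif != 0;
	}

	static inline bool rt_is_output_route(struct rtable *rt)
	{
		return rt->fl.iif == 0;
	}
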
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index d88a46c54fd1..994a1f29ebbc 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -31,10 +31,10 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. 31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
32 */ 32 */
33 if (addr_type == RTN_LOCAL) { 33 if (addr_type == RTN_LOCAL) {
34 fl.nl_u.ip4_u.daddr = iph->daddr; 34 fl.fl4_dst = iph->daddr;
35 if (type == RTN_LOCAL) 35 if (type == RTN_LOCAL)
36 fl.nl_u.ip4_u.saddr = iph->saddr; 36 fl.fl4_src = iph->saddr;
37 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); 37 fl.fl4_tos = RT_TOS(iph->tos);
38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
39 fl.mark = skb->mark; 39 fl.mark = skb->mark;
40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
@@ -47,7 +47,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
47 } else { 47 } else {
48 /* non-local src, find valid iif to satisfy 48 /* non-local src, find valid iif to satisfy
49 * rp-filter when calling ip_route_input. */ 49 * rp-filter when calling ip_route_input. */
50 fl.nl_u.ip4_u.daddr = iph->saddr; 50 fl.fl4_dst = iph->saddr;
51 if (ip_route_output_key(net, &rt, &fl) != 0) 51 if (ip_route_output_key(net, &rt, &fl) != 0)
52 return -1; 52 return -1;
53 53
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 48111594ee9b..19eb59d01037 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,15 +3,15 @@
3# 3#
4 4
5# objects for l3 independent conntrack 5# objects for l3 independent conntrack
6nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o 6nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) 7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
8ifeq ($(CONFIG_PROC_FS),y) 8ifeq ($(CONFIG_PROC_FS),y)
9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o 9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o 13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o 14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15 15
16# connection tracking 16# connection tracking
17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 43eec80c0e7c..1ff79e557f96 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -116,7 +116,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
116 if (ip_route_me_harder(nskb, addr_type)) 116 if (ip_route_me_harder(nskb, addr_type))
117 goto free_nskb; 117 goto free_nskb;
118 118
119 niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); 119 niph->ttl = ip4_dst_hoplimit(skb_dst(nskb));
120 120
121 /* "Never happens" */ 121 /* "Never happens" */
122 if (nskb->len > dst_mtu(skb_dst(nskb))) 122 if (nskb->len > dst_mtu(skb_dst(nskb)))
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1f85ef289895..a3d5ab786e81 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -549,10 +549,9 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
549 { 549 {
550 struct flowi fl = { .oif = ipc.oif, 550 struct flowi fl = { .oif = ipc.oif,
551 .mark = sk->sk_mark, 551 .mark = sk->sk_mark,
552 .nl_u = { .ip4_u = 552 .fl4_dst = daddr,
553 { .daddr = daddr, 553 .fl4_src = saddr,
554 .saddr = saddr, 554 .fl4_tos = tos,
555 .tos = tos } },
556 .proto = inet->hdrincl ? IPPROTO_RAW : 555 .proto = inet->hdrincl ? IPPROTO_RAW :
557 sk->sk_protocol, 556 sk->sk_protocol,
558 }; 557 };
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 93bfd95584f4..351dc4e85242 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -139,20 +139,26 @@ static unsigned long expires_ljiffies;
139 */ 139 */
140 140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
143static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
142static void ipv4_dst_destroy(struct dst_entry *dst); 144static void ipv4_dst_destroy(struct dst_entry *dst);
143static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb); 146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops); 148static int rt_garbage_collect(struct dst_ops *ops);
149 149
150static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
151 int how)
152{
153}
150 154
151static struct dst_ops ipv4_dst_ops = { 155static struct dst_ops ipv4_dst_ops = {
152 .family = AF_INET, 156 .family = AF_INET,
153 .protocol = cpu_to_be16(ETH_P_IP), 157 .protocol = cpu_to_be16(ETH_P_IP),
154 .gc = rt_garbage_collect, 158 .gc = rt_garbage_collect,
155 .check = ipv4_dst_check, 159 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss,
161 .default_mtu = ipv4_default_mtu,
156 .destroy = ipv4_dst_destroy, 162 .destroy = ipv4_dst_destroy,
157 .ifdown = ipv4_dst_ifdown, 163 .ifdown = ipv4_dst_ifdown,
158 .negative_advice = ipv4_negative_advice, 164 .negative_advice = ipv4_negative_advice,
@@ -381,8 +387,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
381 (__force u32)r->rt_gateway, 387 (__force u32)r->rt_gateway,
382 r->rt_flags, atomic_read(&r->dst.__refcnt), 388 r->rt_flags, atomic_read(&r->dst.__refcnt),
383 r->dst.__use, 0, (__force u32)r->rt_src, 389 r->dst.__use, 0, (__force u32)r->rt_src,
384 (dst_metric(&r->dst, RTAX_ADVMSS) ? 390 dst_metric_advmss(&r->dst) + 40,
385 (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
386 dst_metric(&r->dst, RTAX_WINDOW), 391 dst_metric(&r->dst, RTAX_WINDOW),
387 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 392 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
388 dst_metric(&r->dst, RTAX_RTTVAR)), 393 dst_metric(&r->dst, RTAX_RTTVAR)),
@@ -621,7 +626,7 @@ static inline int rt_fast_clean(struct rtable *rth)
621 /* Kill broadcast/multicast entries very aggressively, if they 626 /* Kill broadcast/multicast entries very aggressively, if they
622 collide in hash table with more useful entries */ 627 collide in hash table with more useful entries */
623 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 628 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
624 rth->fl.iif && rth->dst.rt_next; 629 rt_is_input_route(rth) && rth->dst.rt_next;
625} 630}
626 631
627static inline int rt_valuable(struct rtable *rth) 632static inline int rt_valuable(struct rtable *rth)
@@ -666,7 +671,7 @@ static inline u32 rt_score(struct rtable *rt)
666 if (rt_valuable(rt)) 671 if (rt_valuable(rt))
667 score |= (1<<31); 672 score |= (1<<31);
668 673
669 if (!rt->fl.iif || 674 if (rt_is_output_route(rt) ||
670 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) 675 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
671 score |= (1<<30); 676 score |= (1<<30);
672 677
@@ -682,17 +687,17 @@ static inline bool rt_caching(const struct net *net)
682static inline bool compare_hash_inputs(const struct flowi *fl1, 687static inline bool compare_hash_inputs(const struct flowi *fl1,
683 const struct flowi *fl2) 688 const struct flowi *fl2)
684{ 689{
685 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | 690 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
686 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | 691 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
687 (fl1->iif ^ fl2->iif)) == 0); 692 (fl1->iif ^ fl2->iif)) == 0);
688} 693}
689 694
690static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 695static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
691{ 696{
692 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | 697 return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
693 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | 698 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
694 (fl1->mark ^ fl2->mark) | 699 (fl1->mark ^ fl2->mark) |
695 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | 700 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
696 (fl1->oif ^ fl2->oif) | 701 (fl1->oif ^ fl2->oif) |
697 (fl1->iif ^ fl2->iif)) == 0; 702 (fl1->iif ^ fl2->iif)) == 0;
698} 703}
@@ -712,13 +717,15 @@ static inline int rt_is_expired(struct rtable *rth)
712 * Can be called by a softirq or a process. 717 * Can be called by a softirq or a process.
713 * In the latter case, we want to be rescheduled if necessary 718 * In the latter case, we want to be rescheduled if necessary
714 */ 719 */
715static void rt_do_flush(int process_context) 720static void rt_do_flush(struct net *net, int process_context)
716{ 721{
717 unsigned int i; 722 unsigned int i;
718 struct rtable *rth, *next; 723 struct rtable *rth, *next;
719 struct rtable * tail;
720 724
721 for (i = 0; i <= rt_hash_mask; i++) { 725 for (i = 0; i <= rt_hash_mask; i++) {
726 struct rtable __rcu **pprev;
727 struct rtable *list;
728
722 if (process_context && need_resched()) 729 if (process_context && need_resched())
723 cond_resched(); 730 cond_resched();
724 rth = rcu_dereference_raw(rt_hash_table[i].chain); 731 rth = rcu_dereference_raw(rt_hash_table[i].chain);
@@ -726,50 +733,32 @@ static void rt_do_flush(int process_context)
726 continue; 733 continue;
727 734
728 spin_lock_bh(rt_hash_lock_addr(i)); 735 spin_lock_bh(rt_hash_lock_addr(i));
729#ifdef CONFIG_NET_NS
730 {
731 struct rtable __rcu **prev;
732 struct rtable *p;
733 736
734 rth = rcu_dereference_protected(rt_hash_table[i].chain, 737 list = NULL;
738 pprev = &rt_hash_table[i].chain;
739 rth = rcu_dereference_protected(*pprev,
735 lockdep_is_held(rt_hash_lock_addr(i))); 740 lockdep_is_held(rt_hash_lock_addr(i)));
736 741
737 /* defer releasing the head of the list after spin_unlock */ 742 while (rth) {
738 for (tail = rth; tail; 743 next = rcu_dereference_protected(rth->dst.rt_next,
739 tail = rcu_dereference_protected(tail->dst.rt_next,
740 lockdep_is_held(rt_hash_lock_addr(i))))
741 if (!rt_is_expired(tail))
742 break;
743 if (rth != tail)
744 rt_hash_table[i].chain = tail;
745
746 /* call rt_free on entries after the tail requiring flush */
747 prev = &rt_hash_table[i].chain;
748 for (p = rcu_dereference_protected(*prev,
749 lockdep_is_held(rt_hash_lock_addr(i)));
750 p != NULL;
751 p = next) {
752 next = rcu_dereference_protected(p->dst.rt_next,
753 lockdep_is_held(rt_hash_lock_addr(i))); 744 lockdep_is_held(rt_hash_lock_addr(i)));
754 if (!rt_is_expired(p)) { 745
755 prev = &p->dst.rt_next; 746 if (!net ||
747 net_eq(dev_net(rth->dst.dev), net)) {
748 rcu_assign_pointer(*pprev, next);
749 rcu_assign_pointer(rth->dst.rt_next, list);
750 list = rth;
756 } else { 751 } else {
757 *prev = next; 752 pprev = &rth->dst.rt_next;
758 rt_free(p);
759 } 753 }
754 rth = next;
760 } 755 }
761 } 756
762#else
763 rth = rcu_dereference_protected(rt_hash_table[i].chain,
764 lockdep_is_held(rt_hash_lock_addr(i)));
765 rcu_assign_pointer(rt_hash_table[i].chain, NULL);
766 tail = NULL;
767#endif
768 spin_unlock_bh(rt_hash_lock_addr(i)); 757 spin_unlock_bh(rt_hash_lock_addr(i));
769 758
770 for (; rth != tail; rth = next) { 759 for (; list; list = next) {
771 next = rcu_dereference_protected(rth->dst.rt_next, 1); 760 next = rcu_dereference_protected(list->dst.rt_next, 1);
772 rt_free(rth); 761 rt_free(list);
773 } 762 }
774 } 763 }
775} 764}
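
The rewritten rt_do_flush() drops the CONFIG_NET_NS special case: under the per-bucket lock it unlinks every entry belonging to the target net (or every entry when net is NULL) onto a private list, then frees that list after the lock is released. The same detach-then-free shape in a self-contained miniature (plain C, stand-in types):

	struct node { struct node *next; int net_id; };

	/* Unlink matching nodes onto a private list; with a lock held this
	 * keeps the critical section short, since freeing happens later.
	 */
	static struct node *detach_matching(struct node **head, int net_id)
	{
		struct node **pprev = head, *victims = NULL, *n;

		while ((n = *pprev) != NULL) {
			if (n->net_id == net_id) {
				*pprev = n->next;	/* unlink from chain */
				n->next = victims;	/* push onto victim list */
				victims = n;
			} else {
				pprev = &n->next;
			}
		}
		return victims;		/* caller frees outside the lock */
	}
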
@@ -917,13 +906,13 @@ void rt_cache_flush(struct net *net, int delay)
917{ 906{
918 rt_cache_invalidate(net); 907 rt_cache_invalidate(net);
919 if (delay >= 0) 908 if (delay >= 0)
920 rt_do_flush(!in_softirq()); 909 rt_do_flush(net, !in_softirq());
921} 910}
922 911
923/* Flush previous cache invalidated entries from the cache */ 912/* Flush previous cache invalidated entries from the cache */
924void rt_cache_flush_batch(void) 913void rt_cache_flush_batch(struct net *net)
925{ 914{
926 rt_do_flush(!in_softirq()); 915 rt_do_flush(net, !in_softirq());
927} 916}
928 917
929static void rt_emergency_hash_rebuild(struct net *net) 918static void rt_emergency_hash_rebuild(struct net *net)
@@ -1124,7 +1113,7 @@ restart:
1124 */ 1113 */
1125 1114
1126 rt->dst.flags |= DST_NOCACHE; 1115 rt->dst.flags |= DST_NOCACHE;
1127 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1116 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1128 int err = arp_bind_neighbour(&rt->dst); 1117 int err = arp_bind_neighbour(&rt->dst);
1129 if (err) { 1118 if (err) {
1130 if (net_ratelimit()) 1119 if (net_ratelimit())
@@ -1222,7 +1211,7 @@ restart:
1222 /* Try to bind route to arp only if it is output 1211 /* Try to bind route to arp only if it is output
1223 route or unicast forwarding path. 1212 route or unicast forwarding path.
1224 */ 1213 */
1225 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1214 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1226 int err = arp_bind_neighbour(&rt->dst); 1215 int err = arp_bind_neighbour(&rt->dst);
1227 if (err) { 1216 if (err) {
1228 spin_unlock_bh(rt_hash_lock_addr(hash)); 1217 spin_unlock_bh(rt_hash_lock_addr(hash));
@@ -1287,7 +1276,7 @@ void rt_bind_peer(struct rtable *rt, int create)
1287{ 1276{
1288 struct inet_peer *peer; 1277 struct inet_peer *peer;
1289 1278
1290 peer = inet_getpeer(rt->rt_dst, create); 1279 peer = inet_getpeer_v4(rt->rt_dst, create);
1291 1280
1292 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) 1281 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1293 inet_putpeer(peer); 1282 inet_putpeer(peer);
@@ -1404,7 +1393,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1404 if (rth->fl.fl4_dst != daddr || 1393 if (rth->fl.fl4_dst != daddr ||
1405 rth->fl.fl4_src != skeys[i] || 1394 rth->fl.fl4_src != skeys[i] ||
1406 rth->fl.oif != ikeys[k] || 1395 rth->fl.oif != ikeys[k] ||
1407 rth->fl.iif != 0 || 1396 rt_is_input_route(rth) ||
1408 rt_is_expired(rth) || 1397 rt_is_expired(rth) ||
1409 !net_eq(dev_net(rth->dst.dev), net)) { 1398 !net_eq(dev_net(rth->dst.dev), net)) {
1410 rthp = &rth->dst.rt_next; 1399 rthp = &rth->dst.rt_next;
@@ -1433,8 +1422,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1433 rt->dst.child = NULL; 1422 rt->dst.child = NULL;
1434 if (rt->dst.dev) 1423 if (rt->dst.dev)
1435 dev_hold(rt->dst.dev); 1424 dev_hold(rt->dst.dev);
1436 if (rt->idev)
1437 in_dev_hold(rt->idev);
1438 rt->dst.obsolete = -1; 1425 rt->dst.obsolete = -1;
1439 rt->dst.lastuse = jiffies; 1426 rt->dst.lastuse = jiffies;
1440 rt->dst.path = &rt->dst; 1427 rt->dst.path = &rt->dst;
@@ -1666,7 +1653,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1666 rth->rt_dst != daddr || 1653 rth->rt_dst != daddr ||
1667 rth->rt_src != iph->saddr || 1654 rth->rt_src != iph->saddr ||
1668 rth->fl.oif != ikeys[k] || 1655 rth->fl.oif != ikeys[k] ||
1669 rth->fl.iif != 0 || 1656 rt_is_input_route(rth) ||
1670 dst_metric_locked(&rth->dst, RTAX_MTU) || 1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1671 !net_eq(dev_net(rth->dst.dev), net) || 1658 !net_eq(dev_net(rth->dst.dev), net) ||
1672 rt_is_expired(rth)) 1659 rt_is_expired(rth))
@@ -1686,11 +1673,14 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1686 if (mtu < dst_mtu(&rth->dst)) { 1673 if (mtu < dst_mtu(&rth->dst)) {
1687 dst_confirm(&rth->dst); 1674 dst_confirm(&rth->dst);
1688 if (mtu < ip_rt_min_pmtu) { 1675 if (mtu < ip_rt_min_pmtu) {
1676 u32 lock = dst_metric(&rth->dst,
1677 RTAX_LOCK);
1689 mtu = ip_rt_min_pmtu; 1678 mtu = ip_rt_min_pmtu;
1690 rth->dst.metrics[RTAX_LOCK-1] |= 1679 lock |= (1 << RTAX_MTU);
1691 (1 << RTAX_MTU); 1680 dst_metric_set(&rth->dst, RTAX_LOCK,
1681 lock);
1692 } 1682 }
1693 rth->dst.metrics[RTAX_MTU-1] = mtu; 1683 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1694 dst_set_expires(&rth->dst, 1684 dst_set_expires(&rth->dst,
1695 ip_rt_mtu_expires); 1685 ip_rt_mtu_expires);
1696 } 1686 }
@@ -1708,10 +1698,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1708 if (dst_mtu(dst) > mtu && mtu >= 68 && 1698 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1709 !(dst_metric_locked(dst, RTAX_MTU))) { 1699 !(dst_metric_locked(dst, RTAX_MTU))) {
1710 if (mtu < ip_rt_min_pmtu) { 1700 if (mtu < ip_rt_min_pmtu) {
1701 u32 lock = dst_metric(dst, RTAX_LOCK);
1711 mtu = ip_rt_min_pmtu; 1702 mtu = ip_rt_min_pmtu;
1712 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); 1703 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1713 } 1704 }
1714 dst->metrics[RTAX_MTU-1] = mtu; 1705 dst_metric_set(dst, RTAX_MTU, mtu);
1715 dst_set_expires(dst, ip_rt_mtu_expires); 1706 dst_set_expires(dst, ip_rt_mtu_expires);
1716 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); 1707 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1717 } 1708 }
@@ -1728,33 +1719,13 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1728{ 1719{
1729 struct rtable *rt = (struct rtable *) dst; 1720 struct rtable *rt = (struct rtable *) dst;
1730 struct inet_peer *peer = rt->peer; 1721 struct inet_peer *peer = rt->peer;
1731 struct in_device *idev = rt->idev;
1732 1722
1733 if (peer) { 1723 if (peer) {
1734 rt->peer = NULL; 1724 rt->peer = NULL;
1735 inet_putpeer(peer); 1725 inet_putpeer(peer);
1736 } 1726 }
1737
1738 if (idev) {
1739 rt->idev = NULL;
1740 in_dev_put(idev);
1741 }
1742} 1727}
1743 1728
1744static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1745 int how)
1746{
1747 struct rtable *rt = (struct rtable *) dst;
1748 struct in_device *idev = rt->idev;
1749 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1750 struct in_device *loopback_idev =
1751 in_dev_get(dev_net(dev)->loopback_dev);
1752 if (loopback_idev) {
1753 rt->idev = loopback_idev;
1754 in_dev_put(idev);
1755 }
1756 }
1757}
1758 1729
1759static void ipv4_link_failure(struct sk_buff *skb) 1730static void ipv4_link_failure(struct sk_buff *skb)
1760{ 1731{
@@ -1790,7 +1761,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1790 __be32 src; 1761 __be32 src;
1791 struct fib_result res; 1762 struct fib_result res;
1792 1763
1793 if (rt->fl.iif == 0) 1764 if (rt_is_output_route(rt))
1794 src = rt->rt_src; 1765 src = rt->rt_src;
1795 else { 1766 else {
1796 rcu_read_lock(); 1767 rcu_read_lock();
@@ -1814,38 +1785,55 @@ static void set_class_tag(struct rtable *rt, u32 tag)
1814} 1785}
1815#endif 1786#endif
1816 1787
1788static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1789{
1790 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1791
1792 if (advmss == 0) {
1793 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1794 ip_rt_min_advmss);
1795 if (advmss > 65535 - 40)
1796 advmss = 65535 - 40;
1797 }
1798 return advmss;
1799}
1800
1801static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1802{
1803 unsigned int mtu = dst->dev->mtu;
1804
1805 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1806 const struct rtable *rt = (const struct rtable *) dst;
1807
1808 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1809 mtu = 576;
1810 }
1811
1812 if (mtu > IP_MAX_MTU)
1813 mtu = IP_MAX_MTU;
1814
1815 return mtu;
1816}
1817
1817static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1818static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1818{ 1819{
1820 struct dst_entry *dst = &rt->dst;
1819 struct fib_info *fi = res->fi; 1821 struct fib_info *fi = res->fi;
1820 1822
1821 if (fi) { 1823 if (fi) {
1822 if (FIB_RES_GW(*res) && 1824 if (FIB_RES_GW(*res) &&
1823 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1824 rt->rt_gateway = FIB_RES_GW(*res); 1826 rt->rt_gateway = FIB_RES_GW(*res);
1825 memcpy(rt->dst.metrics, fi->fib_metrics, 1827 dst_import_metrics(dst, fi->fib_metrics);
1826 sizeof(rt->dst.metrics));
1827 if (fi->fib_mtu == 0) {
1828 rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1829 if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1830 rt->rt_gateway != rt->rt_dst &&
1831 rt->dst.dev->mtu > 576)
1832 rt->dst.metrics[RTAX_MTU-1] = 576;
1833 }
1834#ifdef CONFIG_NET_CLS_ROUTE 1828#ifdef CONFIG_NET_CLS_ROUTE
1835 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1836#endif 1830#endif
1837 } else 1831 }
1838 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; 1832
1839 1833 if (dst_mtu(dst) > IP_MAX_MTU)
1840 if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) 1834 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1841 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; 1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1842 if (dst_mtu(&rt->dst) > IP_MAX_MTU) 1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1843 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1844 if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1845 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1846 ip_rt_min_advmss);
1847 if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1848 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1849 1837
1850#ifdef CONFIG_NET_CLS_ROUTE 1838#ifdef CONFIG_NET_CLS_ROUTE
1851#ifdef CONFIG_IP_MULTIPLE_TABLES 1839#ifdef CONFIG_IP_MULTIPLE_TABLES
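
With this hunk, rt_set_nexthop() stops pre-seeding RTAX_MTU, RTAX_ADVMSS and
RTAX_HOPLIMIT; the new ipv4_default_advmss()/ipv4_default_mtu() helpers above
compute the defaults on demand instead. A stand-alone sketch of that
arithmetic (function names illustrative; constants assumed from this kernel:
IP_MAX_MTU is 0xFFF0, ip_rt_min_advmss defaults to 256, and 40 bytes covers
the IPv4 + TCP headers):

#include <stdio.h>

#define IP_MAX_MTU       0xFFF0U
#define IP_RT_MIN_ADVMSS 256U   /* assumed default of ip_rt_min_advmss */

/* Mirrors ipv4_default_advmss(): device MTU minus 40 bytes of headers,
 * floored at the minimum and capped below 64KB. */
static unsigned int default_advmss(unsigned int dev_mtu)
{
        unsigned int advmss = dev_mtu - 40;

        if (advmss < IP_RT_MIN_ADVMSS)
                advmss = IP_RT_MIN_ADVMSS;
        if (advmss > 65535 - 40)
                advmss = 65535 - 40;
        return advmss;
}

/* Mirrors ipv4_default_mtu(): a locked MTU on a gatewayed route falls
 * back to 576, and everything is capped at IP_MAX_MTU. */
static unsigned int default_mtu(unsigned int dev_mtu, int mtu_locked,
                                int via_gateway)
{
        unsigned int mtu = dev_mtu;

        if (mtu_locked && via_gateway && mtu > 576)
                mtu = 576;
        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;
        return mtu;
}

int main(void)
{
        printf("advmss(1500) = %u\n", default_advmss(1500));     /* 1460 */
        printf("mtu(65536)   = %u\n", default_mtu(65536, 0, 0)); /* 65520 */
        return 0;
}
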
@@ -1910,7 +1898,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1910 rth->fl.iif = dev->ifindex; 1898 rth->fl.iif = dev->ifindex;
1911 rth->dst.dev = init_net.loopback_dev; 1899 rth->dst.dev = init_net.loopback_dev;
1912 dev_hold(rth->dst.dev); 1900 dev_hold(rth->dst.dev);
1913 rth->idev = in_dev_get(rth->dst.dev);
1914 rth->fl.oif = 0; 1901 rth->fl.oif = 0;
1915 rth->rt_gateway = daddr; 1902 rth->rt_gateway = daddr;
1916 rth->rt_spec_dst= spec_dst; 1903 rth->rt_spec_dst= spec_dst;
@@ -2050,7 +2037,6 @@ static int __mkroute_input(struct sk_buff *skb,
2050 rth->fl.iif = in_dev->dev->ifindex; 2037 rth->fl.iif = in_dev->dev->ifindex;
2051 rth->dst.dev = (out_dev)->dev; 2038 rth->dst.dev = (out_dev)->dev;
2052 dev_hold(rth->dst.dev); 2039 dev_hold(rth->dst.dev);
2053 rth->idev = in_dev_get(rth->dst.dev);
2054 rth->fl.oif = 0; 2040 rth->fl.oif = 0;
2055 rth->rt_spec_dst= spec_dst; 2041 rth->rt_spec_dst= spec_dst;
2056 2042
@@ -2111,12 +2097,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2111{ 2097{
2112 struct fib_result res; 2098 struct fib_result res;
2113 struct in_device *in_dev = __in_dev_get_rcu(dev); 2099 struct in_device *in_dev = __in_dev_get_rcu(dev);
2114 struct flowi fl = { .nl_u = { .ip4_u = 2100 struct flowi fl = { .fl4_dst = daddr,
2115 { .daddr = daddr, 2101 .fl4_src = saddr,
2116 .saddr = saddr, 2102 .fl4_tos = tos,
2117 .tos = tos, 2103 .fl4_scope = RT_SCOPE_UNIVERSE,
2118 .scope = RT_SCOPE_UNIVERSE,
2119 } },
2120 .mark = skb->mark, 2104 .mark = skb->mark,
2121 .iif = dev->ifindex }; 2105 .iif = dev->ifindex };
2122 unsigned flags = 0; 2106 unsigned flags = 0;
@@ -2231,7 +2215,6 @@ local_input:
2231 rth->fl.iif = dev->ifindex; 2215 rth->fl.iif = dev->ifindex;
2232 rth->dst.dev = net->loopback_dev; 2216 rth->dst.dev = net->loopback_dev;
2233 dev_hold(rth->dst.dev); 2217 dev_hold(rth->dst.dev);
2234 rth->idev = in_dev_get(rth->dst.dev);
2235 rth->rt_gateway = daddr; 2218 rth->rt_gateway = daddr;
2236 rth->rt_spec_dst= spec_dst; 2219 rth->rt_spec_dst= spec_dst;
2237 rth->dst.input= ip_local_deliver; 2220 rth->dst.input= ip_local_deliver;
@@ -2417,9 +2400,6 @@ static int __mkroute_output(struct rtable **result,
2417 if (!rth) 2400 if (!rth)
2418 return -ENOBUFS; 2401 return -ENOBUFS;
2419 2402
2420 in_dev_hold(in_dev);
2421 rth->idev = in_dev;
2422
2423 atomic_set(&rth->dst.__refcnt, 1); 2403 atomic_set(&rth->dst.__refcnt, 1);
2424 rth->dst.flags= DST_HOST; 2404 rth->dst.flags= DST_HOST;
2425 if (IN_DEV_CONF_GET(in_dev, NOXFRM)) 2405 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
@@ -2506,14 +2486,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2506 const struct flowi *oldflp) 2486 const struct flowi *oldflp)
2507{ 2487{
2508 u32 tos = RT_FL_TOS(oldflp); 2488 u32 tos = RT_FL_TOS(oldflp);
2509 struct flowi fl = { .nl_u = { .ip4_u = 2489 struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2510 { .daddr = oldflp->fl4_dst, 2490 .fl4_src = oldflp->fl4_src,
2511 .saddr = oldflp->fl4_src, 2491 .fl4_tos = tos & IPTOS_RT_MASK,
2512 .tos = tos & IPTOS_RT_MASK, 2492 .fl4_scope = ((tos & RTO_ONLINK) ?
2513 .scope = ((tos & RTO_ONLINK) ? 2493 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2514 RT_SCOPE_LINK :
2515 RT_SCOPE_UNIVERSE),
2516 } },
2517 .mark = oldflp->mark, 2494 .mark = oldflp->mark,
2518 .iif = net->loopback_dev->ifindex, 2495 .iif = net->loopback_dev->ifindex,
2519 .oif = oldflp->oif }; 2496 .oif = oldflp->oif };
@@ -2700,7 +2677,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2700 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2677 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2701 if (rth->fl.fl4_dst == flp->fl4_dst && 2678 if (rth->fl.fl4_dst == flp->fl4_dst &&
2702 rth->fl.fl4_src == flp->fl4_src && 2679 rth->fl.fl4_src == flp->fl4_src &&
2703 rth->fl.iif == 0 && 2680 rt_is_output_route(rth) &&
2704 rth->fl.oif == flp->oif && 2681 rth->fl.oif == flp->oif &&
2705 rth->fl.mark == flp->mark && 2682 rth->fl.mark == flp->mark &&
2706 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2683 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
@@ -2756,7 +2733,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2756 new->__use = 1; 2733 new->__use = 1;
2757 new->input = dst_discard; 2734 new->input = dst_discard;
2758 new->output = dst_discard; 2735 new->output = dst_discard;
2759 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); 2736 dst_copy_metrics(new, &ort->dst);
2760 2737
2761 new->dev = ort->dst.dev; 2738 new->dev = ort->dst.dev;
2762 if (new->dev) 2739 if (new->dev)
@@ -2764,9 +2741,6 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2764 2741
2765 rt->fl = ort->fl; 2742 rt->fl = ort->fl;
2766 2743
2767 rt->idev = ort->idev;
2768 if (rt->idev)
2769 in_dev_hold(rt->idev);
2770 rt->rt_genid = rt_genid(net); 2744 rt->rt_genid = rt_genid(net);
2771 rt->rt_flags = ort->rt_flags; 2745 rt->rt_flags = ort->rt_flags;
2772 rt->rt_type = ort->rt_type; 2746 rt->rt_type = ort->rt_type;
@@ -2858,7 +2832,7 @@ static int rt_fill_info(struct net *net,
2858 if (rt->dst.tclassid) 2832 if (rt->dst.tclassid)
2859 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2833 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2860#endif 2834#endif
2861 if (rt->fl.iif) 2835 if (rt_is_input_route(rt))
2862 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2836 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2863 else if (rt->rt_src != rt->fl.fl4_src) 2837 else if (rt->rt_src != rt->fl.fl4_src)
2864 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2838 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
@@ -2866,7 +2840,7 @@ static int rt_fill_info(struct net *net,
2866 if (rt->rt_dst != rt->rt_gateway) 2840 if (rt->rt_dst != rt->rt_gateway)
2867 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2841 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2868 2842
2869 if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) 2843 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2870 goto nla_put_failure; 2844 goto nla_put_failure;
2871 2845
2872 if (rt->fl.mark) 2846 if (rt->fl.mark)
@@ -2883,7 +2857,7 @@ static int rt_fill_info(struct net *net,
2883 } 2857 }
2884 } 2858 }
2885 2859
2886 if (rt->fl.iif) { 2860 if (rt_is_input_route(rt)) {
2887#ifdef CONFIG_IP_MROUTE 2861#ifdef CONFIG_IP_MROUTE
2888 __be32 dst = rt->rt_dst; 2862 __be32 dst = rt->rt_dst;
2889 2863
@@ -2978,13 +2952,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2978 err = -rt->dst.error; 2952 err = -rt->dst.error;
2979 } else { 2953 } else {
2980 struct flowi fl = { 2954 struct flowi fl = {
2981 .nl_u = { 2955 .fl4_dst = dst,
2982 .ip4_u = { 2956 .fl4_src = src,
2983 .daddr = dst, 2957 .fl4_tos = rtm->rtm_tos,
2984 .saddr = src,
2985 .tos = rtm->rtm_tos,
2986 },
2987 },
2988 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, 2958 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2989 .mark = mark, 2959 .mark = mark,
2990 }; 2960 };
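
All of the flowi conversions in this file (and in syncookies.c and udp.c
below) are a spelling change only: the flat fl4_*/fl_ip_* names are, at this
point in the series, preprocessor aliases into the old nested unions of
struct flowi. A reduced user-space model (layout abbreviated and types
simplified; the real struct carries more fields) showing that both
initializer styles fill the same members:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct flowi {
        int oif, iif;
        uint32_t mark;
        union {
                struct { uint32_t daddr, saddr; uint8_t tos, scope; } ip4_u;
        } nl_u;
        union {
                struct { uint16_t sport, dport; } ports;
        } uli_u;
#define fl4_dst     nl_u.ip4_u.daddr
#define fl4_src     nl_u.ip4_u.saddr
#define fl4_tos     nl_u.ip4_u.tos
#define fl4_scope   nl_u.ip4_u.scope
#define fl_ip_sport uli_u.ports.sport
#define fl_ip_dport uli_u.ports.dport
};

int main(void)
{
        struct flowi a = { .nl_u = { .ip4_u = { .daddr = 1, .saddr = 2, .tos = 3 } },
                           .uli_u = { .ports = { .sport = 80, .dport = 443 } } };
        struct flowi b = { .fl4_dst = 1, .fl4_src = 2, .fl4_tos = 3,
                           .fl_ip_sport = 80, .fl_ip_dport = 443 };

        assert(a.fl4_dst == b.fl4_dst && a.fl4_tos == b.fl4_tos &&
               a.fl_ip_sport == b.fl_ip_sport && a.fl_ip_dport == b.fl_ip_dport);
        puts("both initializer styles set the same members");
        return 0;
}
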
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650cace2180d..47519205a014 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -346,17 +346,14 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
346 */ 346 */
347 { 347 {
348 struct flowi fl = { .mark = sk->sk_mark, 348 struct flowi fl = { .mark = sk->sk_mark,
349 .nl_u = { .ip4_u = 349 .fl4_dst = ((opt && opt->srr) ?
350 { .daddr = ((opt && opt->srr) ? 350 opt->faddr : ireq->rmt_addr),
351 opt->faddr : 351 .fl4_src = ireq->loc_addr,
352 ireq->rmt_addr), 352 .fl4_tos = RT_CONN_FLAGS(sk),
353 .saddr = ireq->loc_addr,
354 .tos = RT_CONN_FLAGS(sk) } },
355 .proto = IPPROTO_TCP, 353 .proto = IPPROTO_TCP,
356 .flags = inet_sk_flowi_flags(sk), 354 .flags = inet_sk_flowi_flags(sk),
357 .uli_u = { .ports = 355 .fl_ip_sport = th->dest,
358 { .sport = th->dest, 356 .fl_ip_dport = th->source };
359 .dport = th->source } } };
360 security_req_classify_flow(req, &fl); 357 security_req_classify_flow(req, &fl);
361 if (ip_route_output_key(sock_net(sk), &rt, &fl)) { 358 if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
362 reqsk_free(req); 359 reqsk_free(req);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1b4ec21497a4..1a456652086b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,6 +28,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
28static int ip_local_port_range_max[] = { 65535, 65535 }; 28static int ip_local_port_range_max[] = { 65535, 65535 };
29static int tcp_adv_win_scale_min = -31; 29static int tcp_adv_win_scale_min = -31;
30static int tcp_adv_win_scale_max = 31; 30static int tcp_adv_win_scale_max = 31;
31static int ip_ttl_min = 1;
32static int ip_ttl_max = 255;
31 33
32/* Update system visible IP port range */ 34/* Update system visible IP port range */
33static void set_local_port_range(int range[2]) 35static void set_local_port_range(int range[2])
@@ -155,8 +157,9 @@ static struct ctl_table ipv4_table[] = {
155 .data = &sysctl_ip_default_ttl, 157 .data = &sysctl_ip_default_ttl,
156 .maxlen = sizeof(int), 158 .maxlen = sizeof(int),
157 .mode = 0644, 159 .mode = 0644,
158 .proc_handler = ipv4_doint_and_flush, 160 .proc_handler = proc_dointvec_minmax,
159 .extra2 = &init_net, 161 .extra1 = &ip_ttl_min,
162 .extra2 = &ip_ttl_max,
160 }, 163 },
161 { 164 {
162 .procname = "ip_no_pmtu_disc", 165 .procname = "ip_no_pmtu_disc",
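
Two things change for ip_default_ttl here: the handler no longer needs to
flush the route cache (the default TTL is now read at transmit time through
ip4_dst_hoplimit(), see the note under xfrm4_mode_tunnel.c below, instead of
being cached in each route's hoplimit metric), and writes are now range
checked against [1, 255]. A toy model of proc_dointvec_minmax's clamping
semantics (names illustrative):

#include <stdio.h>

/* A write outside [*min, *max] is rejected with -EINVAL and the old
 * value is kept; in-range writes are stored. */
static int clamp_write(int *data, int val, const int *min, const int *max)
{
        if ((min && val < *min) || (max && val > *max))
                return -22;     /* -EINVAL */
        *data = val;
        return 0;
}

int main(void)
{
        int ttl = 64, ttl_min = 1, ttl_max = 255;

        printf("write 0   -> %d (ttl=%d)\n",
               clamp_write(&ttl, 0, &ttl_min, &ttl_max), ttl);   /* -22, 64 */
        printf("write 128 -> %d (ttl=%d)\n",
               clamp_write(&ttl, 128, &ttl_min, &ttl_max), ttl); /* 0, 128 */
        return 0;
}
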
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f15c36a706ec..6c11eece262c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1193,7 +1193,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1193 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1193 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1194 1194
1195 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), 1195 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1196 KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", 1196 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1197 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); 1197 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1198#endif 1198#endif
1199 1199
@@ -1477,10 +1477,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1477 * shouldn't happen. 1477 * shouldn't happen.
1478 */ 1478 */
1479 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), 1479 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1480 KERN_INFO "recvmsg bug: copied %X " 1480 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1481 "seq %X rcvnxt %X fl %X\n", *seq, 1481 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1482 TCP_SKB_CB(skb)->seq, tp->rcv_nxt, 1482 flags))
1483 flags))
1484 break; 1483 break;
1485 1484
1486 offset = *seq - TCP_SKB_CB(skb)->seq; 1485 offset = *seq - TCP_SKB_CB(skb)->seq;
@@ -1490,10 +1489,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1490 goto found_ok_skb; 1489 goto found_ok_skb;
1491 if (tcp_hdr(skb)->fin) 1490 if (tcp_hdr(skb)->fin)
1492 goto found_fin_ok; 1491 goto found_fin_ok;
1493 WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " 1492 WARN(!(flags & MSG_PEEK),
1494 "copied %X seq %X rcvnxt %X fl %X\n", 1493 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1495 *seq, TCP_SKB_CB(skb)->seq, 1494 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1496 tp->rcv_nxt, flags);
1497 } 1495 }
1498 1496
1499 /* Well, if we have backlog, try to process it now yet. */ 1497 /* Well, if we have backlog, try to process it now yet. */
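
The KERN_INFO removals are more than cosmetic: WARN()'s format string is
printed in the middle of the generated "WARNING: ..." report, so an embedded
log-level marker is emitted as literal text rather than interpreted. A toy
user-space illustration of that failure mode (output shapes approximate):

#include <stdio.h>

#define KERN_INFO "<6>"  /* what the kernel's level prefix expanded to */

int main(void)
{
        /* The prefix is only meaningful at the very start of a printk();
         * inside a WARN() report it just shows up verbatim: */
        printf("WARNING: at net/ipv4/tcp.c:1196 tcp_cleanup_rbuf()\n");
        printf(KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
               0x10, 0x20, 0x30);
        return 0;
}
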
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6d8ab1c4efc3..2549b29b062d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -734,7 +734,7 @@ void tcp_update_metrics(struct sock *sk)
734 * Reset our results. 734 * Reset our results.
735 */ 735 */
736 if (!(dst_metric_locked(dst, RTAX_RTT))) 736 if (!(dst_metric_locked(dst, RTAX_RTT)))
737 dst->metrics[RTAX_RTT - 1] = 0; 737 dst_metric_set(dst, RTAX_RTT, 0);
738 return; 738 return;
739 } 739 }
740 740
@@ -776,34 +776,38 @@ void tcp_update_metrics(struct sock *sk)
776 if (dst_metric(dst, RTAX_SSTHRESH) && 776 if (dst_metric(dst, RTAX_SSTHRESH) &&
777 !dst_metric_locked(dst, RTAX_SSTHRESH) && 777 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
778 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) 778 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
779 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; 779 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
780 if (!dst_metric_locked(dst, RTAX_CWND) && 780 if (!dst_metric_locked(dst, RTAX_CWND) &&
781 tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) 781 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
782 dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; 782 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
783 } else if (tp->snd_cwnd > tp->snd_ssthresh && 783 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
784 icsk->icsk_ca_state == TCP_CA_Open) { 784 icsk->icsk_ca_state == TCP_CA_Open) {
785 /* Cong. avoidance phase, cwnd is reliable. */ 785 /* Cong. avoidance phase, cwnd is reliable. */
786 if (!dst_metric_locked(dst, RTAX_SSTHRESH)) 786 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
787 dst->metrics[RTAX_SSTHRESH-1] = 787 dst_metric_set(dst, RTAX_SSTHRESH,
788 max(tp->snd_cwnd >> 1, tp->snd_ssthresh); 788 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
789 if (!dst_metric_locked(dst, RTAX_CWND)) 789 if (!dst_metric_locked(dst, RTAX_CWND))
790 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; 790 dst_metric_set(dst, RTAX_CWND,
791 (dst_metric(dst, RTAX_CWND) +
792 tp->snd_cwnd) >> 1);
791 } else { 793 } else {
792 /* Else slow start did not finish, cwnd is non-sense, 794 /* Else slow start did not finish, cwnd is non-sense,
793 ssthresh may be also invalid. 795 ssthresh may be also invalid.
794 */ 796 */
795 if (!dst_metric_locked(dst, RTAX_CWND)) 797 if (!dst_metric_locked(dst, RTAX_CWND))
796 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; 798 dst_metric_set(dst, RTAX_CWND,
799 (dst_metric(dst, RTAX_CWND) +
800 tp->snd_ssthresh) >> 1);
797 if (dst_metric(dst, RTAX_SSTHRESH) && 801 if (dst_metric(dst, RTAX_SSTHRESH) &&
798 !dst_metric_locked(dst, RTAX_SSTHRESH) && 802 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
799 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) 803 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
800 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; 804 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
801 } 805 }
802 806
803 if (!dst_metric_locked(dst, RTAX_REORDERING)) { 807 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
804 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && 808 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
805 tp->reordering != sysctl_tcp_reordering) 809 tp->reordering != sysctl_tcp_reordering)
806 dst->metrics[RTAX_REORDERING-1] = tp->reordering; 810 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
807 } 811 }
808 } 812 }
809} 813}
@@ -912,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk)
912 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 916 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
913 } 917 }
914 tcp_set_rto(sk); 918 tcp_set_rto(sk);
915 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) 919 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
916 goto reset;
917
918cwnd:
919 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
920 tp->snd_cwnd_stamp = tcp_time_stamp;
921 return;
922
923reset: 920reset:
924 /* Play conservative. If timestamps are not 921 /* Play conservative. If timestamps are not
925 * supported, TCP will fail to recalculate correct 922 * supported, TCP will fail to recalculate correct
926 * rtt, if initial rto is too small. FORGET ALL AND RESET! 923 * rtt, if initial rto is too small. FORGET ALL AND RESET!
927 */ 924 */
928 if (!tp->rx_opt.saw_tstamp && tp->srtt) { 925 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
929 tp->srtt = 0; 926 tp->srtt = 0;
930 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; 927 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
931 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 928 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
929 }
932 } 930 }
933 goto cwnd; 931 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
932 tp->snd_cwnd_stamp = tcp_time_stamp;
934} 933}
935 934
936static void tcp_update_reordering(struct sock *sk, const int metric, 935static void tcp_update_reordering(struct sock *sk, const int metric,
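
Every metrics write in this file now goes through dst_metric_set(). At this
point in the series the accessors are thin wrappers over the same array
(shapes roughly as added to include/net/dst.h, modeled here with simplified
types), but funneling all access through them is what allows later patches to
change the underlying storage, for example to shared copy-on-write metrics,
without revisiting these call sites:

#include <stdint.h>

#define RTAX_MAX 16     /* illustrative; the real rtnetlink enum differs */

struct dst_entry {
        uint32_t metrics[RTAX_MAX];
        /* ... */
};

static inline uint32_t dst_metric_raw(const struct dst_entry *dst, int metric)
{
        return dst->metrics[metric - 1];
}

static inline void dst_metric_set(struct dst_entry *dst, int metric,
                                  uint32_t val)
{
        dst->metrics[metric - 1] = val;
}
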
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d978bb2f748b..856f68466d49 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1210,12 +1210,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1210}; 1210};
1211#endif 1211#endif
1212 1212
1213static struct timewait_sock_ops tcp_timewait_sock_ops = {
1214 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1215 .twsk_unique = tcp_twsk_unique,
1216 .twsk_destructor= tcp_twsk_destructor,
1217};
1218
1219int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1213int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1220{ 1214{
1221 struct tcp_extend_values tmp_ext; 1215 struct tcp_extend_values tmp_ext;
@@ -1347,7 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1347 tcp_death_row.sysctl_tw_recycle && 1341 tcp_death_row.sysctl_tw_recycle &&
1348 (dst = inet_csk_route_req(sk, req)) != NULL && 1342 (dst = inet_csk_route_req(sk, req)) != NULL &&
1349 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1343 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1350 peer->v4daddr == saddr) { 1344 peer->daddr.a4 == saddr) {
1351 inet_peer_refcheck(peer); 1345 inet_peer_refcheck(peer);
1352 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1346 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1353 (s32)(peer->tcp_ts - req->ts_recent) > 1347 (s32)(peer->tcp_ts - req->ts_recent) >
@@ -1442,7 +1436,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1442 1436
1443 tcp_mtup_init(newsk); 1437 tcp_mtup_init(newsk);
1444 tcp_sync_mss(newsk, dst_mtu(dst)); 1438 tcp_sync_mss(newsk, dst_mtu(dst));
1445 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 1439 newtp->advmss = dst_metric_advmss(dst);
1446 if (tcp_sk(sk)->rx_opt.user_mss && 1440 if (tcp_sk(sk)->rx_opt.user_mss &&
1447 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) 1441 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1448 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1442 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
@@ -1763,64 +1757,40 @@ do_time_wait:
1763 goto discard_it; 1757 goto discard_it;
1764} 1758}
1765 1759
1766/* VJ's idea. Save last timestamp seen from this destination 1760struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1767 * and hold it at least for normal timewait interval to use for duplicate
1768 * segment detection in subsequent connections, before they enter synchronized
1769 * state.
1770 */
1771
1772int tcp_v4_remember_stamp(struct sock *sk)
1773{ 1761{
1762 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1774 struct inet_sock *inet = inet_sk(sk); 1763 struct inet_sock *inet = inet_sk(sk);
1775 struct tcp_sock *tp = tcp_sk(sk); 1764 struct inet_peer *peer;
1776 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1777 struct inet_peer *peer = NULL;
1778 int release_it = 0;
1779 1765
1780 if (!rt || rt->rt_dst != inet->inet_daddr) { 1766 if (!rt || rt->rt_dst != inet->inet_daddr) {
1781 peer = inet_getpeer(inet->inet_daddr, 1); 1767 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1782 release_it = 1; 1768 *release_it = true;
1783 } else { 1769 } else {
1784 if (!rt->peer) 1770 if (!rt->peer)
1785 rt_bind_peer(rt, 1); 1771 rt_bind_peer(rt, 1);
1786 peer = rt->peer; 1772 peer = rt->peer;
1773 *release_it = false;
1787 } 1774 }
1788 1775
1789 if (peer) { 1776 return peer;
1790 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1791 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1792 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1793 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1794 peer->tcp_ts = tp->rx_opt.ts_recent;
1795 }
1796 if (release_it)
1797 inet_putpeer(peer);
1798 return 1;
1799 }
1800
1801 return 0;
1802} 1777}
1803EXPORT_SYMBOL(tcp_v4_remember_stamp); 1778EXPORT_SYMBOL(tcp_v4_get_peer);
1804 1779
1805int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) 1780void *tcp_v4_tw_get_peer(struct sock *sk)
1806{ 1781{
1807 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); 1782 struct inet_timewait_sock *tw = inet_twsk(sk);
1808
1809 if (peer) {
1810 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1811
1812 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1813 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1814 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1815 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1816 peer->tcp_ts = tcptw->tw_ts_recent;
1817 }
1818 inet_putpeer(peer);
1819 return 1;
1820 }
1821 1783
1822 return 0; 1784 return inet_getpeer_v4(tw->tw_daddr, 1);
1823} 1785}
1786EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1787
1788static struct timewait_sock_ops tcp_timewait_sock_ops = {
1789 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1790 .twsk_unique = tcp_twsk_unique,
1791 .twsk_destructor= tcp_twsk_destructor,
1792 .twsk_getpeer = tcp_v4_tw_get_peer,
1793};
1824 1794
1825const struct inet_connection_sock_af_ops ipv4_specific = { 1795const struct inet_connection_sock_af_ops ipv4_specific = {
1826 .queue_xmit = ip_queue_xmit, 1796 .queue_xmit = ip_queue_xmit,
@@ -1828,7 +1798,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1828 .rebuild_header = inet_sk_rebuild_header, 1798 .rebuild_header = inet_sk_rebuild_header,
1829 .conn_request = tcp_v4_conn_request, 1799 .conn_request = tcp_v4_conn_request,
1830 .syn_recv_sock = tcp_v4_syn_recv_sock, 1800 .syn_recv_sock = tcp_v4_syn_recv_sock,
1831 .remember_stamp = tcp_v4_remember_stamp, 1801 .get_peer = tcp_v4_get_peer,
1832 .net_header_len = sizeof(struct iphdr), 1802 .net_header_len = sizeof(struct iphdr),
1833 .setsockopt = ip_setsockopt, 1803 .setsockopt = ip_setsockopt,
1834 .getsockopt = ip_getsockopt, 1804 .getsockopt = ip_getsockopt,
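
Two related interface changes meet in this file: the inetpeer cache is now
keyed by an address union (hence peer->daddr.a4 and inet_getpeer_v4()), and
the AF-specific ->remember_stamp hook becomes a plain ->get_peer lookup so
the timestamp-caching policy can move into protocol-independent code (see
tcp_minisocks.c below). Approximate shapes, as introduced by this series in
include/net/inetpeer.h:

struct inetpeer_addr {
        union {
                __be32 a4;
                __be32 a6[4];
        };
        __u16 family;
};

static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
{
        struct inetpeer_addr daddr;

        daddr.a4 = v4daddr;
        daddr.family = AF_INET;
        return inet_getpeer(&daddr, create);
}
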
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a66735f75963..80b1f80759ab 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,6 +49,56 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static int tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return 1;
76 }
77
78 return 0;
79}
80
81static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return 1;
98 }
99 return 0;
100}
101
52static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 102static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
53{ 103{
54 if (seq == s_win) 104 if (seq == s_win)
@@ -149,14 +199,9 @@ kill_with_rst:
149 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 199 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
150 } 200 }
151 201
152 /* I am shamed, but failed to make it more elegant. 202 if (tcp_death_row.sysctl_tw_recycle &&
153 * Yes, it is direct reference to IP, which is impossible 203 tcptw->tw_ts_recent_stamp &&
154 * to generalize to IPv6. Taking into account that IPv6 204 tcp_tw_remember_stamp(tw))
155 * do not understand recycling in any case, it not
156 * a big problem in practice. --ANK */
157 if (tw->tw_family == AF_INET &&
158 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
159 tcp_v4_tw_remember_stamp(tw))
160 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, 205 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
161 TCP_TIMEWAIT_LEN); 206 TCP_TIMEWAIT_LEN);
162 else 207 else
@@ -274,7 +319,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
274 int recycle_ok = 0; 319 int recycle_ok = 0;
275 320
276 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
277 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); 322 recycle_ok = tcp_remember_stamp(sk);
278 323
279 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 324 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
280 tw = inet_twsk_alloc(sk, state); 325 tw = inet_twsk_alloc(sk, state);
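
With the logic above moved out of tcp_ipv4.c, nothing in this file references
IPv4 directly any more: tcp_remember_stamp() obtains its peer through
icsk_af_ops->get_peer(), and tcp_tw_remember_stamp() through the new
twsk_getpeer operation dispatched via timewait_sock_ops, which is what
retires the old "impossible to generalize to IPv6" caveat. The dispatch
helper is roughly (approximate shape, as in include/net/timewait_sock.h):

static inline struct inet_peer *twsk_getpeer(struct sock *sk)
{
        if (sk->sk_prot->twsk_prot->twsk_getpeer)
                return sk->sk_prot->twsk_prot->twsk_getpeer(sk);
        return NULL;
}
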
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 61c2463e2753..dc7c096ddfef 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -55,7 +55,7 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
55int sysctl_tcp_tso_win_divisor __read_mostly = 3; 55int sysctl_tcp_tso_win_divisor __read_mostly = 3;
56 56
57int sysctl_tcp_mtu_probing __read_mostly = 0; 57int sysctl_tcp_mtu_probing __read_mostly = 0;
58int sysctl_tcp_base_mss __read_mostly = 512; 58int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
59 59
60/* By default, RFC2861 behavior. */ 60/* By default, RFC2861 behavior. */
61int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 61int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
@@ -119,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk)
119 struct dst_entry *dst = __sk_dst_get(sk); 119 struct dst_entry *dst = __sk_dst_get(sk);
120 int mss = tp->advmss; 120 int mss = tp->advmss;
121 121
122 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { 122 if (dst) {
123 mss = dst_metric(dst, RTAX_ADVMSS); 123 unsigned int metric = dst_metric_advmss(dst);
124 tp->advmss = mss; 124
125 if (metric < mss) {
126 mss = metric;
127 tp->advmss = mss;
128 }
125 } 129 }
126 130
127 return (__u16)mss; 131 return (__u16)mss;
@@ -224,10 +228,15 @@ void tcp_select_initial_window(int __space, __u32 mss,
224 } 228 }
225 } 229 }
226 230
227 /* Set initial window to value enough for senders, following RFC5681. */ 231 /* Set initial window to a value enough for senders starting with
232 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
233 * a limit on the initial window when mss is larger than 1460.
234 */
228 if (mss > (1 << *rcv_wscale)) { 235 if (mss > (1 << *rcv_wscale)) {
229 int init_cwnd = rfc3390_bytes_to_packets(mss); 236 int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
230 237 if (mss > 1460)
238 init_cwnd =
239 max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
231 /* when initializing use the value from init_rcv_wnd 240 /* when initializing use the value from init_rcv_wnd
232 * rather than the default from above 241 * rather than the default from above
233 */ 242 */
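
The initial receive window thus moves from the RFC 3390 segment count to a
default of TCP_DEFAULT_INIT_RCVWND (10) segments, scaled down for large MSS
values so the byte budget stays near 10 * 1460, but never below 2 segments.
A stand-alone sketch of the resulting segment counts (function name
illustrative):

#include <stdio.h>

#define TCP_DEFAULT_INIT_RCVWND 10      /* as defined by this series */

/* Mirrors the new clamp in tcp_select_initial_window(). */
static unsigned int init_rcvwnd_segs(unsigned int mss)
{
        unsigned int init_cwnd = TCP_DEFAULT_INIT_RCVWND;

        if (mss > 1460) {
                init_cwnd = (1460 * TCP_DEFAULT_INIT_RCVWND) / mss;
                if (init_cwnd < 2)
                        init_cwnd = 2;
        }
        return init_cwnd;
}

int main(void)
{
        const unsigned int mss[] = { 536, 1460, 4380, 9000 };
        unsigned int i;

        for (i = 0; i < sizeof(mss) / sizeof(mss[0]); i++)
                printf("mss %4u -> %u segments\n", mss[i],
                       init_rcvwnd_segs(mss[i]));
        return 0;       /* prints 10, 10, 3, 2 */
}
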
@@ -824,8 +833,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
824 &md5); 833 &md5);
825 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); 834 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
826 835
827 if (tcp_packets_in_flight(tp) == 0) 836 if (tcp_packets_in_flight(tp) == 0) {
828 tcp_ca_event(sk, CA_EVENT_TX_START); 837 tcp_ca_event(sk, CA_EVENT_TX_START);
838 skb->ooo_okay = 1;
839 } else
840 skb->ooo_okay = 0;
829 841
830 skb_push(skb, tcp_header_size); 842 skb_push(skb, tcp_header_size);
831 skb_reset_transport_header(skb); 843 skb_reset_transport_header(skb);
@@ -2419,7 +2431,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2419 2431
2420 skb_dst_set(skb, dst_clone(dst)); 2432 skb_dst_set(skb, dst_clone(dst));
2421 2433
2422 mss = dst_metric(dst, RTAX_ADVMSS); 2434 mss = dst_metric_advmss(dst);
2423 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2435 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2424 mss = tp->rx_opt.user_mss; 2436 mss = tp->rx_opt.user_mss;
2425 2437
@@ -2553,7 +2565,7 @@ static void tcp_connect_init(struct sock *sk)
2553 2565
2554 if (!tp->window_clamp) 2566 if (!tp->window_clamp)
2555 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2567 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2556 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 2568 tp->advmss = dst_metric_advmss(dst);
2557 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) 2569 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2558 tp->advmss = tp->rx_opt.user_mss; 2570 tp->advmss = tp->rx_opt.user_mss;
2559 2571
@@ -2596,6 +2608,7 @@ int tcp_connect(struct sock *sk)
2596{ 2608{
2597 struct tcp_sock *tp = tcp_sk(sk); 2609 struct tcp_sock *tp = tcp_sk(sk);
2598 struct sk_buff *buff; 2610 struct sk_buff *buff;
2611 int err;
2599 2612
2600 tcp_connect_init(sk); 2613 tcp_connect_init(sk);
2601 2614
@@ -2618,7 +2631,9 @@ int tcp_connect(struct sock *sk)
2618 sk->sk_wmem_queued += buff->truesize; 2631 sk->sk_wmem_queued += buff->truesize;
2619 sk_mem_charge(sk, buff->truesize); 2632 sk_mem_charge(sk, buff->truesize);
2620 tp->packets_out += tcp_skb_pcount(buff); 2633 tp->packets_out += tcp_skb_pcount(buff);
2621 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); 2634 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2635 if (err == -ECONNREFUSED)
2636 return err;
2622 2637
2623 /* We change tp->snd_nxt after the tcp_transmit_skb() call 2638 /* We change tp->snd_nxt after the tcp_transmit_skb() call
2624 * in order to make this packet get counted in tcpOutSegs. 2639 * in order to make this packet get counted in tcpOutSegs.
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 6211e2114173..85ee7eb7e38e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -154,7 +154,7 @@ static int tcpprobe_sprint(char *tbuf, int n)
154 struct timespec tv 154 struct timespec tv
155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
156 156
157 return snprintf(tbuf, n, 157 return scnprintf(tbuf, n,
158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", 158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
159 (unsigned long) tv.tv_sec, 159 (unsigned long) tv.tv_sec,
160 (unsigned long) tv.tv_nsec, 160 (unsigned long) tv.tv_nsec,
@@ -174,7 +174,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
174 return -EINVAL; 174 return -EINVAL;
175 175
176 while (cnt < len) { 176 while (cnt < len) {
177 char tbuf[128]; 177 char tbuf[164];
178 int width; 178 int width;
179 179
180 /* Wait for data in buffer */ 180 /* Wait for data in buffer */
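
Both hunks guard against the same overflow: a fully populated probe line can
exceed 128 bytes, and snprintf() returns the length that would have been
written, so using that return value to advance the copy cursor can run past
the end of tbuf on truncation. The kernel's scnprintf() instead returns the
bytes actually stored. A user-space model of the difference (helper name
illustrative):

#include <stdarg.h>
#include <stdio.h>

/* Model of scnprintf(): like snprintf(), but the return value is the
 * number of characters actually placed in buf. */
static int scn(char *buf, size_t size, const char *fmt, ...)
{
        va_list args;
        int i;

        if (size == 0)
                return 0;
        va_start(args, fmt);
        i = vsnprintf(buf, size, fmt, args);
        va_end(args);
        return i >= (int)size ? (int)size - 1 : i;
}

int main(void)
{
        char buf[8];

        printf("snprintf: %d\n", snprintf(buf, sizeof(buf), "0123456789")); /* 10 */
        printf("scn:      %d\n", scn(buf, sizeof(buf), "0123456789"));      /* 7 */
        return 0;
}
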
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2d3ded4d0786..8157b17959ee 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -430,7 +430,7 @@ begin:
430 430
431 if (result) { 431 if (result) {
432exact_match: 432exact_match:
433 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 433 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
434 result = NULL; 434 result = NULL;
435 else if (unlikely(compute_score2(result, net, saddr, sport, 435 else if (unlikely(compute_score2(result, net, saddr, sport,
436 daddr, hnum, dif) < badness)) { 436 daddr, hnum, dif) < badness)) {
@@ -500,7 +500,7 @@ begin:
500 goto begin; 500 goto begin;
501 501
502 if (result) { 502 if (result) {
503 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 503 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
504 result = NULL; 504 result = NULL;
505 else if (unlikely(compute_score(result, net, saddr, hnum, sport, 505 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
506 daddr, dport, dif) < badness)) { 506 daddr, dport, dif) < badness)) {
@@ -890,15 +890,13 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
890 if (rt == NULL) { 890 if (rt == NULL) {
891 struct flowi fl = { .oif = ipc.oif, 891 struct flowi fl = { .oif = ipc.oif,
892 .mark = sk->sk_mark, 892 .mark = sk->sk_mark,
893 .nl_u = { .ip4_u = 893 .fl4_dst = faddr,
894 { .daddr = faddr, 894 .fl4_src = saddr,
895 .saddr = saddr, 895 .fl4_tos = tos,
896 .tos = tos } },
897 .proto = sk->sk_protocol, 896 .proto = sk->sk_protocol,
898 .flags = inet_sk_flowi_flags(sk), 897 .flags = inet_sk_flowi_flags(sk),
899 .uli_u = { .ports = 898 .fl_ip_sport = inet->inet_sport,
900 { .sport = inet->inet_sport, 899 .fl_ip_dport = dport };
901 .dport = dport } } };
902 struct net *net = sock_net(sk); 900 struct net *net = sock_net(sk);
903 901
904 security_sk_classify_flow(sk, &fl); 902 security_sk_classify_flow(sk, &fl);
@@ -2229,7 +2227,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
2229 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot 2227 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
2230 * do checksum of UDP packets sent as multiple IP fragments. 2228 * do checksum of UDP packets sent as multiple IP fragments.
2231 */ 2229 */
2232 offset = skb->csum_start - skb_headroom(skb); 2230 offset = skb_checksum_start_offset(skb);
2233 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2231 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2234 offset += skb->csum_offset; 2232 offset += skb->csum_offset;
2235 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2233 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
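
Two self-contained cleanups in this file: the lockless socket lookups pass a
hint of the expected refcount (2) to atomic_inc_not_zero_hint(), which lets
the first cmpxchg attempt skip the initial atomic read in the common case,
and the open-coded checksum-start computation becomes the new helper, roughly
(approximate shape, as in include/linux/skbuff.h):

static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
        return skb->csum_start - skb_headroom(skb);
}
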
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 6f368413eb0e..534972e114ac 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -56,7 +56,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); 56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
57 ip_select_ident(top_iph, dst->child, NULL); 57 ip_select_ident(top_iph, dst->child, NULL);
58 58
59 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); 59 top_iph->ttl = ip4_dst_hoplimit(dst->child);
60 60
61 top_iph->saddr = x->props.saddr.a4; 61 top_iph->saddr = x->props.saddr.a4;
62 top_iph->daddr = x->id.daddr.a4; 62 top_iph->daddr = x->id.daddr.a4;
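
Since RTAX_HOPLIMIT is no longer pre-seeded into routes, reading the raw
metric here could yield 0. ip4_dst_hoplimit() is the accessor that falls back
to the sysctl default at the moment the TTL is actually needed, which is also
what made the ip_default_ttl handler change in sysctl_net_ipv4.c safe.
Roughly (approximate shape, as in include/net/route.h):

static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
{
        int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);

        if (hoplimit == 0)
                hoplimit = sysctl_ip_default_ttl;
        return hoplimit;
}
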
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 4464f3bff6a7..b057d40addec 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -11,6 +11,7 @@
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/inetdevice.h> 13#include <linux/inetdevice.h>
14#include <linux/if_tunnel.h>
14#include <net/dst.h> 15#include <net/dst.h>
15#include <net/xfrm.h> 16#include <net/xfrm.h>
16#include <net/ip.h> 17#include <net/ip.h>
@@ -22,12 +23,8 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
22 xfrm_address_t *daddr) 23 xfrm_address_t *daddr)
23{ 24{
24 struct flowi fl = { 25 struct flowi fl = {
25 .nl_u = { 26 .fl4_dst = daddr->a4,
26 .ip4_u = { 27 .fl4_tos = tos,
27 .tos = tos,
28 .daddr = daddr->a4,
29 },
30 },
31 }; 28 };
32 struct dst_entry *dst; 29 struct dst_entry *dst;
33 struct rtable *rt; 30 struct rtable *rt;
@@ -80,10 +77,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
80 xdst->u.dst.dev = dev; 77 xdst->u.dst.dev = dev;
81 dev_hold(dev); 78 dev_hold(dev);
82 79
83 xdst->u.rt.idev = in_dev_get(dev);
84 if (!xdst->u.rt.idev)
85 return -ENODEV;
86
87 xdst->u.rt.peer = rt->peer; 80 xdst->u.rt.peer = rt->peer;
88 if (rt->peer) 81 if (rt->peer)
89 atomic_inc(&rt->peer->refcnt); 82 atomic_inc(&rt->peer->refcnt);
@@ -158,6 +151,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
158 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); 151 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
159 } 152 }
160 break; 153 break;
154
155 case IPPROTO_GRE:
156 if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
157 __be16 *greflags = (__be16 *)xprth;
158 __be32 *gre_hdr = (__be32 *)xprth;
159
160 if (greflags[0] & GRE_KEY) {
161 if (greflags[0] & GRE_CSUM)
162 gre_hdr++;
163 fl->fl_gre_key = gre_hdr[1];
164 }
165 }
166 break;
167
161 default: 168 default:
162 fl->fl_ipsec_spi = 0; 169 fl->fl_ipsec_spi = 0;
163 break; 170 break;
@@ -189,8 +196,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
189{ 196{
190 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 197 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
191 198
192 if (likely(xdst->u.rt.idev))
193 in_dev_put(xdst->u.rt.idev);
194 if (likely(xdst->u.rt.peer)) 199 if (likely(xdst->u.rt.peer))
195 inet_putpeer(xdst->u.rt.peer); 200 inet_putpeer(xdst->u.rt.peer);
196 xfrm_dst_destroy(xdst); 201 xfrm_dst_destroy(xdst);
@@ -199,27 +204,9 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
199static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 204static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
200 int unregister) 205 int unregister)
201{ 206{
202 struct xfrm_dst *xdst;
203
204 if (!unregister) 207 if (!unregister)
205 return; 208 return;
206 209
207 xdst = (struct xfrm_dst *)dst;
208 if (xdst->u.rt.idev->dev == dev) {
209 struct in_device *loopback_idev =
210 in_dev_get(dev_net(dev)->loopback_dev);
211 BUG_ON(!loopback_idev);
212
213 do {
214 in_dev_put(xdst->u.rt.idev);
215 xdst->u.rt.idev = loopback_idev;
216 in_dev_hold(loopback_idev);
217 xdst = (struct xfrm_dst *)xdst->u.dst.child;
218 } while (xdst->u.dst.xfrm);
219
220 __in_dev_put(loopback_idev);
221 }
222
223 xfrm_dst_ifdown(dst, dev); 210 xfrm_dst_ifdown(dst, dev);
224} 211}
225 212
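
The new IPPROTO_GRE case lets IPsec policy lookups match on the GRE key: when
the K flag is set, the key is the 32-bit word following the flags/protocol
word, shifted one word further when a checksum (C flag) precedes it. A
stand-alone parser following the same layout (names illustrative; flag values
as in linux/if_tunnel.h):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define GRE_CSUM 0x8000
#define GRE_KEY  0x2000

/* Mirrors the lookup in _decode_session4(): returns 1 and stores the
 * key (still in network order) if the K flag is present. */
static int gre_key(const uint8_t *hdr, size_t len, uint32_t *key)
{
        uint16_t flags;
        size_t off = 4;         /* flags(2) + protocol(2) */

        if (len < 4)
                return 0;
        memcpy(&flags, hdr, 2);
        flags = ntohs(flags);
        if (!(flags & GRE_KEY))
                return 0;
        if (flags & GRE_CSUM)
                off += 4;       /* checksum(2) + reserved(2) */
        if (len < off + 4)
                return 0;
        memcpy(key, hdr + off, 4);
        return 1;
}

int main(void)
{
        /* flags = K, proto = 0x0800, key = 0x00000042 */
        uint8_t pkt[] = { 0x20, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x42 };
        uint32_t k;

        if (gre_key(pkt, sizeof(pkt), &k))
                printf("key = 0x%08x\n", ntohl(k));
        return 0;
}
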