Diffstat (limited to 'net')
-rw-r--r--  net/bridge/br_netfilter.c | 2
-rw-r--r--  net/bridge/netfilter/ebt_log.c | 6
-rw-r--r--  net/core/dev.c | 5
-rw-r--r--  net/core/filter.c | 104
-rw-r--r--  net/core/neighbour.c | 6
-rw-r--r--  net/core/pktgen.c | 29
-rw-r--r--  net/core/rtnetlink.c | 2
-rw-r--r--  net/core/skbuff.c | 2
-rw-r--r--  net/core/wireless.c | 1
-rw-r--r--  net/decnet/dn_fib.c | 3
-rw-r--r--  net/ethernet/eth.c | 7
-rw-r--r--  net/ipv4/Kconfig | 3
-rw-r--r--  net/ipv4/af_inet.c | 11
-rw-r--r--  net/ipv4/fib_trie.c | 256
-rw-r--r--  net/ipv4/ip_input.c | 6
-rw-r--r--  net/ipv4/ip_output.c | 24
-rw-r--r--  net/ipv4/ipconfig.c | 4
-rw-r--r--  net/ipv4/ipmr.c | 10
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c | 25
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 8
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 7
-rw-r--r--  net/ipv4/route.c | 135
-rw-r--r--  net/ipv4/tcp.c | 44
-rw-r--r--  net/ipv4/tcp_input.c | 76
-rw-r--r--  net/ipv4/tcp_ipv4.c | 2
-rw-r--r--  net/ipv4/tcp_output.c | 544
-rw-r--r--  net/ipv6/addrconf.c | 19
-rw-r--r--  net/ipv6/af_inet6.c | 4
-rw-r--r--  net/ipv6/ip6_flowlabel.c | 1
-rw-r--r--  net/ipv6/ip6_output.c | 1
-rw-r--r--  net/ipv6/tcp_ipv6.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 11
-rw-r--r--  net/sched/Makefile | 2
-rw-r--r--  net/sched/act_api.c | 10
-rw-r--r--  net/sched/cls_api.c | 2
-rw-r--r--  net/sched/cls_rsvp.h | 1
-rw-r--r--  net/sched/em_meta.c | 6
-rw-r--r--  net/sched/sch_api.c | 65
-rw-r--r--  net/sched/sch_blackhole.c | 54
-rw-r--r--  net/sched/sch_cbq.c | 3
-rw-r--r--  net/sched/sch_generic.c | 35
-rw-r--r--  net/sctp/endpointola.c | 13
-rw-r--r--  net/sctp/protocol.c | 5
-rw-r--r--  net/sctp/sysctl.c | 13
-rw-r--r--  net/sctp/transport.c | 1
46 files changed, 1060 insertions, 514 deletions
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 03ae4edddac3..2d52fee63a8c 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -844,7 +844,7 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb,
844 * doesn't use the bridge parent of the indev by using 844 * doesn't use the bridge parent of the indev by using
845 * the BRNF_DONT_TAKE_PARENT mask. */ 845 * the BRNF_DONT_TAKE_PARENT mask. */
846 if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) { 846 if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) {
847 nf_bridge->mask &= BRNF_DONT_TAKE_PARENT; 847 nf_bridge->mask |= BRNF_DONT_TAKE_PARENT;
848 nf_bridge->physindev = (struct net_device *)in; 848 nf_bridge->physindev = (struct net_device *)in;
849 } 849 }
850#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) 850#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e4ae34b88925..662975be3d1d 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -61,8 +61,6 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
61{ 61{
62 struct ebt_log_info *info = (struct ebt_log_info *)data; 62 struct ebt_log_info *info = (struct ebt_log_info *)data;
63 char level_string[4] = "< >"; 63 char level_string[4] = "< >";
64 union {struct iphdr iph; struct tcpudphdr ports;
65 struct arphdr arph; struct arppayload arpp;} u;
66 64
67 level_string[1] = '0' + info->loglevel; 65 level_string[1] = '0' + info->loglevel;
68 spin_lock_bh(&ebt_log_lock); 66 spin_lock_bh(&ebt_log_lock);
@@ -88,7 +86,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
88 } 86 }
89 printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", 87 printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
90 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); 88 NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
91 printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos, 89 printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
92 ih->protocol); 90 ih->protocol);
93 if (ih->protocol == IPPROTO_TCP || 91 if (ih->protocol == IPPROTO_TCP ||
94 ih->protocol == IPPROTO_UDP) { 92 ih->protocol == IPPROTO_UDP) {
@@ -127,7 +125,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
127 ah->ar_pln == sizeof(uint32_t)) { 125 ah->ar_pln == sizeof(uint32_t)) {
128 struct arppayload _arpp, *ap; 126 struct arppayload _arpp, *ap;
129 127
130 ap = skb_header_pointer(skb, sizeof(u.arph), 128 ap = skb_header_pointer(skb, sizeof(_arph),
131 sizeof(_arpp), &_arpp); 129 sizeof(_arpp), &_arpp);
132 if (ap == NULL) { 130 if (ap == NULL) {
133 printk(" INCOMPLETE ARP payload"); 131 printk(" INCOMPLETE ARP payload");
diff --git a/net/core/dev.c b/net/core/dev.c
index 7016e0c36b3d..7f5f62c65115 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
2089{ 2089{
2090 unsigned short old_flags = dev->flags; 2090 unsigned short old_flags = dev->flags;
2091 2091
2092 dev->flags |= IFF_PROMISC;
2093 if ((dev->promiscuity += inc) == 0) 2092 if ((dev->promiscuity += inc) == 0)
2094 dev->flags &= ~IFF_PROMISC; 2093 dev->flags &= ~IFF_PROMISC;
2095 if (dev->flags ^ old_flags) { 2094 else
2095 dev->flags |= IFF_PROMISC;
2096 if (dev->flags != old_flags) {
2096 dev_mc_upload(dev); 2097 dev_mc_upload(dev);
2097 printk(KERN_INFO "device %s %s promiscuous mode\n", 2098 printk(KERN_INFO "device %s %s promiscuous mode\n",
2098 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 2099 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
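
The dev_set_promiscuity() hunk above reorders the logic so that IFF_PROMISC always mirrors the promiscuity reference count, and the multicast upload plus the printk only run when the visible flags really change. A minimal userspace sketch of that counted-flag pattern follows; struct fake_dev and set_promisc() are illustrative stand-ins, not kernel interfaces.

#include <stdio.h>

#define IFF_PROMISC 0x100

struct fake_dev {
    unsigned short flags;
    int promiscuity;        /* how many callers asked for promiscuous mode */
};

/* inc is +1 or -1; the flag tracks "promiscuity != 0" */
static void set_promisc(struct fake_dev *dev, int inc)
{
    unsigned short old_flags = dev->flags;

    if ((dev->promiscuity += inc) == 0)
        dev->flags &= ~IFF_PROMISC;
    else
        dev->flags |= IFF_PROMISC;

    if (dev->flags != old_flags)    /* act only on a real transition */
        printf("device %s promiscuous mode\n",
               (dev->flags & IFF_PROMISC) ? "entered" : "left");
}

int main(void)
{
    struct fake_dev dev = { 0, 0 };

    set_promisc(&dev, 1);    /* "entered" */
    set_promisc(&dev, 1);    /* silent: already promiscuous */
    set_promisc(&dev, -1);   /* silent: still one user left */
    set_promisc(&dev, -1);   /* "left" */
    return 0;
}
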
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b88205ace2..cd91a24f9720 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
36#include <linux/filter.h> 36#include <linux/filter.h>
37 37
38/* No hurry in this branch */ 38/* No hurry in this branch */
39static u8 *load_pointer(struct sk_buff *skb, int k) 39static void *__load_pointer(struct sk_buff *skb, int k)
40{ 40{
41 u8 *ptr = NULL; 41 u8 *ptr = NULL;
42 42
@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
50 return NULL; 50 return NULL;
51} 51}
52 52
53static inline void *load_pointer(struct sk_buff *skb, int k,
54 unsigned int size, void *buffer)
55{
56 if (k >= 0)
57 return skb_header_pointer(skb, k, size, buffer);
58 else {
59 if (k >= SKF_AD_OFF)
60 return NULL;
61 return __load_pointer(skb, k);
62 }
63}
64
53/** 65/**
54 * sk_run_filter - run a filter on a socket 66 * sk_run_filter - run a filter on a socket
55 * @skb: buffer to run the filter on 67 * @skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
64 76
65int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) 77int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
66{ 78{
67 unsigned char *data = skb->data;
68 /* len is UNSIGNED. Byte wide insns relies only on implicit
69 type casts to prevent reading arbitrary memory locations.
70 */
71 unsigned int len = skb->len-skb->data_len;
72 struct sock_filter *fentry; /* We walk down these */ 79 struct sock_filter *fentry; /* We walk down these */
80 void *ptr;
73 u32 A = 0; /* Accumulator */ 81 u32 A = 0; /* Accumulator */
74 u32 X = 0; /* Index Register */ 82 u32 X = 0; /* Index Register */
75 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ 83 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
84 u32 tmp;
76 int k; 85 int k;
77 int pc; 86 int pc;
78 87
@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
168 case BPF_LD|BPF_W|BPF_ABS: 177 case BPF_LD|BPF_W|BPF_ABS:
169 k = fentry->k; 178 k = fentry->k;
170 load_w: 179 load_w:
171 if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { 180 ptr = load_pointer(skb, k, 4, &tmp);
172 A = ntohl(*(u32*)&data[k]); 181 if (ptr != NULL) {
182 A = ntohl(*(u32 *)ptr);
173 continue; 183 continue;
174 } 184 }
175 if (k < 0) {
176 u8 *ptr;
177
178 if (k >= SKF_AD_OFF)
179 break;
180 ptr = load_pointer(skb, k);
181 if (ptr) {
182 A = ntohl(*(u32*)ptr);
183 continue;
184 }
185 } else {
186 u32 _tmp, *p;
187 p = skb_header_pointer(skb, k, 4, &_tmp);
188 if (p != NULL) {
189 A = ntohl(*p);
190 continue;
191 }
192 }
193 return 0; 185 return 0;
194 case BPF_LD|BPF_H|BPF_ABS: 186 case BPF_LD|BPF_H|BPF_ABS:
195 k = fentry->k; 187 k = fentry->k;
196 load_h: 188 load_h:
197 if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { 189 ptr = load_pointer(skb, k, 2, &tmp);
198 A = ntohs(*(u16*)&data[k]); 190 if (ptr != NULL) {
191 A = ntohs(*(u16 *)ptr);
199 continue; 192 continue;
200 } 193 }
201 if (k < 0) {
202 u8 *ptr;
203
204 if (k >= SKF_AD_OFF)
205 break;
206 ptr = load_pointer(skb, k);
207 if (ptr) {
208 A = ntohs(*(u16*)ptr);
209 continue;
210 }
211 } else {
212 u16 _tmp, *p;
213 p = skb_header_pointer(skb, k, 2, &_tmp);
214 if (p != NULL) {
215 A = ntohs(*p);
216 continue;
217 }
218 }
219 return 0; 194 return 0;
220 case BPF_LD|BPF_B|BPF_ABS: 195 case BPF_LD|BPF_B|BPF_ABS:
221 k = fentry->k; 196 k = fentry->k;
222load_b: 197load_b:
223 if (k >= 0 && (unsigned int)k < len) { 198 ptr = load_pointer(skb, k, 1, &tmp);
224 A = data[k]; 199 if (ptr != NULL) {
200 A = *(u8 *)ptr;
225 continue; 201 continue;
226 } 202 }
227 if (k < 0) {
228 u8 *ptr;
229
230 if (k >= SKF_AD_OFF)
231 break;
232 ptr = load_pointer(skb, k);
233 if (ptr) {
234 A = *ptr;
235 continue;
236 }
237 } else {
238 u8 _tmp, *p;
239 p = skb_header_pointer(skb, k, 1, &_tmp);
240 if (p != NULL) {
241 A = *p;
242 continue;
243 }
244 }
245 return 0; 203 return 0;
246 case BPF_LD|BPF_W|BPF_LEN: 204 case BPF_LD|BPF_W|BPF_LEN:
247 A = len; 205 A = skb->len;
248 continue; 206 continue;
249 case BPF_LDX|BPF_W|BPF_LEN: 207 case BPF_LDX|BPF_W|BPF_LEN:
250 X = len; 208 X = skb->len;
251 continue; 209 continue;
252 case BPF_LD|BPF_W|BPF_IND: 210 case BPF_LD|BPF_W|BPF_IND:
253 k = X + fentry->k; 211 k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
259 k = X + fentry->k; 217 k = X + fentry->k;
260 goto load_b; 218 goto load_b;
261 case BPF_LDX|BPF_B|BPF_MSH: 219 case BPF_LDX|BPF_B|BPF_MSH:
262 if (fentry->k >= len) 220 ptr = load_pointer(skb, fentry->k, 1, &tmp);
263 return 0; 221 if (ptr != NULL) {
264 X = (data[fentry->k] & 0xf) << 2; 222 X = (*(u8 *)ptr & 0xf) << 2;
265 continue; 223 continue;
224 }
225 return 0;
266 case BPF_LD|BPF_IMM: 226 case BPF_LD|BPF_IMM:
267 A = fentry->k; 227 A = fentry->k;
268 continue; 228 continue;
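
The filter.c rewrite above folds three nearly identical bounds-check-and-copy paths into a single load_pointer() helper built on skb_header_pointer(), so BPF loads also work on non-linear skbs. The sketch below shows only the positive-offset idea in plain userspace C; struct pkt and this local load_pointer() are assumptions made for the example, and the kernel's negative/ancillary-offset path is left out.

#include <stdio.h>
#include <string.h>

struct pkt {
    const unsigned char *data;
    size_t len;
};

/*
 * Hand back "size" bytes starting at offset k, or NULL if they are not
 * there.  This sketch always copies into the caller's scratch buffer;
 * the kernel helper returns a pointer straight into linear data when it
 * can and only copies for fragmented packets.
 */
static const void *load_pointer(const struct pkt *p, int k,
                                size_t size, void *buffer)
{
    if (k < 0 || (size_t)k + size > p->len)
        return NULL;        /* out of bounds: the filter must bail out */
    memcpy(buffer, p->data + k, size);
    return buffer;
}

int main(void)
{
    unsigned char frame[] = { 0x08, 0x00, 0x45, 0x00 };
    struct pkt p = { frame, sizeof(frame) };
    unsigned char tmp[4];
    const unsigned char *ptr;

    ptr = load_pointer(&p, 2, 2, tmp);
    if (ptr)
        printf("bytes at offset 2: %02x %02x\n", ptr[0], ptr[1]);

    if (!load_pointer(&p, 3, 4, tmp))
        printf("offset 3, length 4 runs past the end of the packet\n");
    return 0;
}
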
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 851eb927ed97..1beb782ac41b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1598,6 +1598,8 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
1598 1598
1599 read_lock_bh(&tbl->lock); 1599 read_lock_bh(&tbl->lock);
1600 ndtmsg->ndtm_family = tbl->family; 1600 ndtmsg->ndtm_family = tbl->family;
1601 ndtmsg->ndtm_pad1 = 0;
1602 ndtmsg->ndtm_pad2 = 0;
1601 1603
1602 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); 1604 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
1603 RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval); 1605 RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
@@ -1683,6 +1685,8 @@ static int neightbl_fill_param_info(struct neigh_table *tbl,
1683 1685
1684 read_lock_bh(&tbl->lock); 1686 read_lock_bh(&tbl->lock);
1685 ndtmsg->ndtm_family = tbl->family; 1687 ndtmsg->ndtm_family = tbl->family;
1688 ndtmsg->ndtm_pad1 = 0;
1689 ndtmsg->ndtm_pad2 = 0;
1686 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id); 1690 RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
1687 1691
1688 if (neightbl_fill_parms(skb, parms) < 0) 1692 if (neightbl_fill_parms(skb, parms) < 0)
@@ -1872,6 +1876,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
1872 struct ndmsg *ndm = NLMSG_DATA(nlh); 1876 struct ndmsg *ndm = NLMSG_DATA(nlh);
1873 1877
1874 ndm->ndm_family = n->ops->family; 1878 ndm->ndm_family = n->ops->family;
1879 ndm->ndm_pad1 = 0;
1880 ndm->ndm_pad2 = 0;
1875 ndm->ndm_flags = n->flags; 1881 ndm->ndm_flags = n->flags;
1876 ndm->ndm_type = n->type; 1882 ndm->ndm_type = n->type;
1877 ndm->ndm_ifindex = n->dev->ifindex; 1883 ndm->ndm_ifindex = n->dev->ifindex;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c57b06bc79f3..975d651312dc 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -151,7 +151,7 @@
151#include <asm/timex.h> 151#include <asm/timex.h>
152 152
153 153
154#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n" 154#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
155 155
156/* #define PG_DEBUG(a) a */ 156/* #define PG_DEBUG(a) a */
157#define PG_DEBUG(a) 157#define PG_DEBUG(a)
@@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1921 struct iphdr *iph; 1921 struct iphdr *iph;
1922 struct pktgen_hdr *pgh = NULL; 1922 struct pktgen_hdr *pgh = NULL;
1923 1923
1924 /* Update any of the values, used when we're incrementing various
1925 * fields.
1926 */
1927 mod_cur_headers(pkt_dev);
1928
1924 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 1929 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
1925 if (!skb) { 1930 if (!skb) {
1926 sprintf(pkt_dev->result, "No memory"); 1931 sprintf(pkt_dev->result, "No memory");
@@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1934 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); 1939 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
1935 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 1940 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
1936 1941
1937 /* Update any of the values, used when we're incrementing various
1938 * fields.
1939 */
1940 mod_cur_headers(pkt_dev);
1941
1942 memcpy(eth, pkt_dev->hh, 12); 1942 memcpy(eth, pkt_dev->hh, 12);
1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP); 1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP);
1944 1944
@@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2192 int datalen; 2192 int datalen;
2193 struct ipv6hdr *iph; 2193 struct ipv6hdr *iph;
2194 struct pktgen_hdr *pgh = NULL; 2194 struct pktgen_hdr *pgh = NULL;
2195 2195
2196 /* Update any of the values, used when we're incrementing various
2197 * fields.
2198 */
2199 mod_cur_headers(pkt_dev);
2200
2196 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); 2201 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
2197 if (!skb) { 2202 if (!skb) {
2198 sprintf(pkt_dev->result, "No memory"); 2203 sprintf(pkt_dev->result, "No memory");
@@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2206 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); 2211 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
2207 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); 2212 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
2208 2213
2209
2210 /* Update any of the values, used when we're incrementing various
2211 * fields.
2212 */
2213 mod_cur_headers(pkt_dev);
2214
2215
2216 memcpy(eth, pkt_dev->hh, 12); 2214 memcpy(eth, pkt_dev->hh, 12);
2217 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6); 2215 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
2218 2216
2219
2220 datalen = pkt_dev->cur_pkt_size-14- 2217 datalen = pkt_dev->cur_pkt_size-14-
2221 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ 2218 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
2222 2219
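
The pktgen hunks move mod_cur_headers() in front of alloc_skb(), presumably so the size and header values picked for this packet are the ones the allocation is based on. Below is a userspace sketch of that ordering rule only; pkt_state, mod_cur_headers() and build_packet() are invented names, not the pktgen code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pkt_state {
    size_t min_size, max_size, cur_size;
};

/* pick the size for *this* packet (here: a simple round robin) */
static void mod_cur_headers(struct pkt_state *st)
{
    st->cur_size++;
    if (st->cur_size > st->max_size)
        st->cur_size = st->min_size;
}

static unsigned char *build_packet(struct pkt_state *st)
{
    unsigned char *buf;

    mod_cur_headers(st);                /* decide sizes/fields first */
    buf = malloc(st->cur_size + 16);    /* then allocate to match */
    if (buf)
        memset(buf, 0, st->cur_size + 16);
    return buf;
}

int main(void)
{
    struct pkt_state st = { 60, 64, 60 };
    int i;

    for (i = 0; i < 3; i++) {
        unsigned char *p = build_packet(&st);

        if (p)
            printf("built packet of %zu bytes\n", st.cur_size);
        free(p);
    }
    return 0;
}
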
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e013d836a7ab..4b1bb30e6381 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -126,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
126 rta->rta_type = attrtype; 126 rta->rta_type = attrtype;
127 rta->rta_len = size; 127 rta->rta_len = size;
128 memcpy(RTA_DATA(rta), data, attrlen); 128 memcpy(RTA_DATA(rta), data, attrlen);
129 memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
129} 130}
130 131
131size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) 132size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
@@ -188,6 +189,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
188 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags); 189 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
189 r = NLMSG_DATA(nlh); 190 r = NLMSG_DATA(nlh);
190 r->ifi_family = AF_UNSPEC; 191 r->ifi_family = AF_UNSPEC;
192 r->__ifi_pad = 0;
191 r->ifi_type = dev->type; 193 r->ifi_type = dev->type;
192 r->ifi_index = dev->ifindex; 194 r->ifi_index = dev->ifindex;
193 r->ifi_flags = dev_get_flags(dev); 195 r->ifi_flags = dev_get_flags(dev);
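
The __rta_fill() and rtnetlink_fill_ifinfo() changes above zero the alignment padding behind a netlink attribute and the __ifi_pad field, so stale kernel memory never reaches userspace. Below is a small userspace sketch of padding-aware attribute packing; the 4-byte toy header and put_attr() are invented for the example and do not match the real netlink encoding.

#include <stdio.h>
#include <string.h>

#define ALIGNTO     4
#define ALIGN4(len) (((len) + ALIGNTO - 1) & ~(ALIGNTO - 1))

/* append one attribute and clear whatever padding the alignment adds */
static size_t put_attr(unsigned char *buf, int type,
                       const void *data, size_t len)
{
    size_t size = 4 + len;          /* toy header: 2-byte len, 2-byte type */
    size_t aligned = ALIGN4(size);

    buf[0] = size & 0xff;
    buf[1] = (size >> 8) & 0xff;
    buf[2] = type & 0xff;
    buf[3] = (type >> 8) & 0xff;
    memcpy(buf + 4, data, len);
    memset(buf + size, 0, aligned - size);  /* clear alignment padding */
    return aligned;
}

int main(void)
{
    unsigned char msg[32];
    size_t used;

    memset(msg, 0xAA, sizeof(msg));     /* simulate stale buffer contents */
    used = put_attr(msg, 1, "eth0", 5); /* 9 bytes of header+payload -> 12 */
    printf("attribute occupies %zu bytes, last pad byte = %02x\n",
           used, msg[used - 1]);        /* 00, not aa */
    return 0;
}
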
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bb73b2190ec7..733deee24b9f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -357,7 +357,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
357 C(ip_summed); 357 C(ip_summed);
358 C(priority); 358 C(priority);
359 C(protocol); 359 C(protocol);
360 C(security);
361 n->destructor = NULL; 360 n->destructor = NULL;
362#ifdef CONFIG_NETFILTER 361#ifdef CONFIG_NETFILTER
363 C(nfmark); 362 C(nfmark);
@@ -422,7 +421,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
422 new->pkt_type = old->pkt_type; 421 new->pkt_type = old->pkt_type;
423 new->stamp = old->stamp; 422 new->stamp = old->stamp;
424 new->destructor = NULL; 423 new->destructor = NULL;
425 new->security = old->security;
426#ifdef CONFIG_NETFILTER 424#ifdef CONFIG_NETFILTER
427 new->nfmark = old->nfmark; 425 new->nfmark = old->nfmark;
428 new->nfcache = old->nfcache; 426 new->nfcache = old->nfcache;
diff --git a/net/core/wireless.c b/net/core/wireless.c
index b2fe378dfbf8..3ff5639c0b78 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -1102,6 +1102,7 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
1102 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); 1102 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
1103 r = NLMSG_DATA(nlh); 1103 r = NLMSG_DATA(nlh);
1104 r->ifi_family = AF_UNSPEC; 1104 r->ifi_family = AF_UNSPEC;
1105 r->__ifi_pad = 0;
1105 r->ifi_type = dev->type; 1106 r->ifi_type = dev->type;
1106 r->ifi_index = dev->ifindex; 1107 r->ifi_index = dev->ifindex;
1107 r->ifi_flags = dev->flags; 1108 r->ifi_flags = dev->flags;
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25720e4..99bc061759c3 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
551 if (t < s_t) 551 if (t < s_t)
552 continue; 552 continue;
553 if (t > s_t) 553 if (t > s_t)
554 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); 554 memset(&cb->args[1], 0,
555 sizeof(cb->args) - sizeof(cb->args[0]));
555 tb = dn_fib_get_table(t, 0); 556 tb = dn_fib_get_table(t, 0);
556 if (tb == NULL) 557 if (tb == NULL)
557 continue; 558 continue;
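
The dn_fib_dump() fix above derives the memset length from sizeof(cb->args[0]) instead of a hard-coded sizeof(int); the elements of cb->args[] are longs, so on a 64-bit build the old expression cleared four bytes past the end of the array. A generic illustration of the idiom, with struct callback standing in for the real netlink callback:

#include <stdio.h>
#include <string.h>

struct callback {
    long args[6];
};

/* clear args[1]..args[5]; the length comes from the element type itself */
static void reset_tail(struct callback *cb)
{
    memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
}

int main(void)
{
    struct callback cb = { { 1, 2, 3, 4, 5, 6 } };

    reset_tail(&cb);
    printf("args[0]=%ld args[1]=%ld args[5]=%ld\n",
           cb.args[0], cb.args[1], cb.args[5]);    /* 1 0 0 */
    return 0;
}
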
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 6617ea47d365..ab60ea63688e 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -92,10 +92,9 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
92 * Set the source hardware address. 92 * Set the source hardware address.
93 */ 93 */
94 94
95 if(saddr) 95 if(!saddr)
96 memcpy(eth->h_source,saddr,dev->addr_len); 96 saddr = dev->dev_addr;
97 else 97 memcpy(eth->h_source,saddr,dev->addr_len);
98 memcpy(eth->h_source,dev->dev_addr,dev->addr_len);
99 98
100 /* 99 /*
101 * Anyway, the loopback-device should never use this function... 100 * Anyway, the loopback-device should never use this function...
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 347083433120..3e63123f7bbd 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -448,7 +448,6 @@ config IP_TCPDIAG_IPV6
448config TCP_CONG_ADVANCED 448config TCP_CONG_ADVANCED
449 bool "TCP: advanced congestion control" 449 bool "TCP: advanced congestion control"
450 depends on INET 450 depends on INET
451 default y
452 ---help--- 451 ---help---
453 Support for selection of various TCP congestion control 452 Support for selection of various TCP congestion control
454 modules. 453 modules.
@@ -549,7 +548,7 @@ config TCP_CONG_SCALABLE
549endmenu 548endmenu
550 549
551config TCP_CONG_BIC 550config TCP_CONG_BIC
552 boolean 551 tristate
553 depends on !TCP_CONG_ADVANCED 552 depends on !TCP_CONG_ADVANCED
554 default y 553 default y
555 554
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e7977924d..ef7468376ae6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
1009static int ipv4_proc_init(void); 1009static int ipv4_proc_init(void);
1010extern void ipfrag_init(void); 1010extern void ipfrag_init(void);
1011 1011
1012/*
1013 * IP protocol layer initialiser
1014 */
1015
1016static struct packet_type ip_packet_type = {
1017 .type = __constant_htons(ETH_P_IP),
1018 .func = ip_rcv,
1019};
1020
1012static int __init inet_init(void) 1021static int __init inet_init(void)
1013{ 1022{
1014 struct sk_buff *dummy_skb; 1023 struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
1102 1111
1103 ipfrag_init(); 1112 ipfrag_init();
1104 1113
1114 dev_add_pack(&ip_packet_type);
1115
1105 rc = 0; 1116 rc = 0;
1106out: 1117out:
1107 return rc; 1118 return rc;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0671569ee6f0..4be234c7d8c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.323" 46#define VERSION "0.325"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -136,6 +136,7 @@ struct trie_use_stats {
136 unsigned int semantic_match_passed; 136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss; 137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit; 138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
139}; 140};
140#endif 141#endif
141 142
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn); 166static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn); 167static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn); 168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
168static struct tnode *halve(struct trie *t, struct tnode *tn); 169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
169static void tnode_free(struct tnode *tn); 170static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t); 171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); 172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -341,8 +342,10 @@ static struct leaf *leaf_new(void)
341static struct leaf_info *leaf_info_new(int plen) 342static struct leaf_info *leaf_info_new(int plen)
342{ 343{
343 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 344 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
344 li->plen = plen; 345 if(li) {
345 INIT_LIST_HEAD(&li->falh); 346 li->plen = plen;
347 INIT_LIST_HEAD(&li->falh);
348 }
346 return li; 349 return li;
347} 350}
348 351
@@ -356,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
356 kfree(li); 359 kfree(li);
357} 360}
358 361
362static struct tnode *tnode_alloc(unsigned int size)
363{
364 if (size <= PAGE_SIZE) {
365 return kmalloc(size, GFP_KERNEL);
366 } else {
367 return (struct tnode *)
368 __get_free_pages(GFP_KERNEL, get_order(size));
369 }
370}
371
372static void __tnode_free(struct tnode *tn)
373{
374 unsigned int size = sizeof(struct tnode) +
375 (1<<tn->bits) * sizeof(struct node *);
376
377 if (size <= PAGE_SIZE)
378 kfree(tn);
379 else
380 free_pages((unsigned long)tn, get_order(size));
381}
382
359static struct tnode* tnode_new(t_key key, int pos, int bits) 383static struct tnode* tnode_new(t_key key, int pos, int bits)
360{ 384{
361 int nchildren = 1<<bits; 385 int nchildren = 1<<bits;
362 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 386 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 struct tnode *tn = kmalloc(sz, GFP_KERNEL); 387 struct tnode *tn = tnode_alloc(sz);
364 388
365 if(tn) { 389 if(tn) {
366 memset(tn, 0, sz); 390 memset(tn, 0, sz);
@@ -388,7 +412,7 @@ static void tnode_free(struct tnode *tn)
388 printk("FL %p \n", tn); 412 printk("FL %p \n", tn);
389 } 413 }
390 else if(IS_TNODE(tn)) { 414 else if(IS_TNODE(tn)) {
391 kfree(tn); 415 __tnode_free(tn);
392 if(trie_debug > 0 ) 416 if(trie_debug > 0 )
393 printk("FT %p \n", tn); 417 printk("FT %p \n", tn);
394 } 418 }
@@ -458,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
458static struct node *resize(struct trie *t, struct tnode *tn) 482static struct node *resize(struct trie *t, struct tnode *tn)
459{ 483{
460 int i; 484 int i;
485 int err = 0;
461 486
462 if (!tn) 487 if (!tn)
463 return NULL; 488 return NULL;
@@ -554,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
554 */ 579 */
555 580
556 check_tnode(tn); 581 check_tnode(tn);
557 582
583 err = 0;
558 while ((tn->full_children > 0 && 584 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 585 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) { 586 inflate_threshold * tnode_child_length(tn))) {
561 587
562 tn = inflate(t, tn); 588 tn = inflate(t, tn, &err);
589
590 if(err) {
591#ifdef CONFIG_IP_FIB_TRIE_STATS
592 t->stats.resize_node_skipped++;
593#endif
594 break;
595 }
563 } 596 }
564 597
565 check_tnode(tn); 598 check_tnode(tn);
@@ -568,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
568 * Halve as long as the number of empty children in this 601 * Halve as long as the number of empty children in this
569 * node is above threshold. 602 * node is above threshold.
570 */ 603 */
604
605 err = 0;
571 while (tn->bits > 1 && 606 while (tn->bits > 1 &&
572 100 * (tnode_child_length(tn) - tn->empty_children) < 607 100 * (tnode_child_length(tn) - tn->empty_children) <
573 halve_threshold * tnode_child_length(tn)) 608 halve_threshold * tnode_child_length(tn)) {
609
610 tn = halve(t, tn, &err);
611
612 if(err) {
613#ifdef CONFIG_IP_FIB_TRIE_STATS
614 t->stats.resize_node_skipped++;
615#endif
616 break;
617 }
618 }
574 619
575 tn = halve(t, tn);
576 620
577 /* Only one child remains */ 621 /* Only one child remains */
578 622
@@ -597,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
597 return (struct node *) tn; 641 return (struct node *) tn;
598} 642}
599 643
600static struct tnode *inflate(struct trie *t, struct tnode *tn) 644static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
601{ 645{
602 struct tnode *inode; 646 struct tnode *inode;
603 struct tnode *oldtnode = tn; 647 struct tnode *oldtnode = tn;
@@ -609,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
609 653
610 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 654 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611 655
612 if (!tn) 656 if (!tn) {
613 trie_bug("tnode_new failed"); 657 *err = -ENOMEM;
658 return oldtnode;
659 }
660
661 /*
662 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation
664 * fails. In case of failure we return the oldnode and inflate
665 * of tnode is ignored.
666 */
667
668 for(i = 0; i < olen; i++) {
669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670
671 if (inode &&
672 IS_TNODE(inode) &&
673 inode->pos == oldtnode->pos + oldtnode->bits &&
674 inode->bits > 1) {
675 struct tnode *left, *right;
676
677 t_key m = TKEY_GET_MASK(inode->pos, 1);
678
679 left = tnode_new(inode->key&(~m), inode->pos + 1,
680 inode->bits - 1);
681
682 if(!left) {
683 *err = -ENOMEM;
684 break;
685 }
686
687 right = tnode_new(inode->key|m, inode->pos + 1,
688 inode->bits - 1);
689
690 if(!right) {
691 *err = -ENOMEM;
692 break;
693 }
694
695 put_child(t, tn, 2*i, (struct node *) left);
696 put_child(t, tn, 2*i+1, (struct node *) right);
697 }
698 }
699
700 if(*err) {
701 int size = tnode_child_length(tn);
702 int j;
703
704 for(j = 0; j < size; j++)
705 if( tn->child[j])
706 tnode_free((struct tnode *)tn->child[j]);
707
708 tnode_free(tn);
709
710 *err = -ENOMEM;
711 return oldtnode;
712 }
614 713
615 for(i = 0; i < olen; i++) { 714 for(i = 0; i < olen; i++) {
616 struct node *node = tnode_get_child(oldtnode, i); 715 struct node *node = tnode_get_child(oldtnode, i);
@@ -623,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
623 722
624 if(IS_LEAF(node) || ((struct tnode *) node)->pos > 723 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
625 tn->pos + tn->bits - 1) { 724 tn->pos + tn->bits - 1) {
626 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, 725 if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
627 1) == 0) 726 1) == 0)
628 put_child(t, tn, 2*i, node); 727 put_child(t, tn, 2*i, node);
629 else 728 else
@@ -663,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
663 * the position (inode->pos) 762 * the position (inode->pos)
664 */ 763 */
665 764
666 t_key m = TKEY_GET_MASK(inode->pos, 1);
667
668 /* Use the old key, but set the new significant 765 /* Use the old key, but set the new significant
669 * bit to zero. 766 * bit to zero.
670 */ 767 */
671 left = tnode_new(inode->key&(~m), inode->pos + 1,
672 inode->bits - 1);
673 768
674 if(!left) 769 left = (struct tnode *) tnode_get_child(tn, 2*i);
675 trie_bug("tnode_new failed"); 770 put_child(t, tn, 2*i, NULL);
676 771
677 772 if(!left)
678 /* Use the old key, but set the new significant 773 BUG();
679 * bit to one. 774
680 */ 775 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
681 right = tnode_new(inode->key|m, inode->pos + 1, 776 put_child(t, tn, 2*i+1, NULL);
682 inode->bits - 1); 777
778 if(!right)
779 BUG();
683 780
684 if(!right)
685 trie_bug("tnode_new failed");
686
687 size = tnode_child_length(left); 781 size = tnode_child_length(left);
688 for(j = 0; j < size; j++) { 782 for(j = 0; j < size; j++) {
689 put_child(t, left, j, inode->child[j]); 783 put_child(t, left, j, inode->child[j]);
@@ -699,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
699 return tn; 793 return tn;
700} 794}
701 795
702static struct tnode *halve(struct trie *t, struct tnode *tn) 796static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
703{ 797{
704 struct tnode *oldtnode = tn; 798 struct tnode *oldtnode = tn;
705 struct node *left, *right; 799 struct node *left, *right;
@@ -710,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
710 804
711 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 805 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
712 806
713 if(!tn) 807 if (!tn) {
714 trie_bug("tnode_new failed"); 808 *err = -ENOMEM;
809 return oldtnode;
810 }
811
812 /*
813 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation
815 * fails. In case of failure we return the oldnode and halve
816 * of tnode is ignored.
817 */
818
819 for(i = 0; i < olen; i += 2) {
820 left = tnode_get_child(oldtnode, i);
821 right = tnode_get_child(oldtnode, i+1);
822
823 /* Two nonempty children */
824 if( left && right) {
825 struct tnode *newBinNode =
826 tnode_new(left->key, tn->pos + tn->bits, 1);
827
828 if(!newBinNode) {
829 *err = -ENOMEM;
830 break;
831 }
832 put_child(t, tn, i/2, (struct node *)newBinNode);
833 }
834 }
835
836 if(*err) {
837 int size = tnode_child_length(tn);
838 int j;
839
840 for(j = 0; j < size; j++)
841 if( tn->child[j])
842 tnode_free((struct tnode *)tn->child[j]);
843
844 tnode_free(tn);
845
846 *err = -ENOMEM;
847 return oldtnode;
848 }
715 849
716 for(i = 0; i < olen; i += 2) { 850 for(i = 0; i < olen; i += 2) {
717 left = tnode_get_child(oldtnode, i); 851 left = tnode_get_child(oldtnode, i);
@@ -728,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
728 /* Two nonempty children */ 862 /* Two nonempty children */
729 else { 863 else {
730 struct tnode *newBinNode = 864 struct tnode *newBinNode =
731 tnode_new(left->key, tn->pos + tn->bits, 1); 865 (struct tnode *) tnode_get_child(tn, i/2);
866 put_child(t, tn, i/2, NULL);
732 867
733 if(!newBinNode) 868 if(!newBinNode)
734 trie_bug("tnode_new failed"); 869 BUG();
735 870
736 put_child(t, newBinNode, 0, left); 871 put_child(t, newBinNode, 0, left);
737 put_child(t, newBinNode, 1, right); 872 put_child(t, newBinNode, 1, right);
@@ -879,8 +1014,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
879 return (struct node*) tn; 1014 return (struct node*) tn;
880} 1015}
881 1016
882static struct list_head * 1017static struct list_head *
883fib_insert_node(struct trie *t, u32 key, int plen) 1018fib_insert_node(struct trie *t, int *err, u32 key, int plen)
884{ 1019{
885 int pos, newpos; 1020 int pos, newpos;
886 struct tnode *tp = NULL, *tn = NULL; 1021 struct tnode *tp = NULL, *tn = NULL;
@@ -940,7 +1075,6 @@ fib_insert_node(struct trie *t, u32 key, int plen)
940 if(tp && IS_LEAF(tp)) 1075 if(tp && IS_LEAF(tp))
941 BUG(); 1076 BUG();
942 1077
943 t->revision++;
944 1078
945 /* Case 1: n is a leaf. Compare prefixes */ 1079 /* Case 1: n is a leaf. Compare prefixes */
946 1080
@@ -949,8 +1083,10 @@ fib_insert_node(struct trie *t, u32 key, int plen)
949 1083
950 li = leaf_info_new(plen); 1084 li = leaf_info_new(plen);
951 1085
952 if(! li) 1086 if(! li) {
953 BUG(); 1087 *err = -ENOMEM;
1088 goto err;
1089 }
954 1090
955 fa_head = &li->falh; 1091 fa_head = &li->falh;
956 insert_leaf_info(&l->list, li); 1092 insert_leaf_info(&l->list, li);
@@ -959,14 +1095,19 @@ fib_insert_node(struct trie *t, u32 key, int plen)
959 t->size++; 1095 t->size++;
960 l = leaf_new(); 1096 l = leaf_new();
961 1097
962 if(! l) 1098 if(! l) {
963 BUG(); 1099 *err = -ENOMEM;
1100 goto err;
1101 }
964 1102
965 l->key = key; 1103 l->key = key;
966 li = leaf_info_new(plen); 1104 li = leaf_info_new(plen);
967 1105
968 if(! li) 1106 if(! li) {
969 BUG(); 1107 tnode_free((struct tnode *) l);
1108 *err = -ENOMEM;
1109 goto err;
1110 }
970 1111
971 fa_head = &li->falh; 1112 fa_head = &li->falh;
972 insert_leaf_info(&l->list, li); 1113 insert_leaf_info(&l->list, li);
@@ -1003,9 +1144,14 @@ fib_insert_node(struct trie *t, u32 key, int plen)
1003 newpos = 0; 1144 newpos = 0;
1004 tn = tnode_new(key, newpos, 1); /* First tnode */ 1145 tn = tnode_new(key, newpos, 1); /* First tnode */
1005 } 1146 }
1006 if(!tn)
1007 trie_bug("tnode_pfx_new failed");
1008 1147
1148 if(!tn) {
1149 free_leaf_info(li);
1150 tnode_free((struct tnode *) l);
1151 *err = -ENOMEM;
1152 goto err;
1153 }
1154
1009 NODE_SET_PARENT(tn, tp); 1155 NODE_SET_PARENT(tn, tp);
1010 1156
1011 missbit=tkey_extract_bits(key, newpos, 1); 1157 missbit=tkey_extract_bits(key, newpos, 1);
@@ -1027,7 +1173,9 @@ fib_insert_node(struct trie *t, u32 key, int plen)
1027 } 1173 }
1028 /* Rebalance the trie */ 1174 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp); 1175 t->trie = trie_rebalance(t, tp);
1030done:; 1176done:
1177 t->revision++;
1178err:;
1031 return fa_head; 1179 return fa_head;
1032} 1180}
1033 1181
@@ -1156,8 +1304,12 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1156 * Insert new entry to the list. 1304 * Insert new entry to the list.
1157 */ 1305 */
1158 1306
1159 if(!fa_head) 1307 if(!fa_head) {
1160 fa_head = fib_insert_node(t, key, plen); 1308 fa_head = fib_insert_node(t, &err, key, plen);
1309 err = 0;
1310 if(err)
1311 goto out_free_new_fa;
1312 }
1161 1313
1162 write_lock_bh(&fib_lock); 1314 write_lock_bh(&fib_lock);
1163 1315
@@ -1170,6 +1322,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1322 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded: 1323succeeded:
1172 return 0; 1324 return 0;
1325
1326out_free_new_fa:
1327 kmem_cache_free(fn_alias_kmem, new_fa);
1173out: 1328out:
1174 fib_release_info(fi); 1329 fib_release_info(fi);
1175err:; 1330err:;
@@ -2279,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2434 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2435 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2436 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2437 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2282#ifdef CLEAR_STATS 2438#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats)); 2439 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif 2440#endif
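
The fib_trie resize/inflate/halve rework above preallocates every tnode it will need and, if any allocation fails, frees the partial set and hands back the old node with -ENOMEM instead of hitting BUG(), so the trie is never left half-rebuilt. A generic sketch of that allocate-everything-then-commit pattern; grow_table() is an invented example, not trie code.

#include <stdio.h>
#include <stdlib.h>

/* grow a table of heap-allocated ints without ever corrupting the old one */
static int grow_table(int ***table, int old_n, int new_n)
{
    int **fresh;
    int i;

    /* phase 1: allocate everything that will be needed */
    fresh = calloc(new_n, sizeof(*fresh));
    if (!fresh)
        return -1;
    for (i = 0; i < new_n; i++) {
        fresh[i] = malloc(sizeof(int));
        if (!fresh[i])
            goto undo;          /* roll back, old table untouched */
        *fresh[i] = 0;
    }

    /* phase 2: commit -- move the old entries over, release the old table */
    for (i = 0; i < old_n; i++) {
        *fresh[i] = *(*table)[i];
        free((*table)[i]);
    }
    free(*table);
    *table = fresh;
    return 0;

undo:
    while (--i >= 0)
        free(fresh[i]);
    free(fresh);
    return -1;
}

int main(void)
{
    int **t = calloc(2, sizeof(*t));
    int i, n = 2;

    for (i = 0; i < 2; i++) {
        t[i] = malloc(sizeof(int));
        *t[i] = i + 1;
    }
    if (grow_table(&t, 2, 4) == 0)
        n = 4;
    for (i = 0; i < n; i++)
        printf("%d ", *t[i]);
    printf("\n");
    for (i = 0; i < n; i++)
        free(t[i]);
    free(t);
    return 0;
}
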
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index af2ec88bbb2f..c703528e0bcd 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -283,14 +283,18 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
283{ 283{
284 struct net_device *dev = skb->dev; 284 struct net_device *dev = skb->dev;
285 struct iphdr *iph = skb->nh.iph; 285 struct iphdr *iph = skb->nh.iph;
286 int err;
286 287
287 /* 288 /*
288 * Initialise the virtual path cache for the packet. It describes 289 * Initialise the virtual path cache for the packet. It describes
289 * how the packet travels inside Linux networking. 290 * how the packet travels inside Linux networking.
290 */ 291 */
291 if (skb->dst == NULL) { 292 if (skb->dst == NULL) {
292 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) 293 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
294 if (err == -EHOSTUNREACH)
295 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
293 goto drop; 296 goto drop;
297 }
294 } 298 }
295 299
296#ifdef CONFIG_NET_CLS_ROUTE 300#ifdef CONFIG_NET_CLS_ROUTE
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ee07aec215a0..9de83e6e0f1d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -188,7 +188,13 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 skb = skb2; 188 skb = skb2;
189 } 189 }
190 190
191 nf_reset(skb); 191#ifdef CONFIG_BRIDGE_NETFILTER
192 /* bridge-netfilter defers calling some IP hooks to the bridge layer
193 * and still needs the conntrack reference.
194 */
195 if (skb->nf_bridge == NULL)
196#endif
197 nf_reset(skb);
192 198
193 if (hh) { 199 if (hh) {
194 int hh_alen; 200 int hh_alen;
@@ -383,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
383 to->pkt_type = from->pkt_type; 389 to->pkt_type = from->pkt_type;
384 to->priority = from->priority; 390 to->priority = from->priority;
385 to->protocol = from->protocol; 391 to->protocol = from->protocol;
386 to->security = from->security;
387 dst_release(to->dst); 392 dst_release(to->dst);
388 to->dst = dst_clone(from->dst); 393 to->dst = dst_clone(from->dst);
389 to->dev = from->dev; 394 to->dev = from->dev;
@@ -1323,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1323 ip_rt_put(rt); 1328 ip_rt_put(rt);
1324} 1329}
1325 1330
1326/*
1327 * IP protocol layer initialiser
1328 */
1329
1330static struct packet_type ip_packet_type = {
1331 .type = __constant_htons(ETH_P_IP),
1332 .func = ip_rcv,
1333};
1334
1335/*
1336 * IP registers the packet type and then calls the subprotocol initialisers
1337 */
1338
1339void __init ip_init(void) 1331void __init ip_init(void)
1340{ 1332{
1341 dev_add_pack(&ip_packet_type);
1342
1343 ip_rt_init(); 1333 ip_rt_init();
1344 inet_initpeers(); 1334 inet_initpeers();
1345 1335
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f2509034ce72..d2bf8e1930a3 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1149,8 +1149,10 @@ static int __init ic_dynamic(void)
1149 ic_rarp_cleanup(); 1149 ic_rarp_cleanup();
1150#endif 1150#endif
1151 1151
1152 if (!ic_got_reply) 1152 if (!ic_got_reply) {
1153 ic_myaddr = INADDR_NONE;
1153 return -1; 1154 return -1;
1155 }
1154 1156
1155 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", 1157 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
1156 ((ic_got_reply & IC_RARP) ? "RARP" 1158 ((ic_got_reply & IC_RARP) ? "RARP"
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e4f809a93f47..7833d920bdba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -297,6 +297,7 @@ static int vif_delete(int vifi)
297static void ipmr_destroy_unres(struct mfc_cache *c) 297static void ipmr_destroy_unres(struct mfc_cache *c)
298{ 298{
299 struct sk_buff *skb; 299 struct sk_buff *skb;
300 struct nlmsgerr *e;
300 301
301 atomic_dec(&cache_resolve_queue_len); 302 atomic_dec(&cache_resolve_queue_len);
302 303
@@ -306,7 +307,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
306 nlh->nlmsg_type = NLMSG_ERROR; 307 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 308 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len); 309 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; 310 e = NLMSG_DATA(nlh);
311 e->error = -ETIMEDOUT;
312 memset(&e->msg, 0, sizeof(e->msg));
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 313 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 } else 314 } else
312 kfree_skb(skb); 315 kfree_skb(skb);
@@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
499static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) 502static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500{ 503{
501 struct sk_buff *skb; 504 struct sk_buff *skb;
505 struct nlmsgerr *e;
502 506
503 /* 507 /*
504 * Play the pending entries through our router 508 * Play the pending entries through our router
@@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
515 nlh->nlmsg_type = NLMSG_ERROR; 519 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 520 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len); 521 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; 522 e = NLMSG_DATA(nlh);
523 e->error = -EMSGSIZE;
524 memset(&e->msg, 0, sizeof(e->msg));
519 } 525 }
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 526 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 } else 527 } else
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index fd6feb5499fe..9f16ab309106 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{ 548{
549 if (del_timer(&cp->timer)) 549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies); 550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552} 551}
553 552
554 553
@@ -764,7 +763,6 @@ void ip_vs_random_dropentry(void)
764{ 763{
765 int idx; 764 int idx;
766 struct ip_vs_conn *cp; 765 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768 766
769 /* 767 /*
770 * Randomly scan 1/32 of the whole table every second 768 * Randomly scan 1/32 of the whole table every second
@@ -801,21 +799,12 @@ void ip_vs_random_dropentry(void)
801 continue; 799 continue;
802 } 800 }
803 801
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n"); 802 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp); 803 ip_vs_conn_expire_now(cp);
814 if (ct) { 804 if (cp->control) {
815 IP_VS_DBG(4, "del conn template\n"); 805 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct); 806 ip_vs_conn_expire_now(cp->control);
817 } 807 }
818 ct_write_lock(hash);
819 } 808 }
820 ct_write_unlock(hash); 809 ct_write_unlock(hash);
821 } 810 }
@@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void)
829{ 818{
830 int idx; 819 int idx;
831 struct ip_vs_conn *cp; 820 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833 821
834 flush_again: 822 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 823 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
@@ -839,18 +827,13 @@ static void ip_vs_conn_flush(void)
839 ct_write_lock_bh(idx); 827 ct_write_lock_bh(idx);
840 828
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 829 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844 830
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n"); 831 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp); 832 ip_vs_conn_expire_now(cp);
849 if (ct) { 833 if (cp->control) {
850 IP_VS_DBG(4, "del conn template\n"); 834 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct); 835 ip_vs_conn_expire_now(cp->control);
852 } 836 }
853 ct_write_lock(idx);
854 } 837 }
855 ct_write_unlock_bh(idx); 838 ct_write_unlock_bh(idx);
856 } 839 }
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d9701036e..12a82e91d22a 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -2059,7 +2059,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2059 dst->addr = src->addr; 2059 dst->addr = src->addr;
2060 dst->port = src->port; 2060 dst->port = src->port;
2061 dst->fwmark = src->fwmark; 2061 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name); 2062 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2063 dst->flags = src->flags; 2063 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ; 2064 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask; 2065 dst->netmask = src->netmask;
@@ -2080,6 +2080,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services) 2081 if (count >= get->num_services)
2082 goto out; 2082 goto out;
2083 memset(&entry, 0, sizeof(entry));
2083 ip_vs_copy_service(&entry, svc); 2084 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count], 2085 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) { 2086 &entry, sizeof(entry))) {
@@ -2094,6 +2095,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2095 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services) 2096 if (count >= get->num_services)
2096 goto out; 2097 goto out;
2098 memset(&entry, 0, sizeof(entry));
2097 ip_vs_copy_service(&entry, svc); 2099 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count], 2100 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) { 2101 &entry, sizeof(entry))) {
@@ -2304,12 +2306,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2304 memset(&d, 0, sizeof(d)); 2306 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) { 2307 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER; 2308 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); 2309 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2308 d[0].syncid = ip_vs_master_syncid; 2310 d[0].syncid = ip_vs_master_syncid;
2309 } 2311 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { 2312 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP; 2313 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); 2314 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2313 d[1].syncid = ip_vs_backup_syncid; 2315 d[1].syncid = ip_vs_backup_syncid;
2314 } 2316 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0) 2317 if (copy_to_user(user, &d, sizeof(d)) != 0)
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c479550a32..574d1f509b46 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
839 839
840 ip_vs_sync_state |= state; 840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) { 841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn); 842 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
843 ip_vs_master_syncid = syncid; 843 ip_vs_master_syncid = syncid;
844 } else { 844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); 845 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
846 ip_vs_backup_syncid = syncid; 846 ip_vs_backup_syncid = syncid;
847 } 847 }
848 848
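
The ip_vs_sync.c hunk above, like the ip_vs_ctl.c ones, swaps strcpy() for strlcpy() so a scheduler or interface name can never overrun the fixed-size field it is copied into. strlcpy() is not available in every userspace C library, so the sketch below carries a minimal local version with the usual semantics (always NUL-terminate, return the source length); sync_daemon and its 16-byte field are illustrative only.

#include <stdio.h>
#include <string.h>

/* bounded copy: truncate to size-1 bytes, always NUL-terminate */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
    size_t len = strlen(src);

    if (size) {
        size_t n = (len >= size) ? size - 1 : len;

        memcpy(dst, src, n);
        dst[n] = '\0';
    }
    return len;     /* a return value >= size means the copy was truncated */
}

struct sync_daemon {
    char mcast_ifn[16];     /* fixed-size field, as in the IPVS structs */
};

int main(void)
{
    struct sync_daemon d;
    const char *name = "a-very-long-interface-name";

    if (my_strlcpy(d.mcast_ifn, name, sizeof(d.mcast_ifn)) >=
        sizeof(d.mcast_ifn))
        printf("name truncated to \"%s\"\n", d.mcast_ifn);
    return 0;
}
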
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 9cde8c61f525..6706d3a1bc4f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -30,7 +30,7 @@
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32 32
33#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.7"
34 34
35#define DEBUG_CLUSTERIP 35#define DEBUG_CLUSTERIP
36 36
@@ -524,8 +524,9 @@ arp_mangle(unsigned int hook,
524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
525 return NF_ACCEPT; 525 return NF_ACCEPT;
526 526
527 /* we only want to mangle arp replies */ 527 /* we only want to mangle arp requests and replies */
528 if (arp->ar_op != htons(ARPOP_REPLY)) 528 if (arp->ar_op != htons(ARPOP_REPLY)
529 && arp->ar_op != htons(ARPOP_REQUEST))
529 return NF_ACCEPT; 530 return NF_ACCEPT;
530 531
531 payload = (void *)(arp+1); 532 payload = (void *)(arp+1);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 80cf633d9f4a..726ea5e8180a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1909,7 +1947,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1909 */ 1947 */
1910 if ((err = fib_lookup(&fl, &res)) != 0) { 1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1911 if (!IN_DEV_FORWARD(in_dev)) 1949 if (!IN_DEV_FORWARD(in_dev))
1912 goto e_inval; 1950 goto e_hostunreach;
1913 goto no_route; 1951 goto no_route;
1914 } 1952 }
1915 free_res = 1; 1953 free_res = 1;
@@ -1933,7 +1971,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1933 } 1971 }
1934 1972
1935 if (!IN_DEV_FORWARD(in_dev)) 1973 if (!IN_DEV_FORWARD(in_dev))
1936 goto e_inval; 1974 goto e_hostunreach;
1937 if (res.type != RTN_UNICAST) 1975 if (res.type != RTN_UNICAST)
1938 goto martian_destination; 1976 goto martian_destination;
1939 1977
@@ -2025,6 +2063,11 @@ martian_destination:
2025 "%u.%u.%u.%u, dev %s\n", 2063 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027#endif 2065#endif
2066
2067e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2070
2028e_inval: 2071e_inval:
2029 err = -EINVAL; 2072 err = -EINVAL;
2030 goto done; 2073 goto done;
@@ -3068,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3068 3111
3069int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3070{ 3113{
3071 int i, order, goal, rc = 0; 3114 int rc = 0;
3072 3115
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3075 3118
3076#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3077 for (order = 0; 3122 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */; 3124 /* NOTHING */;
@@ -3081,6 +3126,7 @@ int __init ip_rt_init(void)
3081 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3084#endif 3130#endif
3085 3131
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3091,36 +3137,19 @@ int __init ip_rt_init(void)
3091 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3093 3139
3094 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3095 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3097 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3098 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3099 3145 (27 - PAGE_SHIFT) :
3100 do { 3146 (29 - PAGE_SHIFT),
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3102 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3103 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3104 rt_hash_mask--; 3150 0);
3105 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3106 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3107 } while (rt_hash_table == NULL && --order > 0);
3108
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3111
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3118
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3123 }
3124 3153
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
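
The per-tick scan goal that replaces the old countdown loop in rt_check_expire() works out to (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets, now computed with 64-bit arithmetic and clamped to the table size, and empty buckets are skipped without taking the per-bucket lock. A minimal user-space model of that computation follows; the parameter values are illustrative, not taken from a running kernel.

/* Model of the scan-goal computation added to rt_check_expire() above:
 * each timer tick walks only enough hash buckets that the whole table
 * is covered roughly once per ip_rt_gc_timeout.
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int scan_goal(uint64_t gc_interval, unsigned int hash_log,
			      uint64_t gc_timeout, unsigned int hash_mask)
{
	uint64_t mult = gc_interval << hash_log;	/* interval * table size */

	if (gc_timeout > 1)
		mult /= gc_timeout;			/* buckets per tick */
	if (mult > hash_mask)
		mult = hash_mask + 1;			/* never more than the table */
	return (unsigned int)mult;
}

int main(void)
{
	/* e.g. 60 s interval, 300 s timeout, 2^17 buckets (illustrative) */
	unsigned int hash_log = 17, hash_mask = (1u << hash_log) - 1;

	printf("buckets scanned per tick: %u\n",
	       scan_goal(60, hash_log, 300, hash_mask));
	return 0;
}
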
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 882436da9a3a..29894c749163 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 766
762 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
763 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 tmp = pgbreak; 769 tmp = pgbreak;
770 }
765 } 771 }
772
766 return tmp; 773 return tmp;
767} 774}
768 775
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
773 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
774 struct sk_buff *skb; 781 struct sk_buff *skb;
775 int iovlen, flags; 782 int iovlen, flags;
776 int mss_now; 783 int mss_now, size_goal;
777 int err, copied; 784 int err, copied;
778 long timeo; 785 long timeo;
779 786
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 800
794 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
795 803
796 /* Ok commence sending. */ 804 /* Ok commence sending. */
797 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
814 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
815 823
816 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
817 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
818 826
819new_segment: 827new_segment:
820 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
837 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
838 846
839 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
840 copy = mss_now; 848 copy = size_goal;
841 } 849 }
842 850
843 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
872 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
873 goto new_segment; 881 goto new_segment;
874 } else if (page) { 882 } else if (page) {
875 /* If page is cached, align
876 * offset to L1 cache boundary
877 */
878 off = (off + L1_CACHE_BYTES - 1) &
879 ~(L1_CACHE_BYTES - 1);
880 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
881 put_page(page); 884 put_page(page);
882 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2128 2132
2129 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2130 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2131 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2132 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2133 2137
2134 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2178 2182
2179 switch (optname) { 2183 switch (optname) {
2180 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2181 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2182 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2183 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2184 break; 2188 break;
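
With tp->xmit_size_goal in play, do_tcp_sendpages() and tcp_sendmsg() above fill the tail skb up to size_goal rather than a single MSS, and only consider pushing once the tail holds at least one full MSS (the skb->len < mss_now test). A toy model of that fill-then-push loop follows; the sizes are purely illustrative and none of the real socket bookkeeping is modelled.

/* Toy model of the sendmsg fill loop after the change above: data is
 * appended to the tail skb until it reaches size_goal (a multiple of
 * the MSS when TSO is on), and a push is considered only once the
 * tail holds at least one full MSS.
 */
#include <stdio.h>

int main(void)
{
	const int mss_now = 1448, size_goal = 4 * 1448;	/* illustrative */
	int to_send = 10000, tail_len = 0, full_segs = 0;

	while (to_send > 0) {
		int copy = size_goal - tail_len;	/* room left in tail skb */

		if (copy <= 0) {			/* tail full: start a new skb */
			full_segs++;
			tail_len = 0;
			copy = size_goal;
		}
		if (copy > to_send)
			copy = to_send;
		tail_len += copy;
		to_send -= copy;

		if (tail_len < mss_now)			/* sub-MSS tail: keep filling */
			continue;
		/* here the real code may push (forced_push / nonagle) */
	}
	printf("queued %d full super-segment(s) plus a %d-byte tail\n",
	       full_segs, tail_len);
	return 0;
}
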
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7bbbbc33eb4b..8de2f1071c2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
741 741
742 if (!cwnd) { 742 if (!cwnd) {
743 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
744 cwnd = 2; 744 cwnd = 2;
745 else 745 else
746 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
747 } 747 }
748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
749} 749}
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
914 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
915 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
916 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
917 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
918 } 918 }
919 919
920 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1077 (IsFack(tp) || 1077 (IsFack(tp) ||
1078 !before(lost_retrans, 1078 !before(lost_retrans,
1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1080 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1083 1083
@@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
1957 } 1957 }
1958} 1958}
1959 1959
1960/* There is one downside to this scheme. Although we keep the
1961 * ACK clock ticking, adjusting packet counters and advancing
1962 * congestion window, we do not liberate socket send buffer
1963 * space.
1964 *
1965 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
1966 * then making a write space wakeup callback is a possible
1967 * future enhancement. WARNING: it is not trivial to make.
1968 */
1969static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
1970 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
1971{ 1962{
@@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2047 * the other end. 2038 * the other end.
2048 */ 2039 */
2049 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2050 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2051 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2052 now, &seq_rtt); 2044 now, &seq_rtt);
2053 break; 2045 break;
@@ -3308,6 +3300,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3308 tp->snd_cwnd_stamp = tcp_time_stamp; 3300 tp->snd_cwnd_stamp = tcp_time_stamp;
3309} 3301}
3310 3302
3303static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3304{
3305 /* If the user specified a specific send buffer setting, do
3306 * not modify it.
3307 */
3308 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3309 return 0;
3310
3311 /* If we are under global TCP memory pressure, do not expand. */
3312 if (tcp_memory_pressure)
3313 return 0;
3314
3315 /* If we are under soft global TCP memory pressure, do not expand. */
3316 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3317 return 0;
3318
3319 /* If we filled the congestion window, do not expand. */
3320 if (tp->packets_out >= tp->snd_cwnd)
3321 return 0;
3322
3323 return 1;
3324}
3311 3325
3312/* When incoming ACK allowed to free some skb from write_queue, 3326/* When incoming ACK allowed to free some skb from write_queue,
3313 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3327 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3319,11 +3333,8 @@ static void tcp_new_space(struct sock *sk)
3319{ 3333{
3320 struct tcp_sock *tp = tcp_sk(sk); 3334 struct tcp_sock *tp = tcp_sk(sk);
3321 3335
3322 if (tp->packets_out < tp->snd_cwnd && 3336 if (tcp_should_expand_sndbuf(sk, tp)) {
3323 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3337 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3324 !tcp_memory_pressure &&
3325 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3326 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3327 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3338 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3328 demanded = max_t(unsigned int, tp->snd_cwnd, 3339 demanded = max_t(unsigned int, tp->snd_cwnd,
3329 tp->reordering + 1); 3340 tp->reordering + 1);
@@ -3346,22 +3357,9 @@ static inline void tcp_check_space(struct sock *sk)
3346 } 3357 }
3347} 3358}
3348 3359
3349static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3360static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3350{
3351 struct tcp_sock *tp = tcp_sk(sk);
3352
3353 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3354 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3355 tcp_write_xmit(sk, tp->nonagle))
3356 tcp_check_probe_timer(sk, tp);
3357}
3358
3359static __inline__ void tcp_data_snd_check(struct sock *sk)
3360{ 3361{
3361 struct sk_buff *skb = sk->sk_send_head; 3362 tcp_push_pending_frames(sk, tp);
3362
3363 if (skb != NULL)
3364 __tcp_data_snd_check(sk, skb);
3365 tcp_check_space(sk); 3363 tcp_check_space(sk);
3366} 3364}
3367 3365
@@ -3655,7 +3653,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3655 */ 3653 */
3656 tcp_ack(sk, skb, 0); 3654 tcp_ack(sk, skb, 0);
3657 __kfree_skb(skb); 3655 __kfree_skb(skb);
3658 tcp_data_snd_check(sk); 3656 tcp_data_snd_check(sk, tp);
3659 return 0; 3657 return 0;
3660 } else { /* Header too small */ 3658 } else { /* Header too small */
3661 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3659 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -3721,7 +3719,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3721 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3719 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
3722 /* Well, only one small jumplet in fast path... */ 3720 /* Well, only one small jumplet in fast path... */
3723 tcp_ack(sk, skb, FLAG_DATA); 3721 tcp_ack(sk, skb, FLAG_DATA);
3724 tcp_data_snd_check(sk); 3722 tcp_data_snd_check(sk, tp);
3725 if (!tcp_ack_scheduled(tp)) 3723 if (!tcp_ack_scheduled(tp))
3726 goto no_ack; 3724 goto no_ack;
3727 } 3725 }
@@ -3799,7 +3797,7 @@ step5:
3799 /* step 7: process the segment text */ 3797 /* step 7: process the segment text */
3800 tcp_data_queue(sk, skb); 3798 tcp_data_queue(sk, skb);
3801 3799
3802 tcp_data_snd_check(sk); 3800 tcp_data_snd_check(sk, tp);
3803 tcp_ack_snd_check(sk); 3801 tcp_ack_snd_check(sk);
3804 return 0; 3802 return 0;
3805 3803
@@ -4109,7 +4107,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4109 /* Do step6 onward by hand. */ 4107 /* Do step6 onward by hand. */
4110 tcp_urg(sk, skb, th); 4108 tcp_urg(sk, skb, th);
4111 __kfree_skb(skb); 4109 __kfree_skb(skb);
4112 tcp_data_snd_check(sk); 4110 tcp_data_snd_check(sk, tp);
4113 return 0; 4111 return 0;
4114 } 4112 }
4115 4113
@@ -4300,7 +4298,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4300 4298
4301 /* tcp_data could move socket to TIME-WAIT */ 4299 /* tcp_data could move socket to TIME-WAIT */
4302 if (sk->sk_state != TCP_CLOSE) { 4300 if (sk->sk_state != TCP_CLOSE) {
4303 tcp_data_snd_check(sk); 4301 tcp_data_snd_check(sk, tp);
4304 tcp_ack_snd_check(sk); 4302 tcp_ack_snd_check(sk);
4305 } 4303 }
4306 4304
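
tcp_should_expand_sndbuf() above only gates whether growth is allowed; the sizing itself (in tcp_new_space()) still reserves room for one maximally sized skb per packet the window could need. A rough user-space model of that calculation follows, with stand-in constants for MAX_TCP_HEADER, sizeof(struct sk_buff) and sysctl_tcp_wmem[2].

/* Rough model of the send-buffer growth done by tcp_new_space() once
 * tcp_should_expand_sndbuf() says growth is allowed.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int mss_clamp = 1460, mss_cache = 1448;	/* illustrative */
	const unsigned int max_tcp_header = 192, skbuff_sz = 240;
	const unsigned int snd_cwnd = 10, reordering = 3;
	const unsigned int wmem_max = 4 * 1024 * 1024;

	unsigned int mss = mss_clamp > mss_cache ? mss_clamp : mss_cache;
	unsigned int sndmem = mss + max_tcp_header + 16 + skbuff_sz;
	unsigned int demanded = snd_cwnd > reordering + 1 ? snd_cwnd
							  : reordering + 1;
	unsigned int sndbuf = sndmem * 2 * demanded;

	if (sndbuf > wmem_max)
		sndbuf = wmem_max;	/* clamped as sysctl_tcp_wmem[2] would */
	printf("proposed sk_sndbuf: %u bytes\n", sndbuf);
	return 0;
}
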
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2045 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2047 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache = 536;
2049 2049
2050 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops; 2051 tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0e17c244875c..e041d057ec86 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
@@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
140 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
141} 141}
142 142
143static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
144{ 144{
145 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
146 146
147 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
149} 149}
150 150
@@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
355 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
356 356
357 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
358 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
359 359
360 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
361 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
403 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
404} 404}
405 405
406static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
407{
408 /* Force push to be on for any TSO frames to workaround
409 * problems with busted implementations like Mac OS-X that
410 * hold off socket receive wakeups until push is seen.
411 */
412 if (tcp_skb_pcount(skb) > 1)
413 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
414}
415
416/* Send _single_ skb sitting at the send head. This function requires
417 * true push pending frames to setup probe timer etc.
418 */
419void tcp_push_one(struct sock *sk, unsigned cur_mss)
420{ 407{
421 struct tcp_sock *tp = tcp_sk(sk); 408 struct tcp_sock *tp = tcp_sk(sk);
422 struct sk_buff *skb = sk->sk_send_head;
423 409
424 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { 410 if (skb->len <= tp->mss_cache ||
425 /* Send it out now. */
426 TCP_SKB_CB(skb)->when = tcp_time_stamp;
427 tcp_tso_set_push(skb);
428 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
429 sk->sk_send_head = NULL;
430 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
431 tcp_packets_out_inc(sk, tp, skb);
432 return;
433 }
434 }
435}
436
437void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
438{
439 struct tcp_sock *tp = tcp_sk(sk);
440
441 if (skb->len <= tp->mss_cache_std ||
442 !(sk->sk_route_caps & NETIF_F_TSO)) { 411 !(sk->sk_route_caps & NETIF_F_TSO)) {
443 /* Avoid the costly divide in the normal 412 /* Avoid the costly divide in the normal
444 * non-TSO case. 413 * non-TSO case.
@@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
448 } else { 417 } else {
449 unsigned int factor; 418 unsigned int factor;
450 419
451 factor = skb->len + (tp->mss_cache_std - 1); 420 factor = skb->len + (tp->mss_cache - 1);
452 factor /= tp->mss_cache_std; 421 factor /= tp->mss_cache;
453 skb_shinfo(skb)->tso_segs = factor; 422 skb_shinfo(skb)->tso_segs = factor;
454 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 423 skb_shinfo(skb)->tso_size = tp->mss_cache;
455 } 424 }
456} 425}
457 426
@@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
537 } 506 }
538 507
539 /* Link BUFF into the send queue. */ 508 /* Link BUFF into the send queue. */
509 skb_header_release(buff);
540 __skb_append(skb, buff); 510 __skb_append(skb, buff);
541 511
542 return 0; 512 return 0;
@@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
657 627
658 /* And store cached results */ 628 /* And store cached results */
659 tp->pmtu_cookie = pmtu; 629 tp->pmtu_cookie = pmtu;
660 tp->mss_cache = tp->mss_cache_std = mss_now; 630 tp->mss_cache = mss_now;
661 631
662 return mss_now; 632 return mss_now;
663} 633}
@@ -669,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
669 * cannot be large. However, taking into account rare use of URG, this 639 * cannot be large. However, taking into account rare use of URG, this
670 * is not a big flaw. 640 * is not a big flaw.
671 */ 641 */
672 642unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
673unsigned int tcp_current_mss(struct sock *sk, int large)
674{ 643{
675 struct tcp_sock *tp = tcp_sk(sk); 644 struct tcp_sock *tp = tcp_sk(sk);
676 struct dst_entry *dst = __sk_dst_get(sk); 645 struct dst_entry *dst = __sk_dst_get(sk);
677 unsigned int do_large, mss_now; 646 u32 mss_now;
647 u16 xmit_size_goal;
648 int doing_tso = 0;
649
650 mss_now = tp->mss_cache;
651
652 if (large_allowed &&
653 (sk->sk_route_caps & NETIF_F_TSO) &&
654 !tp->urg_mode)
655 doing_tso = 1;
678 656
679 mss_now = tp->mss_cache_std;
680 if (dst) { 657 if (dst) {
681 u32 mtu = dst_mtu(dst); 658 u32 mtu = dst_mtu(dst);
682 if (mtu != tp->pmtu_cookie) 659 if (mtu != tp->pmtu_cookie)
683 mss_now = tcp_sync_mss(sk, mtu); 660 mss_now = tcp_sync_mss(sk, mtu);
684 } 661 }
685 662
686 do_large = (large && 663 if (tp->rx_opt.eff_sacks)
687 (sk->sk_route_caps & NETIF_F_TSO) && 664 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
688 !tp->urg_mode); 665 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
689 666
690 if (do_large) { 667 xmit_size_goal = mss_now;
691 unsigned int large_mss, factor, limit;
692 668
693 large_mss = 65535 - tp->af_specific->net_header_len - 669 if (doing_tso) {
670 xmit_size_goal = 65535 -
671 tp->af_specific->net_header_len -
694 tp->ext_header_len - tp->tcp_header_len; 672 tp->ext_header_len - tp->tcp_header_len;
695 673
696 if (tp->max_window && large_mss > (tp->max_window>>1)) 674 if (tp->max_window &&
697 large_mss = max((tp->max_window>>1), 675 (xmit_size_goal > (tp->max_window >> 1)))
698 68U - tp->tcp_header_len); 676 xmit_size_goal = max((tp->max_window >> 1),
677 68U - tp->tcp_header_len);
678
679 xmit_size_goal -= (xmit_size_goal % mss_now);
680 }
681 tp->xmit_size_goal = xmit_size_goal;
699 682
700 factor = large_mss / mss_now; 683 return mss_now;
684}
701 685
702 /* Always keep large mss multiple of real mss, but 686/* Congestion window validation. (RFC2861) */
703 * do not exceed 1/tso_win_divisor of the congestion window
704 * so we can keep the ACK clock ticking and minimize
705 * bursting.
706 */
707 limit = tp->snd_cwnd;
708 if (sysctl_tcp_tso_win_divisor)
709 limit /= sysctl_tcp_tso_win_divisor;
710 limit = max(1U, limit);
711 if (factor > limit)
712 factor = limit;
713 687
714 tp->mss_cache = mss_now * factor; 688static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
689{
690 __u32 packets_out = tp->packets_out;
691
692 if (packets_out >= tp->snd_cwnd) {
693 /* Network is feed fully. */
694 tp->snd_cwnd_used = 0;
695 tp->snd_cwnd_stamp = tcp_time_stamp;
696 } else {
697 /* Network starves. */
698 if (tp->packets_out > tp->snd_cwnd_used)
699 tp->snd_cwnd_used = tp->packets_out;
715 700
716 mss_now = tp->mss_cache; 701 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
702 tcp_cwnd_application_limited(sk);
717 } 703 }
704}
718 705
719 if (tp->rx_opt.eff_sacks) 706static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
720 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 707{
721 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 708 u32 window, cwnd_len;
722 return mss_now; 709
710 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
711 cwnd_len = mss_now * cwnd;
712 return min(window, cwnd_len);
713}
714
715/* Can at least one segment of SKB be sent right now, according to the
716 * congestion window rules? If so, return how many segments are allowed.
717 */
718static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
719{
720 u32 in_flight, cwnd;
721
722 /* Don't be strict about the congestion window for the final FIN. */
723 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
724 return 1;
725
726 in_flight = tcp_packets_in_flight(tp);
727 cwnd = tp->snd_cwnd;
728 if (in_flight < cwnd)
729 return (cwnd - in_flight);
730
731 return 0;
732}
733
734/* This must be invoked the first time we consider transmitting
735 * SKB onto the wire.
736 */
737static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
738{
739 int tso_segs = tcp_skb_pcount(skb);
740
741 if (!tso_segs) {
742 tcp_set_skb_tso_segs(sk, skb);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0, if packet can be sent now without violation Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* Nagle rule does not apply to frames, which sit in the middle of the
780 * write_queue (they have no chances to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 BUG_ON(skb->len != skb->data_len);
865
866 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
867 if (unlikely(buff == NULL))
868 return -ENOMEM;
869
870 buff->truesize = nlen;
871 skb->truesize -= nlen;
872
873 /* Correct the sequence numbers. */
874 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
875 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
876 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
877
878 /* PSH and FIN should only be set in the second packet. */
879 flags = TCP_SKB_CB(skb)->flags;
880 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
881 TCP_SKB_CB(buff)->flags = flags;
882
883 /* This packet was never sent out yet, so no SACK bits. */
884 TCP_SKB_CB(buff)->sacked = 0;
885
886 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
887 skb_split(skb, buff, len);
888
889 /* Fix up tso_factor for both original and new SKB. */
890 tcp_set_skb_tso_segs(sk, skb);
891 tcp_set_skb_tso_segs(sk, buff);
892
893 /* Link BUFF into the send queue. */
894 skb_header_release(buff);
895 __skb_append(skb, buff);
896
897 return 0;
898}
899
900/* Try to defer sending, if possible, in order to minimize the amount
901 * of TSO splitting we do. View it as a kind of TSO Nagle test.
902 *
903 * This algorithm is from John Heffner.
904 */
905static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
906{
907 u32 send_win, cong_win, limit, in_flight;
908
909 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
910 return 0;
911
912 if (tp->ca_state != TCP_CA_Open)
913 return 0;
914
915 in_flight = tcp_packets_in_flight(tp);
916
917 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
918 (tp->snd_cwnd <= in_flight));
919
920 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
921
922 /* From in_flight test above, we know that cwnd > in_flight. */
923 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
924
925 limit = min(send_win, cong_win);
926
927 /* If sk_send_head can be sent fully now, just do it. */
928 if (skb->len <= limit)
929 return 0;
930
931 if (sysctl_tcp_tso_win_divisor) {
932 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
933
934 /* If at least some fraction of a window is available,
935 * just use it.
936 */
937 chunk /= sysctl_tcp_tso_win_divisor;
938 if (limit >= chunk)
939 return 0;
940 } else {
941 /* Different approach, try not to defer past a single
942 * ACK. Receiver should ACK every other full sized
943 * frame, so if we have space for more than 3 frames
944 * then send now.
945 */
946 if (limit > tcp_max_burst(tp) * tp->mss_cache)
947 return 0;
948 }
949
950 /* Ok, it looks like it is advisable to defer. */
951 return 1;
723} 952}
724 953
725/* This routine writes packets to the network. It advances the 954/* This routine writes packets to the network. It advances the
@@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
729 * Returns 1, if no segments are in flight and we have queued segments, but 958 * Returns 1, if no segments are in flight and we have queued segments, but
730 * cannot send anything now because of SWS or another problem. 959 * cannot send anything now because of SWS or another problem.
731 */ 960 */
732int tcp_write_xmit(struct sock *sk, int nonagle) 961static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
733{ 962{
734 struct tcp_sock *tp = tcp_sk(sk); 963 struct tcp_sock *tp = tcp_sk(sk);
735 unsigned int mss_now; 964 struct sk_buff *skb;
965 unsigned int tso_segs, sent_pkts;
966 int cwnd_quota;
736 967
737 /* If we are closed, the bytes will have to remain here. 968 /* If we are closed, the bytes will have to remain here.
738 * In time closedown will finish, we empty the write queue and all 969 * In time closedown will finish, we empty the write queue and all
739 * will be happy. 970 * will be happy.
740 */ 971 */
741 if (sk->sk_state != TCP_CLOSE) { 972 if (unlikely(sk->sk_state == TCP_CLOSE))
742 struct sk_buff *skb; 973 return 0;
743 int sent_pkts = 0; 974
975 skb = sk->sk_send_head;
976 if (unlikely(!skb))
977 return 0;
978
979 tso_segs = tcp_init_tso_segs(sk, skb);
980 cwnd_quota = tcp_cwnd_test(tp, skb);
981 if (unlikely(!cwnd_quota))
982 goto out;
983
984 sent_pkts = 0;
985 while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
986 BUG_ON(!tso_segs);
987
988 if (tso_segs == 1) {
989 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
990 (tcp_skb_is_last(sk, skb) ?
991 nonagle : TCP_NAGLE_PUSH))))
992 break;
993 } else {
994 if (tcp_tso_should_defer(sk, tp, skb))
995 break;
996 }
744 997
745 /* Account for SACKS, we may need to fragment due to this. 998 if (tso_segs > 1) {
746 * It is just like the real MSS changing on us midstream. 999 u32 limit = tcp_window_allows(tp, skb,
747 * We also handle things correctly when the user adds some 1000 mss_now, cwnd_quota);
748 * IP options mid-stream. Silly to do, but cover it. 1001
749 */ 1002 if (skb->len < limit) {
750 mss_now = tcp_current_mss(sk, 1); 1003 unsigned int trim = skb->len % mss_now;
751 1004
752 while ((skb = sk->sk_send_head) && 1005 if (trim)
753 tcp_snd_test(sk, skb, mss_now, 1006 limit = skb->len - trim;
754 tcp_skb_is_last(sk, skb) ? nonagle : 1007 }
755 TCP_NAGLE_PUSH)) { 1008 if (skb->len > limit) {
756 if (skb->len > mss_now) { 1009 if (tso_fragment(sk, skb, limit))
757 if (tcp_fragment(sk, skb, mss_now))
758 break; 1010 break;
759 } 1011 }
760 1012 } else if (unlikely(skb->len > mss_now)) {
761 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1013 if (unlikely(tcp_fragment(sk, skb, mss_now)))
762 tcp_tso_set_push(skb);
763 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
764 break; 1014 break;
1015 }
765 1016
766 /* Advance the send_head. This one is sent out. 1017 TCP_SKB_CB(skb)->when = tcp_time_stamp;
767 * This call will increment packets_out. 1018
768 */ 1019 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
769 update_send_head(sk, tp, skb); 1020 break;
1021
1022 /* Advance the send_head. This one is sent out.
1023 * This call will increment packets_out.
1024 */
1025 update_send_head(sk, tp, skb);
1026
1027 tcp_minshall_update(tp, mss_now, skb);
1028 sent_pkts++;
1029
1030 /* Do not optimize this to use tso_segs. If we chopped up
1031 * the packet above, tso_segs will no longer be valid.
1032 */
1033 cwnd_quota -= tcp_skb_pcount(skb);
1034
1035 BUG_ON(cwnd_quota < 0);
1036 if (!cwnd_quota)
1037 break;
1038
1039 skb = sk->sk_send_head;
1040 if (!skb)
1041 break;
1042 tso_segs = tcp_init_tso_segs(sk, skb);
1043 }
1044
1045 if (likely(sent_pkts)) {
1046 tcp_cwnd_validate(sk, tp);
1047 return 0;
1048 }
1049out:
1050 return !tp->packets_out && sk->sk_send_head;
1051}
1052
1053/* Push out any pending frames which were held back due to
1054 * TCP_CORK or attempt at coalescing tiny packets.
1055 * The socket must be locked by the caller.
1056 */
1057void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1058 unsigned int cur_mss, int nonagle)
1059{
1060 struct sk_buff *skb = sk->sk_send_head;
770 1061
771 tcp_minshall_update(tp, mss_now, skb); 1062 if (skb) {
772 sent_pkts = 1; 1063 if (tcp_write_xmit(sk, cur_mss, nonagle))
1064 tcp_check_probe_timer(sk, tp);
1065 }
1066}
1067
1068/* Send _single_ skb sitting at the send head. This function requires
1069 * true push pending frames to setup probe timer etc.
1070 */
1071void tcp_push_one(struct sock *sk, unsigned int mss_now)
1072{
1073 struct tcp_sock *tp = tcp_sk(sk);
1074 struct sk_buff *skb = sk->sk_send_head;
1075 unsigned int tso_segs, cwnd_quota;
1076
1077 BUG_ON(!skb || skb->len < mss_now);
1078
1079 tso_segs = tcp_init_tso_segs(sk, skb);
1080 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1081
1082 if (likely(cwnd_quota)) {
1083 BUG_ON(!tso_segs);
1084
1085 if (tso_segs > 1) {
1086 u32 limit = tcp_window_allows(tp, skb,
1087 mss_now, cwnd_quota);
1088
1089 if (skb->len < limit) {
1090 unsigned int trim = skb->len % mss_now;
1091
1092 if (trim)
1093 limit = skb->len - trim;
1094 }
1095 if (skb->len > limit) {
1096 if (unlikely(tso_fragment(sk, skb, limit)))
1097 return;
1098 }
1099 } else if (unlikely(skb->len > mss_now)) {
1100 if (unlikely(tcp_fragment(sk, skb, mss_now)))
1101 return;
773 } 1102 }
774 1103
775 if (sent_pkts) { 1104 /* Send it out now. */
1105 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106
1107 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1108 update_send_head(sk, tp, skb);
776 tcp_cwnd_validate(sk, tp); 1109 tcp_cwnd_validate(sk, tp);
777 return 0; 1110 return;
778 } 1111 }
779
780 return !tp->packets_out && sk->sk_send_head;
781 } 1112 }
782 return 0;
783} 1113}
784 1114
785/* This function returns the amount that we can raise the 1115/* This function returns the amount that we can raise the
@@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1039 if (sk->sk_route_caps & NETIF_F_TSO) { 1369 if (sk->sk_route_caps & NETIF_F_TSO) {
1040 sk->sk_route_caps &= ~NETIF_F_TSO; 1370 sk->sk_route_caps &= ~NETIF_F_TSO;
1041 sock_set_flag(sk, SOCK_NO_LARGESEND); 1371 sock_set_flag(sk, SOCK_NO_LARGESEND);
1042 tp->mss_cache = tp->mss_cache_std;
1043 } 1372 }
1044 1373
1045 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1374 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1101 * is still in somebody's hands, else make a clone. 1430 * is still in somebody's hands, else make a clone.
1102 */ 1431 */
1103 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1104 tcp_tso_set_push(skb);
1105 1433
1106 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1434 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1107 pskb_copy(skb, GFP_ATOMIC): 1435 pskb_copy(skb, GFP_ATOMIC):
@@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
1670 if (sk->sk_route_caps & NETIF_F_TSO) { 1998 if (sk->sk_route_caps & NETIF_F_TSO) {
1671 sock_set_flag(sk, SOCK_NO_LARGESEND); 1999 sock_set_flag(sk, SOCK_NO_LARGESEND);
1672 sk->sk_route_caps &= ~NETIF_F_TSO; 2000 sk->sk_route_caps &= ~NETIF_F_TSO;
1673 tp->mss_cache = tp->mss_cache_std;
1674 } 2001 }
1675 } else if (!tcp_skb_pcount(skb)) 2002 } else if (!tcp_skb_pcount(skb))
1676 tcp_set_skb_tso_segs(sk, skb); 2003 tcp_set_skb_tso_segs(sk, skb);
1677 2004
1678 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2005 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1679 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2006 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1680 tcp_tso_set_push(skb);
1681 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 2007 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1682 if (!err) { 2008 if (!err) {
1683 update_send_head(sk, tp, skb); 2009 update_send_head(sk, tp, skb);
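
The heart of the new TSO path above is tcp_tso_should_defer(): a super-segment is held back unless it already fits in the usable window, or at least 1/tcp_tso_win_divisor of the window (or, with the divisor set to zero, roughly tcp_max_burst full frames) is usable right now. A stand-alone sketch of that decision follows; all values are illustrative.

/* Sketch of the deferral test tcp_tso_should_defer() adds above
 * (John Heffner's heuristic).
 */
#include <stdio.h>

static int tso_should_defer(unsigned int skb_len, unsigned int send_win,
			    unsigned int cong_win, unsigned int snd_wnd,
			    unsigned int cwnd_bytes, unsigned int mss,
			    unsigned int win_divisor, unsigned int max_burst)
{
	unsigned int limit = send_win < cong_win ? send_win : cong_win;

	if (skb_len <= limit)		/* fits entirely: send now */
		return 0;

	if (win_divisor) {
		unsigned int chunk = snd_wnd < cwnd_bytes ? snd_wnd : cwnd_bytes;

		chunk /= win_divisor;
		if (limit >= chunk)	/* a useful fraction is usable */
			return 0;
	} else if (limit > max_burst * mss) {
		return 0;		/* don't defer past a single ACK */
	}
	return 1;			/* wait for more window/ACKs */
}

int main(void)
{
	/* 64 KB queued, only ~2 MSS usable now, default divisor of 3 */
	printf("defer? %d\n",
	       tso_should_defer(65536, 2896, 20000, 65535, 14480, 1448, 3, 3));
	return 0;
}
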
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a54d4ef3fd35..77004b9456c0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2777,7 +2777,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2777 read_lock_bh(&idev->lock); 2777 read_lock_bh(&idev->lock);
2778 switch (type) { 2778 switch (type) {
2779 case UNICAST_ADDR: 2779 case UNICAST_ADDR:
2780 /* unicast address */ 2780 /* unicast address incl. temp addr */
2781 for (ifa = idev->addr_list; ifa; 2781 for (ifa = idev->addr_list; ifa;
2782 ifa = ifa->if_next, ip_idx++) { 2782 ifa = ifa->if_next, ip_idx++) {
2783 if (ip_idx < s_ip_idx) 2783 if (ip_idx < s_ip_idx)
@@ -2788,19 +2788,6 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2788 NLM_F_MULTI)) <= 0) 2788 NLM_F_MULTI)) <= 0)
2789 goto done; 2789 goto done;
2790 } 2790 }
2791 /* temp addr */
2792#ifdef CONFIG_IPV6_PRIVACY
2793 for (ifa = idev->tempaddr_list; ifa;
2794 ifa = ifa->tmp_next, ip_idx++) {
2795 if (ip_idx < s_ip_idx)
2796 continue;
2797 if ((err = inet6_fill_ifaddr(skb, ifa,
2798 NETLINK_CB(cb->skb).pid,
2799 cb->nlh->nlmsg_seq, RTM_NEWADDR,
2800 NLM_F_MULTI)) <= 0)
2801 goto done;
2802 }
2803#endif
2804 break; 2791 break;
2805 case MULTICAST_ADDR: 2792 case MULTICAST_ADDR:
2806 /* multicast address */ 2793 /* multicast address */
@@ -2923,6 +2910,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
2923 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); 2910 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2924 r = NLMSG_DATA(nlh); 2911 r = NLMSG_DATA(nlh);
2925 r->ifi_family = AF_INET6; 2912 r->ifi_family = AF_INET6;
2913 r->__ifi_pad = 0;
2926 r->ifi_type = dev->type; 2914 r->ifi_type = dev->type;
2927 r->ifi_index = dev->ifindex; 2915 r->ifi_index = dev->ifindex;
2928 r->ifi_flags = dev_get_flags(dev); 2916 r->ifi_flags = dev_get_flags(dev);
@@ -3030,9 +3018,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
3030 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags); 3018 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
3031 pmsg = NLMSG_DATA(nlh); 3019 pmsg = NLMSG_DATA(nlh);
3032 pmsg->prefix_family = AF_INET6; 3020 pmsg->prefix_family = AF_INET6;
3021 pmsg->prefix_pad1 = 0;
3022 pmsg->prefix_pad2 = 0;
3033 pmsg->prefix_ifindex = idev->dev->ifindex; 3023 pmsg->prefix_ifindex = idev->dev->ifindex;
3034 pmsg->prefix_len = pinfo->prefix_len; 3024 pmsg->prefix_len = pinfo->prefix_len;
3035 pmsg->prefix_type = pinfo->type; 3025 pmsg->prefix_type = pinfo->type;
3026 pmsg->prefix_pad3 = 0;
3036 3027
3037 pmsg->prefix_flags = 0; 3028 pmsg->prefix_flags = 0;
3038 if (pinfo->onlink) 3029 if (pinfo->onlink)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3df49a..28d9bcab0970 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
774 if (if6_proc_init()) 774 if (if6_proc_init())
775 goto proc_if6_fail; 775 goto proc_if6_fail;
776#endif 776#endif
777 ipv6_packet_init();
778 ip6_route_init(); 777 ip6_route_init();
779 ip6_flowlabel_init(); 778 ip6_flowlabel_init();
780 err = addrconf_init(); 779 err = addrconf_init();
@@ -791,6 +790,8 @@ static int __init inet6_init(void)
791 /* Init v6 transport protocols. */ 790 /* Init v6 transport protocols. */
792 udpv6_init(); 791 udpv6_init();
793 tcpv6_init(); 792 tcpv6_init();
793
794 ipv6_packet_init();
794 err = 0; 795 err = 0;
795out: 796out:
796 return err; 797 return err;
@@ -798,7 +799,6 @@ out:
798addrconf_fail: 799addrconf_fail:
799 ip6_flowlabel_cleanup(); 800 ip6_flowlabel_cleanup();
800 ip6_route_cleanup(); 801 ip6_route_cleanup();
801 ipv6_packet_cleanup();
802#ifdef CONFIG_PROC_FS 802#ifdef CONFIG_PROC_FS
803 if6_proc_exit(); 803 if6_proc_exit();
804proc_if6_fail: 804proc_if6_fail:
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 0e5f7499debb..b6c73da5ff35 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
244 opt_space->opt_nflen = 0; 244 opt_space->opt_nflen = 0;
245 } 245 }
246 opt_space->dst1opt = fopt->dst1opt; 246 opt_space->dst1opt = fopt->dst1opt;
247 opt_space->auth = fopt->auth;
248 opt_space->opt_flen = fopt->opt_flen; 247 opt_space->opt_flen = fopt->opt_flen;
249 return opt_space; 248 return opt_space;
250} 249}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 06e7cdaeedc5..1f2c2f9e353f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
465 to->pkt_type = from->pkt_type; 465 to->pkt_type = from->pkt_type;
466 to->priority = from->priority; 466 to->priority = from->priority;
467 to->protocol = from->protocol; 467 to->protocol = from->protocol;
468 to->security = from->security;
469 dst_release(to->dst); 468 dst_release(to->dst);
470 to->dst = dst_clone(from->dst); 469 to->dst = dst_clone(from->dst);
471 to->dev = from->dev; 470 to->dev = from->dev;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9dac7fdf4726..f6e288dc116e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk)
2018 */ 2018 */
2019 tp->snd_ssthresh = 0x7fffffff; 2019 tp->snd_ssthresh = 0x7fffffff;
2020 tp->snd_cwnd_clamp = ~0; 2020 tp->snd_cwnd_clamp = ~0;
2021 tp->mss_cache_std = tp->mss_cache = 536; 2021 tp->mss_cache = 536;
2022 2022
2023 tp->reordering = sysctl_tcp_reordering; 2023 tp->reordering = sysctl_tcp_reordering;
2024 2024
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 70bcd4744d93..fc456a7aaec3 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -315,8 +315,8 @@ err:
315static void netlink_remove(struct sock *sk) 315static void netlink_remove(struct sock *sk)
316{ 316{
317 netlink_table_grab(); 317 netlink_table_grab();
318 nl_table[sk->sk_protocol].hash.entries--; 318 if (sk_del_node_init(sk))
319 sk_del_node_init(sk); 319 nl_table[sk->sk_protocol].hash.entries--;
320 if (nlk_sk(sk)->groups) 320 if (nlk_sk(sk)->groups)
321 __sk_del_bind_node(sk); 321 __sk_del_bind_node(sk);
322 netlink_table_ungrab(); 322 netlink_table_ungrab();
@@ -429,7 +429,12 @@ retry:
429 err = netlink_insert(sk, pid); 429 err = netlink_insert(sk, pid);
430 if (err == -EADDRINUSE) 430 if (err == -EADDRINUSE)
431 goto retry; 431 goto retry;
432 return 0; 432
433 /* If 2 threads race to autobind, that is fine. */
434 if (err == -EBUSY)
435 err = 0;
436
437 return err;
433} 438}
434 439
435static inline int netlink_capable(struct socket *sock, unsigned int flag) 440static inline int netlink_capable(struct socket *sock, unsigned int flag)
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8f58cecd6266..e48d0d456b3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y := sch_generic.o 5obj-y := sch_generic.o
6 6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o 7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
8obj-$(CONFIG_NET_CLS) += cls_api.o 8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o 9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o 10obj-$(CONFIG_NET_ACT_POLICE) += police.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9594206e6035..249c61936ea0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -439,6 +439,8 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
439 439
440 t = NLMSG_DATA(nlh); 440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC; 441 t->tca_family = AF_UNSPEC;
442 t->tca__pad1 = 0;
443 t->tca__pad2 = 0;
442 444
443 x = (struct rtattr*) skb->tail; 445 x = (struct rtattr*) skb->tail;
444 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 446 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -580,6 +582,8 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
580 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); 582 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
581 t = NLMSG_DATA(nlh); 583 t = NLMSG_DATA(nlh);
582 t->tca_family = AF_UNSPEC; 584 t->tca_family = AF_UNSPEC;
585 t->tca__pad1 = 0;
586 t->tca__pad2 = 0;
583 587
584 x = (struct rtattr *) skb->tail; 588 x = (struct rtattr *) skb->tail;
585 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 589 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -687,7 +691,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
687 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); 691 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
688 t = NLMSG_DATA(nlh); 692 t = NLMSG_DATA(nlh);
689 t->tca_family = AF_UNSPEC; 693 t->tca_family = AF_UNSPEC;
690 694 t->tca__pad1 = 0;
695 t->tca__pad2 = 0;
696
691 x = (struct rtattr*) skb->tail; 697 x = (struct rtattr*) skb->tail;
692 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 698 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
693 699
@@ -842,6 +848,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
842 cb->nlh->nlmsg_type, sizeof(*t)); 848 cb->nlh->nlmsg_type, sizeof(*t));
843 t = NLMSG_DATA(nlh); 849 t = NLMSG_DATA(nlh);
844 t->tca_family = AF_UNSPEC; 850 t->tca_family = AF_UNSPEC;
851 t->tca__pad1 = 0;
852 t->tca__pad2 = 0;
845 853
846 x = (struct rtattr *) skb->tail; 854 x = (struct rtattr *) skb->tail;
847 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 855 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
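
The tca__pad1/tca__pad2 stores added above (and the matching tcm__pad, prefix_pad and pinfo.pad initializations elsewhere in this diff) exist because the message header is copied to userspace verbatim, so any field left untouched leaks uninitialized kernel memory. A small user-space analogue follows, using a simplified stand-in for struct tcamsg.

/* Analogue of the fix above: zero the whole header (or every pad
 * field, as the patch does) before filling it, so no stack garbage
 * escapes in the unused bytes.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

struct tcamsg_like {		/* simplified stand-in for struct tcamsg */
	unsigned char  tca_family;
	unsigned char  tca__pad1;
	unsigned short tca__pad2;
};

static void fill_header(struct tcamsg_like *t)
{
	memset(t, 0, sizeof(*t));	/* no uninitialized bytes escape */
	t->tca_family = AF_UNSPEC;
}

int main(void)
{
	struct tcamsg_like t;

	fill_header(&t);
	printf("pad1=%u pad2=%u\n", t.tca__pad1, t.tca__pad2);
	return 0;
}
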
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1616bf5c9627..3b5714ef4d1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -331,6 +331,8 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
332 tcm = NLMSG_DATA(nlh); 332 tcm = NLMSG_DATA(nlh);
333 tcm->tcm_family = AF_UNSPEC; 333 tcm->tcm_family = AF_UNSPEC;
334 tcm->tcm__pad1 = 0;
 335 tcm->tcm__pad2 = 0;
334 tcm->tcm_ifindex = tp->q->dev->ifindex; 336 tcm->tcm_ifindex = tp->q->dev->ifindex;
335 tcm->tcm_parent = tp->classid; 337 tcm->tcm_parent = tp->classid;
336 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); 338 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 232fb9196810..006168d69376 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
618 pinfo.protocol = s->protocol; 618 pinfo.protocol = s->protocol;
619 pinfo.tunnelid = s->tunnelid; 619 pinfo.tunnelid = s->tunnelid;
620 pinfo.tunnelhdr = f->tunnelhdr; 620 pinfo.tunnelhdr = f->tunnelhdr;
621 pinfo.pad = 0;
621 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); 622 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
622 if (f->res.classid) 623 if (f->res.classid)
623 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); 624 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c2a35a..53d98f8d3d80 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol)
205 dst->value = skb->protocol; 205 dst->value = skb->protocol;
206} 206}
207 207
208META_COLLECTOR(int_security)
209{
210 dst->value = skb->security;
211}
212
213META_COLLECTOR(int_pkttype) 208META_COLLECTOR(int_pkttype)
214{ 209{
215 dst->value = skb->pkt_type; 210 dst->value = skb->pkt_type;
@@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
524 [META_ID(REALDEV)] = META_FUNC(int_realdev), 519 [META_ID(REALDEV)] = META_FUNC(int_realdev),
525 [META_ID(PRIORITY)] = META_FUNC(int_priority), 520 [META_ID(PRIORITY)] = META_FUNC(int_priority),
526 [META_ID(PROTOCOL)] = META_FUNC(int_protocol), 521 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
527 [META_ID(SECURITY)] = META_FUNC(int_security),
528 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), 522 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
529 [META_ID(PKTLEN)] = META_FUNC(int_pktlen), 523 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
530 [META_ID(DATALEN)] = META_FUNC(int_datalen), 524 [META_ID(DATALEN)] = META_FUNC(int_datalen),
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 97c1c75d5c78..b9a069af4a02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{ 399{
400 int err; 400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1]; 401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch; 402 struct Qdisc *sch;
404 struct Qdisc_ops *ops; 403 struct Qdisc_ops *ops;
405 int size;
406 404
407 ops = qdisc_lookup_ops(kind); 405 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD 406#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
437 if (ops == NULL) 435 if (ops == NULL)
438 goto err_out; 436 goto err_out;
439 437
440 /* ensure that the Qdisc and the private data are 32-byte aligned */ 438 sch = qdisc_alloc(dev, ops);
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 439 if (IS_ERR(sch)) {
442 size += ops->priv_size + QDISC_ALIGN_CONST; 440 err = PTR_ERR(sch);
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2; 441 goto err_out2;
448 memset(p, 0, size); 442 }
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455 443
456 if (handle == TC_H_INGRESS) 444 if (handle == TC_H_INGRESS) {
457 sch->flags |= TCQ_F_INGRESS; 445 sch->flags |= TCQ_F_INGRESS;
458 446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
459 sch->ops = ops; 447 } else if (handle == 0) {
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev); 448 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM; 449 err = -ENOMEM;
469 if (handle == 0) 450 if (handle == 0)
470 goto err_out3; 451 goto err_out3;
471 } 452 }
472 453
473 if (handle == TC_H_INGRESS) 454 sch->handle = handle;
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477 455
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
463 /*
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
467 */
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
471 }
472 }
473#endif
479 qdisc_lock_tree(dev); 474 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list); 475 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev); 476 qdisc_unlock_tree(dev);
482 477
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch; 478 return sch;
489 } 479 }
490err_out3: 480err_out3:
491 dev_put(dev); 481 dev_put(dev);
482 kfree((char *) sch - sch->padded);
492err_out2: 483err_out2:
493 module_put(ops->owner); 484 module_put(ops->owner);
494err_out: 485err_out:
495 *errp = err; 486 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL; 487 return NULL;
499} 488}
500 489
@@ -770,6 +759,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
770 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 759 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
771 tcm = NLMSG_DATA(nlh); 760 tcm = NLMSG_DATA(nlh);
772 tcm->tcm_family = AF_UNSPEC; 761 tcm->tcm_family = AF_UNSPEC;
762 tcm->tcm__pad1 = 0;
763 tcm->tcm__pad2 = 0;
773 tcm->tcm_ifindex = q->dev->ifindex; 764 tcm->tcm_ifindex = q->dev->ifindex;
774 tcm->tcm_parent = clid; 765 tcm->tcm_parent = clid;
775 tcm->tcm_handle = q->handle; 766 tcm->tcm_handle = q->handle;
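
With the allocation moved into qdisc_alloc(), qdisc_create() now tells success from failure by the pointer value itself rather than by NULL plus a separately tracked buffer. A small user-space mock-up of the ERR_PTR()/IS_ERR()/PTR_ERR() convention the new code relies on (the helpers imitate include/linux/err.h; alloc_qdisc() and its fail flag are invented for the example):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ERRNO 4095

    /* User-space imitations of the kernel helpers in include/linux/err.h. */
    static void *ERR_PTR(long error)     { return (void *)error; }
    static long  PTR_ERR(const void *p)  { return (long)p; }
    static int   IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    struct qdisc { int dummy; };

    /* Invented allocator following the same convention as qdisc_alloc(). */
    static struct qdisc *alloc_qdisc(int fail)
    {
        if (fail)
            return ERR_PTR(-ENOBUFS);
        return calloc(1, sizeof(struct qdisc));
    }

    int main(void)
    {
        struct qdisc *q = alloc_qdisc(1);

        if (IS_ERR(q))  /* the errno travels inside the pointer */
            printf("allocation failed: %ld\n", PTR_ERR(q));
        else
            free(q);
        return 0;
    }

The other visible change in this hunk is ordering: the rate estimator is created before the qdisc is linked into dev->qdisc_list, so a gen_new_estimator() failure can simply destroy the never-activated qdisc instead of leaving a half-registered one behind.
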
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000000..81f0b8346d17
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
1/*
2 * net/sched/sch_blackhole.c Black hole queue
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * Note: Quantum tunneling is not supported.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/netdevice.h>
19#include <linux/skbuff.h>
20#include <net/pkt_sched.h>
21
22static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
23{
24 qdisc_drop(skb, sch);
25 return NET_XMIT_SUCCESS;
26}
27
28static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
29{
30 return NULL;
31}
32
33static struct Qdisc_ops blackhole_qdisc_ops = {
34 .id = "blackhole",
35 .priv_size = 0,
36 .enqueue = blackhole_enqueue,
37 .dequeue = blackhole_dequeue,
38 .owner = THIS_MODULE,
39};
40
41static int __init blackhole_module_init(void)
42{
43 return register_qdisc(&blackhole_qdisc_ops);
44}
45
46static void __exit blackhole_module_exit(void)
47{
48 unregister_qdisc(&blackhole_qdisc_ops);
49}
50
51module_init(blackhole_module_init)
52module_exit(blackhole_module_exit)
53
54MODULE_LICENSE("GPL");
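
sch_blackhole is deliberately trivial: enqueue discards every packet through qdisc_drop(), so the qdisc's drop counter still ticks, and dequeue never produces anything. Once the module is built it is attached like any other root qdisc, e.g. tc qdisc add dev eth0 root blackhole with a sufficiently recent iproute2 (the interface name is only an example).
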
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d43e3b8cbf6a..09453f997d8c 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1528 1528
1529 opt.strategy = cl->ovl_strategy; 1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1; 1530 opt.priority2 = cl->priority2+1;
1531 opt.pad = 0;
1531 opt.penalty = (cl->penalty*1000)/HZ; 1532 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1533 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len; 1534 return skb->len;
@@ -1563,6 +1564,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1563 1564
1564 if (cl->police) { 1565 if (cl->police) {
1565 opt.police = cl->police; 1566 opt.police = cl->police;
1567 opt.__res1 = 0;
1568 opt.__res2 = 0;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1569 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 } 1570 }
1568 return skb->len; 1571 return skb->len;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7683b34dc6a9..73e218e646ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
395 .owner = THIS_MODULE, 395 .owner = THIS_MODULE,
396}; 396};
397 397
398struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) 398struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
399{ 399{
400 void *p; 400 void *p;
401 struct Qdisc *sch; 401 struct Qdisc *sch;
402 int size; 402 unsigned int size;
403 int err = -ENOBUFS;
403 404
404 /* ensure that the Qdisc and the private data are 32-byte aligned */ 405 /* ensure that the Qdisc and the private data are 32-byte aligned */
405 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 406 size = QDISC_ALIGN(sizeof(*sch));
406 size += ops->priv_size + QDISC_ALIGN_CONST; 407 size += ops->priv_size + (QDISC_ALIGNTO - 1);
407 408
408 p = kmalloc(size, GFP_KERNEL); 409 p = kmalloc(size, GFP_KERNEL);
409 if (!p) 410 if (!p)
410 return NULL; 411 goto errout;
411 memset(p, 0, size); 412 memset(p, 0, size);
412 413 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
413 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) 414 sch->padded = (char *) sch - (char *) p;
414 & ~QDISC_ALIGN_CONST);
415 sch->padded = (char *)sch - (char *)p;
416 415
417 INIT_LIST_HEAD(&sch->list); 416 INIT_LIST_HEAD(&sch->list);
418 skb_queue_head_init(&sch->q); 417 skb_queue_head_init(&sch->q);
@@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
423 dev_hold(dev); 422 dev_hold(dev);
424 sch->stats_lock = &dev->queue_lock; 423 sch->stats_lock = &dev->queue_lock;
425 atomic_set(&sch->refcnt, 1); 424 atomic_set(&sch->refcnt, 1);
425
426 return sch;
427errout:
428	return ERR_PTR(err);
429}
430
431struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
432{
433 struct Qdisc *sch;
434
435 sch = qdisc_alloc(dev, ops);
436 if (IS_ERR(sch))
437 goto errout;
438
426 if (!ops->init || ops->init(sch, NULL) == 0) 439 if (!ops->init || ops->init(sch, NULL) == 0)
427 return sch; 440 return sch;
428 441
429 dev_put(dev); 442errout:
430 kfree(p);
431 return NULL; 443 return NULL;
432} 444}
433 445
@@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
591EXPORT_SYMBOL(noop_qdisc); 603EXPORT_SYMBOL(noop_qdisc);
592EXPORT_SYMBOL(noop_qdisc_ops); 604EXPORT_SYMBOL(noop_qdisc_ops);
593EXPORT_SYMBOL(qdisc_create_dflt); 605EXPORT_SYMBOL(qdisc_create_dflt);
606EXPORT_SYMBOL(qdisc_alloc);
594EXPORT_SYMBOL(qdisc_destroy); 607EXPORT_SYMBOL(qdisc_destroy);
595EXPORT_SYMBOL(qdisc_reset); 608EXPORT_SYMBOL(qdisc_reset);
596EXPORT_SYMBOL(qdisc_restart); 609EXPORT_SYMBOL(qdisc_restart);
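
qdisc_alloc() keeps the old guarantee that both the Qdisc and its private area start on a 32-byte boundary, but expresses it through QDISC_ALIGN()/QDISC_ALIGNTO instead of the open-coded QDISC_ALIGN_CONST arithmetic. A stand-alone sketch of the round-up idiom, assuming the macro mirrors the kernel definition (the struct and priv sizes are made-up numbers):

    #include <stdio.h>

    #define QDISC_ALIGNTO     32
    #define QDISC_ALIGN(len)  (((len) + QDISC_ALIGNTO - 1) & ~(QDISC_ALIGNTO - 1))

    int main(void)
    {
        unsigned int struct_size = 180;  /* stand-in for sizeof(struct Qdisc) */
        unsigned int priv_size   = 24;   /* stand-in for ops->priv_size */

        /* Room for the rounded-up struct, the private data, and the worst
         * case misalignment of the pointer the allocator hands back. */
        unsigned int size = QDISC_ALIGN(struct_size)
                            + priv_size + (QDISC_ALIGNTO - 1);

        printf("aligned struct: %u bytes, total allocation: %u bytes\n",
               QDISC_ALIGN(struct_size), size);
        return 0;
    }
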
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 2ec0320fac3b..c44bf4165c6e 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -102,9 +102,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
102 /* Set up the base timeout information. */ 102 /* Set up the base timeout information. */
103 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; 103 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
104 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = 104 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
105 SCTP_DEFAULT_TIMEOUT_T1_COOKIE; 105 msecs_to_jiffies(sp->rtoinfo.srto_initial);
106 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = 106 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
107 SCTP_DEFAULT_TIMEOUT_T1_INIT; 107 msecs_to_jiffies(sp->rtoinfo.srto_initial);
108 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = 108 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
109 msecs_to_jiffies(sp->rtoinfo.srto_initial); 109 msecs_to_jiffies(sp->rtoinfo.srto_initial);
110 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; 110 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
@@ -117,12 +117,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
117 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] 117 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
118 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); 118 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
119 119
120 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 120 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
121 SCTP_DEFAULT_TIMEOUT_HEARTBEAT; 121 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 122 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
123 SCTP_DEFAULT_TIMEOUT_SACK;
124 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
125 sp->autoclose * HZ;
126 123
127 /* Use SCTP specific send buffer space queues. */ 124 /* Use SCTP specific send buffer space queues. */
128 ep->sndbuf_policy = sctp_sndbuf_policy; 125 ep->sndbuf_policy = sctp_sndbuf_policy;
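
The endpoint now derives its T1-INIT and T1-COOKIE timeouts from the per-socket srto_initial value, which is kept in milliseconds, so the conversion to jiffies happens once at endpoint creation instead of relying on compile-time SCTP_DEFAULT_TIMEOUT_* constants. A user-space illustration of that conversion (HZ=250 and the 3000 ms figure are assumed example values, not taken from the patch):

    #include <stdio.h>

    /* Illustration only: the real msecs_to_jiffies() is a kernel helper whose
     * result depends on the compile-time HZ; 250 is just an assumed value. */
    #define HZ 250

    static unsigned long msecs_to_jiffies_demo(unsigned int ms)
    {
        return (ms * (unsigned long)HZ + 999) / 1000;  /* round up */
    }

    int main(void)
    {
        unsigned int srto_initial = 3000;  /* ms; RFC 2960 suggests 3 s */

        printf("T1-INIT/T1-COOKIE: %lu jiffies at HZ=%d\n",
               msecs_to_jiffies_demo(srto_initial), HZ);
        return 0;
    }
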
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5135e1a25d25..e7f37faba7c0 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1050,7 +1050,10 @@ SCTP_STATIC __init int sctp_init(void)
1050 sctp_sndbuf_policy = 0; 1050 sctp_sndbuf_policy = 0;
1051 1051
1052 /* HB.interval - 30 seconds */ 1052 /* HB.interval - 30 seconds */
1053 sctp_hb_interval = 30 * HZ; 1053 sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
1054
1055 /* delayed SACK timeout */
1056 sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK;
1054 1057
1055 /* Implementation specific variables. */ 1058 /* Implementation specific variables. */
1056 1059
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 7fc31849312b..dc4893474f18 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -47,6 +47,8 @@
47static ctl_handler sctp_sysctl_jiffies_ms; 47static ctl_handler sctp_sysctl_jiffies_ms;
48static long rto_timer_min = 1; 48static long rto_timer_min = 1;
49static long rto_timer_max = 86400000; /* One day */ 49static long rto_timer_max = 86400000; /* One day */
50static long sack_timer_min = 1;
51static long sack_timer_max = 500;
50 52
51static ctl_table sctp_table[] = { 53static ctl_table sctp_table[] = {
52 { 54 {
@@ -187,6 +189,17 @@ static ctl_table sctp_table[] = {
187 .mode = 0644, 189 .mode = 0644,
188 .proc_handler = &proc_dointvec 190 .proc_handler = &proc_dointvec
189 }, 191 },
192 {
193 .ctl_name = NET_SCTP_SACK_TIMEOUT,
194 .procname = "sack_timeout",
195 .data = &sctp_sack_timeout,
196 .maxlen = sizeof(long),
197 .mode = 0644,
198 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
199 .strategy = &sctp_sysctl_jiffies_ms,
200 .extra1 = &sack_timer_min,
201 .extra2 = &sack_timer_max,
202 },
190 { .ctl_name = 0 } 203 { .ctl_name = 0 }
191}; 204};
192 205
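
The new table entry makes the delayed-SACK timer tunable at run time, clamped to the 1-500 ms window given by sack_timer_min/sack_timer_max and exposed in milliseconds through the jiffies-ms strategy handler. A small sketch that reads the value back, assuming the usual /proc/sys mount point and a kernel carrying this change (the exact path is inferred from the net.sctp table, not spelled out in the hunk):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/sctp/sack_timeout", "r");
        long ms;

        if (!f) {
            perror("sack_timeout");
            return 1;
        }
        if (fscanf(f, "%ld", &ms) == 1)
            printf("delayed SACK timeout: %ld ms\n", ms);
        fclose(f);
        return 0;
    }
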
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 0ec0fde6e6c5..a63b69179607 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -103,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
103 103
104 /* Set up the heartbeat timer. */ 104 /* Set up the heartbeat timer. */
105 init_timer(&peer->hb_timer); 105 init_timer(&peer->hb_timer);
106 peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
107 peer->hb_timer.function = sctp_generate_heartbeat_event; 106 peer->hb_timer.function = sctp_generate_heartbeat_event;
108 peer->hb_timer.data = (unsigned long)peer; 107 peer->hb_timer.data = (unsigned long)peer;
109 108