-rw-r--r--  Documentation/networking/fib_trie.txt | 145
-rw-r--r--  drivers/net/shaper.c | 42
-rw-r--r--  drivers/net/skge.h | 1
-rw-r--r--  drivers/net/tg3.c | 69
-rw-r--r--  drivers/net/tg3.h | 10
-rw-r--r--  include/linux/if_shaper.h | 2
-rw-r--r--  include/linux/skbuff.h | 19
-rw-r--r--  include/linux/tc_ematch/tc_em_meta.h | 2
-rw-r--r--  include/linux/tcp.h | 2
-rw-r--r--  include/net/pkt_sched.h | 17
-rw-r--r--  include/net/sch_generic.h | 13
-rw-r--r--  include/net/slhc_vj.h | 21
-rw-r--r--  include/net/sock.h | 7
-rw-r--r--  include/net/tcp.h | 156
-rw-r--r--  net/core/dev.c | 5
-rw-r--r--  net/core/filter.c | 104
-rw-r--r--  net/core/skbuff.c | 2
-rw-r--r--  net/decnet/dn_fib.c | 3
-rw-r--r--  net/ipv4/af_inet.c | 11
-rw-r--r--  net/ipv4/fib_trie.c | 202
-rw-r--r--  net/ipv4/ip_output.c | 16
-rw-r--r--  net/ipv4/route.c | 126
-rw-r--r--  net/ipv4/tcp.c | 44
-rw-r--r--  net/ipv4/tcp_input.c | 76
-rw-r--r--  net/ipv4/tcp_ipv4.c | 2
-rw-r--r--  net/ipv4/tcp_output.c | 544
-rw-r--r--  net/ipv6/af_inet6.c | 4
-rw-r--r--  net/ipv6/ip6_output.c | 1
-rw-r--r--  net/ipv6/tcp_ipv6.c | 2
-rw-r--r--  net/sched/Makefile | 2
-rw-r--r--  net/sched/em_meta.c | 6
-rw-r--r--  net/sched/sch_api.c | 63
-rw-r--r--  net/sched/sch_blackhole.c | 54
-rw-r--r--  net/sched/sch_generic.c | 35
34 files changed, 1191 insertions, 617 deletions
diff --git a/Documentation/networking/fib_trie.txt b/Documentation/networking/fib_trie.txt
new file mode 100644
index 000000000000..f50d0c673c57
--- /dev/null
+++ b/Documentation/networking/fib_trie.txt
@@ -0,0 +1,145 @@
1 LC-trie implementation notes.
2
3Node types
4----------
5leaf
6 An end node with data. This has a copy of the relevant key, along
7 with an 'hlist' of routing table entries sorted by prefix length.
8 See struct leaf and struct leaf_info. (Both node shapes are sketched below.)
9
10trie node or tnode
11 An internal node, holding an array of child (leaf or tnode) pointers,
12 indexed through a subset of the key. See Level Compression.
13
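To make the two node types concrete, here is a rough, illustrative sketch of
their shapes. This is an assumption based on the descriptions in this file,
not a verbatim copy of the definitions in net/ipv4/fib_trie.c (which carry
extra bookkeeping such as parent handling and bitfield packing):

#include <linux/list.h>                 /* struct hlist_head */

typedef unsigned int t_key;             /* 32-bit key for IPv4 */
struct node;                            /* common "leaf or tnode" handle */

struct leaf {
        t_key key;                      /* full key, kept for verification */
        struct hlist_head list;         /* leaf_info entries, sorted by prefix length */
};

struct tnode {
        t_key key;
        unsigned short pos;             /* first key bit used for indexing */
        unsigned short bits;            /* width of the indexing segment */
        unsigned short full_children;   /* children that are not path compressed */
        unsigned short empty_children;  /* NULL slots in child[] */
        struct node *child[0];          /* 1 << bits child pointers */
};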
14A few concepts explained
15------------------------
16Bits (tnode)
17 The number of bits in the key segment used for indexing into the
18 child array - the "child index" (see the sketch at the end of this section). See Level Compression.
19
20Pos (tnode)
21 The position (in the key) of the key segment used for indexing into
22 the child array. See Path Compression.
23
24Path Compression / skipped bits
25 Any given tnode is linked to from the child array of its parent, using
26 a segment of the key specified by the parent's "pos" and "bits".
27 In certain cases, this tnode's own "pos" will not be immediately
28 adjacent to the parent (pos+bits), but there will be some bits
29 in the key skipped over because they represent a single path with no
30 deviations. These "skipped bits" constitute Path Compression.
31 Note that the search algorithm will simply skip over these bits when
32 searching, making it necessary to save the keys in the leaves to
33 verify that they actually do match the key we are searching for.
34
35Level Compression / child arrays
36 the trie is kept level balanced by moving, under certain conditions, the
37 children of a full child (see "full_children") up one level, so that
38 instead of a pure binary tree, each internal node ("tnode") may
39 contain an arbitrarily large array of links to several children.
40 Conversely, a tnode with a mostly empty child array (see empty_children)
41 may be "halved", having some of its children moved downwards one level,
42 in order to avoid ever-increasing child arrays.
43
44empty_children
45 the number of positions in the child array of a given tnode that are
46 NULL.
47
48full_children
49 the number of children of a given tnode that aren't path compressed.
50 (in other words, they aren't NULL or leaves and their "pos" is equal
51 to this tnode's "pos"+"bits").
52
53 (The word "full" here is used more in the sense of "complete" than
54 as the opposite of "empty", which might be a tad confusing.)
55
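The "child index" used throughout this section is simply the "bits"-wide
segment of the key starting at bit "pos", with keys treated as 32-bit values
stored most-significant-bit first (t_key as in the sketch above). The in-tree
helper for this is tkey_extract_bits(); the function below is only an
illustration of the idea, not a copy of it:

#define KEYLENGTH 32                    /* bits in an IPv4 key */

/* Return the "bits"-wide segment of "key" starting at bit "pos",
 * counted from the most significant bit.  Assumes 0 < bits <= KEYLENGTH. */
static inline t_key child_index(t_key key, int pos, int bits)
{
        if (pos >= KEYLENGTH)
                return 0;
        return (key << pos) >> (KEYLENGTH - bits);
}

With this in hand, a tnode's child for a given key is
child[child_index(key, tn->pos, tn->bits)], and the "skipped bits" of Path
Compression are exactly the key bits between a parent's pos+bits and a
child's pos, which the descent never examines.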
56Comments
57---------
58
59We have tried to keep the structure of the code as close to fib_hash as
60possible to allow verification and help in reviewing.
61
62fib_find_node()
63 A good start for understanding this code. This function implements a
64 straightforward trie lookup (sketched at the end of this section).
65
66fib_insert_node()
67 Inserts a new leaf node in the trie. This is a bit more complicated than
68 fib_find_node(). Inserting a new node means we might have to run the
69 level compression algorithm on part of the trie.
70
71trie_leaf_remove()
72 Looks up a key, deletes it and runs the level compression algorithm.
73
74trie_rebalance()
75 The key function for the dynamic trie. After any change in the trie,
76 it is run to optimize and reorganize. It will walk the trie upwards
77 towards the root from a given tnode, doing a resize() at each step
78 to implement level compression.
79
80resize()
81 Analyzes a tnode and optimizes the child array size by either inflating
82 or shrinking it repeatedly until it fulfills the criteria for optimal
83 level compression. This part follows the original paper pretty closely
84 and there may be some room for experimentation here.
85
86inflate()
87 Doubles the size of the child array within a tnode. Used by resize().
88
89halve()
90 Halves the size of the child array within a tnode - the inverse of
91 inflate(). Used by resize().
92
93fn_trie_insert(), fn_trie_delete(), fn_trie_select_default()
94 The route manipulation functions. Should conform pretty closely to the
95 corresponding functions in fib_hash.
96
97fn_trie_flush()
98 This walks the full trie (using nextleaf()) and searches for empty
99 leaves which have to be removed.
100
101fn_trie_dump()
102 Dumps the routing table ordered by prefix length. This is somewhat
103 slower than the corresponding fib_hash function, as we have to walk the
104 entire trie for each prefix length. In comparison, fib_hash is organized
105 as one "zone"/hash per prefix length.
106
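As referenced under fib_find_node() above, the exact-match descent it performs
can be pictured roughly as follows. This is only a sketch: locking, the real
accessor helpers and struct trie are omitted, IS_TNODE()/IS_LEAF() stand in
for the in-tree node-type tests, and child_index() is the illustrative helper
from the earlier sketch:

/* Illustrative exact-match descent; not the kernel function itself.
 * "root" is the trie's root node pointer. */
static struct leaf *trie_descend(struct node *root, t_key key)
{
        struct node *n = root;

        while (n != NULL && IS_TNODE(n)) {
                struct tnode *tn = (struct tnode *) n;

                n = tn->child[child_index(key, tn->pos, tn->bits)];
        }

        /* Path Compression means skipped bits were never examined, so the
         * stored key must still be compared against the search key. */
        if (n != NULL && IS_LEAF(n) && ((struct leaf *) n)->key == key)
                return (struct leaf *) n;

        return NULL;
}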
107Locking
108-------
109
110fib_lock is used as an RW-lock in the same way as in fib_hash.
111However, the functions are somewhat separated to allow for other possible
112locking scenarios. It might conceivably be possible to run trie_rebalance via RCU
113to avoid read_lock in the fn_trie_lookup() function.
114
115Main lookup mechanism
116---------------------
117fn_trie_lookup() is the main lookup function.
118
119The lookup is in its simplest form just like fib_find_node(). We descend the
120trie, key segment by key segment, until we find a leaf. check_leaf() then
121performs the fib_semantic_match check on the leaf's sorted prefix hlist.
122
123If we find a match, we are done.
124
125If we don't find a match, we enter prefix matching mode. The prefix length,
126starting out equal to the key length, is reduced one step at a time,
127and we backtrack upwards through the trie trying to find the longest matching
128prefix. The goal is always to reach a leaf and get a positive result from the
129fib_semantic_match mechanism.
130
131Inside each tnode, the search for longest matching prefix consists of searching
132through the child array, chopping off (zeroing) the least significant "1" of
133the child index until we find a match or the child index consists of nothing but
134zeros.
135
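The "chop off the least significant 1" step above is the classic
clear-lowest-set-bit operation, cindex & (cindex - 1). A minimal illustration
(the real loop in fn_trie_lookup() tracks more state, such as the number of
chopped-off key bits and the parent chain, so treat this purely as a sketch
of the idea):

/* Zero the least significant set bit of a child index.  Repeating this
 * until the index becomes zero tries progressively "coarser" candidate
 * children, which is the in-tnode part of the backtracking described here. */
static inline unsigned int chop_lowest_set_bit(unsigned int cindex)
{
        return cindex & (cindex - 1);
}

/* Usage pattern (illustrative):
 *
 *      while (cindex != 0) {
 *              ... try the subtrie at child[cindex] ...
 *              cindex = chop_lowest_set_bit(cindex);
 *      }
 */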
136At this point we backtrack (t->stats.backtrack++) up the trie, continuing to
137chop off part of the key in order to find the longest matching prefix.
138
139During this backtracking we will repeatedly descend subtries to look for a match, and there
140are some optimizations available that can provide us with "shortcuts" to avoid
141descending into dead ends. Look for "HL_OPTIMIZE" sections in the code.
142
143To alleviate any doubts about the correctness of the route selection process,
144a new netlink operation has been added. Look for NETLINK_FIB_LOOKUP, which
145gives userland access to fib_lookup().
diff --git a/drivers/net/shaper.c b/drivers/net/shaper.c
index 20edeb345792..3ad0b6751f6f 100644
--- a/drivers/net/shaper.c
+++ b/drivers/net/shaper.c
@@ -135,10 +135,8 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev)
135{ 135{
136 struct shaper *shaper = dev->priv; 136 struct shaper *shaper = dev->priv;
137 struct sk_buff *ptr; 137 struct sk_buff *ptr;
138 138
139 if (down_trylock(&shaper->sem)) 139 spin_lock(&shaper->lock);
140 return -1;
141
142 ptr=shaper->sendq.prev; 140 ptr=shaper->sendq.prev;
143 141
144 /* 142 /*
@@ -232,7 +230,7 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev)
232 shaper->stats.collisions++; 230 shaper->stats.collisions++;
233 } 231 }
234 shaper_kick(shaper); 232 shaper_kick(shaper);
235 up(&shaper->sem); 233 spin_unlock(&shaper->lock);
236 return 0; 234 return 0;
237} 235}
238 236
@@ -271,11 +269,9 @@ static void shaper_timer(unsigned long data)
271{ 269{
272 struct shaper *shaper = (struct shaper *)data; 270 struct shaper *shaper = (struct shaper *)data;
273 271
274 if (!down_trylock(&shaper->sem)) { 272 spin_lock(&shaper->lock);
275 shaper_kick(shaper); 273 shaper_kick(shaper);
276 up(&shaper->sem); 274 spin_unlock(&shaper->lock);
277 } else
278 mod_timer(&shaper->timer, jiffies);
279} 275}
280 276
281/* 277/*
@@ -332,21 +328,6 @@ static void shaper_kick(struct shaper *shaper)
332 328
333 329
334/* 330/*
335 * Flush the shaper queues on a closedown
336 */
337
338static void shaper_flush(struct shaper *shaper)
339{
340 struct sk_buff *skb;
341
342 down(&shaper->sem);
343 while((skb=skb_dequeue(&shaper->sendq))!=NULL)
344 dev_kfree_skb(skb);
345 shaper_kick(shaper);
346 up(&shaper->sem);
347}
348
349/*
350 * Bring the interface up. We just disallow this until a 331 * Bring the interface up. We just disallow this until a
351 * bind. 332 * bind.
352 */ 333 */
@@ -375,7 +356,15 @@ static int shaper_open(struct net_device *dev)
375static int shaper_close(struct net_device *dev) 356static int shaper_close(struct net_device *dev)
376{ 357{
377 struct shaper *shaper=dev->priv; 358 struct shaper *shaper=dev->priv;
378 shaper_flush(shaper); 359 struct sk_buff *skb;
360
361 while ((skb = skb_dequeue(&shaper->sendq)) != NULL)
362 dev_kfree_skb(skb);
363
364 spin_lock_bh(&shaper->lock);
365 shaper_kick(shaper);
366 spin_unlock_bh(&shaper->lock);
367
379 del_timer_sync(&shaper->timer); 368 del_timer_sync(&shaper->timer);
380 return 0; 369 return 0;
381} 370}
@@ -576,6 +565,7 @@ static void shaper_init_priv(struct net_device *dev)
576 init_timer(&sh->timer); 565 init_timer(&sh->timer);
577 sh->timer.function=shaper_timer; 566 sh->timer.function=shaper_timer;
578 sh->timer.data=(unsigned long)sh; 567 sh->timer.data=(unsigned long)sh;
568 spin_lock_init(&sh->lock);
579} 569}
580 570
581/* 571/*
diff --git a/drivers/net/skge.h b/drivers/net/skge.h
index 14d0cc01fb9a..fced3d2bc072 100644
--- a/drivers/net/skge.h
+++ b/drivers/net/skge.h
@@ -7,6 +7,7 @@
7/* PCI config registers */ 7/* PCI config registers */
8#define PCI_DEV_REG1 0x40 8#define PCI_DEV_REG1 0x40
9#define PCI_DEV_REG2 0x44 9#define PCI_DEV_REG2 0x44
10#define PCI_REV_DESC 0x4
10 11
11#define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \ 12#define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \
12 PCI_STATUS_SIG_SYSTEM_ERROR | \ 13 PCI_STATUS_SIG_SYSTEM_ERROR | \
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 7e371b1209a1..54640686e983 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -66,8 +66,8 @@
66 66
67#define DRV_MODULE_NAME "tg3" 67#define DRV_MODULE_NAME "tg3"
68#define PFX DRV_MODULE_NAME ": " 68#define PFX DRV_MODULE_NAME ": "
69#define DRV_MODULE_VERSION "3.32" 69#define DRV_MODULE_VERSION "3.33"
70#define DRV_MODULE_RELDATE "June 24, 2005" 70#define DRV_MODULE_RELDATE "July 5, 2005"
71 71
72#define TG3_DEF_MAC_MODE 0 72#define TG3_DEF_MAC_MODE 0
73#define TG3_DEF_RX_MODE 0 73#define TG3_DEF_RX_MODE 0
@@ -5117,7 +5117,7 @@ static void tg3_set_bdinfo(struct tg3 *tp, u32 bdinfo_addr,
5117} 5117}
5118 5118
5119static void __tg3_set_rx_mode(struct net_device *); 5119static void __tg3_set_rx_mode(struct net_device *);
5120static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec) 5120static void __tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
5121{ 5121{
5122 tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs); 5122 tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs);
5123 tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs); 5123 tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs);
@@ -5460,7 +5460,7 @@ static int tg3_reset_hw(struct tg3 *tp)
5460 udelay(10); 5460 udelay(10);
5461 } 5461 }
5462 5462
5463 tg3_set_coalesce(tp, &tp->coal); 5463 __tg3_set_coalesce(tp, &tp->coal);
5464 5464
5465 /* set status block DMA address */ 5465 /* set status block DMA address */
5466 tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH, 5466 tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
@@ -7821,6 +7821,60 @@ static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
7821 return 0; 7821 return 0;
7822} 7822}
7823 7823
7824static int tg3_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
7825{
7826 struct tg3 *tp = netdev_priv(dev);
7827 u32 max_rxcoal_tick_int = 0, max_txcoal_tick_int = 0;
7828 u32 max_stat_coal_ticks = 0, min_stat_coal_ticks = 0;
7829
7830 if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
7831 max_rxcoal_tick_int = MAX_RXCOAL_TICK_INT;
7832 max_txcoal_tick_int = MAX_TXCOAL_TICK_INT;
7833 max_stat_coal_ticks = MAX_STAT_COAL_TICKS;
7834 min_stat_coal_ticks = MIN_STAT_COAL_TICKS;
7835 }
7836
7837 if ((ec->rx_coalesce_usecs > MAX_RXCOL_TICKS) ||
7838 (ec->tx_coalesce_usecs > MAX_TXCOL_TICKS) ||
7839 (ec->rx_max_coalesced_frames > MAX_RXMAX_FRAMES) ||
7840 (ec->tx_max_coalesced_frames > MAX_TXMAX_FRAMES) ||
7841 (ec->rx_coalesce_usecs_irq > max_rxcoal_tick_int) ||
7842 (ec->tx_coalesce_usecs_irq > max_txcoal_tick_int) ||
7843 (ec->rx_max_coalesced_frames_irq > MAX_RXCOAL_MAXF_INT) ||
7844 (ec->tx_max_coalesced_frames_irq > MAX_TXCOAL_MAXF_INT) ||
7845 (ec->stats_block_coalesce_usecs > max_stat_coal_ticks) ||
7846 (ec->stats_block_coalesce_usecs < min_stat_coal_ticks))
7847 return -EINVAL;
7848
7849 /* No rx interrupts will be generated if both are zero */
7850 if ((ec->rx_coalesce_usecs == 0) &&
7851 (ec->rx_max_coalesced_frames == 0))
7852 return -EINVAL;
7853
7854 /* No tx interrupts will be generated if both are zero */
7855 if ((ec->tx_coalesce_usecs == 0) &&
7856 (ec->tx_max_coalesced_frames == 0))
7857 return -EINVAL;
7858
7859 /* Only copy relevant parameters, ignore all others. */
7860 tp->coal.rx_coalesce_usecs = ec->rx_coalesce_usecs;
7861 tp->coal.tx_coalesce_usecs = ec->tx_coalesce_usecs;
7862 tp->coal.rx_max_coalesced_frames = ec->rx_max_coalesced_frames;
7863 tp->coal.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
7864 tp->coal.rx_coalesce_usecs_irq = ec->rx_coalesce_usecs_irq;
7865 tp->coal.tx_coalesce_usecs_irq = ec->tx_coalesce_usecs_irq;
7866 tp->coal.rx_max_coalesced_frames_irq = ec->rx_max_coalesced_frames_irq;
7867 tp->coal.tx_max_coalesced_frames_irq = ec->tx_max_coalesced_frames_irq;
7868 tp->coal.stats_block_coalesce_usecs = ec->stats_block_coalesce_usecs;
7869
7870 if (netif_running(dev)) {
7871 tg3_full_lock(tp, 0);
7872 __tg3_set_coalesce(tp, &tp->coal);
7873 tg3_full_unlock(tp);
7874 }
7875 return 0;
7876}
7877
7824static struct ethtool_ops tg3_ethtool_ops = { 7878static struct ethtool_ops tg3_ethtool_ops = {
7825 .get_settings = tg3_get_settings, 7879 .get_settings = tg3_get_settings,
7826 .set_settings = tg3_set_settings, 7880 .set_settings = tg3_set_settings,
@@ -7856,6 +7910,7 @@ static struct ethtool_ops tg3_ethtool_ops = {
7856 .get_stats_count = tg3_get_stats_count, 7910 .get_stats_count = tg3_get_stats_count,
7857 .get_ethtool_stats = tg3_get_ethtool_stats, 7911 .get_ethtool_stats = tg3_get_ethtool_stats,
7858 .get_coalesce = tg3_get_coalesce, 7912 .get_coalesce = tg3_get_coalesce,
7913 .set_coalesce = tg3_set_coalesce,
7859}; 7914};
7860 7915
7861static void __devinit tg3_get_eeprom_size(struct tg3 *tp) 7916static void __devinit tg3_get_eeprom_size(struct tg3 *tp)
@@ -9800,6 +9855,12 @@ static void __devinit tg3_init_coal(struct tg3 *tp)
9800 ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS; 9855 ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS;
9801 ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS; 9856 ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS;
9802 } 9857 }
9858
9859 if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) {
9860 ec->rx_coalesce_usecs_irq = 0;
9861 ec->tx_coalesce_usecs_irq = 0;
9862 ec->stats_block_coalesce_usecs = 0;
9863 }
9803} 9864}
9804 9865
9805static int __devinit tg3_init_one(struct pci_dev *pdev, 9866static int __devinit tg3_init_one(struct pci_dev *pdev,
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 99c5f9675a56..70ad450733e6 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -879,31 +879,41 @@
879#define LOW_RXCOL_TICKS_CLRTCKS 0x00000014 879#define LOW_RXCOL_TICKS_CLRTCKS 0x00000014
880#define DEFAULT_RXCOL_TICKS 0x00000048 880#define DEFAULT_RXCOL_TICKS 0x00000048
881#define HIGH_RXCOL_TICKS 0x00000096 881#define HIGH_RXCOL_TICKS 0x00000096
882#define MAX_RXCOL_TICKS 0x000003ff
882#define HOSTCC_TXCOL_TICKS 0x00003c0c 883#define HOSTCC_TXCOL_TICKS 0x00003c0c
883#define LOW_TXCOL_TICKS 0x00000096 884#define LOW_TXCOL_TICKS 0x00000096
884#define LOW_TXCOL_TICKS_CLRTCKS 0x00000048 885#define LOW_TXCOL_TICKS_CLRTCKS 0x00000048
885#define DEFAULT_TXCOL_TICKS 0x0000012c 886#define DEFAULT_TXCOL_TICKS 0x0000012c
886#define HIGH_TXCOL_TICKS 0x00000145 887#define HIGH_TXCOL_TICKS 0x00000145
888#define MAX_TXCOL_TICKS 0x000003ff
887#define HOSTCC_RXMAX_FRAMES 0x00003c10 889#define HOSTCC_RXMAX_FRAMES 0x00003c10
888#define LOW_RXMAX_FRAMES 0x00000005 890#define LOW_RXMAX_FRAMES 0x00000005
889#define DEFAULT_RXMAX_FRAMES 0x00000008 891#define DEFAULT_RXMAX_FRAMES 0x00000008
890#define HIGH_RXMAX_FRAMES 0x00000012 892#define HIGH_RXMAX_FRAMES 0x00000012
893#define MAX_RXMAX_FRAMES 0x000000ff
891#define HOSTCC_TXMAX_FRAMES 0x00003c14 894#define HOSTCC_TXMAX_FRAMES 0x00003c14
892#define LOW_TXMAX_FRAMES 0x00000035 895#define LOW_TXMAX_FRAMES 0x00000035
893#define DEFAULT_TXMAX_FRAMES 0x0000004b 896#define DEFAULT_TXMAX_FRAMES 0x0000004b
894#define HIGH_TXMAX_FRAMES 0x00000052 897#define HIGH_TXMAX_FRAMES 0x00000052
898#define MAX_TXMAX_FRAMES 0x000000ff
895#define HOSTCC_RXCOAL_TICK_INT 0x00003c18 899#define HOSTCC_RXCOAL_TICK_INT 0x00003c18
896#define DEFAULT_RXCOAL_TICK_INT 0x00000019 900#define DEFAULT_RXCOAL_TICK_INT 0x00000019
897#define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014 901#define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
902#define MAX_RXCOAL_TICK_INT 0x000003ff
898#define HOSTCC_TXCOAL_TICK_INT 0x00003c1c 903#define HOSTCC_TXCOAL_TICK_INT 0x00003c1c
899#define DEFAULT_TXCOAL_TICK_INT 0x00000019 904#define DEFAULT_TXCOAL_TICK_INT 0x00000019
900#define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014 905#define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
906#define MAX_TXCOAL_TICK_INT 0x000003ff
901#define HOSTCC_RXCOAL_MAXF_INT 0x00003c20 907#define HOSTCC_RXCOAL_MAXF_INT 0x00003c20
902#define DEFAULT_RXCOAL_MAXF_INT 0x00000005 908#define DEFAULT_RXCOAL_MAXF_INT 0x00000005
909#define MAX_RXCOAL_MAXF_INT 0x000000ff
903#define HOSTCC_TXCOAL_MAXF_INT 0x00003c24 910#define HOSTCC_TXCOAL_MAXF_INT 0x00003c24
904#define DEFAULT_TXCOAL_MAXF_INT 0x00000005 911#define DEFAULT_TXCOAL_MAXF_INT 0x00000005
912#define MAX_TXCOAL_MAXF_INT 0x000000ff
905#define HOSTCC_STAT_COAL_TICKS 0x00003c28 913#define HOSTCC_STAT_COAL_TICKS 0x00003c28
906#define DEFAULT_STAT_COAL_TICKS 0x000f4240 914#define DEFAULT_STAT_COAL_TICKS 0x000f4240
915#define MAX_STAT_COAL_TICKS 0xd693d400
916#define MIN_STAT_COAL_TICKS 0x00000064
907/* 0x3c2c --> 0x3c30 unused */ 917/* 0x3c2c --> 0x3c30 unused */
908#define HOSTCC_STATS_BLK_HOST_ADDR 0x00003c30 /* 64-bit */ 918#define HOSTCC_STATS_BLK_HOST_ADDR 0x00003c30 /* 64-bit */
909#define HOSTCC_STATUS_BLK_HOST_ADDR 0x00003c38 /* 64-bit */ 919#define HOSTCC_STATUS_BLK_HOST_ADDR 0x00003c38 /* 64-bit */
diff --git a/include/linux/if_shaper.h b/include/linux/if_shaper.h
index 004e6f09a6e2..68c896a36a34 100644
--- a/include/linux/if_shaper.h
+++ b/include/linux/if_shaper.h
@@ -23,7 +23,7 @@ struct shaper
23 __u32 shapeclock; 23 __u32 shapeclock;
24 unsigned long recovery; /* Time we can next clock a packet out on 24 unsigned long recovery; /* Time we can next clock a packet out on
25 an empty queue */ 25 an empty queue */
26 struct semaphore sem; 26 spinlock_t lock;
27 struct net_device_stats stats; 27 struct net_device_stats stats;
28 struct net_device *dev; 28 struct net_device *dev;
29 int (*hard_start_xmit) (struct sk_buff *skb, 29 int (*hard_start_xmit) (struct sk_buff *skb,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 416a2e4024b2..14b950413495 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -183,7 +183,6 @@ struct skb_shared_info {
183 * @priority: Packet queueing priority 183 * @priority: Packet queueing priority
184 * @users: User count - see {datagram,tcp}.c 184 * @users: User count - see {datagram,tcp}.c
185 * @protocol: Packet protocol from driver 185 * @protocol: Packet protocol from driver
186 * @security: Security level of packet
187 * @truesize: Buffer size 186 * @truesize: Buffer size
188 * @head: Head of buffer 187 * @head: Head of buffer
189 * @data: Data head pointer 188 * @data: Data head pointer
@@ -249,18 +248,18 @@ struct sk_buff {
249 data_len, 248 data_len,
250 mac_len, 249 mac_len,
251 csum; 250 csum;
252 unsigned char local_df,
253 cloned:1,
254 nohdr:1,
255 pkt_type,
256 ip_summed;
257 __u32 priority; 251 __u32 priority;
258 unsigned short protocol, 252 __u8 local_df:1,
259 security; 253 cloned:1,
254 ip_summed:2,
255 nohdr:1;
256 /* 3 bits spare */
257 __u8 pkt_type;
258 __u16 protocol;
260 259
261 void (*destructor)(struct sk_buff *skb); 260 void (*destructor)(struct sk_buff *skb);
262#ifdef CONFIG_NETFILTER 261#ifdef CONFIG_NETFILTER
263 unsigned long nfmark; 262 unsigned long nfmark;
264 __u32 nfcache; 263 __u32 nfcache;
265 __u32 nfctinfo; 264 __u32 nfctinfo;
266 struct nf_conntrack *nfct; 265 struct nf_conntrack *nfct;
@@ -1211,7 +1210,7 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
1211{ 1210{
1212 int hlen = skb_headlen(skb); 1211 int hlen = skb_headlen(skb);
1213 1212
1214 if (offset + len <= hlen) 1213 if (hlen - offset >= len)
1215 return skb->data + offset; 1214 return skb->data + offset;
1216 1215
1217 if (skb_copy_bits(skb, offset, buffer, len) < 0) 1216 if (skb_copy_bits(skb, offset, buffer, len) < 0)
diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h
index a6b2cc530af5..bcb762d93123 100644
--- a/include/linux/tc_ematch/tc_em_meta.h
+++ b/include/linux/tc_ematch/tc_em_meta.h
@@ -45,7 +45,7 @@ enum
45 TCF_META_ID_REALDEV, 45 TCF_META_ID_REALDEV,
46 TCF_META_ID_PRIORITY, 46 TCF_META_ID_PRIORITY,
47 TCF_META_ID_PROTOCOL, 47 TCF_META_ID_PROTOCOL,
48 TCF_META_ID_SECURITY, 48 TCF_META_ID_SECURITY, /* obsolete */
49 TCF_META_ID_PKTTYPE, 49 TCF_META_ID_PKTTYPE,
50 TCF_META_ID_PKTLEN, 50 TCF_META_ID_PKTLEN,
51 TCF_META_ID_DATALEN, 51 TCF_META_ID_DATALEN,
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index dfd93d03f5d2..e4fd82e42104 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -286,7 +286,7 @@ struct tcp_sock {
286 __u32 max_window; /* Maximal window ever seen from peer */ 286 __u32 max_window; /* Maximal window ever seen from peer */
287 __u32 pmtu_cookie; /* Last pmtu seen by socket */ 287 __u32 pmtu_cookie; /* Last pmtu seen by socket */
288 __u32 mss_cache; /* Cached effective mss, not including SACKS */ 288 __u32 mss_cache; /* Cached effective mss, not including SACKS */
289 __u16 mss_cache_std; /* Like mss_cache, but without TSO */ 289 __u16 xmit_size_goal; /* Goal for segmenting output packets */
290 __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ 290 __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
291 __u8 ca_state; /* State of fast-retransmit machine */ 291 __u8 ca_state; /* State of fast-retransmit machine */
292 __u8 retransmits; /* Number of unrecovered RTO timeouts. */ 292 __u8 retransmits; /* Number of unrecovered RTO timeouts. */
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index fcb05a387dbe..6492e7363d84 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -13,13 +13,12 @@ struct qdisc_walker
13 13
14extern rwlock_t qdisc_tree_lock; 14extern rwlock_t qdisc_tree_lock;
15 15
16#define QDISC_ALIGN 32 16#define QDISC_ALIGNTO 32
17#define QDISC_ALIGN_CONST (QDISC_ALIGN - 1) 17#define QDISC_ALIGN(len) (((len) + QDISC_ALIGNTO-1) & ~(QDISC_ALIGNTO-1))
18 18
19static inline void *qdisc_priv(struct Qdisc *q) 19static inline void *qdisc_priv(struct Qdisc *q)
20{ 20{
21 return (char *)q + ((sizeof(struct Qdisc) + QDISC_ALIGN_CONST) 21 return (char *) q + QDISC_ALIGN(sizeof(struct Qdisc));
22 & ~QDISC_ALIGN_CONST);
23} 22}
24 23
25/* 24/*
@@ -207,8 +206,6 @@ psched_tod_diff(int delta_sec, int bound)
207 206
208#endif /* !CONFIG_NET_SCH_CLK_GETTIMEOFDAY */ 207#endif /* !CONFIG_NET_SCH_CLK_GETTIMEOFDAY */
209 208
210extern struct Qdisc noop_qdisc;
211extern struct Qdisc_ops noop_qdisc_ops;
212extern struct Qdisc_ops pfifo_qdisc_ops; 209extern struct Qdisc_ops pfifo_qdisc_ops;
213extern struct Qdisc_ops bfifo_qdisc_ops; 210extern struct Qdisc_ops bfifo_qdisc_ops;
214 211
@@ -216,14 +213,6 @@ extern int register_qdisc(struct Qdisc_ops *qops);
216extern int unregister_qdisc(struct Qdisc_ops *qops); 213extern int unregister_qdisc(struct Qdisc_ops *qops);
217extern struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); 214extern struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle);
218extern struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); 215extern struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle);
219extern void dev_init_scheduler(struct net_device *dev);
220extern void dev_shutdown(struct net_device *dev);
221extern void dev_activate(struct net_device *dev);
222extern void dev_deactivate(struct net_device *dev);
223extern void qdisc_reset(struct Qdisc *qdisc);
224extern void qdisc_destroy(struct Qdisc *qdisc);
225extern struct Qdisc * qdisc_create_dflt(struct net_device *dev,
226 struct Qdisc_ops *ops);
227extern struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, 216extern struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
228 struct rtattr *tab); 217 struct rtattr *tab);
229extern void qdisc_put_rtab(struct qdisc_rate_table *tab); 218extern void qdisc_put_rtab(struct qdisc_rate_table *tab);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 7b97405e2dbf..7b6ec9986715 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -164,6 +164,19 @@ extern void qdisc_unlock_tree(struct net_device *dev);
164#define tcf_tree_lock(tp) qdisc_lock_tree((tp)->q->dev) 164#define tcf_tree_lock(tp) qdisc_lock_tree((tp)->q->dev)
165#define tcf_tree_unlock(tp) qdisc_unlock_tree((tp)->q->dev) 165#define tcf_tree_unlock(tp) qdisc_unlock_tree((tp)->q->dev)
166 166
167extern struct Qdisc noop_qdisc;
168extern struct Qdisc_ops noop_qdisc_ops;
169
170extern void dev_init_scheduler(struct net_device *dev);
171extern void dev_shutdown(struct net_device *dev);
172extern void dev_activate(struct net_device *dev);
173extern void dev_deactivate(struct net_device *dev);
174extern void qdisc_reset(struct Qdisc *qdisc);
175extern void qdisc_destroy(struct Qdisc *qdisc);
176extern struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops);
177extern struct Qdisc *qdisc_create_dflt(struct net_device *dev,
178 struct Qdisc_ops *ops);
179
167static inline void 180static inline void
168tcf_destroy(struct tcf_proto *tp) 181tcf_destroy(struct tcf_proto *tp)
169{ 182{
diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h
index 0b2c2784f333..8716d5942b65 100644
--- a/include/net/slhc_vj.h
+++ b/include/net/slhc_vj.h
@@ -170,19 +170,14 @@ struct slcompress {
170}; 170};
171#define NULLSLCOMPR (struct slcompress *)0 171#define NULLSLCOMPR (struct slcompress *)0
172 172
173#define __ARGS(x) x
174
175/* In slhc.c: */ 173/* In slhc.c: */
176struct slcompress *slhc_init __ARGS((int rslots, int tslots)); 174struct slcompress *slhc_init(int rslots, int tslots);
177void slhc_free __ARGS((struct slcompress *comp)); 175void slhc_free(struct slcompress *comp);
178 176
179int slhc_compress __ARGS((struct slcompress *comp, unsigned char *icp, 177int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize,
180 int isize, unsigned char *ocp, unsigned char **cpp, 178 unsigned char *ocp, unsigned char **cpp, int compress_cid);
181 int compress_cid)); 179int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize);
182int slhc_uncompress __ARGS((struct slcompress *comp, unsigned char *icp, 180int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize);
183 int isize)); 181int slhc_toss(struct slcompress *comp);
184int slhc_remember __ARGS((struct slcompress *comp, unsigned char *icp,
185 int isize));
186int slhc_toss __ARGS((struct slcompress *comp));
187 182
188#endif /* _SLHC_H */ 183#endif /* _SLHC_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index e593af5b1ecc..7b76f891ae2d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1134,13 +1134,16 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
1134static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, 1134static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
1135 int size, int mem, int gfp) 1135 int size, int mem, int gfp)
1136{ 1136{
1137 struct sk_buff *skb = alloc_skb(size + sk->sk_prot->max_header, gfp); 1137 struct sk_buff *skb;
1138 int hdr_len;
1138 1139
1140 hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
1141 skb = alloc_skb(size + hdr_len, gfp);
1139 if (skb) { 1142 if (skb) {
1140 skb->truesize += mem; 1143 skb->truesize += mem;
1141 if (sk->sk_forward_alloc >= (int)skb->truesize || 1144 if (sk->sk_forward_alloc >= (int)skb->truesize ||
1142 sk_stream_mem_schedule(sk, skb->truesize, 0)) { 1145 sk_stream_mem_schedule(sk, skb->truesize, 0)) {
1143 skb_reserve(skb, sk->sk_prot->max_header); 1146 skb_reserve(skb, hdr_len);
1144 return skb; 1147 return skb;
1145 } 1148 }
1146 __kfree_skb(skb); 1149 __kfree_skb(skb);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ec9e20c27179..a166918ca56d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -721,11 +721,16 @@ static inline int tcp_ack_scheduled(struct tcp_sock *tp)
721 return tp->ack.pending&TCP_ACK_SCHED; 721 return tp->ack.pending&TCP_ACK_SCHED;
722} 722}
723 723
724static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp) 724static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts)
725{ 725{
726 if (tp->ack.quick && --tp->ack.quick == 0) { 726 if (tp->ack.quick) {
727 /* Leaving quickack mode we deflate ATO. */ 727 if (pkts >= tp->ack.quick) {
728 tp->ack.ato = TCP_ATO_MIN; 728 tp->ack.quick = 0;
729
730 /* Leaving quickack mode we deflate ATO. */
731 tp->ack.ato = TCP_ATO_MIN;
732 } else
733 tp->ack.quick -= pkts;
729 } 734 }
730} 735}
731 736
@@ -843,7 +848,9 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
843 848
844/* tcp_output.c */ 849/* tcp_output.c */
845 850
846extern int tcp_write_xmit(struct sock *, int nonagle); 851extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
852 unsigned int cur_mss, int nonagle);
853extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp);
847extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); 854extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
848extern void tcp_xmit_retransmit_queue(struct sock *); 855extern void tcp_xmit_retransmit_queue(struct sock *);
849extern void tcp_simple_retransmit(struct sock *); 856extern void tcp_simple_retransmit(struct sock *);
@@ -855,10 +862,13 @@ extern int tcp_write_wakeup(struct sock *);
855extern void tcp_send_fin(struct sock *sk); 862extern void tcp_send_fin(struct sock *sk);
856extern void tcp_send_active_reset(struct sock *sk, int priority); 863extern void tcp_send_active_reset(struct sock *sk, int priority);
857extern int tcp_send_synack(struct sock *); 864extern int tcp_send_synack(struct sock *);
858extern void tcp_push_one(struct sock *, unsigned mss_now); 865extern void tcp_push_one(struct sock *, unsigned int mss_now);
859extern void tcp_send_ack(struct sock *sk); 866extern void tcp_send_ack(struct sock *sk);
860extern void tcp_send_delayed_ack(struct sock *sk); 867extern void tcp_send_delayed_ack(struct sock *sk);
861 868
869/* tcp_input.c */
870extern void tcp_cwnd_application_limited(struct sock *sk);
871
862/* tcp_timer.c */ 872/* tcp_timer.c */
863extern void tcp_init_xmit_timers(struct sock *); 873extern void tcp_init_xmit_timers(struct sock *);
864extern void tcp_clear_xmit_timers(struct sock *); 874extern void tcp_clear_xmit_timers(struct sock *);
@@ -958,7 +968,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long
958static inline void tcp_initialize_rcv_mss(struct sock *sk) 968static inline void tcp_initialize_rcv_mss(struct sock *sk)
959{ 969{
960 struct tcp_sock *tp = tcp_sk(sk); 970 struct tcp_sock *tp = tcp_sk(sk);
961 unsigned int hint = min(tp->advmss, tp->mss_cache_std); 971 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
962 972
963 hint = min(hint, tp->rcv_wnd/2); 973 hint = min(hint, tp->rcv_wnd/2);
964 hint = min(hint, TCP_MIN_RCVMSS); 974 hint = min(hint, TCP_MIN_RCVMSS);
@@ -1225,28 +1235,6 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp)
1225 tp->left_out = tp->sacked_out + tp->lost_out; 1235 tp->left_out = tp->sacked_out + tp->lost_out;
1226} 1236}
1227 1237
1228extern void tcp_cwnd_application_limited(struct sock *sk);
1229
1230/* Congestion window validation. (RFC2861) */
1231
1232static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
1233{
1234 __u32 packets_out = tp->packets_out;
1235
1236 if (packets_out >= tp->snd_cwnd) {
1237 /* Network is feed fully. */
1238 tp->snd_cwnd_used = 0;
1239 tp->snd_cwnd_stamp = tcp_time_stamp;
1240 } else {
1241 /* Network starves. */
1242 if (tp->packets_out > tp->snd_cwnd_used)
1243 tp->snd_cwnd_used = tp->packets_out;
1244
1245 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
1246 tcp_cwnd_application_limited(sk);
1247 }
1248}
1249
1250/* Set slow start threshould and cwnd not falling to slow start */ 1238/* Set slow start threshould and cwnd not falling to slow start */
1251static inline void __tcp_enter_cwr(struct tcp_sock *tp) 1239static inline void __tcp_enter_cwr(struct tcp_sock *tp)
1252{ 1240{
@@ -1279,12 +1267,6 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
1279 return 3; 1267 return 3;
1280} 1268}
1281 1269
1282static __inline__ int tcp_minshall_check(const struct tcp_sock *tp)
1283{
1284 return after(tp->snd_sml,tp->snd_una) &&
1285 !after(tp->snd_sml, tp->snd_nxt);
1286}
1287
1288static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, 1270static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
1289 const struct sk_buff *skb) 1271 const struct sk_buff *skb)
1290{ 1272{
@@ -1292,122 +1274,18 @@ static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
1292 tp->snd_sml = TCP_SKB_CB(skb)->end_seq; 1274 tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1293} 1275}
1294 1276
1295/* Return 0, if packet can be sent now without violation Nagle's rules:
1296 1. It is full sized.
1297 2. Or it contains FIN.
1298 3. Or TCP_NODELAY was set.
1299 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1300 With Minshall's modification: all sent small packets are ACKed.
1301 */
1302
1303static __inline__ int
1304tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb,
1305 unsigned mss_now, int nonagle)
1306{
1307 return (skb->len < mss_now &&
1308 !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
1309 ((nonagle&TCP_NAGLE_CORK) ||
1310 (!nonagle &&
1311 tp->packets_out &&
1312 tcp_minshall_check(tp))));
1313}
1314
1315extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *);
1316
1317/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
1318 * should be put on the wire right now.
1319 */
1320static __inline__ int tcp_snd_test(struct sock *sk,
1321 struct sk_buff *skb,
1322 unsigned cur_mss, int nonagle)
1323{
1324 struct tcp_sock *tp = tcp_sk(sk);
1325 int pkts = tcp_skb_pcount(skb);
1326
1327 if (!pkts) {
1328 tcp_set_skb_tso_segs(sk, skb);
1329 pkts = tcp_skb_pcount(skb);
1330 }
1331
1332 /* RFC 1122 - section 4.2.3.4
1333 *
1334 * We must queue if
1335 *
1336 * a) The right edge of this frame exceeds the window
1337 * b) There are packets in flight and we have a small segment
1338 * [SWS avoidance and Nagle algorithm]
1339 * (part of SWS is done on packetization)
1340 * Minshall version sounds: there are no _small_
1341 * segments in flight. (tcp_nagle_check)
1342 * c) We have too many packets 'in flight'
1343 *
1344 * Don't use the nagle rule for urgent data (or
1345 * for the final FIN -DaveM).
1346 *
1347 * Also, Nagle rule does not apply to frames, which
1348 * sit in the middle of queue (they have no chances
1349 * to get new data) and if room at tail of skb is
1350 * not enough to save something seriously (<32 for now).
1351 */
1352
1353 /* Don't be strict about the congestion window for the
1354 * final FIN frame. -DaveM
1355 */
1356 return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode
1357 || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) &&
1358 (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) ||
1359 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
1360 !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
1361}
1362
1363static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) 1277static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
1364{ 1278{
1365 if (!tp->packets_out && !tp->pending) 1279 if (!tp->packets_out && !tp->pending)
1366 tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); 1280 tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
1367} 1281}
1368 1282
1369static __inline__ int tcp_skb_is_last(const struct sock *sk,
1370 const struct sk_buff *skb)
1371{
1372 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
1373}
1374
1375/* Push out any pending frames which were held back due to
1376 * TCP_CORK or attempt at coalescing tiny packets.
1377 * The socket must be locked by the caller.
1378 */
1379static __inline__ void __tcp_push_pending_frames(struct sock *sk,
1380 struct tcp_sock *tp,
1381 unsigned cur_mss,
1382 int nonagle)
1383{
1384 struct sk_buff *skb = sk->sk_send_head;
1385
1386 if (skb) {
1387 if (!tcp_skb_is_last(sk, skb))
1388 nonagle = TCP_NAGLE_PUSH;
1389 if (!tcp_snd_test(sk, skb, cur_mss, nonagle) ||
1390 tcp_write_xmit(sk, nonagle))
1391 tcp_check_probe_timer(sk, tp);
1392 }
1393 tcp_cwnd_validate(sk, tp);
1394}
1395
1396static __inline__ void tcp_push_pending_frames(struct sock *sk, 1283static __inline__ void tcp_push_pending_frames(struct sock *sk,
1397 struct tcp_sock *tp) 1284 struct tcp_sock *tp)
1398{ 1285{
1399 __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); 1286 __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
1400} 1287}
1401 1288
1402static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
1403{
1404 struct sk_buff *skb = sk->sk_send_head;
1405
1406 return (skb &&
1407 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
1408 tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle));
1409}
1410
1411static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) 1289static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
1412{ 1290{
1413 tp->snd_wl1 = seq; 1291 tp->snd_wl1 = seq;
diff --git a/net/core/dev.c b/net/core/dev.c
index 7016e0c36b3d..7f5f62c65115 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
2089{ 2089{
2090 unsigned short old_flags = dev->flags; 2090 unsigned short old_flags = dev->flags;
2091 2091
2092 dev->flags |= IFF_PROMISC;
2093 if ((dev->promiscuity += inc) == 0) 2092 if ((dev->promiscuity += inc) == 0)
2094 dev->flags &= ~IFF_PROMISC; 2093 dev->flags &= ~IFF_PROMISC;
2095 if (dev->flags ^ old_flags) { 2094 else
2095 dev->flags |= IFF_PROMISC;
2096 if (dev->flags != old_flags) {
2096 dev_mc_upload(dev); 2097 dev_mc_upload(dev);
2097 printk(KERN_INFO "device %s %s promiscuous mode\n", 2098 printk(KERN_INFO "device %s %s promiscuous mode\n",
2098 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 2099 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b88205ace2..cd91a24f9720 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
36#include <linux/filter.h> 36#include <linux/filter.h>
37 37
38/* No hurry in this branch */ 38/* No hurry in this branch */
39static u8 *load_pointer(struct sk_buff *skb, int k) 39static void *__load_pointer(struct sk_buff *skb, int k)
40{ 40{
41 u8 *ptr = NULL; 41 u8 *ptr = NULL;
42 42
@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
50 return NULL; 50 return NULL;
51} 51}
52 52
53static inline void *load_pointer(struct sk_buff *skb, int k,
54 unsigned int size, void *buffer)
55{
56 if (k >= 0)
57 return skb_header_pointer(skb, k, size, buffer);
58 else {
59 if (k >= SKF_AD_OFF)
60 return NULL;
61 return __load_pointer(skb, k);
62 }
63}
64
53/** 65/**
54 * sk_run_filter - run a filter on a socket 66 * sk_run_filter - run a filter on a socket
55 * @skb: buffer to run the filter on 67 * @skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
64 76
65int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) 77int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
66{ 78{
67 unsigned char *data = skb->data;
68 /* len is UNSIGNED. Byte wide insns relies only on implicit
69 type casts to prevent reading arbitrary memory locations.
70 */
71 unsigned int len = skb->len-skb->data_len;
72 struct sock_filter *fentry; /* We walk down these */ 79 struct sock_filter *fentry; /* We walk down these */
80 void *ptr;
73 u32 A = 0; /* Accumulator */ 81 u32 A = 0; /* Accumulator */
74 u32 X = 0; /* Index Register */ 82 u32 X = 0; /* Index Register */
75 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ 83 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
84 u32 tmp;
76 int k; 85 int k;
77 int pc; 86 int pc;
78 87
@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
168 case BPF_LD|BPF_W|BPF_ABS: 177 case BPF_LD|BPF_W|BPF_ABS:
169 k = fentry->k; 178 k = fentry->k;
170 load_w: 179 load_w:
171 if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { 180 ptr = load_pointer(skb, k, 4, &tmp);
172 A = ntohl(*(u32*)&data[k]); 181 if (ptr != NULL) {
182 A = ntohl(*(u32 *)ptr);
173 continue; 183 continue;
174 } 184 }
175 if (k < 0) {
176 u8 *ptr;
177
178 if (k >= SKF_AD_OFF)
179 break;
180 ptr = load_pointer(skb, k);
181 if (ptr) {
182 A = ntohl(*(u32*)ptr);
183 continue;
184 }
185 } else {
186 u32 _tmp, *p;
187 p = skb_header_pointer(skb, k, 4, &_tmp);
188 if (p != NULL) {
189 A = ntohl(*p);
190 continue;
191 }
192 }
193 return 0; 185 return 0;
194 case BPF_LD|BPF_H|BPF_ABS: 186 case BPF_LD|BPF_H|BPF_ABS:
195 k = fentry->k; 187 k = fentry->k;
196 load_h: 188 load_h:
197 if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { 189 ptr = load_pointer(skb, k, 2, &tmp);
198 A = ntohs(*(u16*)&data[k]); 190 if (ptr != NULL) {
191 A = ntohs(*(u16 *)ptr);
199 continue; 192 continue;
200 } 193 }
201 if (k < 0) {
202 u8 *ptr;
203
204 if (k >= SKF_AD_OFF)
205 break;
206 ptr = load_pointer(skb, k);
207 if (ptr) {
208 A = ntohs(*(u16*)ptr);
209 continue;
210 }
211 } else {
212 u16 _tmp, *p;
213 p = skb_header_pointer(skb, k, 2, &_tmp);
214 if (p != NULL) {
215 A = ntohs(*p);
216 continue;
217 }
218 }
219 return 0; 194 return 0;
220 case BPF_LD|BPF_B|BPF_ABS: 195 case BPF_LD|BPF_B|BPF_ABS:
221 k = fentry->k; 196 k = fentry->k;
222load_b: 197load_b:
223 if (k >= 0 && (unsigned int)k < len) { 198 ptr = load_pointer(skb, k, 1, &tmp);
224 A = data[k]; 199 if (ptr != NULL) {
200 A = *(u8 *)ptr;
225 continue; 201 continue;
226 } 202 }
227 if (k < 0) {
228 u8 *ptr;
229
230 if (k >= SKF_AD_OFF)
231 break;
232 ptr = load_pointer(skb, k);
233 if (ptr) {
234 A = *ptr;
235 continue;
236 }
237 } else {
238 u8 _tmp, *p;
239 p = skb_header_pointer(skb, k, 1, &_tmp);
240 if (p != NULL) {
241 A = *p;
242 continue;
243 }
244 }
245 return 0; 203 return 0;
246 case BPF_LD|BPF_W|BPF_LEN: 204 case BPF_LD|BPF_W|BPF_LEN:
247 A = len; 205 A = skb->len;
248 continue; 206 continue;
249 case BPF_LDX|BPF_W|BPF_LEN: 207 case BPF_LDX|BPF_W|BPF_LEN:
250 X = len; 208 X = skb->len;
251 continue; 209 continue;
252 case BPF_LD|BPF_W|BPF_IND: 210 case BPF_LD|BPF_W|BPF_IND:
253 k = X + fentry->k; 211 k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
259 k = X + fentry->k; 217 k = X + fentry->k;
260 goto load_b; 218 goto load_b;
261 case BPF_LDX|BPF_B|BPF_MSH: 219 case BPF_LDX|BPF_B|BPF_MSH:
262 if (fentry->k >= len) 220 ptr = load_pointer(skb, fentry->k, 1, &tmp);
263 return 0; 221 if (ptr != NULL) {
264 X = (data[fentry->k] & 0xf) << 2; 222 X = (*(u8 *)ptr & 0xf) << 2;
265 continue; 223 continue;
224 }
225 return 0;
266 case BPF_LD|BPF_IMM: 226 case BPF_LD|BPF_IMM:
267 A = fentry->k; 227 A = fentry->k;
268 continue; 228 continue;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bb73b2190ec7..733deee24b9f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -357,7 +357,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
357 C(ip_summed); 357 C(ip_summed);
358 C(priority); 358 C(priority);
359 C(protocol); 359 C(protocol);
360 C(security);
361 n->destructor = NULL; 360 n->destructor = NULL;
362#ifdef CONFIG_NETFILTER 361#ifdef CONFIG_NETFILTER
363 C(nfmark); 362 C(nfmark);
@@ -422,7 +421,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
422 new->pkt_type = old->pkt_type; 421 new->pkt_type = old->pkt_type;
423 new->stamp = old->stamp; 422 new->stamp = old->stamp;
424 new->destructor = NULL; 423 new->destructor = NULL;
425 new->security = old->security;
426#ifdef CONFIG_NETFILTER 424#ifdef CONFIG_NETFILTER
427 new->nfmark = old->nfmark; 425 new->nfmark = old->nfmark;
428 new->nfcache = old->nfcache; 426 new->nfcache = old->nfcache;
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25720e4..99bc061759c3 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
551 if (t < s_t) 551 if (t < s_t)
552 continue; 552 continue;
553 if (t > s_t) 553 if (t > s_t)
554 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); 554 memset(&cb->args[1], 0,
555 sizeof(cb->args) - sizeof(cb->args[0]));
555 tb = dn_fib_get_table(t, 0); 556 tb = dn_fib_get_table(t, 0);
556 if (tb == NULL) 557 if (tb == NULL)
557 continue; 558 continue;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e7977924d..ef7468376ae6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
1009static int ipv4_proc_init(void); 1009static int ipv4_proc_init(void);
1010extern void ipfrag_init(void); 1010extern void ipfrag_init(void);
1011 1011
1012/*
1013 * IP protocol layer initialiser
1014 */
1015
1016static struct packet_type ip_packet_type = {
1017 .type = __constant_htons(ETH_P_IP),
1018 .func = ip_rcv,
1019};
1020
1012static int __init inet_init(void) 1021static int __init inet_init(void)
1013{ 1022{
1014 struct sk_buff *dummy_skb; 1023 struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
1102 1111
1103 ipfrag_init(); 1112 ipfrag_init();
1104 1113
1114 dev_add_pack(&ip_packet_type);
1115
1105 rc = 0; 1116 rc = 0;
1106out: 1117out:
1107 return rc; 1118 return rc;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index b56e88edf1b3..4be234c7d8c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.324" 46#define VERSION "0.325"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -136,6 +136,7 @@ struct trie_use_stats {
136 unsigned int semantic_match_passed; 136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss; 137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit; 138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
139}; 140};
140#endif 141#endif
141 142
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn); 166static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn); 167static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn); 168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
168static struct tnode *halve(struct trie *t, struct tnode *tn); 169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
169static void tnode_free(struct tnode *tn); 170static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t); 171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); 172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -358,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
358 kfree(li); 359 kfree(li);
359} 360}
360 361
362static struct tnode *tnode_alloc(unsigned int size)
363{
364 if (size <= PAGE_SIZE) {
365 return kmalloc(size, GFP_KERNEL);
366 } else {
367 return (struct tnode *)
368 __get_free_pages(GFP_KERNEL, get_order(size));
369 }
370}
371
372static void __tnode_free(struct tnode *tn)
373{
374 unsigned int size = sizeof(struct tnode) +
375 (1<<tn->bits) * sizeof(struct node *);
376
377 if (size <= PAGE_SIZE)
378 kfree(tn);
379 else
380 free_pages((unsigned long)tn, get_order(size));
381}
382
361static struct tnode* tnode_new(t_key key, int pos, int bits) 383static struct tnode* tnode_new(t_key key, int pos, int bits)
362{ 384{
363 int nchildren = 1<<bits; 385 int nchildren = 1<<bits;
364 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 386 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
365 struct tnode *tn = kmalloc(sz, GFP_KERNEL); 387 struct tnode *tn = tnode_alloc(sz);
366 388
367 if(tn) { 389 if(tn) {
368 memset(tn, 0, sz); 390 memset(tn, 0, sz);
@@ -390,7 +412,7 @@ static void tnode_free(struct tnode *tn)
390 printk("FL %p \n", tn); 412 printk("FL %p \n", tn);
391 } 413 }
392 else if(IS_TNODE(tn)) { 414 else if(IS_TNODE(tn)) {
393 kfree(tn); 415 __tnode_free(tn);
394 if(trie_debug > 0 ) 416 if(trie_debug > 0 )
395 printk("FT %p \n", tn); 417 printk("FT %p \n", tn);
396 } 418 }
@@ -460,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
460static struct node *resize(struct trie *t, struct tnode *tn) 482static struct node *resize(struct trie *t, struct tnode *tn)
461{ 483{
462 int i; 484 int i;
485 int err = 0;
463 486
464 if (!tn) 487 if (!tn)
465 return NULL; 488 return NULL;
@@ -556,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
556 */ 579 */
557 580
558 check_tnode(tn); 581 check_tnode(tn);
559 582
583 err = 0;
560 while ((tn->full_children > 0 && 584 while ((tn->full_children > 0 &&
561 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 585 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
562 inflate_threshold * tnode_child_length(tn))) { 586 inflate_threshold * tnode_child_length(tn))) {
563 587
564 tn = inflate(t, tn); 588 tn = inflate(t, tn, &err);
589
590 if(err) {
591#ifdef CONFIG_IP_FIB_TRIE_STATS
592 t->stats.resize_node_skipped++;
593#endif
594 break;
595 }
565 } 596 }
566 597
567 check_tnode(tn); 598 check_tnode(tn);
@@ -570,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
570 * Halve as long as the number of empty children in this 601 * Halve as long as the number of empty children in this
571 * node is above threshold. 602 * node is above threshold.
572 */ 603 */
604
605 err = 0;
573 while (tn->bits > 1 && 606 while (tn->bits > 1 &&
574 100 * (tnode_child_length(tn) - tn->empty_children) < 607 100 * (tnode_child_length(tn) - tn->empty_children) <
575 halve_threshold * tnode_child_length(tn)) 608 halve_threshold * tnode_child_length(tn)) {
609
610 tn = halve(t, tn, &err);
611
612 if(err) {
613#ifdef CONFIG_IP_FIB_TRIE_STATS
614 t->stats.resize_node_skipped++;
615#endif
616 break;
617 }
618 }
576 619
577 tn = halve(t, tn);
578 620
579 /* Only one child remains */ 621 /* Only one child remains */
580 622
@@ -599,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
599 return (struct node *) tn; 641 return (struct node *) tn;
600} 642}
601 643
602static struct tnode *inflate(struct trie *t, struct tnode *tn) 644static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
603{ 645{
604 struct tnode *inode; 646 struct tnode *inode;
605 struct tnode *oldtnode = tn; 647 struct tnode *oldtnode = tn;
@@ -611,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
611 653
612 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 654 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
613 655
614 if (!tn) 656 if (!tn) {
615 trie_bug("tnode_new failed"); 657 *err = -ENOMEM;
658 return oldtnode;
659 }
660
661 /*
662 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation
664 * fails. In case of failure we return the oldnode and inflate
665 * of tnode is ignored.
666 */
667
668 for(i = 0; i < olen; i++) {
669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670
671 if (inode &&
672 IS_TNODE(inode) &&
673 inode->pos == oldtnode->pos + oldtnode->bits &&
674 inode->bits > 1) {
675 struct tnode *left, *right;
676
677 t_key m = TKEY_GET_MASK(inode->pos, 1);
678
679 left = tnode_new(inode->key&(~m), inode->pos + 1,
680 inode->bits - 1);
681
682 if(!left) {
683 *err = -ENOMEM;
684 break;
685 }
686
687 right = tnode_new(inode->key|m, inode->pos + 1,
688 inode->bits - 1);
689
690 if(!right) {
691 *err = -ENOMEM;
692 break;
693 }
694
695 put_child(t, tn, 2*i, (struct node *) left);
696 put_child(t, tn, 2*i+1, (struct node *) right);
697 }
698 }
699
700 if(*err) {
701 int size = tnode_child_length(tn);
702 int j;
703
704 for(j = 0; j < size; j++)
705 if( tn->child[j])
706 tnode_free((struct tnode *)tn->child[j]);
707
708 tnode_free(tn);
709
710 *err = -ENOMEM;
711 return oldtnode;
712 }
616 713
617 for(i = 0; i < olen; i++) { 714 for(i = 0; i < olen; i++) {
618 struct node *node = tnode_get_child(oldtnode, i); 715 struct node *node = tnode_get_child(oldtnode, i);
@@ -625,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
625 722
626 if(IS_LEAF(node) || ((struct tnode *) node)->pos > 723 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
627 tn->pos + tn->bits - 1) { 724 tn->pos + tn->bits - 1) {
628 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, 725 if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
629 1) == 0) 726 1) == 0)
630 put_child(t, tn, 2*i, node); 727 put_child(t, tn, 2*i, node);
631 else 728 else
@@ -665,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
665 * the position (inode->pos) 762 * the position (inode->pos)
666 */ 763 */
667 764
668 t_key m = TKEY_GET_MASK(inode->pos, 1);
669
670 /* Use the old key, but set the new significant 765 /* Use the old key, but set the new significant
671 * bit to zero. 766 * bit to zero.
672 */ 767 */
673 left = tnode_new(inode->key&(~m), inode->pos + 1,
674 inode->bits - 1);
675 768
676 if(!left) 769 left = (struct tnode *) tnode_get_child(tn, 2*i);
677 trie_bug("tnode_new failed"); 770 put_child(t, tn, 2*i, NULL);
678 771
679 772 if(!left)
680 /* Use the old key, but set the new significant 773 BUG();
681 * bit to one. 774
682 */ 775 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
683 right = tnode_new(inode->key|m, inode->pos + 1, 776 put_child(t, tn, 2*i+1, NULL);
684 inode->bits - 1); 777
778 if(!right)
779 BUG();
685 780
686 if(!right)
687 trie_bug("tnode_new failed");
688
689 size = tnode_child_length(left); 781 size = tnode_child_length(left);
690 for(j = 0; j < size; j++) { 782 for(j = 0; j < size; j++) {
691 put_child(t, left, j, inode->child[j]); 783 put_child(t, left, j, inode->child[j]);
@@ -701,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
701 return tn; 793 return tn;
702} 794}
703 795
704static struct tnode *halve(struct trie *t, struct tnode *tn) 796static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
705{ 797{
706 struct tnode *oldtnode = tn; 798 struct tnode *oldtnode = tn;
707 struct node *left, *right; 799 struct node *left, *right;
@@ -712,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
712 804
713 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 805 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
714 806
715 if(!tn) 807 if (!tn) {
716 trie_bug("tnode_new failed"); 808 *err = -ENOMEM;
809 return oldtnode;
810 }
811
812 /*
813 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation
815 * fails. In case of failure we return the oldnode and halve
816 * of tnode is ignored.
817 */
818
819 for(i = 0; i < olen; i += 2) {
820 left = tnode_get_child(oldtnode, i);
821 right = tnode_get_child(oldtnode, i+1);
822
823 /* Two nonempty children */
824 if( left && right) {
825 struct tnode *newBinNode =
826 tnode_new(left->key, tn->pos + tn->bits, 1);
827
828 if(!newBinNode) {
829 *err = -ENOMEM;
830 break;
831 }
832 put_child(t, tn, i/2, (struct node *)newBinNode);
833 }
834 }
835
836 if(*err) {
837 int size = tnode_child_length(tn);
838 int j;
839
840 for(j = 0; j < size; j++)
841 if( tn->child[j])
842 tnode_free((struct tnode *)tn->child[j]);
843
844 tnode_free(tn);
845
846 *err = -ENOMEM;
847 return oldtnode;
848 }
717 849
718 for(i = 0; i < olen; i += 2) { 850 for(i = 0; i < olen; i += 2) {
719 left = tnode_get_child(oldtnode, i); 851 left = tnode_get_child(oldtnode, i);
@@ -730,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
730 /* Two nonempty children */ 862 /* Two nonempty children */
731 else { 863 else {
732 struct tnode *newBinNode = 864 struct tnode *newBinNode =
733 tnode_new(left->key, tn->pos + tn->bits, 1); 865 (struct tnode *) tnode_get_child(tn, i/2);
866 put_child(t, tn, i/2, NULL);
734 867
735 if(!newBinNode) 868 if(!newBinNode)
736 trie_bug("tnode_new failed"); 869 BUG();
737 870
738 put_child(t, newBinNode, 0, left); 871 put_child(t, newBinNode, 0, left);
739 put_child(t, newBinNode, 1, right); 872 put_child(t, newBinNode, 1, right);
@@ -2301,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2301 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2434 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2302 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2435 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2303 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2436 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2437 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2304#ifdef CLEAR_STATS 2438#ifdef CLEAR_STATS
2305 memset(&(t->stats), 0, sizeof(t->stats)); 2439 memset(&(t->stats), 0, sizeof(t->stats));
2306#endif 2440#endif
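
The fib_trie changes above replace the trie_bug() panics in inflate() and halve() with a recoverable -ENOMEM path: every replacement child tnode is allocated before the trie is touched, and if any allocation fails the partially built node is freed again, *err is set, and the caller keeps the old tnode (counted by the new "skipped node resize" statistic). A minimal standalone sketch of that preallocate-then-commit pattern, with alloc_node()/free_node() standing in for the kernel's tnode allocator:

#include <stdlib.h>
#include <errno.h>

struct node { int payload; };

/* Stand-ins for tnode_new()/tnode_free(); invented for this sketch. */
static struct node *alloc_node(void) { return calloc(1, sizeof(struct node)); }
static void free_node(struct node *n) { free(n); }

/*
 * Allocate every replacement child first.  Only when all allocations
 * succeed may the caller rewire the trie; on failure everything
 * allocated so far is released and the old node stays in place.
 */
static int preallocate_children(struct node **slots, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                slots[i] = alloc_node();
                if (!slots[i])
                        goto rollback;
        }
        return 0;                       /* safe to commit */

rollback:
        while (--i >= 0) {
                free_node(slots[i]);
                slots[i] = NULL;
        }
        return -ENOMEM;                 /* caller skips the resize */
}

A caller modelled on resize() would treat a nonzero return the same way the patch treats *err: keep using the old tnode and bump a resize_node_skipped counter.
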
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6ce5c3292f9f..9de83e6e0f1d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -389,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
389 to->pkt_type = from->pkt_type; 389 to->pkt_type = from->pkt_type;
390 to->priority = from->priority; 390 to->priority = from->priority;
391 to->protocol = from->protocol; 391 to->protocol = from->protocol;
392 to->security = from->security;
393 dst_release(to->dst); 392 dst_release(to->dst);
394 to->dst = dst_clone(from->dst); 393 to->dst = dst_clone(from->dst);
395 to->dev = from->dev; 394 to->dev = from->dev;
@@ -1329,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1329 ip_rt_put(rt); 1328 ip_rt_put(rt);
1330} 1329}
1331 1330
1332/*
1333 * IP protocol layer initialiser
1334 */
1335
1336static struct packet_type ip_packet_type = {
1337 .type = __constant_htons(ETH_P_IP),
1338 .func = ip_rcv,
1339};
1340
1341/*
1342 * IP registers the packet type and then calls the subprotocol initialisers
1343 */
1344
1345void __init ip_init(void) 1331void __init ip_init(void)
1346{ 1332{
1347 dev_add_pack(&ip_packet_type);
1348
1349 ip_rt_init(); 1333 ip_rt_init();
1350 inet_initpeers(); 1334 inet_initpeers();
1351 1335
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 12a1cf306f67..726ea5e8180a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
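
Dropping the per-bucket spinlock shrinks struct rt_hash_bucket to a single chain pointer; the buckets now share a small power-of-two table of locks selected by masking the bucket index, so lock memory scales with NR_CPUS rather than with the size of the route cache. A rough userspace model of that mapping (pthread mutexes and the structure names below are stand-ins for the kernel spinlock API):

#include <pthread.h>
#include <stddef.h>

#define RT_HASH_LOCK_SZ 256                     /* must stay a power of two */

static pthread_mutex_t rt_hash_locks[RT_HASH_LOCK_SZ] = {
        [0 ... RT_HASH_LOCK_SZ - 1] = PTHREAD_MUTEX_INITIALIZER
};

/* Map any bucket index onto one of the shared locks. */
static pthread_mutex_t *rt_hash_lock_addr(unsigned int slot)
{
        return &rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)];
}

struct entry { struct entry *next; };
struct bucket { struct entry *chain; };         /* one pointer, no embedded lock */

static void bucket_insert(struct bucket *tbl, unsigned int hash,
                          struct entry *e)
{
        pthread_mutex_t *lock = rt_hash_lock_addr(hash);

        pthread_mutex_lock(lock);               /* several buckets share this lock */
        e->next = tbl[hash].chain;
        tbl[hash].chain = e;
        pthread_mutex_unlock(lock);
}

Adjacent hash values still map to different locks, so contention stays low even though far fewer locks exist than buckets.
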
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
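
The rewritten rt_check_expire() above walks a fixed number of buckets per timer run instead of looping on a time budget: the goal is interval * nbuckets / timeout, clamped to the table size, so the whole cache is scanned roughly once per gc timeout regardless of table size, and empty chains are skipped without taking a lock. A standalone model of that arithmetic (function name and the numbers in main() are invented; in the kernel all three tunables are in jiffies, so the HZ factor cancels out):

#include <stdio.h>
#include <stdint.h>

/* Per-run bucket budget, mirroring the mult/do_div computation above. */
static unsigned int scan_goal(uint64_t gc_interval, unsigned int hash_log,
                              uint64_t gc_timeout, unsigned int hash_mask)
{
        uint64_t mult = gc_interval << hash_log;        /* interval * nbuckets */
        unsigned int goal;

        if (gc_timeout > 1)
                mult /= gc_timeout;                     /* do_div() in the kernel */
        goal = (unsigned int)mult;
        if (goal > hash_mask)
                goal = hash_mask + 1;                   /* at most one full sweep */
        return goal;
}

int main(void)
{
        /* e.g. 60 s interval, 2^17 buckets, 300 s timeout */
        printf("goal = %u buckets per run\n",
               scan_goal(60, 17, 300, (1u << 17) - 1));
        return 0;
}
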
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3073 3111
3074int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3075{ 3113{
3076 int i, order, goal, rc = 0; 3114 int rc = 0;
3077 3115
3078 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3079 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3080 3118
3081#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3082 for (order = 0; 3122 for (order = 0;
3083 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3084 /* NOTHING */; 3124 /* NOTHING */;
@@ -3086,6 +3126,7 @@ int __init ip_rt_init(void)
3086 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3087 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3088 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3089#endif 3130#endif
3090 3131
3091 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3096,36 +3137,19 @@ int __init ip_rt_init(void)
3096 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3097 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3098 3139
3099 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3100 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3101 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3102 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3103 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3104 3145 (27 - PAGE_SHIFT) :
3105 do { 3146 (29 - PAGE_SHIFT),
3106 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3107 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3108 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3109 rt_hash_mask--; 3150 0);
3110 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3111 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3112 } while (rt_hash_table == NULL && --order > 0);
3113
3114 if (!rt_hash_table)
3115 panic("Failed to allocate IP route cache hash table\n");
3116
3117 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3118 rt_hash_mask,
3119 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3120
3121 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3122 /* NOTHING */;
3123
3124 rt_hash_mask--;
3125 for (i = 0; i <= rt_hash_mask; i++) {
3126 spin_lock_init(&rt_hash_table[i].lock);
3127 rt_hash_table[i].chain = NULL;
3128 }
3129 3153
3130 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3131 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
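
ip_rt_init() no longer open-codes the sizing loop: alloc_large_system_hash() picks the bucket count (from available memory, or from the rhash_entries= boot option when given) and hands back rt_hash_log and rt_hash_mask through its out parameters. The deleted code shows the relationship the helper now maintains between those values; a small sketch of just that geometry, with invented names:

#include <stdio.h>

/* For a power-of-two table: log2(buckets) and the index mask. */
static void hash_geometry(unsigned long buckets,
                          unsigned int *log, unsigned int *mask)
{
        unsigned int shift = 0;

        while ((1UL << shift) < buckets)
                shift++;
        *log = shift;
        *mask = (1U << shift) - 1;      /* buckets - 1 */
}

int main(void)
{
        unsigned int log, mask;

        hash_geometry(1UL << 17, &log, &mask);
        printf("log=%u mask=%#x\n", log, mask);
        return 0;
}
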
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 882436da9a3a..29894c749163 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 766
762 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
763 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 tmp = pgbreak; 769 tmp = pgbreak;
770 }
765 } 771 }
772
766 return tmp; 773 return tmp;
767} 774}
768 775
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
773 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
774 struct sk_buff *skb; 781 struct sk_buff *skb;
775 int iovlen, flags; 782 int iovlen, flags;
776 int mss_now; 783 int mss_now, size_goal;
777 int err, copied; 784 int err, copied;
778 long timeo; 785 long timeo;
779 786
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 800
794 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
795 803
796 /* Ok commence sending. */ 804 /* Ok commence sending. */
797 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
814 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
815 823
816 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
817 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
818 826
819new_segment: 827new_segment:
820 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
837 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
838 846
839 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
840 copy = mss_now; 848 copy = size_goal;
841 } 849 }
842 850
843 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
872 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
873 goto new_segment; 881 goto new_segment;
874 } else if (page) { 882 } else if (page) {
875 /* If page is cached, align
876 * offset to L1 cache boundary
877 */
878 off = (off + L1_CACHE_BYTES - 1) &
879 ~(L1_CACHE_BYTES - 1);
880 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
881 put_page(page); 884 put_page(page);
882 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2128 2132
2129 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2130 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2131 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2132 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2133 2137
2134 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2178 2182
2179 switch (optname) { 2183 switch (optname) {
2180 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2181 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2182 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2183 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2184 break; 2188 break;
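
With the single mss_cache, tcp_sendmsg()/do_tcp_sendpages() above no longer try to fill each skb to exactly one MSS: tcp_current_mss() also leaves a larger target in tp->xmit_size_goal (an MSS multiple when TSO is usable), the copy loops fill the tail skb toward that goal, and a push is only considered once the skb holds at least one full MSS. A standalone sketch of the copy-budget decision, with invented names and an illustrative goal:

#include <stdio.h>

/* How much more data may be appended to the skb at the queue tail?
 * A zero budget means "start a new segment".
 */
static int copy_budget(int skb_len, int size_goal)
{
        int copy = size_goal - skb_len;

        return copy > 0 ? copy : 0;
}

int main(void)
{
        int mss = 1448;
        int size_goal = 44 * mss;       /* illustrative TSO goal, ~64 KB */

        printf("empty tail skb: up to %d bytes\n", copy_budget(0, size_goal));
        printf("tail skb at goal: up to %d bytes\n",
               copy_budget(size_goal, size_goal));
        return 0;
}

The related test change from "skb->len != mss_now" to "skb->len < mss_now" keeps the old push behaviour for non-TSO sockets while letting a TSO skb grow past one MSS before any push decision is made.
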
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7bbbbc33eb4b..8de2f1071c2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
741 741
742 if (!cwnd) { 742 if (!cwnd) {
743 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
744 cwnd = 2; 744 cwnd = 2;
745 else 745 else
746 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
747 } 747 }
748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
749} 749}
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
914 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
915 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
916 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
917 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
918 } 918 }
919 919
920 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1077 (IsFack(tp) || 1077 (IsFack(tp) ||
1078 !before(lost_retrans, 1078 !before(lost_retrans,
1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1080 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1083 1083
@@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
1957 } 1957 }
1958} 1958}
1959 1959
1960/* There is one downside to this scheme. Although we keep the
1961 * ACK clock ticking, adjusting packet counters and advancing
1962 * congestion window, we do not liberate socket send buffer
1963 * space.
1964 *
1965 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
1966 * then making a write space wakeup callback is a possible
1967 * future enhancement. WARNING: it is not trivial to make.
1968 */
1969static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
1970 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
1971{ 1962{
@@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2047 * the other end. 2038 * the other end.
2048 */ 2039 */
2049 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2050 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2051 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2052 now, &seq_rtt); 2044 now, &seq_rtt);
2053 break; 2045 break;
@@ -3308,6 +3300,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3308 tp->snd_cwnd_stamp = tcp_time_stamp; 3300 tp->snd_cwnd_stamp = tcp_time_stamp;
3309} 3301}
3310 3302
3303static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3304{
3305 /* If the user specified a specific send buffer setting, do
3306 * not modify it.
3307 */
3308 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3309 return 0;
3310
3311 /* If we are under global TCP memory pressure, do not expand. */
3312 if (tcp_memory_pressure)
3313 return 0;
3314
3315 /* If we are under soft global TCP memory pressure, do not expand. */
3316 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3317 return 0;
3318
3319 /* If we filled the congestion window, do not expand. */
3320 if (tp->packets_out >= tp->snd_cwnd)
3321 return 0;
3322
3323 return 1;
3324}
3311 3325
3312/* When incoming ACK allowed to free some skb from write_queue, 3326/* When incoming ACK allowed to free some skb from write_queue,
3313 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3327 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3319,11 +3333,8 @@ static void tcp_new_space(struct sock *sk)
3319{ 3333{
3320 struct tcp_sock *tp = tcp_sk(sk); 3334 struct tcp_sock *tp = tcp_sk(sk);
3321 3335
3322 if (tp->packets_out < tp->snd_cwnd && 3336 if (tcp_should_expand_sndbuf(sk, tp)) {
3323 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3337 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3324 !tcp_memory_pressure &&
3325 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3326 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3327 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3338 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3328 demanded = max_t(unsigned int, tp->snd_cwnd, 3339 demanded = max_t(unsigned int, tp->snd_cwnd,
3329 tp->reordering + 1); 3340 tp->reordering + 1);
@@ -3346,22 +3357,9 @@ static inline void tcp_check_space(struct sock *sk)
3346 } 3357 }
3347} 3358}
3348 3359
3349static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3360static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3350{
3351 struct tcp_sock *tp = tcp_sk(sk);
3352
3353 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3354 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3355 tcp_write_xmit(sk, tp->nonagle))
3356 tcp_check_probe_timer(sk, tp);
3357}
3358
3359static __inline__ void tcp_data_snd_check(struct sock *sk)
3360{ 3361{
3361 struct sk_buff *skb = sk->sk_send_head; 3362 tcp_push_pending_frames(sk, tp);
3362
3363 if (skb != NULL)
3364 __tcp_data_snd_check(sk, skb);
3365 tcp_check_space(sk); 3363 tcp_check_space(sk);
3366} 3364}
3367 3365
@@ -3655,7 +3653,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3655 */ 3653 */
3656 tcp_ack(sk, skb, 0); 3654 tcp_ack(sk, skb, 0);
3657 __kfree_skb(skb); 3655 __kfree_skb(skb);
3658 tcp_data_snd_check(sk); 3656 tcp_data_snd_check(sk, tp);
3659 return 0; 3657 return 0;
3660 } else { /* Header too small */ 3658 } else { /* Header too small */
3661 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3659 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -3721,7 +3719,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3721 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3719 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
3722 /* Well, only one small jumplet in fast path... */ 3720 /* Well, only one small jumplet in fast path... */
3723 tcp_ack(sk, skb, FLAG_DATA); 3721 tcp_ack(sk, skb, FLAG_DATA);
3724 tcp_data_snd_check(sk); 3722 tcp_data_snd_check(sk, tp);
3725 if (!tcp_ack_scheduled(tp)) 3723 if (!tcp_ack_scheduled(tp))
3726 goto no_ack; 3724 goto no_ack;
3727 } 3725 }
@@ -3799,7 +3797,7 @@ step5:
3799 /* step 7: process the segment text */ 3797 /* step 7: process the segment text */
3800 tcp_data_queue(sk, skb); 3798 tcp_data_queue(sk, skb);
3801 3799
3802 tcp_data_snd_check(sk); 3800 tcp_data_snd_check(sk, tp);
3803 tcp_ack_snd_check(sk); 3801 tcp_ack_snd_check(sk);
3804 return 0; 3802 return 0;
3805 3803
@@ -4109,7 +4107,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4109 /* Do step6 onward by hand. */ 4107 /* Do step6 onward by hand. */
4110 tcp_urg(sk, skb, th); 4108 tcp_urg(sk, skb, th);
4111 __kfree_skb(skb); 4109 __kfree_skb(skb);
4112 tcp_data_snd_check(sk); 4110 tcp_data_snd_check(sk, tp);
4113 return 0; 4111 return 0;
4114 } 4112 }
4115 4113
@@ -4300,7 +4298,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4300 4298
4301 /* tcp_data could move socket to TIME-WAIT */ 4299 /* tcp_data could move socket to TIME-WAIT */
4302 if (sk->sk_state != TCP_CLOSE) { 4300 if (sk->sk_state != TCP_CLOSE) {
4303 tcp_data_snd_check(sk); 4301 tcp_data_snd_check(sk, tp);
4304 tcp_ack_snd_check(sk); 4302 tcp_ack_snd_check(sk);
4305 } 4303 }
4306 4304
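
The tcp_should_expand_sndbuf() helper added above gathers the four conditions that previously sat inline in tcp_new_space(): an application-locked SO_SNDBUF, hard TCP memory pressure, allocation above the soft limit, or a full congestion window each veto automatic growth of the send buffer. A standalone model of that predicate (names are illustrative; all inputs are passed in explicitly rather than read from socket and global state):

#include <stdbool.h>
#include <stdio.h>

static bool should_expand_sndbuf(bool sndbuf_locked_by_user,
                                 bool memory_pressure,
                                 long tcp_memory_allocated, long tcp_mem_soft_limit,
                                 unsigned int packets_out, unsigned int snd_cwnd)
{
        if (sndbuf_locked_by_user)                      /* SO_SNDBUF was set */
                return false;
        if (memory_pressure)                            /* hard pressure flag */
                return false;
        if (tcp_memory_allocated >= tcp_mem_soft_limit) /* soft pressure */
                return false;
        if (packets_out >= snd_cwnd)                    /* cwnd already full */
                return false;
        return true;
}

int main(void)
{
        printf("expand? %d\n",
               should_expand_sndbuf(false, false, 100, 1000, 8, 10));
        return 0;
}
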
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2045 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2047 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache = 536;
2049 2049
2050 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops; 2051 tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0e17c244875c..e041d057ec86 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
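
The default tcp_tso_win_divisor drops from 8 to 3 because its role changes with the new send path: rather than capping how much of the congestion window a single TSO super-packet may consume, it now sets the point at which a partially sendable TSO frame stops being deferred, namely once at least 1/divisor of the window is usable (see tcp_tso_should_defer() later in this file). A tiny worked example of that threshold, with an invented helper and illustrative numbers:

#include <stdio.h>

/* Bytes of usable window needed before a deferred TSO frame is sent
 * anyway: min(send window, cwnd in bytes) / divisor.
 */
static unsigned int defer_threshold(unsigned int snd_wnd_bytes,
                                    unsigned int cwnd_segs,
                                    unsigned int mss, unsigned int divisor)
{
        unsigned int chunk = cwnd_segs * mss;

        if (chunk > snd_wnd_bytes)
                chunk = snd_wnd_bytes;
        return chunk / divisor;
}

int main(void)
{
        /* 64000-byte window, 40-segment cwnd, 1448-byte MSS, divisor 3 */
        printf("send once %u bytes are available\n",
               defer_threshold(64000, 40, 1448, 3));
        return 0;
}
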
@@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
140 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
141} 141}
142 142
143static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
144{ 144{
145 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
146 146
147 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
149} 149}
150 150
@@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
355 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
356 356
357 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
358 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
359 359
360 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
361 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
403 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
404} 404}
405 405
406static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
407{
408 /* Force push to be on for any TSO frames to workaround
409 * problems with busted implementations like Mac OS-X that
410 * hold off socket receive wakeups until push is seen.
411 */
412 if (tcp_skb_pcount(skb) > 1)
413 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
414}
415
416/* Send _single_ skb sitting at the send head. This function requires
417 * true push pending frames to setup probe timer etc.
418 */
419void tcp_push_one(struct sock *sk, unsigned cur_mss)
420{ 407{
421 struct tcp_sock *tp = tcp_sk(sk); 408 struct tcp_sock *tp = tcp_sk(sk);
422 struct sk_buff *skb = sk->sk_send_head;
423 409
424 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { 410 if (skb->len <= tp->mss_cache ||
425 /* Send it out now. */
426 TCP_SKB_CB(skb)->when = tcp_time_stamp;
427 tcp_tso_set_push(skb);
428 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
429 sk->sk_send_head = NULL;
430 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
431 tcp_packets_out_inc(sk, tp, skb);
432 return;
433 }
434 }
435}
436
437void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
438{
439 struct tcp_sock *tp = tcp_sk(sk);
440
441 if (skb->len <= tp->mss_cache_std ||
442 !(sk->sk_route_caps & NETIF_F_TSO)) { 411 !(sk->sk_route_caps & NETIF_F_TSO)) {
443 /* Avoid the costly divide in the normal 412 /* Avoid the costly divide in the normal
444 * non-TSO case. 413 * non-TSO case.
@@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
448 } else { 417 } else {
449 unsigned int factor; 418 unsigned int factor;
450 419
451 factor = skb->len + (tp->mss_cache_std - 1); 420 factor = skb->len + (tp->mss_cache - 1);
452 factor /= tp->mss_cache_std; 421 factor /= tp->mss_cache;
453 skb_shinfo(skb)->tso_segs = factor; 422 skb_shinfo(skb)->tso_segs = factor;
454 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 423 skb_shinfo(skb)->tso_size = tp->mss_cache;
455 } 424 }
456} 425}
457 426
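
tcp_set_skb_tso_segs() now derives everything from the one mss_cache: an skb carrying more than one MSS advertises ceil(len / mss) segments of mss bytes each, and the single-segment case skips the divide entirely. The old tcp_tso_set_push() workaround and the tcp_push_one() body disappear from this spot; pushing is handled by the rewritten send path further down. A standalone model of the segment bookkeeping (names invented for the sketch):

#include <stdio.h>

struct tso_info { unsigned int segs, size; };

/* ceil(len / mss) segments of mss bytes; one segment (and no divide)
 * when the skb fits in a single MSS or TSO is unavailable.
 */
static struct tso_info tso_segs_for(unsigned int len, unsigned int mss,
                                    int tso_capable)
{
        struct tso_info ti;

        if (len <= mss || !tso_capable) {
                ti.segs = 1;
                ti.size = 0;
        } else {
                ti.segs = (len + mss - 1) / mss;
                ti.size = mss;
        }
        return ti;
}

int main(void)
{
        struct tso_info ti = tso_segs_for(63712, 1448, 1);

        printf("%u segments of %u bytes\n", ti.segs, ti.size);
        return 0;
}
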
@@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
537 } 506 }
538 507
539 /* Link BUFF into the send queue. */ 508 /* Link BUFF into the send queue. */
509 skb_header_release(buff);
540 __skb_append(skb, buff); 510 __skb_append(skb, buff);
541 511
542 return 0; 512 return 0;
@@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
657 627
658 /* And store cached results */ 628 /* And store cached results */
659 tp->pmtu_cookie = pmtu; 629 tp->pmtu_cookie = pmtu;
660 tp->mss_cache = tp->mss_cache_std = mss_now; 630 tp->mss_cache = mss_now;
661 631
662 return mss_now; 632 return mss_now;
663} 633}
@@ -669,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
669 * cannot be large. However, taking into account rare use of URG, this 639 * cannot be large. However, taking into account rare use of URG, this
670 * is not a big flaw. 640 * is not a big flaw.
671 */ 641 */
672 642unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
673unsigned int tcp_current_mss(struct sock *sk, int large)
674{ 643{
675 struct tcp_sock *tp = tcp_sk(sk); 644 struct tcp_sock *tp = tcp_sk(sk);
676 struct dst_entry *dst = __sk_dst_get(sk); 645 struct dst_entry *dst = __sk_dst_get(sk);
677 unsigned int do_large, mss_now; 646 u32 mss_now;
647 u16 xmit_size_goal;
648 int doing_tso = 0;
649
650 mss_now = tp->mss_cache;
651
652 if (large_allowed &&
653 (sk->sk_route_caps & NETIF_F_TSO) &&
654 !tp->urg_mode)
655 doing_tso = 1;
678 656
679 mss_now = tp->mss_cache_std;
680 if (dst) { 657 if (dst) {
681 u32 mtu = dst_mtu(dst); 658 u32 mtu = dst_mtu(dst);
682 if (mtu != tp->pmtu_cookie) 659 if (mtu != tp->pmtu_cookie)
683 mss_now = tcp_sync_mss(sk, mtu); 660 mss_now = tcp_sync_mss(sk, mtu);
684 } 661 }
685 662
686 do_large = (large && 663 if (tp->rx_opt.eff_sacks)
687 (sk->sk_route_caps & NETIF_F_TSO) && 664 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
688 !tp->urg_mode); 665 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
689 666
690 if (do_large) { 667 xmit_size_goal = mss_now;
691 unsigned int large_mss, factor, limit;
692 668
693 large_mss = 65535 - tp->af_specific->net_header_len - 669 if (doing_tso) {
670 xmit_size_goal = 65535 -
671 tp->af_specific->net_header_len -
694 tp->ext_header_len - tp->tcp_header_len; 672 tp->ext_header_len - tp->tcp_header_len;
695 673
696 if (tp->max_window && large_mss > (tp->max_window>>1)) 674 if (tp->max_window &&
697 large_mss = max((tp->max_window>>1), 675 (xmit_size_goal > (tp->max_window >> 1)))
698 68U - tp->tcp_header_len); 676 xmit_size_goal = max((tp->max_window >> 1),
677 68U - tp->tcp_header_len);
678
679 xmit_size_goal -= (xmit_size_goal % mss_now);
680 }
681 tp->xmit_size_goal = xmit_size_goal;
699 682
700 factor = large_mss / mss_now; 683 return mss_now;
684}
701 685
702 /* Always keep large mss multiple of real mss, but 686/* Congestion window validation. (RFC2861) */
703 * do not exceed 1/tso_win_divisor of the congestion window
704 * so we can keep the ACK clock ticking and minimize
705 * bursting.
706 */
707 limit = tp->snd_cwnd;
708 if (sysctl_tcp_tso_win_divisor)
709 limit /= sysctl_tcp_tso_win_divisor;
710 limit = max(1U, limit);
711 if (factor > limit)
712 factor = limit;
713 687
714 tp->mss_cache = mss_now * factor; 688static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
689{
690 __u32 packets_out = tp->packets_out;
691
692 if (packets_out >= tp->snd_cwnd) {
693 /* Network is feed fully. */
694 tp->snd_cwnd_used = 0;
695 tp->snd_cwnd_stamp = tcp_time_stamp;
696 } else {
697 /* Network starves. */
698 if (tp->packets_out > tp->snd_cwnd_used)
699 tp->snd_cwnd_used = tp->packets_out;
715 700
716 mss_now = tp->mss_cache; 701 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
702 tcp_cwnd_application_limited(sk);
717 } 703 }
704}
718 705
719 if (tp->rx_opt.eff_sacks) 706static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
720 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 707{
721 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 708 u32 window, cwnd_len;
722 return mss_now; 709
710 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
711 cwnd_len = mss_now * cwnd;
712 return min(window, cwnd_len);
713}
714
715/* Can at least one segment of SKB be sent right now, according to the
716 * congestion window rules? If so, return how many segments are allowed.
717 */
718static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
719{
720 u32 in_flight, cwnd;
721
722 /* Don't be strict about the congestion window for the final FIN. */
723 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
724 return 1;
725
726 in_flight = tcp_packets_in_flight(tp);
727 cwnd = tp->snd_cwnd;
728 if (in_flight < cwnd)
729 return (cwnd - in_flight);
730
731 return 0;
732}
733
734/* This must be invoked the first time we consider transmitting
735 * SKB onto the wire.
736 */
737static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
738{
739 int tso_segs = tcp_skb_pcount(skb);
740
741 if (!tso_segs) {
742 tcp_set_skb_tso_segs(sk, skb);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0, if packet can be sent now without violation Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* Nagle rule does not apply to frames, which sit in the middle of the
780 * write_queue (they have no chances to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 BUG_ON(skb->len != skb->data_len);
865
866 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
867 if (unlikely(buff == NULL))
868 return -ENOMEM;
869
870 buff->truesize = nlen;
871 skb->truesize -= nlen;
872
873 /* Correct the sequence numbers. */
874 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
875 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
876 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
877
878 /* PSH and FIN should only be set in the second packet. */
879 flags = TCP_SKB_CB(skb)->flags;
880 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
881 TCP_SKB_CB(buff)->flags = flags;
882
883 /* This packet was never sent out yet, so no SACK bits. */
884 TCP_SKB_CB(buff)->sacked = 0;
885
886 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
887 skb_split(skb, buff, len);
888
889 /* Fix up tso_factor for both original and new SKB. */
890 tcp_set_skb_tso_segs(sk, skb);
891 tcp_set_skb_tso_segs(sk, buff);
892
893 /* Link BUFF into the send queue. */
894 skb_header_release(buff);
895 __skb_append(skb, buff);
896
897 return 0;
898}
899
900/* Try to defer sending, if possible, in order to minimize the amount
901 * of TSO splitting we do. View it as a kind of TSO Nagle test.
902 *
903 * This algorithm is from John Heffner.
904 */
905static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
906{
907 u32 send_win, cong_win, limit, in_flight;
908
909 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
910 return 0;
911
912 if (tp->ca_state != TCP_CA_Open)
913 return 0;
914
915 in_flight = tcp_packets_in_flight(tp);
916
917 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
918 (tp->snd_cwnd <= in_flight));
919
920 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
921
922 /* From in_flight test above, we know that cwnd > in_flight. */
923 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
924
925 limit = min(send_win, cong_win);
926
927 /* If sk_send_head can be sent fully now, just do it. */
928 if (skb->len <= limit)
929 return 0;
930
931 if (sysctl_tcp_tso_win_divisor) {
932 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
933
934 /* If at least some fraction of a window is available,
935 * just use it.
936 */
937 chunk /= sysctl_tcp_tso_win_divisor;
938 if (limit >= chunk)
939 return 0;
940 } else {
941 /* Different approach, try not to defer past a single
942 * ACK. Receiver should ACK every other full sized
943 * frame, so if we have space for more than 3 frames
944 * then send now.
945 */
946 if (limit > tcp_max_burst(tp) * tp->mss_cache)
947 return 0;
948 }
949
950 /* Ok, it looks like it is advisable to defer. */
951 return 1;
723} 952}
724 953
725/* This routine writes packets to the network. It advances the 954/* This routine writes packets to the network. It advances the
@@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
729 * Returns 1, if no segments are in flight and we have queued segments, but 958 * Returns 1, if no segments are in flight and we have queued segments, but
730 * cannot send anything now because of SWS or another problem. 959 * cannot send anything now because of SWS or another problem.
731 */ 960 */
732int tcp_write_xmit(struct sock *sk, int nonagle) 961static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
733{ 962{
734 struct tcp_sock *tp = tcp_sk(sk); 963 struct tcp_sock *tp = tcp_sk(sk);
735 unsigned int mss_now; 964 struct sk_buff *skb;
965 unsigned int tso_segs, sent_pkts;
966 int cwnd_quota;
736 967
737 /* If we are closed, the bytes will have to remain here. 968 /* If we are closed, the bytes will have to remain here.
738 * In time closedown will finish, we empty the write queue and all 969 * In time closedown will finish, we empty the write queue and all
739 * will be happy. 970 * will be happy.
740 */ 971 */
741 if (sk->sk_state != TCP_CLOSE) { 972 if (unlikely(sk->sk_state == TCP_CLOSE))
742 struct sk_buff *skb; 973 return 0;
743 int sent_pkts = 0; 974
975 skb = sk->sk_send_head;
976 if (unlikely(!skb))
977 return 0;
978
979 tso_segs = tcp_init_tso_segs(sk, skb);
980 cwnd_quota = tcp_cwnd_test(tp, skb);
981 if (unlikely(!cwnd_quota))
982 goto out;
983
984 sent_pkts = 0;
985 while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
986 BUG_ON(!tso_segs);
987
988 if (tso_segs == 1) {
989 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
990 (tcp_skb_is_last(sk, skb) ?
991 nonagle : TCP_NAGLE_PUSH))))
992 break;
993 } else {
994 if (tcp_tso_should_defer(sk, tp, skb))
995 break;
996 }
744 997
745 /* Account for SACKS, we may need to fragment due to this. 998 if (tso_segs > 1) {
746 * It is just like the real MSS changing on us midstream. 999 u32 limit = tcp_window_allows(tp, skb,
747 * We also handle things correctly when the user adds some 1000 mss_now, cwnd_quota);
748 * IP options mid-stream. Silly to do, but cover it. 1001
749 */ 1002 if (skb->len < limit) {
750 mss_now = tcp_current_mss(sk, 1); 1003 unsigned int trim = skb->len % mss_now;
751 1004
752 while ((skb = sk->sk_send_head) && 1005 if (trim)
753 tcp_snd_test(sk, skb, mss_now, 1006 limit = skb->len - trim;
754 tcp_skb_is_last(sk, skb) ? nonagle : 1007 }
755 TCP_NAGLE_PUSH)) { 1008 if (skb->len > limit) {
756 if (skb->len > mss_now) { 1009 if (tso_fragment(sk, skb, limit))
757 if (tcp_fragment(sk, skb, mss_now))
758 break; 1010 break;
759 } 1011 }
760 1012 } else if (unlikely(skb->len > mss_now)) {
761 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1013 if (unlikely(tcp_fragment(sk, skb, mss_now)))
762 tcp_tso_set_push(skb);
763 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
764 break; 1014 break;
1015 }
765 1016
766 /* Advance the send_head. This one is sent out. 1017 TCP_SKB_CB(skb)->when = tcp_time_stamp;
767 * This call will increment packets_out. 1018
768 */ 1019 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
769 update_send_head(sk, tp, skb); 1020 break;
1021
1022 /* Advance the send_head. This one is sent out.
1023 * This call will increment packets_out.
1024 */
1025 update_send_head(sk, tp, skb);
1026
1027 tcp_minshall_update(tp, mss_now, skb);
1028 sent_pkts++;
1029
1030 /* Do not optimize this to use tso_segs. If we chopped up
1031 * the packet above, tso_segs will no longer be valid.
1032 */
1033 cwnd_quota -= tcp_skb_pcount(skb);
1034
1035 BUG_ON(cwnd_quota < 0);
1036 if (!cwnd_quota)
1037 break;
1038
1039 skb = sk->sk_send_head;
1040 if (!skb)
1041 break;
1042 tso_segs = tcp_init_tso_segs(sk, skb);
1043 }
1044
1045 if (likely(sent_pkts)) {
1046 tcp_cwnd_validate(sk, tp);
1047 return 0;
1048 }
1049out:
1050 return !tp->packets_out && sk->sk_send_head;
1051}
1052
1053/* Push out any pending frames which were held back due to
1054 * TCP_CORK or attempt at coalescing tiny packets.
1055 * The socket must be locked by the caller.
1056 */
1057void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1058 unsigned int cur_mss, int nonagle)
1059{
1060 struct sk_buff *skb = sk->sk_send_head;
770 1061
771 tcp_minshall_update(tp, mss_now, skb); 1062 if (skb) {
772 sent_pkts = 1; 1063 if (tcp_write_xmit(sk, cur_mss, nonagle))
1064 tcp_check_probe_timer(sk, tp);
1065 }
1066}
1067
1068/* Send _single_ skb sitting at the send head. This function requires
1069 * true push pending frames to setup probe timer etc.
1070 */
1071void tcp_push_one(struct sock *sk, unsigned int mss_now)
1072{
1073 struct tcp_sock *tp = tcp_sk(sk);
1074 struct sk_buff *skb = sk->sk_send_head;
1075 unsigned int tso_segs, cwnd_quota;
1076
1077 BUG_ON(!skb || skb->len < mss_now);
1078
1079 tso_segs = tcp_init_tso_segs(sk, skb);
1080 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1081
1082 if (likely(cwnd_quota)) {
1083 BUG_ON(!tso_segs);
1084
1085 if (tso_segs > 1) {
1086 u32 limit = tcp_window_allows(tp, skb,
1087 mss_now, cwnd_quota);
1088
1089 if (skb->len < limit) {
1090 unsigned int trim = skb->len % mss_now;
1091
1092 if (trim)
1093 limit = skb->len - trim;
1094 }
1095 if (skb->len > limit) {
1096 if (unlikely(tso_fragment(sk, skb, limit)))
1097 return;
1098 }
1099 } else if (unlikely(skb->len > mss_now)) {
1100 if (unlikely(tcp_fragment(sk, skb, mss_now)))
1101 return;
773 } 1102 }
774 1103
775 if (sent_pkts) { 1104 /* Send it out now. */
1105 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106
1107 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1108 update_send_head(sk, tp, skb);
776 tcp_cwnd_validate(sk, tp); 1109 tcp_cwnd_validate(sk, tp);
777 return 0; 1110 return;
778 } 1111 }
779
780 return !tp->packets_out && sk->sk_send_head;
781 } 1112 }
782 return 0;
783} 1113}
784 1114
785/* This function returns the amount that we can raise the 1115/* This function returns the amount that we can raise the
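
In the rewritten tcp_write_xmit() and tcp_push_one() above, an oversized TSO skb is not sent whole: the allowance is min(remaining send window, cwnd quota in bytes), rounded down to a whole number of MSS-sized segments when the skb is smaller than the raw allowance, and tso_fragment() splits the skb at that boundary (tcp_fragment() still handles the non-TSO case). A standalone model of the limit computation shared by both paths (function name invented):

#include <stdio.h>

/* Bytes of this skb that may be transmitted right now, mirroring
 * tcp_window_allows() plus the "skb->len % mss_now" trim.
 */
static unsigned int sendable_bytes(unsigned int skb_len,
                                   unsigned int window_bytes,
                                   unsigned int cwnd_quota_segs,
                                   unsigned int mss)
{
        unsigned int limit = cwnd_quota_segs * mss;

        if (limit > window_bytes)
                limit = window_bytes;           /* tcp_window_allows() */

        if (skb_len < limit) {
                unsigned int trim = skb_len % mss;

                if (trim)
                        limit = skb_len - trim; /* keep only full segments */
                else
                        limit = skb_len;
        }
        return limit;
}

int main(void)
{
        /* 30000-byte skb, 64000-byte window, 10-segment quota, 1448 MSS */
        printf("send %u bytes now, keep the rest queued\n",
               sendable_bytes(30000, 64000, 10, 1448));
        return 0;
}
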
@@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1039 if (sk->sk_route_caps & NETIF_F_TSO) { 1369 if (sk->sk_route_caps & NETIF_F_TSO) {
1040 sk->sk_route_caps &= ~NETIF_F_TSO; 1370 sk->sk_route_caps &= ~NETIF_F_TSO;
1041 sock_set_flag(sk, SOCK_NO_LARGESEND); 1371 sock_set_flag(sk, SOCK_NO_LARGESEND);
1042 tp->mss_cache = tp->mss_cache_std;
1043 } 1372 }
1044 1373
1045 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1374 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1101 * is still in somebody's hands, else make a clone. 1430 * is still in somebody's hands, else make a clone.
1102 */ 1431 */
1103 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1104 tcp_tso_set_push(skb);
1105 1433
1106 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1434 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1107 pskb_copy(skb, GFP_ATOMIC): 1435 pskb_copy(skb, GFP_ATOMIC):
@@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
1670 if (sk->sk_route_caps & NETIF_F_TSO) { 1998 if (sk->sk_route_caps & NETIF_F_TSO) {
1671 sock_set_flag(sk, SOCK_NO_LARGESEND); 1999 sock_set_flag(sk, SOCK_NO_LARGESEND);
1672 sk->sk_route_caps &= ~NETIF_F_TSO; 2000 sk->sk_route_caps &= ~NETIF_F_TSO;
1673 tp->mss_cache = tp->mss_cache_std;
1674 } 2001 }
1675 } else if (!tcp_skb_pcount(skb)) 2002 } else if (!tcp_skb_pcount(skb))
1676 tcp_set_skb_tso_segs(sk, skb); 2003 tcp_set_skb_tso_segs(sk, skb);
1677 2004
1678 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2005 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1679 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2006 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1680 tcp_tso_set_push(skb);
1681 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 2007 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1682 if (!err) { 2008 if (!err) {
1683 update_send_head(sk, tp, skb); 2009 update_send_head(sk, tp, skb);
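
The retransmit and zero-window-probe paths above also lose their mss_cache_std restore and the tcp_tso_set_push() call: once NETIF_F_TSO is cleared for a socket there is no separate "standard" MSS to put back, since the next tcp_current_mss() call recomputes tp->xmit_size_goal as a single MSS anyway. A small sketch of that fallback, with super_packet_max standing in for the 64 KB limit minus headers computed in tcp_current_mss() (the clamp against half the peer's maximum window is left out):

#include <stdio.h>

static unsigned int xmit_size_goal(unsigned int mss, int tso_enabled,
                                   unsigned int super_packet_max)
{
        unsigned int goal = mss;                /* non-TSO sockets: one MSS */

        if (tso_enabled) {
                goal = super_packet_max;
                goal -= goal % mss;             /* keep the goal an MSS multiple */
        }
        return goal;
}

int main(void)
{
        unsigned int max = 65535 - 40 - 20;     /* illustrative header overhead */

        printf("TSO on:  %u\n", xmit_size_goal(1448, 1, max));
        printf("TSO off: %u\n", xmit_size_goal(1448, 0, max));
        return 0;
}
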
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3df49a..28d9bcab0970 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
774 if (if6_proc_init()) 774 if (if6_proc_init())
775 goto proc_if6_fail; 775 goto proc_if6_fail;
776#endif 776#endif
777 ipv6_packet_init();
778 ip6_route_init(); 777 ip6_route_init();
779 ip6_flowlabel_init(); 778 ip6_flowlabel_init();
780 err = addrconf_init(); 779 err = addrconf_init();
@@ -791,6 +790,8 @@ static int __init inet6_init(void)
791 /* Init v6 transport protocols. */ 790 /* Init v6 transport protocols. */
792 udpv6_init(); 791 udpv6_init();
793 tcpv6_init(); 792 tcpv6_init();
793
794 ipv6_packet_init();
794 err = 0; 795 err = 0;
795out: 796out:
796 return err; 797 return err;
@@ -798,7 +799,6 @@ out:
798addrconf_fail: 799addrconf_fail:
799 ip6_flowlabel_cleanup(); 800 ip6_flowlabel_cleanup();
800 ip6_route_cleanup(); 801 ip6_route_cleanup();
801 ipv6_packet_cleanup();
802#ifdef CONFIG_PROC_FS 802#ifdef CONFIG_PROC_FS
803 if6_proc_exit(); 803 if6_proc_exit();
804proc_if6_fail: 804proc_if6_fail:
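
The reordering above matters because ipv6_packet_init() is what makes the stack reachable from the device layer: it registers the ETH_P_IPV6 packet handler, so running it only after udpv6_init() and tcpv6_init() guarantees no IPv6 frame is delivered to a transport that is not yet initialised. A rough sketch of what that registration amounts to, assuming the usual dev_add_pack() pattern:

    /* Assumed shape of the registration done by ipv6_packet_init(). */
    static struct packet_type ipv6_packet_type_sketch = {
    	.type	= __constant_htons(ETH_P_IPV6),
    	.func	= ipv6_rcv,			/* entry point for received IPv6 frames */
    };

    static void sketch_ipv6_packet_init(void)
    {
    	dev_add_pack(&ipv6_packet_type_sketch);	/* from here on, frames can arrive */
    }

That is also why ipv6_packet_cleanup() disappears from the addrconf_fail unwind path: at that point the handler has not been registered yet.
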
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 06e7cdaeedc5..1f2c2f9e353f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
465 to->pkt_type = from->pkt_type; 465 to->pkt_type = from->pkt_type;
466 to->priority = from->priority; 466 to->priority = from->priority;
467 to->protocol = from->protocol; 467 to->protocol = from->protocol;
468 to->security = from->security;
469 dst_release(to->dst); 468 dst_release(to->dst);
470 to->dst = dst_clone(from->dst); 469 to->dst = dst_clone(from->dst);
471 to->dev = from->dev; 470 to->dev = from->dev;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9dac7fdf4726..f6e288dc116e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk)
2018 */ 2018 */
2019 tp->snd_ssthresh = 0x7fffffff; 2019 tp->snd_ssthresh = 0x7fffffff;
2020 tp->snd_cwnd_clamp = ~0; 2020 tp->snd_cwnd_clamp = ~0;
2021 tp->mss_cache_std = tp->mss_cache = 536; 2021 tp->mss_cache = 536;
2022 2022
2023 tp->reordering = sysctl_tcp_reordering; 2023 tp->reordering = sysctl_tcp_reordering;
2024 2024
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8f58cecd6266..e48d0d456b3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y := sch_generic.o 5obj-y := sch_generic.o
6 6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o 7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
8obj-$(CONFIG_NET_CLS) += cls_api.o 8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o 9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o 10obj-$(CONFIG_NET_ACT_POLICE) += police.o
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c2a35a..53d98f8d3d80 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol)
205 dst->value = skb->protocol; 205 dst->value = skb->protocol;
206} 206}
207 207
208META_COLLECTOR(int_security)
209{
210 dst->value = skb->security;
211}
212
213META_COLLECTOR(int_pkttype) 208META_COLLECTOR(int_pkttype)
214{ 209{
215 dst->value = skb->pkt_type; 210 dst->value = skb->pkt_type;
@@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
524 [META_ID(REALDEV)] = META_FUNC(int_realdev), 519 [META_ID(REALDEV)] = META_FUNC(int_realdev),
525 [META_ID(PRIORITY)] = META_FUNC(int_priority), 520 [META_ID(PRIORITY)] = META_FUNC(int_priority),
526 [META_ID(PROTOCOL)] = META_FUNC(int_protocol), 521 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
527 [META_ID(SECURITY)] = META_FUNC(int_security),
528 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), 522 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
529 [META_ID(PKTLEN)] = META_FUNC(int_pktlen), 523 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
530 [META_ID(DATALEN)] = META_FUNC(int_datalen), 524 [META_ID(DATALEN)] = META_FUNC(int_datalen),
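
With skb->security gone, the int_security collector above has nothing left to read, so both the collector and its SECURITY slot in the ops table are dropped. For reference, collectors of this kind are one-line wrappers around the META_COLLECTOR() macro, exactly like the int_pkttype collector kept above; a hypothetical collector exposing the checksum status would follow the same shape (illustration only, not part of this patch):

    META_COLLECTOR(int_csum_state)
    {
    	dst->value = skb->ip_summed;	/* hypothetical: expose checksum status, mirroring int_pkttype */
    }
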
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 05e6e0a799da..b9a069af4a02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{ 399{
400 int err; 400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1]; 401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch; 402 struct Qdisc *sch;
404 struct Qdisc_ops *ops; 403 struct Qdisc_ops *ops;
405 int size;
406 404
407 ops = qdisc_lookup_ops(kind); 405 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD 406#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
437 if (ops == NULL) 435 if (ops == NULL)
438 goto err_out; 436 goto err_out;
439 437
440 /* ensure that the Qdisc and the private data are 32-byte aligned */ 438 sch = qdisc_alloc(dev, ops);
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 439 if (IS_ERR(sch)) {
442 size += ops->priv_size + QDISC_ALIGN_CONST; 440 err = PTR_ERR(sch);
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2; 441 goto err_out2;
448 memset(p, 0, size); 442 }
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455 443
456 if (handle == TC_H_INGRESS) 444 if (handle == TC_H_INGRESS) {
457 sch->flags |= TCQ_F_INGRESS; 445 sch->flags |= TCQ_F_INGRESS;
458 446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
459 sch->ops = ops; 447 } else if (handle == 0) {
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev); 448 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM; 449 err = -ENOMEM;
469 if (handle == 0) 450 if (handle == 0)
470 goto err_out3; 451 goto err_out3;
471 } 452 }
472 453
473 if (handle == TC_H_INGRESS) 454 sch->handle = handle;
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477 455
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
463 /*
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
467 */
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
471 }
472 }
473#endif
479 qdisc_lock_tree(dev); 474 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list); 475 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev); 476 qdisc_unlock_tree(dev);
482 477
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch; 478 return sch;
489 } 479 }
490err_out3: 480err_out3:
491 dev_put(dev); 481 dev_put(dev);
482 kfree((char *) sch - sch->padded);
492err_out2: 483err_out2:
493 module_put(ops->owner); 484 module_put(ops->owner);
494err_out: 485err_out:
495 *errp = err; 486 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL; 487 return NULL;
499} 488}
500 489
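
The rewritten qdisc_create() delegates allocation to qdisc_alloc() and uses the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention to carry an errno through a single pointer return, replacing the old kmalloc/ENOBUFS boilerplate. A minimal sketch of that calling convention, as a standalone illustration rather than code from this patch:

    /* Sketch of the ERR_PTR convention qdisc_create() now depends on. */
    static struct Qdisc *sketch_get_qdisc(struct net_device *dev, struct Qdisc_ops *ops)
    {
    	struct Qdisc *sch = qdisc_alloc(dev, ops);	/* ERR_PTR(-ENOBUFS) on failure */

    	if (IS_ERR(sch)) {
    		printk(KERN_DEBUG "qdisc_alloc failed: %ld\n", PTR_ERR(sch));
    		return NULL;
    	}
    	return sch;
    }

The other behavioural change in this hunk is that a failing gen_new_estimator() now destroys the freshly created qdisc instead of being ignored, so a qdisc is never added to dev->qdisc_list with a half-configured rate estimator.
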
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000000..81f0b8346d17
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
1/*
2 * net/sched/sch_blackhole.c Black hole queue
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * Note: Quantum tunneling is not supported.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/netdevice.h>
19#include <linux/skbuff.h>
20#include <net/pkt_sched.h>
21
22static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
23{
24 qdisc_drop(skb, sch);
25 return NET_XMIT_SUCCESS;
26}
27
28static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
29{
30 return NULL;
31}
32
33static struct Qdisc_ops blackhole_qdisc_ops = {
34 .id = "blackhole",
35 .priv_size = 0,
36 .enqueue = blackhole_enqueue,
37 .dequeue = blackhole_dequeue,
38 .owner = THIS_MODULE,
39};
40
41static int __init blackhole_module_init(void)
42{
43 return register_qdisc(&blackhole_qdisc_ops);
44}
45
46static void __exit blackhole_module_exit(void)
47{
48 unregister_qdisc(&blackhole_qdisc_ops);
49}
50
51module_init(blackhole_module_init)
52module_exit(blackhole_module_exit)
53
54MODULE_LICENSE("GPL");
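
blackhole_enqueue() relies on qdisc_drop() to free and account for the packet while still reporting NET_XMIT_SUCCESS, so callers see a queue that always accepts traffic and never triggers backoff. A rough sketch of what that helper amounts to, assuming the usual inline in include/net/pkt_sched.h:

    /* Assumed shape of qdisc_drop(): free the skb and count it as a drop. */
    static inline int sketch_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
    {
    	kfree_skb(skb);			/* the packet is gone for good */
    	sch->qstats.drops++;		/* charged to the qdisc's drop counter */
    	return NET_XMIT_DROP;		/* blackhole_enqueue() ignores this and reports success */
    }

Once the matching "blackhole" id is known to userspace tc, the qdisc can be attached like any other classless discipline, for example as a device's root qdisc.
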
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7683b34dc6a9..73e218e646ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
395 .owner = THIS_MODULE, 395 .owner = THIS_MODULE,
396}; 396};
397 397
398struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) 398struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
399{ 399{
400 void *p; 400 void *p;
401 struct Qdisc *sch; 401 struct Qdisc *sch;
402 int size; 402 unsigned int size;
403 int err = -ENOBUFS;
403 404
404 /* ensure that the Qdisc and the private data are 32-byte aligned */ 405 /* ensure that the Qdisc and the private data are 32-byte aligned */
405 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 406 size = QDISC_ALIGN(sizeof(*sch));
406 size += ops->priv_size + QDISC_ALIGN_CONST; 407 size += ops->priv_size + (QDISC_ALIGNTO - 1);
407 408
408 p = kmalloc(size, GFP_KERNEL); 409 p = kmalloc(size, GFP_KERNEL);
409 if (!p) 410 if (!p)
410 return NULL; 411 goto errout;
411 memset(p, 0, size); 412 memset(p, 0, size);
412 413 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
413 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) 414 sch->padded = (char *) sch - (char *) p;
414 & ~QDISC_ALIGN_CONST);
415 sch->padded = (char *)sch - (char *)p;
416 415
417 INIT_LIST_HEAD(&sch->list); 416 INIT_LIST_HEAD(&sch->list);
418 skb_queue_head_init(&sch->q); 417 skb_queue_head_init(&sch->q);
@@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
423 dev_hold(dev); 422 dev_hold(dev);
424 sch->stats_lock = &dev->queue_lock; 423 sch->stats_lock = &dev->queue_lock;
425 atomic_set(&sch->refcnt, 1); 424 atomic_set(&sch->refcnt, 1);
425
426 return sch;
427errout:
 428 	return ERR_PTR(err);
429}
430
431struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
432{
433 struct Qdisc *sch;
434
435 sch = qdisc_alloc(dev, ops);
436 if (IS_ERR(sch))
437 goto errout;
438
426 if (!ops->init || ops->init(sch, NULL) == 0) 439 if (!ops->init || ops->init(sch, NULL) == 0)
427 return sch; 440 return sch;
428 441
429 dev_put(dev); 442errout:
430 kfree(p);
431 return NULL; 443 return NULL;
432} 444}
433 445
@@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
591EXPORT_SYMBOL(noop_qdisc); 603EXPORT_SYMBOL(noop_qdisc);
592EXPORT_SYMBOL(noop_qdisc_ops); 604EXPORT_SYMBOL(noop_qdisc_ops);
593EXPORT_SYMBOL(qdisc_create_dflt); 605EXPORT_SYMBOL(qdisc_create_dflt);
606EXPORT_SYMBOL(qdisc_alloc);
594EXPORT_SYMBOL(qdisc_destroy); 607EXPORT_SYMBOL(qdisc_destroy);
595EXPORT_SYMBOL(qdisc_reset); 608EXPORT_SYMBOL(qdisc_reset);
596EXPORT_SYMBOL(qdisc_restart); 609EXPORT_SYMBOL(qdisc_restart);
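
The arithmetic in qdisc_alloc() replaces the open-coded QDISC_ALIGN_CONST masking with the QDISC_ALIGN()/QDISC_ALIGNTO helpers added to include/net/pkt_sched.h in this series. A worked sketch of that math, assuming the helpers have the usual round-up shape and QDISC_ALIGNTO is 32 (the in-line comment promises 32-byte alignment):

    /* Assumed helper shapes; the real definitions live in include/net/pkt_sched.h. */
    #define SKETCH_ALIGNTO		32
    #define SKETCH_ALIGN(len)	(((len) + SKETCH_ALIGNTO - 1) & ~(SKETCH_ALIGNTO - 1))

    /* Worked example with arbitrary sizes: sizeof(struct Qdisc) == 140, priv_size == 24.
     *   SKETCH_ALIGN(140)             == 160  header rounded up to the next 32-byte boundary
     *   160 + 24 + (SKETCH_ALIGNTO-1) == 215  slack so a misaligned kmalloc() result can be realigned
     *   sch         = SKETCH_ALIGN(p)         pointer rounded up within the allocation
     *   sch->padded = (char *)sch - (char *)p remembered so the free path can hand the original
     *                                         pointer back to kfree()
     */

This is the offset that qdisc_create()'s err_out3 path undoes with kfree((char *)sch - sch->padded).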