Diffstat (limited to 'net')
-rw-r--r--  net/core/dev.c               125
-rw-r--r--  net/core/neighbour.c           3
-rw-r--r--  net/core/skbuff.c            157
-rw-r--r--  net/core/sysctl_net_core.c    61
-rw-r--r--  net/ipv4/Kconfig             172
-rw-r--r--  net/ipv4/Makefile             10
-rw-r--r--  net/ipv4/devinet.c             2
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c   114
-rw-r--r--  net/ipv4/tcp.c                33
-rw-r--r--  net/ipv4/tcp_bic.c           331
-rw-r--r--  net/ipv4/tcp_cong.c          237
-rw-r--r--  net/ipv4/tcp_diag.c           34
-rw-r--r--  net/ipv4/tcp_highspeed.c     181
-rw-r--r--  net/ipv4/tcp_htcp.c          289
-rw-r--r--  net/ipv4/tcp_hybla.c         187
-rw-r--r--  net/ipv4/tcp_input.c         737
-rw-r--r--  net/ipv4/tcp_ipv4.c            3
-rw-r--r--  net/ipv4/tcp_minisocks.c       4
-rw-r--r--  net/ipv4/tcp_output.c         23
-rw-r--r--  net/ipv4/tcp_scalable.c       68
-rw-r--r--  net/ipv4/tcp_vegas.c         411
-rw-r--r--  net/ipv4/tcp_westwood.c      259
-rw-r--r--  net/ipv6/addrconf.c            3
-rw-r--r--  net/ipv6/tcp_ipv6.c            2
-rw-r--r--  net/rxrpc/krxiod.c             2
-rw-r--r--  net/rxrpc/krxsecd.c            2
-rw-r--r--  net/rxrpc/krxtimod.c           2
-rw-r--r--  net/sched/Kconfig             13
-rw-r--r--  net/sched/Makefile             1
-rw-r--r--  net/sched/em_text.c          157
-rw-r--r--  net/sctp/sm_statefuns.c       16
-rw-r--r--  net/sunrpc/sunrpc_syms.c       1
-rw-r--r--  net/sunrpc/svcauth_unix.c     11
-rw-r--r--  net/sunrpc/svcsock.c           6
-rw-r--r--  net/sunrpc/xprt.c              4
35 files changed, 2646 insertions, 1015 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index ab935778ce81..7016e0c36b3d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
115#endif /* CONFIG_NET_RADIO */ 115#endif /* CONFIG_NET_RADIO */
116#include <asm/current.h> 116#include <asm/current.h>
117 117
118/* This define, if set, will randomly drop a packet when congestion
119 * is more than moderate. It helps fairness in the multi-interface
120 * case when one of them is a hog, but it kills performance for the
121 * single interface case so it is off now by default.
122 */
123#undef RAND_LIE
124
125/* Setting this will sample the queue lengths and thus congestion
126 * via a timer instead of as each packet is received.
127 */
128#undef OFFLINE_SAMPLE
129
130/* 118/*
131 * The list of packet types we will receive (as opposed to discard) 119 * The list of packet types we will receive (as opposed to discard)
132 * and the routines to invoke. 120 * and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
159static struct list_head ptype_base[16]; /* 16 way hashed list */ 147static struct list_head ptype_base[16]; /* 16 way hashed list */
160static struct list_head ptype_all; /* Taps */ 148static struct list_head ptype_all; /* Taps */
161 149
162#ifdef OFFLINE_SAMPLE
163static void sample_queue(unsigned long dummy);
164static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
165#endif
166
167/* 150/*
168 * The @dev_base list is protected by @dev_base_lock and the rtln 151 * The @dev_base list is protected by @dev_base_lock and the rtln
169 * semaphore. 152 * semaphore.
@@ -215,7 +198,7 @@ static struct notifier_block *netdev_chain;
215 * Device drivers call our routines to queue packets here. We empty the 198 * Device drivers call our routines to queue packets here. We empty the
216 * queue in the local softnet handler. 199 * queue in the local softnet handler.
217 */ 200 */
218DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; 201DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
219 202
220#ifdef CONFIG_SYSFS 203#ifdef CONFIG_SYSFS
221extern int netdev_sysfs_init(void); 204extern int netdev_sysfs_init(void);
@@ -1363,71 +1346,13 @@ out:
1363 Receiver routines 1346 Receiver routines
1364 =======================================================================*/ 1347 =======================================================================*/
1365 1348
1366int netdev_max_backlog = 300; 1349int netdev_max_backlog = 1000;
1350int netdev_budget = 300;
1367int weight_p = 64; /* old backlog weight */ 1351int weight_p = 64; /* old backlog weight */
1368/* These numbers are selected based on intuition and some
1369 * experimentatiom, if you have more scientific way of doing this
1370 * please go ahead and fix things.
1371 */
1372int no_cong_thresh = 10;
1373int no_cong = 20;
1374int lo_cong = 100;
1375int mod_cong = 290;
1376 1352
1377DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; 1353DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1378 1354
1379 1355
1380static void get_sample_stats(int cpu)
1381{
1382#ifdef RAND_LIE
1383 unsigned long rd;
1384 int rq;
1385#endif
1386 struct softnet_data *sd = &per_cpu(softnet_data, cpu);
1387 int blog = sd->input_pkt_queue.qlen;
1388 int avg_blog = sd->avg_blog;
1389
1390 avg_blog = (avg_blog >> 1) + (blog >> 1);
1391
1392 if (avg_blog > mod_cong) {
1393 /* Above moderate congestion levels. */
1394 sd->cng_level = NET_RX_CN_HIGH;
1395#ifdef RAND_LIE
1396 rd = net_random();
1397 rq = rd % netdev_max_backlog;
1398 if (rq < avg_blog) /* unlucky bastard */
1399 sd->cng_level = NET_RX_DROP;
1400#endif
1401 } else if (avg_blog > lo_cong) {
1402 sd->cng_level = NET_RX_CN_MOD;
1403#ifdef RAND_LIE
1404 rd = net_random();
1405 rq = rd % netdev_max_backlog;
1406 if (rq < avg_blog) /* unlucky bastard */
1407 sd->cng_level = NET_RX_CN_HIGH;
1408#endif
1409 } else if (avg_blog > no_cong)
1410 sd->cng_level = NET_RX_CN_LOW;
1411 else /* no congestion */
1412 sd->cng_level = NET_RX_SUCCESS;
1413
1414 sd->avg_blog = avg_blog;
1415}
1416
1417#ifdef OFFLINE_SAMPLE
1418static void sample_queue(unsigned long dummy)
1419{
1420/* 10 ms 0r 1ms -- i don't care -- JHS */
1421 int next_tick = 1;
1422 int cpu = smp_processor_id();
1423
1424 get_sample_stats(cpu);
1425 next_tick += jiffies;
1426 mod_timer(&samp_timer, next_tick);
1427}
1428#endif
1429
1430
1431/** 1356/**
1432 * netif_rx - post buffer to the network code 1357 * netif_rx - post buffer to the network code
1433 * @skb: buffer to post 1358 * @skb: buffer to post
@@ -1448,7 +1373,6 @@ static void sample_queue(unsigned long dummy)
1448 1373
1449int netif_rx(struct sk_buff *skb) 1374int netif_rx(struct sk_buff *skb)
1450{ 1375{
1451 int this_cpu;
1452 struct softnet_data *queue; 1376 struct softnet_data *queue;
1453 unsigned long flags; 1377 unsigned long flags;
1454 1378
@@ -1464,38 +1388,22 @@ int netif_rx(struct sk_buff *skb)
1464 * short when CPU is congested, but is still operating. 1388 * short when CPU is congested, but is still operating.
1465 */ 1389 */
1466 local_irq_save(flags); 1390 local_irq_save(flags);
1467 this_cpu = smp_processor_id();
1468 queue = &__get_cpu_var(softnet_data); 1391 queue = &__get_cpu_var(softnet_data);
1469 1392
1470 __get_cpu_var(netdev_rx_stat).total++; 1393 __get_cpu_var(netdev_rx_stat).total++;
1471 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { 1394 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1472 if (queue->input_pkt_queue.qlen) { 1395 if (queue->input_pkt_queue.qlen) {
1473 if (queue->throttle)
1474 goto drop;
1475
1476enqueue: 1396enqueue:
1477 dev_hold(skb->dev); 1397 dev_hold(skb->dev);
1478 __skb_queue_tail(&queue->input_pkt_queue, skb); 1398 __skb_queue_tail(&queue->input_pkt_queue, skb);
1479#ifndef OFFLINE_SAMPLE
1480 get_sample_stats(this_cpu);
1481#endif
1482 local_irq_restore(flags); 1399 local_irq_restore(flags);
1483 return queue->cng_level; 1400 return NET_RX_SUCCESS;
1484 } 1401 }
1485 1402
1486 if (queue->throttle)
1487 queue->throttle = 0;
1488
1489 netif_rx_schedule(&queue->backlog_dev); 1403 netif_rx_schedule(&queue->backlog_dev);
1490 goto enqueue; 1404 goto enqueue;
1491 } 1405 }
1492 1406
1493 if (!queue->throttle) {
1494 queue->throttle = 1;
1495 __get_cpu_var(netdev_rx_stat).throttled++;
1496 }
1497
1498drop:
1499 __get_cpu_var(netdev_rx_stat).dropped++; 1407 __get_cpu_var(netdev_rx_stat).dropped++;
1500 local_irq_restore(flags); 1408 local_irq_restore(flags);
1501 1409
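
With the congestion-level machinery removed, netif_rx() now reports only NET_RX_SUCCESS or NET_RX_DROP instead of a graded congestion code. A minimal sketch (not part of this patch; example_rx and its buffer handling are hypothetical) of how a driver receive path uses it after this change; note that the drop path inside netif_rx() already frees the buffer, so the caller must not touch it afterwards:

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static void example_rx(struct net_device *dev, const void *buf, unsigned int len)
{
        struct sk_buff *skb = dev_alloc_skb(len + NET_IP_ALIGN);

        if (!skb)
                return;                         /* allocation failure: drop */

        skb_reserve(skb, NET_IP_ALIGN);         /* align the IP header */
        memcpy(skb_put(skb, len), buf, len);
        skb->protocol = eth_type_trans(skb, dev);       /* also sets skb->dev */

        /* Only two outcomes remain after this patch. */
        if (netif_rx(skb) == NET_RX_DROP) {
                /* backlog was full; netif_rx() has already freed the skb */
        }
}
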
@@ -1780,8 +1688,6 @@ job_done:
1780 smp_mb__before_clear_bit(); 1688 smp_mb__before_clear_bit();
1781 netif_poll_enable(backlog_dev); 1689 netif_poll_enable(backlog_dev);
1782 1690
1783 if (queue->throttle)
1784 queue->throttle = 0;
1785 local_irq_enable(); 1691 local_irq_enable();
1786 return 0; 1692 return 0;
1787} 1693}
@@ -1790,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h)
1790{ 1696{
1791 struct softnet_data *queue = &__get_cpu_var(softnet_data); 1697 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1792 unsigned long start_time = jiffies; 1698 unsigned long start_time = jiffies;
1793 int budget = netdev_max_backlog; 1699 int budget = netdev_budget;
1794
1795 1700
1796 local_irq_disable(); 1701 local_irq_disable();
1797 1702
@@ -2055,15 +1960,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
2055 struct netif_rx_stats *s = v; 1960 struct netif_rx_stats *s = v;
2056 1961
2057 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 1962 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2058 s->total, s->dropped, s->time_squeeze, s->throttled, 1963 s->total, s->dropped, s->time_squeeze, 0,
2059 s->fastroute_hit, s->fastroute_success, s->fastroute_defer, 1964 0, 0, 0, 0, /* was fastroute */
2060 s->fastroute_deferred_out, 1965 s->cpu_collision );
2061#if 0
2062 s->fastroute_latency_reduction
2063#else
2064 s->cpu_collision
2065#endif
2066 );
2067 return 0; 1966 return 0;
2068} 1967}
2069 1968
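
The /proc/net/softnet_stat format keeps its nine hex columns, with the throttled and fastroute fields now printed as zeros for compatibility. A small userspace sketch (an illustration, not part of the patch) that parses the file under this layout:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/net/softnet_stat", "r");
        unsigned int total, dropped, squeezed, ignore[5], collision;
        int cpu = 0;

        if (!f) {
                perror("/proc/net/softnet_stat");
                return 1;
        }
        /* columns: total dropped time_squeeze <5 zeroed fields> cpu_collision */
        while (fscanf(f, "%x %x %x %x %x %x %x %x %x",
                      &total, &dropped, &squeezed,
                      &ignore[0], &ignore[1], &ignore[2], &ignore[3], &ignore[4],
                      &collision) == 9) {
                printf("cpu%d: processed=%u dropped=%u time_squeeze=%u cpu_collision=%u\n",
                       cpu++, total, dropped, squeezed, collision);
        }
        fclose(f);
        return 0;
}
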
@@ -3305,9 +3204,6 @@ static int __init net_dev_init(void)
3305 3204
3306 queue = &per_cpu(softnet_data, i); 3205 queue = &per_cpu(softnet_data, i);
3307 skb_queue_head_init(&queue->input_pkt_queue); 3206 skb_queue_head_init(&queue->input_pkt_queue);
3308 queue->throttle = 0;
3309 queue->cng_level = 0;
3310 queue->avg_blog = 10; /* arbitrary non-zero */
3311 queue->completion_queue = NULL; 3207 queue->completion_queue = NULL;
3312 INIT_LIST_HEAD(&queue->poll_list); 3208 INIT_LIST_HEAD(&queue->poll_list);
3313 set_bit(__LINK_STATE_START, &queue->backlog_dev.state); 3209 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3316,11 +3212,6 @@ static int __init net_dev_init(void)
3316 atomic_set(&queue->backlog_dev.refcnt, 1); 3212 atomic_set(&queue->backlog_dev.refcnt, 1);
3317 } 3213 }
3318 3214
3319#ifdef OFFLINE_SAMPLE
3320 samp_timer.expires = jiffies + (10 * HZ);
3321 add_timer(&samp_timer);
3322#endif
3323
3324 dev_boot_phase = 0; 3215 dev_boot_phase = 0;
3325 3216
3326 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); 3217 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f6bdcad47da6..851eb927ed97 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -32,6 +32,7 @@
32#include <net/sock.h> 32#include <net/sock.h>
33#include <linux/rtnetlink.h> 33#include <linux/rtnetlink.h>
34#include <linux/random.h> 34#include <linux/random.h>
35#include <linux/string.h>
35 36
36#define NEIGH_DEBUG 1 37#define NEIGH_DEBUG 1
37 38
@@ -2592,7 +2593,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
2592 t->neigh_vars[17].extra1 = dev; 2593 t->neigh_vars[17].extra1 = dev;
2593 } 2594 }
2594 2595
2595 dev_name = net_sysctl_strdup(dev_name_source); 2596 dev_name = kstrdup(dev_name_source, GFP_KERNEL);
2596 if (!dev_name) { 2597 if (!dev_name) {
2597 err = -ENOBUFS; 2598 err = -ENOBUFS;
2598 goto free; 2599 goto free;
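
This hunk, like the devinet.c one further down, switches from the net-private net_sysctl_strdup() to the generic kstrdup(). Purely as a hedged illustration of the call pattern (dup_sysctl_name is a hypothetical helper, not code from this series):

#include <linux/slab.h>
#include <linux/string.h>

/* Duplicate a device name for a per-device sysctl table. kstrdup()
 * allocates with the requested GFP flags and copies the string; the
 * result is released with kfree() when the table is torn down.
 */
static char *dup_sysctl_name(const char *src)
{
        return kstrdup(src, GFP_KERNEL);        /* NULL on allocation failure */
}
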
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6d68c03bc051..bb73b2190ec7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1500,6 +1500,159 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
1500 skb_split_no_header(skb, skb1, len, pos); 1500 skb_split_no_header(skb, skb1, len, pos);
1501} 1501}
1502 1502
1503/**
1504 * skb_prepare_seq_read - Prepare a sequential read of skb data
1505 * @skb: the buffer to read
1506 * @from: lower offset of data to be read
1507 * @to: upper offset of data to be read
1508 * @st: state variable
1509 *
1510 * Initializes the specified state variable. Must be called before
1511 * invoking skb_seq_read() for the first time.
1512 */
1513void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
1514 unsigned int to, struct skb_seq_state *st)
1515{
1516 st->lower_offset = from;
1517 st->upper_offset = to;
1518 st->root_skb = st->cur_skb = skb;
1519 st->frag_idx = st->stepped_offset = 0;
1520 st->frag_data = NULL;
1521}
1522
1523/**
1524 * skb_seq_read - Sequentially read skb data
1525 * @consumed: number of bytes consumed by the caller so far
1526 * @data: destination pointer for data to be returned
1527 * @st: state variable
1528 *
1529 * Reads a block of skb data at &consumed relative to the
1530 * lower offset specified to skb_prepare_seq_read(). Assigns
1531 * the head of the data block to &data and returns the length
1532 * of the block or 0 if the end of the skb data or the upper
1533 * offset has been reached.
1534 *
1535 * The caller is not required to consume all of the data
1536 * returned, i.e. &consumed is typically set to the number
1537 * of bytes already consumed and the next call to
1538 * skb_seq_read() will return the remaining part of the block.
1539 *
1540 * Note: The size of each block of data returned can be arbitrary;
1541 * this limitation is the cost of zerocopy sequential
1542 * reads of potentially non-linear data.
1543 *
1544 * Note: Fragment lists within fragments are not implemented
1545 * at the moment, state->root_skb could be replaced with
1546 * a stack for this purpose.
1547 */
1548unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
1549 struct skb_seq_state *st)
1550{
1551 unsigned int block_limit, abs_offset = consumed + st->lower_offset;
1552 skb_frag_t *frag;
1553
1554 if (unlikely(abs_offset >= st->upper_offset))
1555 return 0;
1556
1557next_skb:
1558 block_limit = skb_headlen(st->cur_skb);
1559
1560 if (abs_offset < block_limit) {
1561 *data = st->cur_skb->data + abs_offset;
1562 return block_limit - abs_offset;
1563 }
1564
1565 if (st->frag_idx == 0 && !st->frag_data)
1566 st->stepped_offset += skb_headlen(st->cur_skb);
1567
1568 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
1569 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
1570 block_limit = frag->size + st->stepped_offset;
1571
1572 if (abs_offset < block_limit) {
1573 if (!st->frag_data)
1574 st->frag_data = kmap_skb_frag(frag);
1575
1576 *data = (u8 *) st->frag_data + frag->page_offset +
1577 (abs_offset - st->stepped_offset);
1578
1579 return block_limit - abs_offset;
1580 }
1581
1582 if (st->frag_data) {
1583 kunmap_skb_frag(st->frag_data);
1584 st->frag_data = NULL;
1585 }
1586
1587 st->frag_idx++;
1588 st->stepped_offset += frag->size;
1589 }
1590
1591 if (st->cur_skb->next) {
1592 st->cur_skb = st->cur_skb->next;
1593 st->frag_idx = 0;
1594 goto next_skb;
1595 } else if (st->root_skb == st->cur_skb &&
1596 skb_shinfo(st->root_skb)->frag_list) {
1597 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
1598 goto next_skb;
1599 }
1600
1601 return 0;
1602}
1603
1604/**
1605 * skb_abort_seq_read - Abort a sequential read of skb data
1606 * @st: state variable
1607 *
1608 * Must be called if the sequential read is abandoned before
1609 * skb_seq_read() has returned 0.
1610 */
1611void skb_abort_seq_read(struct skb_seq_state *st)
1612{
1613 if (st->frag_data)
1614 kunmap_skb_frag(st->frag_data);
1615}
1616
1617#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
1618
1619static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
1620 struct ts_config *conf,
1621 struct ts_state *state)
1622{
1623 return skb_seq_read(offset, text, TS_SKB_CB(state));
1624}
1625
1626static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
1627{
1628 skb_abort_seq_read(TS_SKB_CB(state));
1629}
1630
1631/**
1632 * skb_find_text - Find a text pattern in skb data
1633 * @skb: the buffer to look in
1634 * @from: search offset
1635 * @to: search limit
1636 * @config: textsearch configuration
1637 * @state: uninitialized textsearch state variable
1638 *
1639 * Finds a pattern in the skb data according to the specified
1640 * textsearch configuration. Use textsearch_next() to retrieve
1641 * subsequent occurrences of the pattern. Returns the offset
1642 * to the first occurrence or UINT_MAX if no match was found.
1643 */
1644unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
1645 unsigned int to, struct ts_config *config,
1646 struct ts_state *state)
1647{
1648 config->get_next_block = skb_ts_get_next_block;
1649 config->finish = skb_ts_finish;
1650
1651 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
1652
1653 return textsearch_find(config, state);
1654}
1655
1503void __init skb_init(void) 1656void __init skb_init(void)
1504{ 1657{
1505 skbuff_head_cache = kmem_cache_create("skbuff_head_cache", 1658 skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1538,3 +1691,7 @@ EXPORT_SYMBOL(skb_queue_tail);
1538EXPORT_SYMBOL(skb_unlink); 1691EXPORT_SYMBOL(skb_unlink);
1539EXPORT_SYMBOL(skb_append); 1692EXPORT_SYMBOL(skb_append);
1540EXPORT_SYMBOL(skb_split); 1693EXPORT_SYMBOL(skb_split);
1694EXPORT_SYMBOL(skb_prepare_seq_read);
1695EXPORT_SYMBOL(skb_seq_read);
1696EXPORT_SYMBOL(skb_abort_seq_read);
1697EXPORT_SYMBOL(skb_find_text);
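
To make the new zero-copy reader concrete, here is a hedged sketch of a caller walking an skb with the API added above. The function names count_zero_bytes and find_pattern are hypothetical; skb_abort_seq_read() is called unconditionally after the loop, which is harmless because it only unmaps a still-mapped fragment:

#include <linux/skbuff.h>
#include <linux/textsearch.h>

/* Count how many 0x00 bytes appear in the first `len` bytes of an skb,
 * without linearizing it.
 */
static unsigned int count_zero_bytes(struct sk_buff *skb, unsigned int len)
{
        struct skb_seq_state st;
        const u8 *data;
        unsigned int consumed = 0, avail, i, zeros = 0;

        skb_prepare_seq_read(skb, 0, len, &st);

        while ((avail = skb_seq_read(consumed, &data, &st)) != 0) {
                for (i = 0; i < avail; i++)
                        if (data[i] == 0)
                                zeros++;
                consumed += avail;
        }
        skb_abort_seq_read(&st);        /* no-op if the read completed cleanly */
        return zeros;
}

/* Search the whole payload for a pattern. The ts_config would typically
 * come from textsearch_prepare(); error handling is elided here.
 */
static unsigned int find_pattern(struct sk_buff *skb, struct ts_config *conf)
{
        struct ts_state state;

        return skb_find_text(skb, 0, skb->len, conf, &state);  /* UINT_MAX if absent */
}
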
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8be646cb191..8f817ad9f546 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -13,12 +13,8 @@
13#ifdef CONFIG_SYSCTL 13#ifdef CONFIG_SYSCTL
14 14
15extern int netdev_max_backlog; 15extern int netdev_max_backlog;
16extern int netdev_budget;
16extern int weight_p; 17extern int weight_p;
17extern int no_cong_thresh;
18extern int no_cong;
19extern int lo_cong;
20extern int mod_cong;
21extern int netdev_fastroute;
22extern int net_msg_cost; 18extern int net_msg_cost;
23extern int net_msg_burst; 19extern int net_msg_burst;
24 20
@@ -35,19 +31,6 @@ extern int sysctl_somaxconn;
35extern char sysctl_divert_version[]; 31extern char sysctl_divert_version[];
36#endif /* CONFIG_NET_DIVERT */ 32#endif /* CONFIG_NET_DIVERT */
37 33
38/*
39 * This strdup() is used for creating copies of network
40 * device names to be handed over to sysctl.
41 */
42
43char *net_sysctl_strdup(const char *s)
44{
45 char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
46 if (rv)
47 strcpy(rv, s);
48 return rv;
49}
50
51ctl_table core_table[] = { 34ctl_table core_table[] = {
52#ifdef CONFIG_NET 35#ifdef CONFIG_NET
53 { 36 {
@@ -99,38 +82,6 @@ ctl_table core_table[] = {
99 .proc_handler = &proc_dointvec 82 .proc_handler = &proc_dointvec
100 }, 83 },
101 { 84 {
102 .ctl_name = NET_CORE_NO_CONG_THRESH,
103 .procname = "no_cong_thresh",
104 .data = &no_cong_thresh,
105 .maxlen = sizeof(int),
106 .mode = 0644,
107 .proc_handler = &proc_dointvec
108 },
109 {
110 .ctl_name = NET_CORE_NO_CONG,
111 .procname = "no_cong",
112 .data = &no_cong,
113 .maxlen = sizeof(int),
114 .mode = 0644,
115 .proc_handler = &proc_dointvec
116 },
117 {
118 .ctl_name = NET_CORE_LO_CONG,
119 .procname = "lo_cong",
120 .data = &lo_cong,
121 .maxlen = sizeof(int),
122 .mode = 0644,
123 .proc_handler = &proc_dointvec
124 },
125 {
126 .ctl_name = NET_CORE_MOD_CONG,
127 .procname = "mod_cong",
128 .data = &mod_cong,
129 .maxlen = sizeof(int),
130 .mode = 0644,
131 .proc_handler = &proc_dointvec
132 },
133 {
134 .ctl_name = NET_CORE_MSG_COST, 85 .ctl_name = NET_CORE_MSG_COST,
135 .procname = "message_cost", 86 .procname = "message_cost",
136 .data = &net_msg_cost, 87 .data = &net_msg_cost,
@@ -174,9 +125,15 @@ ctl_table core_table[] = {
174 .mode = 0644, 125 .mode = 0644,
175 .proc_handler = &proc_dointvec 126 .proc_handler = &proc_dointvec
176 }, 127 },
128 {
129 .ctl_name = NET_CORE_BUDGET,
130 .procname = "netdev_budget",
131 .data = &netdev_budget,
132 .maxlen = sizeof(int),
133 .mode = 0644,
134 .proc_handler = &proc_dointvec
135 },
177 { .ctl_name = 0 } 136 { .ctl_name = 0 }
178}; 137};
179 138
180EXPORT_SYMBOL(net_sysctl_strdup);
181
182#endif 139#endif
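
The new knob appears as /proc/sys/net/core/netdev_budget next to netdev_max_backlog. A small userspace sketch for reading and (as root) adjusting it; the value 600 below is only an example:

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/net/core/netdev_budget";
        FILE *f = fopen(path, "r");
        int budget;

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%d", &budget) == 1)
                printf("current netdev_budget: %d packets per softirq run\n", budget);
        fclose(f);

        /* Raising the budget lets net_rx_action() do more work per run;
         * needs root. 600 is purely an example value. */
        f = fopen(path, "w");
        if (f) {
                fprintf(f, "600\n");
                fclose(f);
        }
        return 0;
}
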
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c349..347083433120 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,32 +1,6 @@
1# 1#
2# IP configuration 2# IP configuration
3# 3#
4choice
5 prompt "Choose IP: FIB lookup"
6 depends on INET
7 default IP_FIB_HASH
8
9config IP_FIB_HASH
10 bool "FIB_HASH"
11 ---help---
12 Current FIB is very proven and good enough for most users.
13
14config IP_FIB_TRIE
15 bool "FIB_TRIE"
16 ---help---
17 Use new experimental LC-trie as FIB lookup algoritm.
18 This improves lookup performance
19
20 LC-trie is described in:
21
22 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 An experimental study of compression methods for dynamic tries
25 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
26 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
27
28endchoice
29
30config IP_MULTICAST 4config IP_MULTICAST
31 bool "IP: multicasting" 5 bool "IP: multicasting"
32 depends on INET 6 depends on INET
@@ -79,6 +53,44 @@ config IP_ADVANCED_ROUTER
79 53
80 If unsure, say N here. 54 If unsure, say N here.
81 55
56choice
57 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
58 depends on IP_ADVANCED_ROUTER
59 default IP_FIB_HASH
60
61config IP_FIB_HASH
62 bool "FIB_HASH"
63 ---help---
64 Current FIB is very proven and good enough for most users.
65
66config IP_FIB_TRIE
67 bool "FIB_TRIE"
68 ---help---
69 Use the new experimental LC-trie as the FIB lookup algorithm.
70 This improves lookup performance if you have a large
71 number of routes.
72
73 LC-trie is a longest matching prefix lookup algorithm which
74 performs better than FIB_HASH for large routing tables.
75 But, it consumes more memory and is more complex.
76
77 LC-trie is described in:
78
79 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
80 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
81 An experimental study of compression methods for dynamic tries
82 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
83 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
84
85endchoice
86
87# If the user does not enable advanced routing, he gets the safe
88# default of the fib-hash algorithm.
89config IP_FIB_HASH
90 bool
91 depends on !IP_ADVANCED_ROUTER
92 default y
93
82config IP_MULTIPLE_TABLES 94config IP_MULTIPLE_TABLES
83 bool "IP: policy routing" 95 bool "IP: policy routing"
84 depends on IP_ADVANCED_ROUTER 96 depends on IP_ADVANCED_ROUTER
@@ -433,5 +445,113 @@ config IP_TCPDIAG
433config IP_TCPDIAG_IPV6 445config IP_TCPDIAG_IPV6
434 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) 446 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
435 447
448config TCP_CONG_ADVANCED
449 bool "TCP: advanced congestion control"
450 depends on INET
451 default y
452 ---help---
453 Support for selection of various TCP congestion control
454 modules.
455
456 Nearly all users can safely say no here, and a safe default
457 selection will be made (BIC-TCP with new Reno as a fallback).
458
459 If unsure, say N.
460
461# TCP Reno is builtin (required as fallback)
462menu "TCP congestion control"
463 depends on TCP_CONG_ADVANCED
464
465config TCP_CONG_BIC
466 tristate "Binary Increase Congestion (BIC) control"
467 depends on INET
468 default y
469 ---help---
470 BIC-TCP is a sender-side only change that ensures a linear RTT
471 fairness under large windows while offering both scalability and
472 bounded TCP-friendliness. The protocol combines two schemes
473 called additive increase and binary search increase. When the
474 congestion window is large, additive increase with a large
475 increment ensures linear RTT fairness as well as good
476 scalability. Under small congestion windows, binary search
477 increase provides TCP friendliness.
478 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
479
480config TCP_CONG_WESTWOOD
481 tristate "TCP Westwood+"
482 depends on INET
483 default m
484 ---help---
485 TCP Westwood+ is a sender-side only modification of the TCP Reno
486 protocol stack that optimizes the performance of TCP congestion
487 control. It is based on end-to-end bandwidth estimation to set
488 congestion window and slow start threshold after a congestion
489 episode. Using this estimation, TCP Westwood+ adaptively sets a
490 slow start threshold and a congestion window which takes into
491 account the bandwidth used at the time congestion is experienced.
492 TCP Westwood+ significantly increases fairness wrt TCP Reno in
493 wired networks and throughput over wireless links.
494
495config TCP_CONG_HTCP
496 tristate "H-TCP"
497 depends on INET
498 default m
499 ---help---
500 H-TCP is a sender-side only modification of the TCP Reno
501 protocol stack that optimizes the performance of TCP
502 congestion control for high speed network links. It uses a
503 modeswitch to change the alpha and beta parameters of TCP Reno
504 based on network conditions and in a way so as to be fair with
505 other Reno and H-TCP flows.
506
507config TCP_CONG_HSTCP
508 tristate "High Speed TCP"
509 depends on INET && EXPERIMENTAL
510 default n
511 ---help---
512 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
513 A modification to TCP's congestion control mechanism for use
514 with large congestion windows. A table indicates how much to
515 increase the congestion window by when an ACK is received.
516 For more detail see http://www.icir.org/floyd/hstcp.html
517
518config TCP_CONG_HYBLA
519 tristate "TCP-Hybla congestion control algorithm"
520 depends on INET && EXPERIMENTAL
521 default n
522 ---help---
523 TCP-Hybla is a sender-side only change that eliminates penalization of
524 long-RTT, large-bandwidth connections, like when satellite legs are
525 involved, especially when sharing a common bottleneck with normal
526 terrestrial connections.
527
528config TCP_CONG_VEGAS
529 tristate "TCP Vegas"
530 depends on INET && EXPERIMENTAL
531 default n
532 ---help---
533 TCP Vegas is a sender-side only change to TCP that anticipates
534 the onset of congestion by estimating the bandwidth. TCP Vegas
535 adjusts the sending rate by modifying the congestion
536 window. TCP Vegas should provide less packet loss, but it is
537 not as aggressive as TCP Reno.
538
539config TCP_CONG_SCALABLE
540 tristate "Scalable TCP"
541 depends on INET && EXPERIMENTAL
542 default n
543 ---help---
544 Scalable TCP is a sender-side only change to TCP which uses a
545 MIMD congestion control algorithm which has some nice scaling
546 properties, though is known to have fairness issues.
547 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
548
549endmenu
550
551config TCP_CONG_BIC
552 boolean
553 depends on !TCP_CONG_ADVANCED
554 default y
555
436source "net/ipv4/ipvs/Kconfig" 556source "net/ipv4/ipvs/Kconfig"
437 557
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1add..5718cdb3a61e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -5,7 +5,8 @@
5obj-y := utils.o route.o inetpeer.o protocol.o \ 5obj-y := utils.o route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ 8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
9 tcp_minisocks.o tcp_cong.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 10 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o 11 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
11 12
@@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/
30obj-$(CONFIG_IP_VS) += ipvs/ 31obj-$(CONFIG_IP_VS) += ipvs/
31obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 32obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
32obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 33obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
34obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
35obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
36obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
37obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
38obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
39obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
40obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
33 41
34obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 42obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
35 xfrm4_output.o 43 xfrm4_output.o
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 650dcb12d9a1..d8a10e3dd77d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev,
1471 * by sysctl and we wouldn't want anyone to change it under our feet 1471 * by sysctl and we wouldn't want anyone to change it under our feet
1472 * (see SIOCSIFNAME). 1472 * (see SIOCSIFNAME).
1473 */ 1473 */
1474 dev_name = net_sysctl_strdup(dev_name); 1474 dev_name = kstrdup(dev_name, GFP_KERNEL);
1475 if (!dev_name) 1475 if (!dev_name)
1476 goto free; 1476 goto free;
1477 1477
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf0b..e32894532416 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
118 return 1; 118 return 1;
119} 119}
120 120
121static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
122 void __user *buffer, size_t *lenp, loff_t *ppos)
123{
124 char val[TCP_CA_NAME_MAX];
125 ctl_table tbl = {
126 .data = val,
127 .maxlen = TCP_CA_NAME_MAX,
128 };
129 int ret;
130
131 tcp_get_default_congestion_control(val);
132
133 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
134 if (write && ret == 0)
135 ret = tcp_set_default_congestion_control(val);
136 return ret;
137}
138
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
140 void __user *oldval, size_t __user *oldlenp,
141 void __user *newval, size_t newlen,
142 void **context)
143{
144 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = {
146 .data = val,
147 .maxlen = TCP_CA_NAME_MAX,
148 };
149 int ret;
150
151 tcp_get_default_congestion_control(val);
152 ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
153 context);
154 if (ret == 0 && newval && newlen)
155 ret = tcp_set_default_congestion_control(val);
156 return ret;
157}
158
159
121ctl_table ipv4_table[] = { 160ctl_table ipv4_table[] = {
122 { 161 {
123 .ctl_name = NET_IPV4_TCP_TIMESTAMPS, 162 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -612,70 +651,6 @@ ctl_table ipv4_table[] = {
612 .proc_handler = &proc_dointvec, 651 .proc_handler = &proc_dointvec,
613 }, 652 },
614 { 653 {
615 .ctl_name = NET_TCP_WESTWOOD,
616 .procname = "tcp_westwood",
617 .data = &sysctl_tcp_westwood,
618 .maxlen = sizeof(int),
619 .mode = 0644,
620 .proc_handler = &proc_dointvec,
621 },
622 {
623 .ctl_name = NET_TCP_VEGAS,
624 .procname = "tcp_vegas_cong_avoid",
625 .data = &sysctl_tcp_vegas_cong_avoid,
626 .maxlen = sizeof(int),
627 .mode = 0644,
628 .proc_handler = &proc_dointvec,
629 },
630 {
631 .ctl_name = NET_TCP_VEGAS_ALPHA,
632 .procname = "tcp_vegas_alpha",
633 .data = &sysctl_tcp_vegas_alpha,
634 .maxlen = sizeof(int),
635 .mode = 0644,
636 .proc_handler = &proc_dointvec,
637 },
638 {
639 .ctl_name = NET_TCP_VEGAS_BETA,
640 .procname = "tcp_vegas_beta",
641 .data = &sysctl_tcp_vegas_beta,
642 .maxlen = sizeof(int),
643 .mode = 0644,
644 .proc_handler = &proc_dointvec,
645 },
646 {
647 .ctl_name = NET_TCP_VEGAS_GAMMA,
648 .procname = "tcp_vegas_gamma",
649 .data = &sysctl_tcp_vegas_gamma,
650 .maxlen = sizeof(int),
651 .mode = 0644,
652 .proc_handler = &proc_dointvec,
653 },
654 {
655 .ctl_name = NET_TCP_BIC,
656 .procname = "tcp_bic",
657 .data = &sysctl_tcp_bic,
658 .maxlen = sizeof(int),
659 .mode = 0644,
660 .proc_handler = &proc_dointvec,
661 },
662 {
663 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
664 .procname = "tcp_bic_fast_convergence",
665 .data = &sysctl_tcp_bic_fast_convergence,
666 .maxlen = sizeof(int),
667 .mode = 0644,
668 .proc_handler = &proc_dointvec,
669 },
670 {
671 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
672 .procname = "tcp_bic_low_window",
673 .data = &sysctl_tcp_bic_low_window,
674 .maxlen = sizeof(int),
675 .mode = 0644,
676 .proc_handler = &proc_dointvec,
677 },
678 {
679 .ctl_name = NET_TCP_MODERATE_RCVBUF, 654 .ctl_name = NET_TCP_MODERATE_RCVBUF,
680 .procname = "tcp_moderate_rcvbuf", 655 .procname = "tcp_moderate_rcvbuf",
681 .data = &sysctl_tcp_moderate_rcvbuf, 656 .data = &sysctl_tcp_moderate_rcvbuf,
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = {
692 .proc_handler = &proc_dointvec, 667 .proc_handler = &proc_dointvec,
693 }, 668 },
694 { 669 {
695 .ctl_name = NET_TCP_BIC_BETA, 670 .ctl_name = NET_TCP_CONG_CONTROL,
696 .procname = "tcp_bic_beta", 671 .procname = "tcp_congestion_control",
697 .data = &sysctl_tcp_bic_beta,
698 .maxlen = sizeof(int),
699 .mode = 0644, 672 .mode = 0644,
700 .proc_handler = &proc_dointvec, 673 .maxlen = TCP_CA_NAME_MAX,
674 .proc_handler = &proc_tcp_congestion_control,
675 .strategy = &sysctl_tcp_congestion_control,
701 }, 676 },
677
702 { .ctl_name = 0 } 678 { .ctl_name = 0 }
703}; 679};
704 680
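
The same default is reachable from userspace as /proc/sys/net/ipv4/tcp_congestion_control. A hedged sketch of reading it and switching the system-wide default back to reno (requires root; names other than reno may trigger a tcp_<name> module load):

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *path = "/proc/sys/net/ipv4/tcp_congestion_control";
        char name[32];
        FILE *f = fopen(path, "r");

        if (!f || !fgets(name, sizeof(name), f)) {
                perror(path);
                return 1;
        }
        fclose(f);
        name[strcspn(name, "\n")] = '\0';
        printf("system default congestion control: %s\n", name);

        /* Switch the default; "reno" is always built in. */
        f = fopen(path, "w");
        if (f) {
                fputs("reno\n", f);
                fclose(f);
        }
        return 0;
}
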
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd36..882436da9a3a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1927,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1927 return tp->af_specific->setsockopt(sk, level, optname, 1927 return tp->af_specific->setsockopt(sk, level, optname,
1928 optval, optlen); 1928 optval, optlen);
1929 1929
1930 /* This is a string value all the others are int's */
1931 if (optname == TCP_CONGESTION) {
1932 char name[TCP_CA_NAME_MAX];
1933
1934 if (optlen < 1)
1935 return -EINVAL;
1936
1937 val = strncpy_from_user(name, optval,
1938 min(TCP_CA_NAME_MAX-1, optlen));
1939 if (val < 0)
1940 return -EFAULT;
1941 name[val] = 0;
1942
1943 lock_sock(sk);
1944 err = tcp_set_congestion_control(tp, name);
1945 release_sock(sk);
1946 return err;
1947 }
1948
1930 if (optlen < sizeof(int)) 1949 if (optlen < sizeof(int))
1931 return -EINVAL; 1950 return -EINVAL;
1932 1951
@@ -2211,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2211 case TCP_QUICKACK: 2230 case TCP_QUICKACK:
2212 val = !tp->ack.pingpong; 2231 val = !tp->ack.pingpong;
2213 break; 2232 break;
2233
2234 case TCP_CONGESTION:
2235 if (get_user(len, optlen))
2236 return -EFAULT;
2237 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2238 if (put_user(len, optlen))
2239 return -EFAULT;
2240 if (copy_to_user(optval, tp->ca_ops->name, len))
2241 return -EFAULT;
2242 return 0;
2214 default: 2243 default:
2215 return -ENOPROTOOPT; 2244 return -ENOPROTOOPT;
2216 }; 2245 };
@@ -2224,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2224 2253
2225 2254
2226extern void __skb_cb_too_small_for_tcp(int, int); 2255extern void __skb_cb_too_small_for_tcp(int, int);
2227extern void tcpdiag_init(void); 2256extern struct tcp_congestion_ops tcp_reno;
2228 2257
2229static __initdata unsigned long thash_entries; 2258static __initdata unsigned long thash_entries;
2230static int __init set_thash_entries(char *str) 2259static int __init set_thash_entries(char *str)
@@ -2333,6 +2362,8 @@ void __init tcp_init(void)
2333 printk(KERN_INFO "TCP: Hash tables configured " 2362 printk(KERN_INFO "TCP: Hash tables configured "
2334 "(established %d bind %d)\n", 2363 "(established %d bind %d)\n",
2335 tcp_ehash_size << 1, tcp_bhash_size); 2364 tcp_ehash_size << 1, tcp_bhash_size);
2365
2366 tcp_register_congestion_control(&tcp_reno);
2336} 2367}
2337 2368
2338EXPORT_SYMBOL(tcp_accept); 2369EXPORT_SYMBOL(tcp_accept);
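
Per-connection selection goes through the new TCP_CONGESTION socket option. A userspace sketch; the fallback definition of TCP_CONGESTION as 13 is an assumption for libc headers that predate this option:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13       /* value used by this series */
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        char buf[16];
        socklen_t len = sizeof(buf);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        /* Request BIC for this socket only; fails with ENOENT if the
         * tcp_bic module is not available. */
        if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bic", 3) < 0)
                perror("setsockopt(TCP_CONGESTION)");

        if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, buf, &len) == 0)
                printf("socket is using: %.*s\n", (int)len, buf);

        close(fd);
        return 0;
}
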
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 000000000000..ec38d45d6649
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
1/*
2 * Binary Increase Congestion control for TCP
3 *
4 * This is from the implementation of BICTCP in
5 * Lison-Xu, Kahaled Harfoush, and Injong Rhee.
6 * "Binary Increase Congestion Control for Fast, Long Distance
7 * Networks" in InfoComm 2004
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
10 *
11 * Unless BIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28
29static int fast_convergence = 1;
30static int max_increment = 32;
31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100;
36static int smooth_part = 20;
37
38module_param(fast_convergence, int, 0644);
39MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
40module_param(max_increment, int, 0644);
41MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
42module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644);
45MODULE_PARM_DESC(beta, "beta for multiplicative increase");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
49MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds, go to low utilization mode (seconds)");
50module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644);
53MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
54
55
56/* BIC TCP Parameters */
57struct bictcp {
58 u32 cnt; /* increase cwnd by 1 after ACKs */
59 u32 last_max_cwnd; /* last maximum snd_cwnd */
60 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
71};
72
73static inline void bictcp_reset(struct bictcp *ca)
74{
75 ca->cnt = 0;
76 ca->last_max_cwnd = 0;
77 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0;
79 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87}
88
89static void bictcp_init(struct tcp_sock *tp)
90{
91 bictcp_reset(tcp_ca(tp));
92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh;
94}
95
96/*
97 * Compute congestion window to use.
98 */
99static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
100{
101 if (ca->last_cwnd == cwnd &&
102 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
103 return;
104
105 ca->last_cwnd = cwnd;
106 ca->last_time = tcp_time_stamp;
107
108 if (ca->epoch_start == 0) /* record the beginning of an epoch */
109 ca->epoch_start = tcp_time_stamp;
110
111 /* start off normal */
112 if (cwnd <= low_window) {
113 ca->cnt = cwnd;
114 return;
115 }
116
117 /* binary increase */
118 if (cwnd < ca->last_max_cwnd) {
119 __u32 dist = (ca->last_max_cwnd - cwnd)
120 / BICTCP_B;
121
122 if (dist > max_increment)
123 /* linear increase */
124 ca->cnt = cwnd / max_increment;
125 else if (dist <= 1U)
126 /* binary search increase */
127 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
128 else
129 /* binary search increase */
130 ca->cnt = cwnd / dist;
131 } else {
132 /* slow start and linear increase */
133 if (cwnd < ca->last_max_cwnd + BICTCP_B)
134 /* slow start */
135 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
136 else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
137 /* slow start */
138 ca->cnt = (cwnd * (BICTCP_B-1))
139 / cwnd-ca->last_max_cwnd;
140 else
141 /* linear increase */
142 ca->cnt = cwnd / max_increment;
143 }
144
145 /* if in slow start or link utilization is very low */
146 if ( ca->loss_cwnd == 0 ||
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20;
150 }
151
152 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
153 if (ca->cnt == 0) /* cannot be zero */
154 ca->cnt = 1;
155}
156
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
160{
161 struct bictcp *ca = tcp_ca(tp);
162 u32 dist, delay;
163
164 /* No time stamp */
165 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
166 /* Discard delay samples right after fast recovery */
167 tcp_time_stamp < ca->epoch_start + HZ ||
168 /* these delay samples may not be accurate */
169 flag == 0) {
170 ca->last_delay = 0;
171 goto notlow;
172 }
173
174 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
175 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
176 if (delay == 0) /* no previous delay sample */
177 goto notlow;
178
179 /* first time call or link delay decreases */
180 if (ca->delay_min == 0 || ca->delay_min > delay) {
181 ca->delay_min = ca->delay_max = delay;
182 goto notlow;
183 }
184
185 if (ca->delay_max < delay)
186 ca->delay_max = delay;
187
188 /* utilization is low, if avg delay < dist*threshold
189 for checking_period time */
190 dist = ca->delay_max - ca->delay_min;
191 if (dist <= ca->delay_min>>6 ||
192 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
193 goto notlow;
194
195 if (ca->low_utilization_start == 0) {
196 ca->low_utilization = 0;
197 ca->low_utilization_start = tcp_time_stamp;
198 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
199 > low_utilization_period*HZ) {
200 ca->low_utilization = 1;
201 }
202
203 return;
204
205 notlow:
206 ca->low_utilization = 0;
207 ca->low_utilization_start = 0;
208
209}
210
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked)
213{
214 struct bictcp *ca = tcp_ca(tp);
215
216 bictcp_low_utilization(tp, data_acked);
217
218 if (in_flight < tp->snd_cwnd)
219 return;
220
221 if (tp->snd_cwnd <= tp->snd_ssthresh) {
222 /* In "safe" area, increase. */
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 } else {
226 bictcp_update(ca, tp->snd_cwnd);
227
228 /* In dangerous area, increase slowly.
229 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
230 */
231 if (tp->snd_cwnd_cnt >= ca->cnt) {
232 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
233 tp->snd_cwnd++;
234 tp->snd_cwnd_cnt = 0;
235 } else
236 tp->snd_cwnd_cnt++;
237 }
238
239}
240
241/*
242 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly
244 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
246{
247 struct bictcp *ca = tcp_ca(tp);
248
249 ca->epoch_start = 0; /* end of epoch */
250
251 /* in case of wrong delay_max*/
252 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
253 ca->delay_max = ca->delay_min
254 + ((ca->delay_max - ca->delay_min)* 90) / 100;
255
256 /* Wmax and fast convergence */
257 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
258 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
259 / (2 * BICTCP_BETA_SCALE);
260 else
261 ca->last_max_cwnd = tp->snd_cwnd;
262
263 ca->loss_cwnd = tp->snd_cwnd;
264
265
266 if (tp->snd_cwnd <= low_window)
267 return max(tp->snd_cwnd >> 1U, 2U);
268 else
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270}
271
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
273{
274 struct bictcp *ca = tcp_ca(tp);
275
276 return max(tp->snd_cwnd, ca->last_max_cwnd);
277}
278
279static u32 bictcp_min_cwnd(struct tcp_sock *tp)
280{
281 return tp->snd_ssthresh;
282}
283
284static void bictcp_state(struct tcp_sock *tp, u8 new_state)
285{
286 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp));
288}
289
290/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16
292 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
294{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
296 struct bictcp *ca = tcp_ca(tp);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt;
299 }
300}
301
302
303static struct tcp_congestion_ops bictcp = {
304 .init = bictcp_init,
305 .ssthresh = bictcp_recalc_ssthresh,
306 .cong_avoid = bictcp_cong_avoid,
307 .set_state = bictcp_state,
308 .undo_cwnd = bictcp_undo_cwnd,
309 .min_cwnd = bictcp_min_cwnd,
310 .pkts_acked = bictcp_acked,
311 .owner = THIS_MODULE,
312 .name = "bic",
313};
314
315static int __init bictcp_register(void)
316{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp);
319}
320
321static void __exit bictcp_unregister(void)
322{
323 tcp_unregister_congestion_control(&bictcp);
324}
325
326module_init(bictcp_register);
327module_exit(bictcp_unregister);
328
329MODULE_AUTHOR("Stephen Hemminger");
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("BIC TCP");
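
To get a feel for the growth schedule, the following userspace sketch reproduces only the cnt computation of bictcp_update() above, using the module's default parameters; it omits the low-utilization and delayed-ACK adjustments and folds the probing phase past the old maximum into a single branch:

#include <stdio.h>

#define BICTCP_B        4
static const unsigned int max_increment = 32, low_window = 14, smooth_part = 20;

/* Return cnt: cwnd grows by one segment per `cnt` ACKed segments. */
static unsigned int bic_cnt(unsigned int cwnd, unsigned int last_max_cwnd)
{
        unsigned int dist;

        if (cwnd <= low_window)
                return cwnd;                            /* plain Reno growth */

        if (cwnd < last_max_cwnd) {
                dist = (last_max_cwnd - cwnd) / BICTCP_B;
                if (dist > max_increment)
                        return cwnd / max_increment;    /* linear increase */
                if (dist <= 1)
                        return (cwnd * smooth_part) / BICTCP_B; /* near the plateau */
                return cwnd / dist;                     /* binary search increase */
        }
        /* past the old maximum: slow probing, then linear again (simplified) */
        if (cwnd < last_max_cwnd + BICTCP_B)
                return (cwnd * smooth_part) / BICTCP_B;
        return cwnd / max_increment;
}

int main(void)
{
        unsigned int cwnd, last_max = 1000;

        for (cwnd = 100; cwnd <= 1100; cwnd += 100)
                printf("cwnd=%4u last_max=%u -> +1 segment per %u ACKs\n",
                       cwnd, last_max, bic_cnt(cwnd, last_max));
        return 0;
}
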
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000000..4970d10a7785
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,237 @@
1/*
2 * Pluggable TCP congestion control support and NewReno
3 * congestion control.
4 * Based on ideas from I/O scheduler support and Web100.
5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/mm.h>
12#include <linux/types.h>
13#include <linux/list.h>
14#include <net/tcp.h>
15
16static DEFINE_SPINLOCK(tcp_cong_list_lock);
17static LIST_HEAD(tcp_cong_list);
18
19/* Simple linear search, don't expect many entries! */
20static struct tcp_congestion_ops *tcp_ca_find(const char *name)
21{
22 struct tcp_congestion_ops *e;
23
24 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
25 if (strcmp(e->name, name) == 0)
26 return e;
27 }
28
29 return NULL;
30}
31
32/*
33 * Attach a new congestion control algorithm to the list
34 * of available options.
35 */
36int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
37{
38 int ret = 0;
39
40 /* all algorithms must implement ssthresh and cong_avoid ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name);
44 return -EINVAL;
45 }
46
47 spin_lock(&tcp_cong_list_lock);
48 if (tcp_ca_find(ca->name)) {
49 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
50 ret = -EEXIST;
51 } else {
52 list_add_rcu(&ca->list, &tcp_cong_list);
53 printk(KERN_INFO "TCP %s registered\n", ca->name);
54 }
55 spin_unlock(&tcp_cong_list_lock);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
60
61/*
62 * Remove congestion control algorithm, called from
63 * the module's remove function. Module ref counts are used
64 * to ensure that this can't be done till all sockets using
65 * that method are closed.
66 */
67void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
68{
69 spin_lock(&tcp_cong_list_lock);
70 list_del_rcu(&ca->list);
71 spin_unlock(&tcp_cong_list_lock);
72}
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74
75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp)
77{
78 struct tcp_congestion_ops *ca;
79
80 if (tp->ca_ops != &tcp_init_congestion_ops)
81 return;
82
83 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca;
87 break;
88 }
89
90 }
91 rcu_read_unlock();
92
93 if (tp->ca_ops->init)
94 tp->ca_ops->init(tp);
95}
96
97/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp)
99{
100 if (tp->ca_ops->release)
101 tp->ca_ops->release(tp);
102 module_put(tp->ca_ops->owner);
103}
104
105/* Used by sysctl to change default congestion control */
106int tcp_set_default_congestion_control(const char *name)
107{
108 struct tcp_congestion_ops *ca;
109 int ret = -ENOENT;
110
111 spin_lock(&tcp_cong_list_lock);
112 ca = tcp_ca_find(name);
113#ifdef CONFIG_KMOD
114 if (!ca) {
115 spin_unlock(&tcp_cong_list_lock);
116
117 request_module("tcp_%s", name);
118 spin_lock(&tcp_cong_list_lock);
119 ca = tcp_ca_find(name);
120 }
121#endif
122
123 if (ca) {
124 list_move(&ca->list, &tcp_cong_list);
125 ret = 0;
126 }
127 spin_unlock(&tcp_cong_list_lock);
128
129 return ret;
130}
131
132/* Get current default congestion control */
133void tcp_get_default_congestion_control(char *name)
134{
135 struct tcp_congestion_ops *ca;
136 /* We will always have reno... */
137 BUG_ON(list_empty(&tcp_cong_list));
138
139 rcu_read_lock();
140 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
141 strncpy(name, ca->name, TCP_CA_NAME_MAX);
142 rcu_read_unlock();
143}
144
145/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
147{
148 struct tcp_congestion_ops *ca;
149 int err = 0;
150
151 rcu_read_lock();
152 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops)
154 goto out;
155
156 if (!ca)
157 err = -ENOENT;
158
159 else if (!try_module_get(ca->owner))
160 err = -EBUSY;
161
162 else {
163 tcp_cleanup_congestion_control(tp);
164 tp->ca_ops = ca;
165 if (tp->ca_ops->init)
166 tp->ca_ops->init(tp);
167 }
168 out:
169 rcu_read_unlock();
170 return err;
171}
172
173/*
174 * TCP Reno congestion control
175 * This is special case used for fallback as well.
176 */
177/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328.
179 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
181 int flag)
182{
183 if (in_flight < tp->snd_cwnd)
184 return;
185
186 if (tp->snd_cwnd <= tp->snd_ssthresh) {
187 /* In "safe" area, increase. */
188 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
189 tp->snd_cwnd++;
190 } else {
191 /* In dangerous area, increase slowly.
192 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
193 */
194 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
195 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
196 tp->snd_cwnd++;
197 tp->snd_cwnd_cnt = 0;
198 } else
199 tp->snd_cwnd_cnt++;
200 }
201}
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
203
204/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp)
206{
207 return max(tp->snd_cwnd >> 1U, 2U);
208}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210
211/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
213{
214 return tp->snd_ssthresh/2;
215}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
217
218struct tcp_congestion_ops tcp_reno = {
219 .name = "reno",
220 .owner = THIS_MODULE,
221 .ssthresh = tcp_reno_ssthresh,
222 .cong_avoid = tcp_reno_cong_avoid,
223 .min_cwnd = tcp_reno_min_cwnd,
224};
225
226/* Initial congestion control used (until SYN)
227 * really Reno under another name so we can tell the difference
228 * during tcp_set_default_congestion_control
229 */
230struct tcp_congestion_ops tcp_init_congestion_ops = {
231 .name = "",
232 .owner = THIS_MODULE,
233 .ssthresh = tcp_reno_ssthresh,
234 .cong_avoid = tcp_reno_cong_avoid,
235 .min_cwnd = tcp_reno_min_cwnd,
236};
237EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
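
The registration API above is what every module in this series uses. A minimal, hypothetical out-of-tree module registering a new algorithm would look roughly like this; it reuses the exported Reno helpers, whereas a real algorithm would supply its own ssthresh/cong_avoid and keep per-socket state behind tcp_ca(tp) (at most TCP_CA_PRIV_SIZE bytes):

#include <linux/module.h>
#include <net/tcp.h>

/* Behaves exactly like Reno but registers under its own name. */
static struct tcp_congestion_ops tcp_example = {
        .name           = "example",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .min_cwnd       = tcp_reno_min_cwnd,
};

static int __init tcp_example_register(void)
{
        return tcp_register_congestion_control(&tcp_example);
}

static void __exit tcp_example_unregister(void)
{
        tcp_unregister_congestion_control(&tcp_example);
}

module_init(tcp_example_register);
module_exit(tcp_example_unregister);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Example TCP congestion control (illustrative only)");
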
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 634befc07921..f66945cb158f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,15 +42,8 @@ struct tcpdiag_entry
42 42
43static struct sock *tcpnl; 43static struct sock *tcpnl;
44 44
45
46#define TCPDIAG_PUT(skb, attrtype, attrlen) \ 45#define TCPDIAG_PUT(skb, attrtype, attrlen) \
47({ int rtalen = RTA_LENGTH(attrlen); \ 46 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
48 struct rtattr *rta; \
49 if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
50 rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
51 rta->rta_type = attrtype; \
52 rta->rta_len = rtalen; \
53 RTA_DATA(rta); })
54 47
55static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, 48static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
56 int ext, u32 pid, u32 seq, u16 nlmsg_flags) 49 int ext, u32 pid, u32 seq, u16 nlmsg_flags)
@@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
61 struct nlmsghdr *nlh; 54 struct nlmsghdr *nlh;
62 struct tcp_info *info = NULL; 55 struct tcp_info *info = NULL;
63 struct tcpdiag_meminfo *minfo = NULL; 56 struct tcpdiag_meminfo *minfo = NULL;
64 struct tcpvegas_info *vinfo = NULL;
65 unsigned char *b = skb->tail; 57 unsigned char *b = skb->tail;
66 58
67 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); 59 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
73 if (ext & (1<<(TCPDIAG_INFO-1))) 65 if (ext & (1<<(TCPDIAG_INFO-1)))
74 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); 66 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
75 67
76 if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) 68 if (ext & (1<<(TCPDIAG_CONG-1))) {
77 && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) 69 size_t len = strlen(tp->ca_ops->name);
78 vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); 70 strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
71 tp->ca_ops->name);
72 }
79 } 73 }
80 r->tcpdiag_family = sk->sk_family; 74 r->tcpdiag_family = sk->sk_family;
81 r->tcpdiag_state = sk->sk_state; 75 r->tcpdiag_state = sk->sk_state;
@@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
166 if (info) 160 if (info)
167 tcp_get_info(sk, info); 161 tcp_get_info(sk, info);
168 162
169 if (vinfo) { 163 if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
170 if (tcp_is_vegas(tp)) { 164 tp->ca_ops->get_info(tp, ext, skb);
171 vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
172 vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
173 vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
174 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
175 } else {
176 vinfo->tcpv_enabled = 0;
177 vinfo->tcpv_rttcnt = 0;
178 vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
179 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
180 }
181 }
182 165
183 nlh->nlmsg_len = skb->tail - b; 166 nlh->nlmsg_len = skb->tail - b;
184 return skb->len; 167 return skb->len;
185 168
169rtattr_failure:
186nlmsg_failure: 170nlmsg_failure:
187 skb_trim(skb, b - skb->data); 171 skb_trim(skb, b - skb->data);
188 return -1; 172 return -1;
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 000000000000..36c51f8136bf
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
1/*
2 * Sally Floyd's High Speed TCP (RFC 3649) congestion control
3 *
4 * See http://www.icir.org/floyd/hstcp.html
5 *
6 * John Heffner <jheffner@psc.edu>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <net/tcp.h>
12
13
14/* From AIMD tables from RFC 3649 appendix B,
15 * with fixed-point MD scaled <<8.
16 */
17static const struct hstcp_aimd_val {
18 unsigned int cwnd;
19 unsigned int md;
20} hstcp_aimd_vals[] = {
21 { 38, 128, /* 0.50 */ },
22 { 118, 112, /* 0.44 */ },
23 { 221, 104, /* 0.41 */ },
24 { 347, 98, /* 0.38 */ },
25 { 495, 93, /* 0.37 */ },
26 { 663, 89, /* 0.35 */ },
27 { 851, 86, /* 0.34 */ },
28 { 1058, 83, /* 0.33 */ },
29 { 1284, 81, /* 0.32 */ },
30 { 1529, 78, /* 0.31 */ },
31 { 1793, 76, /* 0.30 */ },
32 { 2076, 74, /* 0.29 */ },
33 { 2378, 72, /* 0.28 */ },
34 { 2699, 71, /* 0.28 */ },
35 { 3039, 69, /* 0.27 */ },
36 { 3399, 68, /* 0.27 */ },
37 { 3778, 66, /* 0.26 */ },
38 { 4177, 65, /* 0.26 */ },
39 { 4596, 64, /* 0.25 */ },
40 { 5036, 62, /* 0.25 */ },
41 { 5497, 61, /* 0.24 */ },
42 { 5979, 60, /* 0.24 */ },
43 { 6483, 59, /* 0.23 */ },
44 { 7009, 58, /* 0.23 */ },
45 { 7558, 57, /* 0.22 */ },
46 { 8130, 56, /* 0.22 */ },
47 { 8726, 55, /* 0.22 */ },
48 { 9346, 54, /* 0.21 */ },
49 { 9991, 53, /* 0.21 */ },
50 { 10661, 52, /* 0.21 */ },
51 { 11358, 52, /* 0.20 */ },
52 { 12082, 51, /* 0.20 */ },
53 { 12834, 50, /* 0.20 */ },
54 { 13614, 49, /* 0.19 */ },
55 { 14424, 48, /* 0.19 */ },
56 { 15265, 48, /* 0.19 */ },
57 { 16137, 47, /* 0.19 */ },
58 { 17042, 46, /* 0.18 */ },
59 { 17981, 45, /* 0.18 */ },
60 { 18955, 45, /* 0.18 */ },
61 { 19965, 44, /* 0.17 */ },
62 { 21013, 43, /* 0.17 */ },
63 { 22101, 43, /* 0.17 */ },
64 { 23230, 42, /* 0.17 */ },
65 { 24402, 41, /* 0.16 */ },
66 { 25618, 41, /* 0.16 */ },
67 { 26881, 40, /* 0.16 */ },
68 { 28193, 39, /* 0.16 */ },
69 { 29557, 39, /* 0.15 */ },
70 { 30975, 38, /* 0.15 */ },
71 { 32450, 38, /* 0.15 */ },
72 { 33986, 37, /* 0.15 */ },
73 { 35586, 36, /* 0.14 */ },
74 { 37253, 36, /* 0.14 */ },
75 { 38992, 35, /* 0.14 */ },
76 { 40808, 35, /* 0.14 */ },
77 { 42707, 34, /* 0.13 */ },
78 { 44694, 33, /* 0.13 */ },
79 { 46776, 33, /* 0.13 */ },
80 { 48961, 32, /* 0.13 */ },
81 { 51258, 32, /* 0.13 */ },
82 { 53677, 31, /* 0.12 */ },
83 { 56230, 30, /* 0.12 */ },
84 { 58932, 30, /* 0.12 */ },
85 { 61799, 29, /* 0.12 */ },
86 { 64851, 28, /* 0.11 */ },
87 { 68113, 28, /* 0.11 */ },
88 { 71617, 27, /* 0.11 */ },
89 { 75401, 26, /* 0.10 */ },
90 { 79517, 26, /* 0.10 */ },
91 { 84035, 25, /* 0.10 */ },
92 { 89053, 24, /* 0.10 */ },
93};
94
95#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
96
97struct hstcp {
98 u32 ai;
99};
100
101static void hstcp_init(struct tcp_sock *tp)
102{
103 struct hstcp *ca = tcp_ca(tp);
104
105 ca->ai = 0;
106
107 /* Ensure the MD arithmetic works. This is somewhat pedantic,
108 * since I don't think we will see a cwnd this large. :) */
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110}
111
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
113 u32 in_flight, int good)
114{
115 struct hstcp *ca = tcp_ca(tp);
116
117 if (in_flight < tp->snd_cwnd)
118 return;
119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) {
121 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
122 tp->snd_cwnd++;
123 } else {
124 /* Update AIMD parameters */
125 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
126 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
127 ca->ai < HSTCP_AIMD_MAX - 1)
128 ca->ai++;
129 } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
130 while (ca->ai &&
131 tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
132 ca->ai--;
133 }
134
135 /* Do additive increase */
136 if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
137 tp->snd_cwnd_cnt += ca->ai;
138 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
139 tp->snd_cwnd++;
140 tp->snd_cwnd_cnt -= tp->snd_cwnd;
141 }
142 }
143 }
144}
145
146static u32 hstcp_ssthresh(struct tcp_sock *tp)
147{
148 struct hstcp *ca = tcp_ca(tp);
149
150 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
152}
153
154
155static struct tcp_congestion_ops tcp_highspeed = {
156 .init = hstcp_init,
157 .ssthresh = hstcp_ssthresh,
158 .cong_avoid = hstcp_cong_avoid,
159 .min_cwnd = tcp_reno_min_cwnd,
160
161 .owner = THIS_MODULE,
162 .name = "highspeed"
163};
164
165static int __init hstcp_register(void)
166{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed);
169}
170
171static void __exit hstcp_unregister(void)
172{
173 tcp_unregister_congestion_control(&tcp_highspeed);
174}
175
176module_init(hstcp_register);
177module_exit(hstcp_unregister);
178
179MODULE_AUTHOR("John Heffner");
180MODULE_LICENSE("GPL");
181MODULE_DESCRIPTION("High Speed TCP");
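
The table above encodes RFC 3649: each row gives, for a congestion-window range, the multiplicative-decrease factor md as a fixed-point fraction scaled by 256, and the row index is the additive-increase amount added to snd_cwnd_cnt per ack. The following userspace sketch (not the kernel module; only the first rows of the table are copied) shows the lookup and what hstcp_ssthresh() would return after a loss, i.e. cwnd - (cwnd*md >> 8), floored at 2.

#include <stdio.h>

struct aimd { unsigned int cwnd, md; };

/* first rows only, copied from the table above */
static const struct aimd tab[] = {
	{   38, 128 }, {  118, 112 }, {  221, 104 }, {  347, 98 },
	{  495,  93 }, {  663,  89 }, {  851,  86 }, { 1058, 83 },
};
#define TAB_LEN (sizeof(tab) / sizeof(tab[0]))

static unsigned int hstcp_md_sketch(unsigned int cwnd)
{
	unsigned int ai = 0;

	/* pick the first row whose cwnd bound is not exceeded */
	while (ai < TAB_LEN - 1 && cwnd > tab[ai].cwnd)
		ai++;
	return tab[ai].md;
}

int main(void)
{
	unsigned int cwnd[] = { 20, 100, 500, 1200 };

	for (unsigned int i = 0; i < 4; i++) {
		unsigned int md = hstcp_md_sketch(cwnd[i]);
		unsigned int after = cwnd[i] - ((cwnd[i] * md) >> 8);

		if (after < 2)
			after = 2;
		printf("cwnd=%4u md=%3u/256 -> cwnd after loss=%u\n",
		       cwnd[i], md, after);
	}
	return 0;
}

For a cwnd of 20 this behaves like Reno (md = 128/256, a plain halving); at larger windows the decrease becomes progressively milder, which is the point of the HSTCP response function.
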
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000000..40168275acf9
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
1/*
2 * H-TCP congestion control. The algorithm is detailed in:
3 * R.N.Shorten, D.J.Leith:
4 * "H-TCP: TCP for high-speed and long-distance networks"
5 * Proc. PFLDnet, Argonne, 2004.
6 * http://www.hamilton.ie/net/htcp3.pdf
7 */
8
9#include <linux/config.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <net/tcp.h>
13
14#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
15#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
16#define BETA_MAX 102 /* 0.8 with shift << 7 */
17
18static int use_rtt_scaling = 1;
19module_param(use_rtt_scaling, int, 0644);
20MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
21
22static int use_bandwidth_switch = 1;
23module_param(use_bandwidth_switch, int, 0644);
24MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
25
26struct htcp {
27 u16 alpha; /* Fixed point arith, << 7 */
28 u8 beta; /* Fixed point arith, << 7 */
29 u8 modeswitch; /* Delay modeswitch until we have had at least one congestion event */
30 u8 ccount; /* Number of RTTs since last congestion event */
31 u8 undo_ccount;
32 u16 packetcount;
33 u32 minRTT;
34 u32 maxRTT;
35 u32 snd_cwnd_cnt2;
36
37 u32 undo_maxRTT;
38 u32 undo_old_maxB;
39
40 /* Bandwidth estimation */
41 u32 minB;
42 u32 maxB;
43 u32 old_maxB;
44 u32 Bi;
45 u32 lasttime;
46};
47
48static inline void htcp_reset(struct htcp *ca)
49{
50 ca->undo_ccount = ca->ccount;
51 ca->undo_maxRTT = ca->maxRTT;
52 ca->undo_old_maxB = ca->old_maxB;
53
54 ca->ccount = 0;
55 ca->snd_cwnd_cnt2 = 0;
56}
57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp)
59{
60 struct htcp *ca = tcp_ca(tp);
61 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65}
66
67static inline void measure_rtt(struct tcp_sock *tp)
68{
69 struct htcp *ca = tcp_ca(tp);
70 u32 srtt = tp->srtt>>3;
71
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */
73 if (ca->minRTT > srtt || !ca->minRTT)
74 ca->minRTT = srtt;
75
76 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
81 ca->maxRTT = srtt;
82 }
83}
84
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
86{
87 struct htcp *ca = tcp_ca(tp);
88 u32 now = tcp_time_stamp;
89
90 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0;
93 ca->lasttime = now;
94 return;
95 }
96
97 ca->packetcount += pkts_acked;
98
99 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
100 && now - ca->lasttime >= ca->minRTT
101 && ca->minRTT > 0) {
102 __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
103 if (ca->ccount <= 3) {
104 /* just after backoff */
105 ca->minB = ca->maxB = ca->Bi = cur_Bi;
106 } else {
107 ca->Bi = (3*ca->Bi + cur_Bi)/4;
108 if (ca->Bi > ca->maxB)
109 ca->maxB = ca->Bi;
110 if (ca->minB > ca->maxB)
111 ca->minB = ca->maxB;
112 }
113 ca->packetcount = 0;
114 ca->lasttime = now;
115 }
116}
117
118static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
119{
120 if (use_bandwidth_switch) {
121 u32 maxB = ca->maxB;
122 u32 old_maxB = ca->old_maxB;
123 ca->old_maxB = ca->maxB;
124
125 if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
126 ca->beta = BETA_MIN;
127 ca->modeswitch = 0;
128 return;
129 }
130 }
131
132 if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
133 ca->beta = (minRTT<<7)/maxRTT;
134 if (ca->beta < BETA_MIN)
135 ca->beta = BETA_MIN;
136 else if (ca->beta > BETA_MAX)
137 ca->beta = BETA_MAX;
138 } else {
139 ca->beta = BETA_MIN;
140 ca->modeswitch = 1;
141 }
142}
143
144static inline void htcp_alpha_update(struct htcp *ca)
145{
146 u32 minRTT = ca->minRTT;
147 u32 factor = 1;
148 u32 diff = ca->ccount * minRTT; /* time since last backoff */
149
150 if (diff > HZ) {
151 diff -= HZ;
152 factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ;
153 }
154
155 if (use_rtt_scaling && minRTT) {
156 u32 scale = (HZ<<3)/(10*minRTT);
157 scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
158 factor = (factor<<3)/scale;
159 if (!factor)
160 factor = 1;
161 }
162
163 ca->alpha = 2*factor*((1<<7)-ca->beta);
164 if (!ca->alpha)
165 ca->alpha = ALPHA_BASE;
166}
167
168/* After we have the rtt data to calculate beta, we'd still prefer to wait one
169 * rtt before we adjust our beta to ensure we are working from
170 * consistent data.
171 *
172 * This function should be called when we hit a congestion event since only at
173 * that point do we have a real sense of maxRTT (the queues en route
174 * were just getting too full).
175 */
176static void htcp_param_update(struct tcp_sock *tp)
177{
178 struct htcp *ca = tcp_ca(tp);
179 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT;
181
182 htcp_beta_update(ca, minRTT, maxRTT);
183 htcp_alpha_update(ca);
184
185 /* add slowly fading memory for maxRTT to accommodate routing changes etc */
186 if (minRTT > 0 && maxRTT > minRTT)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188}
189
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
191{
192 struct htcp *ca = tcp_ca(tp);
193 htcp_param_update(tp);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195}
196
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked)
199{
200 struct htcp *ca = tcp_ca(tp);
201
202 if (in_flight < tp->snd_cwnd)
203 return;
204
205 if (tp->snd_cwnd <= tp->snd_ssthresh) {
206 /* In "safe" area, increase. */
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++;
209 } else {
210 measure_rtt(tp);
211
212 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
214 ca->ccount++;
215 ca->snd_cwnd_cnt2 = 0;
216 htcp_alpha_update(ca);
217 }
218
219 /* In dangerous area, increase slowly.
220 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
221 */
222 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 tp->snd_cwnd_cnt = 0;
226 ca->ccount++;
227 }
228 }
229}
230
231/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp)
233{
234 return tp->snd_ssthresh;
235}
236
237
238static void htcp_init(struct tcp_sock *tp)
239{
240 struct htcp *ca = tcp_ca(tp);
241
242 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN;
245}
246
247static void htcp_state(struct tcp_sock *tp, u8 new_state)
248{
249 switch (new_state) {
250 case TCP_CA_CWR:
251 case TCP_CA_Recovery:
252 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp));
254 break;
255 }
256}
257
258static struct tcp_congestion_ops htcp = {
259 .init = htcp_init,
260 .ssthresh = htcp_recalc_ssthresh,
261 .min_cwnd = htcp_min_cwnd,
262 .cong_avoid = htcp_cong_avoid,
263 .set_state = htcp_state,
264 .undo_cwnd = htcp_cwnd_undo,
265 .pkts_acked = measure_achieved_throughput,
266 .owner = THIS_MODULE,
267 .name = "htcp",
268};
269
270static int __init htcp_register(void)
271{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL;
276 return tcp_register_congestion_control(&htcp);
277}
278
279static void __exit htcp_unregister(void)
280{
281 tcp_unregister_congestion_control(&htcp);
282}
283
284module_init(htcp_register);
285module_exit(htcp_unregister);
286
287MODULE_AUTHOR("Baruch Even");
288MODULE_LICENSE("GPL");
289MODULE_DESCRIPTION("H-TCP");
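
htcp_alpha_update() above keeps the increase Reno-like for the first second after a backoff and then grows it roughly as 1 + 10*dt + (dt/2)^2, with dt the elapsed time beyond one second; alpha is then 2*factor*(1-beta) in the <<7 fixed point. The sketch below plots that growth curve in floating point; it is illustrative only and ignores the module's RTT scaling and fixed-point details, with beta pinned at its minimum of 0.5.

#include <stdio.h>

/* growth factor as a function of seconds since the last backoff */
static double htcp_factor(double secs_since_backoff)
{
	double dt = secs_since_backoff - 1.0;

	if (dt <= 0.0)
		return 1.0;			/* Reno-like for the first second */
	return 1.0 + 10.0 * dt + (dt / 2.0) * (dt / 2.0);
}

int main(void)
{
	const double beta = 0.5;		/* BETA_MIN, i.e. 0.5 */
	double t[] = { 0.5, 1.0, 2.0, 5.0, 10.0 };

	for (int i = 0; i < 5; i++) {
		double factor = htcp_factor(t[i]);
		double alpha = 2.0 * factor * (1.0 - beta);

		printf("t=%5.1fs factor=%8.2f alpha=%8.2f segments/RTT\n",
		       t[i], factor, alpha);
	}
	return 0;
}
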
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 000000000000..13a66342c304
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
1/*
2 * TCP HYBLA
3 *
4 * TCP-HYBLA Congestion control algorithm, based on:
5 * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
6 * for Heterogeneous Networks",
7 * International Journal on satellite Communications,
8 * September 2004
9 * Daniele Lacamera
10 * root at danielinux.net
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <net/tcp.h>
16
17/* Tcp Hybla structure. */
18struct hybla {
19 u8 hybla_en;
20 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
21 u32 rho; /* Rho parameter, integer part */
22 u32 rho2; /* Rho * Rho, integer part */
23 u32 rho_3ls; /* Rho parameter, <<3 */
24 u32 rho2_7ls; /* Rho^2, <<7 */
25 u32 minrtt; /* Minimum smoothed round trip time value seen */
26};
27
28/* Hybla reference round trip time (default = 1/40 sec = 25 ms),
 29 expressed in milliseconds */
30static int rtt0 = 25;
31module_param(rtt0, int, 0644);
32MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
33
34
35/* This is called to refresh values for hybla parameters */
36static inline void hybla_recalc_param (struct tcp_sock *tp)
37{
38 struct hybla *ca = tcp_ca(tp);
39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
43 ca->rho2 = ca->rho2_7ls >>7;
44}
45
46static void hybla_init(struct tcp_sock *tp)
47{
48 struct hybla *ca = tcp_ca(tp);
49
50 ca->rho = 0;
51 ca->rho2 = 0;
52 ca->rho_3ls = 0;
53 ca->rho2_7ls = 0;
54 ca->snd_cwnd_cents = 0;
55 ca->hybla_en = 1;
56 tp->snd_cwnd = 2;
57 tp->snd_cwnd_clamp = 65535;
58
59 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp);
61
62 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho;
65}
66
67static void hybla_state(struct tcp_sock *tp, u8 ca_state)
68{
69 struct hybla *ca = tcp_ca(tp);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open);
72}
73
74static inline u32 hybla_fraction(u32 odds)
75{
76 static const u32 fractions[] = {
77 128, 139, 152, 165, 181, 197, 215, 234,
78 };
79
80 return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
81}
82
83/* TCP Hybla main routine.
84 * This is the algorithm behavior:
85 * o Recalc Hybla parameters if min_rtt has changed
86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1
88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
90 u32 in_flight, int flag)
91{
92 struct hybla *ca = tcp_ca(tp);
93 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0;
95
96 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){
98 hybla_recalc_param(tp);
99 ca->minrtt = tp->srtt;
100 }
101
102 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
104
105 if (in_flight < tp->snd_cwnd)
106 return;
107
108 if (ca->rho == 0)
109 hybla_recalc_param(tp);
110
111 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112
113 if (tp->snd_cwnd < tp->snd_ssthresh) {
114 /*
115 * slow start
116 * INC = 2^RHO - 1
117 * This is done by splitting the rho parameter
118 * into 2 parts: an integer part and a fraction part.
119 * Increment<<7 is estimated by doing:
120 * [2^(int+fract)]<<7
121 * that is equal to:
122 * (2^int) * [(2^fract) <<7]
123 * 2^int is computed directly as 1<<int,
124 * while hybla_fraction() is used to
125 * calculate 2^fract as a <<7 value.
126 */
127 is_slowstart = 1;
128 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
129 - 128;
130 } else {
131 /*
132 * congestion avoidance
133 * INC = RHO^2 / W
134 * since the increment is computed as (rho^2<<7)/window,
135 * it is already <<7 and we can easily count its fractions.
136 */
137 increment = ca->rho2_7ls / tp->snd_cwnd;
138 if (increment < 128)
139 tp->snd_cwnd_cnt++;
140 }
141
142 odd = increment % 128;
143 tp->snd_cwnd += increment >> 7;
144 ca->snd_cwnd_cents += odd;
145
146 /* check if the fraction part has reached >= 128 and increase cwnd by 1. */
147 while (ca->snd_cwnd_cents >= 128) {
148 tp->snd_cwnd++;
149 ca->snd_cwnd_cents -= 128;
150 tp->snd_cwnd_cnt = 0;
151 }
152
153 /* clamp down slowstart cwnd to ssthresh value. */
154 if (is_slowstart)
155 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
156
157 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
158}
159
160static struct tcp_congestion_ops tcp_hybla = {
161 .init = hybla_init,
162 .ssthresh = tcp_reno_ssthresh,
163 .min_cwnd = tcp_reno_min_cwnd,
164 .cong_avoid = hybla_cong_avoid,
165 .set_state = hybla_state,
166
167 .owner = THIS_MODULE,
168 .name = "hybla"
169};
170
171static int __init hybla_register(void)
172{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla);
175}
176
177static void __exit hybla_unregister(void)
178{
179 tcp_unregister_congestion_control(&tcp_hybla);
180}
181
182module_init(hybla_register);
183module_exit(hybla_unregister);
184
185MODULE_AUTHOR("Daniele Lacamera");
186MODULE_LICENSE("GPL");
187MODULE_DESCRIPTION("TCP Hybla");
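
The Hybla rules above reduce to: rho = max(RTT/RTT0, 1) with RTT0 = 25 ms by default, a slow-start increment of 2^rho - 1 per ack, and a congestion-avoidance increment of rho^2/cwnd per ack, with sub-segment fractions accumulated in snd_cwnd_cents. The following floating-point sketch of those per-ack increments is illustrative only (the kernel keeps rho in <<3 and the increments in <<7 fixed point); build it with -lm.

#include <math.h>
#include <stdio.h>

/* rho is the RTT normalized to the 25 ms reference, floored at 1 */
static double hybla_rho(double rtt_ms, double rtt0_ms)
{
	double rho = rtt_ms / rtt0_ms;

	return rho > 1.0 ? rho : 1.0;
}

int main(void)
{
	double rtt[] = { 25.0, 100.0, 300.0, 600.0 };	/* ms */
	double cwnd = 20.0;				/* arbitrary example window */

	for (int i = 0; i < 4; i++) {
		double rho = hybla_rho(rtt[i], 25.0);

		printf("RTT=%5.0fms rho=%4.1f  ss inc=%7.2f  ca inc=%5.2f (cwnd=%.0f)\n",
		       rtt[i], rho,
		       pow(2.0, rho) - 1.0,		/* slow start: 2^rho - 1 */
		       rho * rho / cwnd,		/* cong. avoid: rho^2 / W */
		       cwnd);
	}
	return 0;
}

At the reference RTT both increments collapse to Reno's behaviour (1 per ack in slow start, 1/cwnd in congestion avoidance); long-RTT paths get proportionally larger steps, which is the compensation Hybla is after.
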
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a3..7bbbbc33eb4b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission 61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found. 62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */ 64 */
66 65
67#include <linux/config.h> 66#include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE; 87int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto; 88int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93 90
94int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
95 92
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 93#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ 95#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
333 tp->snd_cwnd_stamp = tcp_time_stamp; 318 tp->snd_cwnd_stamp = tcp_time_stamp;
334} 319}
335 320
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */ 321/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{ 323{
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
558 tcp_grow_window(sk, tp, skb); 534 tcp_grow_window(sk, tp, skb);
559} 535}
560 536
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this 537/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were 538 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge 539 * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
606 * To save cycles in the RFC 1323 implementation it was better to break 543 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics 544 * it up into three procedures. -- erics
608 */ 545 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) 546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
610{ 547{
611 long m = mrtt; /* RTT */ 548 long m = mrtt; /* RTT */
612 549
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's 550 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev 551 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation. 552 * are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
670 tp->rtt_seq = tp->snd_nxt; 604 tp->rtt_seq = tp->snd_nxt;
671 } 605 }
672 606
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3); 607 if (tp->ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt);
674} 609}
675 610
676/* Calculate rto without backoff. This is the second half of Van Jacobson's 611/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
1185 tp->snd_una == tp->high_seq || 1120 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1122 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp)) 1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1124 tcp_ca_event(tp, CA_EVENT_FRTO);
1190 } 1125 }
1191 1126
1192 /* Have to clear retransmission markers here to keep the bookkeeping 1127 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
1252 tcp_set_ca_state(tp, TCP_CA_Loss); 1187 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark; 1188 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp); 1189 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257} 1190}
1258 1191
1259void tcp_clear_retrans(struct tcp_sock *tp) 1192void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1218 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1220 tcp_ca_event(tp, CA_EVENT_LOSS);
1287 } 1221 }
1288 tp->snd_cwnd = 1; 1222 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0; 1223 tp->snd_cwnd_cnt = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1596} 1530}
1597 1531
1598/* Decrease cwnd each second ack. */ 1532/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp) 1533static void tcp_cwnd_down(struct tcp_sock *tp)
1601{ 1534{
1602 int decr = tp->snd_cwnd_cnt + 1; 1535 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
1608 * in packets we use mss_cache). If sysctl_tcp_westwood is off
1609 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616 1536
1617 tp->snd_cwnd_cnt = decr&1; 1537 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1; 1538 decr >>= 1;
1619 1539
1620 if (decr && tp->snd_cwnd > limit) 1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
1621 tp->snd_cwnd -= decr; 1541 tp->snd_cwnd -= decr;
1622 1542
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{ 1575{
1656 if (tp->prior_ssthresh) { 1576 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp)) 1577 if (tp->ca_ops->undo_cwnd)
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); 1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
1659 else 1579 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661 1581
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1767 1687
1768static inline void tcp_complete_cwr(struct tcp_sock *tp) 1688static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{ 1689{
1770 if (tcp_westwood_cwnd(tp)) 1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp; 1691 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
1775} 1693}
1776 1694
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1946 if (tp->ca_state < TCP_CA_CWR) { 1864 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE)) 1865 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1866 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp); 1868 TCP_ECN_queue_cwr(tp);
1951 } 1869 }
1952 1870
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1963/* Read draft-ietf-tcplw-high-performance before mucking 1881/* Read draft-ietf-tcplw-high-performance before mucking
1964 * with this code. (Superceeds RFC1323) 1882 * with this code. (Superceeds RFC1323)
1965 */ 1883 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) 1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1967{ 1885{
1968 __u32 seq_rtt; 1886 __u32 seq_rtt;
1969 1887
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1983 * in window is lost... Voila. --ANK (010210) 1901 * in window is lost... Voila. --ANK (010210)
1984 */ 1902 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt); 1904 tcp_rtt_estimator(tp, seq_rtt, usrtt);
1987 tcp_set_rto(tp); 1905 tcp_set_rto(tp);
1988 tp->backoff = 0; 1906 tp->backoff = 0;
1989 tcp_bound_rto(tp); 1907 tcp_bound_rto(tp);
1990} 1908}
1991 1909
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) 1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
1993{ 1911{
1994 /* We don't have a timestamp. Can only use 1912 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine 1913 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
2003 if (flag & FLAG_RETRANS_DATA_ACKED) 1921 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return; 1922 return;
2005 1923
2006 tcp_rtt_estimator(tp, seq_rtt); 1924 tcp_rtt_estimator(tp, seq_rtt, usrtt);
2007 tcp_set_rto(tp); 1925 tcp_set_rto(tp);
2008 tp->backoff = 0; 1926 tp->backoff = 0;
2009 tcp_bound_rto(tp); 1927 tcp_bound_rto(tp);
2010} 1928}
2011 1929
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt) 1931 int flag, s32 seq_rtt, u32 *usrtt)
2014{ 1932{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag); 1935 tcp_ack_saw_tstamp(tp, usrtt, flag);
2018 else if (seq_rtt >= 0) 1936 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag); 1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
2020} 1938}
2021 1939
2022/* 1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
2023 * Compute congestion window to use. 1941 u32 in_flight, int good)
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 * Networks" in InfoComm 2004
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and congestion window is large
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037 /* orignal Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068 /* slow start amd linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083}
2084
2085/* This is Jacobson's slow start and congestion avoidance.
2086 * SIGCOMM '88, p. 328.
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{ 1942{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) { 1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp; 1944 tp->snd_cwnd_stamp = tcp_time_stamp;
2106} 1945}
2107 1946
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart timer after forward progress on connection. 1947/* Restart timer after forward progress on connection.
2339 * RFC2988 recommends to restart timer to now+rto. 1948 * RFC2988 recommends to restart timer to now+rto.
2340 */ 1949 */
@@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2415 2024
2416 2025
2417/* Remove acknowledged frames from the retransmission queue. */ 2026/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) 2027static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
2419{ 2028{
2420 struct tcp_sock *tp = tcp_sk(sk); 2029 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb; 2030 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp; 2031 __u32 now = tcp_time_stamp;
2423 int acked = 0; 2032 int acked = 0;
2424 __s32 seq_rtt = -1; 2033 __s32 seq_rtt = -1;
2034 struct timeval usnow;
2035 u32 pkts_acked = 0;
2036
2037 if (seq_usrtt)
2038 do_gettimeofday(&usnow);
2425 2039
2426 while ((skb = skb_peek(&sk->sk_write_queue)) && 2040 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) { 2041 skb != sk->sk_send_head) {
@@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2448 */ 2062 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) { 2063 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED; 2064 acked |= FLAG_DATA_ACKED;
2065 ++pkts_acked;
2451 } else { 2066 } else {
2452 acked |= FLAG_SYN_ACKED; 2067 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0; 2068 tp->retrans_stamp = 0;
@@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2461 seq_rtt = -1; 2076 seq_rtt = -1;
2462 } else if (seq_rtt < 0) 2077 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when; 2078 seq_rtt = now - scb->when;
2079 if (seq_usrtt)
2080 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
2081 + (usnow.tv_usec - skb->stamp.tv_usec);
2082
2464 if (sacked & TCPCB_SACKED_ACKED) 2083 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb); 2084 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST) 2085 if (sacked & TCPCB_LOST)
@@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2479 } 2098 }
2480 2099
2481 if (acked&FLAG_ACKED) { 2100 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt); 2101 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
2483 tcp_ack_packets_out(sk, tp); 2102 tcp_ack_packets_out(sk, tp);
2103
2104 if (tp->ca_ops->pkts_acked)
2105 tp->ca_ops->pkts_acked(tp, pkts_acked);
2484 } 2106 }
2485 2107
2486#if FASTRETRANS_DEBUG > 0 2108#if FASTRETRANS_DEBUG > 0
@@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2624 tp->frto_counter = (tp->frto_counter + 1) % 3; 2246 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625} 2247}
2626 2248
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We can't
2634 * get no information about RTTmin at this time so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
2636 * since in this way we're sure it will be updated in a consistent
2637 * way as soon as possible. It will reasonably happen within the first
2638 * RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coeffients.
2658 */
2659
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * A RTT-window has passed. Be careful since if RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was choosen since an estimation on small
2730 * time intervals is better to avoid...
2731 * Obvioulsy on a LAN we reasonably will always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates RTT evaluation window if it is the right moment to do
2744 * it. If so it calls filter for evaluating bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in fast path. In particular it is called when
2771 * header prediction is successfull. In such case infact update is
2772 * straight forward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something is going wrong..even if there could
2858 * be no problems! Infact a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such case.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */ 2249/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2250static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{ 2251{
@@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2255 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight; 2256 u32 prior_in_flight;
2886 s32 seq_rtt; 2257 s32 seq_rtt;
2258 s32 seq_usrtt = 0;
2887 int prior_packets; 2259 int prior_packets;
2888 2260
2889 /* If the ack is newer than sent or older than previous acks 2261 /* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2902 */ 2274 */
2903 tcp_update_wl(tp, ack, ack_seq); 2275 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack; 2276 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE; 2277 flag |= FLAG_WIN_UPDATE;
2907 2278
2279 tcp_ca_event(tp, CA_EVENT_FAST_ACK);
2280
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2281 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else { 2282 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 2283 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2293 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE; 2294 flag |= FLAG_ECE;
2922 2295
2923 tcp_westwood_slow_bw(sk,skb); 2296 tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
2924 } 2297 }
2925 2298
2926 /* We passed data and got it acked, remove any soft error 2299 /* We passed data and got it acked, remove any soft error
@@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2935 prior_in_flight = tcp_packets_in_flight(tp); 2308 prior_in_flight = tcp_packets_in_flight(tp);
2936 2309
2937 /* See if we can take anything off of the retransmit queue. */ 2310 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2311 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2312 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
2939 2313
2940 if (tp->frto_counter) 2314 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una); 2315 tcp_process_frto(sk, prior_snd_una);
2942 2316
2943 if (tcp_ack_is_dubious(tp, flag)) { 2317 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advanve CWND, if state allows this. */ 2318 /* Advanve CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) && 2319 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && 2320 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2321 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else { 2322 } else {
2951 if ((flag & FLAG_DATA_ACKED) && 2323 if ((flag & FLAG_DATA_ACKED))
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) 2324 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 } 2325 }
2955 2326
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2327 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4552 3923
4553 tcp_init_metrics(sk); 3924 tcp_init_metrics(sk);
4554 3925
3926 tcp_init_congestion_control(tp);
3927
4555 /* Prevent spurious tcp_cwnd_restart() on first data 3928 /* Prevent spurious tcp_cwnd_restart() on first data
4556 * packet. 3929 * packet.
4557 */ 3930 */
@@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4708 if(tp->af_specific->conn_request(sk, skb) < 0) 4081 if(tp->af_specific->conn_request(sk, skb) < 0)
4709 return 1; 4082 return 1;
4710 4083
4711 init_westwood(sk);
4712 init_bictcp(tp);
4713
4714 /* Now we have several options: In theory there is 4084 /* Now we have several options: In theory there is
4715 * nothing else in the frame. KA9Q has an option to 4085 * nothing else in the frame. KA9Q has an option to
4716 * send data with the syn, BSD accepts data with the 4086 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4732 goto discard; 4102 goto discard;
4733 4103
4734 case TCP_SYN_SENT: 4104 case TCP_SYN_SENT:
4735 init_westwood(sk);
4736 init_bictcp(tp);
4737
4738 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 4105 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4739 if (queued >= 0) 4106 if (queued >= 0)
4740 return queued; 4107 return queued;
@@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4816 */ 4183 */
4817 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4184 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4818 !tp->srtt) 4185 !tp->srtt)
4819 tcp_ack_saw_tstamp(tp, 0); 4186 tcp_ack_saw_tstamp(tp, 0, 0);
4820 4187
4821 if (tp->rx_opt.tstamp_ok) 4188 if (tp->rx_opt.tstamp_ok)
4822 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4189 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4828 4195
4829 tcp_init_metrics(sk); 4196 tcp_init_metrics(sk);
4830 4197
4198 tcp_init_congestion_control(tp);
4199
4831 /* Prevent spurious tcp_cwnd_restart() on 4200 /* Prevent spurious tcp_cwnd_restart() on
4832 * first data packet. 4201 * first data packet.
4833 */ 4202 */
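
The tcp_input.c changes above replace the per-algorithm branches (Westwood fast/slow bandwidth updates, Vegas checks, BIC cwnd computation) with calls through tp->ca_ops: ssthresh on loss, cong_avoid on acks, min_cwnd in tcp_cwnd_down(), plus the optional undo_cwnd, rtt_sample and pkts_acked hooks. The self-contained userspace mock below shows the shape of that ops-table dispatch; field names are simplified and the Reno numbers are arbitrary, so it is a sketch of the pattern rather than the kernel interface.

#include <stdio.h>

struct mock_tcp;

struct mock_ca_ops {
	const char *name;
	void (*cong_avoid)(struct mock_tcp *tp);
	unsigned int (*ssthresh)(struct mock_tcp *tp);
};

struct mock_tcp {
	unsigned int snd_cwnd, snd_ssthresh, snd_cwnd_cnt;
	const struct mock_ca_ops *ca_ops;
};

/* Reno-style callbacks, standing in for tcp_reno_* */
static void reno_cong_avoid(struct mock_tcp *tp)
{
	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		tp->snd_cwnd++;				/* slow start */
	} else if (++tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		tp->snd_cwnd++;				/* +1 per cwnd acks */
		tp->snd_cwnd_cnt = 0;
	}
}

static unsigned int reno_ssthresh(struct mock_tcp *tp)
{
	return tp->snd_cwnd / 2 > 2 ? tp->snd_cwnd / 2 : 2;
}

static const struct mock_ca_ops mock_reno = {
	.name		= "reno",
	.cong_avoid	= reno_cong_avoid,
	.ssthresh	= reno_ssthresh,
};

/* The ACK and loss paths only talk to the table, never to an algorithm. */
static void on_ack(struct mock_tcp *tp)
{
	tp->ca_ops->cong_avoid(tp);
}

static void on_loss(struct mock_tcp *tp)
{
	tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
	tp->snd_cwnd = 1;
}

int main(void)
{
	struct mock_tcp tp = { .snd_cwnd = 10, .snd_ssthresh = 8,
			       .ca_ops = &mock_reno };

	for (int i = 0; i < 20; i++)
		on_ack(&tp);
	printf("after 20 acks: cwnd=%u\n", tp.snd_cwnd);

	on_loss(&tp);
	printf("after loss (%s): cwnd=%u ssthresh=%u\n",
	       tp.ca_ops->name, tp.snd_cwnd, tp.snd_ssthresh);
	return 0;
}

Swapping in a different algorithm means pointing ca_ops at another table, which is exactly what the new tcp_register_congestion_control()/tcp_init_congestion_control() plumbing does for the modules added in this patch.
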
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2d41d5d6ad19..ebf112347a97 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2048 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache_std = tp->mss_cache = 536;
2049 2049
2050 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops;
2051 2052
2052 sk->sk_state = TCP_CLOSE; 2053 sk->sk_state = TCP_CLOSE;
2053 2054
@@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2070 2071
2071 tcp_clear_xmit_timers(sk); 2072 tcp_clear_xmit_timers(sk);
2072 2073
2074 tcp_cleanup_congestion_control(tp);
2075
2073 /* Cleanup up the write buffer. */ 2076 /* Cleanup up the write buffer. */
2074 sk_stream_writequeue_purge(sk); 2077 sk_stream_writequeue_purge(sk);
2075 2078
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b3943e7562f3..f42a284164b7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
774 newtp->frto_counter = 0; 774 newtp->frto_counter = 0;
775 newtp->frto_highmark = 0; 775 newtp->frto_highmark = 0;
776 776
777 newtp->ca_ops = &tcp_reno;
778
777 tcp_set_ca_state(newtp, TCP_CA_Open); 779 tcp_set_ca_state(newtp, TCP_CA_Open);
778 tcp_init_xmit_timers(newsk); 780 tcp_init_xmit_timers(newsk);
779 skb_queue_head_init(&newtp->out_of_order_queue); 781 skb_queue_head_init(&newtp->out_of_order_queue);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
842 if (newtp->ecn_flags&TCP_ECN_OK) 844 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND); 845 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844 846
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); 847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 } 848 }
849 return newsk; 849 return newsk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e337..0e17c244875c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 112 u32 cwnd = tp->snd_cwnd;
113 113
114 if (tcp_is_vegas(tp)) 114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
115 tcp_vegas_enable(tp);
116 115
117 tp->snd_ssthresh = tcp_current_ssthresh(tp); 116 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd); 117 restart_cwnd = min(restart_cwnd, cwnd);
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_WSCALE 0x2 279#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4 280#define SYSCTL_FLAG_SACK 0x4
282 281
282 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp);
285
283 sysctl_flags = 0; 286 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) { 287 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 288 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 307 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 } 308 }
306 309
307 /* 310 if (tcp_packets_in_flight(tp) == 0)
308 * If the connection is idle and we are restarting, 311 tcp_ca_event(tp, CA_EVENT_TX_START);
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318 312
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 313 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th; 314 skb->h.th = th;
@@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
521 * skbs, which it never sent before. --ANK 515 * skbs, which it never sent before. --ANK
522 */ 516 */
523 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 517 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
518 buff->stamp = skb->stamp;
524 519
525 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 520 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
526 tp->lost_out -= tcp_skb_pcount(skb); 521 tp->lost_out -= tcp_skb_pcount(skb);
@@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk)
1449 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 1444 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1450 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 1445 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451 tcp_initialize_rcv_mss(sk); 1446 tcp_initialize_rcv_mss(sk);
1452 tcp_ca_init(tp);
1453 1447
1454 tcp_select_initial_window(tcp_full_space(sk), 1448 tcp_select_initial_window(tcp_full_space(sk),
1455 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 1449 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk)
1503 TCP_SKB_CB(buff)->end_seq = tp->write_seq; 1497 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1504 tp->snd_nxt = tp->write_seq; 1498 tp->snd_nxt = tp->write_seq;
1505 tp->pushed_seq = tp->write_seq; 1499 tp->pushed_seq = tp->write_seq;
1506 tcp_ca_init(tp);
1507 1500
1508 /* Send it off. */ 1501 /* Send it off. */
1509 TCP_SKB_CB(buff)->when = tcp_time_stamp; 1502 TCP_SKB_CB(buff)->when = tcp_time_stamp;
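The tcp_output.c changes above replace the Vegas-specific restart logic with generic congestion-control events (CA_EVENT_CWND_RESTART, CA_EVENT_TX_START) delivered through an optional hook. A small userspace sketch of that pattern follows; struct sock_sketch and vegas_clean_slate are invented names, and only the event names mirror the patch.

/* Userspace sketch of the CA event hook: generic transmit code raises
 * events and a module that cares (Vegas, Westwood) supplies an optional
 * cwnd_event handler; modules that don't care leave the pointer NULL.
 */
#include <stdio.h>

enum tcp_ca_event { CA_EVENT_TX_START, CA_EVENT_CWND_RESTART };

struct sock_sketch {
        void (*cwnd_event)(struct sock_sketch *sk, enum tcp_ca_event ev);
        int vegas_clean_slate;
};

static void vegas_cwnd_event(struct sock_sketch *sk, enum tcp_ca_event ev)
{
        if (ev == CA_EVENT_TX_START || ev == CA_EVENT_CWND_RESTART)
                sk->vegas_clean_slate = 1;   /* restart RTT sampling */
}

static void tcp_ca_event(struct sock_sketch *sk, enum tcp_ca_event ev)
{
        if (sk->cwnd_event)                  /* hook is optional */
                sk->cwnd_event(sk, ev);
}

int main(void)
{
        struct sock_sketch sk = { .cwnd_event = vegas_cwnd_event };
        tcp_ca_event(&sk, CA_EVENT_TX_START);
        printf("vegas reset: %d\n", sk.vegas_clean_slate);  /* prints 1 */
        return 0;
}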
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000000..70e108e15c71
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
1/* Tom Kelly's Scalable TCP
2 *
3 * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
4 *
5 * John Heffner <jheffner@sc.edu>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <net/tcp.h>
11
12/* These factors are derived from the recommended values in the paper:
13 * .01 and 7/8. We use 50 instead of 100 to account for
14 * delayed ack.
15 */
16#define TCP_SCALABLE_AI_CNT 50U
17#define TCP_SCALABLE_MD_SCALE 3
18
19static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
20 u32 in_flight, int flag)
21{
22 if (in_flight < tp->snd_cwnd)
23 return;
24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) {
26 tp->snd_cwnd++;
27 } else {
28 tp->snd_cwnd_cnt++;
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 tp->snd_cwnd++;
31 tp->snd_cwnd_cnt = 0;
32 }
33 }
34 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
35 tp->snd_cwnd_stamp = tcp_time_stamp;
36}
37
38static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
39{
40 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
41}
42
43
44static struct tcp_congestion_ops tcp_scalable = {
45 .ssthresh = tcp_scalable_ssthresh,
46 .cong_avoid = tcp_scalable_cong_avoid,
47 .min_cwnd = tcp_reno_min_cwnd,
48
49 .owner = THIS_MODULE,
50 .name = "scalable",
51};
52
53static int __init tcp_scalable_register(void)
54{
55 return tcp_register_congestion_control(&tcp_scalable);
56}
57
58static void __exit tcp_scalable_unregister(void)
59{
60 tcp_unregister_congestion_control(&tcp_scalable);
61}
62
63module_init(tcp_scalable_register);
64module_exit(tcp_scalable_unregister);
65
66MODULE_AUTHOR("John Heffner");
67MODULE_LICENSE("GPL");
68MODULE_DESCRIPTION("Scalable TCP");
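A standalone check of the Scalable TCP arithmetic introduced above: the multiplicative decrease keeps cwnd - cwnd/8 (TCP_SCALABLE_MD_SCALE = 3) with a floor of 2 segments, and the additive increase adds roughly one segment per min(cwnd, TCP_SCALABLE_AI_CNT) ACKs. The function name scalable_ssthresh and the sample values below are illustrative only.

/* Userspace verification of the ssthresh and AI-counter arithmetic. */
#include <stdio.h>

#define AI_CNT   50u
#define MD_SCALE 3

static unsigned scalable_ssthresh(unsigned cwnd)
{
        unsigned s = cwnd - (cwnd >> MD_SCALE);   /* cwnd * 7/8 */
        return s > 2 ? s : 2;
}

int main(void)
{
        printf("cwnd 100 -> ssthresh %u\n", scalable_ssthresh(100)); /* 88 */
        printf("cwnd   2 -> ssthresh %u\n", scalable_ssthresh(2));   /* 2  */

        /* once cwnd exceeds 50, the increment interval is capped at 50 acks */
        unsigned cwnd = 200, acks = cwnd < AI_CNT ? cwnd : AI_CNT;
        printf("acks per increment at cwnd %u: %u\n", cwnd, acks);
        return 0;
}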
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000000..9bd443db5193
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 */
33
34#include <linux/config.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37#include <linux/skbuff.h>
38#include <linux/tcp_diag.h>
39
40#include <net/tcp.h>
41
42/* Default values of the Vegas variables, in fixed-point representation
43 * with V_PARAM_SHIFT bits to the right of the binary point.
44 */
45#define V_PARAM_SHIFT 1
46static int alpha = 1<<V_PARAM_SHIFT;
47static int beta = 3<<V_PARAM_SHIFT;
48static int gamma = 1<<V_PARAM_SHIFT;
49
50module_param(alpha, int, 0644);
51MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
52module_param(beta, int, 0644);
53MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
54module_param(gamma, int, 0644);
55MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
56
57
58/* Vegas variables */
59struct vegas {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now;/* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67};
68
69/* There are several situations when we must "re-start" Vegas:
70 *
71 * o when a connection is established
72 * o after an RTO
73 * o after fast recovery
74 * o when we send a packet and there is no outstanding
75 * unacknowledged data (restarting an idle connection)
76 *
77 * In these circumstances we cannot do a Vegas calculation at the
78 * end of the first RTT, because any calculation we do is using
79 * stale info -- both the saved cwnd and congestion feedback are
80 * stale.
81 *
82 * Instead we must wait until the completion of an RTT during
83 * which we actually receive ACKs.
84 */
85static inline void vegas_enable(struct tcp_sock *tp)
86{
87 struct vegas *vegas = tcp_ca(tp);
88
89 /* Begin taking Vegas samples next time we send something. */
90 vegas->doing_vegas_now = 1;
91
92 /* Set the beginning of the next send window. */
93 vegas->beg_snd_nxt = tp->snd_nxt;
94
95 vegas->cntRTT = 0;
96 vegas->minRTT = 0x7fffffff;
97}
98
99/* Stop taking Vegas samples for now. */
100static inline void vegas_disable(struct tcp_sock *tp)
101{
102 struct vegas *vegas = tcp_ca(tp);
103
104 vegas->doing_vegas_now = 0;
105}
106
107static void tcp_vegas_init(struct tcp_sock *tp)
108{
109 struct vegas *vegas = tcp_ca(tp);
110
111 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(tp);
113}
114
115/* Do RTT sampling needed for Vegas.
116 * Basically we:
117 * o min-filter RTT samples from within an RTT to get the current
118 * propagation delay + queuing delay (we are min-filtering to try to
119 * avoid the effects of delayed ACKs)
120 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT)
122 */
123static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
124{
125 struct vegas *vegas = tcp_ca(tp);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
127
128 /* Filter to find propagation delay: */
129 if (vrtt < vegas->baseRTT)
130 vegas->baseRTT = vrtt;
131
132 /* Find the min RTT during the last RTT to find
133 * the current prop. delay + queuing delay:
134 */
135 vegas->minRTT = min(vegas->minRTT, vrtt);
136 vegas->cntRTT++;
137}
138
139static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
140{
141
142 if (ca_state == TCP_CA_Open)
143 vegas_enable(tp);
144 else
145 vegas_disable(tp);
146}
147
148/*
149 * If the connection is idle and we are restarting,
150 * then we don't want to do any Vegas calculations
151 * until we get fresh RTT samples. So when we
152 * restart, we reset our Vegas state to a clean
153 * slate. After we get acks for this flight of
154 * packets, _then_ we can make Vegas calculations
155 * again.
156 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
158{
159 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp);
162}
163
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag)
166{
167 struct vegas *vegas = tcp_ca(tp);
168
169 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
171
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 *
174 * These are so named because they represent the approximate values
175 * of snd_una and snd_nxt at the beginning of the current RTT. More
176 * precisely, they represent the amount of data sent during the RTT.
177 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
178 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
179 * bytes of data have been ACKed during the course of the RTT, giving
180 * an "actual" rate of:
181 *
182 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
183 *
184 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
185 * because delayed ACKs can cover more than one segment, so they
186 * don't line up nicely with the boundaries of RTTs.
187 *
188 * Another unfortunate fact of life is that delayed ACKs delay the
189 * advance of the left edge of our send window, so that the number
190 * of bytes we send in an RTT is often less than our cwnd will allow.
191 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
192 */
193
194 if (after(ack, vegas->beg_snd_nxt)) {
195 /* Do the Vegas once-per-RTT cwnd adjustment. */
196 u32 old_wnd, old_snd_cwnd;
197
198
199 /* Here old_wnd is essentially the window of data that was
200 * sent during the previous RTT, and has all
201 * been acknowledged in the course of the RTT that ended
202 * with the ACK we just received. Likewise, old_snd_cwnd
203 * is the cwnd during the previous RTT.
204 */
205 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
206 tp->mss_cache;
207 old_snd_cwnd = vegas->beg_snd_cwnd;
208
209 /* Save the extent of the current window so we can use this
210 * at the end of the next RTT.
211 */
212 vegas->beg_snd_una = vegas->beg_snd_nxt;
213 vegas->beg_snd_nxt = tp->snd_nxt;
214 vegas->beg_snd_cwnd = tp->snd_cwnd;
215
216 /* Take into account the current RTT sample too, to
217 * decrease the impact of delayed acks. This double counts
218 * this sample since we count it for the next window as well,
219 * but that's not too awful, since we're taking the min,
220 * rather than averaging.
221 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000);
223
224 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got
226 * at least one RTT sample that wasn't from a delayed ACK.
227 * If we only had 2 samples total,
228 * then that means we're getting only 1 ACK per RTT, which
229 * means they're almost certainly delayed ACKs.
230 * If we have 3 samples, we should be OK.
231 */
232
233 if (vegas->cntRTT <= 2) {
234 /* We don't have enough RTT samples to do the Vegas
235 * calculation, so we'll behave like Reno.
236 */
237 if (tp->snd_cwnd > tp->snd_ssthresh)
238 tp->snd_cwnd++;
239 } else {
240 u32 rtt, target_cwnd, diff;
241
242 /* We have enough RTT samples, so, using the Vegas
243 * algorithm, we determine if we should increase or
244 * decrease cwnd, and by how much.
245 */
246
247 /* Pluck out the RTT we are using for the Vegas
248 * calculations. This is the min RTT seen during the
249 * last RTT. Taking the min filters out the effects
250 * of delayed ACKs, at the cost of noticing congestion
251 * a bit later.
252 */
253 rtt = vegas->minRTT;
254
255 /* Calculate the cwnd we should have, if we weren't
256 * going too fast.
257 *
258 * This is:
259 * (actual rate in segments) * baseRTT
260 * We keep it as a fixed point number with
261 * V_PARAM_SHIFT bits to the right of the binary point.
262 */
263 target_cwnd = ((old_wnd * vegas->baseRTT)
264 << V_PARAM_SHIFT) / rtt;
265
266 /* Calculate the difference between the window we had,
267 * and the window we would like to have. This quantity
268 * is the "Diff" from the Arizona Vegas papers.
269 *
270 * Again, this is a fixed point number with
271 * V_PARAM_SHIFT bits to the right of the binary
272 * point.
273 */
274 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
275
276 if (tp->snd_cwnd < tp->snd_ssthresh) {
277 /* Slow start. */
278 if (diff > gamma) {
279 /* Going too fast. Time to slow down
280 * and switch to congestion avoidance.
281 */
282 tp->snd_ssthresh = 2;
283
284 /* Set cwnd to match the actual rate
285 * exactly:
286 * cwnd = (actual rate) * baseRTT
287 * Then we add 1 because the integer
288 * truncation robs us of full link
289 * utilization.
290 */
291 tp->snd_cwnd = min(tp->snd_cwnd,
292 (target_cwnd >>
293 V_PARAM_SHIFT)+1);
294
295 }
296 } else {
297 /* Congestion avoidance. */
298 u32 next_snd_cwnd;
299
300 /* Figure out where we would like cwnd
301 * to be.
302 */
303 if (diff > beta) {
304 /* The old window was too fast, so
305 * we slow down.
306 */
307 next_snd_cwnd = old_snd_cwnd - 1;
308 } else if (diff < alpha) {
309 /* We don't have enough extra packets
310 * in the network, so speed up.
311 */
312 next_snd_cwnd = old_snd_cwnd + 1;
313 } else {
314 /* Sending just as fast as we
315 * should be.
316 */
317 next_snd_cwnd = old_snd_cwnd;
318 }
319
320 /* Adjust cwnd upward or downward, toward the
321 * desired value.
322 */
323 if (next_snd_cwnd > tp->snd_cwnd)
324 tp->snd_cwnd++;
325 else if (next_snd_cwnd < tp->snd_cwnd)
326 tp->snd_cwnd--;
327 }
328 }
329
330 /* Wipe the slate clean for the next RTT. */
331 vegas->cntRTT = 0;
332 vegas->minRTT = 0x7fffffff;
333 }
334
335 /* The following code is executed for every ack we receive,
336 * except for conditions checked in should_advance_cwnd()
337 * before the call to tcp_cong_avoid(). Mainly this means that
338 * we only execute this code if the ack actually acked some
339 * data.
340 */
341
342 /* If we are in slow start, increase our cwnd in response to this ACK.
343 * (If we are not in slow start then we are in congestion avoidance,
344 * and adjust our congestion window only once per RTT. See the code
345 * above.)
346 */
347 if (tp->snd_cwnd <= tp->snd_ssthresh)
348 tp->snd_cwnd++;
349
350 /* to keep cwnd from growing without bound */
351 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
352
353 /* Make sure that we are never so timid as to reduce our cwnd below
354 * 2 MSS.
355 *
356 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
357 */
358 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
359}
360
361/* Extract info for Tcp socket info provided via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
363 struct sk_buff *skb)
364{
365 const struct vegas *ca = tcp_ca(tp);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
367 struct tcpvegas_info *info;
368
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
370 sizeof(*info)));
371
372 info->tcpv_enabled = ca->doing_vegas_now;
373 info->tcpv_rttcnt = ca->cntRTT;
374 info->tcpv_rtt = ca->baseRTT;
375 info->tcpv_minrtt = ca->minRTT;
376 rtattr_failure: ;
377 }
378}
379
380static struct tcp_congestion_ops tcp_vegas = {
381 .init = tcp_vegas_init,
382 .ssthresh = tcp_reno_ssthresh,
383 .cong_avoid = tcp_vegas_cong_avoid,
384 .min_cwnd = tcp_reno_min_cwnd,
385 .rtt_sample = tcp_vegas_rtt_calc,
386 .set_state = tcp_vegas_state,
387 .cwnd_event = tcp_vegas_cwnd_event,
388 .get_info = tcp_vegas_get_info,
389
390 .owner = THIS_MODULE,
391 .name = "vegas",
392};
393
394static int __init tcp_vegas_register(void)
395{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas);
398 return 0;
399}
400
401static void __exit tcp_vegas_unregister(void)
402{
403 tcp_unregister_congestion_control(&tcp_vegas);
404}
405
406module_init(tcp_vegas_register);
407module_exit(tcp_vegas_unregister);
408
409MODULE_AUTHOR("Stephen Hemminger");
410MODULE_LICENSE("GPL");
411MODULE_DESCRIPTION("TCP Vegas");
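A worked example of the once-per-RTT Vegas decision in tcp_vegas_cong_avoid() above, using the same V_PARAM_SHIFT = 1 fixed point. The window and RTT numbers are made up purely to show the arithmetic.

/* Standalone walk-through of target_cwnd and diff, Q1 fixed point. */
#include <stdio.h>

#define V_PARAM_SHIFT 1
static const unsigned alpha = 1 << V_PARAM_SHIFT;   /* 2 == 1 segment  */
static const unsigned beta  = 3 << V_PARAM_SHIFT;   /* 6 == 3 segments */

int main(void)
{
        unsigned old_wnd  = 20;      /* segments sent in the last RTT   */
        unsigned base_rtt = 100000;  /* usec, long-term minimum         */
        unsigned rtt      = 125000;  /* usec, min seen during last RTT  */

        unsigned target = ((old_wnd * base_rtt) << V_PARAM_SHIFT) / rtt;
        unsigned diff   = (old_wnd << V_PARAM_SHIFT) - target;

        /* target = 20*100000*2/125000 = 32  (i.e. 16 segments)
         * diff   = 40 - 32            = 8   (i.e. 4 extra segments queued) */
        printf("target=%u diff=%u\n", target, diff);

        if (diff > beta)
                puts("too many packets queued -> snd_cwnd - 1");
        else if (diff < alpha)
                puts("too few packets queued -> snd_cwnd + 1");
        else
                puts("within [alpha, beta] -> hold cwnd");
        return 0;
}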
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 000000000000..ef827242c940
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,259 @@
1/*
2 * TCP Westwood+
3 *
4 * Angelo Dell'Aera: TCP Westwood+ support
5 */
6
7#include <linux/config.h>
8#include <linux/mm.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h>
12#include <net/tcp.h>
13
14/* TCP Westwood structure */
15struct westwood {
16 u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
17 u32 bw_est; /* bandwidth estimate */
18 u32 rtt_win_sx; /* here starts a new evaluation... */
19 u32 bk;
20 u32 snd_una; /* used for evaluating the number of acked bytes */
21 u32 cumul_ack;
22 u32 accounted;
23 u32 rtt;
24 u32 rtt_min; /* minimum observed RTT */
25};
26
27
28/* TCP Westwood functions and constants */
29#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
30#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
31
32/*
33 * @tcp_westwood_init
34 * This function initializes fields used in TCP Westwood+,
35 * it is called after the initial SYN, so the sequence numbers
36 * are correct, but for new passive connections we have no
37 * information about RTTmin at this time, so we simply set it to
38 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be conservative,
39 * since in this way we're sure it will be updated in a consistent
40 * way as soon as possible. It will reasonably happen within the first
41 * RTT period of the connection lifetime.
42 */
43static void tcp_westwood_init(struct tcp_sock *tp)
44{
45 struct westwood *w = tcp_ca(tp);
46
47 w->bk = 0;
48 w->bw_ns_est = 0;
49 w->bw_est = 0;
50 w->accounted = 0;
51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una;
55}
56
57/*
58 * @westwood_do_filter
59 * Low-pass filter. Implemented using constant coefficients.
60 */
61static inline u32 westwood_do_filter(u32 a, u32 b)
62{
63 return (((7 * a) + b) >> 3);
64}
65
66static inline void westwood_filter(struct westwood *w, u32 delta)
67{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
70}
71
72/*
73 * @westwood_pkts_acked
74 * Called after processing a group of packets,
75 * but all Westwood needs is the last sample of srtt.
76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
78{
79 struct westwood *w = tcp_ca(tp);
80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3;
82}
83
84/*
85 * @westwood_update_window
86 * It updates RTT evaluation window if it is the right moment to do
87 * it. If so it calls filter for evaluating bandwidth.
88 */
89static void westwood_update_window(struct tcp_sock *tp)
90{
91 struct westwood *w = tcp_ca(tp);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93
94 /*
95 * See if a RTT-window has passed.
96 * Be careful since if RTT is less than
97 * 50ms we don't filter but we continue 'building the sample'.
98 * This minimum limit was chosen since estimating over very small
99 * time intervals is better avoided...
100 * Obviously on a LAN we reasonably will always have
101 * right_bound = left_bound + WESTWOOD_RTT_MIN
102 */
103 if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
104 westwood_filter(w, delta);
105
106 w->bk = 0;
107 w->rtt_win_sx = tcp_time_stamp;
108 }
109}
110
111/*
112 * @westwood_fast_bw
113 * It is called when we are in fast path. In particular it is called when
114 * header prediction is successful. In that case the update is
115 * straightforward and doesn't need any particular care.
116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp)
118{
119 struct westwood *w = tcp_ca(tp);
120
121 westwood_update_window(tp);
122
123 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una;
125 w->rtt_min = min(w->rtt, w->rtt_min);
126}
127
128/*
129 * @westwood_acked_count
130 * This function evaluates cumul_ack for evaluating bk in case of
131 * delayed or partial acks.
132 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp)
134{
135 struct westwood *w = tcp_ca(tp);
136
137 w->cumul_ack = tp->snd_una - w->snd_una;
138
139 /* If cumul_ack is 0 this is a dupack since it's not moving
140 * tp->snd_una.
141 */
142 if (!w->cumul_ack) {
143 w->accounted += tp->mss_cache;
144 w->cumul_ack = tp->mss_cache;
145 }
146
147 if (w->cumul_ack > tp->mss_cache) {
148 /* Partial or delayed ack */
149 if (w->accounted >= w->cumul_ack) {
150 w->accounted -= w->cumul_ack;
151 w->cumul_ack = tp->mss_cache;
152 } else {
153 w->cumul_ack -= w->accounted;
154 w->accounted = 0;
155 }
156 }
157
158 w->snd_una = tp->snd_una;
159
160 return w->cumul_ack;
161}
162
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
164{
165 struct westwood *w = tcp_ca(tp);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167}
168
169/*
170 * TCP Westwood
171 * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
172 * in packets we use mss_cache). The result is clamped to be >= 2,
173 * so this never returns 0.
174 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
176{
177 return westwood_bw_rttmin(tp);
178}
179
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
181{
182 struct westwood *w = tcp_ca(tp);
183
184 switch(event) {
185 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp);
187 break;
188
189 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
191 break;
192
193 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp);
195 break;
196
197 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp);
199 w->bk += westwood_acked_count(tp);
200 w->rtt_min = min(w->rtt, w->rtt_min);
201 break;
202
203 default:
204 /* don't care */
205 break;
206 }
207}
208
209
210/* Extract info for Tcp socket info provided via netlink. */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
212 struct sk_buff *skb)
213{
214 const struct westwood *ca = tcp_ca(tp);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
216 struct rtattr *rta;
217 struct tcpvegas_info *info;
218
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0;
223 info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
224 info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
225 rtattr_failure: ;
226 }
227}
228
229
230static struct tcp_congestion_ops tcp_westwood = {
231 .init = tcp_westwood_init,
232 .ssthresh = tcp_reno_ssthresh,
233 .cong_avoid = tcp_reno_cong_avoid,
234 .min_cwnd = tcp_westwood_cwnd_min,
235 .cwnd_event = tcp_westwood_event,
236 .get_info = tcp_westwood_info,
237 .pkts_acked = tcp_westwood_pkts_acked,
238
239 .owner = THIS_MODULE,
240 .name = "westwood"
241};
242
243static int __init tcp_westwood_register(void)
244{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood);
247}
248
249static void __exit tcp_westwood_unregister(void)
250{
251 tcp_unregister_congestion_control(&tcp_westwood);
252}
253
254module_init(tcp_westwood_register);
255module_exit(tcp_westwood_unregister);
256
257MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
258MODULE_LICENSE("GPL");
259MODULE_DESCRIPTION("TCP Westwood+");
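The bandwidth estimator above is a two-stage low-pass filter with gain 1/8: bw_ns_est tracks the raw per-window sample bk/delta, and bw_est smooths bw_ns_est again. The standalone demonstration below mirrors westwood_do_filter(); the samples are made up.

/* EWMA with constant coefficients: new = 7/8 * old + 1/8 * sample. */
#include <stdio.h>

static unsigned do_filter(unsigned a, unsigned b)
{
        return (7 * a + b) >> 3;
}

int main(void)
{
        unsigned bw_ns_est = 0, bw_est = 0;
        unsigned samples[] = { 800, 800, 800, 800, 800, 800, 800, 800 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                bw_ns_est = do_filter(bw_ns_est, samples[i]);
                bw_est    = do_filter(bw_est, bw_ns_est);
                printf("sample %u: bw_ns_est=%u bw_est=%u\n",
                       i, bw_ns_est, bw_est);
        }
        /* both estimates converge toward 800, bw_est lagging behind */
        return 0;
}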
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 14f5c53235fe..a54d4ef3fd35 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -57,6 +57,7 @@
57#endif 57#endif
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/notifier.h> 59#include <linux/notifier.h>
60#include <linux/string.h>
60 61
61#include <net/sock.h> 62#include <net/sock.h>
62#include <net/snmp.h> 63#include <net/snmp.h>
@@ -3437,7 +3438,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf
3437 * by sysctl and we wouldn't want anyone to change it under our feet 3438 * by sysctl and we wouldn't want anyone to change it under our feet
3438 * (see SIOCSIFNAME). 3439 * (see SIOCSIFNAME).
3439 */ 3440 */
3440 dev_name = net_sysctl_strdup(dev_name); 3441 dev_name = kstrdup(dev_name, GFP_KERNEL);
3441 if (!dev_name) 3442 if (!dev_name)
3442 goto free; 3443 goto free;
3443 3444
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2414937f2a83..9dac7fdf4726 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk)
2025 sk->sk_state = TCP_CLOSE; 2025 sk->sk_state = TCP_CLOSE;
2026 2026
2027 tp->af_specific = &ipv6_specific; 2027 tp->af_specific = &ipv6_specific;
2028 2028 tp->ca_ops = &tcp_init_congestion_ops;
2029 sk->sk_write_space = sk_stream_write_space; 2029 sk->sk_write_space = sk_stream_write_space;
2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2031 2031
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a17..dada34a77b21 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
138 138
139 _debug("### End Work"); 139 _debug("### End Work");
140 140
141 try_to_freeze(PF_FREEZE); 141 try_to_freeze();
142 142
143 /* discard pending signals */ 143 /* discard pending signals */
144 rxrpc_discard_my_signals(); 144 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d9228..1aadd026d354 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
107 107
108 _debug("### End Inbound Calls"); 108 _debug("### End Inbound Calls");
109 109
110 try_to_freeze(PF_FREEZE); 110 try_to_freeze();
111 111
112 /* discard pending signals */ 112 /* discard pending signals */
113 rxrpc_discard_my_signals(); 113 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290bb..3ac81cdd1211 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
90 complete_and_exit(&krxtimod_dead, 0); 90 complete_and_exit(&krxtimod_dead, 0);
91 } 91 }
92 92
93 try_to_freeze(PF_FREEZE); 93 try_to_freeze();
94 94
95 /* discard pending signals */ 95 /* discard pending signals */
96 rxrpc_discard_my_signals(); 96 rxrpc_discard_my_signals();
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b22c9beb604d..7bac249258e3 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -449,6 +449,19 @@ config NET_EMATCH_META
449 To compile this code as a module, choose M here: the 449 To compile this code as a module, choose M here: the
450 module will be called em_meta. 450 module will be called em_meta.
451 451
452config NET_EMATCH_TEXT
453 tristate "Textsearch"
454 depends on NET_EMATCH
455 select TEXTSEARCH
456 select TEXTSEARCH_KMP
457 select TEXTSEARCH_FSM
458 ---help---
459 Say Y here if you want to be able to classify packets based on
460 textsearch comparisons.
461
462 To compile this code as a module, choose M here: the
463 module will be called em_text.
464
452config NET_CLS_ACT 465config NET_CLS_ACT
453 bool "Packet ACTION" 466 bool "Packet ACTION"
454 depends on EXPERIMENTAL && NET_CLS && NET_QOS 467 depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eba8..8f58cecd6266 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o 40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o 41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o 42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
43obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000000..873840d8d072
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
1/*
2 * net/sched/em_text.c Textsearch ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/skbuff.h>
19#include <linux/textsearch.h>
20#include <linux/tc_ematch/tc_em_text.h>
21#include <net/pkt_cls.h>
22
23struct text_match
24{
25 u16 from_offset;
26 u16 to_offset;
27 u8 from_layer;
28 u8 to_layer;
29 struct ts_config *config;
30};
31
32#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
33
34static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
35 struct tcf_pkt_info *info)
36{
37 struct text_match *tm = EM_TEXT_PRIV(m);
38 int from, to;
39 struct ts_state state;
40
41 from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
42 from += tm->from_offset;
43
44 to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
45 to += tm->to_offset;
46
47 return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
48}
49
50static int em_text_change(struct tcf_proto *tp, void *data, int len,
51 struct tcf_ematch *m)
52{
53 struct text_match *tm;
54 struct tcf_em_text *conf = data;
55 struct ts_config *ts_conf;
56 int flags = 0;
57
58 printk("Configuring text: %s from %d:%d to %d:%d len %d\n", conf->algo, conf->from_offset,
59 conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len);
60
61 if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
62 return -EINVAL;
63
64 if (conf->from_layer > conf->to_layer)
65 return -EINVAL;
66
67 if (conf->from_layer == conf->to_layer &&
68 conf->from_offset > conf->to_offset)
69 return -EINVAL;
70
71retry:
72 ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
73 conf->pattern_len, GFP_KERNEL, flags);
74
75 if (flags & TS_AUTOLOAD)
76 rtnl_lock();
77
78 if (IS_ERR(ts_conf)) {
79 if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
80 rtnl_unlock();
81 flags |= TS_AUTOLOAD;
82 goto retry;
83 } else
84 return PTR_ERR(ts_conf);
85 } else if (flags & TS_AUTOLOAD) {
86 textsearch_destroy(ts_conf);
87 return -EAGAIN;
88 }
89
90 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
91 if (tm == NULL) {
92 textsearch_destroy(ts_conf);
93 return -ENOBUFS;
94 }
95
96 tm->from_offset = conf->from_offset;
97 tm->to_offset = conf->to_offset;
98 tm->from_layer = conf->from_layer;
99 tm->to_layer = conf->to_layer;
100 tm->config = ts_conf;
101
102 m->datalen = sizeof(*tm);
103 m->data = (unsigned long) tm;
104
105 return 0;
106}
107
108static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
109{
110 textsearch_destroy(EM_TEXT_PRIV(m)->config);
111}
112
113static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
114{
115 struct text_match *tm = EM_TEXT_PRIV(m);
116 struct tcf_em_text conf;
117
118 strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
119 conf.from_offset = tm->from_offset;
120 conf.to_offset = tm->to_offset;
121 conf.from_layer = tm->from_layer;
122 conf.to_layer = tm->to_layer;
123 conf.pattern_len = textsearch_get_pattern_len(tm->config);
124 conf.pad = 0;
125
126 RTA_PUT_NOHDR(skb, sizeof(conf), &conf);
127 RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));
128 return 0;
129
130rtattr_failure:
131 return -1;
132}
133
134static struct tcf_ematch_ops em_text_ops = {
135 .kind = TCF_EM_TEXT,
136 .change = em_text_change,
137 .match = em_text_match,
138 .destroy = em_text_destroy,
139 .dump = em_text_dump,
140 .owner = THIS_MODULE,
141 .link = LIST_HEAD_INIT(em_text_ops.link)
142};
143
144static int __init init_em_text(void)
145{
146 return tcf_em_register(&em_text_ops);
147}
148
149static void __exit exit_em_text(void)
150{
151 tcf_em_unregister(&em_text_ops);
152}
153
154MODULE_LICENSE("GPL");
155
156module_init(init_em_text);
157module_exit(exit_em_text);
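For reference, this is what em_text_match() above asks the textsearch core to do: scan a byte window [from, to) of the packet for a pattern and report a miss with a sentinel (skb_find_text() returns UINT_MAX). The naive scan below is only a userspace stand-in for the KMP/FSM searchers selected by name; find_text() and the sample packet are invented for illustration.

/* Naive window-bounded substring search with a UINT_MAX "no match" code. */
#include <limits.h>
#include <stdio.h>
#include <string.h>

static unsigned find_text(const unsigned char *data, unsigned from,
                          unsigned to, const char *pattern)
{
        unsigned plen = strlen(pattern);

        for (unsigned i = from; i + plen <= to; i++)
                if (!memcmp(data + i, pattern, plen))
                        return i;            /* offset of first match */
        return UINT_MAX;                     /* no match in the window */
}

int main(void)
{
        const unsigned char pkt[] = "GET /index.html HTTP/1.0\r\n";
        unsigned off = find_text(pkt, 0, sizeof(pkt) - 1, "index");

        printf(off != UINT_MAX ? "match at %u\n" : "no match (%u)\n", off);
        return 0;
}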
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 058189684c7c..86073df418f5 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
92 sctp_cmd_seq_t *commands); 92 sctp_cmd_seq_t *commands);
93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); 93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
94 94
95static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
96 __u16 error,
97 const struct sctp_association *asoc,
98 struct sctp_transport *transport);
99
100static sctp_disposition_t sctp_sf_violation_chunklen(
101 const struct sctp_endpoint *ep,
102 const struct sctp_association *asoc,
103 const sctp_subtype_t type,
104 void *arg,
105 sctp_cmd_seq_t *commands);
95 106
96/* Small helper function that checks if the chunk length 107/* Small helper function that checks if the chunk length
97 * is of the appropriate length. The 'required_length' argument 108 * is of the appropriate length. The 'required_length' argument
@@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
2328 * 2339 *
2329 * This is common code called by several sctp_sf_*_abort() functions above. 2340 * This is common code called by several sctp_sf_*_abort() functions above.
2330 */ 2341 */
2331sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, 2342static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
2332 __u16 error, 2343 __u16 error,
2333 const struct sctp_association *asoc, 2344 const struct sctp_association *asoc,
2334 struct sctp_transport *transport) 2345 struct sctp_transport *transport)
@@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
3687 * 3698 *
3688 * Generate an ABORT chunk and terminate the association. 3699 * Generate an ABORT chunk and terminate the association.
3689 */ 3700 */
3690sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, 3701static sctp_disposition_t sctp_sf_violation_chunklen(
3702 const struct sctp_endpoint *ep,
3691 const struct sctp_association *asoc, 3703 const struct sctp_association *asoc,
3692 const sctp_subtype_t type, 3704 const sctp_subtype_t type,
3693 void *arg, 3705 void *arg,
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 32e8acbc60fe..62a073495276 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -41,6 +41,7 @@ EXPORT_SYMBOL(rpc_release_task);
41 41
42/* RPC client functions */ 42/* RPC client functions */
43EXPORT_SYMBOL(rpc_create_client); 43EXPORT_SYMBOL(rpc_create_client);
44EXPORT_SYMBOL(rpc_new_client);
44EXPORT_SYMBOL(rpc_clone_client); 45EXPORT_SYMBOL(rpc_clone_client);
45EXPORT_SYMBOL(rpc_bind_new_program); 46EXPORT_SYMBOL(rpc_bind_new_program);
46EXPORT_SYMBOL(rpc_destroy_client); 47EXPORT_SYMBOL(rpc_destroy_client);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2b99b4028d31..d6baf6fdf8a9 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -8,6 +8,7 @@
8#include <linux/err.h> 8#include <linux/err.h>
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/hash.h> 10#include <linux/hash.h>
11#include <linux/string.h>
11 12
12#define RPCDBG_FACILITY RPCDBG_AUTH 13#define RPCDBG_FACILITY RPCDBG_AUTH
13 14
@@ -20,14 +21,6 @@
20 */ 21 */
21 22
22 23
23static char *strdup(char *s)
24{
25 char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
26 if (rv)
27 strcpy(rv, s);
28 return rv;
29}
30
31struct unix_domain { 24struct unix_domain {
32 struct auth_domain h; 25 struct auth_domain h;
33 int addr_changes; 26 int addr_changes;
@@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name)
55 if (new == NULL) 48 if (new == NULL)
56 return NULL; 49 return NULL;
57 cache_init(&new->h.h); 50 cache_init(&new->h.h);
58 new->h.name = strdup(name); 51 new->h.name = kstrdup(name, GFP_KERNEL);
59 new->h.flavour = RPC_AUTH_UNIX; 52 new->h.flavour = RPC_AUTH_UNIX;
60 new->addr_changes = 0; 53 new->addr_changes = 0;
61 new->h.h.expiry_time = NEVER; 54 new->h.h.expiry_time = NEVER;
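The open-coded strdup() removed above and the kstrdup() that replaces it (here and in addrconf.c) behave the same way for callers: allocate strlen+1 bytes, copy the string, return NULL on allocation failure. A userspace reference using malloc/strcpy follows; dup_name() is a made-up name, not a kernel API.

/* Userspace equivalent of the removed strdup()/new kstrdup() semantics. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *dup_name(const char *s)
{
        char *rv = malloc(strlen(s) + 1);   /* kstrdup: kmalloc(len+1, gfp) */
        if (rv)
                strcpy(rv, s);
        return rv;                          /* NULL on allocation failure */
}

int main(void)
{
        char *name = dup_name("nfsd");
        printf("%s\n", name ? name : "(alloc failed)");
        free(name);
        return 0;
}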
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc96..56db8f13e6cb 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1185 arg->page_len = (pages-2)*PAGE_SIZE; 1185 arg->page_len = (pages-2)*PAGE_SIZE;
1186 arg->len = (pages-1)*PAGE_SIZE; 1186 arg->len = (pages-1)*PAGE_SIZE;
1187 arg->tail[0].iov_len = 0; 1187 arg->tail[0].iov_len = 0;
1188 1188
1189 try_to_freeze(PF_FREEZE); 1189 try_to_freeze();
1190 if (signalled()) 1190 if (signalled())
1191 return -EINTR; 1191 return -EINTR;
1192 1192
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1227 1227
1228 schedule_timeout(timeout); 1228 schedule_timeout(timeout);
1229 1229
1230 try_to_freeze(PF_FREEZE); 1230 try_to_freeze();
1231 1231
1232 spin_lock_bh(&serv->sv_lock); 1232 spin_lock_bh(&serv->sv_lock);
1233 remove_wait_queue(&rqstp->rq_wait, &wait); 1233 remove_wait_queue(&rqstp->rq_wait, &wait);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index eca92405948f..269f217918a3 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -970,7 +970,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
970 goto out; 970 goto out;
971 } 971 }
972 972
973 dprintk("RPC: XID %08x read %u bytes\n", 973 dprintk("RPC: XID %08x read %Zd bytes\n",
974 ntohl(xprt->tcp_xid), r); 974 ntohl(xprt->tcp_xid), r);
975 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", 975 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
976 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); 976 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
@@ -1006,7 +1006,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
1006 desc->count -= len; 1006 desc->count -= len;
1007 desc->offset += len; 1007 desc->offset += len;
1008 xprt->tcp_offset += len; 1008 xprt->tcp_offset += len;
1009 dprintk("RPC: discarded %u bytes\n", len); 1009 dprintk("RPC: discarded %Zu bytes\n", len);
1010 tcp_check_recm(xprt); 1010 tcp_check_recm(xprt);
1011} 1011}
1012 1012
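The two dprintk format changes above switch to the kernel's %Z length modifier of that era, presumably because the values being printed are size_t/ssize_t sized rather than plain unsigned int. Standard userspace C spells the same thing %zu/%zd, as in this small reference; the string is arbitrary.

/* Printing size_t values portably: %zu, not %u (wrong on LP64 targets). */
#include <stdio.h>
#include <string.h>

int main(void)
{
        size_t copied = strlen("example payload");

        printf("RPC: read %zu bytes\n", copied);
        printf("RPC: discarded %zu bytes\n", copied);
        return 0;
}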