From a461c0297f2e80c78eaa03fc5141bf57a814ff4f Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 10 Mar 2011 05:57:04 +0000 Subject: bridge: skip forwarding delay if not using STP If Spanning Tree Protocol is not enabled, there is no good reason for the bridge code to wait for the forwarding delay period before enabling the link. The purpose of the forwarding delay is to allow STP to learn about other bridges before nominating itself. The only possible impact is that when starting up a new port the bridge may flood a packet now, where previously it might have seen traffic from the other host and preseeded the forwarding table. Includes change for local variable br already available in that func. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 57186d84d2bd..47582d301fbe 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -375,12 +375,12 @@ static void br_make_forwarding(struct net_bridge_port *p) if (p->state != BR_STATE_BLOCKING) return; - if (br->forward_delay == 0) { + if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) { p->state = BR_STATE_FORWARDING; br_topology_change_detection(br); del_timer(&p->forward_delay_timer); } - else if (p->br->stp_enabled == BR_KERNEL_STP) + else if (br->stp_enabled == BR_KERNEL_STP) p->state = BR_STATE_LISTENING; else p->state = BR_STATE_LEARNING; -- cgit v1.2.2 From e5537bfc98f01561fbdfbd8a78f0dc3e2360491d Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Mon, 14 Mar 2011 15:25:33 -0700 Subject: af_unix: update locking comment We latch our state using a spinlock not a r/w kind of lock. Signed-off-by: Daniel Baluta Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 437a99e560e1..b213ce668c98 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1124,7 +1124,7 @@ restart: /* Latch our state. - It is tricky place. We need to grab write lock and cannot + It is tricky place. We need to grab our state lock and cannot drop lock on peer. It is dangerous because deadlock is possible. Connect to self case and simultaneous attempt to connect are eliminated by checking socket -- cgit v1.2.2 From febf081987ec445f071ed10b73e9707a88cc5cc4 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 14 Mar 2011 07:52:12 +0000 Subject: tcp: fix RTT for quick packets in congestion control In the congestion control interface, the callback for each ACK includes an estimated round trip time in microseconds. Some algorithms need high resolution (Vegas style) but most only need jiffie resolution. If RTT is not accurate (like a retransmission) -1 is used as a flag value. When doing coarse resolution if RTT is less than a a jiffie then 0 should be returned rather than no estimate. Otherwise algorithms that expect good ack's to trigger slow start (like CUBIC Hystart) will be confused. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 65f6c0406245..e16b17efcf57 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3350,7 +3350,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, net_invalid_timestamp())) rtt_us = ktime_us_delta(ktime_get_real(), last_ackt); - else if (ca_seq_rtt > 0) + else if (ca_seq_rtt >= 0) rtt_us = jiffies_to_usecs(ca_seq_rtt); } -- cgit v1.2.2 From c54b4b7655447c1f24f6d50779c22eba9ee0fd24 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 14 Mar 2011 07:52:13 +0000 Subject: tcp_cubic: fix comparison of jiffies Jiffies wraps around therefore the correct way to compare is to use cast to signed value. Note: cubic is not using full jiffies value on 64 bit arch because using full unsigned long makes struct bictcp grow too large for the available ca_priv area. Includes correction from Sangtae Ha to improve ack train detection. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 71d5f2f29fa6..43bb34c9b8bf 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -342,9 +342,11 @@ static void hystart_update(struct sock *sk, u32 delay) u32 curr_jiffies = jiffies; /* first detection parameter - ack-train detection */ - if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { + if ((s32)(curr_jiffies - ca->last_jiffies) <= + msecs_to_jiffies(2)) { ca->last_jiffies = curr_jiffies; - if (curr_jiffies - ca->round_start >= ca->delay_min>>4) + if ((s32) (curr_jiffies - ca->round_start) > + ca->delay_min >> 4) ca->found |= HYSTART_ACK_TRAIN; } -- cgit v1.2.2 From aac46324e12a2bf2e9e0855ad6a287945e34ad39 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 14 Mar 2011 07:52:14 +0000 Subject: tcp_cubic: make ack train delta value a parameter Make the spacing between ACK's that indicates a train a tuneable value like other hystart values. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 43bb34c9b8bf..66d3b00233ec 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; +static int hystart_ack_delta __read_mostly = 2; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); +module_param(hystart_ack_delta, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -343,7 +346,7 @@ static void hystart_update(struct sock *sk, u32 delay) /* first detection parameter - ack-train detection */ if ((s32)(curr_jiffies - ca->last_jiffies) <= - msecs_to_jiffies(2)) { + msecs_to_jiffies(hystart_ack_delta)) { ca->last_jiffies = curr_jiffies; if ((s32) (curr_jiffies - ca->round_start) > ca->delay_min >> 4) -- cgit v1.2.2 From 17a6e9f1aa9ba07ca13a1eaf1e631e743af50cca Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 14 Mar 2011 07:52:15 +0000 Subject: tcp_cubic: fix clock dependency The hystart code was written with assumption that HZ=1000. Replace the use of jiffies with bictcp_clock as a millisecond real time clock. Signed-off-by: Stephen Hemminger Reported-by: Lucas Nussbaum Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 66d3b00233ec..f4fb2d4b90ad 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -88,7 +88,7 @@ struct bictcp { u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay */ + u32 delay_min; /* min delay (msec << 3) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ @@ -98,7 +98,7 @@ struct bictcp { u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ u32 end_seq; /* end_seq of the round */ - u32 last_jiffies; /* last time when the ACK spacing is close */ + u32 last_ack; /* last time when the ACK spacing is close */ u32 curr_rtt; /* the minimum rtt of current round */ }; @@ -119,12 +119,21 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } +static inline u32 bictcp_clock(void) +{ +#if HZ < 1000 + return ktime_to_ms(ktime_get_real()); +#else + return jiffies_to_msecs(jiffies); +#endif +} + static inline void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - ca->round_start = ca->last_jiffies = jiffies; + ca->round_start = ca->last_ack = bictcp_clock(); ca->end_seq = tp->snd_nxt; ca->curr_rtt = 0; ca->sample_cnt = 0; @@ -239,8 +248,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) */ /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) - << BICTCP_HZ) / HZ; + t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) + - ca->epoch_start) << BICTCP_HZ) / HZ; if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; @@ -342,14 +351,12 @@ static void hystart_update(struct sock *sk, u32 delay) struct bictcp *ca = inet_csk_ca(sk); if (!(ca->found & hystart_detect)) { - u32 curr_jiffies = jiffies; + u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ - if ((s32)(curr_jiffies - ca->last_jiffies) <= - msecs_to_jiffies(hystart_ack_delta)) { - ca->last_jiffies = curr_jiffies; - if ((s32) (curr_jiffies - ca->round_start) > - ca->delay_min >> 4) + if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + ca->last_ack = now; + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) ca->found |= HYSTART_ACK_TRAIN; } @@ -396,7 +403,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; - delay = usecs_to_jiffies(rtt_us) << 3; + delay = (rtt_us << 3) / USEC_PER_MSEC; if (delay == 0) delay = 1; -- cgit v1.2.2 From 3b585b34493ec9db382d6c325d4ed77b9eb2d2a5 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 14 Mar 2011 07:52:16 +0000 Subject: tcp_cubic: enable high resolution ack time if needed This is a refined version of an earlier patch by Lucas Nussbaum. Cubic needs RTT values in milliseconds. If HZ < 1000 then the values will be too coarse. Signed-off-by: Stephen Hemminger Reported-by: Lucas Nussbaum Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index f4fb2d4b90ad..5e0491d742ca 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -459,6 +459,10 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); + /* hystart needs ms clock resolution */ + if (hystart && HZ < 1000) + cubictcp.flags |= TCP_CONG_RTT_STAMP; + return tcp_register_congestion_control(&cubictcp); } -- cgit v1.2.2 From 2b4636a5f8ca547000f6aba24ec1c58f31f4a91d Mon Sep 17 00:00:00 2001 From: Sangtae Ha Date: Mon, 14 Mar 2011 07:52:17 +0000 Subject: tcp_cubic: make the delay threshold of HyStart less sensitive Make HyStart less sensitive to abrupt delay variations due to buffer bloat. Signed-off-by: Sangtae Ha Acked-by: Stephen Hemminger Reported-by: Lucas Nussbaum Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 5e0491d742ca..7172c129ff19 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -39,7 +39,7 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (2U<<3) +#define HYSTART_DELAY_MIN (4U<<3) #define HYSTART_DELAY_MAX (16U<<3) #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) -- cgit v1.2.2 From b5ccd07337489fa9c9d32e0b628a2168b7953adf Mon Sep 17 00:00:00 2001 From: Sangtae Ha Date: Mon, 14 Mar 2011 07:52:18 +0000 Subject: tcp_cubic: fix low utilization of CUBIC with HyStart HyStart sets the initial exit point of slow start. Suppose that HyStart exits at 0.5BDP in a BDP network and no history exists. If the BDP of a network is large, CUBIC's initial cwnd growth may be too conservative to utilize the link. CUBIC increases the cwnd 20% per RTT in this case. Signed-off-by: Sangtae Ha Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 7172c129ff19..90d92dd4cf13 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -270,6 +270,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 100 * cwnd; /* very small increment*/ } + /* + * The initial growth of cubic function may be too conservative + * when the available bandwidth is still unknown. + */ + if (ca->loss_cwnd == 0 && ca->cnt > 20) + ca->cnt = 20; /* increase cwnd 5% per RTT */ + /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; -- cgit v1.2.2