diff options
-rw-r--r-- | include/linux/skbuff.h | 38 | ||||
-rw-r--r-- | include/net/inet_hashtables.h | 2 | ||||
-rw-r--r-- | net/core/stream.c | 12 | ||||
-rw-r--r-- | net/dccp/ipv4.c | 32 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 14 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 15 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 122 |
9 files changed, 155 insertions, 83 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4286d832166f..fdfb8fe8c38c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h | |||
@@ -603,23 +603,23 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) | |||
603 | */ | 603 | */ |
604 | 604 | ||
605 | /** | 605 | /** |
606 | * __skb_queue_head - queue a buffer at the list head | 606 | * __skb_queue_after - queue a buffer after another in a list |
607 | * @list: list to use | 607 | * @list: list to use |
608 | * @prev: place after this buffer | ||
608 | * @newsk: buffer to queue | 609 | * @newsk: buffer to queue |
609 | * | 610 | * |
610 | * Queue a buffer at the start of a list. This function takes no locks | 611 | * Queue a buffer in the middle of a list. This function takes no locks |
611 | * and you must therefore hold required locks before calling it. | 612 | * and you must therefore hold required locks before calling it. |
612 | * | 613 | * |
613 | * A buffer cannot be placed on two lists at the same time. | 614 | * A buffer cannot be placed on two lists at the same time. |
614 | */ | 615 | */ |
615 | extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); | 616 | static inline void __skb_queue_after(struct sk_buff_head *list, |
616 | static inline void __skb_queue_head(struct sk_buff_head *list, | 617 | struct sk_buff *prev, |
617 | struct sk_buff *newsk) | 618 | struct sk_buff *newsk) |
618 | { | 619 | { |
619 | struct sk_buff *prev, *next; | 620 | struct sk_buff *next; |
620 | |||
621 | list->qlen++; | 621 | list->qlen++; |
622 | prev = (struct sk_buff *)list; | 622 | |
623 | next = prev->next; | 623 | next = prev->next; |
624 | newsk->next = next; | 624 | newsk->next = next; |
625 | newsk->prev = prev; | 625 | newsk->prev = prev; |
@@ -627,6 +627,23 @@ static inline void __skb_queue_head(struct sk_buff_head *list, | |||
627 | } | 627 | } |
628 | 628 | ||
629 | /** | 629 | /** |
630 | * __skb_queue_head - queue a buffer at the list head | ||
631 | * @list: list to use | ||
632 | * @newsk: buffer to queue | ||
633 | * | ||
634 | * Queue a buffer at the start of a list. This function takes no locks | ||
635 | * and you must therefore hold required locks before calling it. | ||
636 | * | ||
637 | * A buffer cannot be placed on two lists at the same time. | ||
638 | */ | ||
639 | extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); | ||
640 | static inline void __skb_queue_head(struct sk_buff_head *list, | ||
641 | struct sk_buff *newsk) | ||
642 | { | ||
643 | __skb_queue_after(list, (struct sk_buff *)list, newsk); | ||
644 | } | ||
645 | |||
646 | /** | ||
630 | * __skb_queue_tail - queue a buffer at the list tail | 647 | * __skb_queue_tail - queue a buffer at the list tail |
631 | * @list: list to use | 648 | * @list: list to use |
632 | * @newsk: buffer to queue | 649 | * @newsk: buffer to queue |
@@ -1203,6 +1220,11 @@ static inline void kunmap_skb_frag(void *vaddr) | |||
1203 | prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \ | 1220 | prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \ |
1204 | skb = skb->next) | 1221 | skb = skb->next) |
1205 | 1222 | ||
1223 | #define skb_queue_reverse_walk(queue, skb) \ | ||
1224 | for (skb = (queue)->prev; \ | ||
1225 | prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \ | ||
1226 | skb = skb->prev) | ||
1227 | |||
1206 | 1228 | ||
1207 | extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, | 1229 | extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, |
1208 | int noblock, int *err); | 1230 | int noblock, int *err); |
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index f50f95968340..07840baa9341 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h | |||
@@ -125,9 +125,7 @@ struct inet_hashinfo { | |||
125 | rwlock_t lhash_lock ____cacheline_aligned; | 125 | rwlock_t lhash_lock ____cacheline_aligned; |
126 | atomic_t lhash_users; | 126 | atomic_t lhash_users; |
127 | wait_queue_head_t lhash_wait; | 127 | wait_queue_head_t lhash_wait; |
128 | spinlock_t portalloc_lock; | ||
129 | kmem_cache_t *bind_bucket_cachep; | 128 | kmem_cache_t *bind_bucket_cachep; |
130 | int port_rover; | ||
131 | }; | 129 | }; |
132 | 130 | ||
133 | static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, | 131 | static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, |
diff --git a/net/core/stream.c b/net/core/stream.c index ac9edfdf8742..15bfd03e8024 100644 --- a/net/core/stream.c +++ b/net/core/stream.c | |||
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) | |||
52 | { | 52 | { |
53 | struct task_struct *tsk = current; | 53 | struct task_struct *tsk = current; |
54 | DEFINE_WAIT(wait); | 54 | DEFINE_WAIT(wait); |
55 | int done; | ||
55 | 56 | ||
56 | while (1) { | 57 | do { |
57 | if (sk->sk_err) | 58 | if (sk->sk_err) |
58 | return sock_error(sk); | 59 | return sock_error(sk); |
59 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) | 60 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) |
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) | |||
65 | 66 | ||
66 | prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); | 67 | prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); |
67 | sk->sk_write_pending++; | 68 | sk->sk_write_pending++; |
68 | if (sk_wait_event(sk, timeo_p, | 69 | done = sk_wait_event(sk, timeo_p, |
69 | !((1 << sk->sk_state) & | 70 | !((1 << sk->sk_state) & |
70 | ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) | 71 | ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); |
71 | break; | ||
72 | finish_wait(sk->sk_sleep, &wait); | 72 | finish_wait(sk->sk_sleep, &wait); |
73 | sk->sk_write_pending--; | 73 | sk->sk_write_pending--; |
74 | } | 74 | } while (!done); |
75 | return 0; | 75 | return 0; |
76 | } | 76 | } |
77 | 77 | ||
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6298cf58ff9e..4b9bc81ae1a3 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c | |||
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { | |||
31 | .lhash_lock = RW_LOCK_UNLOCKED, | 31 | .lhash_lock = RW_LOCK_UNLOCKED, |
32 | .lhash_users = ATOMIC_INIT(0), | 32 | .lhash_users = ATOMIC_INIT(0), |
33 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), | 33 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), |
34 | .portalloc_lock = SPIN_LOCK_UNLOCKED, | ||
35 | .port_rover = 1024 - 1, | ||
36 | }; | 34 | }; |
37 | 35 | ||
38 | EXPORT_SYMBOL_GPL(dccp_hashinfo); | 36 | EXPORT_SYMBOL_GPL(dccp_hashinfo); |
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk) | |||
125 | int ret; | 123 | int ret; |
126 | 124 | ||
127 | if (snum == 0) { | 125 | if (snum == 0) { |
128 | int rover; | ||
129 | int low = sysctl_local_port_range[0]; | 126 | int low = sysctl_local_port_range[0]; |
130 | int high = sysctl_local_port_range[1]; | 127 | int high = sysctl_local_port_range[1]; |
131 | int remaining = (high - low) + 1; | 128 | int remaining = (high - low) + 1; |
129 | int rover = net_random() % (high - low) + low; | ||
132 | struct hlist_node *node; | 130 | struct hlist_node *node; |
133 | struct inet_timewait_sock *tw = NULL; | 131 | struct inet_timewait_sock *tw = NULL; |
134 | 132 | ||
135 | local_bh_disable(); | 133 | local_bh_disable(); |
136 | |||
137 | /* TODO. Actually it is not so bad idea to remove | ||
138 | * dccp_hashinfo.portalloc_lock before next submission to | ||
139 | * Linus. | ||
140 | * As soon as we touch this place at all it is time to think. | ||
141 | * | ||
142 | * Now it protects single _advisory_ variable | ||
143 | * dccp_hashinfo.port_rover, hence it is mostly useless. | ||
144 | * Code will work nicely if we just delete it, but | ||
145 | * I am afraid in contented case it will work not better or | ||
146 | * even worse: another cpu just will hit the same bucket | ||
147 | * and spin there. | ||
148 | * So some cpu salt could remove both contention and | ||
149 | * memory pingpong. Any ideas how to do this in a nice way? | ||
150 | */ | ||
151 | spin_lock(&dccp_hashinfo.portalloc_lock); | ||
152 | rover = dccp_hashinfo.port_rover; | ||
153 | |||
154 | do { | 134 | do { |
155 | rover++; | ||
156 | if ((rover < low) || (rover > high)) | ||
157 | rover = low; | ||
158 | head = &dccp_hashinfo.bhash[inet_bhashfn(rover, | 135 | head = &dccp_hashinfo.bhash[inet_bhashfn(rover, |
159 | dccp_hashinfo.bhash_size)]; | 136 | dccp_hashinfo.bhash_size)]; |
160 | spin_lock(&head->lock); | 137 | spin_lock(&head->lock); |
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk) | |||
187 | 164 | ||
188 | next_port: | 165 | next_port: |
189 | spin_unlock(&head->lock); | 166 | spin_unlock(&head->lock); |
167 | if (++rover > high) | ||
168 | rover = low; | ||
190 | } while (--remaining > 0); | 169 | } while (--remaining > 0); |
191 | dccp_hashinfo.port_rover = rover; | ||
192 | spin_unlock(&dccp_hashinfo.portalloc_lock); | ||
193 | 170 | ||
194 | local_bh_enable(); | 171 | local_bh_enable(); |
195 | 172 | ||
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk) | |||
197 | 174 | ||
198 | ok: | 175 | ok: |
199 | /* All locks still held and bhs disabled */ | 176 | /* All locks still held and bhs disabled */ |
200 | dccp_hashinfo.port_rover = rover; | ||
201 | spin_unlock(&dccp_hashinfo.portalloc_lock); | ||
202 | |||
203 | inet_bind_hash(sk, tb, rover); | 177 | inet_bind_hash(sk, tb, rover); |
204 | if (sk_unhashed(sk)) { | 178 | if (sk_unhashed(sk)) { |
205 | inet_sk(sk)->sport = htons(rover); | 179 | inet_sk(sk)->sport = htons(rover); |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 94468a76c5b4..3fe021f1a566 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, | |||
78 | int low = sysctl_local_port_range[0]; | 78 | int low = sysctl_local_port_range[0]; |
79 | int high = sysctl_local_port_range[1]; | 79 | int high = sysctl_local_port_range[1]; |
80 | int remaining = (high - low) + 1; | 80 | int remaining = (high - low) + 1; |
81 | int rover; | 81 | int rover = net_random() % (high - low) + low; |
82 | 82 | ||
83 | spin_lock(&hashinfo->portalloc_lock); | ||
84 | if (hashinfo->port_rover < low) | ||
85 | rover = low; | ||
86 | else | ||
87 | rover = hashinfo->port_rover; | ||
88 | do { | 83 | do { |
89 | rover++; | ||
90 | if (rover > high) | ||
91 | rover = low; | ||
92 | head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; | 84 | head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; |
93 | spin_lock(&head->lock); | 85 | spin_lock(&head->lock); |
94 | inet_bind_bucket_for_each(tb, node, &head->chain) | 86 | inet_bind_bucket_for_each(tb, node, &head->chain) |
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, | |||
97 | break; | 89 | break; |
98 | next: | 90 | next: |
99 | spin_unlock(&head->lock); | 91 | spin_unlock(&head->lock); |
92 | if (++rover > high) | ||
93 | rover = low; | ||
100 | } while (--remaining > 0); | 94 | } while (--remaining > 0); |
101 | hashinfo->port_rover = rover; | ||
102 | spin_unlock(&hashinfo->portalloc_lock); | ||
103 | 95 | ||
104 | /* Exhausted local port range during search? It is not | 96 | /* Exhausted local port range during search? It is not |
105 | * possible for us to be holding one of the bind hash | 97 | * possible for us to be holding one of the bind hash |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3f0013a9580..72b7c22e1ea5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -2112,7 +2112,6 @@ void __init tcp_init(void) | |||
2112 | sysctl_tcp_max_orphans >>= (3 - order); | 2112 | sysctl_tcp_max_orphans >>= (3 - order); |
2113 | sysctl_max_syn_backlog = 128; | 2113 | sysctl_max_syn_backlog = 128; |
2114 | } | 2114 | } |
2115 | tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; | ||
2116 | 2115 | ||
2117 | sysctl_tcp_mem[0] = 768 << order; | 2116 | sysctl_tcp_mem[0] = 768 << order; |
2118 | sysctl_tcp_mem[1] = 1024 << order; | 2117 | sysctl_tcp_mem[1] = 1024 << order; |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c85819d8474b..49d67cd75edd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { | |||
93 | .lhash_lock = RW_LOCK_UNLOCKED, | 93 | .lhash_lock = RW_LOCK_UNLOCKED, |
94 | .lhash_users = ATOMIC_INIT(0), | 94 | .lhash_users = ATOMIC_INIT(0), |
95 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), | 95 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), |
96 | .portalloc_lock = SPIN_LOCK_UNLOCKED, | ||
97 | .port_rover = 1024 - 1, | ||
98 | }; | 96 | }; |
99 | 97 | ||
100 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) | 98 | static int tcp_v4_get_port(struct sock *sk, unsigned short snum) |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d693cb988b78..d746d3b27efb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) | |||
114 | int low = sysctl_local_port_range[0]; | 114 | int low = sysctl_local_port_range[0]; |
115 | int high = sysctl_local_port_range[1]; | 115 | int high = sysctl_local_port_range[1]; |
116 | int remaining = (high - low) + 1; | 116 | int remaining = (high - low) + 1; |
117 | int rover; | 117 | int rover = net_random() % (high - low) + low; |
118 | 118 | ||
119 | spin_lock(&tcp_hashinfo.portalloc_lock); | 119 | do { |
120 | if (tcp_hashinfo.port_rover < low) | ||
121 | rover = low; | ||
122 | else | ||
123 | rover = tcp_hashinfo.port_rover; | ||
124 | do { rover++; | ||
125 | if (rover > high) | ||
126 | rover = low; | ||
127 | head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; | 120 | head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; |
128 | spin_lock(&head->lock); | 121 | spin_lock(&head->lock); |
129 | inet_bind_bucket_for_each(tb, node, &head->chain) | 122 | inet_bind_bucket_for_each(tb, node, &head->chain) |
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) | |||
132 | break; | 125 | break; |
133 | next: | 126 | next: |
134 | spin_unlock(&head->lock); | 127 | spin_unlock(&head->lock); |
128 | if (++rover > high) | ||
129 | rover = low; | ||
135 | } while (--remaining > 0); | 130 | } while (--remaining > 0); |
136 | tcp_hashinfo.port_rover = rover; | ||
137 | spin_unlock(&tcp_hashinfo.portalloc_lock); | ||
138 | 131 | ||
139 | /* Exhausted local port range during search? It is not | 132 | /* Exhausted local port range during search? It is not |
140 | * possible for us to be holding one of the bind hash | 133 | * possible for us to be holding one of the bind hash |
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bb9bf8d5003c..cdc8d283791c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c | |||
@@ -25,6 +25,8 @@ | |||
25 | 25 | ||
26 | #include <net/pkt_sched.h> | 26 | #include <net/pkt_sched.h> |
27 | 27 | ||
28 | #define VERSION "1.1" | ||
29 | |||
28 | /* Network Emulation Queuing algorithm. | 30 | /* Network Emulation Queuing algorithm. |
29 | ==================================== | 31 | ==================================== |
30 | 32 | ||
@@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |||
185 | || q->counter < q->gap /* inside last reordering gap */ | 187 | || q->counter < q->gap /* inside last reordering gap */ |
186 | || q->reorder < get_crandom(&q->reorder_cor)) { | 188 | || q->reorder < get_crandom(&q->reorder_cor)) { |
187 | psched_time_t now; | 189 | psched_time_t now; |
190 | psched_tdiff_t delay; | ||
191 | |||
192 | delay = tabledist(q->latency, q->jitter, | ||
193 | &q->delay_cor, q->delay_dist); | ||
194 | |||
188 | PSCHED_GET_TIME(now); | 195 | PSCHED_GET_TIME(now); |
189 | PSCHED_TADD2(now, tabledist(q->latency, q->jitter, | 196 | PSCHED_TADD2(now, delay, cb->time_to_send); |
190 | &q->delay_cor, q->delay_dist), | ||
191 | cb->time_to_send); | ||
192 | ++q->counter; | 197 | ++q->counter; |
193 | ret = q->qdisc->enqueue(skb, q->qdisc); | 198 | ret = q->qdisc->enqueue(skb, q->qdisc); |
194 | } else { | 199 | } else { |
@@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) | |||
248 | const struct netem_skb_cb *cb | 253 | const struct netem_skb_cb *cb |
249 | = (const struct netem_skb_cb *)skb->cb; | 254 | = (const struct netem_skb_cb *)skb->cb; |
250 | psched_time_t now; | 255 | psched_time_t now; |
251 | long delay; | ||
252 | 256 | ||
253 | /* if more time remaining? */ | 257 | /* if more time remaining? */ |
254 | PSCHED_GET_TIME(now); | 258 | PSCHED_GET_TIME(now); |
255 | delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); | 259 | |
256 | pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay); | 260 | if (PSCHED_TLESS(cb->time_to_send, now)) { |
257 | if (delay <= 0) { | ||
258 | pr_debug("netem_dequeue: return skb=%p\n", skb); | 261 | pr_debug("netem_dequeue: return skb=%p\n", skb); |
259 | sch->q.qlen--; | 262 | sch->q.qlen--; |
260 | sch->flags &= ~TCQ_F_THROTTLED; | 263 | sch->flags &= ~TCQ_F_THROTTLED; |
261 | return skb; | 264 | return skb; |
262 | } | 265 | } else { |
266 | psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now); | ||
267 | |||
268 | if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { | ||
269 | sch->qstats.drops++; | ||
263 | 270 | ||
264 | mod_timer(&q->timer, jiffies + delay); | 271 | /* After this qlen is confused */ |
265 | sch->flags |= TCQ_F_THROTTLED; | 272 | printk(KERN_ERR "netem: queue discipline %s could not requeue\n",
273 | q->qdisc->ops->id); | ||
266 | 274 | ||
267 | if (q->qdisc->ops->requeue(skb, q->qdisc) != 0) | 275 | sch->q.qlen--; |
268 | sch->qstats.drops++; | 276 | } |
277 | |||
278 | mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay)); | ||
279 | sch->flags |= TCQ_F_THROTTLED; | ||
280 | } | ||
269 | } | 281 | } |
270 | 282 | ||
271 | return NULL; | 283 | return NULL; |
@@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch) | |||
290 | del_timer_sync(&q->timer); | 302 | del_timer_sync(&q->timer); |
291 | } | 303 | } |
292 | 304 | ||
305 | /* Pass size change message down to embedded FIFO */ | ||
293 | static int set_fifo_limit(struct Qdisc *q, int limit) | 306 | static int set_fifo_limit(struct Qdisc *q, int limit) |
294 | { | 307 | { |
295 | struct rtattr *rta; | 308 | struct rtattr *rta; |
296 | int ret = -ENOMEM; | 309 | int ret = -ENOMEM; |
297 | 310 | ||
311 | /* Hack to avoid sending change message to non-FIFO */ | ||
312 | if (strncmp(q->ops->id + 1, "fifo", 4) != 0) | ||
313 | return 0; | ||
314 | |||
298 | rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); | 315 | rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); |
299 | if (rta) { | 316 | if (rta) { |
300 | rta->rta_type = RTM_NEWQDISC; | 317 | rta->rta_type = RTM_NEWQDISC; |
@@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) | |||
426 | return 0; | 443 | return 0; |
427 | } | 444 | } |
428 | 445 | ||
446 | /* | ||
447 | * Special case version of FIFO queue for use by netem. | ||
448 | * It queues in order based on timestamps in skb's | ||
449 | */ | ||
450 | struct fifo_sched_data { | ||
451 | u32 limit; | ||
452 | }; | ||
453 | |||
454 | static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) | ||
455 | { | ||
456 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
457 | struct sk_buff_head *list = &sch->q; | ||
458 | const struct netem_skb_cb *ncb | ||
459 | = (const struct netem_skb_cb *)nskb->cb; | ||
460 | struct sk_buff *skb; | ||
461 | |||
462 | if (likely(skb_queue_len(list) < q->limit)) { | ||
463 | skb_queue_reverse_walk(list, skb) { | ||
464 | const struct netem_skb_cb *cb | ||
465 | = (const struct netem_skb_cb *)skb->cb; | ||
466 | |||
467 | if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send)) | ||
468 | break; | ||
469 | } | ||
470 | |||
471 | __skb_queue_after(list, skb, nskb); | ||
472 | |||
473 | sch->qstats.backlog += nskb->len; | ||
474 | sch->bstats.bytes += nskb->len; | ||
475 | sch->bstats.packets++; | ||
476 | |||
477 | return NET_XMIT_SUCCESS; | ||
478 | } | ||
479 | |||
480 | return qdisc_drop(nskb, sch); | ||
481 | } | ||
482 | |||
483 | static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) | ||
484 | { | ||
485 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
486 | |||
487 | if (opt) { | ||
488 | struct tc_fifo_qopt *ctl = RTA_DATA(opt); | ||
489 | if (RTA_PAYLOAD(opt) < sizeof(*ctl)) | ||
490 | return -EINVAL; | ||
491 | |||
492 | q->limit = ctl->limit; | ||
493 | } else | ||
494 | q->limit = max_t(u32, sch->dev->tx_queue_len, 1); | ||
495 | |||
496 | return 0; | ||
497 | } | ||
498 | |||
499 | static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) | ||
500 | { | ||
501 | struct fifo_sched_data *q = qdisc_priv(sch); | ||
502 | struct tc_fifo_qopt opt = { .limit = q->limit }; | ||
503 | |||
504 | RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); | ||
505 | return skb->len; | ||
506 | |||
507 | rtattr_failure: | ||
508 | return -1; | ||
509 | } | ||
510 | |||
511 | static struct Qdisc_ops tfifo_qdisc_ops = { | ||
512 | .id = "tfifo", | ||
513 | .priv_size = sizeof(struct fifo_sched_data), | ||
514 | .enqueue = tfifo_enqueue, | ||
515 | .dequeue = qdisc_dequeue_head, | ||
516 | .requeue = qdisc_requeue, | ||
517 | .drop = qdisc_queue_drop, | ||
518 | .init = tfifo_init, | ||
519 | .reset = qdisc_reset_queue, | ||
520 | .change = tfifo_init, | ||
521 | .dump = tfifo_dump, | ||
522 | }; | ||
523 | |||
429 | static int netem_init(struct Qdisc *sch, struct rtattr *opt) | 524 | static int netem_init(struct Qdisc *sch, struct rtattr *opt) |
430 | { | 525 | { |
431 | struct netem_sched_data *q = qdisc_priv(sch); | 526 | struct netem_sched_data *q = qdisc_priv(sch); |
@@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) | |||
438 | q->timer.function = netem_watchdog; | 533 | q->timer.function = netem_watchdog; |
439 | q->timer.data = (unsigned long) sch; | 534 | q->timer.data = (unsigned long) sch; |
440 | 535 | ||
441 | q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); | 536 | q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops); |
442 | if (!q->qdisc) { | 537 | if (!q->qdisc) { |
443 | pr_debug("netem: qdisc create failed\n"); | 538 | pr_debug("netem: qdisc create failed\n"); |
444 | return -ENOMEM; | 539 | return -ENOMEM; |
@@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = { | |||
601 | 696 | ||
602 | static int __init netem_module_init(void) | 697 | static int __init netem_module_init(void) |
603 | { | 698 | { |
699 | pr_info("netem: version " VERSION "\n"); | ||
604 | return register_qdisc(&netem_qdisc_ops); | 700 | return register_qdisc(&netem_qdisc_ops); |
605 | } | 701 | } |
606 | static void __exit netem_module_exit(void) | 702 | static void __exit netem_module_exit(void) |