From 4498121ca3acbf928681b71261227d28dc29b6f6 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 27 Feb 2007 09:56:42 -0800
Subject: [NET]: Handle disabled preemption in gfp_any()

ctnetlink uses netlink_unicast from an atomic_notifier_chain
(which is called within a RCU read side critical section)
without holding further locks. netlink_unicast calls netlink_trim
with the result of gfp_any() for the gfp flags, which are passed
down to pskb_expand_header. gfp_any() only checks for softirq
context and returns GFP_KERNEL, resulting in this warning:

BUG: sleeping function called from invalid context at mm/slab.c:3032
in_atomic():1, irqs_disabled():0
no locks held by rmmod/7010.

Call Trace:
 [<ffffffff8109467f>] debug_show_held_locks+0x9/0xb
 [<ffffffff8100b0b4>] __might_sleep+0xd9/0xdb
 [<ffffffff810b5082>] __kmalloc+0x68/0x110
 [<ffffffff811ba8f2>] pskb_expand_head+0x4d/0x13b
 [<ffffffff81053147>] netlink_broadcast+0xa5/0x2e0
 [<ffffffff881cd1d7>] :nfnetlink:nfnetlink_send+0x83/0x8a
 [<ffffffff8834f6a6>] :nf_conntrack_netlink:ctnetlink_conntrack_event+0x94c/0x96a
 [<ffffffff810624d6>] notifier_call_chain+0x29/0x3e
 [<ffffffff8106251d>] atomic_notifier_call_chain+0x32/0x60
 [<ffffffff881d266d>] :nf_conntrack:destroy_conntrack+0xa5/0x1d3
 [<ffffffff881d194e>] :nf_conntrack:nf_ct_cleanup+0x8c/0x12c
 [<ffffffff881d4614>] :nf_conntrack:kill_l3proto+0x0/0x13
 [<ffffffff881d482a>] :nf_conntrack:nf_conntrack_l3proto_unregister+0x90/0x94
 [<ffffffff883551b3>] :nf_conntrack_ipv4:nf_conntrack_l3proto_ipv4_fini+0x2b/0x5d
 [<ffffffff8109d44f>] sys_delete_module+0x1b5/0x1e6
 [<ffffffff8105f245>] trace_hardirqs_on_thunk+0x35/0x37
 [<ffffffff8105911e>] system_call+0x7e/0x83

Since netlink_unicast is supposed to be callable from within RCU
read side critical sections, make gfp_any() check for in_atomic()
instead of in_softirq().

Additionally nfnetlink_send needs to use gfp_any() as well for the
call to netlink_broadcast).

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net/sock.h')
diff --git a/include/net/sock.h b/include/net/sock.h
index 03684e702d13..2c7d60ca3548 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1278,7 +1278,7 @@ static inline int sock_writeable(const struct sock *sk)
 
 static inline gfp_t gfp_any(void)
 {
-	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
+	return in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
 }
 
 static inline long sock_rcvtimeo(const struct sock *sk, int noblock)
-- 
cgit v1.2.2


From 8488df894d05d6fa41c2bd298c335f944bb0e401 Mon Sep 17 00:00:00 2001
From: Wei Dong <weid@np.css.fujitsu.com>
Date: Fri, 2 Mar 2007 12:37:26 -0800
Subject: [NET]: Fix bugs in "Whether sock accept queue is full" checking

	when I use linux TCP socket, and find there is a bug in function
sk_acceptq_is_full().

	When a new SYN comes, TCP module first checks its validation. If valid,
send SYN,ACK to the client and add the sock to the syn hash table. Next
time if received the valid ACK for SYN,ACK from the client. server will
accept this connection and increase the sk->sk_ack_backlog -- which is
done in function tcp_check_req().We check wether acceptq is full in
function tcp_v4_syn_recv_sock().

Consider an example:

 After listen(sockfd, 1) system call, sk->sk_max_ack_backlog is set to
1. As we know, sk->sk_ack_backlog is initialized to 0. Assuming accept()
system call is not invoked now.

1. 1st connection comes. invoke sk_acceptq_is_full(). sk-
>sk_ack_backlog=0 sk->sk_max_ack_backlog=1, function return 0 accept
this connection. Increase the sk->sk_ack_backlog
2. 2nd connection comes. invoke sk_acceptq_is_full(). sk-
>sk_ack_backlog=1 sk->sk_max_ack_backlog=1, function return 0 accept
this connection. Increase the sk->sk_ack_backlog
3. 3rd connection comes. invoke sk_acceptq_is_full(). sk-
>sk_ack_backlog=2 sk->sk_max_ack_backlog=1, function return 1. Refuse
this connection.

I think it has bugs. after listen system call. sk->sk_max_ack_backlog=1
but now it can accept 2 connections.

Signed-off-by: Wei Dong <weid@np.css.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2c7d60ca3548..849c7df23181 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -426,7 +426,7 @@ static inline void sk_acceptq_added(struct sock *sk)
 
 static inline int sk_acceptq_is_full(struct sock *sk)
 {
-	return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
+	return sk->sk_ack_backlog >= sk->sk_max_ack_backlog;
 }
 
 /*
-- 
cgit v1.2.2


From 64a146513f8f12ba204b7bf5cb7e9505594ead42 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 6 Mar 2007 11:21:05 -0800
Subject: [NET]: Revert incorrect accept queue backlog changes.

This reverts two changes:

8488df894d05d6fa41c2bd298c335f944bb0e401
248f06726e866942b3d8ca8f411f9067713b7ff8

A backlog value of N really does mean allow "N + 1" connections
to queue to a listening socket.  This allows one to specify
"0" as the backlog and still get 1 connection.

Noticed by Gerrit Renker and Rick Jones.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 849c7df23181..2c7d60ca3548 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -426,7 +426,7 @@ static inline void sk_acceptq_added(struct sock *sk)
 
 static inline int sk_acceptq_is_full(struct sock *sk)
 {
-	return sk->sk_ack_backlog >= sk->sk_max_ack_backlog;
+	return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
 }
 
 /*
-- 
cgit v1.2.2


From fa438ccfdfd3f6db02c13b61b21454eb81cd6a13 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 4 Mar 2007 16:05:44 -0800
Subject: [NET]: Keep sk_backlog near sk_lock

sk_backlog is a critical field of struct sock. (known famous words)

It is (ab)used in hot paths, in particular in release_sock(), tcp_recvmsg(),
tcp_v4_rcv(), sk_receive_skb().

It really makes sense to place it next to sk_lock, because sk_backlog is only
used after sk_lock locked (and thus memory cache line in L1 cache). This
should reduce cache misses and sk_lock acquisition time.

(In theory, we could only move the head pointer near sk_lock, and leaving tail
far away, because 'tail' is normally not so hot, but keep it simple :) )

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2c7d60ca3548..a3366c3c837a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -202,6 +202,15 @@ struct sock {
 	unsigned short		sk_type;
 	int			sk_rcvbuf;
 	socket_lock_t		sk_lock;
+	/*
+	 * The backlog queue is special, it is always used with
+	 * the per-socket spinlock held and requires low latency
+	 * access. Therefore we special case it's implementation.
+	 */
+	struct {
+		struct sk_buff *head;
+		struct sk_buff *tail;
+	} sk_backlog;
 	wait_queue_head_t	*sk_sleep;
 	struct dst_entry	*sk_dst_cache;
 	struct xfrm_policy	*sk_policy[2];
@@ -221,15 +230,6 @@ struct sock {
 	int			sk_rcvlowat;
 	unsigned long 		sk_flags;
 	unsigned long	        sk_lingertime;
-	/*
-	 * The backlog queue is special, it is always used with
-	 * the per-socket spinlock held and requires low latency
-	 * access. Therefore we special case it's implementation.
-	 */
-	struct {
-		struct sk_buff *head;
-		struct sk_buff *tail;
-	} sk_backlog;
 	struct sk_buff_head	sk_error_queue;
 	struct proto		*sk_prot_creator;
 	rwlock_t		sk_callback_lock;
-- 
cgit v1.2.2


From b7aa0bf70c4afb9e38be25f5c0922498d0f8684c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 19 Apr 2007 16:16:32 -0700
Subject: [NET]: convert network timestamps to ktime_t

We currently use a special structure (struct skb_timeval) and plain
'struct timeval' to store packet timestamps in sk_buffs and struct
sock.

This has some drawbacks :
- Fixed resolution of micro second.
- Waste of space on 64bit platforms where sizeof(struct timeval)=16

I suggest using ktime_t that is a nice abstraction of high resolution
time services, currently capable of nanosecond resolution.

As sizeof(ktime_t) is 8 bytes, using ktime_t in 'struct sock' permits
a 8 byte shrink of this structure on 64bit architectures. Some other
structures also benefit from this size reduction (struct ipq in
ipv4/ip_fragment.c, struct frag_queue in ipv6/reassembly.c, ...)

Once this ktime infrastructure adopted, we can more easily provide
nanosecond resolution on top of it. (ioctl SIOCGSTAMPNS and/or
SO_TIMESTAMPNS/SCM_TIMESTAMPNS)

Note : this patch includes a bug correction in
compat_sock_get_timestamp() where a "err = 0;" was missing (so this
syscall returned -ENOENT instead of 0)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
CC: Stephen Hemminger <shemminger@linux-foundation.org>
CC: John find <linux.kernel@free.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index a3366c3c837a..9583639090d2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -244,7 +244,7 @@ struct sock {
 	struct sk_filter      	*sk_filter;
 	void			*sk_protinfo;
 	struct timer_list	sk_timer;
-	struct timeval		sk_stamp;
+	ktime_t			sk_stamp;
 	struct socket		*sk_socket;
 	void			*sk_user_data;
 	struct page		*sk_sndmsg_page;
@@ -1307,19 +1307,19 @@ static inline int sock_intr_errno(long timeo)
 static __inline__ void
 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 {
-	struct timeval stamp;
+	ktime_t kt = skb->tstamp;
 
-	skb_get_timestamp(skb, &stamp);
 	if (sock_flag(sk, SOCK_RCVTSTAMP)) {
+		struct timeval tv;
 		/* Race occurred between timestamp enabling and packet
 		   receiving.  Fill in the current time for now. */
-		if (stamp.tv_sec == 0)
-			do_gettimeofday(&stamp);
-		skb_set_timestamp(skb, &stamp);
-		put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(struct timeval),
-			 &stamp);
+		if (kt.tv64 == 0)
+			kt = ktime_get_real();
+		skb->tstamp = kt;
+		tv = ktime_to_timeval(kt);
+		put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(tv), &tv);
 	} else
-		sk->sk_stamp = stamp;
+		sk->sk_stamp = kt;
 }
 
 /**
-- 
cgit v1.2.2


From fe067e8ab5e0dc5ca3c54634924c628da92090b4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Wed, 7 Mar 2007 12:12:44 -0800
Subject: [TCP]: Abstract out all write queue operations.

This allows the write queue implementation to be changed,
for example, to one which allows fast interval searching.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 9583639090d2..2974bacc8850 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -710,15 +710,6 @@ static inline void sk_stream_mem_reclaim(struct sock *sk)
 		__sk_stream_mem_reclaim(sk);
 }
 
-static inline void sk_stream_writequeue_purge(struct sock *sk)
-{
-	struct sk_buff *skb;
-
-	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
-		sk_stream_free_skb(sk, skb);
-	sk_stream_mem_reclaim(sk);
-}
-
 static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb)
 {
 	return (int)skb->truesize <= sk->sk_forward_alloc ||
@@ -1256,18 +1247,6 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
 	return page;
 }
 
-#define sk_stream_for_retrans_queue(skb, sk)				\
-		for (skb = (sk)->sk_write_queue.next;			\
-		     (skb != (sk)->sk_send_head) &&			\
-		     (skb != (struct sk_buff *)&(sk)->sk_write_queue);	\
-		     skb = skb->next)
-
-/*from STCP for fast SACK Process*/
-#define sk_stream_for_retrans_queue_from(skb, sk)			\
-		for (; (skb != (sk)->sk_send_head) &&                   \
-		     (skb != (struct sk_buff *)&(sk)->sk_write_queue);	\
-		     skb = skb->next)
-
 /*
  *	Default write policy as shown to user space via poll/select/SIGIO
  */
-- 
cgit v1.2.2


From ae40eb1ef30ab4120bd3c8b7e3da99ee53d27a23 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 18 Mar 2007 17:33:16 -0700
Subject: [NET]: Introduce SIOCGSTAMPNS ioctl to get timestamps with nanosec
 resolution

Now network timestamps use ktime_t infrastructure, we can add a new
ioctl() SIOCGSTAMPNS command to get timestamps in 'struct timespec'.
User programs can thus access to nanosecond resolution.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
CC: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2974bacc8850..d093e49fdc85 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1329,6 +1329,7 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_e
 
 extern void sock_enable_timestamp(struct sock *sk);
 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
+extern int sock_get_timestampns(struct sock *, struct timespec __user *);
 
 /* 
  *	Enable debug/info messages 
-- 
cgit v1.2.2


From a2a316fd068c455c609ecc155dcfaa7e208d29fe Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Thu, 8 Mar 2007 20:41:08 -0800
Subject: [NET]: Replace CONFIG_NET_DEBUG with sysctl.

Covert network warning messages from a compile time to runtime choice.
Removes kernel config option and replaces it with new /proc/sys/net/core/warnings.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index d093e49fdc85..51246579592e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1334,14 +1334,12 @@ extern int sock_get_timestampns(struct sock *, struct timespec __user *);
 /* 
  *	Enable debug/info messages 
  */
+extern int net_msg_warn;
+#define NETDEBUG(fmt, args...) \
+	do { if (net_msg_warn) printk(fmt,##args); } while (0)
 
-#ifdef CONFIG_NETDEBUG
-#define NETDEBUG(fmt, args...)	printk(fmt,##args)
-#define LIMIT_NETDEBUG(fmt, args...) do { if (net_ratelimit()) printk(fmt,##args); } while(0)
-#else
-#define NETDEBUG(fmt, args...)	do { } while (0)
-#define LIMIT_NETDEBUG(fmt, args...) do { } while(0)
-#endif
+#define LIMIT_NETDEBUG(fmt, args...) \
+	do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0)
 
 /*
  * Macros for sleeping on a socket. Use them like this:
-- 
cgit v1.2.2


From 92f37fd2ee805aa77925c1e64fd56088b46094fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 25 Mar 2007 22:14:49 -0700
Subject: [NET]: Adding SO_TIMESTAMPNS / SCM_TIMESTAMPNS support

Now that network timestamps use ktime_t infrastructure, we can add a new
SOL_SOCKET sockopt  SO_TIMESTAMPNS.

This command is similar to SO_TIMESTAMP, but permits transmission of
a 'timespec struct' instead of a 'timeval struct' control message.
(nanosecond resolution instead of microsecond)

Control message is labelled SCM_TIMESTAMPNS instead of SCM_TIMESTAMP

A socket cannot mix SO_TIMESTAMP and SO_TIMESTAMPNS : the two modes are
mutually exclusive.

sock_recv_timestamp() became too big to be fully inlined so I added a
__sock_recv_timestamp() helper function.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
CC: linux-arch@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 51246579592e..390c04700590 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -390,6 +390,7 @@ enum sock_flags {
 	SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
 	SOCK_DBG, /* %SO_DEBUG setting */
 	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
+	SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
 	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
 };
@@ -1283,21 +1284,17 @@ static inline int sock_intr_errno(long timeo)
 	return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
 }
 
+extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
+	struct sk_buff *skb);
+
 static __inline__ void
 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 {
 	ktime_t kt = skb->tstamp;
 
-	if (sock_flag(sk, SOCK_RCVTSTAMP)) {
-		struct timeval tv;
-		/* Race occurred between timestamp enabling and packet
-		   receiving.  Fill in the current time for now. */
-		if (kt.tv64 == 0)
-			kt = ktime_get_real();
-		skb->tstamp = kt;
-		tv = ktime_to_timeval(kt);
-		put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(tv), &tv);
-	} else
+	if (sock_flag(sk, SOCK_RCVTSTAMP))
+		__sock_recv_timestamp(msg, sk, skb);
+	else
 		sk->sk_stamp = kt;
 }
 
-- 
cgit v1.2.2


From 9958089a43ae8a9af07402461c0b2b7548c7341e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Fri, 20 Apr 2007 17:12:43 -0700
Subject: [NET]: Move sk_setup_caps() out of line.

It is far too large to be an inline and not in any hot paths.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 390c04700590..25c37e34bfdc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1075,19 +1075,7 @@ static inline int sk_can_gso(const struct sock *sk)
 	return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
 }
 
-static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
-{
-	__sk_dst_set(sk, dst);
-	sk->sk_route_caps = dst->dev->features;
-	if (sk->sk_route_caps & NETIF_F_GSO)
-		sk->sk_route_caps |= NETIF_F_GSO_MASK;
-	if (sk_can_gso(sk)) {
-		if (dst->header_len)
-			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
-		else 
-			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
-	}
-}
+extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
 
 static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb)
 {
-- 
cgit v1.2.2