From 4498121ca3acbf928681b71261227d28dc29b6f6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 27 Feb 2007 09:56:42 -0800 Subject: [NET]: Handle disabled preemption in gfp_any() ctnetlink uses netlink_unicast from an atomic_notifier_chain (which is called within a RCU read side critical section) without holding further locks. netlink_unicast calls netlink_trim with the result of gfp_any() for the gfp flags, which are passed down to pskb_expand_header. gfp_any() only checks for softirq context and returns GFP_KERNEL, resulting in this warning: BUG: sleeping function called from invalid context at mm/slab.c:3032 in_atomic():1, irqs_disabled():0 no locks held by rmmod/7010. Call Trace: [] debug_show_held_locks+0x9/0xb [] __might_sleep+0xd9/0xdb [] __kmalloc+0x68/0x110 [] pskb_expand_head+0x4d/0x13b [] netlink_broadcast+0xa5/0x2e0 [] :nfnetlink:nfnetlink_send+0x83/0x8a [] :nf_conntrack_netlink:ctnetlink_conntrack_event+0x94c/0x96a [] notifier_call_chain+0x29/0x3e [] atomic_notifier_call_chain+0x32/0x60 [] :nf_conntrack:destroy_conntrack+0xa5/0x1d3 [] :nf_conntrack:nf_ct_cleanup+0x8c/0x12c [] :nf_conntrack:kill_l3proto+0x0/0x13 [] :nf_conntrack:nf_conntrack_l3proto_unregister+0x90/0x94 [] :nf_conntrack_ipv4:nf_conntrack_l3proto_ipv4_fini+0x2b/0x5d [] sys_delete_module+0x1b5/0x1e6 [] trace_hardirqs_on_thunk+0x35/0x37 [] system_call+0x7e/0x83 Since netlink_unicast is supposed to be callable from within RCU read side critical sections, make gfp_any() check for in_atomic() instead of in_softirq(). Additionally nfnetlink_send needs to use gfp_any() as well for the call to netlink_broadcast). Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 03684e702d13..2c7d60ca3548 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1278,7 +1278,7 @@ static inline int sock_writeable(const struct sock *sk) static inline gfp_t gfp_any(void) { - return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; + return in_atomic() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, int noblock) -- cgit v1.2.2 From 8488df894d05d6fa41c2bd298c335f944bb0e401 Mon Sep 17 00:00:00 2001 From: Wei Dong Date: Fri, 2 Mar 2007 12:37:26 -0800 Subject: [NET]: Fix bugs in "Whether sock accept queue is full" checking when I use linux TCP socket, and find there is a bug in function sk_acceptq_is_full(). When a new SYN comes, TCP module first checks its validation. If valid, send SYN,ACK to the client and add the sock to the syn hash table. Next time if received the valid ACK for SYN,ACK from the client. server will accept this connection and increase the sk->sk_ack_backlog -- which is done in function tcp_check_req().We check wether acceptq is full in function tcp_v4_syn_recv_sock(). Consider an example: After listen(sockfd, 1) system call, sk->sk_max_ack_backlog is set to 1. As we know, sk->sk_ack_backlog is initialized to 0. Assuming accept() system call is not invoked now. 1. 1st connection comes. invoke sk_acceptq_is_full(). sk- >sk_ack_backlog=0 sk->sk_max_ack_backlog=1, function return 0 accept this connection. Increase the sk->sk_ack_backlog 2. 2nd connection comes. invoke sk_acceptq_is_full(). sk- >sk_ack_backlog=1 sk->sk_max_ack_backlog=1, function return 0 accept this connection. Increase the sk->sk_ack_backlog 3. 3rd connection comes. invoke sk_acceptq_is_full(). sk- >sk_ack_backlog=2 sk->sk_max_ack_backlog=1, function return 1. Refuse this connection. I think it has bugs. after listen system call. sk->sk_max_ack_backlog=1 but now it can accept 2 connections. Signed-off-by: Wei Dong Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 2c7d60ca3548..849c7df23181 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -426,7 +426,7 @@ static inline void sk_acceptq_added(struct sock *sk) static inline int sk_acceptq_is_full(struct sock *sk) { - return sk->sk_ack_backlog > sk->sk_max_ack_backlog; + return sk->sk_ack_backlog >= sk->sk_max_ack_backlog; } /* -- cgit v1.2.2 From 64a146513f8f12ba204b7bf5cb7e9505594ead42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 Mar 2007 11:21:05 -0800 Subject: [NET]: Revert incorrect accept queue backlog changes. This reverts two changes: 8488df894d05d6fa41c2bd298c335f944bb0e401 248f06726e866942b3d8ca8f411f9067713b7ff8 A backlog value of N really does mean allow "N + 1" connections to queue to a listening socket. This allows one to specify "0" as the backlog and still get 1 connection. Noticed by Gerrit Renker and Rick Jones. Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 849c7df23181..2c7d60ca3548 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -426,7 +426,7 @@ static inline void sk_acceptq_added(struct sock *sk) static inline int sk_acceptq_is_full(struct sock *sk) { - return sk->sk_ack_backlog >= sk->sk_max_ack_backlog; + return sk->sk_ack_backlog > sk->sk_max_ack_backlog; } /* -- cgit v1.2.2 From fa438ccfdfd3f6db02c13b61b21454eb81cd6a13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 4 Mar 2007 16:05:44 -0800 Subject: [NET]: Keep sk_backlog near sk_lock sk_backlog is a critical field of struct sock. (known famous words) It is (ab)used in hot paths, in particular in release_sock(), tcp_recvmsg(), tcp_v4_rcv(), sk_receive_skb(). It really makes sense to place it next to sk_lock, because sk_backlog is only used after sk_lock locked (and thus memory cache line in L1 cache). This should reduce cache misses and sk_lock acquisition time. (In theory, we could only move the head pointer near sk_lock, and leaving tail far away, because 'tail' is normally not so hot, but keep it simple :) ) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 2c7d60ca3548..a3366c3c837a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -202,6 +202,15 @@ struct sock { unsigned short sk_type; int sk_rcvbuf; socket_lock_t sk_lock; + /* + * The backlog queue is special, it is always used with + * the per-socket spinlock held and requires low latency + * access. Therefore we special case it's implementation. + */ + struct { + struct sk_buff *head; + struct sk_buff *tail; + } sk_backlog; wait_queue_head_t *sk_sleep; struct dst_entry *sk_dst_cache; struct xfrm_policy *sk_policy[2]; @@ -221,15 +230,6 @@ struct sock { int sk_rcvlowat; unsigned long sk_flags; unsigned long sk_lingertime; - /* - * The backlog queue is special, it is always used with - * the per-socket spinlock held and requires low latency - * access. Therefore we special case it's implementation. - */ - struct { - struct sk_buff *head; - struct sk_buff *tail; - } sk_backlog; struct sk_buff_head sk_error_queue; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; -- cgit v1.2.2 From b7aa0bf70c4afb9e38be25f5c0922498d0f8684c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Apr 2007 16:16:32 -0700 Subject: [NET]: convert network timestamps to ktime_t We currently use a special structure (struct skb_timeval) and plain 'struct timeval' to store packet timestamps in sk_buffs and struct sock. This has some drawbacks : - Fixed resolution of micro second. - Waste of space on 64bit platforms where sizeof(struct timeval)=16 I suggest using ktime_t that is a nice abstraction of high resolution time services, currently capable of nanosecond resolution. As sizeof(ktime_t) is 8 bytes, using ktime_t in 'struct sock' permits a 8 byte shrink of this structure on 64bit architectures. Some other structures also benefit from this size reduction (struct ipq in ipv4/ip_fragment.c, struct frag_queue in ipv6/reassembly.c, ...) Once this ktime infrastructure adopted, we can more easily provide nanosecond resolution on top of it. (ioctl SIOCGSTAMPNS and/or SO_TIMESTAMPNS/SCM_TIMESTAMPNS) Note : this patch includes a bug correction in compat_sock_get_timestamp() where a "err = 0;" was missing (so this syscall returned -ENOENT instead of 0) Signed-off-by: Eric Dumazet CC: Stephen Hemminger CC: John find Signed-off-by: David S. Miller --- include/net/sock.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index a3366c3c837a..9583639090d2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -244,7 +244,7 @@ struct sock { struct sk_filter *sk_filter; void *sk_protinfo; struct timer_list sk_timer; - struct timeval sk_stamp; + ktime_t sk_stamp; struct socket *sk_socket; void *sk_user_data; struct page *sk_sndmsg_page; @@ -1307,19 +1307,19 @@ static inline int sock_intr_errno(long timeo) static __inline__ void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - struct timeval stamp; + ktime_t kt = skb->tstamp; - skb_get_timestamp(skb, &stamp); if (sock_flag(sk, SOCK_RCVTSTAMP)) { + struct timeval tv; /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ - if (stamp.tv_sec == 0) - do_gettimeofday(&stamp); - skb_set_timestamp(skb, &stamp); - put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(struct timeval), - &stamp); + if (kt.tv64 == 0) + kt = ktime_get_real(); + skb->tstamp = kt; + tv = ktime_to_timeval(kt); + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(tv), &tv); } else - sk->sk_stamp = stamp; + sk->sk_stamp = kt; } /** -- cgit v1.2.2 From fe067e8ab5e0dc5ca3c54634924c628da92090b4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 7 Mar 2007 12:12:44 -0800 Subject: [TCP]: Abstract out all write queue operations. This allows the write queue implementation to be changed, for example, to one which allows fast interval searching. Signed-off-by: David S. Miller --- include/net/sock.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 9583639090d2..2974bacc8850 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -710,15 +710,6 @@ static inline void sk_stream_mem_reclaim(struct sock *sk) __sk_stream_mem_reclaim(sk); } -static inline void sk_stream_writequeue_purge(struct sock *sk) -{ - struct sk_buff *skb; - - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) - sk_stream_free_skb(sk, skb); - sk_stream_mem_reclaim(sk); -} - static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) { return (int)skb->truesize <= sk->sk_forward_alloc || @@ -1256,18 +1247,6 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk) return page; } -#define sk_stream_for_retrans_queue(skb, sk) \ - for (skb = (sk)->sk_write_queue.next; \ - (skb != (sk)->sk_send_head) && \ - (skb != (struct sk_buff *)&(sk)->sk_write_queue); \ - skb = skb->next) - -/*from STCP for fast SACK Process*/ -#define sk_stream_for_retrans_queue_from(skb, sk) \ - for (; (skb != (sk)->sk_send_head) && \ - (skb != (struct sk_buff *)&(sk)->sk_write_queue); \ - skb = skb->next) - /* * Default write policy as shown to user space via poll/select/SIGIO */ -- cgit v1.2.2 From ae40eb1ef30ab4120bd3c8b7e3da99ee53d27a23 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 18 Mar 2007 17:33:16 -0700 Subject: [NET]: Introduce SIOCGSTAMPNS ioctl to get timestamps with nanosec resolution Now network timestamps use ktime_t infrastructure, we can add a new ioctl() SIOCGSTAMPNS command to get timestamps in 'struct timespec'. User programs can thus access to nanosecond resolution. Signed-off-by: Eric Dumazet CC: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 2974bacc8850..d093e49fdc85 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1329,6 +1329,7 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_e extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); +extern int sock_get_timestampns(struct sock *, struct timespec __user *); /* * Enable debug/info messages -- cgit v1.2.2 From a2a316fd068c455c609ecc155dcfaa7e208d29fe Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 8 Mar 2007 20:41:08 -0800 Subject: [NET]: Replace CONFIG_NET_DEBUG with sysctl. Covert network warning messages from a compile time to runtime choice. Removes kernel config option and replaces it with new /proc/sys/net/core/warnings. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/sock.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index d093e49fdc85..51246579592e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1334,14 +1334,12 @@ extern int sock_get_timestampns(struct sock *, struct timespec __user *); /* * Enable debug/info messages */ +extern int net_msg_warn; +#define NETDEBUG(fmt, args...) \ + do { if (net_msg_warn) printk(fmt,##args); } while (0) -#ifdef CONFIG_NETDEBUG -#define NETDEBUG(fmt, args...) printk(fmt,##args) -#define LIMIT_NETDEBUG(fmt, args...) do { if (net_ratelimit()) printk(fmt,##args); } while(0) -#else -#define NETDEBUG(fmt, args...) do { } while (0) -#define LIMIT_NETDEBUG(fmt, args...) do { } while(0) -#endif +#define LIMIT_NETDEBUG(fmt, args...) \ + do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0) /* * Macros for sleeping on a socket. Use them like this: -- cgit v1.2.2 From 92f37fd2ee805aa77925c1e64fd56088b46094fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 25 Mar 2007 22:14:49 -0700 Subject: [NET]: Adding SO_TIMESTAMPNS / SCM_TIMESTAMPNS support Now that network timestamps use ktime_t infrastructure, we can add a new SOL_SOCKET sockopt SO_TIMESTAMPNS. This command is similar to SO_TIMESTAMP, but permits transmission of a 'timespec struct' instead of a 'timeval struct' control message. (nanosecond resolution instead of microsecond) Control message is labelled SCM_TIMESTAMPNS instead of SCM_TIMESTAMP A socket cannot mix SO_TIMESTAMP and SO_TIMESTAMPNS : the two modes are mutually exclusive. sock_recv_timestamp() became too big to be fully inlined so I added a __sock_recv_timestamp() helper function. Signed-off-by: Eric Dumazet CC: linux-arch@vger.kernel.org Signed-off-by: David S. Miller --- include/net/sock.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 51246579592e..390c04700590 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -390,6 +390,7 @@ enum sock_flags { SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ + SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ }; @@ -1283,21 +1284,17 @@ static inline int sock_intr_errno(long timeo) return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; } +extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb); + static __inline__ void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { ktime_t kt = skb->tstamp; - if (sock_flag(sk, SOCK_RCVTSTAMP)) { - struct timeval tv; - /* Race occurred between timestamp enabling and packet - receiving. Fill in the current time for now. */ - if (kt.tv64 == 0) - kt = ktime_get_real(); - skb->tstamp = kt; - tv = ktime_to_timeval(kt); - put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(tv), &tv); - } else + if (sock_flag(sk, SOCK_RCVTSTAMP)) + __sock_recv_timestamp(msg, sk, skb); + else sk->sk_stamp = kt; } -- cgit v1.2.2 From 9958089a43ae8a9af07402461c0b2b7548c7341e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 Apr 2007 17:12:43 -0700 Subject: [NET]: Move sk_setup_caps() out of line. It is far too large to be an inline and not in any hot paths. Signed-off-by: Andi Kleen Signed-off-by: David S. Miller --- include/net/sock.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 390c04700590..25c37e34bfdc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1075,19 +1075,7 @@ static inline int sk_can_gso(const struct sock *sk) return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); } -static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) -{ - __sk_dst_set(sk, dst); - sk->sk_route_caps = dst->dev->features; - if (sk->sk_route_caps & NETIF_F_GSO) - sk->sk_route_caps |= NETIF_F_GSO_MASK; - if (sk_can_gso(sk)) { - if (dst->header_len) - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - else - sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - } -} +extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst); static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) { -- cgit v1.2.2