From b6c6712a42ca3f9fa7f4a3d7c40e3a9dd1fd9e03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Apr 2010 23:03:29 +0000 Subject: net: sk_dst_cache RCUification With latest CONFIG_PROVE_RCU stuff, I felt more comfortable to make this work. sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock) This rwlock is readlocked for a very small amount of time, and dst entries are already freed after RCU grace period. This calls for RCU again :) This patch converts sk_dst_lock to a spinlock, and use RCU for readers. __sk_dst_get() is supposed to be called with rcu_read_lock() or if socket locked by user, so use appropriate rcu_dereference_check() condition (rcu_read_lock_held() || sock_owned_by_user(sk)) This patch avoids two atomic ops per tx packet on UDP connected sockets, for example, and permits sk_dst_lock to be much less dirtied. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index b4603cd54fcd..56df440a950b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -262,7 +262,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif - rwlock_t sk_dst_lock; + spinlock_t sk_dst_lock; atomic_t sk_rmem_alloc; atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; @@ -1192,7 +1192,8 @@ extern unsigned long sock_i_ino(struct sock *sk); static inline struct dst_entry * __sk_dst_get(struct sock *sk) { - return sk->sk_dst_cache; + return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() || + sock_owned_by_user(sk)); } static inline struct dst_entry * @@ -1200,50 +1201,62 @@ sk_dst_get(struct sock *sk) { struct dst_entry *dst; - read_lock(&sk->sk_dst_lock); - dst = sk->sk_dst_cache; + rcu_read_lock(); + dst = rcu_dereference(sk->sk_dst_cache); if (dst) dst_hold(dst); - read_unlock(&sk->sk_dst_lock); + rcu_read_unlock(); return dst; } +extern void sk_reset_txq(struct sock *sk); + +static inline void dst_negative_advice(struct sock *sk) +{ + struct dst_entry *ndst, *dst = __sk_dst_get(sk); + + if (dst && dst->ops->negative_advice) { + ndst = dst->ops->negative_advice(dst); + + if (ndst != dst) { + rcu_assign_pointer(sk->sk_dst_cache, ndst); + sk_reset_txq(sk); + } + } +} + static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); - old_dst = sk->sk_dst_cache; - sk->sk_dst_cache = dst; + old_dst = rcu_dereference_check(sk->sk_dst_cache, + lockdep_is_held(&sk->sk_dst_lock)); + rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void sk_dst_set(struct sock *sk, struct dst_entry *dst) { - write_lock(&sk->sk_dst_lock); + spin_lock(&sk->sk_dst_lock); __sk_dst_set(sk, dst); - write_unlock(&sk->sk_dst_lock); + spin_unlock(&sk->sk_dst_lock); } static inline void __sk_dst_reset(struct sock *sk) { - struct dst_entry *old_dst; - - sk_tx_queue_clear(sk); - old_dst = sk->sk_dst_cache; - sk->sk_dst_cache = NULL; - dst_release(old_dst); + __sk_dst_set(sk, NULL); } static inline void sk_dst_reset(struct sock *sk) { - write_lock(&sk->sk_dst_lock); + spin_lock(&sk->sk_dst_lock); __sk_dst_reset(sk); - write_unlock(&sk->sk_dst_lock); + spin_unlock(&sk->sk_dst_lock); } extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); -- cgit v1.2.2 From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 13:03:51 +0000 Subject: net: sk_sleep() helper Define a new function to return the waitqueue of a "struct sock". static inline wait_queue_head_t *sk_sleep(struct sock *sk) { return sk->sk_sleep; } Change all read occurrences of sk_sleep by a call to this function. Needed for a future RCU conversion. sk_sleep wont be a field directly available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 56df440a950b..8ab05146a447 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1160,6 +1160,10 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) sk->sk_socket = sock; } +static inline wait_queue_head_t *sk_sleep(struct sock *sk) +{ + return sk->sk_sleep; +} /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, @@ -1346,8 +1350,8 @@ static inline int sk_has_allocations(const struct sock *sk) * tp->rcv_nxt check sock_def_readable * ... { * schedule ... - * if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - * wake_up_interruptible(sk->sk_sleep) + * if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) + * wake_up_interruptible(sk_sleep(sk)) * ... * } * @@ -1368,7 +1372,7 @@ static inline int sk_has_sleeper(struct sock *sk) * This memory barrier is paired in the sock_poll_wait. */ smp_mb__after_lock(); - return sk->sk_sleep && waitqueue_active(sk->sk_sleep); + return sk_sleep(sk) && waitqueue_active(sk_sleep(sk)); } /** -- cgit v1.2.2 From f68c224fedff2157f3fad7f7da674cbc96567c84 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Apr 2010 16:06:59 -0700 Subject: dst: rcu check refinement __sk_dst_get() might be called from softirq, with socket lock held. [ 159.026180] include/net/sock.h:1200 invoked rcu_dereference_check() without protection! [ 159.026261] [ 159.026261] other info that might help us debug this: [ 159.026263] [ 159.026425] [ 159.026426] rcu_scheduler_active = 1, debug_locks = 0 [ 159.026552] 2 locks held by swapper/0: [ 159.026609] #0: (&icsk->icsk_retransmit_timer){+.-...}, at: [] run_timer_softirq+0x105/0x350 [ 159.026839] #1: (slock-AF_INET){+.-...}, at: [] tcp_write_timer+0x2f/0x1e0 [ 159.027063] [ 159.027064] stack backtrace: [ 159.027172] Pid: 0, comm: swapper Not tainted 2.6.34-rc5-03707-gde498c8-dirty #36 [ 159.027252] Call Trace: [ 159.027306] [] lockdep_rcu_dereference +0xaf/0xc0 [ 159.027411] [] tcp_current_mss+0xa7/0xb0 [ 159.027537] [] tcp_write_wakeup+0x89/0x190 [ 159.027600] [] tcp_send_probe0+0x16/0x100 [ 159.027726] [] tcp_write_timer+0x179/0x1e0 [ 159.027790] [] run_timer_softirq+0x191/0x350 [ 159.027980] [] __do_softirq+0xcd/0x200 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 8ab05146a447..86a8ca177a29 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1197,7 +1197,8 @@ static inline struct dst_entry * __sk_dst_get(struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() || - sock_owned_by_user(sk)); + sock_owned_by_user(sk) || + lockdep_is_held(&sk->sk_lock.slock)); } static inline struct dst_entry * -- cgit v1.2.2 From 0b53ff2eadb1db6818894435f85989fb05d7e718 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 26 Apr 2010 20:40:43 +0000 Subject: net: fix a lockdep rcu warning in __sk_dst_set() __sk_dst_set() might be called while no state can be integrated in a rcu_dereference_check() condition. So use rcu_dereference_raw() to shutup lockdep warnings (if CONFIG_PROVE_RCU is set) Signed-off-by: Eric Dumazet Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/net/sock.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 86a8ca177a29..4081db86a352 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1236,8 +1236,11 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old_dst; sk_tx_queue_clear(sk); - old_dst = rcu_dereference_check(sk->sk_dst_cache, - lockdep_is_held(&sk->sk_dst_lock)); + /* + * This can be called while sk is owned by the caller only, + * with no state that can be checked in a rcu_dereference_check() cond + */ + old_dst = rcu_dereference_raw(sk->sk_dst_cache); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } -- cgit v1.2.2 From c58dc01babfd58ec9e71a6ce080150dc27755d88 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Apr 2010 15:05:31 -0700 Subject: net: Make RFS socket operations not be inet specific. Idea from Eric Dumazet. As for placement inside of struct sock, I tried to choose a place that otherwise has a 32-bit hole on 64-bit systems. Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- include/net/sock.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 4081db86a352..07822280d953 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -198,6 +198,7 @@ struct sock_common { * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting + * @sk_rxhash: flow hash received from netif layer * @sk_filter: socket filtering instructions * @sk_protinfo: private area, net family specific, when not using slab * @sk_timer: sock cleanup timer @@ -279,6 +280,9 @@ struct sock { int sk_gso_type; unsigned int sk_gso_max_size; int sk_rcvlowat; +#ifdef CONFIG_RPS + __u32 sk_rxhash; +#endif unsigned long sk_flags; unsigned long sk_lingertime; struct sk_buff_head sk_error_queue; @@ -620,6 +624,40 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) return sk->sk_backlog_rcv(sk, skb); } +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + rps_record_sock_flow(sock_flow_table, sk->sk_rxhash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_reset_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + rcu_read_lock(); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + rps_reset_sock_flow(sock_flow_table, sk->sk_rxhash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash) +{ +#ifdef CONFIG_RPS + if (unlikely(sk->sk_rxhash != rxhash)) { + sock_rps_reset_flow(sk); + sk->sk_rxhash = rxhash; + } +#endif +} + #define sk_wait_event(__sk, __timeo, __condition) \ ({ int __rc; \ release_sock(__sk); \ -- cgit v1.2.2 From c377411f2494a931ff7facdbb3a6839b1266bcf6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Apr 2010 15:13:20 -0700 Subject: net: sk_add_backlog() take rmem_alloc into account Current socket backlog limit is not enough to really stop DDOS attacks, because user thread spend many time to process a full backlog each round, and user might crazy spin on socket lock. We should add backlog size and receive_queue size (aka rmem_alloc) to pace writers, and let user run without being slow down too much. Introduce a sk_rcvqueues_full() helper, to avoid taking socket lock in stress situations. Under huge stress from a multiqueue/RPS enabled NIC, a single flow udp receiver can now process ~200.000 pps (instead of ~100 pps before the patch) on a 8 core machine. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 07822280d953..cf12b1e61fa6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -256,7 +256,6 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; int len; - int limit; } sk_backlog; wait_queue_head_t *sk_sleep; struct dst_entry *sk_dst_cache; @@ -608,10 +607,20 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) skb->next = NULL; } +/* + * Take into account size of receive queue and backlog queue + */ +static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) +{ + unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); + + return qsize + skb->truesize > sk->sk_rcvbuf; +} + /* The per-socket spinlock must be held here. */ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb) { - if (sk->sk_backlog.len >= max(sk->sk_backlog.limit, sk->sk_rcvbuf << 1)) + if (sk_rcvqueues_full(sk, skb)) return -ENOBUFS; __sk_add_backlog(sk, skb); -- cgit v1.2.2 From 4b0b72f7dd617b13abd1b04c947e15873e011a24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Apr 2010 14:35:48 -0700 Subject: net: speedup udp receive path Since commit 95766fff ([UDP]: Add memory accounting.), each received packet needs one extra sock_lock()/sock_release() pair. This added latency because of possible backlog handling. Then later, ticket spinlocks added yet another latency source in case of DDOS. This patch introduces lock_sock_bh() and unlock_sock_bh() synchronization primitives, avoiding one atomic operation and backlog processing. skb_free_datagram_locked() uses them instead of full blown lock_sock()/release_sock(). skb is orphaned inside locked section for proper socket memory reclaim, and finally freed outside of it. UDP receive path now take the socket spinlock only once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index cf12b1e61fa6..d361c7769fe0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1021,6 +1021,16 @@ extern void release_sock(struct sock *sk); SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) +static inline void lock_sock_bh(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); +} + +static inline void unlock_sock_bh(struct sock *sk) +{ + spin_unlock_bh(&sk->sk_lock.slock); +} + extern struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot); -- cgit v1.2.2 From 767dd03369ac18af58efdef0383d6eb986eab426 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Apr 2010 19:14:43 +0000 Subject: net: speedup sock_recv_ts_and_drops() sock_recv_ts_and_drops() is fat and slow (~ 4% of cpu time on some profiles) We can test all socket flags at once to make fast path fast again. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index d361c7769fe0..e1777db5b9ab 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1635,7 +1635,24 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) sk->sk_stamp = kt; } -extern void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb); + +static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ +#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ + (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \ + (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE) | \ + (1UL << SOCK_TIMESTAMPING_SYS_HARDWARE)) + + if (sk->sk_flags & FLAGS_TS_OR_DROPS) + __sock_recv_ts_and_drops(msg, sk, skb); + else + sk->sk_stamp = skb->tstamp; +} /** * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped -- cgit v1.2.2 From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Apr 2010 11:01:49 +0000 Subject: net: sock_def_readable() and friends RCU conversion sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed : 1) Add a new structure, called "struct socket_wq" to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one of such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect RCU grace period when freeing a "struct socket_wq" 4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct socket_wq" 5) Change sk_sleep() function to use new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside a rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to : - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to eventually wakeup tasks. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use rcu protection as well. 9) Exceptions : macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq" instead of dynamically allocated ones. They dont need rcu freeing. Some cleanups or followups are probably needed, (possible sk_callback_lock conversion to a spinlock for example...). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index e1777db5b9ab..cc7f91ec972c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -159,7 +159,7 @@ struct sock_common { * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_rcvbuf: size of receive buffer in bytes - * @sk_sleep: sock wait queue + * @sk_wq: sock wait queue and async head * @sk_dst_cache: destination cache * @sk_dst_lock: destination cache lock * @sk_policy: flow policy @@ -257,7 +257,7 @@ struct sock { struct sk_buff *tail; int len; } sk_backlog; - wait_queue_head_t *sk_sleep; + struct socket_wq *sk_wq; struct dst_entry *sk_dst_cache; #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; @@ -1219,7 +1219,7 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) static inline wait_queue_head_t *sk_sleep(struct sock *sk) { - return sk->sk_sleep; + return &sk->sk_wq->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. @@ -1233,14 +1233,14 @@ static inline void sock_orphan(struct sock *sk) write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); - sk->sk_sleep = NULL; + sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); - sk->sk_sleep = &parent->wait; + rcu_assign_pointer(sk->sk_wq, parent->wq); parent->sk = sk; sk_set_socket(sk, parent); security_sock_graft(sk, parent); @@ -1392,12 +1392,12 @@ static inline int sk_has_allocations(const struct sock *sk) } /** - * sk_has_sleeper - check if there are any waiting processes - * @sk: socket + * wq_has_sleeper - check if there are any waiting processes + * @sk: struct socket_wq * - * Returns true if socket has waiting processes + * Returns true if socket_wq has waiting processes * - * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory + * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths: @@ -1410,9 +1410,10 @@ static inline int sk_has_allocations(const struct sock *sk) * ... ... * tp->rcv_nxt check sock_def_readable * ... { - * schedule ... - * if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) - * wake_up_interruptible(sk_sleep(sk)) + * schedule rcu_read_lock(); + * wq = rcu_dereference(sk->sk_wq); + * if (wq && waitqueue_active(&wq->wait)) + * wake_up_interruptible(&wq->wait) * ... * } * @@ -1421,19 +1422,18 @@ static inline int sk_has_allocations(const struct sock *sk) * could then endup calling schedule and sleep forever if there are no more * data on the socket. * - * The sk_has_sleeper is always called right after a call to read_lock, so we - * can use smp_mb__after_lock barrier. */ -static inline int sk_has_sleeper(struct sock *sk) +static inline bool wq_has_sleeper(struct socket_wq *wq) { + /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier is paired in the sock_poll_wait. */ - smp_mb__after_lock(); - return sk_sleep(sk) && waitqueue_active(sk_sleep(sk)); + smp_mb(); + return wq && waitqueue_active(&wq->wait); } /** @@ -1442,7 +1442,7 @@ static inline int sk_has_sleeper(struct sock *sk) * @wait_address: socket wait queue * @p: poll_table * - * See the comments in the sk_has_sleeper function. + * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) @@ -1453,7 +1453,7 @@ static inline void sock_poll_wait(struct file *filp, * We need to be sure we are in sync with the * socket flags modification. * - * This memory barrier is paired in the sk_has_sleeper. + * This memory barrier is paired in the wq_has_sleeper. */ smp_mb(); } -- cgit v1.2.2 From a465419b1febb603821f924805529cff89cafeed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 May 2010 00:36:33 -0700 Subject: net: Introduce sk_route_nocaps TCP-MD5 sessions have intermittent failures, when route cache is invalidated. ip_queue_xmit() has to find a new route, calls sk_setup_caps(sk, &rt->u.dst), destroying the sk->sk_route_caps &= ~NETIF_F_GSO_MASK that MD5 desperately try to make all over its way (from tcp_transmit_skb() for example) So we send few bad packets, and everything is fine when tcp_transmit_skb() is called again for this socket. Since ip_queue_xmit() is at a lower level than TCP-MD5, I chose to use a socket field, sk_route_nocaps, containing bits to mask on sk_route_caps. Reported-by: Bhaskar Dutta Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 328e03f47dd1..aed16eb9db4b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -177,6 +177,7 @@ struct sock_common { * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) + * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_lingertime: %SO_LINGER l_linger setting @@ -276,6 +277,7 @@ struct sock { int sk_forward_alloc; gfp_t sk_allocation; int sk_route_caps; + int sk_route_nocaps; int sk_gso_type; unsigned int sk_gso_max_size; int sk_rcvlowat; @@ -1335,6 +1337,12 @@ static inline int sk_can_gso(const struct sock *sk) extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst); +static inline void sk_nocaps_add(struct sock *sk, int flags) +{ + sk->sk_route_nocaps |= flags; + sk->sk_route_caps &= ~flags; +} + static inline int skb_copy_to_page(struct sock *sk, char __user *from, struct sk_buff *skb, struct page *page, int off, int copy) -- cgit v1.2.2 From 7fee226ad2397b635e2fd565a59ca3ae08a164cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 May 2010 23:19:48 +0000 Subject: net: add a noref bit on skb dst Use low order bit of skb->_skb_dst to tell dst is not refcounted. Change _skb_dst to _skb_refdst to make sure all uses are catched. skb_dst() returns the dst, regardless of noref bit set or not, but with a lockdep check to make sure a noref dst is not given if current user is not rcu protected. New skb_dst_set_noref() helper to set an notrefcounted dst on a skb. (with lockdep check) skb_dst_drop() drops a reference only if skb dst was refcounted. skb_dst_force() helper is used to force a refcount on dst, when skb is queued and not anymore RCU protected. Use skb_dst_force() in __sk_add_backlog(), __dev_xmit_skb() if !IFF_XMIT_DST_RELEASE or skb enqueued on qdisc queue, in sock_queue_rcv_skb(), in __nf_queue(). Use skb_dst_force() in dev_requeue_skb(). Note: dst_use_noref() still dirties dst, we might transform it later to do one dirtying per jiffies. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index aed16eb9db4b..5697caf8cc76 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -600,12 +600,15 @@ static inline int sk_stream_memory_free(struct sock *sk) /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_backlog.tail) { - sk->sk_backlog.head = sk->sk_backlog.tail = skb; - } else { + /* dont let skb dst not refcounted, we are going to leave rcu lock */ + skb_dst_force(skb); + + if (!sk->sk_backlog.tail) + sk->sk_backlog.head = skb; + else sk->sk_backlog.tail->next = skb; - sk->sk_backlog.tail = skb; - } + + sk->sk_backlog.tail = skb; skb->next = NULL; } -- cgit v1.2.2