From 8d987e5c75107ca7515fa19e857cfa24aab6ec8f Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 9 Nov 2010 23:24:26 +0000
Subject: net: avoid limits overflow

Robin Holt tried to boot a 16TB machine and found some limits were
reached: sysctl_tcp_mem[2], sysctl_udp_mem[2]

We can switch the infrastructure to use "long" instead of "int", now
that atomic_long_t primitives are available for free.

Signed-off-by: Eric Dumazet
Reported-by: Robin Holt
Reviewed-by: Robin Holt
Signed-off-by: Andrew Morton
Signed-off-by: David S. Miller
---
 include/net/sock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index c7a736228ca2..a6338d039857 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -762,7 +762,7 @@ struct proto {
 	/* Memory pressure */
 	void		(*enter_memory_pressure)(struct sock *sk);
-	atomic_t	*memory_allocated;	/* Current allocated memory. */
+	atomic_long_t	*memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	/*
 	 * Pressure flag: try to collapse.
@@ -771,7 +771,7 @@ struct proto {
 	 * is strict, actions are advisory and have some latency.
 	 */
 	int		*memory_pressure;
-	int		*sysctl_mem;
+	long		*sysctl_mem;
 	int		*sysctl_wmem;
 	int		*sysctl_rmem;
 	int		max_header;
--
cgit v1.2.2
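The arithmetic behind the overflow is worth spelling out: these sysctls
count 4KB pages, and a 16TB machine has 2^32 of them, which no longer
fits in a 32bit int. A minimal user-space sketch of the check
(illustration only, not kernel code; assumes a 64bit host):

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long total_ram = 16UL << 40;	/* 16TB machine */
	unsigned long pages = total_ram >> 12;	/* 4KB pages: 2^32 of them */

	/* tcp_mem limits are a fraction of total pages, but even the
	 * full count no longer fits in a 32bit int */
	printf("pages = %lu, INT_MAX = %d\n", pages, INT_MAX);
	printf("fits in int: %s\n",
	       pages <= (unsigned long)INT_MAX ? "yes" : "no");
	return 0;
}

atomic_long_t matches the native word size, so 64bit arches get the
extra headroom at no cost, which is why the switch is "for free".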
From c31504dc0d1dc853dcee509d9999169a9097a717 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 15 Nov 2010 19:58:26 +0000
Subject: udp: use atomic_inc_not_zero_hint

A UDP socket's refcount is usually 2, unless an incoming frame is going
to be queued in the receive or backlog queue.

Using atomic_inc_not_zero_hint() reduces latency, because the processor
issues fewer memory transactions.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index a6338d039857..eb0c1f504678 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -57,7 +57,7 @@
 #include
 #include
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include
 #include
--
cgit v1.2.2
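The trick behind atomic_inc_not_zero_hint() is to skip the initial load
of the counter: start the cmpxchg loop directly from the expected value
(2 for an idle UDP socket), so the common case needs a single memory
transaction. A C11 sketch modeled on the kernel helper (an illustration,
not the kernel's code):

#include <stdatomic.h>
#include <stdbool.h>

/* Try to increment *v unless it is zero, guessing its current value.
 * A correct guess avoids the load that would normally precede the
 * cmpxchg, saving one memory transaction on the hot path. */
static bool inc_not_zero_hint(atomic_int *v, int hint)
{
	int c = hint ? hint : atomic_load(v);	/* no guess: load first */

	while (c != 0) {
		/* on failure, c is refreshed with the current value
		 * and we retry from there */
		if (atomic_compare_exchange_weak(v, &c, c + 1))
			return true;
	}
	return false;	/* already zero: never revive a dead refcount */
}

On the UDP receive path the caller passes 2, so when the socket is idle
the very first cmpxchg succeeds.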
From b178bb3dfc30d9555bdd2401e95af98e23e83e10 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 16 Nov 2010 05:56:04 +0000
Subject: net: reorder struct sock fields

Right now, fields in struct sock are not optimally ordered, because each
path (RX softirq, TX completion, RX user, TX user) has to touch fields
that are spread over many different cache lines.

The really critical thing is to shrink the number of cache lines used at
RX softirq time: the CPU handling softirqs for a device can receive many
frames per second for many sockets. If the load is too big, we can drop
frames at the NIC level. RPS or multiqueue cards can help, but it is
better to reduce latency if possible.

This patch starts with the UDP protocol; additional patches will try to
reduce latencies of other protocols as well.

At RX softirq time, the fields of interest for the UDP protocol are
(not counting the ones in the inet struct used for the lookup):

Read/Written:
    sk_refcnt (atomic increment/decrement)
    sk_rmem_alloc & sk_backlog.len (to check if there is room in queues)
    sk_receive_queue
    sk_backlog (if socket locked by user program)
    sk_rxhash
    sk_forward_alloc
    sk_drops

Read only:
    sk_rcvbuf (sk_rcvqueues_full())
    sk_filter
    sk_wq
    sk_policy[0]
    sk_flags

Additional notes:

- sk_backlog has one hole on 64bit arches. We can fill it to save
  8 bytes.
- sk_backlog is used only if the RX softirq handler finds the socket
  locked by the user program.
- sk_rxhash is written only once per flow.
- sk_drops is written only if queues are full.

Final layout:

[1] One section grouping all read/write fields, but placing rxhash and
    sk_backlog at the end of this section.

[2] One section grouping all fields read in the RX handler (sk_filter,
    sk_rcvbuf, sk_wq).

[3] Section used by other paths.

I'll post a patch on its own to put sk_refcnt at the end of struct
sock_common so that it shares the same cache line as section [1].

New offsets on a 64bit arch:

sizeof(struct sock)=0x268
offsetof(struct sock, sk_refcnt) =0x10
offsetof(struct sock, sk_lock) =0x48
offsetof(struct sock, sk_receive_queue)=0x68
offsetof(struct sock, sk_backlog)=0x80
offsetof(struct sock, sk_rmem_alloc)=0x80
offsetof(struct sock, sk_forward_alloc)=0x98
offsetof(struct sock, sk_rxhash)=0x9c
offsetof(struct sock, sk_rcvbuf)=0xa4
offsetof(struct sock, sk_drops) =0xa0
offsetof(struct sock, sk_filter)=0xa8
offsetof(struct sock, sk_wq)=0xb0
offsetof(struct sock, sk_policy)=0xd0
offsetof(struct sock, sk_flags) =0xe0

Instead of:

sizeof(struct sock)=0x270
offsetof(struct sock, sk_refcnt) =0x10
offsetof(struct sock, sk_lock) =0x50
offsetof(struct sock, sk_receive_queue)=0xc0
offsetof(struct sock, sk_backlog)=0x70
offsetof(struct sock, sk_rmem_alloc)=0xac
offsetof(struct sock, sk_forward_alloc)=0x10c
offsetof(struct sock, sk_rxhash)=0x128
offsetof(struct sock, sk_rcvbuf)=0x4c
offsetof(struct sock, sk_drops) =0x16c
offsetof(struct sock, sk_filter)=0x198
offsetof(struct sock, sk_wq)=0x88
offsetof(struct sock, sk_policy)=0x98
offsetof(struct sock, sk_flags) =0x130

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/sock.h | 55 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 24 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index eb0c1f504678..5557dfb3dd68 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -241,59 +241,67 @@ struct sock {
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
 #define sk_net			__sk_common.skc_net
-	kmemcheck_bitfield_begin(flags);
-	unsigned int		sk_shutdown  : 2,
-				sk_no_check  : 2,
-				sk_userlocks : 4,
-				sk_protocol  : 8,
-				sk_type      : 16;
-	kmemcheck_bitfield_end(flags);
-	int			sk_rcvbuf;
 	socket_lock_t		sk_lock;
+	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
 	 * the per-socket spinlock held and requires low latency
 	 * access. Therefore we special case it's implementation.
+	 * Note : rmem_alloc is in this structure to fill a hole
+	 * on 64bit arches, not because its logically part of
+	 * backlog.
 	 */
 	struct {
-		struct sk_buff *head;
-		struct sk_buff *tail;
-		int len;
+		atomic_t	rmem_alloc;
+		int		len;
+		struct sk_buff	*head;
+		struct sk_buff	*tail;
 	} sk_backlog;
+#define sk_rmem_alloc sk_backlog.rmem_alloc
+	int			sk_forward_alloc;
+#ifdef CONFIG_RPS
+	__u32			sk_rxhash;
+#endif
+	atomic_t		sk_drops;
+	int			sk_rcvbuf;
+
+	struct sk_filter __rcu	*sk_filter;
 	struct socket_wq	*sk_wq;
-	struct dst_entry	*sk_dst_cache;
+
+#ifdef CONFIG_NET_DMA
+	struct sk_buff_head	sk_async_wait_queue;
+#endif
+
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
 #endif
+	unsigned long		sk_flags;
+	struct dst_entry	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
-	atomic_t		sk_rmem_alloc;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
-	struct sk_buff_head	sk_receive_queue;
 	struct sk_buff_head	sk_write_queue;
-#ifdef CONFIG_NET_DMA
-	struct sk_buff_head	sk_async_wait_queue;
-#endif
+	kmemcheck_bitfield_begin(flags);
+	unsigned int		sk_shutdown  : 2,
+				sk_no_check  : 2,
+				sk_userlocks : 4,
+				sk_protocol  : 8,
+				sk_type      : 16;
+	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
-	int			sk_forward_alloc;
 	gfp_t			sk_allocation;
 	int			sk_route_caps;
 	int			sk_route_nocaps;
 	int			sk_gso_type;
 	unsigned int		sk_gso_max_size;
 	int			sk_rcvlowat;
-#ifdef CONFIG_RPS
-	__u32			sk_rxhash;
-#endif
-	unsigned long		sk_flags;
 	unsigned long		sk_lingertime;
 	struct sk_buff_head	sk_error_queue;
 	struct proto		*sk_prot_creator;
 	rwlock_t		sk_callback_lock;
 	int			sk_err, sk_err_soft;
-	atomic_t		sk_drops;
 	unsigned short		sk_ack_backlog;
 	unsigned short		sk_max_ack_backlog;
 	__u32			sk_priority;
@@ -301,7 +309,6 @@ struct sock {
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
 	long			sk_sndtimeo;
-	struct sk_filter __rcu	*sk_filter;
 	void			*sk_protinfo;
 	struct timer_list	sk_timer;
 	ktime_t			sk_stamp;
--
cgit v1.2.2
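Such groupings can be audited mechanically with offsetof(). The sketch
below uses a hypothetical hot_sock layout; only the cache-line auditing
technique is the point, not the field list:

#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE 64
#define LINE_OF(type, field)	(offsetof(type, field) / CACHE_LINE)

struct hot_sock {			/* hypothetical layout */
	/* section [1]: read/write fields touched at RX softirq time */
	int		rmem_alloc;
	int		backlog_len;
	int		forward_alloc;
	unsigned int	rxhash;
	int		drops;
	/* section [2]: read-mostly fields of the RX handler */
	int		rcvbuf;
	void		*filter;
	void		*wq;
};

int main(void)
{
	/* fields that print the same line number share a cache line */
	printf("rmem_alloc: line %zu\n", LINE_OF(struct hot_sock, rmem_alloc));
	printf("drops:      line %zu\n", LINE_OF(struct hot_sock, drops));
	printf("rcvbuf:     line %zu\n", LINE_OF(struct hot_sock, rcvbuf));
	return 0;
}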
From dca9b2404a6d6579828da2425c051462701efd3f Mon Sep 17 00:00:00 2001
From: Shan Wei
Date: Wed, 1 Dec 2010 18:05:17 +0000
Subject: net: kill unused macros from header file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These macros have been defined for several years, since v2.6.12-rc2
(tracing by git), but have never been used. So remove them.

Signed-off-by: Shan Wei
Signed-off-by: David S. Miller
---
 include/net/sock.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 5557dfb3dd68..717cfbf649df 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -516,9 +516,6 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 #define sk_nulls_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
 		hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
-#define sk_for_each_continue(__sk, node) \
-	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
-		hlist_for_each_entry_continue(__sk, node, sk_node)
 #define sk_for_each_safe(__sk, node, tmp, list) \
 	hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, node, list) \
--
cgit v1.2.2

From 46bcf14f44d8f31ecfdc8b6708ec15a3b33316d9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 6 Dec 2010 09:29:43 -0800
Subject: filter: fix sk_filter rcu handling

Pavel Emelyanov tried to fix a race between sk_filter_(de|at)tach and
sk_clone() in commit 47e958eac280c263397.

The problem is that several clones can share a common sk_filter, and
these clones might want to sk_filter_attach() their own filters at the
same time, overwriting old_filter->rcu and corrupting the RCU queues.

We cannot use filter->rcu without being sure no other thread could do
the same thing.

Switch the code to a more conventional ref-counting technique: do the
atomic decrement immediately, and queue one RCU callback when the last
reference is released.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/sock.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index a6338d039857..659d968d95c5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1155,6 +1155,8 @@ extern void sk_common_release(struct sock *sk);
 /* Initialise core socket variables */
 extern void sock_init_data(struct socket *sock, struct sock *sk);
 
+extern void sk_filter_release_rcu(struct rcu_head *rcu);
+
 /**
  *	sk_filter_release - release a socket filter
  *	@fp: filter to remove
@@ -1165,7 +1167,7 @@ extern void sock_init_data(struct socket *sock, struct sock *sk);
 static inline void sk_filter_release(struct sk_filter *fp)
 {
 	if (atomic_dec_and_test(&fp->refcnt))
-		kfree(fp);
+		call_rcu_bh(&fp->rcu, sk_filter_release_rcu);
 }
 
 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
--
cgit v1.2.2
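The pattern is the classic "decrement now, free later" refcount/RCU
combination: only the thread that drops the last reference ever touches
fp->rcu, so concurrent attach/detach can no longer collide on it. A
plain C11 sketch (defer_free() is a stand-in for call_rcu_bh(); all
names here are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stddef.h>
#include <stdlib.h>

struct rcu_head {
	void (*func)(struct rcu_head *);
};

struct filter {
	atomic_int	refcnt;
	struct rcu_head	rcu;	/* written by exactly one thread: the final put */
};

/* stand-in for call_rcu_bh(): a real implementation would defer func()
 * past an RCU grace period instead of invoking it immediately */
static void defer_free(struct rcu_head *head, void (*func)(struct rcu_head *))
{
	head->func = func;	/* single writer, so no corruption is possible */
	func(head);
}

static void filter_free(struct rcu_head *head)
{
	/* container_of() in the kernel */
	free((char *)head - offsetof(struct filter, rcu));
}

static void filter_put(struct filter *fp)
{
	/* decrement immediately; fetch_sub returns the old value, so
	 * exactly one caller observes the 1 -> 0 transition */
	if (atomic_fetch_sub(&fp->refcnt, 1) == 1)
		defer_free(&fp->rcu, filter_free);
}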
From 68835aba4d9b74e2f94106d13b6a4bddc447c4c8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 30 Nov 2010 19:04:07 +0000
Subject: net: optimize INET input path further

Followup of commit b178bb3dfc30 (net: reorder struct sock fields)

Optimize the INET input path a bit further, by:

1) moving sk_refcnt close to sk_lock. This reduces the number of
   dirtied cache lines by one on 64bit arches (with a 64 byte cache
   line size).

2) moving inet_daddr & inet_rcv_saddr to the beginning of sk (same
   cache line as hash / family / bound_dev_if / nulls_node). This
   reduces the number of cache lines accessed in lookups by one, and
   doesn't increase the size of inet and timewait socks. inet and tw
   sockets now share the same place-holder for these fields.

Before patch:

offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274

After patch:

offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4

compute_score() (udp or tcp) now uses a single cache line per ignored
item, instead of two.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/sock.h | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 3482004e5c29..82e86034702f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -105,10 +105,8 @@ struct net;
 
 /**
  *	struct sock_common - minimal network layer representation of sockets
- *	@skc_node: main hash linkage for various protocol lookup tables
- *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
- *	@skc_refcnt: reference count
- *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_daddr: Foreign IPv4 addr
+ *	@skc_rcv_saddr: Bound local IPv4 addr
 *	@skc_hash: hash value used with various protocol lookup tables
 *	@skc_u16hashes: two u16 hash values used by UDP lookup tables
 *	@skc_family: network address family
@@ -119,20 +117,20 @@ struct net;
 *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
 *	@skc_prot: protocol handlers inside a network family
 *	@skc_net: reference to the network namespace of this socket
+ *	@skc_node: main hash linkage for various protocol lookup tables
+ *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
+ *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_refcnt: reference count
 *
 *	This is the minimal network layer representation of sockets, the header
 *	for struct sock and struct inet_timewait_sock.
 */
 struct sock_common {
-	/*
-	 * first fields are not copied in sock_copy()
+	/* skc_daddr and skc_rcv_saddr must be grouped :
+	 * cf INET_MATCH() and INET_TW_MATCH()
 	 */
-	union {
-		struct hlist_node	skc_node;
-		struct hlist_nulls_node skc_nulls_node;
-	};
-	atomic_t		skc_refcnt;
-	int			skc_tx_queue_mapping;
+	__be32			skc_daddr;
+	__be32			skc_rcv_saddr;
 
 	union  {
 		unsigned int	skc_hash;
@@ -150,6 +148,18 @@ struct sock_common {
 #ifdef CONFIG_NET_NS
 	struct net	 	*skc_net;
 #endif
+	/*
+	 * fields between dontcopy_begin/dontcopy_end
+	 * are not copied in sock_copy()
+	 */
+	int			skc_dontcopy_begin[0];
+	union {
+		struct hlist_node	skc_node;
+		struct hlist_nulls_node skc_nulls_node;
+	};
+	int			skc_tx_queue_mapping;
+	atomic_t		skc_refcnt;
+	int			skc_dontcopy_end[0];
 };
 
@@ -232,7 +242,8 @@ struct sock {
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping
 
-#define sk_copy_start		__sk_common.skc_hash
+#define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
+#define sk_dontcopy_end		__sk_common.skc_dontcopy_end
 #define sk_hash			__sk_common.skc_hash
 #define sk_family		__sk_common.skc_family
 #define sk_state		__sk_common.skc_state
--
cgit v1.2.2
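The skc_dontcopy_begin/skc_dontcopy_end markers are zero-size arrays:
they occupy no space, but give sock_copy() exact byte boundaries so a
clone can skip the fields (hash linkage, refcount) that must never be
duplicated. A sketch of the technique on a made-up struct (zero-size
arrays are a GNU C extension, as in the kernel; clone_obj() is an
illustration, not the kernel's sock_copy()):

#include <string.h>
#include <stddef.h>

struct obj {
	int	a, b;			/* copied into clones */
	char	dontcopy_begin[0];	/* zero bytes: an offset marker only */
	void	*list_node;		/* linkage: must not be duplicated */
	int	refcnt;			/* neither must the reference count */
	char	dontcopy_end[0];
	int	c;			/* copied into clones */
};

static void clone_obj(struct obj *dst, const struct obj *src)
{
	/* copy everything before the markers... */
	memcpy(dst, src, offsetof(struct obj, dontcopy_begin));
	/* ...skip [begin, end)... then copy everything after */
	memcpy((char *)dst + offsetof(struct obj, dontcopy_end),
	       (const char *)src + offsetof(struct obj, dontcopy_end),
	       sizeof(struct obj) - offsetof(struct obj, dontcopy_end));
}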
From fcbdf09d9652c8919dcf47072e3ae7dcb4eb98ac Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Thu, 16 Dec 2010 14:26:56 -0800
Subject: net: fix nulls list corruptions in sk_prot_alloc

Special care is taken inside sk_prot_alloc to avoid overwriting
skc_node/skc_nulls_node. We should also avoid overwriting
skc_bind_node/skc_portaddr_node.

The patch fixes the following crash:

 BUG: unable to handle kernel paging request at fffffffffffffff0
 IP: [] udp4_lib_lookup2+0xad/0x370
 [] __udp4_lib_lookup+0x282/0x360
 [] __udp4_lib_rcv+0x31e/0x700
 [] ? ip_local_deliver_finish+0x65/0x190
 [] ? ip_local_deliver+0x88/0xa0
 [] udp_rcv+0x15/0x20
 [] ip_local_deliver_finish+0x65/0x190
 [] ip_local_deliver+0x88/0xa0
 [] ip_rcv_finish+0x32d/0x6f0
 [] ? netif_receive_skb+0x99c/0x11c0
 [] ip_rcv+0x2bb/0x350
 [] netif_receive_skb+0x99c/0x11c0

Signed-off-by: Leonard Crestez
Signed-off-by: Octavian Purdila
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/sock.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 659d968d95c5..7d3f7ce239b5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -754,6 +754,7 @@ struct proto {
 	void		(*unhash)(struct sock *sk);
 	void		(*rehash)(struct sock *sk);
 	int		(*get_port)(struct sock *sk, unsigned short snum);
+	void		(*clear_sk)(struct sock *sk, int size);
 
 	/* Keeping track of sockets in use */
 #ifdef CONFIG_PROC_FS
@@ -852,6 +853,8 @@ static inline void __sk_prot_rehash(struct sock *sk)
 	sk->sk_prot->hash(sk);
 }
 
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
+
 /* About 10 seconds */
 #define SOCK_DESTROY_TIME (10*HZ)
--
cgit v1.2.2
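The idea behind sk_prot_clear_portaddr_nulls() is to clear a recycled
socket with memset() calls that go around the preserved linkage fields
rather than over them, so lockless nulls-list readers never observe
wild pointers. A sketch on an illustrative struct (field names and
layout are not the kernel's):

#include <string.h>
#include <stddef.h>

struct node { struct node *next, **pprev; };

struct sockish {			/* illustrative, not struct sock */
	int		proto_state;	/* cleared on recycle */
	struct node	bind_node;	/* must survive the clear */
	struct node	portaddr_node;	/* must survive the clear */
	int		scratch[32];	/* cleared on recycle */
};

static void clear_preserving_links(struct sockish *sk)
{
	/* zero everything before the first preserved field */
	memset(sk, 0, offsetof(struct sockish, bind_node));
	/* and everything after the last one; a concurrent lockless
	 * reader walking portaddr_node never sees its pointers wiped
	 * out from under it */
	memset(sk->scratch, 0,
	       sizeof(*sk) - offsetof(struct sockish, scratch));
}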