diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-04-08 19:03:29 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-04-13 04:41:33 -0400 |
commit | b6c6712a42ca3f9fa7f4a3d7c40e3a9dd1fd9e03 (patch) | |
tree | 42032b4978874e8ffcf6c851d13324b8c8c7c113 /net | |
parent | 7a161ea92471087a1579239d7a58dd06eaa5601c (diff) |
net: sk_dst_cache RCUification
With the latest CONFIG_PROVE_RCU stuff, I felt more comfortable making this
work.
sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock).
This rwlock is read-locked for a very small amount of time, and dst
entries are already freed after an RCU grace period. This calls for RCU
again :)
This patch converts sk_dst_lock to a spinlock, and uses RCU for readers.
__sk_dst_get() is supposed to be called with rcu_read_lock() held or with the
socket locked by the user, so use the appropriate rcu_dereference_check()
condition (rcu_read_lock_held() || sock_owned_by_user(sk)).
This patch avoids two atomic ops per tx packet on UDP connected sockets,
for example, and permits sk_dst_lock to be much less dirtied.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/core/dev.c | 2 | ||||
-rw-r--r-- | net/core/sock.c | 8 | ||||
-rw-r--r-- | net/dccp/timer.c | 4 | ||||
-rw-r--r-- | net/decnet/af_decnet.c | 6 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 4 | ||||
-rw-r--r-- | net/ipv6/ipv6_sockglue.c | 25 |
8 files changed, 28 insertions, 27 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 0eb79e35671f..ca4cdef74a1b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -2015,7 +2015,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, | |||
2015 | if (dev->real_num_tx_queues > 1) | 2015 | if (dev->real_num_tx_queues > 1) |
2016 | queue_index = skb_tx_hash(dev, skb); | 2016 | queue_index = skb_tx_hash(dev, skb); |
2017 | 2017 | ||
2018 | if (sk && sk->sk_dst_cache) | 2018 | if (sk && rcu_dereference_check(sk->sk_dst_cache, 1)) |
2019 | sk_tx_queue_set(sk, queue_index); | 2019 | sk_tx_queue_set(sk, queue_index); |
2020 | } | 2020 | } |
2021 | } | 2021 | } |
diff --git a/net/core/sock.c b/net/core/sock.c index c5812bbc2cc9..7effa1e689df 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -364,11 +364,11 @@ EXPORT_SYMBOL(sk_reset_txq); | |||
364 | 364 | ||
365 | struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) | 365 | struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) |
366 | { | 366 | { |
367 | struct dst_entry *dst = sk->sk_dst_cache; | 367 | struct dst_entry *dst = __sk_dst_get(sk); |
368 | 368 | ||
369 | if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { | 369 | if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { |
370 | sk_tx_queue_clear(sk); | 370 | sk_tx_queue_clear(sk); |
371 | sk->sk_dst_cache = NULL; | 371 | rcu_assign_pointer(sk->sk_dst_cache, NULL); |
372 | dst_release(dst); | 372 | dst_release(dst); |
373 | return NULL; | 373 | return NULL; |
374 | } | 374 | } |
@@ -1157,7 +1157,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) | |||
1157 | skb_queue_head_init(&newsk->sk_async_wait_queue); | 1157 | skb_queue_head_init(&newsk->sk_async_wait_queue); |
1158 | #endif | 1158 | #endif |
1159 | 1159 | ||
1160 | rwlock_init(&newsk->sk_dst_lock); | 1160 | spin_lock_init(&newsk->sk_dst_lock); |
1161 | rwlock_init(&newsk->sk_callback_lock); | 1161 | rwlock_init(&newsk->sk_callback_lock); |
1162 | lockdep_set_class_and_name(&newsk->sk_callback_lock, | 1162 | lockdep_set_class_and_name(&newsk->sk_callback_lock, |
1163 | af_callback_keys + newsk->sk_family, | 1163 | af_callback_keys + newsk->sk_family, |
@@ -1898,7 +1898,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) | |||
1898 | } else | 1898 | } else |
1899 | sk->sk_sleep = NULL; | 1899 | sk->sk_sleep = NULL; |
1900 | 1900 | ||
1901 | rwlock_init(&sk->sk_dst_lock); | 1901 | spin_lock_init(&sk->sk_dst_lock); |
1902 | rwlock_init(&sk->sk_callback_lock); | 1902 | rwlock_init(&sk->sk_callback_lock); |
1903 | lockdep_set_class_and_name(&sk->sk_callback_lock, | 1903 | lockdep_set_class_and_name(&sk->sk_callback_lock, |
1904 | af_callback_keys + sk->sk_family, | 1904 | af_callback_keys + sk->sk_family, |
diff --git a/net/dccp/timer.c b/net/dccp/timer.c index bbfeb5eae46a..1a9aa05d4dc4 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c | |||
@@ -38,7 +38,7 @@ static int dccp_write_timeout(struct sock *sk) | |||
38 | 38 | ||
39 | if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { | 39 | if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { |
40 | if (icsk->icsk_retransmits != 0) | 40 | if (icsk->icsk_retransmits != 0) |
41 | dst_negative_advice(&sk->sk_dst_cache, sk); | 41 | dst_negative_advice(sk); |
42 | retry_until = icsk->icsk_syn_retries ? | 42 | retry_until = icsk->icsk_syn_retries ? |
43 | : sysctl_dccp_request_retries; | 43 | : sysctl_dccp_request_retries; |
44 | } else { | 44 | } else { |
@@ -63,7 +63,7 @@ static int dccp_write_timeout(struct sock *sk) | |||
63 | Golden words :-). | 63 | Golden words :-). |
64 | */ | 64 | */ |
65 | 65 | ||
66 | dst_negative_advice(&sk->sk_dst_cache, sk); | 66 | dst_negative_advice(sk); |
67 | } | 67 | } |
68 | 68 | ||
69 | retry_until = sysctl_dccp_retries2; | 69 | retry_until = sysctl_dccp_retries2; |
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 2b494fac9468..55e3b6b0061a 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c | |||
@@ -446,7 +446,7 @@ static void dn_destruct(struct sock *sk) | |||
446 | skb_queue_purge(&scp->other_xmit_queue); | 446 | skb_queue_purge(&scp->other_xmit_queue); |
447 | skb_queue_purge(&scp->other_receive_queue); | 447 | skb_queue_purge(&scp->other_receive_queue); |
448 | 448 | ||
449 | dst_release(xchg(&sk->sk_dst_cache, NULL)); | 449 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); |
450 | } | 450 | } |
451 | 451 | ||
452 | static int dn_memory_pressure; | 452 | static int dn_memory_pressure; |
@@ -1105,7 +1105,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) | |||
1105 | release_sock(sk); | 1105 | release_sock(sk); |
1106 | 1106 | ||
1107 | dst = skb_dst(skb); | 1107 | dst = skb_dst(skb); |
1108 | dst_release(xchg(&newsk->sk_dst_cache, dst)); | 1108 | sk_dst_set(newsk, dst); |
1109 | skb_dst_set(skb, NULL); | 1109 | skb_dst_set(skb, NULL); |
1110 | 1110 | ||
1111 | DN_SK(newsk)->state = DN_CR; | 1111 | DN_SK(newsk)->state = DN_CR; |
@@ -1956,7 +1956,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, | |||
1956 | } | 1956 | } |
1957 | 1957 | ||
1958 | if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) | 1958 | if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) |
1959 | dst_negative_advice(&sk->sk_dst_cache, sk); | 1959 | dst_negative_advice(sk); |
1960 | 1960 | ||
1961 | mss = scp->segsize_rem; | 1961 | mss = scp->segsize_rem; |
1962 | fctype = scp->services_rem & NSP_FC_MASK; | 1962 | fctype = scp->services_rem & NSP_FC_MASK; |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a0beb32beaa3..193dcd6ed64f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -154,7 +154,7 @@ void inet_sock_destruct(struct sock *sk) | |||
154 | WARN_ON(sk->sk_forward_alloc); | 154 | WARN_ON(sk->sk_forward_alloc); |
155 | 155 | ||
156 | kfree(inet->opt); | 156 | kfree(inet->opt); |
157 | dst_release(sk->sk_dst_cache); | 157 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); |
158 | sk_refcnt_debug_dec(sk); | 158 | sk_refcnt_debug_dec(sk); |
159 | } | 159 | } |
160 | EXPORT_SYMBOL(inet_sock_destruct); | 160 | EXPORT_SYMBOL(inet_sock_destruct); |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4000b10610b7..ae3ec15fb630 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -3710,7 +3710,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
3710 | } | 3710 | } |
3711 | 3711 | ||
3712 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) | 3712 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) |
3713 | dst_confirm(sk->sk_dst_cache); | 3713 | dst_confirm(__sk_dst_get(sk)); |
3714 | 3714 | ||
3715 | return 1; | 3715 | return 1; |
3716 | 3716 | ||
@@ -5833,7 +5833,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5833 | if (tp->snd_una == tp->write_seq) { | 5833 | if (tp->snd_una == tp->write_seq) { |
5834 | tcp_set_state(sk, TCP_FIN_WAIT2); | 5834 | tcp_set_state(sk, TCP_FIN_WAIT2); |
5835 | sk->sk_shutdown |= SEND_SHUTDOWN; | 5835 | sk->sk_shutdown |= SEND_SHUTDOWN; |
5836 | dst_confirm(sk->sk_dst_cache); | 5836 | dst_confirm(__sk_dst_get(sk)); |
5837 | 5837 | ||
5838 | if (!sock_flag(sk, SOCK_DEAD)) | 5838 | if (!sock_flag(sk, SOCK_DEAD)) |
5839 | /* Wake up lingering close() */ | 5839 | /* Wake up lingering close() */ |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8a0ab2977f1f..c732be00606b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -172,14 +172,14 @@ static int tcp_write_timeout(struct sock *sk) | |||
172 | 172 | ||
173 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 173 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
174 | if (icsk->icsk_retransmits) | 174 | if (icsk->icsk_retransmits) |
175 | dst_negative_advice(&sk->sk_dst_cache, sk); | 175 | dst_negative_advice(sk); |
176 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; | 176 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
177 | } else { | 177 | } else { |
178 | if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { | 178 | if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { |
179 | /* Black hole detection */ | 179 | /* Black hole detection */ |
180 | tcp_mtu_probing(icsk, sk); | 180 | tcp_mtu_probing(icsk, sk); |
181 | 181 | ||
182 | dst_negative_advice(&sk->sk_dst_cache, sk); | 182 | dst_negative_advice(sk); |
183 | } | 183 | } |
184 | 184 | ||
185 | retry_until = sysctl_tcp_retries2; | 185 | retry_until = sysctl_tcp_retries2; |
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 33f60fca7aa7..1160400e9dbd 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c | |||
@@ -114,9 +114,9 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, | |||
114 | } | 114 | } |
115 | opt = xchg(&inet6_sk(sk)->opt, opt); | 115 | opt = xchg(&inet6_sk(sk)->opt, opt); |
116 | } else { | 116 | } else { |
117 | write_lock(&sk->sk_dst_lock); | 117 | spin_lock(&sk->sk_dst_lock); |
118 | opt = xchg(&inet6_sk(sk)->opt, opt); | 118 | opt = xchg(&inet6_sk(sk)->opt, opt); |
119 | write_unlock(&sk->sk_dst_lock); | 119 | spin_unlock(&sk->sk_dst_lock); |
120 | } | 120 | } |
121 | sk_dst_reset(sk); | 121 | sk_dst_reset(sk); |
122 | 122 | ||
@@ -971,14 +971,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, | |||
971 | case IPV6_MTU: | 971 | case IPV6_MTU: |
972 | { | 972 | { |
973 | struct dst_entry *dst; | 973 | struct dst_entry *dst; |
974 | |||
974 | val = 0; | 975 | val = 0; |
975 | lock_sock(sk); | 976 | rcu_read_lock(); |
976 | dst = sk_dst_get(sk); | 977 | dst = __sk_dst_get(sk); |
977 | if (dst) { | 978 | if (dst) |
978 | val = dst_mtu(dst); | 979 | val = dst_mtu(dst); |
979 | dst_release(dst); | 980 | rcu_read_unlock(); |
980 | } | ||
981 | release_sock(sk); | ||
982 | if (!val) | 981 | if (!val) |
983 | return -ENOTCONN; | 982 | return -ENOTCONN; |
984 | break; | 983 | break; |
@@ -1066,12 +1065,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, | |||
1066 | else | 1065 | else |
1067 | val = np->mcast_hops; | 1066 | val = np->mcast_hops; |
1068 | 1067 | ||
1069 | dst = sk_dst_get(sk); | 1068 | if (val < 0) { |
1070 | if (dst) { | 1069 | rcu_read_lock(); |
1071 | if (val < 0) | 1070 | dst = __sk_dst_get(sk); |
1071 | if (dst) | ||
1072 | val = ip6_dst_hoplimit(dst); | 1072 | val = ip6_dst_hoplimit(dst); |
1073 | dst_release(dst); | 1073 | rcu_read_unlock(); |
1074 | } | 1074 | } |
1075 | |||
1075 | if (val < 0) | 1076 | if (val < 0) |
1076 | val = sock_net(sk)->ipv6.devconf_all->hop_limit; | 1077 | val = sock_net(sk)->ipv6.devconf_all->hop_limit; |
1077 | break; | 1078 | break; |