author		Julian Anastasov <ja@ssi.bg>	2013-03-21 05:58:10 -0400
committer	Pablo Neira Ayuso <pablo@netfilter.org>	2013-04-01 18:23:45 -0400
commit		088339a57d6042a8a19a3d5794594b558cd7b624 (patch)
tree		835a3b82d7504f7f5670a7b130eedabf14d0ccc2
parent		60b6aa3b319d902db49dbaee7433fe2ac7d0cdb5 (diff)
ipvs: convert connection locking
Convert __ip_vs_conntbl_lock_array as follows:
- readers that do not modify the conn lists will use an RCU read-side critical section
- updaters that modify the lists will use a spinlock_t (see the sketch below)
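In outline, the per-bucket lock array keeps its shape but becomes
updater-only. A condensed excerpt of the resulting pattern, assembled
from the hunks below (the removed ct_read_*() and ct_*_bh() wrappers
are left out; CT_LOCKARRAY_SIZE and CT_LOCKARRAY_MASK are defined
earlier in the file):

struct ip_vs_aligned_lock
{
	spinlock_t	l;	/* was rwlock_t: readers no longer take it */
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;

/* Only updaters that modify a hash chain take the bucket lock */
static inline void ct_write_lock(unsigned int key)
{
	spin_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock(unsigned int key)
{
	spin_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}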
Conn lookups now run inside an RCU read-side critical section. Without
taking a reference via __ip_vs_conn_get, such readers can access the
connection fields, dereference pointers such as pe and pe_data, and
update the timer expiration. If full access is required, we contend
for a reference, as sketched below.
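As an illustration, the inbound lookup now takes this shape (a trimmed
rendering of __ip_vs_conn_in_get() as changed below; the surrounding
debug output is omitted):

static struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
	unsigned int hash = ip_vs_conn_hashkey_param(p, false);
	struct ip_vs_conn *cp;

	rcu_read_lock();
	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
		if (cp->af == p->af &&
		    p->cport == cp->cport && p->vport == cp->vport &&
		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
		    p->protocol == cp->protocol &&
		    ip_vs_conn_net_eq(cp, p->net)) {
			/* Contend for a reference; this can fail when the
			 * conn was already unlinked and refcnt hit zero.
			 */
			if (!__ip_vs_conn_get(cp))	/* atomic_inc_not_zero */
				continue;
			/* HIT */
			rcu_read_unlock();
			return cp;
		}
	}
	rcu_read_unlock();

	return NULL;
}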
We add a memory barrier in __ip_vs_conn_put, so that
other CPUs see the refcnt decrement only after our earlier writes.
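The helper stays tiny; the pairing intent, as I read it, is that the
atomic_cmpxchg() on refcnt in ip_vs_conn_unlink() cannot observe the
dropped reference without also observing the writes that preceded it:

/* include/net/ip_vs.h after this patch */
static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* Order our earlier stores before the decrement, so a CPU that
	 * later reads refcnt == 1 also sees everything we wrote.
	 */
	smp_mb__before_atomic_dec();
	atomic_dec(&cp->refcnt);
}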
With the introduction of ip_vs_conn_unlink() we reorganize
ip_vs_conn_expire(), so that connections that should stay
around longer are never unhashed, even briefly (see the
condensed sketch below).
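Put together, the expiry path now looks roughly like this (a condensed
sketch assembled from the hunks below; stats, conntrack, app/dest
unbinding and sync handling are elided):

/* Condensed from ip_vs_conn_expire() after this patch: the conn is
 * never unhashed unless we are truly the last user.
 */
static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

	if (atomic_read(&cp->n_control))
		goto expire_later;

	/* ip_vs_conn_unlink() unhashes only when
	 * atomic_cmpxchg(&cp->refcnt, 1, 0) shows we held the last ref
	 */
	if (likely(ip_vs_conn_unlink(cp))) {
		del_timer(&cp->timer);
		/* ... unbind app and dest, drop conntrack entry ... */
		call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
		return;
	}

  expire_later:
	/* still referenced or controlling others: stay hashed, retry */
	atomic_inc(&cp->refcnt);
	cp->timeout = 60*HZ;
	ip_vs_conn_put(cp);
}

The old flow unhashed first and rehashed when the conn had to survive;
that transient window is what the reorganization removes.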
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
-rw-r--r--	include/net/ip_vs.h	12
-rw-r--r--	net/netfilter/ipvs/ip_vs_conn.c	230
2 files changed, 134 insertions(+), 108 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index b06aa6c939fa..5700b07b5186 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -620,6 +620,8 @@ struct ip_vs_conn {
 	const struct ip_vs_pe *pe;
 	char *pe_data;
 	__u8 pe_data_len;
+
+	struct rcu_head rcu_head;
 };
 
 /*
@@ -1185,9 +1187,19 @@ struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
 					    const struct ip_vs_iphdr *iph,
 					    int inverse);
 
+/* Get reference to gain full access to conn.
+ * By default, RCU read-side critical sections have access only to
+ * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference.
+ */
+static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp)
+{
+	return atomic_inc_not_zero(&cp->refcnt);
+}
+
 /* put back the conn without restarting its timer */
 static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
 {
+	smp_mb__before_atomic_dec();
 	atomic_dec(&cp->refcnt);
 }
 extern void ip_vs_conn_put(struct ip_vs_conn *cp);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 704e514e02ab..b0cd2be01d75 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -79,51 +79,21 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
 
 struct ip_vs_aligned_lock
 {
-	rwlock_t	l;
+	spinlock_t	l;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 /* lock array for conn table */
 static struct ip_vs_aligned_lock
 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
 
-static inline void ct_read_lock(unsigned int key)
-{
-	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned int key)
-{
-	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
 static inline void ct_write_lock(unsigned int key)
 {
-	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+	spin_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 }
 
 static inline void ct_write_unlock(unsigned int key)
 {
-	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned int key)
-{
-	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned int key)
-{
-	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock_bh(unsigned int key)
-{
-	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock_bh(unsigned int key)
-{
-	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+	spin_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 }
 
 
@@ -201,9 +171,9 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	spin_lock(&cp->lock);
 
 	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
-		hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
 		cp->flags |= IP_VS_CONN_F_HASHED;
 		atomic_inc(&cp->refcnt);
+		hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
 		ret = 1;
 	} else {
 		pr_err("%s(): request for already hashed, called from %pF\n",
@@ -220,7 +190,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 
 /*
  *	UNhashes ip_vs_conn from ip_vs_conn_tab.
- *	returns bool success.
+ *	returns bool success. Caller should hold conn reference.
  */
 static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 {
@@ -234,7 +204,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	spin_lock(&cp->lock);
 
 	if (cp->flags & IP_VS_CONN_F_HASHED) {
-		hlist_del(&cp->c_list);
+		hlist_del_rcu(&cp->c_list);
 		cp->flags &= ~IP_VS_CONN_F_HASHED;
 		atomic_dec(&cp->refcnt);
 		ret = 1;
@@ -247,6 +217,36 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	return ret;
 }
 
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+	unsigned int hash;
+	bool ret;
+
+	hash = ip_vs_conn_hashkey_conn(cp);
+
+	ct_write_lock(hash);
+	spin_lock(&cp->lock);
+
+	if (cp->flags & IP_VS_CONN_F_HASHED) {
+		ret = false;
+		/* Decrease refcnt and unlink conn only if we are last user */
+		if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+			hlist_del_rcu(&cp->c_list);
+			cp->flags &= ~IP_VS_CONN_F_HASHED;
+			ret = true;
+		}
+	} else
+		ret = atomic_read(&cp->refcnt) ? false : true;
+
+	spin_unlock(&cp->lock);
+	ct_write_unlock(hash);
+
+	return ret;
+}
+
 
 /*
  *	Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
@@ -262,9 +262,9 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 
 	hash = ip_vs_conn_hashkey_param(p, false);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (cp->af == p->af &&
 		    p->cport == cp->cport && p->vport == cp->vport &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
@@ -272,14 +272,15 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
 		    p->protocol == cp->protocol &&
 		    ip_vs_conn_net_eq(cp, p->net)) {
+			if (!__ip_vs_conn_get(cp))
+				continue;
 			/* HIT */
-			atomic_inc(&cp->refcnt);
-			ct_read_unlock(hash);
+			rcu_read_unlock();
 			return cp;
 		}
 	}
 
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	return NULL;
 }
@@ -346,14 +347,16 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 
 	hash = ip_vs_conn_hashkey_param(p, false);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (!ip_vs_conn_net_eq(cp, p->net))
 			continue;
 		if (p->pe_data && p->pe->ct_match) {
-			if (p->pe == cp->pe && p->pe->ct_match(p, cp))
-				goto out;
+			if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+				if (__ip_vs_conn_get(cp))
+					goto out;
+			}
 			continue;
 		}
 
@@ -365,15 +368,15 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 				     p->af, p->vaddr, &cp->vaddr) &&
 		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    p->protocol == cp->protocol)
-			goto out;
+		    p->protocol == cp->protocol) {
+			if (__ip_vs_conn_get(cp))
+				goto out;
+		}
 	}
 	cp = NULL;
 
   out:
-	if (cp)
-		atomic_inc(&cp->refcnt);
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
 		      ip_vs_proto_name(p->protocol),
@@ -398,23 +401,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 	 */
 	hash = ip_vs_conn_hashkey_param(p, true);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (cp->af == p->af &&
 		    p->vport == cp->cport && p->cport == cp->dport &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
 		    p->protocol == cp->protocol &&
 		    ip_vs_conn_net_eq(cp, p->net)) {
+			if (!__ip_vs_conn_get(cp))
+				continue;
 			/* HIT */
-			atomic_inc(&cp->refcnt);
 			ret = cp;
 			break;
 		}
 	}
 
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
 		      ip_vs_proto_name(p->protocol),
@@ -757,41 +761,36 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
 		 * Simply decrease the refcnt of the template,
 		 * don't restart its timer.
 		 */
-		atomic_dec(&ct->refcnt);
+		__ip_vs_conn_put(ct);
 		return 0;
 	}
 	return 1;
 }
 
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+					     rcu_head);
+
+	ip_vs_pe_put(cp->pe);
+	kfree(cp->pe_data);
+	kmem_cache_free(ip_vs_conn_cachep, cp);
+}
+
 static void ip_vs_conn_expire(unsigned long data)
 {
 	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
 	struct net *net = ip_vs_conn_net(cp);
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	cp->timeout = 60*HZ;
-
-	/*
-	 *	hey, I'm using it
-	 */
-	atomic_inc(&cp->refcnt);
-
 	/*
 	 *	do I control anybody?
 	 */
 	if (atomic_read(&cp->n_control))
 		goto expire_later;
 
-	/*
-	 *	unhash it if it is hashed in the conn table
-	 */
-	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
-		goto expire_later;
-
-	/*
-	 *	refcnt==1 implies I'm the only one referrer
-	 */
-	if (likely(atomic_read(&cp->refcnt) == 1)) {
+	/* Unlink conn if not referenced anymore */
+	if (likely(ip_vs_conn_unlink(cp))) {
 		/* delete the timer if it is activated by other users */
 		del_timer(&cp->timer);
 
@@ -810,38 +809,41 @@ static void ip_vs_conn_expire(unsigned long data)
 			ip_vs_conn_drop_conntrack(cp);
 		}
 
-		ip_vs_pe_put(cp->pe);
-		kfree(cp->pe_data);
 		if (unlikely(cp->app != NULL))
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
 		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
 			atomic_dec(&ip_vs_conn_no_cport_cnt);
+		call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
 		atomic_dec(&ipvs->conn_count);
-
-		kmem_cache_free(ip_vs_conn_cachep, cp);
 		return;
 	}
 
-	/* hash it back to the table */
-	ip_vs_conn_hash(cp);
-
   expire_later:
-	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
-		  atomic_read(&cp->refcnt)-1,
+	IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+		  atomic_read(&cp->refcnt),
 		  atomic_read(&cp->n_control));
 
+	atomic_inc(&cp->refcnt);
+	cp->timeout = 60*HZ;
+
 	if (ipvs->sync_state & IP_VS_STATE_MASTER)
 		ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
 
 	ip_vs_conn_put(cp);
 }
 
-
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
 void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 {
-	if (del_timer(&cp->timer))
-		mod_timer(&cp->timer, jiffies);
+	/* Using mod_timer_pending will ensure the timer is not
+	 * modified after the final del_timer in ip_vs_conn_expire.
+	 */
+	if (timer_pending(&cp->timer) &&
+	    time_after(cp->timer.expires, jiffies))
+		mod_timer_pending(&cp->timer, jiffies);
 }
 
 
@@ -952,14 +954,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 	struct ip_vs_iter_state *iter = seq->private;
 
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
-		ct_read_lock_bh(idx);
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+			/* __ip_vs_conn_get() is not needed by
+			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+			 */
 			if (pos-- == 0) {
 				iter->l = &ip_vs_conn_tab[idx];
 				return cp;
 			}
 		}
-		ct_read_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 
 	return NULL;
@@ -977,6 +982,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct ip_vs_conn *cp = v;
 	struct ip_vs_iter_state *iter = seq->private;
+	struct hlist_node *e;
 	struct hlist_head *l = iter->l;
 	int idx;
 
@@ -985,19 +991,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return ip_vs_conn_array(seq, 0);
 
 	/* more on same hash chain? */
-	if (cp->c_list.next)
-		return hlist_entry(cp->c_list.next, struct ip_vs_conn, c_list);
+	e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+	if (e)
+		return hlist_entry(e, struct ip_vs_conn, c_list);
+	rcu_read_unlock();
 
 	idx = l - ip_vs_conn_tab;
-	ct_read_unlock_bh(idx);
-
 	while (++idx < ip_vs_conn_tab_size) {
-		ct_read_lock_bh(idx);
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
			iter->l = &ip_vs_conn_tab[idx];
 			return cp;
 		}
-		ct_read_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 	iter->l = NULL;
 	return NULL;
@@ -1009,7 +1015,7 @@ static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
 	struct hlist_head *l = iter->l;
 
 	if (l)
-		ct_read_unlock_bh(l - ip_vs_conn_tab);
+		rcu_read_unlock();
 }
 
 static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -1188,7 +1194,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
 void ip_vs_random_dropentry(struct net *net)
 {
 	int idx;
-	struct ip_vs_conn *cp;
+	struct ip_vs_conn *cp, *cp_c;
 
 	/*
 	 * Randomly scan 1/32 of the whole table every second
@@ -1199,9 +1205,9 @@ void ip_vs_random_dropentry(struct net *net)
 		/*
 		 *  Lock is actually needed in this loop.
 		 */
-		ct_write_lock_bh(hash);
+		rcu_read_lock();
 
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
 				/* connection template */
 				continue;
@@ -1228,12 +1234,15 @@ void ip_vs_random_dropentry(struct net *net)
 
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
-			if (cp->control) {
+			cp_c = cp->control;
+			/* cp->control is valid only with reference to cp */
+			if (cp_c && __ip_vs_conn_get(cp)) {
 				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp->control);
+				ip_vs_conn_expire_now(cp_c);
+				__ip_vs_conn_put(cp);
 			}
 		}
-		ct_write_unlock_bh(hash);
+		rcu_read_unlock();
 	}
 }
 
@@ -1244,7 +1253,7 @@ void ip_vs_random_dropentry(struct net *net)
 static void ip_vs_conn_flush(struct net *net)
 {
 	int idx;
-	struct ip_vs_conn *cp;
+	struct ip_vs_conn *cp, *cp_c;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 flush_again:
@@ -1252,19 +1261,22 @@ flush_again:
 		/*
 		 *  Lock is actually needed in this loop.
 		 */
-		ct_write_lock_bh(idx);
+		rcu_read_lock();
 
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (!ip_vs_conn_net_eq(cp, net))
 				continue;
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
-			if (cp->control) {
+			cp_c = cp->control;
+			/* cp->control is valid only with reference to cp */
+			if (cp_c && __ip_vs_conn_get(cp)) {
 				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp->control);
+				ip_vs_conn_expire_now(cp_c);
+				__ip_vs_conn_put(cp);
 			}
 		}
-		ct_write_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 
 	/* the counter may be not NULL, because maybe some conn entries
@@ -1331,7 +1343,7 @@ int __init ip_vs_conn_init(void)
 		INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
 
 	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
-		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+		spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
 	}
 
 	/* calculate the random value for connection hash */
@@ -1342,6 +1354,8 @@ int __init ip_vs_conn_init(void)
 
 void ip_vs_conn_cleanup(void)
 {
+	/* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+	rcu_barrier();
 	/* Release the empty cache */
 	kmem_cache_destroy(ip_vs_conn_cachep);
 	vfree(ip_vs_conn_tab);