author    Julian Anastasov <ja@ssi.bg>    2013-03-21 05:58:10 -0400
committer Pablo Neira Ayuso <pablo@netfilter.org>    2013-04-01 18:23:45 -0400
commit    088339a57d6042a8a19a3d5794594b558cd7b624 (patch)
tree      835a3b82d7504f7f5670a7b130eedabf14d0ccc2
parent    60b6aa3b319d902db49dbaee7433fe2ac7d0cdb5 (diff)
ipvs: convert connection locking
Convert __ip_vs_conntbl_lock_array as follows:

- readers that do not modify conn lists will use RCU lock
- updaters that modify lists will use spinlock_t

Now for conn lookups we will use RCU read-side critical section. Without
using __ip_vs_conn_get such places have access to connection fields and
can dereference some pointers like pe and pe_data plus the ability to
update timer expiration. If full access is required we contend for
reference.

We add barrier in __ip_vs_conn_put, so that other CPUs see the refcnt
operation after other writes.

With the introduction of ip_vs_conn_unlink() we try to reorganize
ip_vs_conn_expire(), so that unhashing of connections that should stay
more time is avoided, even if it is for very short time.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
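[Editor's note] The lookup half of this scheme is the usual RCU-plus-refcount
pattern: walk the hash chain under rcu_read_lock() and claim a reference with
atomic_inc_not_zero(), which fails once the entry's refcnt has already dropped
to zero. Below is a minimal self-contained sketch of that pattern, not part of
the patch; the demo_conn/demo_lookup names are invented for illustration,
while rcu_read_lock(), hlist_for_each_entry_rcu() and atomic_inc_not_zero()
are the kernel primitives the patch itself relies on.

	#include <linux/rculist.h>
	#include <linux/atomic.h>
	#include <linux/types.h>

	/* Hypothetical entry type, standing in for struct ip_vs_conn */
	struct demo_conn {
		struct hlist_node node;	/* chained in a hash bucket */
		atomic_t refcnt;	/* 0 means "being freed, don't touch" */
		__be16 port;
	};

	/* Lockless lookup: no bucket lock, only RCU plus a refcount grab */
	static struct demo_conn *demo_lookup(struct hlist_head *bucket,
					     __be16 port)
	{
		struct demo_conn *c;

		rcu_read_lock();
		hlist_for_each_entry_rcu(c, bucket, node) {
			if (c->port != port)
				continue;
			/* Fails if the last user already dropped refcnt to 0 */
			if (!atomic_inc_not_zero(&c->refcnt))
				continue;
			rcu_read_unlock();
			return c;	/* caller now holds a full reference */
		}
		rcu_read_unlock();
		return NULL;
	}

The teardown half of the scheme is sketched after the diffstat.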
-rw-r--r--  include/net/ip_vs.h              |  12
-rw-r--r--  net/netfilter/ipvs/ip_vs_conn.c  | 230
2 files changed, 134 insertions, 108 deletions
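[Editor's note] Teardown is what makes the lookup sketch above safe: an
expired connection is unlinked only when it is provably the last user. A
sketch of the idea behind ip_vs_conn_unlink() in this patch, reusing the
hypothetical demo_conn type from the previous sketch and assuming a
per-bucket spinlock as the patch uses:

	#include <linux/spinlock.h>

	/* Unlink only as last user; pairs with atomic_inc_not_zero() above */
	static bool demo_unlink(struct demo_conn *c, spinlock_t *bucket_lock)
	{
		bool ret = false;

		spin_lock(bucket_lock);
		/* Succeeds only if no reader won the race for a reference;
		 * afterwards atomic_inc_not_zero() on other CPUs must fail.
		 */
		if (atomic_cmpxchg(&c->refcnt, 1, 0) == 1) {
			hlist_del_rcu(&c->node); /* readers may still traverse */
			ret = true;	/* safe to hand entry to call_rcu() */
		}
		spin_unlock(bucket_lock);
		return ret;
	}

If the cmpxchg fails, a reader took a reference between the timer firing and
the unlink attempt, so the entry must stay hashed and expire later; this is
why ip_vs_conn_expire() below keeps such a connection hashed and rearms its
timer instead of unhashing and rehashing it.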
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index b06aa6c939fa..5700b07b5186 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -620,6 +620,8 @@ struct ip_vs_conn {
 	const struct ip_vs_pe *pe;
 	char *pe_data;
 	__u8 pe_data_len;
+
+	struct rcu_head rcu_head;
 };
 
 /*
@@ -1185,9 +1187,19 @@ struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
 					    const struct ip_vs_iphdr *iph,
 					    int inverse);
 
+/* Get reference to gain full access to conn.
+ * By default, RCU read-side critical sections have access only to
+ * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference.
+ */
+static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp)
+{
+	return atomic_inc_not_zero(&cp->refcnt);
+}
+
 /* put back the conn without restarting its timer */
 static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
 {
+	smp_mb__before_atomic_dec();
 	atomic_dec(&cp->refcnt);
 }
 extern void ip_vs_conn_put(struct ip_vs_conn *cp);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 704e514e02ab..b0cd2be01d75 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -79,51 +79,21 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
 
 struct ip_vs_aligned_lock
 {
-	rwlock_t l;
+	spinlock_t l;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 /* lock array for conn table */
 static struct ip_vs_aligned_lock
 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
 
-static inline void ct_read_lock(unsigned int key)
-{
-	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned int key)
-{
-	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
 static inline void ct_write_lock(unsigned int key)
 {
-	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+	spin_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 }
 
 static inline void ct_write_unlock(unsigned int key)
 {
-	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned int key)
-{
-	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned int key)
-{
-	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock_bh(unsigned int key)
-{
-	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock_bh(unsigned int key)
-{
-	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+	spin_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 }
 
 
@@ -201,9 +171,9 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	spin_lock(&cp->lock);
 
 	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
-		hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
 		cp->flags |= IP_VS_CONN_F_HASHED;
 		atomic_inc(&cp->refcnt);
+		hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
 		ret = 1;
 	} else {
 		pr_err("%s(): request for already hashed, called from %pF\n",
@@ -220,7 +190,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 
 /*
  *	UNhashes ip_vs_conn from ip_vs_conn_tab.
- *	returns bool success.
+ *	returns bool success. Caller should hold conn reference.
  */
 static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 {
@@ -234,7 +204,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	spin_lock(&cp->lock);
 
 	if (cp->flags & IP_VS_CONN_F_HASHED) {
-		hlist_del(&cp->c_list);
+		hlist_del_rcu(&cp->c_list);
 		cp->flags &= ~IP_VS_CONN_F_HASHED;
 		atomic_dec(&cp->refcnt);
 		ret = 1;
@@ -247,6 +217,36 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	return ret;
 }
 
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+	unsigned int hash;
+	bool ret;
+
+	hash = ip_vs_conn_hashkey_conn(cp);
+
+	ct_write_lock(hash);
+	spin_lock(&cp->lock);
+
+	if (cp->flags & IP_VS_CONN_F_HASHED) {
+		ret = false;
+		/* Decrease refcnt and unlink conn only if we are last user */
+		if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+			hlist_del_rcu(&cp->c_list);
+			cp->flags &= ~IP_VS_CONN_F_HASHED;
+			ret = true;
+		}
+	} else
+		ret = atomic_read(&cp->refcnt) ? false : true;
+
+	spin_unlock(&cp->lock);
+	ct_write_unlock(hash);
+
+	return ret;
+}
+
 
 /*
  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
@@ -262,9 +262,9 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 
 	hash = ip_vs_conn_hashkey_param(p, false);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (cp->af == p->af &&
 		    p->cport == cp->cport && p->vport == cp->vport &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
@@ -272,14 +272,15 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
 		    p->protocol == cp->protocol &&
 		    ip_vs_conn_net_eq(cp, p->net)) {
+			if (!__ip_vs_conn_get(cp))
+				continue;
 			/* HIT */
-			atomic_inc(&cp->refcnt);
-			ct_read_unlock(hash);
+			rcu_read_unlock();
 			return cp;
 		}
 	}
 
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	return NULL;
 }
@@ -346,14 +347,16 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 
 	hash = ip_vs_conn_hashkey_param(p, false);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (!ip_vs_conn_net_eq(cp, p->net))
 			continue;
 		if (p->pe_data && p->pe->ct_match) {
-			if (p->pe == cp->pe && p->pe->ct_match(p, cp))
-				goto out;
+			if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+				if (__ip_vs_conn_get(cp))
+					goto out;
+			}
 			continue;
 		}
 
@@ -365,15 +368,15 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 		    p->af, p->vaddr, &cp->vaddr) &&
 		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    p->protocol == cp->protocol)
-			goto out;
+		    p->protocol == cp->protocol) {
+			if (__ip_vs_conn_get(cp))
+				goto out;
+		}
 	}
 	cp = NULL;
 
   out:
-	if (cp)
-		atomic_inc(&cp->refcnt);
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
 		      ip_vs_proto_name(p->protocol),
@@ -398,23 +401,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 	 */
 	hash = ip_vs_conn_hashkey_param(p, true);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (cp->af == p->af &&
 		    p->vport == cp->cport && p->cport == cp->dport &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
 		    p->protocol == cp->protocol &&
 		    ip_vs_conn_net_eq(cp, p->net)) {
+			if (!__ip_vs_conn_get(cp))
+				continue;
 			/* HIT */
-			atomic_inc(&cp->refcnt);
 			ret = cp;
 			break;
 		}
 	}
 
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
 		      ip_vs_proto_name(p->protocol),
@@ -757,41 +761,36 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
 	 * Simply decrease the refcnt of the template,
 	 * don't restart its timer.
 	 */
-	atomic_dec(&ct->refcnt);
+	__ip_vs_conn_put(ct);
 		return 0;
 	}
 	return 1;
 }
 
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+					     rcu_head);
+
+	ip_vs_pe_put(cp->pe);
+	kfree(cp->pe_data);
+	kmem_cache_free(ip_vs_conn_cachep, cp);
+}
+
 static void ip_vs_conn_expire(unsigned long data)
 {
 	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
 	struct net *net = ip_vs_conn_net(cp);
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	cp->timeout = 60*HZ;
-
-	/*
-	 *	hey, I'm using it
-	 */
-	atomic_inc(&cp->refcnt);
-
 	/*
 	 *	do I control anybody?
 	 */
 	if (atomic_read(&cp->n_control))
 		goto expire_later;
 
-	/*
-	 *	unhash it if it is hashed in the conn table
-	 */
-	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
-		goto expire_later;
-
-	/*
-	 *	refcnt==1 implies I'm the only one referrer
-	 */
-	if (likely(atomic_read(&cp->refcnt) == 1)) {
+	/* Unlink conn if not referenced anymore */
+	if (likely(ip_vs_conn_unlink(cp))) {
 		/* delete the timer if it is activated by other users */
 		del_timer(&cp->timer);
 
@@ -810,38 +809,41 @@ static void ip_vs_conn_expire(unsigned long data)
 			ip_vs_conn_drop_conntrack(cp);
 		}
 
-		ip_vs_pe_put(cp->pe);
-		kfree(cp->pe_data);
 		if (unlikely(cp->app != NULL))
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
 		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
 			atomic_dec(&ip_vs_conn_no_cport_cnt);
+		call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
 		atomic_dec(&ipvs->conn_count);
-
-		kmem_cache_free(ip_vs_conn_cachep, cp);
 		return;
 	}
 
-	/* hash it back to the table */
-	ip_vs_conn_hash(cp);
-
   expire_later:
-	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
-		  atomic_read(&cp->refcnt)-1,
+	IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+		  atomic_read(&cp->refcnt),
 		  atomic_read(&cp->n_control));
 
+	atomic_inc(&cp->refcnt);
+	cp->timeout = 60*HZ;
+
 	if (ipvs->sync_state & IP_VS_STATE_MASTER)
 		ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
 
 	ip_vs_conn_put(cp);
 }
 
-
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
 void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 {
-	if (del_timer(&cp->timer))
-		mod_timer(&cp->timer, jiffies);
+	/* Using mod_timer_pending will ensure the timer is not
+	 * modified after the final del_timer in ip_vs_conn_expire.
+	 */
+	if (timer_pending(&cp->timer) &&
+	    time_after(cp->timer.expires, jiffies))
+		mod_timer_pending(&cp->timer, jiffies);
 }
 
 
@@ -952,14 +954,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 	struct ip_vs_iter_state *iter = seq->private;
 
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
-		ct_read_lock_bh(idx);
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+			/* __ip_vs_conn_get() is not needed by
+			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+			 */
 			if (pos-- == 0) {
 				iter->l = &ip_vs_conn_tab[idx];
 				return cp;
 			}
 		}
-		ct_read_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 
 	return NULL;
@@ -977,6 +982,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct ip_vs_conn *cp = v;
 	struct ip_vs_iter_state *iter = seq->private;
+	struct hlist_node *e;
 	struct hlist_head *l = iter->l;
 	int idx;
 
@@ -985,19 +991,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return ip_vs_conn_array(seq, 0);
 
 	/* more on same hash chain? */
-	if (cp->c_list.next)
-		return hlist_entry(cp->c_list.next, struct ip_vs_conn, c_list);
+	e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+	if (e)
+		return hlist_entry(e, struct ip_vs_conn, c_list);
+	rcu_read_unlock();
 
 	idx = l - ip_vs_conn_tab;
-	ct_read_unlock_bh(idx);
-
 	while (++idx < ip_vs_conn_tab_size) {
-		ct_read_lock_bh(idx);
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
 			iter->l = &ip_vs_conn_tab[idx];
 			return cp;
 		}
-		ct_read_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 	iter->l = NULL;
 	return NULL;
@@ -1009,7 +1015,7 @@ static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
 	struct hlist_head *l = iter->l;
 
 	if (l)
-		ct_read_unlock_bh(l - ip_vs_conn_tab);
+		rcu_read_unlock();
 }
 
 static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -1188,7 +1194,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
 void ip_vs_random_dropentry(struct net *net)
 {
 	int idx;
-	struct ip_vs_conn *cp;
+	struct ip_vs_conn *cp, *cp_c;
 
 	/*
 	 * Randomly scan 1/32 of the whole table every second
@@ -1199,9 +1205,9 @@ void ip_vs_random_dropentry(struct net *net)
 		/*
 		 * Lock is actually needed in this loop.
 		 */
-		ct_write_lock_bh(hash);
+		rcu_read_lock();
 
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
 				/* connection template */
 				continue;
@@ -1228,12 +1234,15 @@ void ip_vs_random_dropentry(struct net *net)
 
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
-			if (cp->control) {
+			cp_c = cp->control;
+			/* cp->control is valid only with reference to cp */
+			if (cp_c && __ip_vs_conn_get(cp)) {
 				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp->control);
+				ip_vs_conn_expire_now(cp_c);
+				__ip_vs_conn_put(cp);
 			}
 		}
-		ct_write_unlock_bh(hash);
+		rcu_read_unlock();
 	}
 }
 
@@ -1244,7 +1253,7 @@ void ip_vs_random_dropentry(struct net *net)
 static void ip_vs_conn_flush(struct net *net)
 {
 	int idx;
-	struct ip_vs_conn *cp;
+	struct ip_vs_conn *cp, *cp_c;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 flush_again:
@@ -1252,19 +1261,22 @@ flush_again:
 		/*
 		 * Lock is actually needed in this loop.
 		 */
-		ct_write_lock_bh(idx);
+		rcu_read_lock();
 
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (!ip_vs_conn_net_eq(cp, net))
 				continue;
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
-			if (cp->control) {
+			cp_c = cp->control;
+			/* cp->control is valid only with reference to cp */
+			if (cp_c && __ip_vs_conn_get(cp)) {
 				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp->control);
+				ip_vs_conn_expire_now(cp_c);
+				__ip_vs_conn_put(cp);
 			}
 		}
-		ct_write_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 
 	/* the counter may be not NULL, because maybe some conn entries
@@ -1331,7 +1343,7 @@ int __init ip_vs_conn_init(void)
 		INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
 
 	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
-		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+		spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
 	}
 
 	/* calculate the random value for connection hash */
@@ -1342,6 +1354,8 @@ int __init ip_vs_conn_init(void)
 
 void ip_vs_conn_cleanup(void)
 {
+	/* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+	rcu_barrier();
 	/* Release the empty cache */
 	kmem_cache_destroy(ip_vs_conn_cachep);
 	vfree(ip_vs_conn_tab);