diff options
author | Julian Anastasov <ja@ssi.bg> | 2013-09-12 04:21:07 -0400 |
---|---|---|
committer | Simon Horman <horms@verge.net.au> | 2013-09-18 15:39:03 -0400 |
commit | bcbde4c0a7556cca72874c5e1efa4dccb5198a2b (patch) | |
tree | 73e5f985bbcd5a976c26dd75e4672ac2b70b9270 /net | |
parent | c16526a7b99c1c28e9670a8c8e3dbcf741bb32be (diff) |
ipvs: make the service replacement more robust
commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added
IP_VS_DEST_STATE_REMOVING flag and RCU callback named
ip_vs_dest_wait_readers() to keep dests and services after
removal for at least a RCU grace period. But we have the
following corner cases:
- we can not reuse the same dest if its service is removed
while IP_VS_DEST_STATE_REMOVING is still set because another dest
removal in the first grace period can not extend this period.
It can happen when ipvsadm -C && ipvsadm -R is used.
- dest->svc can be replaced but ip_vs_in_stats() and
ip_vs_out_stats() have no explicit read memory barriers
when accessing dest->svc. It can happen that dest->svc
was just freed (replaced) while we use it to update
the stats.
We solve the problems as follows:
- IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed
idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start
will remember when for first time after deletion we noticed
dest->refcnt=0. Later, the connections can grab a reference
while in RCU grace period but if refcnt becomes 0 we can
safely free the dest and its svc.
- dest->svc becomes RCU pointer. As result, we add explicit
RCU locking in ip_vs_in_stats() and ip_vs_out_stats().
- __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it
now can free the service immediately or after a RCU grace
period. dest->svc is not set to NULL anymore.
As result, unlinked dests and their services are
freed always after IP_VS_DEST_TRASH_PERIOD period, unused
services are freed after a RCU grace period.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net')
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 12 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ctl.c | 86 |
2 files changed, 45 insertions, 53 deletions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4f69e83ff836..74fd00c27210 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c | |||
@@ -116,6 +116,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | |||
116 | 116 | ||
117 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | 117 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { |
118 | struct ip_vs_cpu_stats *s; | 118 | struct ip_vs_cpu_stats *s; |
119 | struct ip_vs_service *svc; | ||
119 | 120 | ||
120 | s = this_cpu_ptr(dest->stats.cpustats); | 121 | s = this_cpu_ptr(dest->stats.cpustats); |
121 | s->ustats.inpkts++; | 122 | s->ustats.inpkts++; |
@@ -123,11 +124,14 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | |||
123 | s->ustats.inbytes += skb->len; | 124 | s->ustats.inbytes += skb->len; |
124 | u64_stats_update_end(&s->syncp); | 125 | u64_stats_update_end(&s->syncp); |
125 | 126 | ||
126 | s = this_cpu_ptr(dest->svc->stats.cpustats); | 127 | rcu_read_lock(); |
128 | svc = rcu_dereference(dest->svc); | ||
129 | s = this_cpu_ptr(svc->stats.cpustats); | ||
127 | s->ustats.inpkts++; | 130 | s->ustats.inpkts++; |
128 | u64_stats_update_begin(&s->syncp); | 131 | u64_stats_update_begin(&s->syncp); |
129 | s->ustats.inbytes += skb->len; | 132 | s->ustats.inbytes += skb->len; |
130 | u64_stats_update_end(&s->syncp); | 133 | u64_stats_update_end(&s->syncp); |
134 | rcu_read_unlock(); | ||
131 | 135 | ||
132 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); | 136 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); |
133 | s->ustats.inpkts++; | 137 | s->ustats.inpkts++; |
@@ -146,6 +150,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | |||
146 | 150 | ||
147 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | 151 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { |
148 | struct ip_vs_cpu_stats *s; | 152 | struct ip_vs_cpu_stats *s; |
153 | struct ip_vs_service *svc; | ||
149 | 154 | ||
150 | s = this_cpu_ptr(dest->stats.cpustats); | 155 | s = this_cpu_ptr(dest->stats.cpustats); |
151 | s->ustats.outpkts++; | 156 | s->ustats.outpkts++; |
@@ -153,11 +158,14 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | |||
153 | s->ustats.outbytes += skb->len; | 158 | s->ustats.outbytes += skb->len; |
154 | u64_stats_update_end(&s->syncp); | 159 | u64_stats_update_end(&s->syncp); |
155 | 160 | ||
156 | s = this_cpu_ptr(dest->svc->stats.cpustats); | 161 | rcu_read_lock(); |
162 | svc = rcu_dereference(dest->svc); | ||
163 | s = this_cpu_ptr(svc->stats.cpustats); | ||
157 | s->ustats.outpkts++; | 164 | s->ustats.outpkts++; |
158 | u64_stats_update_begin(&s->syncp); | 165 | u64_stats_update_begin(&s->syncp); |
159 | s->ustats.outbytes += skb->len; | 166 | s->ustats.outbytes += skb->len; |
160 | u64_stats_update_end(&s->syncp); | 167 | u64_stats_update_end(&s->syncp); |
168 | rcu_read_unlock(); | ||
161 | 169 | ||
162 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); | 170 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); |
163 | s->ustats.outpkts++; | 171 | s->ustats.outpkts++; |
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c8148e487386..a3df9bddc4f7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c | |||
@@ -460,7 +460,7 @@ static inline void | |||
460 | __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) | 460 | __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) |
461 | { | 461 | { |
462 | atomic_inc(&svc->refcnt); | 462 | atomic_inc(&svc->refcnt); |
463 | dest->svc = svc; | 463 | rcu_assign_pointer(dest->svc, svc); |
464 | } | 464 | } |
465 | 465 | ||
466 | static void ip_vs_service_free(struct ip_vs_service *svc) | 466 | static void ip_vs_service_free(struct ip_vs_service *svc) |
@@ -470,18 +470,25 @@ static void ip_vs_service_free(struct ip_vs_service *svc) | |||
470 | kfree(svc); | 470 | kfree(svc); |
471 | } | 471 | } |
472 | 472 | ||
473 | static void | 473 | static void ip_vs_service_rcu_free(struct rcu_head *head) |
474 | __ip_vs_unbind_svc(struct ip_vs_dest *dest) | ||
475 | { | 474 | { |
476 | struct ip_vs_service *svc = dest->svc; | 475 | struct ip_vs_service *svc; |
476 | |||
477 | svc = container_of(head, struct ip_vs_service, rcu_head); | ||
478 | ip_vs_service_free(svc); | ||
479 | } | ||
477 | 480 | ||
478 | dest->svc = NULL; | 481 | static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) |
482 | { | ||
479 | if (atomic_dec_and_test(&svc->refcnt)) { | 483 | if (atomic_dec_and_test(&svc->refcnt)) { |
480 | IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", | 484 | IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", |
481 | svc->fwmark, | 485 | svc->fwmark, |
482 | IP_VS_DBG_ADDR(svc->af, &svc->addr), | 486 | IP_VS_DBG_ADDR(svc->af, &svc->addr), |
483 | ntohs(svc->port)); | 487 | ntohs(svc->port)); |
484 | ip_vs_service_free(svc); | 488 | if (do_delay) |
489 | call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); | ||
490 | else | ||
491 | ip_vs_service_free(svc); | ||
485 | } | 492 | } |
486 | } | 493 | } |
487 | 494 | ||
@@ -667,11 +674,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, | |||
667 | IP_VS_DBG_ADDR(svc->af, &dest->addr), | 674 | IP_VS_DBG_ADDR(svc->af, &dest->addr), |
668 | ntohs(dest->port), | 675 | ntohs(dest->port), |
669 | atomic_read(&dest->refcnt)); | 676 | atomic_read(&dest->refcnt)); |
670 | /* We can not reuse dest while in grace period | ||
671 | * because conns still can use dest->svc | ||
672 | */ | ||
673 | if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) | ||
674 | continue; | ||
675 | if (dest->af == svc->af && | 677 | if (dest->af == svc->af && |
676 | ip_vs_addr_equal(svc->af, &dest->addr, daddr) && | 678 | ip_vs_addr_equal(svc->af, &dest->addr, daddr) && |
677 | dest->port == dport && | 679 | dest->port == dport && |
@@ -697,8 +699,10 @@ out: | |||
697 | 699 | ||
698 | static void ip_vs_dest_free(struct ip_vs_dest *dest) | 700 | static void ip_vs_dest_free(struct ip_vs_dest *dest) |
699 | { | 701 | { |
702 | struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); | ||
703 | |||
700 | __ip_vs_dst_cache_reset(dest); | 704 | __ip_vs_dst_cache_reset(dest); |
701 | __ip_vs_unbind_svc(dest); | 705 | __ip_vs_svc_put(svc, false); |
702 | free_percpu(dest->stats.cpustats); | 706 | free_percpu(dest->stats.cpustats); |
703 | kfree(dest); | 707 | kfree(dest); |
704 | } | 708 | } |
@@ -771,6 +775,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, | |||
771 | struct ip_vs_dest_user_kern *udest, int add) | 775 | struct ip_vs_dest_user_kern *udest, int add) |
772 | { | 776 | { |
773 | struct netns_ipvs *ipvs = net_ipvs(svc->net); | 777 | struct netns_ipvs *ipvs = net_ipvs(svc->net); |
778 | struct ip_vs_service *old_svc; | ||
774 | struct ip_vs_scheduler *sched; | 779 | struct ip_vs_scheduler *sched; |
775 | int conn_flags; | 780 | int conn_flags; |
776 | 781 | ||
@@ -792,13 +797,14 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, | |||
792 | atomic_set(&dest->conn_flags, conn_flags); | 797 | atomic_set(&dest->conn_flags, conn_flags); |
793 | 798 | ||
794 | /* bind the service */ | 799 | /* bind the service */ |
795 | if (!dest->svc) { | 800 | old_svc = rcu_dereference_protected(dest->svc, 1); |
801 | if (!old_svc) { | ||
796 | __ip_vs_bind_svc(dest, svc); | 802 | __ip_vs_bind_svc(dest, svc); |
797 | } else { | 803 | } else { |
798 | if (dest->svc != svc) { | 804 | if (old_svc != svc) { |
799 | __ip_vs_unbind_svc(dest); | ||
800 | ip_vs_zero_stats(&dest->stats); | 805 | ip_vs_zero_stats(&dest->stats); |
801 | __ip_vs_bind_svc(dest, svc); | 806 | __ip_vs_bind_svc(dest, svc); |
807 | __ip_vs_svc_put(old_svc, true); | ||
802 | } | 808 | } |
803 | } | 809 | } |
804 | 810 | ||
@@ -998,16 +1004,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) | |||
998 | return 0; | 1004 | return 0; |
999 | } | 1005 | } |
1000 | 1006 | ||
1001 | static void ip_vs_dest_wait_readers(struct rcu_head *head) | ||
1002 | { | ||
1003 | struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest, | ||
1004 | rcu_head); | ||
1005 | |||
1006 | /* End of grace period after unlinking */ | ||
1007 | clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); | ||
1008 | } | ||
1009 | |||
1010 | |||
1011 | /* | 1007 | /* |
1012 | * Delete a destination (must be already unlinked from the service) | 1008 | * Delete a destination (must be already unlinked from the service) |
1013 | */ | 1009 | */ |
@@ -1023,20 +1019,16 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, | |||
1023 | */ | 1019 | */ |
1024 | ip_vs_rs_unhash(dest); | 1020 | ip_vs_rs_unhash(dest); |
1025 | 1021 | ||
1026 | if (!cleanup) { | ||
1027 | set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); | ||
1028 | call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers); | ||
1029 | } | ||
1030 | |||
1031 | spin_lock_bh(&ipvs->dest_trash_lock); | 1022 | spin_lock_bh(&ipvs->dest_trash_lock); |
1032 | IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", | 1023 | IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", |
1033 | IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), | 1024 | IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), |
1034 | atomic_read(&dest->refcnt)); | 1025 | atomic_read(&dest->refcnt)); |
1035 | if (list_empty(&ipvs->dest_trash) && !cleanup) | 1026 | if (list_empty(&ipvs->dest_trash) && !cleanup) |
1036 | mod_timer(&ipvs->dest_trash_timer, | 1027 | mod_timer(&ipvs->dest_trash_timer, |
1037 | jiffies + IP_VS_DEST_TRASH_PERIOD); | 1028 | jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); |
1038 | /* dest lives in trash without reference */ | 1029 | /* dest lives in trash without reference */ |
1039 | list_add(&dest->t_list, &ipvs->dest_trash); | 1030 | list_add(&dest->t_list, &ipvs->dest_trash); |
1031 | dest->idle_start = 0; | ||
1040 | spin_unlock_bh(&ipvs->dest_trash_lock); | 1032 | spin_unlock_bh(&ipvs->dest_trash_lock); |
1041 | ip_vs_dest_put(dest); | 1033 | ip_vs_dest_put(dest); |
1042 | } | 1034 | } |
@@ -1108,24 +1100,30 @@ static void ip_vs_dest_trash_expire(unsigned long data) | |||
1108 | struct net *net = (struct net *) data; | 1100 | struct net *net = (struct net *) data; |
1109 | struct netns_ipvs *ipvs = net_ipvs(net); | 1101 | struct netns_ipvs *ipvs = net_ipvs(net); |
1110 | struct ip_vs_dest *dest, *next; | 1102 | struct ip_vs_dest *dest, *next; |
1103 | unsigned long now = jiffies; | ||
1111 | 1104 | ||
1112 | spin_lock(&ipvs->dest_trash_lock); | 1105 | spin_lock(&ipvs->dest_trash_lock); |
1113 | list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { | 1106 | list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { |
1114 | /* Skip if dest is in grace period */ | ||
1115 | if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) | ||
1116 | continue; | ||
1117 | if (atomic_read(&dest->refcnt) > 0) | 1107 | if (atomic_read(&dest->refcnt) > 0) |
1118 | continue; | 1108 | continue; |
1109 | if (dest->idle_start) { | ||
1110 | if (time_before(now, dest->idle_start + | ||
1111 | IP_VS_DEST_TRASH_PERIOD)) | ||
1112 | continue; | ||
1113 | } else { | ||
1114 | dest->idle_start = max(1UL, now); | ||
1115 | continue; | ||
1116 | } | ||
1119 | IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", | 1117 | IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", |
1120 | dest->vfwmark, | 1118 | dest->vfwmark, |
1121 | IP_VS_DBG_ADDR(dest->svc->af, &dest->addr), | 1119 | IP_VS_DBG_ADDR(dest->af, &dest->addr), |
1122 | ntohs(dest->port)); | 1120 | ntohs(dest->port)); |
1123 | list_del(&dest->t_list); | 1121 | list_del(&dest->t_list); |
1124 | ip_vs_dest_free(dest); | 1122 | ip_vs_dest_free(dest); |
1125 | } | 1123 | } |
1126 | if (!list_empty(&ipvs->dest_trash)) | 1124 | if (!list_empty(&ipvs->dest_trash)) |
1127 | mod_timer(&ipvs->dest_trash_timer, | 1125 | mod_timer(&ipvs->dest_trash_timer, |
1128 | jiffies + IP_VS_DEST_TRASH_PERIOD); | 1126 | jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); |
1129 | spin_unlock(&ipvs->dest_trash_lock); | 1127 | spin_unlock(&ipvs->dest_trash_lock); |
1130 | } | 1128 | } |
1131 | 1129 | ||
@@ -1320,14 +1318,6 @@ out: | |||
1320 | return ret; | 1318 | return ret; |
1321 | } | 1319 | } |
1322 | 1320 | ||
1323 | static void ip_vs_service_rcu_free(struct rcu_head *head) | ||
1324 | { | ||
1325 | struct ip_vs_service *svc; | ||
1326 | |||
1327 | svc = container_of(head, struct ip_vs_service, rcu_head); | ||
1328 | ip_vs_service_free(svc); | ||
1329 | } | ||
1330 | |||
1331 | /* | 1321 | /* |
1332 | * Delete a service from the service list | 1322 | * Delete a service from the service list |
1333 | * - The service must be unlinked, unlocked and not referenced! | 1323 | * - The service must be unlinked, unlocked and not referenced! |
@@ -1376,13 +1366,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) | |||
1376 | /* | 1366 | /* |
1377 | * Free the service if nobody refers to it | 1367 | * Free the service if nobody refers to it |
1378 | */ | 1368 | */ |
1379 | if (atomic_dec_and_test(&svc->refcnt)) { | 1369 | __ip_vs_svc_put(svc, true); |
1380 | IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", | ||
1381 | svc->fwmark, | ||
1382 | IP_VS_DBG_ADDR(svc->af, &svc->addr), | ||
1383 | ntohs(svc->port)); | ||
1384 | call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); | ||
1385 | } | ||
1386 | 1370 | ||
1387 | /* decrease the module use count */ | 1371 | /* decrease the module use count */ |
1388 | ip_vs_use_count_dec(); | 1372 | ip_vs_use_count_dec(); |