aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorJulian Anastasov <ja@ssi.bg>2013-09-12 04:21:07 -0400
committerSimon Horman <horms@verge.net.au>2013-09-18 15:39:03 -0400
commitbcbde4c0a7556cca72874c5e1efa4dccb5198a2b (patch)
tree73e5f985bbcd5a976c26dd75e4672ac2b70b9270 /net
parentc16526a7b99c1c28e9670a8c8e3dbcf741bb32be (diff)
ipvs: make the service replacement more robust
commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net')
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c12
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c86
2 files changed, 45 insertions, 53 deletions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4f69e83ff836..74fd00c27210 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -116,6 +116,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116 116
117 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 117 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
118 struct ip_vs_cpu_stats *s; 118 struct ip_vs_cpu_stats *s;
119 struct ip_vs_service *svc;
119 120
120 s = this_cpu_ptr(dest->stats.cpustats); 121 s = this_cpu_ptr(dest->stats.cpustats);
121 s->ustats.inpkts++; 122 s->ustats.inpkts++;
@@ -123,11 +124,14 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
123 s->ustats.inbytes += skb->len; 124 s->ustats.inbytes += skb->len;
124 u64_stats_update_end(&s->syncp); 125 u64_stats_update_end(&s->syncp);
125 126
126 s = this_cpu_ptr(dest->svc->stats.cpustats); 127 rcu_read_lock();
128 svc = rcu_dereference(dest->svc);
129 s = this_cpu_ptr(svc->stats.cpustats);
127 s->ustats.inpkts++; 130 s->ustats.inpkts++;
128 u64_stats_update_begin(&s->syncp); 131 u64_stats_update_begin(&s->syncp);
129 s->ustats.inbytes += skb->len; 132 s->ustats.inbytes += skb->len;
130 u64_stats_update_end(&s->syncp); 133 u64_stats_update_end(&s->syncp);
134 rcu_read_unlock();
131 135
132 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 136 s = this_cpu_ptr(ipvs->tot_stats.cpustats);
133 s->ustats.inpkts++; 137 s->ustats.inpkts++;
@@ -146,6 +150,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
146 150
147 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 151 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
148 struct ip_vs_cpu_stats *s; 152 struct ip_vs_cpu_stats *s;
153 struct ip_vs_service *svc;
149 154
150 s = this_cpu_ptr(dest->stats.cpustats); 155 s = this_cpu_ptr(dest->stats.cpustats);
151 s->ustats.outpkts++; 156 s->ustats.outpkts++;
@@ -153,11 +158,14 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
153 s->ustats.outbytes += skb->len; 158 s->ustats.outbytes += skb->len;
154 u64_stats_update_end(&s->syncp); 159 u64_stats_update_end(&s->syncp);
155 160
156 s = this_cpu_ptr(dest->svc->stats.cpustats); 161 rcu_read_lock();
162 svc = rcu_dereference(dest->svc);
163 s = this_cpu_ptr(svc->stats.cpustats);
157 s->ustats.outpkts++; 164 s->ustats.outpkts++;
158 u64_stats_update_begin(&s->syncp); 165 u64_stats_update_begin(&s->syncp);
159 s->ustats.outbytes += skb->len; 166 s->ustats.outbytes += skb->len;
160 u64_stats_update_end(&s->syncp); 167 u64_stats_update_end(&s->syncp);
168 rcu_read_unlock();
161 169
162 s = this_cpu_ptr(ipvs->tot_stats.cpustats); 170 s = this_cpu_ptr(ipvs->tot_stats.cpustats);
163 s->ustats.outpkts++; 171 s->ustats.outpkts++;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c8148e487386..a3df9bddc4f7 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -460,7 +460,7 @@ static inline void
460__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) 460__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
461{ 461{
462 atomic_inc(&svc->refcnt); 462 atomic_inc(&svc->refcnt);
463 dest->svc = svc; 463 rcu_assign_pointer(dest->svc, svc);
464} 464}
465 465
466static void ip_vs_service_free(struct ip_vs_service *svc) 466static void ip_vs_service_free(struct ip_vs_service *svc)
@@ -470,18 +470,25 @@ static void ip_vs_service_free(struct ip_vs_service *svc)
470 kfree(svc); 470 kfree(svc);
471} 471}
472 472
473static void 473static void ip_vs_service_rcu_free(struct rcu_head *head)
474__ip_vs_unbind_svc(struct ip_vs_dest *dest)
475{ 474{
476 struct ip_vs_service *svc = dest->svc; 475 struct ip_vs_service *svc;
476
477 svc = container_of(head, struct ip_vs_service, rcu_head);
478 ip_vs_service_free(svc);
479}
477 480
478 dest->svc = NULL; 481static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
482{
479 if (atomic_dec_and_test(&svc->refcnt)) { 483 if (atomic_dec_and_test(&svc->refcnt)) {
480 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", 484 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
481 svc->fwmark, 485 svc->fwmark,
482 IP_VS_DBG_ADDR(svc->af, &svc->addr), 486 IP_VS_DBG_ADDR(svc->af, &svc->addr),
483 ntohs(svc->port)); 487 ntohs(svc->port));
484 ip_vs_service_free(svc); 488 if (do_delay)
489 call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
490 else
491 ip_vs_service_free(svc);
485 } 492 }
486} 493}
487 494
@@ -667,11 +674,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
667 IP_VS_DBG_ADDR(svc->af, &dest->addr), 674 IP_VS_DBG_ADDR(svc->af, &dest->addr),
668 ntohs(dest->port), 675 ntohs(dest->port),
669 atomic_read(&dest->refcnt)); 676 atomic_read(&dest->refcnt));
670 /* We can not reuse dest while in grace period
671 * because conns still can use dest->svc
672 */
673 if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
674 continue;
675 if (dest->af == svc->af && 677 if (dest->af == svc->af &&
676 ip_vs_addr_equal(svc->af, &dest->addr, daddr) && 678 ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
677 dest->port == dport && 679 dest->port == dport &&
@@ -697,8 +699,10 @@ out:
697 699
698static void ip_vs_dest_free(struct ip_vs_dest *dest) 700static void ip_vs_dest_free(struct ip_vs_dest *dest)
699{ 701{
702 struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
703
700 __ip_vs_dst_cache_reset(dest); 704 __ip_vs_dst_cache_reset(dest);
701 __ip_vs_unbind_svc(dest); 705 __ip_vs_svc_put(svc, false);
702 free_percpu(dest->stats.cpustats); 706 free_percpu(dest->stats.cpustats);
703 kfree(dest); 707 kfree(dest);
704} 708}
@@ -771,6 +775,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
771 struct ip_vs_dest_user_kern *udest, int add) 775 struct ip_vs_dest_user_kern *udest, int add)
772{ 776{
773 struct netns_ipvs *ipvs = net_ipvs(svc->net); 777 struct netns_ipvs *ipvs = net_ipvs(svc->net);
778 struct ip_vs_service *old_svc;
774 struct ip_vs_scheduler *sched; 779 struct ip_vs_scheduler *sched;
775 int conn_flags; 780 int conn_flags;
776 781
@@ -792,13 +797,14 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
792 atomic_set(&dest->conn_flags, conn_flags); 797 atomic_set(&dest->conn_flags, conn_flags);
793 798
794 /* bind the service */ 799 /* bind the service */
795 if (!dest->svc) { 800 old_svc = rcu_dereference_protected(dest->svc, 1);
801 if (!old_svc) {
796 __ip_vs_bind_svc(dest, svc); 802 __ip_vs_bind_svc(dest, svc);
797 } else { 803 } else {
798 if (dest->svc != svc) { 804 if (old_svc != svc) {
799 __ip_vs_unbind_svc(dest);
800 ip_vs_zero_stats(&dest->stats); 805 ip_vs_zero_stats(&dest->stats);
801 __ip_vs_bind_svc(dest, svc); 806 __ip_vs_bind_svc(dest, svc);
807 __ip_vs_svc_put(old_svc, true);
802 } 808 }
803 } 809 }
804 810
@@ -998,16 +1004,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
998 return 0; 1004 return 0;
999} 1005}
1000 1006
1001static void ip_vs_dest_wait_readers(struct rcu_head *head)
1002{
1003 struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
1004 rcu_head);
1005
1006 /* End of grace period after unlinking */
1007 clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
1008}
1009
1010
1011/* 1007/*
1012 * Delete a destination (must be already unlinked from the service) 1008 * Delete a destination (must be already unlinked from the service)
1013 */ 1009 */
@@ -1023,20 +1019,16 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
1023 */ 1019 */
1024 ip_vs_rs_unhash(dest); 1020 ip_vs_rs_unhash(dest);
1025 1021
1026 if (!cleanup) {
1027 set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
1028 call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
1029 }
1030
1031 spin_lock_bh(&ipvs->dest_trash_lock); 1022 spin_lock_bh(&ipvs->dest_trash_lock);
1032 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", 1023 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1033 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), 1024 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1034 atomic_read(&dest->refcnt)); 1025 atomic_read(&dest->refcnt));
1035 if (list_empty(&ipvs->dest_trash) && !cleanup) 1026 if (list_empty(&ipvs->dest_trash) && !cleanup)
1036 mod_timer(&ipvs->dest_trash_timer, 1027 mod_timer(&ipvs->dest_trash_timer,
1037 jiffies + IP_VS_DEST_TRASH_PERIOD); 1028 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1038 /* dest lives in trash without reference */ 1029 /* dest lives in trash without reference */
1039 list_add(&dest->t_list, &ipvs->dest_trash); 1030 list_add(&dest->t_list, &ipvs->dest_trash);
1031 dest->idle_start = 0;
1040 spin_unlock_bh(&ipvs->dest_trash_lock); 1032 spin_unlock_bh(&ipvs->dest_trash_lock);
1041 ip_vs_dest_put(dest); 1033 ip_vs_dest_put(dest);
1042} 1034}
@@ -1108,24 +1100,30 @@ static void ip_vs_dest_trash_expire(unsigned long data)
1108 struct net *net = (struct net *) data; 1100 struct net *net = (struct net *) data;
1109 struct netns_ipvs *ipvs = net_ipvs(net); 1101 struct netns_ipvs *ipvs = net_ipvs(net);
1110 struct ip_vs_dest *dest, *next; 1102 struct ip_vs_dest *dest, *next;
1103 unsigned long now = jiffies;
1111 1104
1112 spin_lock(&ipvs->dest_trash_lock); 1105 spin_lock(&ipvs->dest_trash_lock);
1113 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { 1106 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1114 /* Skip if dest is in grace period */
1115 if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
1116 continue;
1117 if (atomic_read(&dest->refcnt) > 0) 1107 if (atomic_read(&dest->refcnt) > 0)
1118 continue; 1108 continue;
1109 if (dest->idle_start) {
1110 if (time_before(now, dest->idle_start +
1111 IP_VS_DEST_TRASH_PERIOD))
1112 continue;
1113 } else {
1114 dest->idle_start = max(1UL, now);
1115 continue;
1116 }
1119 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", 1117 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1120 dest->vfwmark, 1118 dest->vfwmark,
1121 IP_VS_DBG_ADDR(dest->svc->af, &dest->addr), 1119 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1122 ntohs(dest->port)); 1120 ntohs(dest->port));
1123 list_del(&dest->t_list); 1121 list_del(&dest->t_list);
1124 ip_vs_dest_free(dest); 1122 ip_vs_dest_free(dest);
1125 } 1123 }
1126 if (!list_empty(&ipvs->dest_trash)) 1124 if (!list_empty(&ipvs->dest_trash))
1127 mod_timer(&ipvs->dest_trash_timer, 1125 mod_timer(&ipvs->dest_trash_timer,
1128 jiffies + IP_VS_DEST_TRASH_PERIOD); 1126 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1129 spin_unlock(&ipvs->dest_trash_lock); 1127 spin_unlock(&ipvs->dest_trash_lock);
1130} 1128}
1131 1129
@@ -1320,14 +1318,6 @@ out:
1320 return ret; 1318 return ret;
1321} 1319}
1322 1320
1323static void ip_vs_service_rcu_free(struct rcu_head *head)
1324{
1325 struct ip_vs_service *svc;
1326
1327 svc = container_of(head, struct ip_vs_service, rcu_head);
1328 ip_vs_service_free(svc);
1329}
1330
1331/* 1321/*
1332 * Delete a service from the service list 1322 * Delete a service from the service list
1333 * - The service must be unlinked, unlocked and not referenced! 1323 * - The service must be unlinked, unlocked and not referenced!
@@ -1376,13 +1366,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
1376 /* 1366 /*
1377 * Free the service if nobody refers to it 1367 * Free the service if nobody refers to it
1378 */ 1368 */
1379 if (atomic_dec_and_test(&svc->refcnt)) { 1369 __ip_vs_svc_put(svc, true);
1380 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
1381 svc->fwmark,
1382 IP_VS_DBG_ADDR(svc->af, &svc->addr),
1383 ntohs(svc->port));
1384 call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
1385 }
1386 1370
1387 /* decrease the module use count */ 1371 /* decrease the module use count */
1388 ip_vs_use_count_dec(); 1372 ip_vs_use_count_dec();