aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter
diff options
context:
space:
mode:
author: Julian Anastasov <ja@ssi.bg> 2013-03-22 05:46:49 -0400
committer: Pablo Neira Ayuso <pablo@netfilter.org> 2013-04-01 18:23:55 -0400
commit: 578bc3ef1e473abb9ea99046a307fef0094b22af (patch)
tree: 78c93c431c65e573846e573cdff8afb4924c501c /net/netfilter
parent: 08cb2d032f13da4a076b51639b104a830b6bd18c (diff)
ipvs: reorganize dest trash
All dests now go to the trash, with no exceptions. We have to use a new list node, t_list, for this because of the RCU changes in the following patches. Dests wait there for the initial grace period and then for all conns and schedulers to put their references. Unlike before, dests do not hold a reference for staying in the dest trash. As a result, we do not load ip_vs_dest_put with extra checks for the last refcnt, and the schedulers do not need to play games with atomic_inc_not_zero while selecting the best destination. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net/netfilter')
-rw-r--r-- net/netfilter/ipvs/ip_vs_ctl.c 178
1 files changed, 104 insertions, 74 deletions
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a4f638880470..80d366b7daab 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -71,7 +71,7 @@ int ip_vs_get_debug_level(void)
71 71
72 72
73/* Protos */ 73/* Protos */
74static void __ip_vs_del_service(struct ip_vs_service *svc); 74static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
75 75
76 76
77#ifdef CONFIG_IP_VS_IPV6 77#ifdef CONFIG_IP_VS_IPV6
@@ -657,19 +657,25 @@ static struct ip_vs_dest *
657ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, 657ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
658 __be16 dport) 658 __be16 dport)
659{ 659{
660 struct ip_vs_dest *dest, *nxt; 660 struct ip_vs_dest *dest;
661 struct netns_ipvs *ipvs = net_ipvs(svc->net); 661 struct netns_ipvs *ipvs = net_ipvs(svc->net);
662 662
663 /* 663 /*
664 * Find the destination in trash 664 * Find the destination in trash
665 */ 665 */
666 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { 666 spin_lock_bh(&ipvs->dest_trash_lock);
667 list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
667 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " 668 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
668 "dest->refcnt=%d\n", 669 "dest->refcnt=%d\n",
669 dest->vfwmark, 670 dest->vfwmark,
670 IP_VS_DBG_ADDR(svc->af, &dest->addr), 671 IP_VS_DBG_ADDR(svc->af, &dest->addr),
671 ntohs(dest->port), 672 ntohs(dest->port),
672 atomic_read(&dest->refcnt)); 673 atomic_read(&dest->refcnt));
674 /* We can not reuse dest while in grace period
675 * because conns still can use dest->svc
676 */
677 if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
678 continue;
673 if (dest->af == svc->af && 679 if (dest->af == svc->af &&
674 ip_vs_addr_equal(svc->af, &dest->addr, daddr) && 680 ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
675 dest->port == dport && 681 dest->port == dport &&
@@ -679,29 +685,27 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
679 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && 685 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
680 dest->vport == svc->port))) { 686 dest->vport == svc->port))) {
681 /* HIT */ 687 /* HIT */
682 return dest; 688 list_del(&dest->t_list);
683 } 689 ip_vs_dest_hold(dest);
684 690 goto out;
685 /*
686 * Try to purge the destination from trash if not referenced
687 */
688 if (atomic_read(&dest->refcnt) == 1) {
689 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
690 "from trash\n",
691 dest->vfwmark,
692 IP_VS_DBG_ADDR(svc->af, &dest->addr),
693 ntohs(dest->port));
694 list_del(&dest->n_list);
695 __ip_vs_dst_cache_reset(dest);
696 __ip_vs_unbind_svc(dest);
697 free_percpu(dest->stats.cpustats);
698 kfree_rcu(dest, rcu_head);
699 } 691 }
700 } 692 }
701 693
702 return NULL; 694 dest = NULL;
695
696out:
697 spin_unlock_bh(&ipvs->dest_trash_lock);
698
699 return dest;
703} 700}
704 701
702static void ip_vs_dest_free(struct ip_vs_dest *dest)
703{
704 __ip_vs_dst_cache_reset(dest);
705 __ip_vs_unbind_svc(dest);
706 free_percpu(dest->stats.cpustats);
707 kfree(dest);
708}
705 709
706/* 710/*
707 * Clean up all the destinations in the trash 711 * Clean up all the destinations in the trash
@@ -710,19 +714,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
710 * When the ip_vs_control_clearup is activated by ipvs module exit, 714 * When the ip_vs_control_clearup is activated by ipvs module exit,
711 * the service tables must have been flushed and all the connections 715 * the service tables must have been flushed and all the connections
712 * are expired, and the refcnt of each destination in the trash must 716 * are expired, and the refcnt of each destination in the trash must
713 * be 1, so we simply release them here. 717 * be 0, so we simply release them here.
714 */ 718 */
715static void ip_vs_trash_cleanup(struct net *net) 719static void ip_vs_trash_cleanup(struct net *net)
716{ 720{
717 struct ip_vs_dest *dest, *nxt; 721 struct ip_vs_dest *dest, *nxt;
718 struct netns_ipvs *ipvs = net_ipvs(net); 722 struct netns_ipvs *ipvs = net_ipvs(net);
719 723
720 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { 724 del_timer_sync(&ipvs->dest_trash_timer);
721 list_del(&dest->n_list); 725 /* No need to use dest_trash_lock */
722 __ip_vs_dst_cache_reset(dest); 726 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
723 __ip_vs_unbind_svc(dest); 727 list_del(&dest->t_list);
724 free_percpu(dest->stats.cpustats); 728 ip_vs_dest_free(dest);
725 kfree_rcu(dest, rcu_head);
726 } 729 }
727} 730}
728 731
@@ -955,11 +958,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
955 IP_VS_DBG_ADDR(svc->af, &dest->vaddr), 958 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
956 ntohs(dest->vport)); 959 ntohs(dest->vport));
957 960
958 /*
959 * Get the destination from the trash
960 */
961 list_del(&dest->n_list);
962
963 __ip_vs_update_dest(svc, dest, udest, 1); 961 __ip_vs_update_dest(svc, dest, udest, 1);
964 ret = 0; 962 ret = 0;
965 } else { 963 } else {
@@ -1015,11 +1013,21 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1015 return 0; 1013 return 0;
1016} 1014}
1017 1015
1016static void ip_vs_dest_wait_readers(struct rcu_head *head)
1017{
1018 struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
1019 rcu_head);
1020
1021 /* End of grace period after unlinking */
1022 clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
1023}
1024
1018 1025
1019/* 1026/*
1020 * Delete a destination (must be already unlinked from the service) 1027 * Delete a destination (must be already unlinked from the service)
1021 */ 1028 */
1022static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) 1029static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
1030 bool cleanup)
1023{ 1031{
1024 struct netns_ipvs *ipvs = net_ipvs(net); 1032 struct netns_ipvs *ipvs = net_ipvs(net);
1025 1033
@@ -1030,34 +1038,22 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1030 */ 1038 */
1031 ip_vs_rs_unhash(dest); 1039 ip_vs_rs_unhash(dest);
1032 1040
1033 /* 1041 if (!cleanup) {
1034 * Decrease the refcnt of the dest, and free the dest 1042 set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
1035 * if nobody refers to it (refcnt=0). Otherwise, throw 1043 call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
1036 * the destination into the trash.
1037 */
1038 if (atomic_dec_and_test(&dest->refcnt)) {
1039 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1040 dest->vfwmark,
1041 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1042 ntohs(dest->port));
1043 __ip_vs_dst_cache_reset(dest);
1044 /* simply decrease svc->refcnt here, let the caller check
1045 and release the service if nobody refers to it.
1046 Only user context can release destination and service,
1047 and only one user context can update virtual service at a
1048 time, so the operation here is OK */
1049 atomic_dec(&dest->svc->refcnt);
1050 free_percpu(dest->stats.cpustats);
1051 kfree_rcu(dest, rcu_head);
1052 } else {
1053 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1054 "dest->refcnt=%d\n",
1055 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1056 ntohs(dest->port),
1057 atomic_read(&dest->refcnt));
1058 list_add(&dest->n_list, &ipvs->dest_trash);
1059 ip_vs_dest_hold(dest);
1060 } 1044 }
1045
1046 spin_lock_bh(&ipvs->dest_trash_lock);
1047 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1048 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1049 atomic_read(&dest->refcnt));
1050 if (list_empty(&ipvs->dest_trash) && !cleanup)
1051 mod_timer(&ipvs->dest_trash_timer,
1052 jiffies + IP_VS_DEST_TRASH_PERIOD);
1053 /* dest lives in trash without reference */
1054 list_add(&dest->t_list, &ipvs->dest_trash);
1055 spin_unlock_bh(&ipvs->dest_trash_lock);
1056 ip_vs_dest_put(dest);
1061} 1057}
1062 1058
1063 1059
@@ -1122,13 +1118,38 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1122 /* 1118 /*
1123 * Delete the destination 1119 * Delete the destination
1124 */ 1120 */
1125 __ip_vs_del_dest(svc->net, dest); 1121 __ip_vs_del_dest(svc->net, dest, false);
1126 1122
1127 LeaveFunction(2); 1123 LeaveFunction(2);
1128 1124
1129 return 0; 1125 return 0;
1130} 1126}
1131 1127
1128static void ip_vs_dest_trash_expire(unsigned long data)
1129{
1130 struct net *net = (struct net *) data;
1131 struct netns_ipvs *ipvs = net_ipvs(net);
1132 struct ip_vs_dest *dest, *next;
1133
1134 spin_lock(&ipvs->dest_trash_lock);
1135 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1136 /* Skip if dest is in grace period */
1137 if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
1138 continue;
1139 if (atomic_read(&dest->refcnt) > 0)
1140 continue;
1141 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1142 dest->vfwmark,
1143 IP_VS_DBG_ADDR(dest->svc->af, &dest->addr),
1144 ntohs(dest->port));
1145 list_del(&dest->t_list);
1146 ip_vs_dest_free(dest);
1147 }
1148 if (!list_empty(&ipvs->dest_trash))
1149 mod_timer(&ipvs->dest_trash_timer,
1150 jiffies + IP_VS_DEST_TRASH_PERIOD);
1151 spin_unlock(&ipvs->dest_trash_lock);
1152}
1132 1153
1133/* 1154/*
1134 * Add a service into the service hash table 1155 * Add a service into the service hash table
@@ -1358,7 +1379,7 @@ out:
1358 * - The service must be unlinked, unlocked and not referenced! 1379 * - The service must be unlinked, unlocked and not referenced!
1359 * - We are called under _bh lock 1380 * - We are called under _bh lock
1360 */ 1381 */
1361static void __ip_vs_del_service(struct ip_vs_service *svc) 1382static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
1362{ 1383{
1363 struct ip_vs_dest *dest, *nxt; 1384 struct ip_vs_dest *dest, *nxt;
1364 struct ip_vs_scheduler *old_sched; 1385 struct ip_vs_scheduler *old_sched;
@@ -1394,7 +1415,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1394 */ 1415 */
1395 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { 1416 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1396 __ip_vs_unlink_dest(svc, dest, 0); 1417 __ip_vs_unlink_dest(svc, dest, 0);
1397 __ip_vs_del_dest(svc->net, dest); 1418 __ip_vs_del_dest(svc->net, dest, cleanup);
1398 } 1419 }
1399 1420
1400 /* 1421 /*
@@ -1424,7 +1445,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1424/* 1445/*
1425 * Unlink a service from list and try to delete it if its refcnt reached 0 1446 * Unlink a service from list and try to delete it if its refcnt reached 0
1426 */ 1447 */
1427static void ip_vs_unlink_service(struct ip_vs_service *svc) 1448static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
1428{ 1449{
1429 /* 1450 /*
1430 * Unhash it from the service table 1451 * Unhash it from the service table
@@ -1438,7 +1459,7 @@ static void ip_vs_unlink_service(struct ip_vs_service *svc)
1438 */ 1459 */
1439 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); 1460 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1440 1461
1441 __ip_vs_del_service(svc); 1462 __ip_vs_del_service(svc, cleanup);
1442 1463
1443 write_unlock_bh(&__ip_vs_svc_lock); 1464 write_unlock_bh(&__ip_vs_svc_lock);
1444} 1465}
@@ -1450,7 +1471,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
1450{ 1471{
1451 if (svc == NULL) 1472 if (svc == NULL)
1452 return -EEXIST; 1473 return -EEXIST;
1453 ip_vs_unlink_service(svc); 1474 ip_vs_unlink_service(svc, false);
1454 1475
1455 return 0; 1476 return 0;
1456} 1477}
@@ -1459,7 +1480,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
1459/* 1480/*
1460 * Flush all the virtual services 1481 * Flush all the virtual services
1461 */ 1482 */
1462static int ip_vs_flush(struct net *net) 1483static int ip_vs_flush(struct net *net, bool cleanup)
1463{ 1484{
1464 int idx; 1485 int idx;
1465 struct ip_vs_service *svc, *nxt; 1486 struct ip_vs_service *svc, *nxt;
@@ -1471,7 +1492,7 @@ static int ip_vs_flush(struct net *net)
1471 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], 1492 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1472 s_list) { 1493 s_list) {
1473 if (net_eq(svc->net, net)) 1494 if (net_eq(svc->net, net))
1474 ip_vs_unlink_service(svc); 1495 ip_vs_unlink_service(svc, cleanup);
1475 } 1496 }
1476 } 1497 }
1477 1498
@@ -1482,7 +1503,7 @@ static int ip_vs_flush(struct net *net)
1482 list_for_each_entry_safe(svc, nxt, 1503 list_for_each_entry_safe(svc, nxt,
1483 &ip_vs_svc_fwm_table[idx], f_list) { 1504 &ip_vs_svc_fwm_table[idx], f_list) {
1484 if (net_eq(svc->net, net)) 1505 if (net_eq(svc->net, net))
1485 ip_vs_unlink_service(svc); 1506 ip_vs_unlink_service(svc, cleanup);
1486 } 1507 }
1487 } 1508 }
1488 1509
@@ -1498,7 +1519,7 @@ void ip_vs_service_net_cleanup(struct net *net)
1498 EnterFunction(2); 1519 EnterFunction(2);
1499 /* Check for "full" addressed entries */ 1520 /* Check for "full" addressed entries */
1500 mutex_lock(&__ip_vs_mutex); 1521 mutex_lock(&__ip_vs_mutex);
1501 ip_vs_flush(net); 1522 ip_vs_flush(net, true);
1502 mutex_unlock(&__ip_vs_mutex); 1523 mutex_unlock(&__ip_vs_mutex);
1503 LeaveFunction(2); 1524 LeaveFunction(2);
1504} 1525}
@@ -1558,9 +1579,11 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1558 } 1579 }
1559 } 1580 }
1560 1581
1561 list_for_each_entry(dest, &ipvs->dest_trash, n_list) { 1582 spin_lock_bh(&ipvs->dest_trash_lock);
1583 list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1562 ip_vs_forget_dev(dest, dev); 1584 ip_vs_forget_dev(dest, dev);
1563 } 1585 }
1586 spin_unlock_bh(&ipvs->dest_trash_lock);
1564 mutex_unlock(&__ip_vs_mutex); 1587 mutex_unlock(&__ip_vs_mutex);
1565 LeaveFunction(2); 1588 LeaveFunction(2);
1566 return NOTIFY_DONE; 1589 return NOTIFY_DONE;
@@ -2394,7 +2417,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2394 2417
2395 if (cmd == IP_VS_SO_SET_FLUSH) { 2418 if (cmd == IP_VS_SO_SET_FLUSH) {
2396 /* Flush the virtual service */ 2419 /* Flush the virtual service */
2397 ret = ip_vs_flush(net); 2420 ret = ip_vs_flush(net, false);
2398 goto out_unlock; 2421 goto out_unlock;
2399 } else if (cmd == IP_VS_SO_SET_TIMEOUT) { 2422 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2400 /* Set timeout values for (tcp tcpfin udp) */ 2423 /* Set timeout values for (tcp tcpfin udp) */
@@ -3403,7 +3426,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3403 mutex_lock(&__ip_vs_mutex); 3426 mutex_lock(&__ip_vs_mutex);
3404 3427
3405 if (cmd == IPVS_CMD_FLUSH) { 3428 if (cmd == IPVS_CMD_FLUSH) {
3406 ret = ip_vs_flush(net); 3429 ret = ip_vs_flush(net, false);
3407 goto out; 3430 goto out;
3408 } else if (cmd == IPVS_CMD_SET_CONFIG) { 3431 } else if (cmd == IPVS_CMD_SET_CONFIG) {
3409 ret = ip_vs_genl_set_config(net, info->attrs); 3432 ret = ip_vs_genl_set_config(net, info->attrs);
@@ -3800,6 +3823,9 @@ int __net_init ip_vs_control_net_init(struct net *net)
3800 INIT_HLIST_HEAD(&ipvs->rs_table[idx]); 3823 INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
3801 3824
3802 INIT_LIST_HEAD(&ipvs->dest_trash); 3825 INIT_LIST_HEAD(&ipvs->dest_trash);
3826 spin_lock_init(&ipvs->dest_trash_lock);
3827 setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
3828 (unsigned long) net);
3803 atomic_set(&ipvs->ftpsvc_counter, 0); 3829 atomic_set(&ipvs->ftpsvc_counter, 0);
3804 atomic_set(&ipvs->nullsvc_counter, 0); 3830 atomic_set(&ipvs->nullsvc_counter, 0);
3805 3831
@@ -3829,6 +3855,10 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net)
3829{ 3855{
3830 struct netns_ipvs *ipvs = net_ipvs(net); 3856 struct netns_ipvs *ipvs = net_ipvs(net);
3831 3857
3858 /* Some dest can be in grace period even before cleanup, we have to
3859 * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called.
3860 */
3861 rcu_barrier();
3832 ip_vs_trash_cleanup(net); 3862 ip_vs_trash_cleanup(net);
3833 ip_vs_stop_estimator(net, &ipvs->tot_stats); 3863 ip_vs_stop_estimator(net, &ipvs->tot_stats);
3834 ip_vs_control_net_cleanup_sysctl(net); 3864 ip_vs_control_net_cleanup_sysctl(net);