diff options
author | Julian Anastasov <ja@ssi.bg> | 2012-04-24 16:46:40 -0400 |
---|---|---|
committer | Pablo Neira Ayuso <pablo@netfilter.org> | 2012-05-08 13:40:10 -0400 |
commit | 749c42b620a9511782bc38d0a88702a42434529e (patch) | |
tree | 057b15f2bbd7fad96becdada1dc2ee775482e0b6 /net | |
parent | 1c003b1580e20ff9f500846677303a695b1837cc (diff) |
ipvs: reduce sync rate with time thresholds
Add two new sysctl vars to control the sync rate. The main idea is
to reduce the rate for connection templates, because currently
it depends on the packet rate of the controlled connections.
This mechanism should also be useful for normal connections
with high traffic.
sync_refresh_period: in seconds, the difference in the reported
connection timer that triggers a new sync message. It can be used to
avoid sync messages for the specified period (or half of
the connection timeout if it is lower) if the connection state
has not changed since the last sync.
sync_retries: integer, 0..3, defines the number of sync retries, sent
with a period of sync_refresh_period/8. Useful to protect against
loss of sync messages.
Allow sysctl_sync_threshold to be used with
sysctl_sync_period=0, so that only a single sync message is sent
if sync_refresh_period is also 0.
Add a new field "sync_endtime" to the connection structure to
hold the reported time when the connection expires. The 2 lowest
bits represent the retry count.
As sysctl_sync_period can now be 0, use ACCESS_ONCE to
avoid division by zero.
Special thanks to Aleksey Chudov for being patient with me,
for his extensive reports, and for helping with all the tests.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Tested-by: Aleksey Chudov <aleksey.chudov@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net')
-rw-r--r-- | net/netfilter/ipvs/ip_vs_conn.c | 7 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 30 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ctl.c | 25 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_sync.c | 121 |
4 files changed, 137 insertions, 46 deletions
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index fd74f881d04a..4f3205def28f 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c | |||
@@ -762,7 +762,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct) | |||
762 | static void ip_vs_conn_expire(unsigned long data) | 762 | static void ip_vs_conn_expire(unsigned long data) |
763 | { | 763 | { |
764 | struct ip_vs_conn *cp = (struct ip_vs_conn *)data; | 764 | struct ip_vs_conn *cp = (struct ip_vs_conn *)data; |
765 | struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); | 765 | struct net *net = ip_vs_conn_net(cp); |
766 | struct netns_ipvs *ipvs = net_ipvs(net); | ||
766 | 767 | ||
767 | cp->timeout = 60*HZ; | 768 | cp->timeout = 60*HZ; |
768 | 769 | ||
@@ -827,6 +828,9 @@ static void ip_vs_conn_expire(unsigned long data) | |||
827 | atomic_read(&cp->refcnt)-1, | 828 | atomic_read(&cp->refcnt)-1, |
828 | atomic_read(&cp->n_control)); | 829 | atomic_read(&cp->n_control)); |
829 | 830 | ||
831 | if (ipvs->sync_state & IP_VS_STATE_MASTER) | ||
832 | ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); | ||
833 | |||
830 | ip_vs_conn_put(cp); | 834 | ip_vs_conn_put(cp); |
831 | } | 835 | } |
832 | 836 | ||
@@ -900,6 +904,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, | |||
900 | /* Set its state and timeout */ | 904 | /* Set its state and timeout */ |
901 | cp->state = 0; | 905 | cp->state = 0; |
902 | cp->timeout = 3*HZ; | 906 | cp->timeout = 3*HZ; |
907 | cp->sync_endtime = jiffies & ~3UL; | ||
903 | 908 | ||
904 | /* Bind its packet transmitter */ | 909 | /* Bind its packet transmitter */ |
905 | #ifdef CONFIG_IP_VS_IPV6 | 910 | #ifdef CONFIG_IP_VS_IPV6 |
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c8f36b96f44f..a54b018c6eea 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c | |||
@@ -1613,34 +1613,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1613 | else | 1613 | else |
1614 | pkts = atomic_add_return(1, &cp->in_pkts); | 1614 | pkts = atomic_add_return(1, &cp->in_pkts); |
1615 | 1615 | ||
1616 | if ((ipvs->sync_state & IP_VS_STATE_MASTER) && | 1616 | if (ipvs->sync_state & IP_VS_STATE_MASTER) |
1617 | cp->protocol == IPPROTO_SCTP) { | 1617 | ip_vs_sync_conn(net, cp, pkts); |
1618 | if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && | ||
1619 | (pkts % sysctl_sync_period(ipvs) | ||
1620 | == sysctl_sync_threshold(ipvs))) || | ||
1621 | (cp->old_state != cp->state && | ||
1622 | ((cp->state == IP_VS_SCTP_S_CLOSED) || | ||
1623 | (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || | ||
1624 | (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { | ||
1625 | ip_vs_sync_conn(net, cp); | ||
1626 | goto out; | ||
1627 | } | ||
1628 | } | ||
1629 | |||
1630 | /* Keep this block last: TCP and others with pp->num_states <= 1 */ | ||
1631 | else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && | ||
1632 | (((cp->protocol != IPPROTO_TCP || | ||
1633 | cp->state == IP_VS_TCP_S_ESTABLISHED) && | ||
1634 | (pkts % sysctl_sync_period(ipvs) | ||
1635 | == sysctl_sync_threshold(ipvs))) || | ||
1636 | ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && | ||
1637 | ((cp->state == IP_VS_TCP_S_FIN_WAIT) || | ||
1638 | (cp->state == IP_VS_TCP_S_CLOSE) || | ||
1639 | (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || | ||
1640 | (cp->state == IP_VS_TCP_S_TIME_WAIT))))) | ||
1641 | ip_vs_sync_conn(net, cp); | ||
1642 | out: | ||
1643 | cp->old_state = cp->state; | ||
1644 | 1618 | ||
1645 | ip_vs_conn_put(cp); | 1619 | ip_vs_conn_put(cp); |
1646 | return ret; | 1620 | return ret; |
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index bd3827ec25c9..a77b9bd433aa 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c | |||
@@ -1599,6 +1599,10 @@ static int ip_vs_zero_all(struct net *net) | |||
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | #ifdef CONFIG_SYSCTL | 1601 | #ifdef CONFIG_SYSCTL |
1602 | |||
1603 | static int zero; | ||
1604 | static int three = 3; | ||
1605 | |||
1602 | static int | 1606 | static int |
1603 | proc_do_defense_mode(ctl_table *table, int write, | 1607 | proc_do_defense_mode(ctl_table *table, int write, |
1604 | void __user *buffer, size_t *lenp, loff_t *ppos) | 1608 | void __user *buffer, size_t *lenp, loff_t *ppos) |
@@ -1632,7 +1636,8 @@ proc_do_sync_threshold(ctl_table *table, int write, | |||
1632 | memcpy(val, valp, sizeof(val)); | 1636 | memcpy(val, valp, sizeof(val)); |
1633 | 1637 | ||
1634 | rc = proc_dointvec(table, write, buffer, lenp, ppos); | 1638 | rc = proc_dointvec(table, write, buffer, lenp, ppos); |
1635 | if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { | 1639 | if (write && (valp[0] < 0 || valp[1] < 0 || |
1640 | (valp[0] >= valp[1] && valp[1]))) { | ||
1636 | /* Restore the correct value */ | 1641 | /* Restore the correct value */ |
1637 | memcpy(valp, val, sizeof(val)); | 1642 | memcpy(valp, val, sizeof(val)); |
1638 | } | 1643 | } |
@@ -1755,6 +1760,20 @@ static struct ctl_table vs_vars[] = { | |||
1755 | .proc_handler = proc_do_sync_threshold, | 1760 | .proc_handler = proc_do_sync_threshold, |
1756 | }, | 1761 | }, |
1757 | { | 1762 | { |
1763 | .procname = "sync_refresh_period", | ||
1764 | .maxlen = sizeof(int), | ||
1765 | .mode = 0644, | ||
1766 | .proc_handler = proc_dointvec_jiffies, | ||
1767 | }, | ||
1768 | { | ||
1769 | .procname = "sync_retries", | ||
1770 | .maxlen = sizeof(int), | ||
1771 | .mode = 0644, | ||
1772 | .proc_handler = proc_dointvec_minmax, | ||
1773 | .extra1 = &zero, | ||
1774 | .extra2 = &three, | ||
1775 | }, | ||
1776 | { | ||
1758 | .procname = "nat_icmp_send", | 1777 | .procname = "nat_icmp_send", |
1759 | .maxlen = sizeof(int), | 1778 | .maxlen = sizeof(int), |
1760 | .mode = 0644, | 1779 | .mode = 0644, |
@@ -3678,6 +3697,10 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) | |||
3678 | ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; | 3697 | ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; |
3679 | tbl[idx].data = &ipvs->sysctl_sync_threshold; | 3698 | tbl[idx].data = &ipvs->sysctl_sync_threshold; |
3680 | tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); | 3699 | tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); |
3700 | ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; | ||
3701 | tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; | ||
3702 | ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); | ||
3703 | tbl[idx++].data = &ipvs->sysctl_sync_retries; | ||
3681 | tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; | 3704 | tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; |
3682 | 3705 | ||
3683 | 3706 | ||
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index b3235b230139..8d6a4219e904 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c | |||
@@ -451,11 +451,94 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) | |||
451 | return sb; | 451 | return sb; |
452 | } | 452 | } |
453 | 453 | ||
454 | /* Check if conn should be synced. | ||
455 | * pkts: conn packets, use sysctl_sync_threshold to avoid packet check | ||
456 | * - (1) sync_refresh_period: reduce sync rate. Additionally, retry | ||
457 | * sync_retries times with period of sync_refresh_period/8 | ||
458 | * - (2) if both sync_refresh_period and sync_period are 0 send sync only | ||
459 | * for state changes or only once when pkts matches sync_threshold | ||
460 | * - (3) templates: rate can be reduced only with sync_refresh_period or | ||
461 | * with (2) | ||
462 | */ | ||
463 | static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, | ||
464 | struct ip_vs_conn *cp, int pkts) | ||
465 | { | ||
466 | unsigned long orig = ACCESS_ONCE(cp->sync_endtime); | ||
467 | unsigned long now = jiffies; | ||
468 | unsigned long n = (now + cp->timeout) & ~3UL; | ||
469 | unsigned int sync_refresh_period; | ||
470 | int sync_period; | ||
471 | int force; | ||
472 | |||
473 | /* Check if we sync in current state */ | ||
474 | if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) | ||
475 | force = 0; | ||
476 | else if (likely(cp->protocol == IPPROTO_TCP)) { | ||
477 | if (!((1 << cp->state) & | ||
478 | ((1 << IP_VS_TCP_S_ESTABLISHED) | | ||
479 | (1 << IP_VS_TCP_S_FIN_WAIT) | | ||
480 | (1 << IP_VS_TCP_S_CLOSE) | | ||
481 | (1 << IP_VS_TCP_S_CLOSE_WAIT) | | ||
482 | (1 << IP_VS_TCP_S_TIME_WAIT)))) | ||
483 | return 0; | ||
484 | force = cp->state != cp->old_state; | ||
485 | if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) | ||
486 | goto set; | ||
487 | } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { | ||
488 | if (!((1 << cp->state) & | ||
489 | ((1 << IP_VS_SCTP_S_ESTABLISHED) | | ||
490 | (1 << IP_VS_SCTP_S_CLOSED) | | ||
491 | (1 << IP_VS_SCTP_S_SHUT_ACK_CLI) | | ||
492 | (1 << IP_VS_SCTP_S_SHUT_ACK_SER)))) | ||
493 | return 0; | ||
494 | force = cp->state != cp->old_state; | ||
495 | if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) | ||
496 | goto set; | ||
497 | } else { | ||
498 | /* UDP or another protocol with single state */ | ||
499 | force = 0; | ||
500 | } | ||
501 | |||
502 | sync_refresh_period = sysctl_sync_refresh_period(ipvs); | ||
503 | if (sync_refresh_period > 0) { | ||
504 | long diff = n - orig; | ||
505 | long min_diff = max(cp->timeout >> 1, 10UL * HZ); | ||
506 | |||
507 | /* Avoid sync if difference is below sync_refresh_period | ||
508 | * and below the half timeout. | ||
509 | */ | ||
510 | if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { | ||
511 | int retries = orig & 3; | ||
512 | |||
513 | if (retries >= sysctl_sync_retries(ipvs)) | ||
514 | return 0; | ||
515 | if (time_before(now, orig - cp->timeout + | ||
516 | (sync_refresh_period >> 3))) | ||
517 | return 0; | ||
518 | n |= retries + 1; | ||
519 | } | ||
520 | } | ||
521 | sync_period = sysctl_sync_period(ipvs); | ||
522 | if (sync_period > 0) { | ||
523 | if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && | ||
524 | pkts % sync_period != sysctl_sync_threshold(ipvs)) | ||
525 | return 0; | ||
526 | } else if (sync_refresh_period <= 0 && | ||
527 | pkts != sysctl_sync_threshold(ipvs)) | ||
528 | return 0; | ||
529 | |||
530 | set: | ||
531 | cp->old_state = cp->state; | ||
532 | n = cmpxchg(&cp->sync_endtime, orig, n); | ||
533 | return n == orig || force; | ||
534 | } | ||
535 | |||
454 | /* | 536 | /* |
455 | * Version 0 , could be switched in by sys_ctl. | 537 | * Version 0 , could be switched in by sys_ctl. |
456 | * Add an ip_vs_conn information into the current sync_buff. | 538 | * Add an ip_vs_conn information into the current sync_buff. |
457 | */ | 539 | */ |
458 | void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) | 540 | static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, |
541 | int pkts) | ||
459 | { | 542 | { |
460 | struct netns_ipvs *ipvs = net_ipvs(net); | 543 | struct netns_ipvs *ipvs = net_ipvs(net); |
461 | struct ip_vs_sync_mesg_v0 *m; | 544 | struct ip_vs_sync_mesg_v0 *m; |
@@ -468,6 +551,9 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) | |||
468 | if (cp->flags & IP_VS_CONN_F_ONE_PACKET) | 551 | if (cp->flags & IP_VS_CONN_F_ONE_PACKET) |
469 | return; | 552 | return; |
470 | 553 | ||
554 | if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) | ||
555 | return; | ||
556 | |||
471 | spin_lock(&ipvs->sync_buff_lock); | 557 | spin_lock(&ipvs->sync_buff_lock); |
472 | if (!ipvs->sync_buff) { | 558 | if (!ipvs->sync_buff) { |
473 | ipvs->sync_buff = | 559 | ipvs->sync_buff = |
@@ -513,8 +599,14 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) | |||
513 | spin_unlock(&ipvs->sync_buff_lock); | 599 | spin_unlock(&ipvs->sync_buff_lock); |
514 | 600 | ||
515 | /* synchronize its controller if it has */ | 601 | /* synchronize its controller if it has */ |
516 | if (cp->control) | 602 | cp = cp->control; |
517 | ip_vs_sync_conn(net, cp->control); | 603 | if (cp) { |
604 | if (cp->flags & IP_VS_CONN_F_TEMPLATE) | ||
605 | pkts = atomic_add_return(1, &cp->in_pkts); | ||
606 | else | ||
607 | pkts = sysctl_sync_threshold(ipvs); | ||
608 | ip_vs_sync_conn(net, cp->control, pkts); | ||
609 | } | ||
518 | } | 610 | } |
519 | 611 | ||
520 | /* | 612 | /* |
@@ -522,7 +614,7 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) | |||
522 | * Called by ip_vs_in. | 614 | * Called by ip_vs_in. |
523 | * Sending Version 1 messages | 615 | * Sending Version 1 messages |
524 | */ | 616 | */ |
525 | void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) | 617 | void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) |
526 | { | 618 | { |
527 | struct netns_ipvs *ipvs = net_ipvs(net); | 619 | struct netns_ipvs *ipvs = net_ipvs(net); |
528 | struct ip_vs_sync_mesg *m; | 620 | struct ip_vs_sync_mesg *m; |
@@ -532,13 +624,16 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) | |||
532 | 624 | ||
533 | /* Handle old version of the protocol */ | 625 | /* Handle old version of the protocol */ |
534 | if (sysctl_sync_ver(ipvs) == 0) { | 626 | if (sysctl_sync_ver(ipvs) == 0) { |
535 | ip_vs_sync_conn_v0(net, cp); | 627 | ip_vs_sync_conn_v0(net, cp, pkts); |
536 | return; | 628 | return; |
537 | } | 629 | } |
538 | /* Do not sync ONE PACKET */ | 630 | /* Do not sync ONE PACKET */ |
539 | if (cp->flags & IP_VS_CONN_F_ONE_PACKET) | 631 | if (cp->flags & IP_VS_CONN_F_ONE_PACKET) |
540 | goto control; | 632 | goto control; |
541 | sloop: | 633 | sloop: |
634 | if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) | ||
635 | goto control; | ||
636 | |||
542 | /* Sanity checks */ | 637 | /* Sanity checks */ |
543 | pe_name_len = 0; | 638 | pe_name_len = 0; |
544 | if (cp->pe_data_len) { | 639 | if (cp->pe_data_len) { |
@@ -653,16 +748,10 @@ control: | |||
653 | cp = cp->control; | 748 | cp = cp->control; |
654 | if (!cp) | 749 | if (!cp) |
655 | return; | 750 | return; |
656 | /* | 751 | if (cp->flags & IP_VS_CONN_F_TEMPLATE) |
657 | * Reduce sync rate for templates | 752 | pkts = atomic_add_return(1, &cp->in_pkts); |
658 | * i.e only increment in_pkts for Templates. | 753 | else |
659 | */ | 754 | pkts = sysctl_sync_threshold(ipvs); |
660 | if (cp->flags & IP_VS_CONN_F_TEMPLATE) { | ||
661 | int pkts = atomic_add_return(1, &cp->in_pkts); | ||
662 | |||
663 | if (pkts % sysctl_sync_period(ipvs) != 1) | ||
664 | return; | ||
665 | } | ||
666 | goto sloop; | 755 | goto sloop; |
667 | } | 756 | } |
668 | 757 | ||
@@ -1494,7 +1583,7 @@ next_sync_buff(struct netns_ipvs *ipvs) | |||
1494 | if (sb) | 1583 | if (sb) |
1495 | return sb; | 1584 | return sb; |
1496 | /* Do not delay entries in buffer for more than 2 seconds */ | 1585 | /* Do not delay entries in buffer for more than 2 seconds */ |
1497 | return get_curr_sync_buff(ipvs, 2 * HZ); | 1586 | return get_curr_sync_buff(ipvs, IPVS_SYNC_FLUSH_TIME); |
1498 | } | 1587 | } |
1499 | 1588 | ||
1500 | static int sync_thread_master(void *data) | 1589 | static int sync_thread_master(void *data) |