diff options
author | Marcelo Ricardo Leitner <mleitner@redhat.com> | 2015-02-23 13:02:34 -0500 |
---|---|---|
committer | Simon Horman <horms@verge.net.au> | 2015-02-24 23:46:35 -0500 |
commit | d752c364571743d696c2a54a449ce77550c35ac5 (patch) | |
tree | 160b607482c080c720f27626fa7a7130307cbbf3 | |
parent | 7f73b9f1ca7334eec0ff9a40e37ece92dd3e420f (diff) |
ipvs: allow rescheduling of new connections when port reuse is detected
Currently, when TCP/SCTP port reusing happens, IPVS will find the old
entry and use it for the new one, behaving like a forced persistence.
But if you consider a cluster with a heavy load of small connections,
such reuse will happen often and may lead to suboptimal load
balancing and might prevent a new node from getting a fair load.
This patch introduces a new sysctl, conn_reuse_mode, that allows
controlling how to proceed when port reuse is detected. The default
value will allow rescheduling of new connections only if the old entry
was in TIME_WAIT state for TCP or CLOSED for SCTP.
Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
-rw-r--r-- | Documentation/networking/ipvs-sysctl.txt | 21 | ||||
-rw-r--r-- | include/net/ip_vs.h | 11 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 33 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ctl.c | 8 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_sync.c | 21 |
5 files changed, 88 insertions, 6 deletions
diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt index 7a3c04729591..3ba709531adb 100644 --- a/Documentation/networking/ipvs-sysctl.txt +++ b/Documentation/networking/ipvs-sysctl.txt | |||
@@ -22,6 +22,27 @@ backup_only - BOOLEAN | |||
22 | If set, disable the director function while the server is | 22 | If set, disable the director function while the server is |
23 | in backup mode to avoid packet loops for DR/TUN methods. | 23 | in backup mode to avoid packet loops for DR/TUN methods. |
24 | 24 | ||
25 | conn_reuse_mode - INTEGER | ||
26 | 1 - default | ||
27 | |||
28 | Controls how ipvs will deal with connections for which port reuse | ||
29 | is detected. It is a bitmap, with the values being: | ||
30 | |||
31 | 0: disable any special handling on port reuse. The new | ||
32 | connection will be delivered to the same real server that was | ||
33 | servicing the previous connection. This will effectively | ||
34 | disable expire_nodest_conn. | ||
35 | |||
36 | bit 1: enable rescheduling of new connections when it is safe. | ||
37 | That is, whenever expire_nodest_conn applies and, for TCP sockets, | ||
38 | when the connection is in TIME_WAIT state (which is only possible | ||
39 | if you use NAT mode). | ||
40 | |||
41 | bit 2: it is bit 1 plus, for TCP connections, when connections | ||
42 | are in FIN_WAIT state, as this is the last state seen by the load | ||
43 | balancer in Direct Routing mode. This bit helps when adding new | ||
44 | real servers to a very busy cluster. | ||
45 | |||
25 | conntrack - BOOLEAN | 46 | conntrack - BOOLEAN |
26 | 0 - disabled (default) | 47 | 0 - disabled (default) |
27 | not 0 - enabled | 48 | not 0 - enabled |
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a627fe690c19..20fd23398537 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h | |||
@@ -941,6 +941,7 @@ struct netns_ipvs { | |||
941 | int sysctl_nat_icmp_send; | 941 | int sysctl_nat_icmp_send; |
942 | int sysctl_pmtu_disc; | 942 | int sysctl_pmtu_disc; |
943 | int sysctl_backup_only; | 943 | int sysctl_backup_only; |
944 | int sysctl_conn_reuse_mode; | ||
944 | 945 | ||
945 | /* ip_vs_lblc */ | 946 | /* ip_vs_lblc */ |
946 | int sysctl_lblc_expiration; | 947 | int sysctl_lblc_expiration; |
@@ -1059,6 +1060,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs) | |||
1059 | ipvs->sysctl_backup_only; | 1060 | ipvs->sysctl_backup_only; |
1060 | } | 1061 | } |
1061 | 1062 | ||
1063 | static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) | ||
1064 | { | ||
1065 | return ipvs->sysctl_conn_reuse_mode; | ||
1066 | } | ||
1067 | |||
1062 | #else | 1068 | #else |
1063 | 1069 | ||
1064 | static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) | 1070 | static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) |
@@ -1126,6 +1132,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs) | |||
1126 | return 0; | 1132 | return 0; |
1127 | } | 1133 | } |
1128 | 1134 | ||
1135 | static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) | ||
1136 | { | ||
1137 | return 1; | ||
1138 | } | ||
1139 | |||
1129 | #endif | 1140 | #endif |
1130 | 1141 | ||
1131 | /* IPVS core functions | 1142 | /* IPVS core functions |
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c9470c86308f..6103ab933c5b 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c | |||
@@ -1042,6 +1042,26 @@ static inline bool is_new_conn(const struct sk_buff *skb, | |||
1042 | } | 1042 | } |
1043 | } | 1043 | } |
1044 | 1044 | ||
1045 | static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, | ||
1046 | int conn_reuse_mode) | ||
1047 | { | ||
1048 | /* Controlled (FTP DATA or persistence)? */ | ||
1049 | if (cp->control) | ||
1050 | return false; | ||
1051 | |||
1052 | switch (cp->protocol) { | ||
1053 | case IPPROTO_TCP: | ||
1054 | return (cp->state == IP_VS_TCP_S_TIME_WAIT) || | ||
1055 | ((conn_reuse_mode & 2) && | ||
1056 | (cp->state == IP_VS_TCP_S_FIN_WAIT) && | ||
1057 | (cp->flags & IP_VS_CONN_F_NOOUTPUT)); | ||
1058 | case IPPROTO_SCTP: | ||
1059 | return cp->state == IP_VS_SCTP_S_CLOSED; | ||
1060 | default: | ||
1061 | return false; | ||
1062 | } | ||
1063 | } | ||
1064 | |||
1045 | /* Handle response packets: rewrite addresses and send away... | 1065 | /* Handle response packets: rewrite addresses and send away... |
1046 | */ | 1066 | */ |
1047 | static unsigned int | 1067 | static unsigned int |
@@ -1580,6 +1600,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1580 | struct ip_vs_conn *cp; | 1600 | struct ip_vs_conn *cp; |
1581 | int ret, pkts; | 1601 | int ret, pkts; |
1582 | struct netns_ipvs *ipvs; | 1602 | struct netns_ipvs *ipvs; |
1603 | int conn_reuse_mode; | ||
1583 | 1604 | ||
1584 | /* Already marked as IPVS request or reply? */ | 1605 | /* Already marked as IPVS request or reply? */ |
1585 | if (skb->ipvs_property) | 1606 | if (skb->ipvs_property) |
@@ -1648,10 +1669,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1648 | */ | 1669 | */ |
1649 | cp = pp->conn_in_get(af, skb, &iph, 0); | 1670 | cp = pp->conn_in_get(af, skb, &iph, 0); |
1650 | 1671 | ||
1651 | if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && | 1672 | conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); |
1652 | unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && | 1673 | if (conn_reuse_mode && !iph.fragoffs && |
1653 | is_new_conn(skb, &iph)) { | 1674 | is_new_conn(skb, &iph) && cp && |
1654 | ip_vs_conn_expire_now(cp); | 1675 | ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && |
1676 | unlikely(!atomic_read(&cp->dest->weight))) || | ||
1677 | unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) { | ||
1678 | if (!atomic_read(&cp->n_control)) | ||
1679 | ip_vs_conn_expire_now(cp); | ||
1655 | __ip_vs_conn_put(cp); | 1680 | __ip_vs_conn_put(cp); |
1656 | cp = NULL; | 1681 | cp = NULL; |
1657 | } | 1682 | } |
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6fd60059faf0..76cc9ffd87fa 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c | |||
@@ -1823,6 +1823,12 @@ static struct ctl_table vs_vars[] = { | |||
1823 | .mode = 0644, | 1823 | .mode = 0644, |
1824 | .proc_handler = proc_dointvec, | 1824 | .proc_handler = proc_dointvec, |
1825 | }, | 1825 | }, |
1826 | { | ||
1827 | .procname = "conn_reuse_mode", | ||
1828 | .maxlen = sizeof(int), | ||
1829 | .mode = 0644, | ||
1830 | .proc_handler = proc_dointvec, | ||
1831 | }, | ||
1826 | #ifdef CONFIG_IP_VS_DEBUG | 1832 | #ifdef CONFIG_IP_VS_DEBUG |
1827 | { | 1833 | { |
1828 | .procname = "debug_level", | 1834 | .procname = "debug_level", |
@@ -3790,6 +3796,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) | |||
3790 | ipvs->sysctl_pmtu_disc = 1; | 3796 | ipvs->sysctl_pmtu_disc = 1; |
3791 | tbl[idx++].data = &ipvs->sysctl_pmtu_disc; | 3797 | tbl[idx++].data = &ipvs->sysctl_pmtu_disc; |
3792 | tbl[idx++].data = &ipvs->sysctl_backup_only; | 3798 | tbl[idx++].data = &ipvs->sysctl_backup_only; |
3799 | ipvs->sysctl_conn_reuse_mode = 1; | ||
3800 | tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; | ||
3793 | 3801 | ||
3794 | 3802 | ||
3795 | ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); | 3803 | ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); |
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index c47ffd7a0a70..f96229cdb6e1 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c | |||
@@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, | |||
845 | struct ip_vs_conn *cp; | 845 | struct ip_vs_conn *cp; |
846 | struct netns_ipvs *ipvs = net_ipvs(net); | 846 | struct netns_ipvs *ipvs = net_ipvs(net); |
847 | 847 | ||
848 | if (!(flags & IP_VS_CONN_F_TEMPLATE)) | 848 | if (!(flags & IP_VS_CONN_F_TEMPLATE)) { |
849 | cp = ip_vs_conn_in_get(param); | 849 | cp = ip_vs_conn_in_get(param); |
850 | else | 850 | if (cp && ((cp->dport != dport) || |
851 | !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { | ||
852 | if (!(flags & IP_VS_CONN_F_INACTIVE)) { | ||
853 | ip_vs_conn_expire_now(cp); | ||
854 | __ip_vs_conn_put(cp); | ||
855 | cp = NULL; | ||
856 | } else { | ||
857 | /* This is the expiration message for the | ||
858 | * connection that was already replaced, so we | ||
859 | * just ignore it. | ||
860 | */ | ||
861 | __ip_vs_conn_put(cp); | ||
862 | kfree(param->pe_data); | ||
863 | return; | ||
864 | } | ||
865 | } | ||
866 | } else { | ||
851 | cp = ip_vs_ct_in_get(param); | 867 | cp = ip_vs_ct_in_get(param); |
868 | } | ||
852 | 869 | ||
853 | if (cp) { | 870 | if (cp) { |
854 | /* Free pe_data */ | 871 | /* Free pe_data */ |