aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Marcelo Ricardo Leitner <mleitner@redhat.com>, 2015-02-23 13:02:34 -0500
committer: Simon Horman <horms@verge.net.au>, 2015-02-24 23:46:35 -0500
commit: d752c364571743d696c2a54a449ce77550c35ac5 (patch)
tree: 160b607482c080c720f27626fa7a7130307cbbf3
parent: 7f73b9f1ca7334eec0ff9a40e37ece92dd3e420f (diff)
ipvs: allow rescheduling of new connections when port reuse is detected
Currently, when TCP/SCTP port reusing happens, IPVS will find the old entry and use it for the new one, behaving like a forced persistence. But if you consider a cluster with a heavy load of small connections, such reuse will happen often and may lead to suboptimal load balancing and might prevent a new node from getting a fair load.

This patch introduces a new sysctl, conn_reuse_mode, that allows controlling how to proceed when port reuse is detected. The default value will allow rescheduling of new connections only if the old entry was in TIME_WAIT state for TCP or CLOSED for SCTP.

Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
-rw-r--r-- Documentation/networking/ipvs-sysctl.txt | 21
-rw-r--r-- include/net/ip_vs.h | 11
-rw-r--r-- net/netfilter/ipvs/ip_vs_core.c | 33
-rw-r--r-- net/netfilter/ipvs/ip_vs_ctl.c | 8
-rw-r--r-- net/netfilter/ipvs/ip_vs_sync.c | 21
5 files changed, 88 insertions, 6 deletions
diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt
index 7a3c04729591..3ba709531adb 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -22,6 +22,27 @@ backup_only - BOOLEAN
22 If set, disable the director function while the server is 22 If set, disable the director function while the server is
23 in backup mode to avoid packet loops for DR/TUN methods. 23 in backup mode to avoid packet loops for DR/TUN methods.
24 24
25conn_reuse_mode - INTEGER
26 1 - default
27
28 Controls how ipvs deals with new connections for which port
29 reuse is detected. It is a bitmap, with the values being:
30
31 0: disable any special handling on port reuse. The new
32 connection will be delivered to the same real server that was
33 servicing the previous connection. This will effectively
34 disable expire_nodest_conn.
35
36 bit 1: enable rescheduling of new connections when it is safe.
37 That is, whenever expire_nodest_conn applies and, for TCP sockets,
38 when the connection is in TIME_WAIT state (which is only possible
39 if you use NAT mode).
40
41 bit 2: it is bit 1 plus, for TCP connections, when connections
42 are in FIN_WAIT state, as this is the last state seen by load
43 balancer in Direct Routing mode. This bit helps on adding new
44 real servers to a very busy cluster.
45
25conntrack - BOOLEAN 46conntrack - BOOLEAN
26 0 - disabled (default) 47 0 - disabled (default)
27 not 0 - enabled 48 not 0 - enabled
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a627fe690c19..20fd23398537 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -941,6 +941,7 @@ struct netns_ipvs {
941 int sysctl_nat_icmp_send; 941 int sysctl_nat_icmp_send;
942 int sysctl_pmtu_disc; 942 int sysctl_pmtu_disc;
943 int sysctl_backup_only; 943 int sysctl_backup_only;
944 int sysctl_conn_reuse_mode;
944 945
945 /* ip_vs_lblc */ 946 /* ip_vs_lblc */
946 int sysctl_lblc_expiration; 947 int sysctl_lblc_expiration;
@@ -1059,6 +1060,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
1059 ipvs->sysctl_backup_only; 1060 ipvs->sysctl_backup_only;
1060} 1061}
1061 1062
1063static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
1064{
1065 return ipvs->sysctl_conn_reuse_mode;
1066}
1067
1062#else 1068#else
1063 1069
1064static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) 1070static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1126,6 +1132,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
1126 return 0; 1132 return 0;
1127} 1133}
1128 1134
1135static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
1136{
1137 return 1;
1138}
1139
1129#endif 1140#endif
1130 1141
1131/* IPVS core functions 1142/* IPVS core functions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index c9470c86308f..6103ab933c5b 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1042,6 +1042,26 @@ static inline bool is_new_conn(const struct sk_buff *skb,
1042 } 1042 }
1043} 1043}
1044 1044
1045static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
1046 int conn_reuse_mode)
1047{
1048 /* Controlled (FTP DATA or persistence)? */
1049 if (cp->control)
1050 return false;
1051
1052 switch (cp->protocol) {
1053 case IPPROTO_TCP:
1054 return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
1055 ((conn_reuse_mode & 2) &&
1056 (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
1057 (cp->flags & IP_VS_CONN_F_NOOUTPUT));
1058 case IPPROTO_SCTP:
1059 return cp->state == IP_VS_SCTP_S_CLOSED;
1060 default:
1061 return false;
1062 }
1063}
1064
1045/* Handle response packets: rewrite addresses and send away... 1065/* Handle response packets: rewrite addresses and send away...
1046 */ 1066 */
1047static unsigned int 1067static unsigned int
@@ -1580,6 +1600,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1580 struct ip_vs_conn *cp; 1600 struct ip_vs_conn *cp;
1581 int ret, pkts; 1601 int ret, pkts;
1582 struct netns_ipvs *ipvs; 1602 struct netns_ipvs *ipvs;
1603 int conn_reuse_mode;
1583 1604
1584 /* Already marked as IPVS request or reply? */ 1605 /* Already marked as IPVS request or reply? */
1585 if (skb->ipvs_property) 1606 if (skb->ipvs_property)
@@ -1648,10 +1669,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1648 */ 1669 */
1649 cp = pp->conn_in_get(af, skb, &iph, 0); 1670 cp = pp->conn_in_get(af, skb, &iph, 0);
1650 1671
1651 if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && 1672 conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
1652 unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && 1673 if (conn_reuse_mode && !iph.fragoffs &&
1653 is_new_conn(skb, &iph)) { 1674 is_new_conn(skb, &iph) && cp &&
1654 ip_vs_conn_expire_now(cp); 1675 ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
1676 unlikely(!atomic_read(&cp->dest->weight))) ||
1677 unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
1678 if (!atomic_read(&cp->n_control))
1679 ip_vs_conn_expire_now(cp);
1655 __ip_vs_conn_put(cp); 1680 __ip_vs_conn_put(cp);
1656 cp = NULL; 1681 cp = NULL;
1657 } 1682 }
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6fd60059faf0..76cc9ffd87fa 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1823,6 +1823,12 @@ static struct ctl_table vs_vars[] = {
1823 .mode = 0644, 1823 .mode = 0644,
1824 .proc_handler = proc_dointvec, 1824 .proc_handler = proc_dointvec,
1825 }, 1825 },
1826 {
1827 .procname = "conn_reuse_mode",
1828 .maxlen = sizeof(int),
1829 .mode = 0644,
1830 .proc_handler = proc_dointvec,
1831 },
1826#ifdef CONFIG_IP_VS_DEBUG 1832#ifdef CONFIG_IP_VS_DEBUG
1827 { 1833 {
1828 .procname = "debug_level", 1834 .procname = "debug_level",
@@ -3790,6 +3796,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
3790 ipvs->sysctl_pmtu_disc = 1; 3796 ipvs->sysctl_pmtu_disc = 1;
3791 tbl[idx++].data = &ipvs->sysctl_pmtu_disc; 3797 tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
3792 tbl[idx++].data = &ipvs->sysctl_backup_only; 3798 tbl[idx++].data = &ipvs->sysctl_backup_only;
3799 ipvs->sysctl_conn_reuse_mode = 1;
3800 tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
3793 3801
3794 3802
3795 ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); 3803 ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index c47ffd7a0a70..f96229cdb6e1 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
845 struct ip_vs_conn *cp; 845 struct ip_vs_conn *cp;
846 struct netns_ipvs *ipvs = net_ipvs(net); 846 struct netns_ipvs *ipvs = net_ipvs(net);
847 847
848 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 848 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
849 cp = ip_vs_conn_in_get(param); 849 cp = ip_vs_conn_in_get(param);
850 else 850 if (cp && ((cp->dport != dport) ||
851 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
852 if (!(flags & IP_VS_CONN_F_INACTIVE)) {
853 ip_vs_conn_expire_now(cp);
854 __ip_vs_conn_put(cp);
855 cp = NULL;
856 } else {
857 /* This is the expiration message for the
858 * connection that was already replaced, so we
859 * just ignore it.
860 */
861 __ip_vs_conn_put(cp);
862 kfree(param->pe_data);
863 return;
864 }
865 }
866 } else {
851 cp = ip_vs_ct_in_get(param); 867 cp = ip_vs_ct_in_get(param);
868 }
852 869
853 if (cp) { 870 if (cp) {
854 /* Free pe_data */ 871 /* Free pe_data */