author	Willem de Bruijn <willemb@google.com>	2013-05-20 00:02:32 -0400
committer	David S. Miller <davem@davemloft.net>	2013-05-20 16:48:04 -0400
commit	99bbc70741903c063b3ccad90a3e06fc55df9245 (patch)
tree	a3377d2461242bf1134464ce3fe6d69f82c907c2 /net
parent	4a5bddf7ea6b6c5916eccbc2fa1950555073ff48 (diff)
rps: selective flow shedding during softnet overflow
A cpu executing the network receive path sheds packets when its input queue
grows to netdev_max_backlog. A single high rate flow (such as a spoofed source
DoS) can exceed a single cpu's processing rate and will degrade throughput of
other flows hashed onto the same cpu.

This patch adds a more fine grained hashtable. If the netdev backlog is above a
threshold, IRQ cpus track the ratio of total traffic of each flow (using 4096
buckets, configurable). The ratio is measured by counting the number of packets
per flow over the last 256 packets from the source cpu. Any flow that occupies
a large fraction of this (set at 50%) will see packet drop while above the
threshold.

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0, kernel
receive (RPS) on cpu0 and application threads on cpus 2--7 each handling 20k
req/s. Throughput halves when hit with a 400 kpps antagonist storm. With this
patch applied, antagonist overload is dropped and the server processes its
complete load.

The patch is effective when kernel receive processing is the bottleneck. The
above RPS scenario is an extreme case, but the same point is reached with RFS
and sufficient kernel processing (iptables, packet socket tap, ..).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
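For illustration, a minimal userspace sketch of the accounting scheme described
above, as implemented by skb_flow_limit() in net/core/dev.c below; the struct
layout, names, and counter widths here are simplified stand-ins, not the
patch's actual sd_flow_limit definitions.

/*
 * Simplified model of the per-cpu flow accounting: a ring of the last
 * 256 packets' flow buckets plus per-bucket counts over that window.
 * Counters are widened to uint16_t so this standalone model cannot
 * overflow; the kernel code keeps the state per softnet_data instance.
 */
#include <stdbool.h>
#include <stdint.h>

#define FLOW_HISTORY	256	/* packets in the sliding window */
#define NUM_BUCKETS	4096	/* default netdev_flow_limit_table_len */

struct flow_limit_model {
	unsigned int history_head;		/* next slot in the ring */
	uint16_t history[FLOW_HISTORY];		/* bucket id per recent packet */
	uint16_t buckets[NUM_BUCKETS];		/* packets per bucket in window */
};

/* Returns true if this packet's flow exceeds 50% of the recent window. */
static bool flow_over_limit(struct flow_limit_model *fl, uint32_t rxhash)
{
	unsigned int new_flow = rxhash & (NUM_BUCKETS - 1);
	unsigned int old_flow = fl->history[fl->history_head];

	/* Slide the window: the oldest packet leaves its bucket... */
	fl->history[fl->history_head] = new_flow;
	fl->history_head = (fl->history_head + 1) & (FLOW_HISTORY - 1);
	if (fl->buckets[old_flow])
		fl->buckets[old_flow]--;

	/* ...and the new packet is charged to its bucket. */
	return ++fl->buckets[new_flow] > (FLOW_HISTORY / 2);
}

In the patch, enqueue_to_backlog() only consults this check once the backlog
has reached half of netdev_max_backlog, so flows on a lightly loaded cpu are
never penalized.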
Diffstat (limited to 'net')
-rw-r--r--	net/Kconfig	12
-rw-r--r--	net/core/dev.c	48
-rw-r--r--	net/core/net-procfs.c	16
-rw-r--r--	net/core/sysctl_net_core.c	104
4 files changed, 177 insertions(+), 3 deletions(-)
diff --git a/net/Kconfig b/net/Kconfig
index 2ddc9046868e..08de901415ee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -259,6 +259,18 @@ config BPF_JIT
 	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
 	  this feature changing /proc/sys/net/core/bpf_jit_enable
 
+config NET_FLOW_LIMIT
+	boolean
+	depends on RPS
+	default y
+	---help---
+	  The network stack has to drop packets when a receive processing CPU's
+	  backlog reaches netdev_max_backlog. If a few out of many active flows
+	  generate the vast majority of load, drop their traffic earlier to
+	  maintain capacity for the other flows. This feature provides servers
+	  with many clients some protection against DoS by a single (spoofed)
+	  flow that greatly exceeds average workload.
+
 menu "Network testing"
 
 config NET_PKTGEN
diff --git a/net/core/dev.c b/net/core/dev.c
index 18e9730cc4be..7229bc30e509 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3064,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd)
 	return 0;
 }
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+	struct softnet_data *sd;
+	unsigned int old_flow, new_flow;
+
+	if (qlen < (netdev_max_backlog >> 1))
+		return false;
+
+	sd = &__get_cpu_var(softnet_data);
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl) {
+		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
+		old_flow = fl->history[fl->history_head];
+		fl->history[fl->history_head] = new_flow;
+
+		fl->history_head++;
+		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+		if (likely(fl->buckets[old_flow]))
+			fl->buckets[old_flow]--;
+
+		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+			fl->count++;
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+#endif
+	return false;
+}
+
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
@@ -3073,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 {
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int qlen;
 
 	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 
 	rps_lock(sd);
-	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+	qlen = skb_queue_len(&sd->input_pkt_queue);
+	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -6269,6 +6311,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 		sd->backlog.gro_list = NULL;
 		sd->backlog.gro_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+		sd->flow_limit = NULL;
+#endif
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 569d355fec3e..2bf83299600a 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
 static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct softnet_data *sd = v;
+	unsigned int flow_limit_count = 0;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit *fl;
+
+	rcu_read_lock();
+	fl = rcu_dereference(sd->flow_limit);
+	if (fl)
+		flow_limit_count = fl->count;
+	rcu_read_unlock();
+#endif
+
+	seq_printf(seq,
+		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
-		   sd->cpu_collision, sd->received_rps);
+		   sd->cpu_collision, sd->received_rps, flow_limit_count);
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfdb46ab3a7f..741db5fc7806 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos)
+{
+	struct sd_flow_limit *cur;
+	struct softnet_data *sd;
+	cpumask_var_t mask;
+	int i, len, ret = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (write) {
+		ret = cpumask_parse_user(buffer, *lenp, mask);
+		if (ret)
+			goto done;
+
+		mutex_lock(&flow_limit_update_mutex);
+		len = sizeof(*cur) + netdev_flow_limit_table_len;
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			cur = rcu_dereference_protected(sd->flow_limit,
+				     lockdep_is_held(&flow_limit_update_mutex));
+			if (cur && !cpumask_test_cpu(i, mask)) {
+				RCU_INIT_POINTER(sd->flow_limit, NULL);
+				synchronize_rcu();
+				kfree(cur);
+			} else if (!cur && cpumask_test_cpu(i, mask)) {
+				cur = kzalloc(len, GFP_KERNEL);
+				if (!cur) {
+					/* not unwinding previous changes */
+					ret = -ENOMEM;
+					goto write_unlock;
+				}
+				cur->num_buckets = netdev_flow_limit_table_len;
+				rcu_assign_pointer(sd->flow_limit, cur);
+			}
+		}
+write_unlock:
+		mutex_unlock(&flow_limit_update_mutex);
+	} else {
+		if (*ppos || !*lenp) {
+			*lenp = 0;
+			goto done;
+		}
+
+		cpumask_clear(mask);
+		rcu_read_lock();
+		for_each_possible_cpu(i) {
+			sd = &per_cpu(softnet_data, i);
+			if (rcu_dereference(sd->flow_limit))
+				cpumask_set_cpu(i, mask);
+		}
+		rcu_read_unlock();
+
+		len = cpumask_scnprintf(buffer, *lenp, mask);
+		*lenp = len + 1;
+		*ppos += len + 1;
+	}
+
+done:
+	free_cpumask_var(mask);
+	return ret;
+}
+
+static int flow_limit_table_len_sysctl(ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	unsigned int old, *ptr;
+	int ret;
+
+	mutex_lock(&flow_limit_update_mutex);
+
+	ptr = table->data;
+	old = *ptr;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write && !is_power_of_2(*ptr)) {
+		*ptr = old;
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&flow_limit_update_mutex);
+	return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	{
+		.procname	= "flow_limit_cpu_bitmap",
+		.mode		= 0644,
+		.proc_handler	= flow_limit_cpu_sysctl
+	},
+	{
+		.procname	= "flow_limit_table_len",
+		.data		= &netdev_flow_limit_table_len,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= flow_limit_table_len_sysctl
+	},
+#endif /* CONFIG_NET_FLOW_LIMIT */
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
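As a usage sketch (not part of the patch): with CONFIG_NET_FLOW_LIMIT built in,
flow limiting is enabled per RPS cpu by writing a hex cpumask to the new
flow_limit_cpu_bitmap sysctl. The mask value and the small helper below are
examples only.

/* Example only: enable flow limiting on cpus 0-3 (hex mask "f").
 * Writing the sysctl requires root; error handling kept minimal. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/core/flow_limit_cpu_bitmap", "w");

	if (!f) {
		perror("flow_limit_cpu_bitmap");
		return 1;
	}
	fputs("f\n", f);		/* cpus 0-3 */
	return fclose(f) ? 1 : 0;
}

flow_limit_table_len accepts only powers of two, since skb_flow_limit() selects
a bucket by masking the packet's rxhash with (num_buckets - 1).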