author     Florian Westphal <fw@strlen.de>          2014-07-24 10:50:32 -0400
committer  David S. Miller <davem@davemloft.net>    2014-07-28 01:34:35 -0400
commit     b13d3cbfb8e8a8f53930af67d1ebf05149f32c24
tree       e612cafb034853b303a0622c8acc05c53775b255  /net/ipv4
parent     86e93e470cadedda9181a2bd9aee1d9d2e5e9c0f
inet: frag: move eviction of queues to work queue
When the high_thresh limit is reached we try to toss the 'oldest'
incomplete fragment queues until memory limits are below the
low_thresh value. This happens in softirq/packet processing context.

This has two drawbacks:

1) processors might evict a queue that was about to be completed
   by another cpu, because they will compete wrt. resource usage
   and resource reclaim.

2) LRU list maintenance is expensive.

But when constantly overloaded, even the 'least recently used' element
is recent, so removing the 'lru' queue first is not 'fairer' than
removing any other fragment queue.

This moves eviction out of the fast path:

When the low threshold is reached, a work queue is scheduled which then
iterates over the table and removes the queues that exceed the memory
limits of the namespace. It sets a new flag called INET_FRAG_EVICTED on
the evicted queues so the proper counters will get incremented when the
queue is forcefully expired.

When the high threshold is reached, no more fragment queues are created
until we're below the limit again.

The LRU list is now unused and will be removed in a followup patch.

Joint work with Nikolay Aleksandrov.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
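For orientation before reading the diff, here is a minimal, self-contained userspace sketch of the scheme the message describes: creating a new queue is refused once accounted memory passes high_thresh, and a deferred worker walks a bounded number of hash buckets per run, dropping entries until usage falls below low_thresh. This is only an illustration, not kernel code: the names (frag_table, frag_alloc, evict_worker) and constants are invented for the example; the real locking, refcounting and work_struct handling are in inet_fragment.c below.

/* Illustrative only -- not the kernel implementation. */
#include <stdio.h>
#include <stdlib.h>

#define NBUCKETS       8   /* stand-in for INETFRAGS_HASHSZ */
#define EVICT_BUCKETS  4   /* stand-in for INETFRAGS_EVICT_BUCKETS */

struct frag {
        struct frag *next;
        size_t size;
};

struct frag_table {
        struct frag *bucket[NBUCKETS];
        size_t mem;                 /* memory currently accounted to fragments */
        size_t high_thresh, low_thresh;
        unsigned int next_bucket;   /* where the next eviction pass resumes */
};

/* Fast path: refuse to create a queue above high_thresh (the real code
 * also schedules the eviction worker at this point). */
static struct frag *frag_alloc(struct frag_table *t, size_t size)
{
        struct frag *f;

        if (t->mem > t->high_thresh)
                return NULL;
        f = calloc(1, sizeof(*f));
        if (!f)
                return NULL;
        f->size = size;
        t->mem += size;
        return f;
}

/* Deferred eviction: walk at most EVICT_BUCKETS buckets per run, starting
 * where the previous run stopped, dropping entries while above low_thresh. */
static void evict_worker(struct frag_table *t)
{
        unsigned int i = t->next_bucket;
        unsigned int budget;

        for (budget = EVICT_BUCKETS; budget; --budget) {
                struct frag **pp = &t->bucket[i];

                while (*pp && t->mem >= t->low_thresh) {
                        struct frag *victim = *pp;

                        *pp = victim->next;
                        t->mem -= victim->size;
                        free(victim);   /* real code marks INET_FRAG_EVICTED and
                                         * lets the expire handler drop the queue */
                }
                i = (i + 1) % NBUCKETS;
        }
        t->next_bucket = i;
}

int main(void)
{
        struct frag_table t = { .high_thresh = 4096, .low_thresh = 3072 };
        int i;

        for (i = 0; i < 10; i++) {
                struct frag *f = frag_alloc(&t, 512);

                if (!f) {               /* over high_thresh: defer the cleanup */
                        evict_worker(&t);
                        continue;
                }
                f->next = t.bucket[i % NBUCKETS];
                t.bucket[i % NBUCKETS] = f;
        }
        printf("memory after eviction pass: %zu bytes\n", t.mem);
        return 0;
}

The bounded per-run budget is what keeps one pass cheap; the worker simply resumes from next_bucket on its next invocation, mirroring the f->next_bucket cursor in the patch.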
Diffstat (limited to 'net/ipv4')
-rw-r--r--   net/ipv4/inet_fragment.c   142
-rw-r--r--   net/ipv4/ip_fragment.c       3
2 files changed, 101 insertions, 44 deletions
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 535636017534..43315ecb9400 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,6 +25,9 @@
 #include <net/inet_frag.h>
 #include <net/inet_ecn.h>
 
+#define INETFRAGS_EVICT_BUCKETS   128
+#define INETFRAGS_EVICT_MAX       512
+
 /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
  * Value : 0xff if frame should be dropped.
  *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -46,8 +49,6 @@ const u8 ip_frag_ecn_table[16] = {
 };
 EXPORT_SYMBOL(ip_frag_ecn_table);
 
-static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
-
 static unsigned int
 inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
 {
@@ -89,10 +90,92 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
 	mod_timer(&f->secret_timer, now + f->secret_interval);
 }
 
+static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
+{
+	return q->net->low_thresh == 0 ||
+	       frag_mem_limit(q->net) >= q->net->low_thresh;
+}
+
+static unsigned int
+inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
+{
+	struct inet_frag_queue *fq;
+	struct hlist_node *n;
+	unsigned int evicted = 0;
+	HLIST_HEAD(expired);
+
+evict_again:
+	spin_lock(&hb->chain_lock);
+
+	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
+		if (!inet_fragq_should_evict(fq))
+			continue;
+
+		if (!del_timer(&fq->timer)) {
+			/* q expiring right now thus increment its refcount so
+			 * it won't be freed under us and wait until the timer
+			 * has finished executing then destroy it
+			 */
+			atomic_inc(&fq->refcnt);
+			spin_unlock(&hb->chain_lock);
+			del_timer_sync(&fq->timer);
+			WARN_ON(atomic_read(&fq->refcnt) != 1);
+			inet_frag_put(fq, f);
+			goto evict_again;
+		}
+
+		/* suppress xmit of (icmp) error packet */
+		fq->last_in &= ~INET_FRAG_FIRST_IN;
+		fq->last_in |= INET_FRAG_EVICTED;
+		hlist_del(&fq->list);
+		hlist_add_head(&fq->list, &expired);
+		++evicted;
+	}
+
+	spin_unlock(&hb->chain_lock);
+
+	hlist_for_each_entry_safe(fq, n, &expired, list)
+		f->frag_expire((unsigned long) fq);
+
+	return evicted;
+}
+
+static void inet_frag_worker(struct work_struct *work)
+{
+	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
+	unsigned int i, evicted = 0;
+	struct inet_frags *f;
+
+	f = container_of(work, struct inet_frags, frags_work);
+
+	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
+
+	read_lock_bh(&f->lock);
+
+	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+		evicted += inet_evict_bucket(f, &f->hash[i]);
+		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
+		if (evicted > INETFRAGS_EVICT_MAX)
+			break;
+	}
+
+	f->next_bucket = i;
+
+	read_unlock_bh(&f->lock);
+}
+
+static void inet_frag_schedule_worker(struct inet_frags *f)
+{
+	if (unlikely(!work_pending(&f->frags_work)))
+		schedule_work(&f->frags_work);
+}
+
 void inet_frags_init(struct inet_frags *f)
 {
 	int i;
 
+	INIT_WORK(&f->frags_work, inet_frag_worker);
+
 	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
 		struct inet_frag_bucket *hb = &f->hash[i];
 
@@ -120,16 +203,22 @@ EXPORT_SYMBOL(inet_frags_init_net);
 void inet_frags_fini(struct inet_frags *f)
 {
 	del_timer(&f->secret_timer);
+	cancel_work_sync(&f->frags_work);
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
+	int i;
+
 	nf->low_thresh = 0;
 
-	local_bh_disable();
-	inet_frag_evictor(nf, f, true);
-	local_bh_enable();
+	read_lock_bh(&f->lock);
+
+	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+		inet_evict_bucket(f, &f->hash[i]);
+
+	read_unlock_bh(&f->lock);
 
 	percpu_counter_destroy(&nf->mem);
 }
@@ -205,41 +294,6 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
-{
-	struct inet_frag_queue *q;
-	int work, evicted = 0;
-
-	work = frag_mem_limit(nf) - nf->low_thresh;
-	while (work > 0 || force) {
-		spin_lock(&nf->lru_lock);
-
-		if (list_empty(&nf->lru_list)) {
-			spin_unlock(&nf->lru_lock);
-			break;
-		}
-
-		q = list_first_entry(&nf->lru_list,
-				struct inet_frag_queue, lru_list);
-		atomic_inc(&q->refcnt);
-		/* Remove q from list to avoid several CPUs grabbing it */
-		list_del_init(&q->lru_list);
-
-		spin_unlock(&nf->lru_lock);
-
-		spin_lock(&q->lock);
-		if (!(q->last_in & INET_FRAG_COMPLETE))
-			inet_frag_kill(q, f);
-		spin_unlock(&q->lock);
-
-		if (atomic_dec_and_test(&q->refcnt))
-			inet_frag_destroy(q, f, &work);
-		evicted++;
-	}
-
-	return evicted;
-}
-
 static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		struct inet_frag_queue *qp_in, struct inet_frags *f,
 		void *arg)
@@ -292,8 +346,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 
-	if (frag_mem_limit(nf) > nf->high_thresh)
+	if (frag_mem_limit(nf) > nf->high_thresh) {
+		inet_frag_schedule_worker(f);
 		return NULL;
+	}
 
 	q = kzalloc(f->qsize, GFP_ATOMIC);
 	if (q == NULL)
@@ -331,8 +387,8 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 	struct inet_frag_queue *q;
 	int depth = 0;
 
-	if (frag_mem_limit(nf) > nf->high_thresh)
-		inet_frag_evictor(nf, f, false);
+	if (frag_mem_limit(nf) > nf->low_thresh)
+		inet_frag_schedule_worker(f);
 
 	hash &= (INETFRAGS_HASHSZ - 1);
 	hb = &f->hash[hash];
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 54988672d00d..54bd170c5eb4 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -195,7 +195,8 @@ static void ip_expire(unsigned long arg)
 
 	ipq_kill(qp);
 
-	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+	if (!(qp->q.last_in & INET_FRAG_EVICTED))
+		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 
 	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {