diff options
author | Jesper Dangaard Brouer <brouer@redhat.com> | 2017-09-01 05:26:08 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-09-03 14:01:05 -0400 |
commit | fb452a1aa3fd4034d7999e309c5466ff2d7005aa (patch) | |
tree | 929f2e7692f05a685de60535e2de27aa37579400 | |
parent | 64327fc811268d4a24de03dac242ea29de6be75f (diff) |
Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
There is a bug in the fragmentation code's use of the percpu_counter API
that can cause issues on systems with many CPUs.
The frag_mem_limit() just reads the global counter (fbc->count),
without considering that other CPUs can have up to the batch size (130K) that
hasn't been subtracted yet. Due to the 3 MBytes lower thresh limit,
this becomes dangerous at >=24 CPUs (3*1024*1024/130000=24).
The correct API usage would be to use __percpu_counter_compare() which
does the right thing, and takes into account the number of (online)
CPUs and batch size, to account for this and call __percpu_counter_sum()
when needed.
We choose to revert the use of the lib/percpu_counter API for frag
memory accounting for several reasons:
1) On systems with CPUs > 24, the heavier fully locked
__percpu_counter_sum() is always invoked, which will be more
expensive than the atomic_t that is reverted to.
Given that systems with more than 24 CPUs are becoming common, this doesn't
seem like a good option. To mitigate this, the batch size could be
decreased and thresh increased.
2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
CPU, before SKBs are pushed into sockets on remote CPUs. Given
NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
likely be limited. Thus, a fair chance that atomic add+dec happen
on the same CPU.
Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks")
removed init_frag_mem_limit() and instead use inet_frags_init_net().
After this revert, inet_frags_uninit_net() becomes empty.
Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/inet_frag.h | 30 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 4 |
2 files changed, 10 insertions, 24 deletions
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 6fdcd2427776..fa635aa6d0b9 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h | |||
@@ -1,14 +1,9 @@ | |||
1 | #ifndef __NET_FRAG_H__ | 1 | #ifndef __NET_FRAG_H__ |
2 | #define __NET_FRAG_H__ | 2 | #define __NET_FRAG_H__ |
3 | 3 | ||
4 | #include <linux/percpu_counter.h> | ||
5 | |||
6 | struct netns_frags { | 4 | struct netns_frags { |
7 | /* The percpu_counter "mem" need to be cacheline aligned. | 5 | /* Keep atomic mem on separate cachelines in structs that include it */ |
8 | * mem.count must not share cacheline with other writers | 6 | atomic_t mem ____cacheline_aligned_in_smp; |
9 | */ | ||
10 | struct percpu_counter mem ____cacheline_aligned_in_smp; | ||
11 | |||
12 | /* sysctls */ | 7 | /* sysctls */ |
13 | int timeout; | 8 | int timeout; |
14 | int high_thresh; | 9 | int high_thresh; |
@@ -110,11 +105,11 @@ void inet_frags_fini(struct inet_frags *); | |||
110 | 105 | ||
111 | static inline int inet_frags_init_net(struct netns_frags *nf) | 106 | static inline int inet_frags_init_net(struct netns_frags *nf) |
112 | { | 107 | { |
113 | return percpu_counter_init(&nf->mem, 0, GFP_KERNEL); | 108 | atomic_set(&nf->mem, 0); |
109 | return 0; | ||
114 | } | 110 | } |
115 | static inline void inet_frags_uninit_net(struct netns_frags *nf) | 111 | static inline void inet_frags_uninit_net(struct netns_frags *nf) |
116 | { | 112 | { |
117 | percpu_counter_destroy(&nf->mem); | ||
118 | } | 113 | } |
119 | 114 | ||
120 | void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); | 115 | void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); |
@@ -140,31 +135,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q) | |||
140 | 135 | ||
141 | /* Memory Tracking Functions. */ | 136 | /* Memory Tracking Functions. */ |
142 | 137 | ||
143 | /* The default percpu_counter batch size is not big enough to scale to | ||
144 | * fragmentation mem acct sizes. | ||
145 | * The mem size of a 64K fragment is approx: | ||
146 | * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes | ||
147 | */ | ||
148 | static unsigned int frag_percpu_counter_batch = 130000; | ||
149 | |||
150 | static inline int frag_mem_limit(struct netns_frags *nf) | 138 | static inline int frag_mem_limit(struct netns_frags *nf) |
151 | { | 139 | { |
152 | return percpu_counter_read(&nf->mem); | 140 | return atomic_read(&nf->mem); |
153 | } | 141 | } |
154 | 142 | ||
155 | static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) | 143 | static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) |
156 | { | 144 | { |
157 | percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch); | 145 | atomic_sub(i, &nf->mem); |
158 | } | 146 | } |
159 | 147 | ||
160 | static inline void add_frag_mem_limit(struct netns_frags *nf, int i) | 148 | static inline void add_frag_mem_limit(struct netns_frags *nf, int i) |
161 | { | 149 | { |
162 | percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch); | 150 | atomic_add(i, &nf->mem); |
163 | } | 151 | } |
164 | 152 | ||
165 | static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) | 153 | static inline int sum_frag_mem_limit(struct netns_frags *nf) |
166 | { | 154 | { |
167 | return percpu_counter_sum_positive(&nf->mem); | 155 | return atomic_read(&nf->mem); |
168 | } | 156 | } |
169 | 157 | ||
170 | /* RFC 3168 support : | 158 | /* RFC 3168 support : |
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 96e95e83cc61..af74d0433453 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c | |||
@@ -234,10 +234,8 @@ evict_again: | |||
234 | cond_resched(); | 234 | cond_resched(); |
235 | 235 | ||
236 | if (read_seqretry(&f->rnd_seqlock, seq) || | 236 | if (read_seqretry(&f->rnd_seqlock, seq) || |
237 | percpu_counter_sum(&nf->mem)) | 237 | sum_frag_mem_limit(nf)) |
238 | goto evict_again; | 238 | goto evict_again; |
239 | |||
240 | percpu_counter_destroy(&nf->mem); | ||
241 | } | 239 | } |
242 | EXPORT_SYMBOL(inet_frags_exit_net); | 240 | EXPORT_SYMBOL(inet_frags_exit_net); |
243 | 241 | ||