author     Johannes Weiner <hannes@cmpxchg.org>    2014-12-10 18:42:31 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2014-12-10 20:41:04 -0500
commit     3e32cb2e0a12b6915056ff04601cf1bb9b44f967
tree       75d312d531736fbb4281bfe0e80847d3ef9f8a4a /include/net/sock.h
parent     8df0c2dcf61781d2efa8e6e5b06870f6c6785735
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.

Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then
only happens when interfacing with userspace.

The removed locking overhead is noticeable when scaling beyond the
per-cpu charge caches - on a 4-socket machine with 144 threads, the
following test shows the performance differences of 288 memcgs
concurrently running a page fault benchmark:

vanilla:

    18631648.500498      task-clock (msec)       #  140.643 CPUs utilized     ( +-  0.33% )
           1,380,638      context-switches       #    0.074 K/sec             ( +-  0.75% )
              24,390      cpu-migrations         #    0.001 K/sec             ( +-  8.44% )
       1,843,305,768      page-faults            #    0.099 M/sec             ( +-  0.00% )
  50,134,994,088,218      cycles                 #    2.691 GHz               ( +-  0.33% )
     <not supported>      stalled-cycles-frontend
     <not supported>      stalled-cycles-backend
   8,049,712,224,651      instructions           #    0.16  insns per cycle   ( +-  0.04% )
   1,586,970,584,979      branches               #   85.176 M/sec             ( +-  0.05% )
       1,724,989,949      branch-misses          #    0.11% of all branches   ( +-  0.48% )

       132.474343877 seconds time elapsed                                     ( +-  0.21% )

lockless:

    12195979.037525      task-clock (msec)       #  133.480 CPUs utilized     ( +-  0.18% )
             832,850      context-switches       #    0.068 K/sec             ( +-  0.54% )
              15,624      cpu-migrations         #    0.001 K/sec             ( +- 10.17% )
       1,843,304,774      page-faults            #    0.151 M/sec             ( +-  0.00% )
  32,811,216,801,141      cycles                 #    2.690 GHz               ( +-  0.18% )
     <not supported>      stalled-cycles-frontend
     <not supported>      stalled-cycles-backend
   9,999,265,091,727      instructions           #    0.30  insns per cycle   ( +-  0.10% )
   2,076,759,325,203      branches               #  170.282 M/sec             ( +-  0.12% )
       1,656,917,214      branch-misses          #    0.08% of all branches   ( +-  0.55% )

        91.369330729 seconds time elapsed                                     ( +-  0.45% )

On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also
makes the code a lot more readable.

Notable differences between the old and new API:

- res_counter_charge() and res_counter_charge_nofail() become
  page_counter_try_charge() and page_counter_charge() respectively, to
  match the more common kernel naming scheme of try_do()/do()

- res_counter_uncharge_until() is only ever used to cancel a local
  counter and never to uncharge bigger segments of a hierarchy, so
  it's replaced by the simpler page_counter_cancel()

- res_counter_set_limit() is replaced by page_counter_limit(), which
  expects its callers to serialize against themselves

- res_counter_memparse_write_strategy() is replaced by
  page_counter_memparse(), which rounds down to the nearest page size
  rather than up. This is more reasonable for explicitly requested
  hard upper limits.

- to keep charging light-weight, page_counter_try_charge() charges
  speculatively, only to roll back if the result exceeds the limit.
  Because of this, a failing bigger charge can temporarily lock out
  smaller charges that would otherwise succeed. The error is bounded
  to the difference between the smallest and the biggest possible
  charge size, so for memcg, this means that a failing THP charge can
  send base page charges into reclaim up to 2MB (4MB) before the limit
  would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
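To make the speculative try-charge scheme described in the last bullet
concrete, here is a minimal userspace sketch using C11 atomics. The
structure layout and function names echo the page counter API this
patch introduces, but the hierarchy walk is condensed and bookkeeping
such as watermarks and failcnt is omitted; this is an illustration
under simplified assumptions, not the kernel implementation.

/* Minimal userspace sketch of the speculative try-charge scheme.
 * C11 atomics stand in for the kernel's atomic_long_t; names follow
 * the page counter API, but this is an illustration, not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct page_counter {
	atomic_long count;           /* pages currently charged */
	long limit;                  /* hard upper limit, in pages */
	struct page_counter *parent; /* hierarchy; NULL at the root */
};

/* Undo a local charge; the rollback half of a speculative charge. */
static void page_counter_cancel(struct page_counter *c, long nr_pages)
{
	atomic_fetch_sub(&c->count, nr_pages);
}

/* Charge first, check the limit after. On failure, roll back the
 * levels already charged. A concurrent bigger charge can briefly push
 * the counter over the limit and lock out smaller charges - the
 * bounded error the commit message describes. */
static bool page_counter_try_charge(struct page_counter *counter, long nr_pages)
{
	struct page_counter *c, *failed;

	for (c = counter; c; c = c->parent) {
		long new = atomic_fetch_add(&c->count, nr_pages) + nr_pages;

		if (new > c->limit) {
			atomic_fetch_sub(&c->count, nr_pages);
			failed = c;
			goto rollback;
		}
	}
	return true;

rollback:
	for (c = counter; c != failed; c = c->parent)
		page_counter_cancel(c, nr_pages);
	return false;
}

int main(void)
{
	struct page_counter root  = { .limit = 4, .parent = NULL };
	struct page_counter child = { .limit = 2, .parent = &root };

	printf("charge 2 pages: %d\n", page_counter_try_charge(&child, 2)); /* 1: fits */
	printf("charge 1 page:  %d\n", page_counter_try_charge(&child, 1)); /* 0: child over limit */
	return 0;
}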
Diffstat (limited to 'include/net/sock.h')
-rw-r--r--    include/net/sock.h    26
1 file changed, 9 insertions(+), 17 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index e6f235ebf6c9..7ff44e062a38 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -54,8 +54,8 @@
 #include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/page_counter.h>
 #include <linux/memcontrol.h>
-#include <linux/res_counter.h>
 #include <linux/static_key.h>
 #include <linux/aio.h>
 #include <linux/sched.h>
@@ -1062,7 +1062,7 @@ enum cg_proto_flags {
 };
 
 struct cg_proto {
-	struct res_counter	memory_allocated;	/* Current allocated memory. */
+	struct page_counter	memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	sockets_allocated;	/* Current number of sockets. */
 	int			memory_pressure;
 	long			sysctl_mem[3];
@@ -1214,34 +1214,26 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
 					      unsigned long amt,
 					      int *parent_status)
 {
-	struct res_counter *fail;
-	int ret;
+	page_counter_charge(&prot->memory_allocated, amt);
 
-	ret = res_counter_charge_nofail(&prot->memory_allocated,
-					amt << PAGE_SHIFT, &fail);
-	if (ret < 0)
+	if (page_counter_read(&prot->memory_allocated) >
+	    prot->memory_allocated.limit)
 		*parent_status = OVER_LIMIT;
 }
 
 static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
 					      unsigned long amt)
 {
-	res_counter_uncharge(&prot->memory_allocated, amt << PAGE_SHIFT);
-}
-
-static inline u64 memcg_memory_allocated_read(struct cg_proto *prot)
-{
-	u64 ret;
-	ret = res_counter_read_u64(&prot->memory_allocated, RES_USAGE);
-	return ret >> PAGE_SHIFT;
+	page_counter_uncharge(&prot->memory_allocated, amt);
 }
 
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
+
 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return memcg_memory_allocated_read(sk->sk_cgrp);
+		return page_counter_read(&sk->sk_cgrp->memory_allocated);
 
 	return atomic_long_read(prot->memory_allocated);
 }
@@ -1255,7 +1247,7 @@ sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status)
 		memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status);
 		/* update the root cgroup regardless */
 		atomic_long_add_return(amt, prot->memory_allocated);
-		return memcg_memory_allocated_read(sk->sk_cgrp);
+		return page_counter_read(&sk->sk_cgrp->memory_allocated);
 	}
 
 	return atomic_long_add_return(amt, prot->memory_allocated);
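The memcg_memory_allocated_add() hunk above is the nofail path: the
charge is applied unconditionally with page_counter_charge(), and
exceeding the limit is reported through *parent_status instead of
failing the charge. The amt << PAGE_SHIFT conversions also disappear,
because the new counter accounts pages rather than bytes. Below is a
self-contained userspace sketch of that pattern; OVER_LIMIT and the
helpers are stand-ins for illustration, not the kernel's definitions.

/* Userspace sketch of the charge-then-check pattern above: the charge
 * never fails; crossing the limit only raises a pressure signal.
 * OVER_LIMIT is a stand-in for the kernel's enum value. */
#include <stdatomic.h>

#define OVER_LIMIT 2

struct page_counter {
	atomic_long count; /* pages charged */
	long limit;        /* pages allowed */
};

static void page_counter_charge(struct page_counter *c, long nr_pages)
{
	atomic_fetch_add(&c->count, nr_pages); /* unconditional ("nofail") */
}

static long page_counter_read(struct page_counter *c)
{
	return atomic_load(&c->count);
}

static void memory_allocated_add(struct page_counter *c, long amt,
				 int *parent_status)
{
	page_counter_charge(c, amt);

	/* Racy read, by design: a stale value only delays the signal. */
	if (page_counter_read(c) > c->limit)
		*parent_status = OVER_LIMIT;
}

int main(void)
{
	struct page_counter c = { .limit = 2 };
	int status = 0;

	memory_allocated_add(&c, 3, &status); /* overshoots: sets OVER_LIMIT */
	return status == OVER_LIMIT ? 0 : 1;
}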