author		Michal Hocko <mhocko@suse.cz>	2014-10-09 18:28:52 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-09 22:25:59 -0400
commit		aabfb57296e3dd9761e47736ec69305c95461d7d
tree		379e66feb872f9f42b44b3245e52cb16ab3194bb /mm
parent		01c2965f0723a25209d5cf4cac630ed0f6d0edf4
mm: memcontrol: do not kill uncharge batching in free_pages_and_swap_cache
free_pages_and_swap_cache() limits release_pages() to PAGEVEC_SIZE chunks.
This is not a big deal for the normal release path, but it completely kills
memcg uncharge batching, which is there to reduce res_counter spin_lock
contention. Dave noticed this with his page-fault scalability test case on
a large machine, where the lock was basically dominating on all CPUs:
    80.18%    80.18%  [kernel]  [k] _raw_spin_lock
              |
              --- _raw_spin_lock
                  |
                  |--66.59%-- res_counter_uncharge_until
                  |           res_counter_uncharge
                  |           uncharge_batch
                  |           uncharge_list
                  |           mem_cgroup_uncharge_list
                  |           release_pages
                  |           free_pages_and_swap_cache
                  |           tlb_flush_mmu_free
                  |           |
                  |           |--90.12%-- unmap_single_vma
                  |           |           unmap_vmas
                  |           |           unmap_region
                  |           |           do_munmap
                  |           |           vm_munmap
                  |           |           sys_munmap
                  |           |           system_call_fastpath
                  |           |           __GI___munmap
                  |           |
                  |            --9.88%-- tlb_flush_mmu
                  |                      tlb_finish_mmu
                  |                      unmap_region
                  |                      do_munmap
                  |                      vm_munmap
                  |                      sys_munmap
                  |                      system_call_fastpath
                  |                      __GI___munmap
In his case the load was running in the root memcg, and that part has been
handled by reverting commit 05b843012335 ("mm: memcontrol: use
root_mem_cgroup res_counter"), since that was a clear regression; the
problem remains inside dedicated memcgs, however.
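
To see why the chunking hurts, compare the lock traffic of the two shapes
(a schematic sketch only, not the kernel code; uncharge_pages_locked() is a
hypothetical stand-in for the res_counter round trip done in uncharge_batch()):

	/* Hypothetical illustration -- not the kernel implementation. */

	/* Chunked: one res_counter lock round trip per 14-page chunk. */
	static void uncharge_chunked(struct page **pages, int nr)
	{
		while (nr) {
			int todo = min(nr, PAGEVEC_SIZE);

			uncharge_pages_locked(pages, todo);
			pages += todo;
			nr -= todo;
		}
	}

	/* Batched: a single round trip for the whole list. */
	static void uncharge_whole_list(struct page **pages, int nr)
	{
		uncharge_pages_locked(pages, nr);
	}

For a 448-page munmap, the chunked variant takes the res_counter lock 32
times where one batch would do; multiplied across many CPUs unmapping at
once, that difference is the _raw_spin_lock time in the profile above.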
There is no reason to limit release_pages() to PAGEVEC_SIZE batches other
than to bound lru_lock hold times, and that logic can be moved inside the
function. mem_cgroup_uncharge_list() and free_hot_cold_page_list() do not
hold any lock over the whole pages_to_free list, so it is safe to call them
in a single run.
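
With the batching gone, free_pages_and_swap_cache() collapses to a single
pass over the list (this is the result of the mm/swap_state.c hunk below):

	void free_pages_and_swap_cache(struct page **pages, int nr)
	{
		struct page **pagep = pages;
		int i;

		lru_add_drain();
		for (i = 0; i < nr; i++)
			free_swap_cache(pagep[i]);
		release_pages(pagep, nr, false);
	}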
release_pages() previously broke the lru_lock every PAGEVEC_SIZE (i.e., 14)
pages. However, this code makes no use of pagevecs, so switch to breaking
the lock at least every SWAP_CLUSTER_MAX (32) pages. This approximately
halves the lock acquisition frequency while approximately doubling the
maximum hold times.
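
Concretely, the loop in release_pages() now drops and reacquires the
IRQ-safe lock every SWAP_CLUSTER_MAX pages from the same zone (excerpted
from the mm/swap.c hunk below):

	/*
	 * Make sure the IRQ-safe lock-holding time does not get
	 * excessive with a continuous string of pages from the
	 * same zone. The lock is held only if zone != NULL.
	 */
	if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
		spin_unlock_irqrestore(&zone->lru_lock, flags);
		zone = NULL;
	}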
The now unneeded batching is removed from free_pages_and_swap_cache().
Also update the grossly out-of-date release_pages documentation.
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Dave Hansen <dave@sr71.net>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')

 mm/swap.c       | 30 +++++++++++++++++++-----------
 mm/swap_state.c | 14 ++++----------
 2 files changed, 23 insertions(+), 21 deletions(-)
diff --git a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -887,18 +887,14 @@ void lru_add_drain_all(void)
 	mutex_unlock(&lock);
 }
 
-/*
- * Batched page_cache_release().  Decrement the reference count on all the
- * passed pages.  If it fell to zero then remove the page from the LRU and
- * free it.
- *
- * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
- * for the remainder of the operation.
+/**
+ * release_pages - batched page_cache_release()
+ * @pages: array of pages to release
+ * @nr: number of pages
+ * @cold: whether the pages are cache cold
 *
- * The locking in this function is against shrink_inactive_list(): we recheck
- * the page count inside the lock to see whether shrink_inactive_list()
- * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
- * will free it.
+ * Decrement the reference count on all the pages in @pages.  If it
+ * fell to zero, remove the page from the LRU and free it.
  */
 void release_pages(struct page **pages, int nr, bool cold)
 {
@@ -907,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold)
 	struct zone *zone = NULL;
 	struct lruvec *lruvec;
 	unsigned long uninitialized_var(flags);
+	unsigned int uninitialized_var(lock_batch);
 
 	for (i = 0; i < nr; i++) {
 		struct page *page = pages[i];
@@ -920,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold)
 			continue;
 		}
 
+		/*
+		 * Make sure the IRQ-safe lock-holding time does not get
+		 * excessive with a continuous string of pages from the
+		 * same zone. The lock is held only if zone != NULL.
+		 */
+		if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
+			spin_unlock_irqrestore(&zone->lru_lock, flags);
+			zone = NULL;
+		}
+
 		if (!put_page_testzero(page))
 			continue;
 
@@ -930,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold)
 			if (zone)
 				spin_unlock_irqrestore(&zone->lru_lock,
 						       flags);
+			lock_batch = 0;
 			zone = pagezone;
 			spin_lock_irqsave(&zone->lru_lock, flags);
 		}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ef1f39139b71..154444918685 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -265,18 +265,12 @@ void free_page_and_swap_cache(struct page *page)
 void free_pages_and_swap_cache(struct page **pages, int nr)
 {
 	struct page **pagep = pages;
+	int i;
 
 	lru_add_drain();
-	while (nr) {
-		int todo = min(nr, PAGEVEC_SIZE);
-		int i;
-
-		for (i = 0; i < todo; i++)
-			free_swap_cache(pagep[i]);
-		release_pages(pagep, todo, false);
-		pagep += todo;
-		nr -= todo;
-	}
+	for (i = 0; i < nr; i++)
+		free_swap_cache(pagep[i]);
+	release_pages(pagep, nr, false);
 }
 
 /*