path: root/mm/shmem.c
author		Johannes Weiner <hannes@cmpxchg.org>	2014-08-08 17:19:20 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-08-08 18:57:17 -0400
commit		00501b531c4723972aa11d6d4ebcf8d6552007c8 (patch)
tree		b3ad4850d58f137cf87b8424412d962fb251839f /mm/shmem.c
parent		4449a51a7c281602d3a385044ab928322a122a02 (diff)
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages.  This drastically simplifies the code
and reduces charging and uncharging overhead.  The most expensive part
of charging and uncharging is the page_cgroup bit spinlock, which is
removed entirely after this series.

Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup
(i.e. executing in the root memcg).

Before:

    15.36%              cat  [kernel.kallsyms]   [k] copy_user_generic_string
    13.31%              cat  [kernel.kallsyms]   [k] memset
    11.48%              cat  [kernel.kallsyms]   [k] do_mpage_readpage
     4.23%              cat  [kernel.kallsyms]   [k] get_page_from_freelist
     2.38%              cat  [kernel.kallsyms]   [k] put_page
     2.32%              cat  [kernel.kallsyms]   [k] __mem_cgroup_commit_charge
     2.18%          kswapd0  [kernel.kallsyms]   [k] __mem_cgroup_uncharge_common
     1.92%          kswapd0  [kernel.kallsyms]   [k] shrink_page_list
     1.86%              cat  [kernel.kallsyms]   [k] __radix_tree_lookup
     1.62%              cat  [kernel.kallsyms]   [k] __pagevec_lru_add_fn

After:

    15.67%              cat  [kernel.kallsyms]   [k] copy_user_generic_string
    13.48%              cat  [kernel.kallsyms]   [k] memset
    11.42%              cat  [kernel.kallsyms]   [k] do_mpage_readpage
     3.98%              cat  [kernel.kallsyms]   [k] get_page_from_freelist
     2.46%              cat  [kernel.kallsyms]   [k] put_page
     2.13%          kswapd0  [kernel.kallsyms]   [k] shrink_page_list
     1.88%              cat  [kernel.kallsyms]   [k] __radix_tree_lookup
     1.67%              cat  [kernel.kallsyms]   [k] __pagevec_lru_add_fn
     1.39%          kswapd0  [kernel.kallsyms]   [k] free_pcppages_bulk
     1.30%              cat  [kernel.kallsyms]   [k] kfree

As you can see, the memcg footprint has shrunk quite a bit.

       text    data     bss     dec     hex filename
      37970    9892     400   48262    bc86 mm/memcontrol.o.old
      35239    9892     400   45531    b1db mm/memcontrol.o

This patch (of 4):

The memcg charge API charges pages before they are rmapped - i.e. have
an actual "type" - and so every callsite needs its own set of charge
and uncharge functions to know what type is being operated on.  Worse,
uncharge has to happen from a context that is still type-specific,
rather than at the end of the page's lifetime with exclusive access,
and so requires a lot of synchronization.

Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:

  mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
  pages from the memcg if necessary.

  mem_cgroup_commit_charge() commits the page to the charge once it
  has a valid page->mapping and PageAnon() reliably tells the type.

  mem_cgroup_cancel_charge() aborts the transaction.

This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.

As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again.  Revive lru_cache_add_active_or_unevictable().

[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
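To make the required ordering concrete, here is a minimal sketch of how
a caller is expected to use the new transaction, modeled on the series'
anonymous-fault paths.  The wrapper function and its parameter list are
invented for illustration; the mem_cgroup_*, rmap and LRU calls are the
ones this series establishes:

	static int example_charge_new_anon_page(struct page *page,
						struct mm_struct *mm,
						struct vm_area_struct *vma,
						unsigned long addr, gfp_t gfp)
	{
		struct mem_cgroup *memcg;

		/* Reserve a charge; this may reclaim from the memcg. */
		if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
			return -ENOMEM;

		/* Give the page its type first: establish the rmap... */
		page_add_new_anon_rmap(page, vma, addr);

		/* ...then commit; lrucare=false, page not on the LRU yet. */
		mem_cgroup_commit_charge(page, memcg, false);

		/* Only now may the page be added to the LRU. */
		lru_cache_add_active_or_unevictable(page, vma);
		return 0;
	}

If the caller fails between try_charge() and commit, it must call
mem_cgroup_cancel_charge(page, memcg) to release the reservation, as
the shmem hunks below do.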
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--	mm/shmem.c	37
1 file changed, 23 insertions(+), 14 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 302d1cf7ad07..1f1a8085538b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -621,7 +621,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
 	if (index == -1)
-		return 0;
+		return -EAGAIN;	/* tell shmem_unuse we found nothing */
 
 	/*
 	 * Move _head_ to start search for next from here.
@@ -680,7 +680,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 			spin_unlock(&info->lock);
 			swap_free(swap);
 		}
-		error = 1;	/* not an error, but entry was found */
 	}
 	return error;
 }
@@ -692,7 +691,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 {
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
-	int found = 0;
+	struct mem_cgroup *memcg;
 	int error = 0;
 
 	/*
@@ -707,26 +706,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
 	 * Charged back to the user (not to caller) when swap account is used.
 	 */
-	error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
+	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
 	if (error)
 		goto out;
 	/* No radix_tree_preload: swap entry keeps a place for page in tree */
+	error = -EAGAIN;
 
 	mutex_lock(&shmem_swaplist_mutex);
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, &page);
+			error = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
-		if (found)
+		if (error != -EAGAIN)
 			break;
+		/* found nothing in this: move on to search the next */
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (found < 0)
-		error = found;
+	if (error) {
+		if (error != -ENOMEM)
+			error = 0;
+		mem_cgroup_cancel_charge(page, memcg);
+	} else
+		mem_cgroup_commit_charge(page, memcg, true);
 out:
 	unlock_page(page);
 	page_cache_release(page);
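(A reader's note on the -EAGAIN convention above, which the commit
message flags as private to this file: shmem_unuse_inode() now returns
-EAGAIN for "found nothing here, keep searching", 0 once the swap entry
was found and reinstated, and a negative errno on failure.  After the
loop, everything except -ENOMEM is folded back into 0 so that
try_to_unuse() carries on, and the reserved charge is cancelled
whenever the page was not committed.  Condensed, eliding unchanged
lines:

	error = -EAGAIN;		/* nothing found yet */
	list_for_each_safe(this, next, &shmem_swaplist) {
		...
		error = shmem_unuse_inode(info, swap, &page);
		...
		if (error != -EAGAIN)	/* success, or hard error */
			break;
	}
	if (error) {
		if (error != -ENOMEM)
			error = 0;	/* not found: not an error */
		mem_cgroup_cancel_charge(page, memcg);
	} else
		mem_cgroup_commit_charge(page, memcg, true);
)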
@@ -1030,6 +1035,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo;
+	struct mem_cgroup *memcg;
 	struct page *page;
 	swp_entry_t swap;
 	int error;
@@ -1108,8 +1114,7 @@ repeat:
 			goto failed;
 		}
 
-		error = mem_cgroup_charge_file(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						swp_to_radix_entry(swap));
@@ -1125,12 +1130,16 @@ repeat:
 			 * Reset swap.val? No, leave it so "failed" goes back to
 			 * "repeat": reading a hole and writing should succeed.
 			 */
-			if (error)
+			if (error) {
+				mem_cgroup_cancel_charge(page, memcg);
 				delete_from_swap_cache(page);
+			}
 		}
 		if (error)
 			goto failed;
 
+		mem_cgroup_commit_charge(page, memcg, true);
+
 		spin_lock(&info->lock);
 		info->swapped--;
 		shmem_recalc_inode(inode);
@@ -1168,8 +1177,7 @@ repeat:
 		if (sgp == SGP_WRITE)
 			__SetPageReferenced(page);
 
-		error = mem_cgroup_charge_file(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (error)
 			goto decused;
 		error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1179,9 +1187,10 @@ repeat:
 			radix_tree_preload_end();
 		}
 		if (error) {
-			mem_cgroup_uncharge_cache_page(page);
+			mem_cgroup_cancel_charge(page, memcg);
 			goto decused;
 		}
+		mem_cgroup_commit_charge(page, memcg, false);
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
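
(A note on the boolean passed to mem_cgroup_commit_charge() in the
hunks above: it is the lrucare flag.  A freshly allocated shmem page is
not on the LRU yet, so the allocation path commits with lrucare=false
and only then calls lru_cache_add_anon().  A page brought back through
the swap cache may already sit on an LRU, so the swap-in paths commit
with lrucare=true, which, as far as this series' implementation goes,
briefly isolates the page under the zone's lru_lock so it can be moved
to the correct lruvec.  The two cases, using only calls visible in the
diff:

	/* Fresh allocation: page not yet on the LRU. */
	mem_cgroup_commit_charge(page, memcg, false);
	lru_cache_add_anon(page);

	/* Swap-in: page may already be on the LRU via the swap cache. */
	mem_cgroup_commit_charge(page, memcg, true);
)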