author		Vladimir Davydov <vdavydov@virtuozzo.com>	2016-01-20 18:02:56 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-20 20:09:18 -0500
commit		37e84351198be087335ad2b2253b35c7cc76a5ad (patch)
tree		3f7cfe687fdc86bea76f2e47787ff1f7c79bef23 /mm
parent		0b8f73e104285a4badf9d768d1c97b06d77d1f97 (diff)
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.

This patch (of 7):

In the legacy hierarchy we charge memsw, which is dubious, because:

 - memsw.limit must be >= memory.limit, so it is impossible to limit
   swap usage to less than memory usage. Taking into account the fact
   that the primary limiting mechanism in the unified hierarchy is
   memory.high, while memory.limit is either left unset or set to a
   very large value, moving the memsw.limit knob to the unified
   hierarchy would effectively make it impossible to limit swap usage
   according to the user preference.

 - memsw.usage != memory.usage + swap.usage, because a page occupying
   both a swap entry and a swap cache page is charged only once to the
   memsw counter. As a result, it is possible to effectively eat up to
   memory.limit of memory pages *and* memsw.limit of swap entries,
   which looks unexpected.

That said, we should provide a different swap limiting mechanism for
cgroup2.

This patch adds a mem_cgroup->swap counter, which charges the actual
number of swap entries used by a cgroup. It is only charged in the
unified hierarchy, while the legacy hierarchy memsw logic is left
intact.

The swap usage can be monitored using the new memory.swap.current file
and limited using memory.swap.max.

Note, to charge swap properly in the unified hierarchy, we have to make
swap_entry_free uncharge swap only when ->usage reaches zero, not just
->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary because otherwise
swap-in could result in uncharging swap even if the page is still in
swap cache and hence still occupies a swap entry. At the same time,
this shouldn't break the memsw counter logic, where a page is never
charged twice for using both memory and swap, because in the legacy
hierarchy we uncharge swap on commit (see mem_cgroup_commit_charge).

Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
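For illustration, here is a minimal user-space sketch (not part of this patch) that exercises the new files: it caps a cgroup's swap at 64M and reads back the current usage. The cgroup2 mount point /sys/fs/cgroup and the child cgroup "test" are assumptions made for the example.

#include <stdio.h>

/*
 * Hypothetical paths: assumes cgroup2 is mounted at /sys/fs/cgroup
 * and a child cgroup named "test" already exists.
 */
#define SWAP_MAX	"/sys/fs/cgroup/test/memory.swap.max"
#define SWAP_CURRENT	"/sys/fs/cgroup/test/memory.swap.current"

int main(void)
{
	char buf[64];
	FILE *f;

	/* Limit swap usage to 64M; writing "max" would remove the limit. */
	f = fopen(SWAP_MAX, "w");
	if (!f) {
		perror(SWAP_MAX);
		return 1;
	}
	fputs("67108864\n", f);
	fclose(f);

	/* Current swap usage is reported in bytes. */
	f = fopen(SWAP_CURRENT, "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("memory.swap.current: %s", buf);
	if (f)
		fclose(f);
	return 0;
}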
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	118
-rw-r--r--	mm/shmem.c	4
-rw-r--r--	mm/swap_state.c	5
-rw-r--r--	mm/swapfile.c	4
4 files changed, 121 insertions(+), 10 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f6bc78f4ed13..1ff552e3722b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1220,7 +1220,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 		pr_cont(":");
 
 		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-			if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 				continue;
 			pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
 				K(mem_cgroup_read_stat(iter, i)));
@@ -1259,9 +1259,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 	limit = memcg->memory.limit;
 	if (mem_cgroup_swappiness(memcg)) {
 		unsigned long memsw_limit;
+		unsigned long swap_limit;
 
 		memsw_limit = memcg->memsw.limit;
-		limit = min(limit + total_swap_pages, memsw_limit);
+		swap_limit = memcg->swap.limit;
+		swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
+		limit = min(limit + swap_limit, memsw_limit);
 	}
 	return limit;
 }
@@ -4201,11 +4204,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (parent && parent->use_hierarchy) {
 		memcg->use_hierarchy = true;
 		page_counter_init(&memcg->memory, &parent->memory);
+		page_counter_init(&memcg->swap, &parent->swap);
 		page_counter_init(&memcg->memsw, &parent->memsw);
 		page_counter_init(&memcg->kmem, &parent->kmem);
 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
 	} else {
 		page_counter_init(&memcg->memory, NULL);
+		page_counter_init(&memcg->swap, NULL);
 		page_counter_init(&memcg->memsw, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
@@ -5224,7 +5229,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 	if (page->mem_cgroup)
 		goto out;
 
-	if (do_memsw_account()) {
+	if (do_swap_account) {
 		swp_entry_t ent = { .val = page_private(page), };
 		unsigned short id = lookup_swap_cgroup_id(ent);
 
@@ -5677,26 +5682,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	memcg_check_events(memcg, page);
 }
 
+/*
+ * mem_cgroup_try_charge_swap - try charging a swap entry
+ * @page: page being added to swap
+ * @entry: swap entry to charge
+ *
+ * Try to charge @entry to the memcg that @page belongs to.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+	struct mem_cgroup *memcg;
+	struct page_counter *counter;
+	unsigned short oldid;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+		return 0;
+
+	memcg = page->mem_cgroup;
+
+	/* Readahead page, never charged */
+	if (!memcg)
+		return 0;
+
+	if (!mem_cgroup_is_root(memcg) &&
+	    !page_counter_try_charge(&memcg->swap, 1, &counter))
+		return -ENOMEM;
+
+	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+	VM_BUG_ON_PAGE(oldid, page);
+	mem_cgroup_swap_statistics(memcg, true);
+
+	css_get(&memcg->css);
+	return 0;
+}
+
 /**
  * mem_cgroup_uncharge_swap - uncharge a swap entry
  * @entry: swap entry to uncharge
  *
- * Drop the memsw charge associated with @entry.
+ * Drop the swap charge associated with @entry.
  */
 void mem_cgroup_uncharge_swap(swp_entry_t entry)
 {
 	struct mem_cgroup *memcg;
 	unsigned short id;
 
-	if (!do_memsw_account())
+	if (!do_swap_account)
 		return;
 
 	id = swap_cgroup_record(entry, 0);
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (memcg) {
-		if (!mem_cgroup_is_root(memcg))
-			page_counter_uncharge(&memcg->memsw, 1);
+		if (!mem_cgroup_is_root(memcg)) {
+			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+				page_counter_uncharge(&memcg->swap, 1);
+			else
+				page_counter_uncharge(&memcg->memsw, 1);
+		}
 		mem_cgroup_swap_statistics(memcg, false);
 		css_put(&memcg->css);
 	}
@@ -5720,6 +5765,63 @@ static int __init enable_swap_account(char *s)
 }
 __setup("swapaccount=", enable_swap_account);
 
+static u64 swap_current_read(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
+}
+
+static int swap_max_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	unsigned long max = READ_ONCE(memcg->swap.limit);
+
+	if (max == PAGE_COUNTER_MAX)
+		seq_puts(m, "max\n");
+	else
+		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
+
+	return 0;
+}
+
+static ssize_t swap_max_write(struct kernfs_open_file *of,
+			      char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long max;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &max);
+	if (err)
+		return err;
+
+	mutex_lock(&memcg_limit_mutex);
+	err = page_counter_limit(&memcg->swap, max);
+	mutex_unlock(&memcg_limit_mutex);
+	if (err)
+		return err;
+
+	return nbytes;
+}
+
+static struct cftype swap_files[] = {
+	{
+		.name = "swap.current",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = swap_current_read,
+	},
+	{
+		.name = "swap.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = swap_max_show,
+		.write = swap_max_write,
+	},
+	{ }	/* terminate */
+};
+
 static struct cftype memsw_cgroup_files[] = {
 	{
 		.name = "memsw.usage_in_bytes",
@@ -5751,6 +5853,8 @@ static int __init mem_cgroup_swap_init(void)
 {
 	if (!mem_cgroup_disabled() && really_do_swap_account) {
 		do_swap_account = 1;
+		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
+					       swap_files));
 		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
 						  memsw_cgroup_files));
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index b98e1011858c..fa2ceb2d2655 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	if (!swap.val)
 		goto redirty;
 
+	if (mem_cgroup_try_charge_swap(page, swap))
+		goto free_swap;
+
 	/*
 	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
 	 * if it's not already there. Do it now before the page is
@@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	mutex_unlock(&shmem_swaplist_mutex);
+free_swap:
 	swapcache_free(swap);
 redirty:
 	set_page_dirty(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 676ff2991380..69cb2464e7dc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list)
 	if (!entry.val)
 		return 0;
 
+	if (mem_cgroup_try_charge_swap(page, entry)) {
+		swapcache_free(entry);
+		return 0;
+	}
+
 	if (unlikely(PageTransHuge(page)))
 		if (unlikely(split_huge_page_to_list(page, list))) {
 			swapcache_free(entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2bb30aa3a412..22a7a1fc1e47 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -785,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 		count--;
 	}
 
-	if (!count)
-		mem_cgroup_uncharge_swap(entry);
-
 	usage = count | has_cache;
 	p->swap_map[offset] = usage;
 
 	/* free if no reference */
 	if (!usage) {
+		mem_cgroup_uncharge_swap(entry);
 		dec_cluster_info_page(p, p->cluster_info, offset);
 		if (offset < p->lowest_bit)
 			p->lowest_bit = offset;
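To see why moving the uncharge from !count to !usage matters, the toy model below (plain user-space C, not kernel code) mimics the swap_map byte used above: the low bits hold the reference count and SWAP_HAS_CACHE marks the swap cache reference, so the entry stays charged until both are gone. The helper name is hypothetical; only SWAP_HAS_CACHE and the usage = count | has_cache encoding come from the kernel.

#include <stdio.h>

#define SWAP_HAS_CACHE	0x40	/* swap cache still references the entry */

/* Hypothetical helper mimicking the bookkeeping in swap_entry_free(). */
static void drop_ref(unsigned char *map, int drop_cache)
{
	unsigned char count = *map & ~SWAP_HAS_CACHE;
	unsigned char has_cache = *map & SWAP_HAS_CACHE;

	if (drop_cache)
		has_cache = 0;
	else if (count)
		count--;

	*map = count | has_cache;	/* this is "usage" above */
	if (!*map)
		printf("usage == 0: mem_cgroup_uncharge_swap()\n");
	else
		printf("usage == 0x%02x: entry still charged\n", *map);
}

int main(void)
{
	/* One page-table reference plus the swap cache reference. */
	unsigned char map = 1 | SWAP_HAS_CACHE;

	drop_ref(&map, 0);	/* swap-in drops the map ref: still charged */
	drop_ref(&map, 1);	/* swap cache reference gone: uncharge now */
	return 0;
}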