author      David Rientjes <rientjes@google.com>        2017-02-22 18:45:49 -0500
committer   Linus Torvalds <torvalds@linux-foundation.org>  2017-02-22 19:41:30 -0500
commit      21440d7eb9044001b7fdb71d0163689f60a0f2a1 (patch)
tree        b74142922a0bdd6f430776cebf351c09c41302e1 /mm/huge_memory.c
parent      ba81f83842549871cbd7226fc11530dc464500bb (diff)
mm, thp: add new defer+madvise defrag option
There is no thp defrag option that currently allows MADV_HUGEPAGE regions to do direct compaction and reclaim while all other thp allocations simply trigger kswapd and kcompactd in the background and fail immediately.

The "defer" setting simply triggers background reclaim and compaction for all regions, regardless of MADV_HUGEPAGE, which makes it unusable for our userspace where MADV_HUGEPAGE is being used to indicate the application is willing to wait for work to make thp memory available.

The "madvise" setting will do direct compaction and reclaim for these MADV_HUGEPAGE regions, but does not trigger kswapd and kcompactd in the background for anybody else.

For reasonable usage, there needs to be a mesh between the two options. This patch introduces a fifth mode, "defer+madvise", that will do direct reclaim and compaction for MADV_HUGEPAGE regions and trigger background reclaim and compaction for everybody else, so that hugepages may be available in the near future.

A proposal to allow direct reclaim and compaction for MADV_HUGEPAGE regions as part of the "defer" mode, making it a very powerful setting while avoiding breaking userspace, was offered: http://marc.info/?t=148236612700003. This additional mode is a compromise.

A second proposal to allow both "defer" and "madvise" to be selected at the same time was also offered: http://marc.info/?t=148357345300001. This is possible, but there was a concern that it might break existing userspaces that parse the output of the defrag mode, so the fifth option was introduced instead.

This patch also cleans up the helper function for storing to "enabled" and "defrag", since the former supports three modes while the latter supports five and triple_flag_store() was getting unnecessarily messy.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701101614330.41805@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
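As a usage sketch (not part of this patch): with the new mode, an administrator writes "defer+madvise" to the existing sysfs defrag knob and an application opts its latency-tolerant region in with madvise(MADV_HUGEPAGE), so only that region may stall for direct reclaim/compaction. The sysfs path and MADV_HUGEPAGE are the standard THP interfaces; the mapping size and minimal error handling below are illustrative only.

/*
 * Usage sketch (illustrative, not part of this patch): select the new
 * defrag mode, then madvise one region so only it may stall for direct
 * reclaim/compaction on a thp fault.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4UL << 20;        /* illustrative 4MB anonymous mapping */
        void *p;
        int fd;

        /* Needs root: write the new mode to the existing sysfs knob. */
        fd = open("/sys/kernel/mm/transparent_hugepage/defrag", O_WRONLY);
        if (fd >= 0) {
                write(fd, "defer+madvise", strlen("defer+madvise"));
                close(fd);
        }

        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p != MAP_FAILED)
                madvise(p, len, MADV_HUGEPAGE);  /* this VMA may stall for thp */
        return 0;
}

With "defer+madvise" selected, faults in the madvised range use __GFP_DIRECT_RECLAIM while all other thp faults only get __GFP_KSWAPD_RECLAIM, matching the new alloc_hugepage_direct_gfpmask() below.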
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--    mm/huge_memory.c    146
1 file changed, 74 insertions(+), 72 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5f3ad65c85de..f9ecc2aeadfc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -142,42 +142,6 @@ static struct shrinker huge_zero_page_shrinker = {
 };
 
 #ifdef CONFIG_SYSFS
-
-static ssize_t triple_flag_store(struct kobject *kobj,
-                                 struct kobj_attribute *attr,
-                                 const char *buf, size_t count,
-                                 enum transparent_hugepage_flag enabled,
-                                 enum transparent_hugepage_flag deferred,
-                                 enum transparent_hugepage_flag req_madv)
-{
-        if (!memcmp("defer", buf,
-                    min(sizeof("defer")-1, count))) {
-                if (enabled == deferred)
-                        return -EINVAL;
-                clear_bit(enabled, &transparent_hugepage_flags);
-                clear_bit(req_madv, &transparent_hugepage_flags);
-                set_bit(deferred, &transparent_hugepage_flags);
-        } else if (!memcmp("always", buf,
-                   min(sizeof("always")-1, count))) {
-                clear_bit(deferred, &transparent_hugepage_flags);
-                clear_bit(req_madv, &transparent_hugepage_flags);
-                set_bit(enabled, &transparent_hugepage_flags);
-        } else if (!memcmp("madvise", buf,
-                   min(sizeof("madvise")-1, count))) {
-                clear_bit(enabled, &transparent_hugepage_flags);
-                clear_bit(deferred, &transparent_hugepage_flags);
-                set_bit(req_madv, &transparent_hugepage_flags);
-        } else if (!memcmp("never", buf,
-                   min(sizeof("never")-1, count))) {
-                clear_bit(enabled, &transparent_hugepage_flags);
-                clear_bit(req_madv, &transparent_hugepage_flags);
-                clear_bit(deferred, &transparent_hugepage_flags);
-        } else
-                return -EINVAL;
-
-        return count;
-}
-
 static ssize_t enabled_show(struct kobject *kobj,
                             struct kobj_attribute *attr, char *buf)
 {
@@ -193,19 +157,28 @@ static ssize_t enabled_store(struct kobject *kobj,
                              struct kobj_attribute *attr,
                              const char *buf, size_t count)
 {
-        ssize_t ret;
+        ssize_t ret = count;
 
-        ret = triple_flag_store(kobj, attr, buf, count,
-                                TRANSPARENT_HUGEPAGE_FLAG,
-                                TRANSPARENT_HUGEPAGE_FLAG,
-                                TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+        if (!memcmp("always", buf,
+                    min(sizeof("always")-1, count))) {
+                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+                set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+        } else if (!memcmp("madvise", buf,
+                   min(sizeof("madvise")-1, count))) {
+                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+        } else if (!memcmp("never", buf,
+                   min(sizeof("never")-1, count))) {
+                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+        } else
+                ret = -EINVAL;
 
         if (ret > 0) {
                 int err = start_stop_khugepaged();
                 if (err)
                         ret = err;
         }
-
         return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -241,32 +214,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
         return count;
 }
 
-/*
- * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
- * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
- * memory just to allocate one more hugepage.
- */
 static ssize_t defrag_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
 {
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
-                return sprintf(buf, "[always] defer madvise never\n");
+                return sprintf(buf, "[always] defer defer+madvise madvise never\n");
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-                return sprintf(buf, "always [defer] madvise never\n");
-        else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-                return sprintf(buf, "always defer [madvise] never\n");
-        else
-                return sprintf(buf, "always defer madvise [never]\n");
-
+                return sprintf(buf, "always [defer] defer+madvise madvise never\n");
+        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+                return sprintf(buf, "always defer [defer+madvise] madvise never\n");
+        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+                return sprintf(buf, "always defer defer+madvise [madvise] never\n");
+        return sprintf(buf, "always defer defer+madvise madvise [never]\n");
 }
+
 static ssize_t defrag_store(struct kobject *kobj,
                             struct kobj_attribute *attr,
                             const char *buf, size_t count)
 {
-        return triple_flag_store(kobj, attr, buf, count,
-                                TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
-                                TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
-                                TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+        if (!memcmp("always", buf,
+                    min(sizeof("always")-1, count))) {
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+        } else if (!memcmp("defer", buf,
+                   min(sizeof("defer")-1, count))) {
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+        } else if (!memcmp("defer+madvise", buf,
+                   min(sizeof("defer+madvise")-1, count))) {
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+        } else if (!memcmp("madvise", buf,
+                   min(sizeof("madvise")-1, count))) {
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+        } else if (!memcmp("never", buf,
+                   min(sizeof("never")-1, count))) {
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+        } else
+                return -EINVAL;
+
+        return count;
 }
 static struct kobj_attribute defrag_attr =
         __ATTR(defrag, 0644, defrag_show, defrag_store);
@@ -612,25 +611,28 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 }
 
 /*
- * If THP defrag is set to always then directly reclaim/compact as necessary
- * If set to defer then do only background reclaim/compact and defer to khugepaged
- * If set to madvise and the VMA is flagged then directly reclaim/compact
- * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ *                fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ *          available
+ * never: never stall for any thp allocation
  */
 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
-        bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+        const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
 
-        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
-                                &transparent_hugepage_flags) && vma_madvised)
-                return GFP_TRANSHUGE;
-        else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
-                                &transparent_hugepage_flags))
-                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
-        else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
-                                &transparent_hugepage_flags))
+        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
-
+        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+                return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+                                                             __GFP_KSWAPD_RECLAIM);
+        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+                return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+                                                             0);
         return GFP_TRANSHUGE_LIGHT;
 }
 
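On the userspace-parsing concern mentioned in the changelog: the defrag_show() format above keeps one token per mode with the active one bracketed, so a parser that extracts the bracketed token keeps working when a new mode is appended. A minimal sketch of such a parser follows (illustrative only; the sysfs path is the standard location, the buffer size and error handling are arbitrary):

/*
 * Illustrative parser for the bracketed defrag mode, e.g. a line like
 * "always defer [defer+madvise] madvise never".
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[128], *l, *r;
        FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/defrag", "r");

        if (!f)
                return 1;
        if (!fgets(buf, sizeof(buf), f)) {
                fclose(f);
                return 1;
        }
        fclose(f);

        l = strchr(buf, '[');
        r = l ? strchr(l, ']') : NULL;
        if (!r)
                return 1;
        *r = '\0';
        printf("current defrag mode: %s\n", l + 1);
        return 0;
}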