author	Linus Torvalds <torvalds@linux-foundation.org>	2016-02-03 13:10:02 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-02-03 13:10:02 -0500
commit	b37a05c083c85c2657dca9bbe1f5d79dccf756d5 (patch)
tree	0a9bd376a437484e21a6728ca16f2266a0e3e788
parent	d5bfb96bdad3588961f49a6eff89a625fbaa12bf (diff)
parent	12c9d70bd5056b3ae84746fca973c286f48384cc (diff)
Merge branch 'akpm' (patches from Andrew)
Merge fixes from Andrew Morton:
 "18 fixes"

[ The 18 fixes turned into 17 commits, because one of the fixes was a
  fix for another patch in the series that I just folded in by editing
  the patch manually - hopefully correctly - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: fix memory leak in copy_huge_pmd()
  drivers/hwspinlock: fix race between radix tree insertion and lookup
  radix-tree: fix race in gang lookup
  mm/vmpressure.c: fix subtree pressure detection
  mm: polish virtual memory accounting
  mm: warn about VmData over RLIMIT_DATA
  Documentation: cgroup-v2: add memory.stat::sock description
  mm: memcontrol: drop superfluous entry in the per-memcg stats array
  drivers/scsi/sg.c: mark VMA as VM_IO to prevent migration
  proc: revert /proc/<pid>/maps [stack:TID] annotation
  numa: fix /proc/<pid>/numa_maps for hugetlbfs on s390
  MAINTAINERS: update Seth email
  ocfs2/cluster: fix memory leak in o2hb_region_release
  lib/test-string_helpers.c: fix and improve string_get_size() tests
  thp: limit number of object to scan on deferred_split_scan()
  thp: change deferred_split_count() to return number of THP in queue
  thp: make split_queue per-node
-rw-r--r--  Documentation/cgroup-v2.txt           |  4
-rw-r--r--  Documentation/filesystems/proc.txt    | 13
-rw-r--r--  Documentation/kernel-parameters.txt   |  5
-rw-r--r--  MAINTAINERS                           |  4
-rw-r--r--  drivers/hwspinlock/hwspinlock_core.c  |  4
-rw-r--r--  drivers/scsi/sg.c                     |  2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c          | 14
-rw-r--r--  fs/proc/task_mmu.c                    | 73
-rw-r--r--  fs/proc/task_nommu.c                  | 49
-rw-r--r--  include/linux/memcontrol.h            |  2
-rw-r--r--  include/linux/mm.h                    |  9
-rw-r--r--  include/linux/mm_types.h              |  6
-rw-r--r--  include/linux/mmzone.h                |  6
-rw-r--r--  include/linux/radix-tree.h            | 16
-rw-r--r--  lib/radix-tree.c                      | 12
-rw-r--r--  lib/test-string_helpers.c             | 67
-rw-r--r--  mm/huge_memory.c                      | 87
-rw-r--r--  mm/internal.h                         | 31
-rw-r--r--  mm/mmap.c                             | 23
-rw-r--r--  mm/page_alloc.c                       |  5
-rw-r--r--  mm/util.c                             | 27
-rw-r--r--  mm/vmpressure.c                       |  3
22 files changed, 269 insertions(+), 193 deletions(-)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 65b3eac8856c..e8d25e784214 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -843,6 +843,10 @@ PAGE_SIZE multiple when read back.
 	  Amount of memory used to cache filesystem data,
 	  including tmpfs and shared memory.
 
+	sock
+
+	  Amount of memory used in network transmission buffers
+
 	file_mapped
 
 	  Amount of cached filesystem data mapped with mmap()
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fde9fd06fa98..843b045b4069 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -240,8 +240,8 @@ Table 1-2: Contents of the status files (as of 4.1)
 RssFile                     size of resident file mappings
 RssShmem                    size of resident shmem memory (includes SysV shm,
                             mapping of tmpfs and shared anonymous mappings)
-VmData                      size of data, stack, and text segments
-VmStk                       size of data, stack, and text segments
+VmData                      size of private data segments
+VmStk                       size of stack segments
 VmExe                       size of text segment
 VmLib                       size of shared library code
 VmPTE                       size of page table entries
@@ -356,7 +356,7 @@ address perms offset dev inode pathname
 a7cb1000-a7cb2000 ---p 00000000 00:00 0
 a7cb2000-a7eb2000 rw-p 00000000 00:00 0
 a7eb2000-a7eb3000 ---p 00000000 00:00 0
-a7eb3000-a7ed5000 rw-p 00000000 00:00 0          [stack:1001]
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0
 a7ed5000-a8008000 r-xp 00000000 03:00 4222       /lib/libc.so.6
 a8008000-a800a000 r--p 00133000 03:00 4222       /lib/libc.so.6
 a800a000-a800b000 rw-p 00135000 03:00 4222       /lib/libc.so.6
@@ -388,7 +388,6 @@ is not associated with a file:
 
 [heap]                   = the heap of the program
 [stack]                  = the stack of the main process
-[stack:1001]             = the stack of the thread with tid 1001
 [vdso]                   = the "virtual dynamic shared object",
                            the kernel system call handler
 
@@ -396,10 +395,8 @@ is not associated with a file:
 
 The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
 of the individual tasks of a process. In this file you will see a mapping marked
-as [stack] if that task sees it as a stack. This is a key difference from the
-content of /proc/PID/maps, where you will see all mappings that are being used
-as stack by all of those tasks. Hence, for the example above, the task-level
-map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
+as [stack] if that task sees it as a stack. Hence, for the example above, the
+task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
 
 08048000-08049000 r-xp 00000000 03:00 8312       /opt/test
 08049000-0804a000 rw-p 00001000 03:00 8312       /opt/test
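
With the [stack:TID] annotation reverted, /proc/PID/maps names only the main thread's stack, and per-thread stacks are identified through /proc/PID/task/TID/maps as described above. A quick userspace check, as a hedged sketch (standard C, nothing kernel-specific):

	/* Print any stack annotations in our own map; on kernels with
	 * this revert, at most one "[stack]" line appears and no
	 * "[stack:TID]" lines. */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[512];
		FILE *f = fopen("/proc/self/maps", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			if (strstr(line, "[stack"))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}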
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 87d40a72f6a1..551ecf09c8dd 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1496,6 +1496,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			could change it dynamically, usually by
 			/sys/module/printk/parameters/ignore_loglevel.
 
+	ignore_rlimit_data
+			Ignore RLIMIT_DATA setting for data mappings,
+			print warning at first misuse. Can be changed via
+			/sys/module/kernel/parameters/ignore_rlimit_data.
+
 	ihash_entries=	[KNL]
 			Set number of hash buckets for inode cache.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index c245e42cf1d5..24c5b9a29670 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12150,7 +12150,7 @@ F: drivers/net/hamradio/*scc.c
 F:	drivers/net/hamradio/z8530.h
 
 ZBUD COMPRESSED PAGE ALLOCATOR
-M:	Seth Jennings <sjennings@variantweb.net>
+M:	Seth Jennings <sjenning@redhat.com>
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	mm/zbud.c
@@ -12205,7 +12205,7 @@ F:	include/linux/zsmalloc.h
 F:	Documentation/vm/zsmalloc.txt
 
 ZSWAP COMPRESSED SWAP CACHING
-M:	Seth Jennings <sjennings@variantweb.net>
+M:	Seth Jennings <sjenning@redhat.com>
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	mm/zswap.c
diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c
index 52f708bcf77f..d50c701b19d6 100644
--- a/drivers/hwspinlock/hwspinlock_core.c
+++ b/drivers/hwspinlock/hwspinlock_core.c
@@ -313,6 +313,10 @@ int of_hwspin_lock_get_id(struct device_node *np, int index)
 		hwlock = radix_tree_deref_slot(slot);
 		if (unlikely(!hwlock))
 			continue;
+		if (radix_tree_is_indirect_ptr(hwlock)) {
+			slot = radix_tree_iter_retry(&iter);
+			continue;
+		}
 
 		if (hwlock->bank->dev->of_node == args.np) {
 			ret = 0;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 503ab8b46c0b..5e820674432c 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1261,7 +1261,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 	}
 
 	sfp->mmap_called = 1;
-	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_private_data = sfp;
 	vma->vm_ops = &sg_mmap_vm_ops;
 	return 0;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a3cc6d2fc896..a76b9ea7722e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1254,15 +1254,15 @@ static const struct file_operations o2hb_debug_fops = {
 
 void o2hb_exit(void)
 {
-	kfree(o2hb_db_livenodes);
-	kfree(o2hb_db_liveregions);
-	kfree(o2hb_db_quorumregions);
-	kfree(o2hb_db_failedregions);
 	debugfs_remove(o2hb_debug_failedregions);
 	debugfs_remove(o2hb_debug_quorumregions);
 	debugfs_remove(o2hb_debug_liveregions);
 	debugfs_remove(o2hb_debug_livenodes);
 	debugfs_remove(o2hb_debug_dir);
+	kfree(o2hb_db_livenodes);
+	kfree(o2hb_db_liveregions);
+	kfree(o2hb_db_quorumregions);
+	kfree(o2hb_db_failedregions);
 }
 
 static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
@@ -1438,13 +1438,15 @@ static void o2hb_region_release(struct config_item *item)
 
 	kfree(reg->hr_slots);
 
-	kfree(reg->hr_db_regnum);
-	kfree(reg->hr_db_livenodes);
 	debugfs_remove(reg->hr_debug_livenodes);
 	debugfs_remove(reg->hr_debug_regnum);
 	debugfs_remove(reg->hr_debug_elapsed_time);
 	debugfs_remove(reg->hr_debug_pinned);
 	debugfs_remove(reg->hr_debug_dir);
+	kfree(reg->hr_db_livenodes);
+	kfree(reg->hr_db_regnum);
+	kfree(reg->hr_debug_elapsed_time);
+	kfree(reg->hr_debug_pinned);
 
 	spin_lock(&o2hb_live_lock);
 	list_del(&reg->hr_all_item);
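
The reordering in both hunks matters because each debugfs file keeps a pointer to its backing buffer as private data: freeing the buffers before debugfs_remove() leaves a window in which a concurrent read of the debugfs file dereferences freed memory, and the hr_debug_elapsed_time/hr_debug_pinned buffers were previously never freed at all (the leak named in the commit message). The general shape of the safe teardown, as a sketch (my_file/my_buf are illustrative names, not ocfs2 identifiers):

	/* Remove the reader first, then free what it was reading. */
	static void my_teardown(struct dentry *my_file, void *my_buf)
	{
		debugfs_remove(my_file);	/* no further opens/reads */
		kfree(my_buf);			/* now safe to free */
	}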
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 85d16c67c33e..fa95ab2d3674 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file,
 				sizeof(struct proc_maps_private));
 }
 
-static pid_t pid_of_stack(struct proc_maps_private *priv,
-				struct vm_area_struct *vma, bool is_pid)
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static int is_stack(struct proc_maps_private *priv,
+		struct vm_area_struct *vma, int is_pid)
 {
-	struct inode *inode = priv->inode;
-	struct task_struct *task;
-	pid_t ret = 0;
+	int stack = 0;
+
+	if (is_pid) {
+		stack = vma->vm_start <= vma->vm_mm->start_stack &&
+			vma->vm_end >= vma->vm_mm->start_stack;
+	} else {
+		struct inode *inode = priv->inode;
+		struct task_struct *task;
 
-	rcu_read_lock();
-	task = pid_task(proc_pid(inode), PIDTYPE_PID);
-	if (task) {
-		task = task_of_stack(task, vma, is_pid);
-		if (task)
-			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
-	}
-	rcu_read_unlock();
-
-	return ret;
+		rcu_read_lock();
+		task = pid_task(proc_pid(inode), PIDTYPE_PID);
+		if (task)
+			stack = vma_is_stack_for_task(vma, task);
+		rcu_read_unlock();
+	}
+	return stack;
 }
 
 static void
@@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
 	name = arch_vma_name(vma);
 	if (!name) {
-		pid_t tid;
-
 		if (!mm) {
 			name = "[vdso]";
 			goto done;
@@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 			goto done;
 		}
 
-		tid = pid_of_stack(priv, vma, is_pid);
-		if (tid != 0) {
-			/*
-			 * Thread stack in /proc/PID/task/TID/maps or
-			 * the main process stack.
-			 */
-			if (!is_pid || (vma->vm_start <= mm->start_stack &&
-			    vma->vm_end >= mm->start_stack)) {
-				name = "[stack]";
-			} else {
-				/* Thread stack in /proc/PID/maps */
-				seq_pad(m, ' ');
-				seq_printf(m, "[stack:%d]", tid);
-			}
-		}
+		if (is_stack(priv, vma, is_pid))
+			name = "[stack]";
 	}
 
 done:
@@ -1552,18 +1543,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
 		unsigned long addr, unsigned long end, struct mm_walk *walk)
 {
+	pte_t huge_pte = huge_ptep_get(pte);
 	struct numa_maps *md;
 	struct page *page;
 
-	if (!pte_present(*pte))
+	if (!pte_present(huge_pte))
 		return 0;
 
-	page = pte_page(*pte);
+	page = pte_page(huge_pte);
 	if (!page)
 		return 0;
 
 	md = walk->private;
-	gather_stats(page, md, pte_dirty(*pte), 1);
+	gather_stats(page, md, pte_dirty(huge_pte), 1);
 	return 0;
 }
 
@@ -1617,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 		seq_file_path(m, file, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_puts(m, " heap");
-	} else {
-		pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
-		if (tid != 0) {
-			/*
-			 * Thread stack in /proc/PID/task/TID/maps or
-			 * the main process stack.
-			 */
-			if (!is_pid || (vma->vm_start <= mm->start_stack &&
-			    vma->vm_end >= mm->start_stack))
-				seq_puts(m, " stack");
-			else
-				seq_printf(m, " stack:%d", tid);
-		}
+	} else if (is_stack(proc_priv, vma, is_pid)) {
+		seq_puts(m, " stack");
 	}
 
 	if (is_vm_hugetlb_page(vma))
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index e0d64c92e4f6..faacb0c0d857 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm,
 	return size;
 }
 
-static pid_t pid_of_stack(struct proc_maps_private *priv,
-	struct vm_area_struct *vma, bool is_pid)
+static int is_stack(struct proc_maps_private *priv,
+	struct vm_area_struct *vma, int is_pid)
 {
-	struct inode *inode = priv->inode;
-	struct task_struct *task;
-	pid_t ret = 0;
-
-	rcu_read_lock();
-	task = pid_task(proc_pid(inode), PIDTYPE_PID);
-	if (task) {
-		task = task_of_stack(task, vma, is_pid);
-		if (task)
-			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
-	}
-	rcu_read_unlock();
-
-	return ret;
+	struct mm_struct *mm = vma->vm_mm;
+	int stack = 0;
+
+	if (is_pid) {
+		stack = vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack;
+	} else {
+		struct inode *inode = priv->inode;
+		struct task_struct *task;
+
+		rcu_read_lock();
+		task = pid_task(proc_pid(inode), PIDTYPE_PID);
+		if (task)
+			stack = vma_is_stack_for_task(vma, task);
+		rcu_read_unlock();
+	}
+	return stack;
 }
 
 /*
@@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	if (file) {
 		seq_pad(m, ' ');
 		seq_file_path(m, file, "");
-	} else if (mm) {
-		pid_t tid = pid_of_stack(priv, vma, is_pid);
-
-		if (tid != 0) {
-			seq_pad(m, ' ');
-			/*
-			 * Thread stack in /proc/PID/task/TID/maps or
-			 * the main process stack.
-			 */
-			if (!is_pid || (vma->vm_start <= mm->start_stack &&
-			    vma->vm_end >= mm->start_stack))
-				seq_printf(m, "[stack]");
-			else
-				seq_printf(m, "[stack:%d]", tid);
-		}
+	} else if (mm && is_stack(priv, vma, is_pid)) {
+		seq_pad(m, ' ');
+		seq_printf(m, "[stack]");
 	}
 
 	seq_putc(m, '\n');
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9ae48d4aeb5e..792c8981e633 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -51,7 +51,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
 	/* default hierarchy stats */
-	MEMCG_SOCK,
+	MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS,
 	MEMCG_NR_STAT,
 };
 
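
Without the explicit initializer, C assigns MEMCG_SOCK the value MEM_CGROUP_STAT_NSTATS + 1, so any array sized by MEMCG_NR_STAT carries a dead slot at index MEM_CGROUP_STAT_NSTATS; pinning MEMCG_SOCK to MEM_CGROUP_STAT_NSTATS reclaims it. A standalone illustration of the enum arithmetic (plain C, names invented for the example):

	#include <stdio.h>

	enum before { B_CACHE, B_RSS, B_NSTATS, B_SOCK, B_NR_STAT };
	enum after  { A_CACHE, A_RSS, A_NSTATS, A_SOCK = A_NSTATS, A_NR_STAT };

	int main(void)
	{
		/* prints "before: 4 slots, after: 3 slots" */
		printf("before: %d slots, after: %d slots\n",
		       B_NR_STAT, A_NR_STAT);
		return 0;
	}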
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f1cd22f2df1a..516e14944339 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -201,11 +201,13 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 
 #ifdef CONFIG_STACK_GROWSUP
-#define VM_STACK_FLAGS	(VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+#define VM_STACK	VM_GROWSUP
 #else
-#define VM_STACK_FLAGS	(VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+#define VM_STACK	VM_GROWSDOWN
 #endif
 
+#define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+
 /*
  * Special vmas that are non-mergable, non-mlock()able.
  * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
@@ -1341,8 +1343,7 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
 		!vma_growsup(vma->vm_next, addr);
 }
 
-extern struct task_struct *task_of_stack(struct task_struct *task,
-				struct vm_area_struct *vma, bool in_group);
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t);
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d3ebb9d21a53..624b78b848b8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -424,9 +424,9 @@ struct mm_struct {
 	unsigned long total_vm;		/* Total pages mapped */
 	unsigned long locked_vm;	/* Pages that have PG_mlocked set */
 	unsigned long pinned_vm;	/* Refcount permanently increased */
-	unsigned long data_vm;		/* VM_WRITE & ~VM_SHARED/GROWSDOWN */
-	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE */
-	unsigned long stack_vm;		/* VM_GROWSUP/DOWN */
+	unsigned long data_vm;		/* VM_WRITE & ~VM_SHARED & ~VM_STACK */
+	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE & ~VM_STACK */
+	unsigned long stack_vm;		/* VM_STACK */
 	unsigned long def_flags;
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 33bb1b19273e..7b6c2cfee390 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -682,6 +682,12 @@ typedef struct pglist_data {
 	 */
 	unsigned long first_deferred_pfn;
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	spinlock_t split_queue_lock;
+	struct list_head split_queue;
+	unsigned long split_queue_len;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 7c88ad156a29..00b17c526c1f 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -379,6 +379,22 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 			struct radix_tree_iter *iter, unsigned flags);
 
 /**
+ * radix_tree_iter_retry - retry this chunk of the iteration
+ * @iter:	iterator state
+ *
+ * If we iterate over a tree protected only by the RCU lock, a race
+ * against deletion or creation may result in seeing a slot for which
+ * radix_tree_deref_retry() returns true. If so, call this function
+ * and continue the iteration.
+ */
+static inline __must_check
+void **radix_tree_iter_retry(struct radix_tree_iter *iter)
+{
+	iter->next_index = iter->index;
+	return NULL;
+}
+
+/**
  * radix_tree_chunk_size - get current chunk size
  *
  * @iter:	pointer to radix tree iterator
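
The helper backs the retry pattern used by the fixed lookups in this series (of_hwspin_lock_get_id() above and the gang lookups below): dereference the slot, and if the RCU race left an indirect pointer behind, rewind the iterator to redo the chunk. A hedged sketch of a lockless walk in that style (my_tree and use_item are placeholders; the rest is the API shown in this patch):

	void **slot;
	struct radix_tree_iter iter;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &my_tree, &iter, 0) {
		void *item = radix_tree_deref_slot(slot);

		if (!item)
			continue;
		if (radix_tree_is_indirect_ptr(item)) {
			/* raced with insert/delete: redo this chunk */
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		use_item(item);
	}
	rcu_read_unlock();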
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index fcf5d98574ce..6b79e9026e24 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1019,9 +1019,13 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 		return 0;
 
 	radix_tree_for_each_slot(slot, root, &iter, first_index) {
-		results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
+		results[ret] = rcu_dereference_raw(*slot);
 		if (!results[ret])
 			continue;
+		if (radix_tree_is_indirect_ptr(results[ret])) {
+			slot = radix_tree_iter_retry(&iter);
+			continue;
+		}
 		if (++ret == max_items)
 			break;
 	}
@@ -1098,9 +1102,13 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		return 0;
 
 	radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
-		results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
+		results[ret] = rcu_dereference_raw(*slot);
 		if (!results[ret])
 			continue;
+		if (radix_tree_is_indirect_ptr(results[ret])) {
+			slot = radix_tree_iter_retry(&iter);
+			continue;
+		}
 		if (++ret == max_items)
 			break;
 	}
diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c
index 98866a770770..25b5cbfb7615 100644
--- a/lib/test-string_helpers.c
+++ b/lib/test-string_helpers.c
@@ -327,36 +327,67 @@ out:
 }
 
 #define string_get_size_maxbuf 16
-#define test_string_get_size_one(size, blk_size, units, exp_result)	\
+#define test_string_get_size_one(size, blk_size, exp_result10, exp_result2) \
 	do {								\
-		BUILD_BUG_ON(sizeof(exp_result) >= string_get_size_maxbuf); \
-		__test_string_get_size((size), (blk_size), (units),	\
-				       (exp_result));			\
+		BUILD_BUG_ON(sizeof(exp_result10) >= string_get_size_maxbuf); \
+		BUILD_BUG_ON(sizeof(exp_result2) >= string_get_size_maxbuf); \
+		__test_string_get_size((size), (blk_size), (exp_result10), \
+				       (exp_result2));			\
 	} while (0)
 
 
-static __init void __test_string_get_size(const u64 size, const u64 blk_size,
-					  const enum string_size_units units,
-					  const char *exp_result)
+static __init void test_string_get_size_check(const char *units,
+					      const char *exp,
+					      char *res,
+					      const u64 size,
+					      const u64 blk_size)
 {
-	char buf[string_get_size_maxbuf];
-
-	string_get_size(size, blk_size, units, buf, sizeof(buf));
-	if (!memcmp(buf, exp_result, strlen(exp_result) + 1))
+	if (!memcmp(res, exp, strlen(exp) + 1))
 		return;
 
-	buf[sizeof(buf) - 1] = '\0';
-	pr_warn("Test 'test_string_get_size_one' failed!\n");
-	pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %d\n",
+	res[string_get_size_maxbuf - 1] = '\0';
+
+	pr_warn("Test 'test_string_get_size' failed!\n");
+	pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %s)\n",
 		size, blk_size, units);
-	pr_warn("expected: '%s', got '%s'\n", exp_result, buf);
+	pr_warn("expected: '%s', got '%s'\n", exp, res);
+}
+
+static __init void __test_string_get_size(const u64 size, const u64 blk_size,
+					  const char *exp_result10,
+					  const char *exp_result2)
+{
+	char buf10[string_get_size_maxbuf];
+	char buf2[string_get_size_maxbuf];
+
+	string_get_size(size, blk_size, STRING_UNITS_10, buf10, sizeof(buf10));
+	string_get_size(size, blk_size, STRING_UNITS_2, buf2, sizeof(buf2));
+
+	test_string_get_size_check("STRING_UNITS_10", exp_result10, buf10,
+				   size, blk_size);
+
+	test_string_get_size_check("STRING_UNITS_2", exp_result2, buf2,
+				   size, blk_size);
 }
 
 static __init void test_string_get_size(void)
 {
-	test_string_get_size_one(16384, 512, STRING_UNITS_2, "8.00 MiB");
-	test_string_get_size_one(8192, 4096, STRING_UNITS_10, "32.7 MB");
-	test_string_get_size_one(1, 512, STRING_UNITS_10, "512 B");
+	/* small values */
+	test_string_get_size_one(0, 512, "0 B", "0 B");
+	test_string_get_size_one(1, 512, "512 B", "512 B");
+	test_string_get_size_one(1100, 1, "1.10 kB", "1.07 KiB");
+
+	/* normal values */
+	test_string_get_size_one(16384, 512, "8.39 MB", "8.00 MiB");
+	test_string_get_size_one(500118192, 512, "256 GB", "238 GiB");
+	test_string_get_size_one(8192, 4096, "33.6 MB", "32.0 MiB");
+
+	/* weird block sizes */
+	test_string_get_size_one(3000, 1900, "5.70 MB", "5.44 MiB");
+
+	/* huge values */
+	test_string_get_size_one(U64_MAX, 4096, "75.6 ZB", "64.0 ZiB");
+	test_string_get_size_one(4096, U64_MAX, "75.6 ZB", "64.0 ZiB");
 }
 
 static int __init test_string_helpers_init(void)
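
The new expectations follow directly from the two unit systems: STRING_UNITS_10 scales by powers of 1000, STRING_UNITS_2 by powers of 1024. For instance, 16384 blocks of 512 bytes is 8,388,608 bytes, which is 8.39 MB decimal but exactly 8.00 MiB binary, and 8192 x 4096 = 33,554,432 bytes is 33.6 MB / 32.0 MiB — so of the old single-unit expectations, "8.00 MiB" under STRING_UNITS_2 was right, while "32.7 MB" matched neither system. A standalone check of the arithmetic only (plain C, not the kernel helper):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t b = 16384ULL * 512;		/* 8388608 */

		printf("%.2f MB, %.2f MiB\n", b / 1e6, b / 1048576.0);
		/* -> 8.39 MB, 8.00 MiB */

		b = 8192ULL * 4096;			/* 33554432 */
		printf("%.1f MB, %.1f MiB\n", b / 1e6, b / 1048576.0);
		/* -> 33.6 MB, 32.0 MiB */
		return 0;
	}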
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fd3a07b3e6f4..36c070167b71 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = {
 	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
-static DEFINE_SPINLOCK(split_queue_lock);
-static LIST_HEAD(split_queue);
-static unsigned long split_queue_len;
 static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
@@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		return false;
 	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_mkhuge(entry);
-	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	if (pgtable)
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
 	atomic_long_inc(&mm->nr_ptes);
 	return true;
@@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	spinlock_t *dst_ptl, *src_ptl;
 	struct page *src_page;
 	pmd_t pmd;
-	pgtable_t pgtable;
+	pgtable_t pgtable = NULL;
 	int ret;
 
-	ret = -ENOMEM;
-	pgtable = pte_alloc_one(dst_mm, addr);
-	if (unlikely(!pgtable))
-		goto out;
+	if (!vma_is_dax(vma)) {
+		ret = -ENOMEM;
+		pgtable = pte_alloc_one(dst_mm, addr);
+		if (unlikely(!pgtable))
+			goto out;
+	}
 
 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
 	src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_unlock;
 	}
 
-	if (pmd_trans_huge(pmd)) {
+	if (!vma_is_dax(vma)) {
 		/* thp accounting separate from pmd_devmap accounting */
 		src_page = pmd_page(pmd);
 		VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@ -3358,6 +3358,7 @@ int total_mapcount(struct page *page)
 int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct page *head = compound_head(page);
+	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
 	struct anon_vma *anon_vma;
 	int count, mapcount, ret;
 	bool mlocked;
@@ -3401,19 +3402,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	lru_add_drain();
 
 	/* Prevent deferred_split_scan() touching ->_count */
-	spin_lock_irqsave(&split_queue_lock, flags);
+	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
 	if (!mapcount && count == 1) {
 		if (!list_empty(page_deferred_list(head))) {
-			split_queue_len--;
+			pgdata->split_queue_len--;
 			list_del(page_deferred_list(head));
 		}
-		spin_unlock_irqrestore(&split_queue_lock, flags);
+		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 		__split_huge_page(page, list);
 		ret = 0;
 	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-		spin_unlock_irqrestore(&split_queue_lock, flags);
+		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 		pr_alert("total_mapcount: %u, page_count(): %u\n",
 			 mapcount, count);
 		if (PageTail(page))
@@ -3421,7 +3422,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		dump_page(page, "total_mapcount(head) > 0");
 		BUG();
 	} else {
-		spin_unlock_irqrestore(&split_queue_lock, flags);
+		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 		unfreeze_page(anon_vma, head);
 		ret = -EBUSY;
 	}
@@ -3436,64 +3437,65 @@ out:
 
 void free_transhuge_page(struct page *page)
 {
+	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
 	unsigned long flags;
 
-	spin_lock_irqsave(&split_queue_lock, flags);
+	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	if (!list_empty(page_deferred_list(page))) {
-		split_queue_len--;
+		pgdata->split_queue_len--;
 		list_del(page_deferred_list(page));
 	}
-	spin_unlock_irqrestore(&split_queue_lock, flags);
+	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 	free_compound_page(page);
 }
 
 void deferred_split_huge_page(struct page *page)
 {
+	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
 	unsigned long flags;
 
 	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 
-	spin_lock_irqsave(&split_queue_lock, flags);
+	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	if (list_empty(page_deferred_list(page))) {
-		list_add_tail(page_deferred_list(page), &split_queue);
-		split_queue_len++;
+		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
+		pgdata->split_queue_len++;
 	}
-	spin_unlock_irqrestore(&split_queue_lock, flags);
+	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
-	/*
-	 * Split a page from split_queue will free up at least one page,
-	 * at most HPAGE_PMD_NR - 1. We don't track exact number.
-	 * Let's use HPAGE_PMD_NR / 2 as ballpark.
-	 */
-	return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+	struct pglist_data *pgdata = NODE_DATA(sc->nid);
+	return ACCESS_ONCE(pgdata->split_queue_len);
 }
 
 static unsigned long deferred_split_scan(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
+	struct pglist_data *pgdata = NODE_DATA(sc->nid);
 	unsigned long flags;
 	LIST_HEAD(list), *pos, *next;
 	struct page *page;
 	int split = 0;
 
-	spin_lock_irqsave(&split_queue_lock, flags);
-	list_splice_init(&split_queue, &list);
-
+	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	/* Take pin on all head pages to avoid freeing them under us */
-	list_for_each_safe(pos, next, &list) {
+	list_for_each_safe(pos, next, &pgdata->split_queue) {
 		page = list_entry((void *)pos, struct page, mapping);
 		page = compound_head(page);
-		/* race with put_compound_page() */
-		if (!get_page_unless_zero(page)) {
+		if (get_page_unless_zero(page)) {
+			list_move(page_deferred_list(page), &list);
+		} else {
+			/* We lost race with put_compound_page() */
 			list_del_init(page_deferred_list(page));
-			split_queue_len--;
+			pgdata->split_queue_len--;
 		}
+		if (!--sc->nr_to_scan)
+			break;
 	}
-	spin_unlock_irqrestore(&split_queue_lock, flags);
+	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 
 	list_for_each_safe(pos, next, &list) {
 		page = list_entry((void *)pos, struct page, mapping);
@@ -3505,17 +3507,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 		put_page(page);
 	}
 
-	spin_lock_irqsave(&split_queue_lock, flags);
-	list_splice_tail(&list, &split_queue);
-	spin_unlock_irqrestore(&split_queue_lock, flags);
+	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	list_splice_tail(&list, &pgdata->split_queue);
+	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 
-	return split * HPAGE_PMD_NR / 2;
+	/*
+	 * Stop shrinker if we didn't split any page, but the queue is empty.
+	 * This can happen if pages were freed under us.
+	 */
+	if (!split && list_empty(&pgdata->split_queue))
+		return SHRINK_STOP;
+	return split;
 }
 
 static struct shrinker deferred_split_shrinker = {
 	.count_objects = deferred_split_count,
 	.scan_objects = deferred_split_scan,
 	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
 };
 
 #ifdef CONFIG_DEBUG_FS
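
With SHRINKER_NUMA_AWARE set, the shrinker core invokes count_objects/scan_objects once per NUMA node and passes the node in sc->nid, which is what lets both callbacks above resolve their queue via NODE_DATA(sc->nid); sc->nr_to_scan bounds the work per call, and returning SHRINK_STOP ends the pass early. A minimal sketch of that shape (my_count/my_scan/my_nr_objects/my_reclaim are illustrative, not the THP code):

	static unsigned long my_count(struct shrinker *shrink,
				      struct shrink_control *sc)
	{
		return my_nr_objects[sc->nid];	/* per-node count */
	}

	static unsigned long my_scan(struct shrinker *shrink,
				     struct shrink_control *sc)
	{
		/* free up to sc->nr_to_scan objects on node sc->nid */
		return my_reclaim(sc->nid, sc->nr_to_scan);
	}

	static struct shrinker my_shrinker = {
		.count_objects	= my_count,
		.scan_objects	= my_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE,	/* sc->nid valid */
	};
	/* register_shrinker(&my_shrinker) at init time */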
diff --git a/mm/internal.h b/mm/internal.h
index ed8b5ffcf9b1..a38a21ebddb4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -216,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags)
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
+/*
+ * These three helpers classify VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+	return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area - automatically grows in one direction
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+	return (flags & VM_STACK) == VM_STACK;
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+	return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct rb_node *rb_parent);
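
The three predicates partition mappings exactly as the updated mm_types.h comments say: text segments count as exec, VM_GROWSUP/VM_GROWSDOWN anonymous mappings as stack, private writable mappings (heap, .data, anonymous mmap) as data, while shared or read-only mappings land in no bucket and only affect total_vm. A standalone illustration (flag values mirror the kernel's common definitions, but treat them as assumptions here):

	#include <stdio.h>

	typedef unsigned long vm_flags_t;
	#define VM_WRITE  0x2UL
	#define VM_EXEC   0x4UL
	#define VM_SHARED 0x8UL
	#define VM_STACK  0x100UL	/* stands in for VM_GROWSDOWN */

	static int is_exec(vm_flags_t f)
	{ return (f & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; }
	static int is_stack(vm_flags_t f)
	{ return (f & VM_STACK) == VM_STACK; }
	static int is_data(vm_flags_t f)
	{ return (f & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; }

	int main(void)
	{
		vm_flags_t text  = VM_EXEC;
		vm_flags_t stack = VM_WRITE | VM_STACK;
		vm_flags_t heap  = VM_WRITE;
		vm_flags_t shm   = VM_WRITE | VM_SHARED;

		/* prints "1 1 1 0": shared mappings hit no bucket */
		printf("%d %d %d %d\n", is_exec(text), is_stack(stack),
		       is_data(heap), is_data(shm));
		return 0;
	}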
diff --git a/mm/mmap.c b/mm/mmap.c
index 84b12624ceb0..cfc0cdca421e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -42,6 +42,7 @@
 #include <linux/memory.h>
 #include <linux/printk.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/moduleparam.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
 int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
 #endif
 
+static bool ignore_rlimit_data = true;
+core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
 
 static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
@@ -2982,9 +2985,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
 	if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
 		return false;
 
-	if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
-				(VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
-		return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
+	if (is_data_mapping(flags) &&
+	    mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
+		if (ignore_rlimit_data)
+			pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
+				     "%lu. Will be forbidden soon.\n",
+				     current->comm, current->pid,
+				     (mm->data_vm + npages) << PAGE_SHIFT,
+				     rlimit(RLIMIT_DATA));
+		else
+			return false;
+	}
 
 	return true;
 }
@@ -2993,11 +3004,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
 {
 	mm->total_vm += npages;
 
-	if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+	if (is_exec_mapping(flags))
 		mm->exec_vm += npages;
-	else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+	else if (is_stack_mapping(flags))
 		mm->stack_vm += npages;
-	else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+	else if (is_data_mapping(flags))
 		mm->data_vm += npages;
 }
 
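
The user-visible change is that RLIMIT_DATA now constrains writable private mappings as a whole rather than only the brk() heap; with the default ignore_rlimit_data=true the kernel merely logs the pr_warn_once above, and after flipping the parameter to false the mapping fails outright. A hedged userspace demonstration (behavior depends on the parameter setting and on mappings the process already holds):

	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rlimit rl = { 1 << 20, 1 << 20 };  /* 1 MiB */
		void *p;

		setrlimit(RLIMIT_DATA, &rl);
		p = mmap(NULL, 16 << 20, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			perror("mmap");	/* ENOMEM if enforcement is on */
		else
			puts("mapped; check dmesg for the VmData warning");
		return 0;
	}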
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..ea2c4d3e0c03 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5210,6 +5210,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 	pgdat->numabalancing_migrate_nr_pages = 0;
 	pgdat->numabalancing_migrate_next_window = jiffies;
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	spin_lock_init(&pgdat->split_queue_lock);
+	INIT_LIST_HEAD(&pgdat->split_queue);
+	pgdat->split_queue_len = 0;
+#endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat_page_ext_init(pgdat);
diff --git a/mm/util.c b/mm/util.c
index c108a6542d05..4fb14ca5a419 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /* Check if the vma is being used as a stack by this task */
-static int vm_is_stack_for_task(struct task_struct *t,
-				struct vm_area_struct *vma)
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
 {
 	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
 }
 
-/*
- * Check if the vma is being used as a stack.
- * If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the task_struct of the task
- * that the vma is stack for. Must be called under rcu_read_lock().
- */
-struct task_struct *task_of_stack(struct task_struct *task,
-				struct vm_area_struct *vma, bool in_group)
-{
-	if (vm_is_stack_for_task(task, vma))
-		return task;
-
-	if (in_group) {
-		struct task_struct *t;
-
-		for_each_thread(task, t) {
-			if (vm_is_stack_for_task(t, vma))
-				return t;
-		}
-	}
-
-	return NULL;
-}
-
 #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 9a6c0704211c..149fdf6c5c56 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 
 	if (tree) {
 		spin_lock(&vmpr->sr_lock);
-		vmpr->tree_scanned += scanned;
+		scanned = vmpr->tree_scanned += scanned;
 		vmpr->tree_reclaimed += reclaimed;
-		scanned = vmpr->scanned;
 		spin_unlock(&vmpr->sr_lock);
 
 		if (scanned < vmpressure_win)
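
The fix makes the vmpressure window check consult the subtree-wide counter it just updated: C assignment is right-associative, so `scanned = vmpr->tree_scanned += scanned` first folds the local count into tree_scanned and then copies the new total back into the local variable, all under sr_lock (the old code instead read the unrelated vmpr->scanned). The idiom in isolation:

	#include <stdio.h>

	int main(void)
	{
		unsigned long tree_scanned = 100, scanned = 20;

		scanned = tree_scanned += scanned;
		printf("%lu %lu\n", tree_scanned, scanned);  /* 120 120 */
		return 0;
	}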