author     Linus Torvalds <torvalds@linux-foundation.org>    2012-10-09 03:23:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-10-09 03:23:15 -0400
commit     9e2d8656f5e8aa214e66b462680cf86b210b74a8 (patch)
tree       f67d62e896cedf75599ea45f9ecf9999c6ad24cd
parent     1ea4f4f8405cc1ceec23f2d261bc3775785e6712 (diff)
parent     9e695d2ecc8451cc2c1603d60b5c8e7f5581923a (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge patches from Andrew Morton:
"A few misc things and very nearly all of the MM tree. A tremendous
amount of stuff (again), including a significant rbtree library
rework."
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (160 commits)
sparc64: Support transparent huge pages.
mm: thp: Use more portable PMD clearing sequenece in zap_huge_pmd().
mm: Add and use update_mmu_cache_pmd() in transparent huge page code.
sparc64: Document PGD and PMD layout.
sparc64: Eliminate PTE table memory wastage.
sparc64: Halve the size of PTE tables
sparc64: Only support 4MB huge pages and 8KB base pages.
memory-hotplug: suppress "Trying to free nonexistent resource <XXXXXXXXXXXXXXXX-YYYYYYYYYYYYYYYY>" warning
mm: memcg: clean up mm_match_cgroup() signature
mm: document PageHuge somewhat
mm: use %pK for /proc/vmallocinfo
mm, thp: fix mlock statistics
mm, thp: fix mapped pages avoiding unevictable list on mlock
memory-hotplug: update memory block's state and notify userspace
memory-hotplug: preparation to notify memory block's state at memory hot remove
mm: avoid section mismatch warning for memblock_type_name
make GFP_NOTRACK definition unconditional
cma: decrease cc.nr_migratepages after reclaiming pagelist
CMA: migrate mlocked pages
kpageflags: fix wrong KPF_THP on non-huge compound pages
...
255 files changed, 4976 insertions, 3540 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 49c051380daf..f54273e2ac97 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -270,8 +270,6 @@ preempt-locking.txt
270 | - info on locking under a preemptive kernel. | 270 | - info on locking under a preemptive kernel. |
271 | printk-formats.txt | 271 | printk-formats.txt |
272 | - how to get printk format specifiers right | 272 | - how to get printk format specifiers right |
273 | prio_tree.txt | ||
274 | - info on radix-priority-search-tree use for indexing vmas. | ||
275 | ramoops.txt | 273 | ramoops.txt |
276 | - documentation of the ramoops oops/panic logging module. | 274 | - documentation of the ramoops oops/panic logging module. |
277 | rbtree.txt | 275 | rbtree.txt |
diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj
deleted file mode 100644
index 9a3cb88ade47..000000000000
--- a/Documentation/ABI/obsolete/proc-pid-oom_adj
+++ /dev/null
@@ -1,22 +0,0 @@
1 | What: /proc/<pid>/oom_adj | ||
2 | When: August 2012 | ||
3 | Why: /proc/<pid>/oom_adj allows userspace to influence the oom killer's | ||
4 | badness heuristic used to determine which task to kill when the kernel | ||
5 | is out of memory. | ||
6 | |||
7 | The badness heuristic has since been rewritten since the introduction of | ||
8 | this tunable such that its meaning is deprecated. The value was | ||
9 | implemented as a bitshift on a score generated by the badness() | ||
10 | function that did not have any precise units of measure. With the | ||
11 | rewrite, the score is given as a proportion of available memory to the | ||
12 | task allocating pages, so using a bitshift which grows the score | ||
13 | exponentially is, thus, impossible to tune with fine granularity. | ||
14 | |||
15 | A much more powerful interface, /proc/<pid>/oom_score_adj, was | ||
16 | introduced with the oom killer rewrite that allows users to increase or | ||
17 | decrease the badness score linearly. This interface will replace | ||
18 | /proc/<pid>/oom_adj. | ||
19 | |||
20 | A warning will be emitted to the kernel log if an application uses this | ||
21 | deprecated interface. After it is printed once, future warnings will be | ||
22 | suppressed until the kernel is rebooted. | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 4372e6b8a353..c07f7b4fb88d 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -18,16 +18,16 @@ from the rest of the system. The article on LWN [12] mentions some probable
18 | uses of the memory controller. The memory controller can be used to | 18 | uses of the memory controller. The memory controller can be used to |
19 | 19 | ||
20 | a. Isolate an application or a group of applications | 20 | a. Isolate an application or a group of applications |
21 | Memory hungry applications can be isolated and limited to a smaller | 21 | Memory-hungry applications can be isolated and limited to a smaller |
22 | amount of memory. | 22 | amount of memory. |
23 | b. Create a cgroup with limited amount of memory, this can be used | 23 | b. Create a cgroup with a limited amount of memory; this can be used |
24 | as a good alternative to booting with mem=XXXX. | 24 | as a good alternative to booting with mem=XXXX. |
25 | c. Virtualization solutions can control the amount of memory they want | 25 | c. Virtualization solutions can control the amount of memory they want |
26 | to assign to a virtual machine instance. | 26 | to assign to a virtual machine instance. |
27 | d. A CD/DVD burner could control the amount of memory used by the | 27 | d. A CD/DVD burner could control the amount of memory used by the |
28 | rest of the system to ensure that burning does not fail due to lack | 28 | rest of the system to ensure that burning does not fail due to lack |
29 | of available memory. | 29 | of available memory. |
30 | e. There are several other use cases, find one or use the controller just | 30 | e. There are several other use cases; find one or use the controller just |
31 | for fun (to learn and hack on the VM subsystem). | 31 | for fun (to learn and hack on the VM subsystem). |
32 | 32 | ||
33 | Current Status: linux-2.6.34-mmotm(development version of 2010/April) | 33 | Current Status: linux-2.6.34-mmotm(development version of 2010/April) |
@@ -38,12 +38,12 @@ Features:
38 | - optionally, memory+swap usage can be accounted and limited. | 38 | - optionally, memory+swap usage can be accounted and limited. |
39 | - hierarchical accounting | 39 | - hierarchical accounting |
40 | - soft limit | 40 | - soft limit |
41 | - moving(recharging) account at moving a task is selectable. | 41 | - moving (recharging) account at moving a task is selectable. |
42 | - usage threshold notifier | 42 | - usage threshold notifier |
43 | - oom-killer disable knob and oom-notifier | 43 | - oom-killer disable knob and oom-notifier |
44 | - Root cgroup has no limit controls. | 44 | - Root cgroup has no limit controls. |
45 | 45 | ||
46 | Kernel memory support is work in progress, and the current version provides | 46 | Kernel memory support is a work in progress, and the current version provides |
47 | basically functionality. (See Section 2.7) | 47 | basically functionality. (See Section 2.7) |
48 | 48 | ||
49 | Brief summary of control files. | 49 | Brief summary of control files. |
@@ -144,9 +144,9 @@ Figure 1 shows the important aspects of the controller
144 | 3. Each page has a pointer to the page_cgroup, which in turn knows the | 144 | 3. Each page has a pointer to the page_cgroup, which in turn knows the |
145 | cgroup it belongs to | 145 | cgroup it belongs to |
146 | 146 | ||
147 | The accounting is done as follows: mem_cgroup_charge() is invoked to setup | 147 | The accounting is done as follows: mem_cgroup_charge() is invoked to set up |
148 | the necessary data structures and check if the cgroup that is being charged | 148 | the necessary data structures and check if the cgroup that is being charged |
149 | is over its limit. If it is then reclaim is invoked on the cgroup. | 149 | is over its limit. If it is, then reclaim is invoked on the cgroup. |
150 | More details can be found in the reclaim section of this document. | 150 | More details can be found in the reclaim section of this document. |
151 | If everything goes well, a page meta-data-structure called page_cgroup is | 151 | If everything goes well, a page meta-data-structure called page_cgroup is |
152 | updated. page_cgroup has its own LRU on cgroup. | 152 | updated. page_cgroup has its own LRU on cgroup. |
@@ -163,13 +163,13 @@ for earlier. A file page will be accounted for as Page Cache when it's
163 | inserted into inode (radix-tree). While it's mapped into the page tables of | 163 | inserted into inode (radix-tree). While it's mapped into the page tables of |
164 | processes, duplicate accounting is carefully avoided. | 164 | processes, duplicate accounting is carefully avoided. |
165 | 165 | ||
166 | A RSS page is unaccounted when it's fully unmapped. A PageCache page is | 166 | An RSS page is unaccounted when it's fully unmapped. A PageCache page is |
167 | unaccounted when it's removed from radix-tree. Even if RSS pages are fully | 167 | unaccounted when it's removed from radix-tree. Even if RSS pages are fully |
168 | unmapped (by kswapd), they may exist as SwapCache in the system until they | 168 | unmapped (by kswapd), they may exist as SwapCache in the system until they |
169 | are really freed. Such SwapCaches also also accounted. | 169 | are really freed. Such SwapCaches are also accounted. |
170 | A swapped-in page is not accounted until it's mapped. | 170 | A swapped-in page is not accounted until it's mapped. |
171 | 171 | ||
172 | Note: The kernel does swapin-readahead and read multiple swaps at once. | 172 | Note: The kernel does swapin-readahead and reads multiple swaps at once. |
173 | This means swapped-in pages may contain pages for other tasks than a task | 173 | This means swapped-in pages may contain pages for other tasks than a task |
174 | causing page fault. So, we avoid accounting at swap-in I/O. | 174 | causing page fault. So, we avoid accounting at swap-in I/O. |
175 | 175 | ||
@@ -209,7 +209,7 @@ memsw.limit_in_bytes.
209 | Example: Assume a system with 4G of swap. A task which allocates 6G of memory | 209 | Example: Assume a system with 4G of swap. A task which allocates 6G of memory |
210 | (by mistake) under 2G memory limitation will use all swap. | 210 | (by mistake) under 2G memory limitation will use all swap. |
211 | In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. | 211 | In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. |
212 | By using memsw limit, you can avoid system OOM which can be caused by swap | 212 | By using the memsw limit, you can avoid system OOM which can be caused by swap |
213 | shortage. | 213 | shortage. |
214 | 214 | ||
215 | * why 'memory+swap' rather than swap. | 215 | * why 'memory+swap' rather than swap. |
@@ -217,7 +217,7 @@ The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
217 | to move account from memory to swap...there is no change in usage of | 217 | to move account from memory to swap...there is no change in usage of |
218 | memory+swap. In other words, when we want to limit the usage of swap without | 218 | memory+swap. In other words, when we want to limit the usage of swap without |
219 | affecting global LRU, memory+swap limit is better than just limiting swap from | 219 | affecting global LRU, memory+swap limit is better than just limiting swap from |
220 | OS point of view. | 220 | an OS point of view. |
221 | 221 | ||
222 | * What happens when a cgroup hits memory.memsw.limit_in_bytes | 222 | * What happens when a cgroup hits memory.memsw.limit_in_bytes |
223 | When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out | 223 | When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out |
@@ -236,7 +236,7 @@ an OOM routine is invoked to select and kill the bulkiest task in the
236 | cgroup. (See 10. OOM Control below.) | 236 | cgroup. (See 10. OOM Control below.) |
237 | 237 | ||
238 | The reclaim algorithm has not been modified for cgroups, except that | 238 | The reclaim algorithm has not been modified for cgroups, except that |
239 | pages that are selected for reclaiming come from the per cgroup LRU | 239 | pages that are selected for reclaiming come from the per-cgroup LRU |
240 | list. | 240 | list. |
241 | 241 | ||
242 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any | 242 | NOTE: Reclaim does not work for the root cgroup, since we cannot set any |
@@ -316,7 +316,7 @@ We can check the usage:
316 | # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes | 316 | # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes |
317 | 1216512 | 317 | 1216512 |
318 | 318 | ||
319 | A successful write to this file does not guarantee a successful set of | 319 | A successful write to this file does not guarantee a successful setting of |
320 | this limit to the value written into the file. This can be due to a | 320 | this limit to the value written into the file. This can be due to a |
321 | number of factors, such as rounding up to page boundaries or the total | 321 | number of factors, such as rounding up to page boundaries or the total |
322 | availability of memory on the system. The user is required to re-read | 322 | availability of memory on the system. The user is required to re-read |
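
The re-read requirement described above can be seen with a small user-space sketch: write a value that is not a multiple of the page size, then read the file back to learn the limit that actually took effect. The cgroup path below follows the documentation's "0" example and is only an assumption about your setup.

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          /* Cgroup path follows the documentation's "0" example; adjust as needed. */
          const char *f = "/sys/fs/cgroup/memory/0/memory.limit_in_bytes";
          const char *wanted = "4194303";         /* one byte short of 4M */
          char effective[64];
          ssize_t n;
          int fd;

          fd = open(f, O_RDWR);
          write(fd, wanted, strlen(wanted));      /* the write itself may succeed... */

          lseek(fd, 0, SEEK_SET);
          n = read(fd, effective, sizeof(effective) - 1);
          if (n > 0) {                            /* ...so re-read the rounded value */
                  effective[n] = '\0';
                  printf("requested %s bytes, effective limit %s", wanted, effective);
          }
          close(fd);
          return 0;
  }
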
@@ -350,7 +350,7 @@ Trying usual test under memory controller is always helpful.
350 | 4.1 Troubleshooting | 350 | 4.1 Troubleshooting |
351 | 351 | ||
352 | Sometimes a user might find that the application under a cgroup is | 352 | Sometimes a user might find that the application under a cgroup is |
353 | terminated by OOM killer. There are several causes for this: | 353 | terminated by the OOM killer. There are several causes for this: |
354 | 354 | ||
355 | 1. The cgroup limit is too low (just too low to do anything useful) | 355 | 1. The cgroup limit is too low (just too low to do anything useful) |
356 | 2. The user is using anonymous memory and swap is turned off or too low | 356 | 2. The user is using anonymous memory and swap is turned off or too low |
@@ -358,7 +358,7 @@ terminated by OOM killer. There are several causes for this:
358 | A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of | 358 | A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of |
359 | some of the pages cached in the cgroup (page cache pages). | 359 | some of the pages cached in the cgroup (page cache pages). |
360 | 360 | ||
361 | To know what happens, disable OOM_Kill by 10. OOM Control(see below) and | 361 | To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and |
362 | seeing what happens will be helpful. | 362 | seeing what happens will be helpful. |
363 | 363 | ||
364 | 4.2 Task migration | 364 | 4.2 Task migration |
@@ -399,10 +399,10 @@ About use_hierarchy, see Section 6.
399 | 399 | ||
400 | Almost all pages tracked by this memory cgroup will be unmapped and freed. | 400 | Almost all pages tracked by this memory cgroup will be unmapped and freed. |
401 | Some pages cannot be freed because they are locked or in-use. Such pages are | 401 | Some pages cannot be freed because they are locked or in-use. Such pages are |
402 | moved to parent(if use_hierarchy==1) or root (if use_hierarchy==0) and this | 402 | moved to parent (if use_hierarchy==1) or root (if use_hierarchy==0) and this |
403 | cgroup will be empty. | 403 | cgroup will be empty. |
404 | 404 | ||
405 | Typical use case of this interface is that calling this before rmdir(). | 405 | The typical use case for this interface is before calling rmdir(). |
406 | Because rmdir() moves all pages to parent, some out-of-use page caches can be | 406 | Because rmdir() moves all pages to parent, some out-of-use page caches can be |
407 | moved to the parent. If you want to avoid that, force_empty will be useful. | 407 | moved to the parent. If you want to avoid that, force_empty will be useful. |
408 | 408 | ||
@@ -486,7 +486,7 @@ You can reset failcnt by writing 0 to failcnt file.
486 | 486 | ||
487 | For efficiency, as other kernel components, memory cgroup uses some optimization | 487 | For efficiency, as other kernel components, memory cgroup uses some optimization |
488 | to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the | 488 | to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the |
489 | method and doesn't show 'exact' value of memory(and swap) usage, it's an fuzz | 489 | method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz |
490 | value for efficient access. (Of course, when necessary, it's synchronized.) | 490 | value for efficient access. (Of course, when necessary, it's synchronized.) |
491 | If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) | 491 | If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) |
492 | value in memory.stat(see 5.2). | 492 | value in memory.stat(see 5.2). |
@@ -496,8 +496,8 @@ value in memory.stat(see 5.2).
496 | This is similar to numa_maps but operates on a per-memcg basis. This is | 496 | This is similar to numa_maps but operates on a per-memcg basis. This is |
497 | useful for providing visibility into the numa locality information within | 497 | useful for providing visibility into the numa locality information within |
498 | an memcg since the pages are allowed to be allocated from any physical | 498 | an memcg since the pages are allowed to be allocated from any physical |
499 | node. One of the usecases is evaluating application performance by | 499 | node. One of the use cases is evaluating application performance by |
500 | combining this information with the application's cpu allocation. | 500 | combining this information with the application's CPU allocation. |
501 | 501 | ||
502 | We export "total", "file", "anon" and "unevictable" pages per-node for | 502 | We export "total", "file", "anon" and "unevictable" pages per-node for |
503 | each memcg. The ouput format of memory.numa_stat is: | 503 | each memcg. The ouput format of memory.numa_stat is: |
@@ -561,10 +561,10 @@ are pushed back to their soft limits. If the soft limit of each control
561 | group is very high, they are pushed back as much as possible to make | 561 | group is very high, they are pushed back as much as possible to make |
562 | sure that one control group does not starve the others of memory. | 562 | sure that one control group does not starve the others of memory. |
563 | 563 | ||
564 | Please note that soft limits is a best effort feature, it comes with | 564 | Please note that soft limits is a best-effort feature; it comes with |
565 | no guarantees, but it does its best to make sure that when memory is | 565 | no guarantees, but it does its best to make sure that when memory is |
566 | heavily contended for, memory is allocated based on the soft limit | 566 | heavily contended for, memory is allocated based on the soft limit |
567 | hints/setup. Currently soft limit based reclaim is setup such that | 567 | hints/setup. Currently soft limit based reclaim is set up such that |
568 | it gets invoked from balance_pgdat (kswapd). | 568 | it gets invoked from balance_pgdat (kswapd). |
569 | 569 | ||
570 | 7.1 Interface | 570 | 7.1 Interface |
@@ -592,7 +592,7 @@ page tables.
592 | 592 | ||
593 | 8.1 Interface | 593 | 8.1 Interface |
594 | 594 | ||
595 | This feature is disabled by default. It can be enabled(and disabled again) by | 595 | This feature is disabled by default. It can be enabledi (and disabled again) by |
596 | writing to memory.move_charge_at_immigrate of the destination cgroup. | 596 | writing to memory.move_charge_at_immigrate of the destination cgroup. |
597 | 597 | ||
598 | If you want to enable it: | 598 | If you want to enable it: |
@@ -601,8 +601,8 @@ If you want to enable it:
601 | 601 | ||
602 | Note: Each bits of move_charge_at_immigrate has its own meaning about what type | 602 | Note: Each bits of move_charge_at_immigrate has its own meaning about what type |
603 | of charges should be moved. See 8.2 for details. | 603 | of charges should be moved. See 8.2 for details. |
604 | Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread | 604 | Note: Charges are moved only when you move mm->owner, in other words, |
605 | group. | 605 | a leader of a thread group. |
606 | Note: If we cannot find enough space for the task in the destination cgroup, we | 606 | Note: If we cannot find enough space for the task in the destination cgroup, we |
607 | try to make space by reclaiming memory. Task migration may fail if we | 607 | try to make space by reclaiming memory. Task migration may fail if we |
608 | cannot make enough space. | 608 | cannot make enough space. |
@@ -612,25 +612,25 @@ And if you want disable it again:
612 | 612 | ||
613 | # echo 0 > memory.move_charge_at_immigrate | 613 | # echo 0 > memory.move_charge_at_immigrate |
614 | 614 | ||
615 | 8.2 Type of charges which can be move | 615 | 8.2 Type of charges which can be moved |
616 | 616 | ||
617 | Each bits of move_charge_at_immigrate has its own meaning about what type of | 617 | Each bit in move_charge_at_immigrate has its own meaning about what type of |
618 | charges should be moved. But in any cases, it must be noted that an account of | 618 | charges should be moved. But in any case, it must be noted that an account of |
619 | a page or a swap can be moved only when it is charged to the task's current(old) | 619 | a page or a swap can be moved only when it is charged to the task's current |
620 | memory cgroup. | 620 | (old) memory cgroup. |
621 | 621 | ||
622 | bit | what type of charges would be moved ? | 622 | bit | what type of charges would be moved ? |
623 | -----+------------------------------------------------------------------------ | 623 | -----+------------------------------------------------------------------------ |
624 | 0 | A charge of an anonymous page(or swap of it) used by the target task. | 624 | 0 | A charge of an anonymous page (or swap of it) used by the target task. |
625 | | You must enable Swap Extension(see 2.4) to enable move of swap charges. | 625 | | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
626 | -----+------------------------------------------------------------------------ | 626 | -----+------------------------------------------------------------------------ |
627 | 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) | 627 | 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
628 | | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | 628 | | and swaps of tmpfs file) mmapped by the target task. Unlike the case of |
629 | | anonymous pages, file pages(and swaps) in the range mmapped by the task | 629 | | anonymous pages, file pages (and swaps) in the range mmapped by the task |
630 | | will be moved even if the task hasn't done page fault, i.e. they might | 630 | | will be moved even if the task hasn't done page fault, i.e. they might |
631 | | not be the task's "RSS", but other task's "RSS" that maps the same file. | 631 | | not be the task's "RSS", but other task's "RSS" that maps the same file. |
632 | | And mapcount of the page is ignored(the page can be moved even if | 632 | | And mapcount of the page is ignored (the page can be moved even if |
633 | | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to | 633 | | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to |
634 | | enable move of swap charges. | 634 | | enable move of swap charges. |
635 | 635 | ||
636 | 8.3 TODO | 636 | 8.3 TODO |
@@ -640,11 +640,11 @@ memory cgroup.
640 | 640 | ||
641 | 9. Memory thresholds | 641 | 9. Memory thresholds |
642 | 642 | ||
643 | Memory cgroup implements memory thresholds using cgroups notification | 643 | Memory cgroup implements memory thresholds using the cgroups notification |
644 | API (see cgroups.txt). It allows to register multiple memory and memsw | 644 | API (see cgroups.txt). It allows to register multiple memory and memsw |
645 | thresholds and gets notifications when it crosses. | 645 | thresholds and gets notifications when it crosses. |
646 | 646 | ||
647 | To register a threshold application need: | 647 | To register a threshold, an application must: |
648 | - create an eventfd using eventfd(2); | 648 | - create an eventfd using eventfd(2); |
649 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; | 649 | - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; |
650 | - write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to | 650 | - write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to |
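
Put together, the registration steps above amount to a handful of system calls. Below is a minimal, hedged user-space sketch; the cgroup path and the 64M threshold are illustrative assumptions, and the registration string is written to cgroup.event_control, the same control file used by the OOM example later in this document.

  #include <sys/eventfd.h>
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          const char *cg = "/sys/fs/cgroup/memory/foo";   /* hypothetical cgroup dir */
          char path[256], line[128];
          uint64_t hits;
          int efd, ufd, cfd;

          efd = eventfd(0, 0);                            /* 1) create an eventfd */

          snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
          ufd = open(path, O_RDONLY);                     /* 2) open the usage file */

          snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
          cfd = open(path, O_WRONLY);

          /* 3) "<event_fd> <fd of memory.usage_in_bytes> <threshold>" */
          snprintf(line, sizeof(line), "%d %d %llu", efd, ufd,
                   (unsigned long long)(64 << 20));       /* 64M, arbitrary */
          write(cfd, line, strlen(line));
          close(cfd);

          read(efd, &hits, sizeof(hits));                 /* blocks until crossed */
          printf("threshold crossed, %llu event(s)\n", (unsigned long long)hits);
          return 0;
  }
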
@@ -659,24 +659,24 @@ It's applicable for root and non-root cgroup.
659 | 659 | ||
660 | memory.oom_control file is for OOM notification and other controls. | 660 | memory.oom_control file is for OOM notification and other controls. |
661 | 661 | ||
662 | Memory cgroup implements OOM notifier using cgroup notification | 662 | Memory cgroup implements OOM notifier using the cgroup notification |
663 | API (See cgroups.txt). It allows to register multiple OOM notification | 663 | API (See cgroups.txt). It allows to register multiple OOM notification |
664 | delivery and gets notification when OOM happens. | 664 | delivery and gets notification when OOM happens. |
665 | 665 | ||
666 | To register a notifier, application need: | 666 | To register a notifier, an application must: |
667 | - create an eventfd using eventfd(2) | 667 | - create an eventfd using eventfd(2) |
668 | - open memory.oom_control file | 668 | - open memory.oom_control file |
669 | - write string like "<event_fd> <fd of memory.oom_control>" to | 669 | - write string like "<event_fd> <fd of memory.oom_control>" to |
670 | cgroup.event_control | 670 | cgroup.event_control |
671 | 671 | ||
672 | Application will be notified through eventfd when OOM happens. | 672 | The application will be notified through eventfd when OOM happens. |
673 | OOM notification doesn't work for root cgroup. | 673 | OOM notification doesn't work for the root cgroup. |
674 | 674 | ||
675 | You can disable OOM-killer by writing "1" to memory.oom_control file, as: | 675 | You can disable the OOM-killer by writing "1" to memory.oom_control file, as: |
676 | 676 | ||
677 | #echo 1 > memory.oom_control | 677 | #echo 1 > memory.oom_control |
678 | 678 | ||
679 | This operation is only allowed to the top cgroup of sub-hierarchy. | 679 | This operation is only allowed to the top cgroup of a sub-hierarchy. |
680 | If OOM-killer is disabled, tasks under cgroup will hang/sleep | 680 | If OOM-killer is disabled, tasks under cgroup will hang/sleep |
681 | in memory cgroup's OOM-waitqueue when they request accountable memory. | 681 | in memory cgroup's OOM-waitqueue when they request accountable memory. |
682 | 682 | ||
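
OOM notifier registration follows the same eventfd pattern as the threshold sketch earlier; only the watched file differs. The helper below reuses that sketch's headers, and the cgroup path argument is again only an assumption for illustration.

  /* Same eventfd pattern as the threshold sketch above, but watching
   * memory.oom_control instead of memory.usage_in_bytes. */
  static int register_oom_notifier(const char *cg)
  {
          char path[256], line[64];
          int efd, ofd, cfd;

          efd = eventfd(0, 0);
          snprintf(path, sizeof(path), "%s/memory.oom_control", cg);
          ofd = open(path, O_RDONLY);
          snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
          cfd = open(path, O_WRONLY);

          /* "<event_fd> <fd of memory.oom_control>" */
          snprintf(line, sizeof(line), "%d %d", efd, ofd);
          write(cfd, line, strlen(line));
          close(cfd);

          return efd;     /* read(efd, ...) blocks until an OOM event arrives */
  }
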
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fb0a6aeb936c..a1793d670cd0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -33,7 +33,7 @@ Table of Contents
33 | 2 Modifying System Parameters | 33 | 2 Modifying System Parameters |
34 | 34 | ||
35 | 3 Per-Process Parameters | 35 | 3 Per-Process Parameters |
36 | 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer | 36 | 3.1 /proc/<pid>/oom_score_adj - Adjust the oom-killer |
37 | score | 37 | score |
38 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score | 38 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score |
39 | 3.3 /proc/<pid>/io - Display the IO accounting fields | 39 | 3.3 /proc/<pid>/io - Display the IO accounting fields |
@@ -1320,10 +1320,10 @@ of the kernel.
1320 | CHAPTER 3: PER-PROCESS PARAMETERS | 1320 | CHAPTER 3: PER-PROCESS PARAMETERS |
1321 | ------------------------------------------------------------------------------ | 1321 | ------------------------------------------------------------------------------ |
1322 | 1322 | ||
1323 | 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score | 1323 | 3.1 /proc/<pid>/oom_score_adj- Adjust the oom-killer score |
1324 | -------------------------------------------------------------------------------- | 1324 | -------------------------------------------------------------------------------- |
1325 | 1325 | ||
1326 | These file can be used to adjust the badness heuristic used to select which | 1326 | This file can be used to adjust the badness heuristic used to select which |
1327 | process gets killed in out of memory conditions. | 1327 | process gets killed in out of memory conditions. |
1328 | 1328 | ||
1329 | The badness heuristic assigns a value to each candidate task ranging from 0 | 1329 | The badness heuristic assigns a value to each candidate task ranging from 0 |
@@ -1361,22 +1361,10 @@ same system, cpuset, mempolicy, or memory controller resources to use at least
1361 | equivalent to discounting 50% of the task's allowed memory from being considered | 1361 | equivalent to discounting 50% of the task's allowed memory from being considered |
1362 | as scoring against the task. | 1362 | as scoring against the task. |
1363 | 1363 | ||
1364 | For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also | ||
1365 | be used to tune the badness score. Its acceptable values range from -16 | ||
1366 | (OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 | ||
1367 | (OOM_DISABLE) to disable oom killing entirely for that task. Its value is | ||
1368 | scaled linearly with /proc/<pid>/oom_score_adj. | ||
1369 | |||
1370 | Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the | ||
1371 | other with its scaled value. | ||
1372 | |||
1373 | The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last | 1364 | The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last |
1374 | value set by a CAP_SYS_RESOURCE process. To reduce the value any lower | 1365 | value set by a CAP_SYS_RESOURCE process. To reduce the value any lower |
1375 | requires CAP_SYS_RESOURCE. | 1366 | requires CAP_SYS_RESOURCE. |
1376 | 1367 | ||
1377 | NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see | ||
1378 | Documentation/feature-removal-schedule.txt. | ||
1379 | |||
1380 | Caveat: when a parent task is selected, the oom killer will sacrifice any first | 1368 | Caveat: when a parent task is selected, the oom killer will sacrifice any first |
1381 | generation children with separate address spaces instead, if possible. This | 1369 | generation children with separate address spaces instead, if possible. This |
1382 | avoids servers and important system daemons from being killed and loses the | 1370 | avoids servers and important system daemons from being killed and loses the |
@@ -1387,9 +1375,7 @@ minimal amount of work.
1387 | ------------------------------------------------------------- | 1375 | ------------------------------------------------------------- |
1388 | 1376 | ||
1389 | This file can be used to check the current score used by the oom-killer is for | 1377 | This file can be used to check the current score used by the oom-killer is for |
1390 | any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which | 1378 | any given <pid>. |
1391 | process should be killed in an out-of-memory situation. | ||
1392 | |||
1393 | 1379 | ||
1394 | 3.3 /proc/<pid>/io - Display the IO accounting fields | 1380 | 3.3 /proc/<pid>/io - Display the IO accounting fields |
1395 | ------------------------------------------------------- | 1381 | ------------------------------------------------------- |
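
As a concrete illustration of sections 3.1 and 3.2 above, the sketch below lowers the calling process's oom_score_adj and then reads back the resulting oom_score. The -500 value is only an example; as noted above, lowering the value below one previously set by a CAP_SYS_RESOURCE process itself requires CAP_SYS_RESOURCE.

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          char buf[32];
          ssize_t n;
          int fd;

          /* Make the current process less attractive to the oom killer. */
          fd = open("/proc/self/oom_score_adj", O_WRONLY);
          write(fd, "-500", 4);
          close(fd);

          /* Re-read the badness score the oom killer would now use (3.2). */
          fd = open("/proc/self/oom_score", O_RDONLY);
          n = read(fd, buf, sizeof(buf) - 1);
          if (n > 0) {
                  buf[n] = '\0';
                  printf("oom_score: %s", buf);
          }
          close(fd);
          return 0;
  }
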
diff --git a/Documentation/memory.txt b/Documentation/memory.txt
deleted file mode 100644
index 802efe58647c..000000000000
--- a/Documentation/memory.txt
+++ /dev/null
@@ -1,33 +0,0 @@
1 | There are several classic problems related to memory on Linux | ||
2 | systems. | ||
3 | |||
4 | 1) There are some motherboards that will not cache above | ||
5 | a certain quantity of memory. If you have one of these | ||
6 | motherboards, your system will be SLOWER, not faster | ||
7 | as you add more memory. Consider exchanging your | ||
8 | motherboard. | ||
9 | |||
10 | All of these problems can be addressed with the "mem=XXXM" boot option | ||
11 | (where XXX is the size of RAM to use in megabytes). | ||
12 | It can also tell Linux to use less memory than is actually installed. | ||
13 | If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid | ||
14 | physical address space collisions. | ||
15 | |||
16 | See the documentation of your boot loader (LILO, grub, loadlin, etc.) about | ||
17 | how to pass options to the kernel. | ||
18 | |||
19 | There are other memory problems which Linux cannot deal with. Random | ||
20 | corruption of memory is usually a sign of serious hardware trouble. | ||
21 | Try: | ||
22 | |||
23 | * Reducing memory settings in the BIOS to the most conservative | ||
24 | timings. | ||
25 | |||
26 | * Adding a cooling fan. | ||
27 | |||
28 | * Not overclocking your CPU. | ||
29 | |||
30 | * Having the memory tested in a memory tester or exchanged | ||
31 | with the vendor. Consider testing it with memtest86 yourself. | ||
32 | |||
33 | * Exchanging your CPU, cache, or motherboard for one that works. | ||
diff --git a/Documentation/prio_tree.txt b/Documentation/prio_tree.txt
deleted file mode 100644
index 3aa68f9a117b..000000000000
--- a/Documentation/prio_tree.txt
+++ /dev/null
@@ -1,107 +0,0 @@
1 | The prio_tree.c code indexes vmas using 3 different indexes: | ||
2 | * heap_index = vm_pgoff + vm_size_in_pages : end_vm_pgoff | ||
3 | * radix_index = vm_pgoff : start_vm_pgoff | ||
4 | * size_index = vm_size_in_pages | ||
5 | |||
6 | A regular radix-priority-search-tree indexes vmas using only heap_index and | ||
7 | radix_index. The conditions for indexing are: | ||
8 | * ->heap_index >= ->left->heap_index && | ||
9 | ->heap_index >= ->right->heap_index | ||
10 | * if (->heap_index == ->left->heap_index) | ||
11 | then ->radix_index < ->left->radix_index; | ||
12 | * if (->heap_index == ->right->heap_index) | ||
13 | then ->radix_index < ->right->radix_index; | ||
14 | * nodes are hashed to left or right subtree using radix_index | ||
15 | similar to a pure binary radix tree. | ||
16 | |||
17 | A regular radix-priority-search-tree helps to store and query | ||
18 | intervals (vmas). However, a regular radix-priority-search-tree is only | ||
19 | suitable for storing vmas with different radix indices (vm_pgoff). | ||
20 | |||
21 | Therefore, the prio_tree.c extends the regular radix-priority-search-tree | ||
22 | to handle many vmas with the same vm_pgoff. Such vmas are handled in | ||
23 | 2 different ways: 1) All vmas with the same radix _and_ heap indices are | ||
24 | linked using vm_set.list, 2) if there are many vmas with the same radix | ||
25 | index, but different heap indices and if the regular radix-priority-search | ||
26 | tree cannot index them all, we build an overflow-sub-tree that indexes such | ||
27 | vmas using heap and size indices instead of heap and radix indices. For | ||
28 | example, in the figure below some vmas with vm_pgoff = 0 (zero) are | ||
29 | indexed by regular radix-priority-search-tree whereas others are pushed | ||
30 | into an overflow-subtree. Note that all vmas in an overflow-sub-tree have | ||
31 | the same vm_pgoff (radix_index) and if necessary we build different | ||
32 | overflow-sub-trees to handle each possible radix_index. For example, | ||
33 | in figure we have 3 overflow-sub-trees corresponding to radix indices | ||
34 | 0, 2, and 4. | ||
35 | |||
36 | In the final tree the first few (prio_tree_root->index_bits) levels | ||
37 | are indexed using heap and radix indices whereas the overflow-sub-trees below | ||
38 | those levels (i.e. levels prio_tree_root->index_bits + 1 and higher) are | ||
39 | indexed using heap and size indices. In overflow-sub-trees the size_index | ||
40 | is used for hashing the nodes to appropriate places. | ||
41 | |||
42 | Now, an example prio_tree: | ||
43 | |||
44 | vmas are represented [radix_index, size_index, heap_index] | ||
45 | i.e., [start_vm_pgoff, vm_size_in_pages, end_vm_pgoff] | ||
46 | |||
47 | level prio_tree_root->index_bits = 3 | ||
48 | ----- | ||
49 | _ | ||
50 | 0 [0,7,7] | | ||
51 | / \ | | ||
52 | ------------------ ------------ | Regular | ||
53 | / \ | radix priority | ||
54 | 1 [1,6,7] [4,3,7] | search tree | ||
55 | / \ / \ | | ||
56 | ------- ----- ------ ----- | heap-and-radix | ||
57 | / \ / \ | indexed | ||
58 | 2 [0,6,6] [2,5,7] [5,2,7] [6,1,7] | | ||
59 | / \ / \ / \ / \ | | ||
60 | 3 [0,5,5] [1,5,6] [2,4,6] [3,4,7] [4,2,6] [5,1,6] [6,0,6] [7,0,7] | | ||
61 | / / / _ | ||
62 | / / / _ | ||
63 | 4 [0,4,4] [2,3,5] [4,1,5] | | ||
64 | / / / | | ||
65 | 5 [0,3,3] [2,2,4] [4,0,4] | Overflow-sub-trees | ||
66 | / / | | ||
67 | 6 [0,2,2] [2,1,3] | heap-and-size | ||
68 | / / | indexed | ||
69 | 7 [0,1,1] [2,0,2] | | ||
70 | / | | ||
71 | 8 [0,0,0] | | ||
72 | _ | ||
73 | |||
74 | Note that we use prio_tree_root->index_bits to optimize the height | ||
75 | of the heap-and-radix indexed tree. Since prio_tree_root->index_bits is | ||
76 | set according to the maximum end_vm_pgoff mapped, we are sure that all | ||
77 | bits (in vm_pgoff) above prio_tree_root->index_bits are 0 (zero). Therefore, | ||
78 | we only use the first prio_tree_root->index_bits as radix_index. | ||
79 | Whenever index_bits is increased in prio_tree_expand, we shuffle the tree | ||
80 | to make sure that the first prio_tree_root->index_bits levels of the tree | ||
81 | is indexed properly using heap and radix indices. | ||
82 | |||
83 | We do not optimize the height of overflow-sub-trees using index_bits. | ||
84 | The reason is: there can be many such overflow-sub-trees and all of | ||
85 | them have to be suffled whenever the index_bits increases. This may involve | ||
86 | walking the whole prio_tree in prio_tree_insert->prio_tree_expand code | ||
87 | path which is not desirable. Hence, we do not optimize the height of the | ||
88 | heap-and-size indexed overflow-sub-trees using prio_tree->index_bits. | ||
89 | Instead the overflow sub-trees are indexed using full BITS_PER_LONG bits | ||
90 | of size_index. This may lead to skewed sub-trees because most of the | ||
91 | higher significant bits of the size_index are likely to be 0 (zero). In | ||
92 | the example above, all 3 overflow-sub-trees are skewed. This may marginally | ||
93 | affect the performance. However, processes rarely map many vmas with the | ||
94 | same start_vm_pgoff but different end_vm_pgoffs. Therefore, we normally | ||
95 | do not require overflow-sub-trees to index all vmas. | ||
96 | |||
97 | From the above discussion it is clear that the maximum height of | ||
98 | a prio_tree can be prio_tree_root->index_bits + BITS_PER_LONG. | ||
99 | However, in most of the common cases we do not need overflow-sub-trees, | ||
100 | so the tree height in the common cases will be prio_tree_root->index_bits. | ||
101 | |||
102 | It is fair to mention here that the prio_tree_root->index_bits | ||
103 | is increased on demand, however, the index_bits is not decreased when | ||
104 | vmas are removed from the prio_tree. That's tricky to do. Hence, it's | ||
105 | left as a home work problem. | ||
106 | |||
107 | |||
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index 8d32d85a5234..61b6c48871a0 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -193,24 +193,55 @@ Example:
193 | Support for Augmented rbtrees | 193 | Support for Augmented rbtrees |
194 | ----------------------------- | 194 | ----------------------------- |
195 | 195 | ||
196 | Augmented rbtree is an rbtree with "some" additional data stored in each node. | 196 | Augmented rbtree is an rbtree with "some" additional data stored in |
197 | This data can be used to augment some new functionality to rbtree. | 197 | each node, where the additional data for node N must be a function of |
198 | Augmented rbtree is an optional feature built on top of basic rbtree | 198 | the contents of all nodes in the subtree rooted at N. This data can |
199 | infrastructure. An rbtree user who wants this feature will have to call the | 199 | be used to augment some new functionality to rbtree. Augmented rbtree |
200 | augmentation functions with the user provided augmentation callback | 200 | is an optional feature built on top of basic rbtree infrastructure. |
201 | when inserting and erasing nodes. | 201 | An rbtree user who wants this feature will have to call the augmentation |
202 | 202 | functions with the user provided augmentation callback when inserting | |
203 | On insertion, the user must call rb_augment_insert() once the new node is in | 203 | and erasing nodes. |
204 | place. This will cause the augmentation function callback to be called for | 204 | |
205 | each node between the new node and the root which has been affected by the | 205 | C files implementing augmented rbtree manipulation must include |
206 | insertion. | 206 | <linux/rbtree_augmented.h> instead of <linus/rbtree.h>. Note that |
207 | 207 | linux/rbtree_augmented.h exposes some rbtree implementations details | |
208 | When erasing a node, the user must call rb_augment_erase_begin() first to | 208 | you are not expected to rely on; please stick to the documented APIs |
209 | retrieve the deepest node on the rebalance path. Then, after erasing the | 209 | there and do not include <linux/rbtree_augmented.h> from header files |
210 | original node, the user must call rb_augment_erase_end() with the deepest | 210 | either so as to minimize chances of your users accidentally relying on |
211 | node found earlier. This will cause the augmentation function to be called | 211 | such implementation details. |
212 | for each affected node between the deepest node and the root. | 212 | |
213 | 213 | On insertion, the user must update the augmented information on the path | |
214 | leading to the inserted node, then call rb_link_node() as usual and | ||
215 | rb_augment_inserted() instead of the usual rb_insert_color() call. | ||
216 | If rb_augment_inserted() rebalances the rbtree, it will callback into | ||
217 | a user provided function to update the augmented information on the | ||
218 | affected subtrees. | ||
219 | |||
220 | When erasing a node, the user must call rb_erase_augmented() instead of | ||
221 | rb_erase(). rb_erase_augmented() calls back into user provided functions | ||
222 | to updated the augmented information on affected subtrees. | ||
223 | |||
224 | In both cases, the callbacks are provided through struct rb_augment_callbacks. | ||
225 | 3 callbacks must be defined: | ||
226 | |||
227 | - A propagation callback, which updates the augmented value for a given | ||
228 | node and its ancestors, up to a given stop point (or NULL to update | ||
229 | all the way to the root). | ||
230 | |||
231 | - A copy callback, which copies the augmented value for a given subtree | ||
232 | to a newly assigned subtree root. | ||
233 | |||
234 | - A tree rotation callback, which copies the augmented value for a given | ||
235 | subtree to a newly assigned subtree root AND recomputes the augmented | ||
236 | information for the former subtree root. | ||
237 | |||
238 | The compiled code for rb_erase_augmented() may inline the propagation and | ||
239 | copy callbacks, which results in a large function, so each augmented rbtree | ||
240 | user should have a single rb_erase_augmented() call site in order to limit | ||
241 | compiled code size. | ||
242 | |||
243 | |||
244 | Sample usage: | ||
214 | 245 | ||
215 | Interval tree is an example of augmented rb tree. Reference - | 246 | Interval tree is an example of augmented rb tree. Reference - |
216 | "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. | 247 | "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. |
@@ -230,26 +261,132 @@ and its immediate children. And this will be used in O(log n) lookup
230 | for lowest match (lowest start address among all possible matches) | 261 | for lowest match (lowest start address among all possible matches) |
231 | with something like: | 262 | with something like: |
232 | 263 | ||
233 | find_lowest_match(lo, hi, node) | 264 | struct interval_tree_node * |
265 | interval_tree_first_match(struct rb_root *root, | ||
266 | unsigned long start, unsigned long last) | ||
234 | { | 267 | { |
235 | lowest_match = NULL; | 268 | struct interval_tree_node *node; |
236 | while (node) { | 269 | |
237 | if (max_hi(node->left) > lo) { | 270 | if (!root->rb_node) |
238 | // Lowest overlap if any must be on left side | 271 | return NULL; |
239 | node = node->left; | 272 | node = rb_entry(root->rb_node, struct interval_tree_node, rb); |
240 | } else if (overlap(lo, hi, node)) { | 273 | |
241 | lowest_match = node; | 274 | while (true) { |
242 | break; | 275 | if (node->rb.rb_left) { |
243 | } else if (lo > node->lo) { | 276 | struct interval_tree_node *left = |
244 | // Lowest overlap if any must be on right side | 277 | rb_entry(node->rb.rb_left, |
245 | node = node->right; | 278 | struct interval_tree_node, rb); |
246 | } else { | 279 | if (left->__subtree_last >= start) { |
247 | break; | 280 | /* |
281 | * Some nodes in left subtree satisfy Cond2. | ||
282 | * Iterate to find the leftmost such node N. | ||
283 | * If it also satisfies Cond1, that's the match | ||
284 | * we are looking for. Otherwise, there is no | ||
285 | * matching interval as nodes to the right of N | ||
286 | * can't satisfy Cond1 either. | ||
287 | */ | ||
288 | node = left; | ||
289 | continue; | ||
290 | } | ||
248 | } | 291 | } |
292 | if (node->start <= last) { /* Cond1 */ | ||
293 | if (node->last >= start) /* Cond2 */ | ||
294 | return node; /* node is leftmost match */ | ||
295 | if (node->rb.rb_right) { | ||
296 | node = rb_entry(node->rb.rb_right, | ||
297 | struct interval_tree_node, rb); | ||
298 | if (node->__subtree_last >= start) | ||
299 | continue; | ||
300 | } | ||
301 | } | ||
302 | return NULL; /* No match */ | ||
303 | } | ||
304 | } | ||
305 | |||
306 | Insertion/removal are defined using the following augmented callbacks: | ||
307 | |||
308 | static inline unsigned long | ||
309 | compute_subtree_last(struct interval_tree_node *node) | ||
310 | { | ||
311 | unsigned long max = node->last, subtree_last; | ||
312 | if (node->rb.rb_left) { | ||
313 | subtree_last = rb_entry(node->rb.rb_left, | ||
314 | struct interval_tree_node, rb)->__subtree_last; | ||
315 | if (max < subtree_last) | ||
316 | max = subtree_last; | ||
317 | } | ||
318 | if (node->rb.rb_right) { | ||
319 | subtree_last = rb_entry(node->rb.rb_right, | ||
320 | struct interval_tree_node, rb)->__subtree_last; | ||
321 | if (max < subtree_last) | ||
322 | max = subtree_last; | ||
323 | } | ||
324 | return max; | ||
325 | } | ||
326 | |||
327 | static void augment_propagate(struct rb_node *rb, struct rb_node *stop) | ||
328 | { | ||
329 | while (rb != stop) { | ||
330 | struct interval_tree_node *node = | ||
331 | rb_entry(rb, struct interval_tree_node, rb); | ||
332 | unsigned long subtree_last = compute_subtree_last(node); | ||
333 | if (node->__subtree_last == subtree_last) | ||
334 | break; | ||
335 | node->__subtree_last = subtree_last; | ||
336 | rb = rb_parent(&node->rb); | ||
249 | } | 337 | } |
250 | return lowest_match; | ||
251 | } | 338 | } |
252 | 339 | ||
253 | Finding exact match will be to first find lowest match and then to follow | 340 | static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) |
254 | successor nodes looking for exact match, until the start of a node is beyond | 341 | { |
255 | the hi value we are looking for. | 342 | struct interval_tree_node *old = |
343 | rb_entry(rb_old, struct interval_tree_node, rb); | ||
344 | struct interval_tree_node *new = | ||
345 | rb_entry(rb_new, struct interval_tree_node, rb); | ||
346 | |||
347 | new->__subtree_last = old->__subtree_last; | ||
348 | } | ||
349 | |||
350 | static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) | ||
351 | { | ||
352 | struct interval_tree_node *old = | ||
353 | rb_entry(rb_old, struct interval_tree_node, rb); | ||
354 | struct interval_tree_node *new = | ||
355 | rb_entry(rb_new, struct interval_tree_node, rb); | ||
356 | |||
357 | new->__subtree_last = old->__subtree_last; | ||
358 | old->__subtree_last = compute_subtree_last(old); | ||
359 | } | ||
360 | |||
361 | static const struct rb_augment_callbacks augment_callbacks = { | ||
362 | augment_propagate, augment_copy, augment_rotate | ||
363 | }; | ||
364 | |||
365 | void interval_tree_insert(struct interval_tree_node *node, | ||
366 | struct rb_root *root) | ||
367 | { | ||
368 | struct rb_node **link = &root->rb_node, *rb_parent = NULL; | ||
369 | unsigned long start = node->start, last = node->last; | ||
370 | struct interval_tree_node *parent; | ||
371 | |||
372 | while (*link) { | ||
373 | rb_parent = *link; | ||
374 | parent = rb_entry(rb_parent, struct interval_tree_node, rb); | ||
375 | if (parent->__subtree_last < last) | ||
376 | parent->__subtree_last = last; | ||
377 | if (start < parent->start) | ||
378 | link = &parent->rb.rb_left; | ||
379 | else | ||
380 | link = &parent->rb.rb_right; | ||
381 | } | ||
382 | |||
383 | node->__subtree_last = last; | ||
384 | rb_link_node(&node->rb, rb_parent, link); | ||
385 | rb_insert_augmented(&node->rb, root, &augment_callbacks); | ||
386 | } | ||
387 | |||
388 | void interval_tree_remove(struct interval_tree_node *node, | ||
389 | struct rb_root *root) | ||
390 | { | ||
391 | rb_erase_augmented(&node->rb, root, &augment_callbacks); | ||
392 | } | ||
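
A hypothetical caller of the sample routines above might look like the kernel-style sketch below. It assumes the struct interval_tree_node layout used throughout the sample (start, last, __subtree_last and an embedded struct rb_node rb) and is meant as an illustration rather than standalone code.

  /* Hypothetical caller of the sample interval tree code above. */
  static struct rb_root itree_root = RB_ROOT;

  static void interval_tree_example(void)
  {
          static struct interval_tree_node a = { .start = 10, .last = 20 };
          static struct interval_tree_node b = { .start = 30, .last = 40 };
          struct interval_tree_node *match;

          interval_tree_insert(&a, &itree_root);
          interval_tree_insert(&b, &itree_root);

          /* Leftmost node overlapping [15, 35]; here that is 'a'. */
          match = interval_tree_first_match(&itree_root, 15, 35);

          if (match)
                  interval_tree_remove(match, &itree_root);
  }
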
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index fa206cccf89f..a68db7692ee8 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -197,12 +197,8 @@ the pages are also "rescued" from the unevictable list in the process of
197 | freeing them. | 197 | freeing them. |
198 | 198 | ||
199 | page_evictable() also checks for mlocked pages by testing an additional page | 199 | page_evictable() also checks for mlocked pages by testing an additional page |
200 | flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked, | 200 | flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is |
201 | and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is | 201 | faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED. |
202 | VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and | ||
203 | update the appropriate statistics if the vma is VM_LOCKED. This method allows | ||
204 | efficient "culling" of pages in the fault path that are being faulted in to | ||
205 | VM_LOCKED VMAs. | ||
206 | 202 | ||
207 | 203 | ||
208 | VMSCAN'S HANDLING OF UNEVICTABLE PAGES | 204 | VMSCAN'S HANDLING OF UNEVICTABLE PAGES |
@@ -371,8 +367,8 @@ mlock_fixup() filters several classes of "special" VMAs:
371 | mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to | 367 | mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to |
372 | allocate the huge pages and populate the ptes. | 368 | allocate the huge pages and populate the ptes. |
373 | 369 | ||
374 | 3) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of | 370 | 3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, |
375 | kernel pages, such as the VDSO page, relay channel pages, etc. These pages | 371 | such as the VDSO page, relay channel pages, etc. These pages |
376 | are inherently unevictable and are not managed on the LRU lists. | 372 | are inherently unevictable and are not managed on the LRU lists. |
377 | mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls | 373 | mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls |
378 | make_pages_present() to populate the ptes. | 374 | make_pages_present() to populate the ptes. |
@@ -651,7 +647,7 @@ PAGE RECLAIM IN shrink_*_list()
651 | ------------------------------- | 647 | ------------------------------- |
652 | 648 | ||
653 | shrink_active_list() culls any obviously unevictable pages - i.e. | 649 | shrink_active_list() culls any obviously unevictable pages - i.e. |
654 | !page_evictable(page, NULL) - diverting these to the unevictable list. | 650 | !page_evictable(page) - diverting these to the unevictable list. |
655 | However, shrink_active_list() only sees unevictable pages that made it onto the | 651 | However, shrink_active_list() only sees unevictable pages that made it onto the |
656 | active/inactive lru lists. Note that these pages do not have PageUnevictable | 652 | active/inactive lru lists. Note that these pages do not have PageUnevictable |
657 | set - otherwise they would be on the unevictable list and shrink_active_list | 653 | set - otherwise they would be on the unevictable list and shrink_active_list |
diff --git a/MAINTAINERS b/MAINTAINERS
index ab98a99bee92..eae3cd86831e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7039,6 +7039,14 @@ S: Maintained
7039 | F: Documentation/svga.txt | 7039 | F: Documentation/svga.txt |
7040 | F: arch/x86/boot/video* | 7040 | F: arch/x86/boot/video* |
7041 | 7041 | ||
7042 | SWIOTLB SUBSYSTEM | ||
7043 | M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | ||
7044 | L: linux-kernel@vger.kernel.org | ||
7045 | S: Supported | ||
7046 | F: lib/swiotlb.c | ||
7047 | F: arch/*/kernel/pci-swiotlb.c | ||
7048 | F: include/linux/swiotlb.h | ||
7049 | |||
7042 | SYSV FILESYSTEM | 7050 | SYSV FILESYSTEM |
7043 | M: Christoph Hellwig <hch@infradead.org> | 7051 | M: Christoph Hellwig <hch@infradead.org> |
7044 | S: Maintained | 7052 | S: Maintained |
diff --git a/arch/Kconfig b/arch/Kconfig
index a62965d057f6..550cce4dd648 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -313,4 +313,7 @@ config HAVE_IRQ_TIME_ACCOUNTING
313 | Archs need to ensure they use a high enough resolution clock to | 313 | Archs need to ensure they use a high enough resolution clock to |
314 | support irq time accounting and then call enable_sched_clock_irqtime(). | 314 | support irq time accounting and then call enable_sched_clock_irqtime(). |
315 | 315 | ||
316 | config HAVE_ARCH_TRANSPARENT_HUGEPAGE | ||
317 | bool | ||
318 | |||
316 | source "kernel/gcov/Kconfig" | 319 | source "kernel/gcov/Kconfig" |
diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c
index 53649c7d0068..b51f7b4818cd 100644
--- a/arch/alpha/kernel/pci-sysfs.c
+++ b/arch/alpha/kernel/pci-sysfs.c
@@ -26,7 +26,7 @@ static int hose_mmap_page_range(struct pci_controller *hose,
26 | base = sparse ? hose->sparse_io_base : hose->dense_io_base; | 26 | base = sparse ? hose->sparse_io_base : hose->dense_io_base; |
27 | 27 | ||
28 | vma->vm_pgoff += base >> PAGE_SHIFT; | 28 | vma->vm_pgoff += base >> PAGE_SHIFT; |
29 | vma->vm_flags |= (VM_IO | VM_RESERVED); | 29 | vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; |
30 | 30 | ||
31 | return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, | 31 | return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, |
32 | vma->vm_end - vma->vm_start, | 32 | vma->vm_end - vma->vm_start, |
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 6d2f7f5c0036..2867a7742306 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -25,6 +25,7 @@ config ARM
25 | select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) | 25 | select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) |
26 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE | 26 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE |
27 | select HAVE_GENERIC_DMA_COHERENT | 27 | select HAVE_GENERIC_DMA_COHERENT |
28 | select HAVE_DEBUG_KMEMLEAK | ||
28 | select HAVE_KERNEL_GZIP | 29 | select HAVE_KERNEL_GZIP |
29 | select HAVE_KERNEL_LZO | 30 | select HAVE_KERNEL_LZO |
30 | select HAVE_KERNEL_LZMA | 31 | select HAVE_KERNEL_LZMA |
@@ -39,6 +40,7 @@ config ARM
39 | select HARDIRQS_SW_RESEND | 40 | select HARDIRQS_SW_RESEND |
40 | select GENERIC_IRQ_PROBE | 41 | select GENERIC_IRQ_PROBE |
41 | select GENERIC_IRQ_SHOW | 42 | select GENERIC_IRQ_SHOW |
43 | select HAVE_UID16 | ||
42 | select ARCH_WANT_IPC_PARSE_VERSION | 44 | select ARCH_WANT_IPC_PARSE_VERSION |
43 | select HARDIRQS_SW_RESEND | 45 | select HARDIRQS_SW_RESEND |
44 | select CPU_PM if (SUSPEND || CPU_IDLE) | 46 | select CPU_PM if (SUSPEND || CPU_IDLE) |
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 7599e2625c7d..2a5907b5c8d2 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c | |||
@@ -134,7 +134,6 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, | |||
134 | { | 134 | { |
135 | struct mm_struct *mm = vma->vm_mm; | 135 | struct mm_struct *mm = vma->vm_mm; |
136 | struct vm_area_struct *mpnt; | 136 | struct vm_area_struct *mpnt; |
137 | struct prio_tree_iter iter; | ||
138 | unsigned long offset; | 137 | unsigned long offset; |
139 | pgoff_t pgoff; | 138 | pgoff_t pgoff; |
140 | int aliases = 0; | 139 | int aliases = 0; |
@@ -147,7 +146,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, | |||
147 | * cache coherency. | 146 | * cache coherency. |
148 | */ | 147 | */ |
149 | flush_dcache_mmap_lock(mapping); | 148 | flush_dcache_mmap_lock(mapping); |
150 | vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { | 149 | vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { |
151 | /* | 150 | /* |
152 | * If this VMA is not in our MM, we can ignore it. | 151 | * If this VMA is not in our MM, we can ignore it. |
153 | * Note that we intentionally mask out the VMA | 152 | * Note that we intentionally mask out the VMA |
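The prio-tree to interval-tree conversion removes the external iterator: the walk now keeps its state in the vma's embedded node, so the struct prio_tree_iter local disappears from every caller. A minimal sketch of the new idiom, assuming a shared file mapping and a single-page pgoff range as in the hunk above:

	/* Visit every vma of a shared file mapping that covers page index
	 * pgoff; no separate iterator object is needed any more. */
	static void walk_mapping_at(struct address_space *mapping, pgoff_t pgoff)
	{
		struct vm_area_struct *vma;

		flush_dcache_mmap_lock(mapping);
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
			/* per-vma work, e.g. compute the aliased user address */
		}
		flush_dcache_mmap_unlock(mapping);
	}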
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c3bd83450227..5dbf13f954f6 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c | |||
@@ -336,6 +336,7 @@ retry: | |||
336 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | 336 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk |
337 | * of starvation. */ | 337 | * of starvation. */ |
338 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 338 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
339 | flags |= FAULT_FLAG_TRIED; | ||
339 | goto retry; | 340 | goto retry; |
340 | } | 341 | } |
341 | } | 342 | } |
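Every fault handler touched in this series follows the same retry protocol: after the first VM_FAULT_RETRY the handler clears FAULT_FLAG_ALLOW_RETRY so it cannot loop forever, and now also sets the new FAULT_FLAG_TRIED so the core mm code can tell the second attempt apart. A condensed sketch of the shared pattern (mmap_sem handling differs per architecture and is omitted here):

	if (fault & VM_FAULT_RETRY) {
		if (flags & FAULT_FLAG_ALLOW_RETRY) {
			/* permit exactly one retry, to avoid starvation */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			/* new in this series: mark the second attempt */
			flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}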
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 40ca11ed6e5f..1c8f7f564175 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c | |||
@@ -196,7 +196,6 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p | |||
196 | { | 196 | { |
197 | struct mm_struct *mm = current->active_mm; | 197 | struct mm_struct *mm = current->active_mm; |
198 | struct vm_area_struct *mpnt; | 198 | struct vm_area_struct *mpnt; |
199 | struct prio_tree_iter iter; | ||
200 | pgoff_t pgoff; | 199 | pgoff_t pgoff; |
201 | 200 | ||
202 | /* | 201 | /* |
@@ -208,7 +207,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p | |||
208 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 207 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
209 | 208 | ||
210 | flush_dcache_mmap_lock(mapping); | 209 | flush_dcache_mmap_lock(mapping); |
211 | vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { | 210 | vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { |
212 | unsigned long offset; | 211 | unsigned long offset; |
213 | 212 | ||
214 | /* | 213 | /* |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 767ba5685454..7ff68c946073 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig | |||
@@ -10,6 +10,8 @@ config ARM64 | |||
10 | select GENERIC_TIME_VSYSCALL | 10 | select GENERIC_TIME_VSYSCALL |
11 | select HARDIRQS_SW_RESEND | 11 | select HARDIRQS_SW_RESEND |
12 | select HAVE_ARCH_TRACEHOOK | 12 | select HAVE_ARCH_TRACEHOOK |
13 | select HAVE_DEBUG_BUGVERBOSE | ||
14 | select HAVE_DEBUG_KMEMLEAK | ||
13 | select HAVE_DMA_API_DEBUG | 15 | select HAVE_DMA_API_DEBUG |
14 | select HAVE_DMA_ATTRS | 16 | select HAVE_DMA_ATTRS |
15 | select HAVE_GENERIC_DMA_COHERENT | 17 | select HAVE_GENERIC_DMA_COHERENT |
@@ -26,6 +28,7 @@ config ARM64 | |||
26 | select PERF_USE_VMALLOC | 28 | select PERF_USE_VMALLOC |
27 | select RTC_LIB | 29 | select RTC_LIB |
28 | select SPARSE_IRQ | 30 | select SPARSE_IRQ |
31 | select SYSCTL_EXCEPTION_TRACE | ||
29 | help | 32 | help |
30 | ARM 64-bit (AArch64) Linux support. | 33 | ARM 64-bit (AArch64) Linux support. |
31 | 34 | ||
@@ -193,6 +196,7 @@ config COMPAT | |||
193 | bool "Kernel support for 32-bit EL0" | 196 | bool "Kernel support for 32-bit EL0" |
194 | depends on !ARM64_64K_PAGES | 197 | depends on !ARM64_64K_PAGES |
195 | select COMPAT_BINFMT_ELF | 198 | select COMPAT_BINFMT_ELF |
199 | select HAVE_UID16 | ||
196 | help | 200 | help |
197 | This option enables support for a 32-bit EL0 running under a 64-bit | 201 | This option enables support for a 32-bit EL0 running under a 64-bit |
198 | kernel at EL1. AArch32-specific components such as system calls, | 202 | kernel at EL1. AArch32-specific components such as system calls, |
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index b92e60958617..b2f2d2d66849 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c | |||
@@ -152,6 +152,7 @@ good_area: | |||
152 | tsk->min_flt++; | 152 | tsk->min_flt++; |
153 | if (fault & VM_FAULT_RETRY) { | 153 | if (fault & VM_FAULT_RETRY) { |
154 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 154 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
155 | flags |= FAULT_FLAG_TRIED; | ||
155 | 156 | ||
156 | /* | 157 | /* |
157 | * No need to up_read(&mm->mmap_sem) as we would have | 158 | * No need to up_read(&mm->mmap_sem) as we would have |
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 99224c4eb86b..ccd9193932b2 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig | |||
@@ -33,6 +33,7 @@ config BLACKFIN | |||
33 | select HAVE_PERF_EVENTS | 33 | select HAVE_PERF_EVENTS |
34 | select ARCH_HAVE_CUSTOM_GPIO_H | 34 | select ARCH_HAVE_CUSTOM_GPIO_H |
35 | select ARCH_WANT_OPTIONAL_GPIOLIB | 35 | select ARCH_WANT_OPTIONAL_GPIOLIB |
36 | select HAVE_UID16 | ||
36 | select ARCH_WANT_IPC_PARSE_VERSION | 37 | select ARCH_WANT_IPC_PARSE_VERSION |
37 | select HAVE_GENERIC_HARDIRQS | 38 | select HAVE_GENERIC_HARDIRQS |
38 | select GENERIC_ATOMIC64 | 39 | select GENERIC_ATOMIC64 |
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 72bd5ae50a89..a118163b04ee 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig | |||
@@ -42,6 +42,7 @@ config CRIS | |||
42 | select HAVE_IDE | 42 | select HAVE_IDE |
43 | select GENERIC_ATOMIC64 | 43 | select GENERIC_ATOMIC64 |
44 | select HAVE_GENERIC_HARDIRQS | 44 | select HAVE_GENERIC_HARDIRQS |
45 | select HAVE_UID16 | ||
45 | select ARCH_WANT_IPC_PARSE_VERSION | 46 | select ARCH_WANT_IPC_PARSE_VERSION |
46 | select GENERIC_IRQ_SHOW | 47 | select GENERIC_IRQ_SHOW |
47 | select GENERIC_IOMAP | 48 | select GENERIC_IOMAP |
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 45fd542cf173..73312ab6c696 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c | |||
@@ -186,6 +186,7 @@ retry: | |||
186 | tsk->min_flt++; | 186 | tsk->min_flt++; |
187 | if (fault & VM_FAULT_RETRY) { | 187 | if (fault & VM_FAULT_RETRY) { |
188 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 188 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
189 | flags |= FAULT_FLAG_TRIED; | ||
189 | 190 | ||
190 | /* | 191 | /* |
191 | * No need to up_read(&mm->mmap_sem) as we would | 192 | * No need to up_read(&mm->mmap_sem) as we would |
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 971c0a19facb..9d262645f667 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig | |||
@@ -5,8 +5,10 @@ config FRV | |||
5 | select HAVE_ARCH_TRACEHOOK | 5 | select HAVE_ARCH_TRACEHOOK |
6 | select HAVE_IRQ_WORK | 6 | select HAVE_IRQ_WORK |
7 | select HAVE_PERF_EVENTS | 7 | select HAVE_PERF_EVENTS |
8 | select HAVE_UID16 | ||
8 | select HAVE_GENERIC_HARDIRQS | 9 | select HAVE_GENERIC_HARDIRQS |
9 | select GENERIC_IRQ_SHOW | 10 | select GENERIC_IRQ_SHOW |
11 | select HAVE_DEBUG_BUGVERBOSE | ||
10 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 12 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
11 | select GENERIC_CPU_DEVICES | 13 | select GENERIC_CPU_DEVICES |
12 | select ARCH_WANT_IPC_PARSE_VERSION | 14 | select ARCH_WANT_IPC_PARSE_VERSION |
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 5e8a0d9a09ce..90462eb23d02 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig | |||
@@ -3,6 +3,7 @@ config H8300 | |||
3 | default y | 3 | default y |
4 | select HAVE_IDE | 4 | select HAVE_IDE |
5 | select HAVE_GENERIC_HARDIRQS | 5 | select HAVE_GENERIC_HARDIRQS |
6 | select HAVE_UID16 | ||
6 | select ARCH_WANT_IPC_PARSE_VERSION | 7 | select ARCH_WANT_IPC_PARSE_VERSION |
7 | select GENERIC_IRQ_SHOW | 8 | select GENERIC_IRQ_SHOW |
8 | select GENERIC_CPU_DEVICES | 9 | select GENERIC_CPU_DEVICES |
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 06695cc4fe58..513b74cb397e 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c | |||
@@ -113,6 +113,7 @@ good_area: | |||
113 | current->min_flt++; | 113 | current->min_flt++; |
114 | if (fault & VM_FAULT_RETRY) { | 114 | if (fault & VM_FAULT_RETRY) { |
115 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 115 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
116 | flags |= FAULT_FLAG_TRIED; | ||
116 | goto retry; | 117 | goto retry; |
117 | } | 118 | } |
118 | } | 119 | } |
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index da55c63728e0..94eaa5bd5d0c 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h | |||
@@ -77,4 +77,8 @@ static inline void arch_release_hugepage(struct page *page) | |||
77 | { | 77 | { |
78 | } | 78 | } |
79 | 79 | ||
80 | static inline void arch_clear_hugepage_flags(struct page *page) | ||
81 | { | ||
82 | } | ||
83 | |||
80 | #endif /* _ASM_IA64_HUGETLB_H */ | 84 | #endif /* _ASM_IA64_HUGETLB_H */ |
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f388b4e18a37..ea39eba61ef5 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c | |||
@@ -2307,7 +2307,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t | |||
2307 | */ | 2307 | */ |
2308 | vma->vm_mm = mm; | 2308 | vma->vm_mm = mm; |
2309 | vma->vm_file = get_file(filp); | 2309 | vma->vm_file = get_file(filp); |
2310 | vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; | 2310 | vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; |
2311 | vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ | 2311 | vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ |
2312 | 2312 | ||
2313 | /* | 2313 | /* |
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 8443daf4f515..6cf0341f978e 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c | |||
@@ -184,6 +184,7 @@ retry: | |||
184 | current->min_flt++; | 184 | current->min_flt++; |
185 | if (fault & VM_FAULT_RETRY) { | 185 | if (fault & VM_FAULT_RETRY) { |
186 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 186 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
187 | flags |= FAULT_FLAG_TRIED; | ||
187 | 188 | ||
188 | /* No need to up_read(&mm->mmap_sem) as we would | 189 | /* No need to up_read(&mm->mmap_sem) as we would |
189 | * have already released it in __lock_page_or_retry | 190 | * have already released it in __lock_page_or_retry |
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 0eab454867a2..acd5b68e8871 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c | |||
@@ -138,7 +138,8 @@ ia64_init_addr_space (void) | |||
138 | vma->vm_mm = current->mm; | 138 | vma->vm_mm = current->mm; |
139 | vma->vm_end = PAGE_SIZE; | 139 | vma->vm_end = PAGE_SIZE; |
140 | vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); | 140 | vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); |
141 | vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED; | 141 | vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | |
142 | VM_DONTEXPAND | VM_DONTDUMP; | ||
142 | down_write(¤t->mm->mmap_sem); | 143 | down_write(¤t->mm->mmap_sem); |
143 | if (insert_vm_struct(current->mm, vma)) { | 144 | if (insert_vm_struct(current->mm, vma)) { |
144 | up_write(¤t->mm->mmap_sem); | 145 | up_write(¤t->mm->mmap_sem); |
@@ -636,6 +637,7 @@ mem_init (void) | |||
636 | 637 | ||
637 | high_memory = __va(max_low_pfn * PAGE_SIZE); | 638 | high_memory = __va(max_low_pfn * PAGE_SIZE); |
638 | 639 | ||
640 | reset_zone_present_pages(); | ||
639 | for_each_online_pgdat(pgdat) | 641 | for_each_online_pgdat(pgdat) |
640 | if (pgdat->bdata->node_bootmem_map) | 642 | if (pgdat->bdata->node_bootmem_map) |
641 | totalram_pages += free_all_bootmem_node(pgdat); | 643 | totalram_pages += free_all_bootmem_node(pgdat); |
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 49498bbb9616..e875fc3ce9cb 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig | |||
@@ -8,6 +8,7 @@ config M32R | |||
8 | select HAVE_KERNEL_BZIP2 | 8 | select HAVE_KERNEL_BZIP2 |
9 | select HAVE_KERNEL_LZMA | 9 | select HAVE_KERNEL_LZMA |
10 | select ARCH_WANT_IPC_PARSE_VERSION | 10 | select ARCH_WANT_IPC_PARSE_VERSION |
11 | select HAVE_DEBUG_BUGVERBOSE | ||
11 | select HAVE_GENERIC_HARDIRQS | 12 | select HAVE_GENERIC_HARDIRQS |
12 | select GENERIC_IRQ_PROBE | 13 | select GENERIC_IRQ_PROBE |
13 | select GENERIC_IRQ_SHOW | 14 | select GENERIC_IRQ_SHOW |
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index b22df9410dce..dae1e7e16a37 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig | |||
@@ -3,9 +3,11 @@ config M68K | |||
3 | default y | 3 | default y |
4 | select HAVE_IDE | 4 | select HAVE_IDE |
5 | select HAVE_AOUT if MMU | 5 | select HAVE_AOUT if MMU |
6 | select HAVE_DEBUG_BUGVERBOSE | ||
6 | select HAVE_GENERIC_HARDIRQS | 7 | select HAVE_GENERIC_HARDIRQS |
7 | select GENERIC_IRQ_SHOW | 8 | select GENERIC_IRQ_SHOW |
8 | select GENERIC_ATOMIC64 | 9 | select GENERIC_ATOMIC64 |
10 | select HAVE_UID16 | ||
9 | select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS | 11 | select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS |
10 | select GENERIC_CPU_DEVICES | 12 | select GENERIC_CPU_DEVICES |
11 | select GENERIC_STRNCPY_FROM_USER if MMU | 13 | select GENERIC_STRNCPY_FROM_USER if MMU |
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index aeebbb7b30f0..a563727806bf 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c | |||
@@ -170,6 +170,7 @@ good_area: | |||
170 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | 170 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk |
171 | * of starvation. */ | 171 | * of starvation. */ |
172 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 172 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
173 | flags |= FAULT_FLAG_TRIED; | ||
173 | 174 | ||
174 | /* | 175 | /* |
175 | * No need to up_read(&mm->mmap_sem) as we would | 176 | * No need to up_read(&mm->mmap_sem) as we would |
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 6133bed2b855..53fd94ab60f0 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig | |||
@@ -16,6 +16,7 @@ config MICROBLAZE | |||
16 | select OF | 16 | select OF |
17 | select OF_EARLY_FLATTREE | 17 | select OF_EARLY_FLATTREE |
18 | select ARCH_WANT_IPC_PARSE_VERSION | 18 | select ARCH_WANT_IPC_PARSE_VERSION |
19 | select HAVE_DEBUG_KMEMLEAK | ||
19 | select IRQ_DOMAIN | 20 | select IRQ_DOMAIN |
20 | select HAVE_GENERIC_HARDIRQS | 21 | select HAVE_GENERIC_HARDIRQS |
21 | select GENERIC_IRQ_PROBE | 22 | select GENERIC_IRQ_PROBE |
diff --git a/arch/microblaze/include/asm/atomic.h b/arch/microblaze/include/asm/atomic.h index 472d8bf726df..42ac382a09da 100644 --- a/arch/microblaze/include/asm/atomic.h +++ b/arch/microblaze/include/asm/atomic.h | |||
@@ -22,5 +22,6 @@ static inline int atomic_dec_if_positive(atomic_t *v) | |||
22 | 22 | ||
23 | return res; | 23 | return res; |
24 | } | 24 | } |
25 | #define atomic_dec_if_positive atomic_dec_if_positive | ||
25 | 26 | ||
26 | #endif /* _ASM_MICROBLAZE_ATOMIC_H */ | 27 | #endif /* _ASM_MICROBLAZE_ATOMIC_H */ |
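The new "#define atomic_dec_if_positive atomic_dec_if_positive" marker tells the generic atomic header that this architecture provides its own implementation. A sketch of how the generic fallback added elsewhere in the series is presumably guarded (illustrative, not quoted from the patch):

	#ifndef atomic_dec_if_positive
	/* Decrement v only if the result stays non-negative; return the
	 * decremented value, or a negative number if nothing was done. */
	static inline int atomic_dec_if_positive(atomic_t *v)
	{
		int c, old, dec;

		c = atomic_read(v);
		for (;;) {
			dec = c - 1;
			if (unlikely(dec < 0))
				break;
			old = atomic_cmpxchg(v, c, dec);
			if (likely(old == c))
				break;
			c = old;
		}
		return dec;
	}
	#define atomic_dec_if_positive atomic_dec_if_positive
	#endif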
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index eb365d6795fa..714b35a9c4f7 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c | |||
@@ -233,6 +233,7 @@ good_area: | |||
233 | current->min_flt++; | 233 | current->min_flt++; |
234 | if (fault & VM_FAULT_RETRY) { | 234 | if (fault & VM_FAULT_RETRY) { |
235 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 235 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
236 | flags |= FAULT_FLAG_TRIED; | ||
236 | 237 | ||
237 | /* | 238 | /* |
238 | * No need to up_read(&mm->mmap_sem) as we would | 239 | * No need to up_read(&mm->mmap_sem) as we would |
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4cd538b42a3f..35453eaeffb5 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig | |||
@@ -17,6 +17,7 @@ config MIPS | |||
17 | select HAVE_FUNCTION_GRAPH_TRACER | 17 | select HAVE_FUNCTION_GRAPH_TRACER |
18 | select HAVE_KPROBES | 18 | select HAVE_KPROBES |
19 | select HAVE_KRETPROBES | 19 | select HAVE_KRETPROBES |
20 | select HAVE_DEBUG_KMEMLEAK | ||
20 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE | 21 | select ARCH_BINFMT_ELF_RANDOMIZE_PIE |
21 | select RTC_LIB if !MACH_LOONGSON | 22 | select RTC_LIB if !MACH_LOONGSON |
22 | select GENERIC_ATOMIC64 if !64BIT | 23 | select GENERIC_ATOMIC64 if !64BIT |
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 58d36889f09b..bd94946a18f3 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h | |||
@@ -112,4 +112,8 @@ static inline void arch_release_hugepage(struct page *page) | |||
112 | { | 112 | { |
113 | } | 113 | } |
114 | 114 | ||
115 | static inline void arch_clear_hugepage_flags(struct page *page) | ||
116 | { | ||
117 | } | ||
118 | |||
115 | #endif /* __ASM_HUGETLB_H */ | 119 | #endif /* __ASM_HUGETLB_H */ |
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 7a19957735e9..ddcec1e1a0cd 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c | |||
@@ -171,6 +171,7 @@ good_area: | |||
171 | } | 171 | } |
172 | if (fault & VM_FAULT_RETRY) { | 172 | if (fault & VM_FAULT_RETRY) { |
173 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 173 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
174 | flags |= FAULT_FLAG_TRIED; | ||
174 | 175 | ||
175 | /* | 176 | /* |
176 | * No need to up_read(&mm->mmap_sem) as we would | 177 | * No need to up_read(&mm->mmap_sem) as we would |
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 40f850e9766c..e2bfafce66c5 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c | |||
@@ -183,6 +183,7 @@ good_area: | |||
183 | tsk->min_flt++; | 183 | tsk->min_flt++; |
184 | if (fault & VM_FAULT_RETRY) { | 184 | if (fault & VM_FAULT_RETRY) { |
185 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 185 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
186 | flags |= FAULT_FLAG_TRIED; | ||
186 | 187 | ||
187 | /* No need to up_read(&mm->mmap_sem) as we would | 188 | /* No need to up_read(&mm->mmap_sem) as we would |
188 | * have already released it in __lock_page_or_retry | 189 | * have already released it in __lock_page_or_retry |
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 9d181890a7e3..48e16dc20102 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c | |||
@@ -276,7 +276,6 @@ void flush_dcache_page(struct page *page) | |||
276 | { | 276 | { |
277 | struct address_space *mapping = page_mapping(page); | 277 | struct address_space *mapping = page_mapping(page); |
278 | struct vm_area_struct *mpnt; | 278 | struct vm_area_struct *mpnt; |
279 | struct prio_tree_iter iter; | ||
280 | unsigned long offset; | 279 | unsigned long offset; |
281 | unsigned long addr, old_addr = 0; | 280 | unsigned long addr, old_addr = 0; |
282 | pgoff_t pgoff; | 281 | pgoff_t pgoff; |
@@ -299,7 +298,7 @@ void flush_dcache_page(struct page *page) | |||
299 | * to flush one address here for them all to become coherent */ | 298 | * to flush one address here for them all to become coherent */ |
300 | 299 | ||
301 | flush_dcache_mmap_lock(mapping); | 300 | flush_dcache_mmap_lock(mapping); |
302 | vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { | 301 | vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { |
303 | offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; | 302 | offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; |
304 | addr = mpnt->vm_start + offset; | 303 | addr = mpnt->vm_start + offset; |
305 | 304 | ||
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4ce0be32d153..df7edb887a04 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig | |||
@@ -99,6 +99,7 @@ config PPC | |||
99 | select HAVE_DYNAMIC_FTRACE | 99 | select HAVE_DYNAMIC_FTRACE |
100 | select HAVE_FUNCTION_TRACER | 100 | select HAVE_FUNCTION_TRACER |
101 | select HAVE_FUNCTION_GRAPH_TRACER | 101 | select HAVE_FUNCTION_GRAPH_TRACER |
102 | select SYSCTL_EXCEPTION_TRACE | ||
102 | select ARCH_WANT_OPTIONAL_GPIOLIB | 103 | select ARCH_WANT_OPTIONAL_GPIOLIB |
103 | select HAVE_IDE | 104 | select HAVE_IDE |
104 | select HAVE_IOREMAP_PROT | 105 | select HAVE_IOREMAP_PROT |
@@ -113,6 +114,7 @@ config PPC | |||
113 | select HAVE_DMA_API_DEBUG | 114 | select HAVE_DMA_API_DEBUG |
114 | select USE_GENERIC_SMP_HELPERS if SMP | 115 | select USE_GENERIC_SMP_HELPERS if SMP |
115 | select HAVE_OPROFILE | 116 | select HAVE_OPROFILE |
117 | select HAVE_DEBUG_KMEMLEAK | ||
116 | select HAVE_SYSCALL_WRAPPERS if PPC64 | 118 | select HAVE_SYSCALL_WRAPPERS if PPC64 |
117 | select GENERIC_ATOMIC64 if PPC32 | 119 | select GENERIC_ATOMIC64 if PPC32 |
118 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE | 120 | select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE |
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index da29032ae38f..e3b1d41c89be 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h | |||
@@ -268,6 +268,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v) | |||
268 | 268 | ||
269 | return t; | 269 | return t; |
270 | } | 270 | } |
271 | #define atomic_dec_if_positive atomic_dec_if_positive | ||
271 | 272 | ||
272 | #define smp_mb__before_atomic_dec() smp_mb() | 273 | #define smp_mb__before_atomic_dec() smp_mb() |
273 | #define smp_mb__after_atomic_dec() smp_mb() | 274 | #define smp_mb__after_atomic_dec() smp_mb() |
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index dfdb95bc59a5..62e11a32c4c2 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h | |||
@@ -151,6 +151,10 @@ static inline void arch_release_hugepage(struct page *page) | |||
151 | { | 151 | { |
152 | } | 152 | } |
153 | 153 | ||
154 | static inline void arch_clear_hugepage_flags(struct page *page) | ||
155 | { | ||
156 | } | ||
157 | |||
154 | #else /* ! CONFIG_HUGETLB_PAGE */ | 158 | #else /* ! CONFIG_HUGETLB_PAGE */ |
155 | static inline void flush_hugetlb_page(struct vm_area_struct *vma, | 159 | static inline void flush_hugetlb_page(struct vm_area_struct *vma, |
156 | unsigned long vmaddr) | 160 | unsigned long vmaddr) |
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 83e929e66f9d..721d4603a235 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c | |||
@@ -1183,7 +1183,7 @@ static const struct vm_operations_struct kvm_rma_vm_ops = { | |||
1183 | 1183 | ||
1184 | static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) | 1184 | static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) |
1185 | { | 1185 | { |
1186 | vma->vm_flags |= VM_RESERVED; | 1186 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
1187 | vma->vm_ops = &kvm_rma_vm_ops; | 1187 | vma->vm_ops = &kvm_rma_vm_ops; |
1188 | return 0; | 1188 | return 0; |
1189 | } | 1189 | } |
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5495ebe983a2..0a6b28336eb0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c | |||
@@ -451,6 +451,7 @@ good_area: | |||
451 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | 451 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk |
452 | * of starvation. */ | 452 | * of starvation. */ |
453 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 453 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
454 | flags |= FAULT_FLAG_TRIED; | ||
454 | goto retry; | 455 | goto retry; |
455 | } | 456 | } |
456 | } | 457 | } |
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c index 642fca137ccb..28f1af2db1f5 100644 --- a/arch/powerpc/oprofile/cell/spu_task_sync.c +++ b/arch/powerpc/oprofile/cell/spu_task_sync.c | |||
@@ -304,7 +304,7 @@ static inline unsigned long fast_get_dcookie(struct path *path) | |||
304 | return cookie; | 304 | return cookie; |
305 | } | 305 | } |
306 | 306 | ||
307 | /* Look up the dcookie for the task's first VM_EXECUTABLE mapping, | 307 | /* Look up the dcookie for the task's mm->exe_file, |
308 | * which corresponds loosely to "application name". Also, determine | 308 | * which corresponds loosely to "application name". Also, determine |
309 | * the offset for the SPU ELF object. If computed offset is | 309 | * the offset for the SPU ELF object. If computed offset is |
310 | * non-zero, it implies an embedded SPU object; otherwise, it's a | 310 | * non-zero, it implies an embedded SPU object; otherwise, it's a |
@@ -321,7 +321,6 @@ get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp, | |||
321 | { | 321 | { |
322 | unsigned long app_cookie = 0; | 322 | unsigned long app_cookie = 0; |
323 | unsigned int my_offset = 0; | 323 | unsigned int my_offset = 0; |
324 | struct file *app = NULL; | ||
325 | struct vm_area_struct *vma; | 324 | struct vm_area_struct *vma; |
326 | struct mm_struct *mm = spu->mm; | 325 | struct mm_struct *mm = spu->mm; |
327 | 326 | ||
@@ -330,16 +329,10 @@ get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp, | |||
330 | 329 | ||
331 | down_read(&mm->mmap_sem); | 330 | down_read(&mm->mmap_sem); |
332 | 331 | ||
333 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 332 | if (mm->exe_file) { |
334 | if (!vma->vm_file) | 333 | app_cookie = fast_get_dcookie(&mm->exe_file->f_path); |
335 | continue; | ||
336 | if (!(vma->vm_flags & VM_EXECUTABLE)) | ||
337 | continue; | ||
338 | app_cookie = fast_get_dcookie(&vma->vm_file->f_path); | ||
339 | pr_debug("got dcookie for %s\n", | 334 | pr_debug("got dcookie for %s\n", |
340 | vma->vm_file->f_dentry->d_name.name); | 335 | mm->exe_file->f_dentry->d_name.name); |
341 | app = vma->vm_file; | ||
342 | break; | ||
343 | } | 336 | } |
344 | 337 | ||
345 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 338 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
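With VM_EXECUTABLE removed by this series, the "application name" cookie can no longer be found by scanning vmas for an executable mapping; callers are pointed at mm->exe_file instead. A condensed sketch of the new lookup (fast_get_dcookie() is the file-local helper shown above; mmap_sem is held for read around the access):

	static unsigned long app_cookie_of(struct mm_struct *mm)
	{
		unsigned long cookie = 0;

		down_read(&mm->mmap_sem);
		if (mm->exe_file)
			cookie = fast_get_dcookie(&mm->exe_file->f_path);
		up_read(&mm->mmap_sem);

		return cookie;
	}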
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 11d8e0544ac0..dc0a035e63bb 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c | |||
@@ -77,7 +77,8 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz | |||
77 | { | 77 | { |
78 | unsigned long start, start_pfn; | 78 | unsigned long start, start_pfn; |
79 | struct zone *zone; | 79 | struct zone *zone; |
80 | int ret; | 80 | int i, ret; |
81 | int sections_to_remove; | ||
81 | 82 | ||
82 | start_pfn = base >> PAGE_SHIFT; | 83 | start_pfn = base >> PAGE_SHIFT; |
83 | 84 | ||
@@ -97,9 +98,13 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz | |||
97 | * to sysfs "state" file and we can't remove sysfs entries | 98 | * to sysfs "state" file and we can't remove sysfs entries |
98 | * while writing to it. So we have to defer it to here. | 99 | * while writing to it. So we have to defer it to here. |
99 | */ | 100 | */ |
100 | ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT); | 101 | sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION; |
101 | if (ret) | 102 | for (i = 0; i < sections_to_remove; i++) { |
102 | return ret; | 103 | unsigned long pfn = start_pfn + i * PAGES_PER_SECTION; |
104 | ret = __remove_pages(zone, start_pfn, PAGES_PER_SECTION); | ||
105 | if (ret) | ||
106 | return ret; | ||
107 | } | ||
103 | 108 | ||
104 | /* | 109 | /* |
105 | * Update memory regions for memory remove | 110 | * Update memory regions for memory remove |
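Removal now proceeds one SPARSEMEM section at a time instead of handing the whole memblock to __remove_pages() in a single call. Note that the hunk computes a per-iteration pfn but still passes start_pfn down; the sketch below uses the per-section pfn, which is stated here as an assumption about the intended behaviour rather than a quote of the patch:

	static int remove_memblock_by_section(struct zone *zone,
					      unsigned long start_pfn,
					      unsigned long memblock_size)
	{
		int i, ret;
		int sections = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;

		for (i = 0; i < sections; i++) {
			/* assumption: each call targets its own section */
			unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;

			ret = __remove_pages(zone, pfn, PAGES_PER_SECTION);
			if (ret)
				return ret;
		}
		return 0;
	}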
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c8af429991d9..ceff7aef2477 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig | |||
@@ -68,6 +68,7 @@ config S390 | |||
68 | select HAVE_FTRACE_MCOUNT_RECORD | 68 | select HAVE_FTRACE_MCOUNT_RECORD |
69 | select HAVE_C_RECORDMCOUNT | 69 | select HAVE_C_RECORDMCOUNT |
70 | select HAVE_SYSCALL_TRACEPOINTS | 70 | select HAVE_SYSCALL_TRACEPOINTS |
71 | select SYSCTL_EXCEPTION_TRACE | ||
71 | select HAVE_DYNAMIC_FTRACE | 72 | select HAVE_DYNAMIC_FTRACE |
72 | select HAVE_FUNCTION_GRAPH_TRACER | 73 | select HAVE_FUNCTION_GRAPH_TRACER |
73 | select HAVE_REGS_AND_STACK_ACCESS_API | 74 | select HAVE_REGS_AND_STACK_ACCESS_API |
@@ -80,6 +81,7 @@ config S390 | |||
80 | select HAVE_IRQ_WORK | 81 | select HAVE_IRQ_WORK |
81 | select HAVE_PERF_EVENTS | 82 | select HAVE_PERF_EVENTS |
82 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 83 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
84 | select HAVE_DEBUG_KMEMLEAK | ||
83 | select HAVE_KERNEL_GZIP | 85 | select HAVE_KERNEL_GZIP |
84 | select HAVE_KERNEL_BZIP2 | 86 | select HAVE_KERNEL_BZIP2 |
85 | select HAVE_KERNEL_LZMA | 87 | select HAVE_KERNEL_LZMA |
@@ -126,6 +128,7 @@ config S390 | |||
126 | select ARCH_INLINE_WRITE_UNLOCK_BH | 128 | select ARCH_INLINE_WRITE_UNLOCK_BH |
127 | select ARCH_INLINE_WRITE_UNLOCK_IRQ | 129 | select ARCH_INLINE_WRITE_UNLOCK_IRQ |
128 | select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | 130 | select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE |
131 | select HAVE_UID16 if 32BIT | ||
129 | select ARCH_WANT_IPC_PARSE_VERSION | 132 | select ARCH_WANT_IPC_PARSE_VERSION |
130 | select GENERIC_SMP_IDLE_THREAD | 133 | select GENERIC_SMP_IDLE_THREAD |
131 | select GENERIC_TIME_VSYSCALL | 134 | select GENERIC_TIME_VSYSCALL |
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d6e6e380564..593753ee07f3 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h | |||
@@ -33,6 +33,7 @@ static inline int prepare_hugepage_range(struct file *file, | |||
33 | } | 33 | } |
34 | 34 | ||
35 | #define hugetlb_prefault_arch_hook(mm) do { } while (0) | 35 | #define hugetlb_prefault_arch_hook(mm) do { } while (0) |
36 | #define arch_clear_hugepage_flags(page) do { } while (0) | ||
36 | 37 | ||
37 | int arch_prepare_hugepage(struct page *page); | 38 | int arch_prepare_hugepage(struct page *page); |
38 | void arch_release_hugepage(struct page *page); | 39 | void arch_release_hugepage(struct page *page); |
@@ -77,23 +78,6 @@ static inline void __pmd_csp(pmd_t *pmdp) | |||
77 | " csp %1,%3" | 78 | " csp %1,%3" |
78 | : "=m" (*pmdp) | 79 | : "=m" (*pmdp) |
79 | : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc"); | 80 | : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc"); |
80 | pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY; | ||
81 | } | ||
82 | |||
83 | static inline void __pmd_idte(unsigned long address, pmd_t *pmdp) | ||
84 | { | ||
85 | unsigned long sto = (unsigned long) pmdp - | ||
86 | pmd_index(address) * sizeof(pmd_t); | ||
87 | |||
88 | if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) { | ||
89 | asm volatile( | ||
90 | " .insn rrf,0xb98e0000,%2,%3,0,0" | ||
91 | : "=m" (*pmdp) | ||
92 | : "m" (*pmdp), "a" (sto), | ||
93 | "a" ((address & HPAGE_MASK)) | ||
94 | ); | ||
95 | } | ||
96 | pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY; | ||
97 | } | 81 | } |
98 | 82 | ||
99 | static inline void huge_ptep_invalidate(struct mm_struct *mm, | 83 | static inline void huge_ptep_invalidate(struct mm_struct *mm, |
@@ -105,6 +89,7 @@ static inline void huge_ptep_invalidate(struct mm_struct *mm, | |||
105 | __pmd_idte(address, pmdp); | 89 | __pmd_idte(address, pmdp); |
106 | else | 90 | else |
107 | __pmd_csp(pmdp); | 91 | __pmd_csp(pmdp); |
92 | pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY; | ||
108 | } | 93 | } |
109 | 94 | ||
110 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, | 95 | static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, |
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 6bd7d7483017..979fe3dc0788 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h | |||
@@ -42,6 +42,7 @@ extern void fault_init(void); | |||
42 | * tables contain all the necessary information. | 42 | * tables contain all the necessary information. |
43 | */ | 43 | */ |
44 | #define update_mmu_cache(vma, address, ptep) do { } while (0) | 44 | #define update_mmu_cache(vma, address, ptep) do { } while (0) |
45 | #define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) | ||
45 | 46 | ||
46 | /* | 47 | /* |
47 | * ZERO_PAGE is a global shared page that is always zero; used | 48 | * ZERO_PAGE is a global shared page that is always zero; used |
@@ -347,6 +348,12 @@ extern struct page *vmemmap; | |||
347 | 348 | ||
348 | #define _SEGMENT_ENTRY_LARGE 0x400 /* STE-format control, large page */ | 349 | #define _SEGMENT_ENTRY_LARGE 0x400 /* STE-format control, large page */ |
349 | #define _SEGMENT_ENTRY_CO 0x100 /* change-recording override */ | 350 | #define _SEGMENT_ENTRY_CO 0x100 /* change-recording override */ |
351 | #define _SEGMENT_ENTRY_SPLIT_BIT 0 /* THP splitting bit number */ | ||
352 | #define _SEGMENT_ENTRY_SPLIT (1UL << _SEGMENT_ENTRY_SPLIT_BIT) | ||
353 | |||
354 | /* Set of bits not changed in pmd_modify */ | ||
355 | #define _SEGMENT_CHG_MASK (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \ | ||
356 | | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO) | ||
350 | 357 | ||
351 | /* Page status table bits for virtualization */ | 358 | /* Page status table bits for virtualization */ |
352 | #define RCP_ACC_BITS 0xf000000000000000UL | 359 | #define RCP_ACC_BITS 0xf000000000000000UL |
@@ -506,6 +513,30 @@ static inline int pmd_bad(pmd_t pmd) | |||
506 | return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY; | 513 | return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY; |
507 | } | 514 | } |
508 | 515 | ||
516 | #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
517 | extern void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
518 | unsigned long addr, pmd_t *pmdp); | ||
519 | |||
520 | #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS | ||
521 | extern int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
522 | unsigned long address, pmd_t *pmdp, | ||
523 | pmd_t entry, int dirty); | ||
524 | |||
525 | #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH | ||
526 | extern int pmdp_clear_flush_young(struct vm_area_struct *vma, | ||
527 | unsigned long address, pmd_t *pmdp); | ||
528 | |||
529 | #define __HAVE_ARCH_PMD_WRITE | ||
530 | static inline int pmd_write(pmd_t pmd) | ||
531 | { | ||
532 | return (pmd_val(pmd) & _SEGMENT_ENTRY_RO) == 0; | ||
533 | } | ||
534 | |||
535 | static inline int pmd_young(pmd_t pmd) | ||
536 | { | ||
537 | return 0; | ||
538 | } | ||
539 | |||
509 | static inline int pte_none(pte_t pte) | 540 | static inline int pte_none(pte_t pte) |
510 | { | 541 | { |
511 | return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT); | 542 | return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT); |
@@ -1159,6 +1190,185 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) | |||
1159 | #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) | 1190 | #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) |
1160 | #define pte_unmap(pte) do { } while (0) | 1191 | #define pte_unmap(pte) do { } while (0) |
1161 | 1192 | ||
1193 | static inline void __pmd_idte(unsigned long address, pmd_t *pmdp) | ||
1194 | { | ||
1195 | unsigned long sto = (unsigned long) pmdp - | ||
1196 | pmd_index(address) * sizeof(pmd_t); | ||
1197 | |||
1198 | if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) { | ||
1199 | asm volatile( | ||
1200 | " .insn rrf,0xb98e0000,%2,%3,0,0" | ||
1201 | : "=m" (*pmdp) | ||
1202 | : "m" (*pmdp), "a" (sto), | ||
1203 | "a" ((address & HPAGE_MASK)) | ||
1204 | : "cc" | ||
1205 | ); | ||
1206 | } | ||
1207 | } | ||
1208 | |||
1209 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
1210 | #define __HAVE_ARCH_PGTABLE_DEPOSIT | ||
1211 | extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); | ||
1212 | |||
1213 | #define __HAVE_ARCH_PGTABLE_WITHDRAW | ||
1214 | extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); | ||
1215 | |||
1216 | static inline int pmd_trans_splitting(pmd_t pmd) | ||
1217 | { | ||
1218 | return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT; | ||
1219 | } | ||
1220 | |||
1221 | static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, | ||
1222 | pmd_t *pmdp, pmd_t entry) | ||
1223 | { | ||
1224 | *pmdp = entry; | ||
1225 | } | ||
1226 | |||
1227 | static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot) | ||
1228 | { | ||
1229 | unsigned long pgprot_pmd = 0; | ||
1230 | |||
1231 | if (pgprot_val(pgprot) & _PAGE_INVALID) { | ||
1232 | if (pgprot_val(pgprot) & _PAGE_SWT) | ||
1233 | pgprot_pmd |= _HPAGE_TYPE_NONE; | ||
1234 | pgprot_pmd |= _SEGMENT_ENTRY_INV; | ||
1235 | } | ||
1236 | if (pgprot_val(pgprot) & _PAGE_RO) | ||
1237 | pgprot_pmd |= _SEGMENT_ENTRY_RO; | ||
1238 | return pgprot_pmd; | ||
1239 | } | ||
1240 | |||
1241 | static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) | ||
1242 | { | ||
1243 | pmd_val(pmd) &= _SEGMENT_CHG_MASK; | ||
1244 | pmd_val(pmd) |= massage_pgprot_pmd(newprot); | ||
1245 | return pmd; | ||
1246 | } | ||
1247 | |||
1248 | static inline pmd_t pmd_mkhuge(pmd_t pmd) | ||
1249 | { | ||
1250 | pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE; | ||
1251 | return pmd; | ||
1252 | } | ||
1253 | |||
1254 | static inline pmd_t pmd_mkwrite(pmd_t pmd) | ||
1255 | { | ||
1256 | pmd_val(pmd) &= ~_SEGMENT_ENTRY_RO; | ||
1257 | return pmd; | ||
1258 | } | ||
1259 | |||
1260 | static inline pmd_t pmd_wrprotect(pmd_t pmd) | ||
1261 | { | ||
1262 | pmd_val(pmd) |= _SEGMENT_ENTRY_RO; | ||
1263 | return pmd; | ||
1264 | } | ||
1265 | |||
1266 | static inline pmd_t pmd_mkdirty(pmd_t pmd) | ||
1267 | { | ||
1268 | /* No dirty bit in the segment table entry. */ | ||
1269 | return pmd; | ||
1270 | } | ||
1271 | |||
1272 | static inline pmd_t pmd_mkold(pmd_t pmd) | ||
1273 | { | ||
1274 | /* No referenced bit in the segment table entry. */ | ||
1275 | return pmd; | ||
1276 | } | ||
1277 | |||
1278 | static inline pmd_t pmd_mkyoung(pmd_t pmd) | ||
1279 | { | ||
1280 | /* No referenced bit in the segment table entry. */ | ||
1281 | return pmd; | ||
1282 | } | ||
1283 | |||
1284 | #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG | ||
1285 | static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, | ||
1286 | unsigned long address, pmd_t *pmdp) | ||
1287 | { | ||
1288 | unsigned long pmd_addr = pmd_val(*pmdp) & HPAGE_MASK; | ||
1289 | long tmp, rc; | ||
1290 | int counter; | ||
1291 | |||
1292 | rc = 0; | ||
1293 | if (MACHINE_HAS_RRBM) { | ||
1294 | counter = PTRS_PER_PTE >> 6; | ||
1295 | asm volatile( | ||
1296 | "0: .insn rre,0xb9ae0000,%0,%3\n" /* rrbm */ | ||
1297 | " ogr %1,%0\n" | ||
1298 | " la %3,0(%4,%3)\n" | ||
1299 | " brct %2,0b\n" | ||
1300 | : "=&d" (tmp), "+&d" (rc), "+d" (counter), | ||
1301 | "+a" (pmd_addr) | ||
1302 | : "a" (64 * 4096UL) : "cc"); | ||
1303 | rc = !!rc; | ||
1304 | } else { | ||
1305 | counter = PTRS_PER_PTE; | ||
1306 | asm volatile( | ||
1307 | "0: rrbe 0,%2\n" | ||
1308 | " la %2,0(%3,%2)\n" | ||
1309 | " brc 12,1f\n" | ||
1310 | " lhi %0,1\n" | ||
1311 | "1: brct %1,0b\n" | ||
1312 | : "+d" (rc), "+d" (counter), "+a" (pmd_addr) | ||
1313 | : "a" (4096UL) : "cc"); | ||
1314 | } | ||
1315 | return rc; | ||
1316 | } | ||
1317 | |||
1318 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | ||
1319 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | ||
1320 | unsigned long address, pmd_t *pmdp) | ||
1321 | { | ||
1322 | pmd_t pmd = *pmdp; | ||
1323 | |||
1324 | __pmd_idte(address, pmdp); | ||
1325 | pmd_clear(pmdp); | ||
1326 | return pmd; | ||
1327 | } | ||
1328 | |||
1329 | #define __HAVE_ARCH_PMDP_CLEAR_FLUSH | ||
1330 | static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma, | ||
1331 | unsigned long address, pmd_t *pmdp) | ||
1332 | { | ||
1333 | return pmdp_get_and_clear(vma->vm_mm, address, pmdp); | ||
1334 | } | ||
1335 | |||
1336 | #define __HAVE_ARCH_PMDP_INVALIDATE | ||
1337 | static inline void pmdp_invalidate(struct vm_area_struct *vma, | ||
1338 | unsigned long address, pmd_t *pmdp) | ||
1339 | { | ||
1340 | __pmd_idte(address, pmdp); | ||
1341 | } | ||
1342 | |||
1343 | static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot) | ||
1344 | { | ||
1345 | pmd_t __pmd; | ||
1346 | pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot); | ||
1347 | return __pmd; | ||
1348 | } | ||
1349 | |||
1350 | #define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot)) | ||
1351 | #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) | ||
1352 | |||
1353 | static inline int pmd_trans_huge(pmd_t pmd) | ||
1354 | { | ||
1355 | return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; | ||
1356 | } | ||
1357 | |||
1358 | static inline int has_transparent_hugepage(void) | ||
1359 | { | ||
1360 | return MACHINE_HAS_HPAGE ? 1 : 0; | ||
1361 | } | ||
1362 | |||
1363 | static inline unsigned long pmd_pfn(pmd_t pmd) | ||
1364 | { | ||
1365 | if (pmd_trans_huge(pmd)) | ||
1366 | return pmd_val(pmd) >> HPAGE_SHIFT; | ||
1367 | else | ||
1368 | return pmd_val(pmd) >> PAGE_SHIFT; | ||
1369 | } | ||
1370 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
1371 | |||
1162 | /* | 1372 | /* |
1163 | * 31 bit swap entry format: | 1373 | * 31 bit swap entry format: |
1164 | * A page-table entry has some bits we have to treat in a special way. | 1374 | * A page-table entry has some bits we have to treat in a special way. |
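The block above gives s390 the full set of pmd helpers that the generic transparent-huge-page code expects from an architecture. A condensed sketch of how generic code is expected to combine them when installing a writable huge mapping (illustrative only; haddr is the HPAGE-aligned address and the locking taken by the real code is omitted):

	static void install_huge_pmd(struct vm_area_struct *vma, unsigned long haddr,
				     pmd_t *pmdp, struct page *page)
	{
		pmd_t entry;

		entry = mk_pmd(page, vma->vm_page_prot);   /* pfn plus protection bits */
		entry = pmd_mkhuge(pmd_mkwrite(entry));    /* large, writable segment */
		set_pmd_at(vma->vm_mm, haddr, pmdp, entry);
		update_mmu_cache_pmd(vma, haddr, pmdp);    /* a no-op on s390 */
	}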
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 87b47ca954f1..8cfd731a18d8 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h | |||
@@ -81,6 +81,7 @@ extern unsigned int s390_user_mode; | |||
81 | #define MACHINE_FLAG_SPP (1UL << 13) | 81 | #define MACHINE_FLAG_SPP (1UL << 13) |
82 | #define MACHINE_FLAG_TOPOLOGY (1UL << 14) | 82 | #define MACHINE_FLAG_TOPOLOGY (1UL << 14) |
83 | #define MACHINE_FLAG_TE (1UL << 15) | 83 | #define MACHINE_FLAG_TE (1UL << 15) |
84 | #define MACHINE_FLAG_RRBM (1UL << 16) | ||
84 | 85 | ||
85 | #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) | 86 | #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) |
86 | #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) | 87 | #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) |
@@ -99,7 +100,8 @@ extern unsigned int s390_user_mode; | |||
99 | #define MACHINE_HAS_PFMF (0) | 100 | #define MACHINE_HAS_PFMF (0) |
100 | #define MACHINE_HAS_SPP (0) | 101 | #define MACHINE_HAS_SPP (0) |
101 | #define MACHINE_HAS_TOPOLOGY (0) | 102 | #define MACHINE_HAS_TOPOLOGY (0) |
102 | #define MACHINE_HAS_TE (0) | 103 | #define MACHINE_HAS_TE (0) |
104 | #define MACHINE_HAS_RRBM (0) | ||
103 | #else /* CONFIG_64BIT */ | 105 | #else /* CONFIG_64BIT */ |
104 | #define MACHINE_HAS_IEEE (1) | 106 | #define MACHINE_HAS_IEEE (1) |
105 | #define MACHINE_HAS_CSP (1) | 107 | #define MACHINE_HAS_CSP (1) |
@@ -112,6 +114,7 @@ extern unsigned int s390_user_mode; | |||
112 | #define MACHINE_HAS_SPP (S390_lowcore.machine_flags & MACHINE_FLAG_SPP) | 114 | #define MACHINE_HAS_SPP (S390_lowcore.machine_flags & MACHINE_FLAG_SPP) |
113 | #define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) | 115 | #define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) |
114 | #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) | 116 | #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) |
117 | #define MACHINE_HAS_RRBM (S390_lowcore.machine_flags & MACHINE_FLAG_RRBM) | ||
115 | #endif /* CONFIG_64BIT */ | 118 | #endif /* CONFIG_64BIT */ |
116 | 119 | ||
117 | #define ZFCPDUMP_HSA_SIZE (32UL<<20) | 120 | #define ZFCPDUMP_HSA_SIZE (32UL<<20) |
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 06e5acbc84bd..b75d7d686684 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h | |||
@@ -137,6 +137,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, | |||
137 | #define tlb_start_vma(tlb, vma) do { } while (0) | 137 | #define tlb_start_vma(tlb, vma) do { } while (0) |
138 | #define tlb_end_vma(tlb, vma) do { } while (0) | 138 | #define tlb_end_vma(tlb, vma) do { } while (0) |
139 | #define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) | 139 | #define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) |
140 | #define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) | ||
140 | #define tlb_migrate_finish(mm) do { } while (0) | 141 | #define tlb_migrate_finish(mm) do { } while (0) |
141 | 142 | ||
142 | #endif /* _S390_TLB_H */ | 143 | #endif /* _S390_TLB_H */ |
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 7f4717675c19..00d114445068 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c | |||
@@ -388,6 +388,8 @@ static __init void detect_machine_facilities(void) | |||
388 | S390_lowcore.machine_flags |= MACHINE_FLAG_SPP; | 388 | S390_lowcore.machine_flags |= MACHINE_FLAG_SPP; |
389 | if (test_facility(50) && test_facility(73)) | 389 | if (test_facility(50) && test_facility(73)) |
390 | S390_lowcore.machine_flags |= MACHINE_FLAG_TE; | 390 | S390_lowcore.machine_flags |= MACHINE_FLAG_TE; |
391 | if (test_facility(66)) | ||
392 | S390_lowcore.machine_flags |= MACHINE_FLAG_RRBM; | ||
391 | #endif | 393 | #endif |
392 | } | 394 | } |
393 | 395 | ||
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index ac9122ca1152..04ad4001a289 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c | |||
@@ -367,6 +367,7 @@ retry: | |||
367 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | 367 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk |
368 | * of starvation. */ | 368 | * of starvation. */ |
369 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 369 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
370 | flags |= FAULT_FLAG_TRIED; | ||
370 | down_read(&mm->mmap_sem); | 371 | down_read(&mm->mmap_sem); |
371 | goto retry; | 372 | goto retry; |
372 | } | 373 | } |
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index eeaf8023851f..60acb93a4680 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c | |||
@@ -115,7 +115,16 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, | |||
115 | pmd = *pmdp; | 115 | pmd = *pmdp; |
116 | barrier(); | 116 | barrier(); |
117 | next = pmd_addr_end(addr, end); | 117 | next = pmd_addr_end(addr, end); |
118 | if (pmd_none(pmd)) | 118 | /* |
119 | * The pmd_trans_splitting() check below explains why | ||
120 | * pmdp_splitting_flush() has to serialize with | ||
121 | * smp_call_function() against our disabled IRQs, to stop | ||
122 | * this gup-fast code from running while we set the | ||
123 | * splitting bit in the pmd. Returning zero will take | ||
124 | * the slow path that will call wait_split_huge_page() | ||
125 | * if the pmd is still in splitting state. | ||
126 | */ | ||
127 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
119 | return 0; | 128 | return 0; |
120 | if (unlikely(pmd_huge(pmd))) { | 129 | if (unlikely(pmd_huge(pmd))) { |
121 | if (!gup_huge_pmd(pmdp, pmd, addr, next, | 130 | if (!gup_huge_pmd(pmdp, pmd, addr, next, |
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index b402991e43d7..c8188a18af05 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c | |||
@@ -787,6 +787,30 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) | |||
787 | tlb_table_flush(tlb); | 787 | tlb_table_flush(tlb); |
788 | } | 788 | } |
789 | 789 | ||
790 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
791 | void thp_split_vma(struct vm_area_struct *vma) | ||
792 | { | ||
793 | unsigned long addr; | ||
794 | struct page *page; | ||
795 | |||
796 | for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { | ||
797 | page = follow_page(vma, addr, FOLL_SPLIT); | ||
798 | } | ||
799 | } | ||
800 | |||
801 | void thp_split_mm(struct mm_struct *mm) | ||
802 | { | ||
803 | struct vm_area_struct *vma = mm->mmap; | ||
804 | |||
805 | while (vma != NULL) { | ||
806 | thp_split_vma(vma); | ||
807 | vma->vm_flags &= ~VM_HUGEPAGE; | ||
808 | vma->vm_flags |= VM_NOHUGEPAGE; | ||
809 | vma = vma->vm_next; | ||
810 | } | ||
811 | } | ||
812 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
813 | |||
790 | /* | 814 | /* |
791 | * switch on pgstes for its userspace process (for kvm) | 815 | * switch on pgstes for its userspace process (for kvm) |
792 | */ | 816 | */ |
@@ -824,6 +848,12 @@ int s390_enable_sie(void) | |||
824 | if (!mm) | 848 | if (!mm) |
825 | return -ENOMEM; | 849 | return -ENOMEM; |
826 | 850 | ||
851 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
852 | /* split thp mappings and disable thp for future mappings */ | ||
853 | thp_split_mm(mm); | ||
854 | mm->def_flags |= VM_NOHUGEPAGE; | ||
855 | #endif | ||
856 | |||
827 | /* Now lets check again if something happened */ | 857 | /* Now lets check again if something happened */ |
828 | task_lock(tsk); | 858 | task_lock(tsk); |
829 | if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || | 859 | if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || |
@@ -866,3 +896,81 @@ bool kernel_page_present(struct page *page) | |||
866 | return cc == 0; | 896 | return cc == 0; |
867 | } | 897 | } |
868 | #endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */ | 898 | #endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */ |
899 | |||
900 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
901 | int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, | ||
902 | pmd_t *pmdp) | ||
903 | { | ||
904 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
905 | /* No need to flush TLB | ||
906 | * On s390 reference bits are in storage key and never in TLB */ | ||
907 | return pmdp_test_and_clear_young(vma, address, pmdp); | ||
908 | } | ||
909 | |||
910 | int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
911 | unsigned long address, pmd_t *pmdp, | ||
912 | pmd_t entry, int dirty) | ||
913 | { | ||
914 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
915 | |||
916 | if (pmd_same(*pmdp, entry)) | ||
917 | return 0; | ||
918 | pmdp_invalidate(vma, address, pmdp); | ||
919 | set_pmd_at(vma->vm_mm, address, pmdp, entry); | ||
920 | return 1; | ||
921 | } | ||
922 | |||
923 | static void pmdp_splitting_flush_sync(void *arg) | ||
924 | { | ||
925 | /* Simply deliver the interrupt */ | ||
926 | } | ||
927 | |||
928 | void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | ||
929 | pmd_t *pmdp) | ||
930 | { | ||
931 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
932 | if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT, | ||
933 | (unsigned long *) pmdp)) { | ||
934 | /* need to serialize against gup-fast (IRQ disabled) */ | ||
935 | smp_call_function(pmdp_splitting_flush_sync, NULL, 1); | ||
936 | } | ||
937 | } | ||
938 | |||
939 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) | ||
940 | { | ||
941 | struct list_head *lh = (struct list_head *) pgtable; | ||
942 | |||
943 | assert_spin_locked(&mm->page_table_lock); | ||
944 | |||
945 | /* FIFO */ | ||
946 | if (!mm->pmd_huge_pte) | ||
947 | INIT_LIST_HEAD(lh); | ||
948 | else | ||
949 | list_add(lh, (struct list_head *) mm->pmd_huge_pte); | ||
950 | mm->pmd_huge_pte = pgtable; | ||
951 | } | ||
952 | |||
953 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) | ||
954 | { | ||
955 | struct list_head *lh; | ||
956 | pgtable_t pgtable; | ||
957 | pte_t *ptep; | ||
958 | |||
959 | assert_spin_locked(&mm->page_table_lock); | ||
960 | |||
961 | /* FIFO */ | ||
962 | pgtable = mm->pmd_huge_pte; | ||
963 | lh = (struct list_head *) pgtable; | ||
964 | if (list_empty(lh)) | ||
965 | mm->pmd_huge_pte = NULL; | ||
966 | else { | ||
967 | mm->pmd_huge_pte = (pgtable_t) lh->next; | ||
968 | list_del(lh); | ||
969 | } | ||
970 | ptep = (pte_t *) pgtable; | ||
971 | pte_val(*ptep) = _PAGE_TYPE_EMPTY; | ||
972 | ptep++; | ||
973 | pte_val(*ptep) = _PAGE_TYPE_EMPTY; | ||
974 | return pgtable; | ||
975 | } | ||
976 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
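pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() park a preallocated pte table while a huge pmd is mapped and hand it back when that pmd is zapped or split, so the split can never fail for lack of memory. A minimal usage sketch, assuming mm->page_table_lock is held as the assertions above require (the surrounding calls are illustrative, not from the patch):

	static void huge_pmd_pgtable_lifecycle(struct mm_struct *mm, pgtable_t pgtable)
	{
		spin_lock(&mm->page_table_lock);
		pgtable_trans_huge_deposit(mm, pgtable);   /* at huge-pmd creation */
		spin_unlock(&mm->page_table_lock);

		/* ... the huge mapping is in use ... */

		spin_lock(&mm->page_table_lock);
		pgtable = pgtable_trans_huge_withdraw(mm); /* at zap or split time */
		spin_unlock(&mm->page_table_lock);
	}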
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 36f5141e8041..3b3e27a3ff2c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig | |||
@@ -13,14 +13,17 @@ config SUPERH | |||
13 | select HAVE_DMA_ATTRS | 13 | select HAVE_DMA_ATTRS |
14 | select HAVE_IRQ_WORK | 14 | select HAVE_IRQ_WORK |
15 | select HAVE_PERF_EVENTS | 15 | select HAVE_PERF_EVENTS |
16 | select HAVE_DEBUG_BUGVERBOSE | ||
16 | select ARCH_HAVE_CUSTOM_GPIO_H | 17 | select ARCH_HAVE_CUSTOM_GPIO_H |
17 | select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) | 18 | select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) |
18 | select PERF_USE_VMALLOC | 19 | select PERF_USE_VMALLOC |
20 | select HAVE_DEBUG_KMEMLEAK | ||
19 | select HAVE_KERNEL_GZIP | 21 | select HAVE_KERNEL_GZIP |
20 | select HAVE_KERNEL_BZIP2 | 22 | select HAVE_KERNEL_BZIP2 |
21 | select HAVE_KERNEL_LZMA | 23 | select HAVE_KERNEL_LZMA |
22 | select HAVE_KERNEL_XZ | 24 | select HAVE_KERNEL_XZ |
23 | select HAVE_KERNEL_LZO | 25 | select HAVE_KERNEL_LZO |
26 | select HAVE_UID16 | ||
24 | select ARCH_WANT_IPC_PARSE_VERSION | 27 | select ARCH_WANT_IPC_PARSE_VERSION |
25 | select HAVE_SYSCALL_TRACEPOINTS | 28 | select HAVE_SYSCALL_TRACEPOINTS |
26 | select HAVE_REGS_AND_STACK_ACCESS_API | 29 | select HAVE_REGS_AND_STACK_ACCESS_API |
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 967068fb79ac..b3808c7d67b2 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h | |||
@@ -1,6 +1,7 @@ | |||
1 | #ifndef _ASM_SH_HUGETLB_H | 1 | #ifndef _ASM_SH_HUGETLB_H |
2 | #define _ASM_SH_HUGETLB_H | 2 | #define _ASM_SH_HUGETLB_H |
3 | 3 | ||
4 | #include <asm/cacheflush.h> | ||
4 | #include <asm/page.h> | 5 | #include <asm/page.h> |
5 | 6 | ||
6 | 7 | ||
@@ -89,4 +90,9 @@ static inline void arch_release_hugepage(struct page *page) | |||
89 | { | 90 | { |
90 | } | 91 | } |
91 | 92 | ||
93 | static inline void arch_clear_hugepage_flags(struct page *page) | ||
94 | { | ||
95 | clear_bit(PG_dcache_clean, &page->flags); | ||
96 | } | ||
97 | |||
92 | #endif /* _ASM_SH_HUGETLB_H */ | 98 | #endif /* _ASM_SH_HUGETLB_H */ |
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 3bdc1ad9a341..cbbdcad8fcb3 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c | |||
@@ -504,6 +504,7 @@ good_area: | |||
504 | } | 504 | } |
505 | if (fault & VM_FAULT_RETRY) { | 505 | if (fault & VM_FAULT_RETRY) { |
506 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 506 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
507 | flags |= FAULT_FLAG_TRIED; | ||
507 | 508 | ||
508 | /* | 509 | /* |
509 | * No need to up_read(&mm->mmap_sem) as we would | 510 | * No need to up_read(&mm->mmap_sem) as we would |
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 67f1f6f5f4e1..91c780c973ba 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig | |||
@@ -18,6 +18,7 @@ config SPARC | |||
18 | select HAVE_OPROFILE | 18 | select HAVE_OPROFILE |
19 | select HAVE_ARCH_KGDB if !SMP || SPARC64 | 19 | select HAVE_ARCH_KGDB if !SMP || SPARC64 |
20 | select HAVE_ARCH_TRACEHOOK | 20 | select HAVE_ARCH_TRACEHOOK |
21 | select SYSCTL_EXCEPTION_TRACE | ||
21 | select ARCH_WANT_OPTIONAL_GPIOLIB | 22 | select ARCH_WANT_OPTIONAL_GPIOLIB |
22 | select RTC_CLASS | 23 | select RTC_CLASS |
23 | select RTC_DRV_M48T59 | 24 | select RTC_DRV_M48T59 |
@@ -32,6 +33,7 @@ config SPARC | |||
32 | select GENERIC_PCI_IOMAP | 33 | select GENERIC_PCI_IOMAP |
33 | select HAVE_NMI_WATCHDOG if SPARC64 | 34 | select HAVE_NMI_WATCHDOG if SPARC64 |
34 | select HAVE_BPF_JIT | 35 | select HAVE_BPF_JIT |
36 | select HAVE_DEBUG_BUGVERBOSE | ||
35 | select GENERIC_SMP_IDLE_THREAD | 37 | select GENERIC_SMP_IDLE_THREAD |
36 | select GENERIC_CMOS_UPDATE | 38 | select GENERIC_CMOS_UPDATE |
37 | select GENERIC_CLOCKEVENTS | 39 | select GENERIC_CLOCKEVENTS |
@@ -42,6 +44,7 @@ config SPARC32 | |||
42 | def_bool !64BIT | 44 | def_bool !64BIT |
43 | select GENERIC_ATOMIC64 | 45 | select GENERIC_ATOMIC64 |
44 | select CLZ_TAB | 46 | select CLZ_TAB |
47 | select HAVE_UID16 | ||
45 | 48 | ||
46 | config SPARC64 | 49 | config SPARC64 |
47 | def_bool 64BIT | 50 | def_bool 64BIT |
@@ -59,6 +62,7 @@ config SPARC64 | |||
59 | select HAVE_DYNAMIC_FTRACE | 62 | select HAVE_DYNAMIC_FTRACE |
60 | select HAVE_FTRACE_MCOUNT_RECORD | 63 | select HAVE_FTRACE_MCOUNT_RECORD |
61 | select HAVE_SYSCALL_TRACEPOINTS | 64 | select HAVE_SYSCALL_TRACEPOINTS |
65 | select HAVE_DEBUG_KMEMLEAK | ||
62 | select RTC_DRV_CMOS | 66 | select RTC_DRV_CMOS |
63 | select RTC_DRV_BQ4802 | 67 | select RTC_DRV_BQ4802 |
64 | select RTC_DRV_SUN4V | 68 | select RTC_DRV_SUN4V |
@@ -226,25 +230,6 @@ config EARLYFB | |||
226 | help | 230 | help |
227 | Say Y here to enable a faster early framebuffer boot console. | 231 | Say Y here to enable a faster early framebuffer boot console. |
228 | 232 | ||
229 | choice | ||
230 | prompt "Kernel page size" if SPARC64 | ||
231 | default SPARC64_PAGE_SIZE_8KB | ||
232 | |||
233 | config SPARC64_PAGE_SIZE_8KB | ||
234 | bool "8KB" | ||
235 | help | ||
236 | This lets you select the page size of the kernel. | ||
237 | |||
238 | 8KB and 64KB work quite well, since SPARC ELF sections | ||
239 | provide for up to 64KB alignment. | ||
240 | |||
241 | If you don't know what to do, choose 8KB. | ||
242 | |||
243 | config SPARC64_PAGE_SIZE_64KB | ||
244 | bool "64KB" | ||
245 | |||
246 | endchoice | ||
247 | |||
248 | config SECCOMP | 233 | config SECCOMP |
249 | bool "Enable seccomp to safely compute untrusted bytecode" | 234 | bool "Enable seccomp to safely compute untrusted bytecode" |
250 | depends on SPARC64 && PROC_FS | 235 | depends on SPARC64 && PROC_FS |
@@ -316,23 +301,6 @@ config GENERIC_LOCKBREAK | |||
316 | default y | 301 | default y |
317 | depends on SPARC64 && SMP && PREEMPT | 302 | depends on SPARC64 && SMP && PREEMPT |
318 | 303 | ||
319 | choice | ||
320 | prompt "SPARC64 Huge TLB Page Size" | ||
321 | depends on SPARC64 && HUGETLB_PAGE | ||
322 | default HUGETLB_PAGE_SIZE_4MB | ||
323 | |||
324 | config HUGETLB_PAGE_SIZE_4MB | ||
325 | bool "4MB" | ||
326 | |||
327 | config HUGETLB_PAGE_SIZE_512K | ||
328 | bool "512K" | ||
329 | |||
330 | config HUGETLB_PAGE_SIZE_64K | ||
331 | depends on !SPARC64_PAGE_SIZE_64KB | ||
332 | bool "64K" | ||
333 | |||
334 | endchoice | ||
335 | |||
336 | config NUMA | 304 | config NUMA |
337 | bool "NUMA support" | 305 | bool "NUMA support" |
338 | depends on SPARC64 && SMP | 306 | depends on SPARC64 && SMP |
@@ -571,6 +539,7 @@ config COMPAT | |||
571 | depends on SPARC64 | 539 | depends on SPARC64 |
572 | default y | 540 | default y |
573 | select COMPAT_BINFMT_ELF | 541 | select COMPAT_BINFMT_ELF |
542 | select HAVE_UID16 | ||
574 | select ARCH_WANT_OLD_COMPAT_IPC | 543 | select ARCH_WANT_OLD_COMPAT_IPC |
575 | 544 | ||
576 | config SYSVIPC_COMPAT | 545 | config SYSVIPC_COMPAT |
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 177061064ee6..8c5eed6d267f 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h | |||
@@ -10,7 +10,10 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | |||
10 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, | 10 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, |
11 | pte_t *ptep); | 11 | pte_t *ptep); |
12 | 12 | ||
13 | void hugetlb_prefault_arch_hook(struct mm_struct *mm); | 13 | static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) |
14 | { | ||
15 | hugetlb_setup(mm); | ||
16 | } | ||
14 | 17 | ||
15 | static inline int is_hugepage_only_range(struct mm_struct *mm, | 18 | static inline int is_hugepage_only_range(struct mm_struct *mm, |
16 | unsigned long addr, | 19 | unsigned long addr, |
@@ -82,4 +85,8 @@ static inline void arch_release_hugepage(struct page *page) | |||
82 | { | 85 | { |
83 | } | 86 | } |
84 | 87 | ||
88 | static inline void arch_clear_hugepage_flags(struct page *page) | ||
89 | { | ||
90 | } | ||
91 | |||
85 | #endif /* _ASM_SPARC64_HUGETLB_H */ | 92 | #endif /* _ASM_SPARC64_HUGETLB_H */ |
diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 9067dc500535..76092c4dd277 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h | |||
@@ -30,22 +30,8 @@ | |||
30 | #define CTX_PGSZ_MASK ((CTX_PGSZ_BITS << CTX_PGSZ0_SHIFT) | \ | 30 | #define CTX_PGSZ_MASK ((CTX_PGSZ_BITS << CTX_PGSZ0_SHIFT) | \ |
31 | (CTX_PGSZ_BITS << CTX_PGSZ1_SHIFT)) | 31 | (CTX_PGSZ_BITS << CTX_PGSZ1_SHIFT)) |
32 | 32 | ||
33 | #if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) | ||
34 | #define CTX_PGSZ_BASE CTX_PGSZ_8KB | 33 | #define CTX_PGSZ_BASE CTX_PGSZ_8KB |
35 | #elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) | 34 | #define CTX_PGSZ_HUGE CTX_PGSZ_4MB |
36 | #define CTX_PGSZ_BASE CTX_PGSZ_64KB | ||
37 | #else | ||
38 | #error No page size specified in kernel configuration | ||
39 | #endif | ||
40 | |||
41 | #if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) | ||
42 | #define CTX_PGSZ_HUGE CTX_PGSZ_4MB | ||
43 | #elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) | ||
44 | #define CTX_PGSZ_HUGE CTX_PGSZ_512KB | ||
45 | #elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) | ||
46 | #define CTX_PGSZ_HUGE CTX_PGSZ_64KB | ||
47 | #endif | ||
48 | |||
49 | #define CTX_PGSZ_KERN CTX_PGSZ_4MB | 35 | #define CTX_PGSZ_KERN CTX_PGSZ_4MB |
50 | 36 | ||
51 | /* Thus, when running on UltraSPARC-III+ and later, we use the following | 37 | /* Thus, when running on UltraSPARC-III+ and later, we use the following |
@@ -96,7 +82,7 @@ struct tsb_config { | |||
96 | 82 | ||
97 | #define MM_TSB_BASE 0 | 83 | #define MM_TSB_BASE 0 |
98 | 84 | ||
99 | #ifdef CONFIG_HUGETLB_PAGE | 85 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
100 | #define MM_TSB_HUGE 1 | 86 | #define MM_TSB_HUGE 1 |
101 | #define MM_NUM_TSBS 2 | 87 | #define MM_NUM_TSBS 2 |
102 | #else | 88 | #else |
@@ -107,6 +93,7 @@ typedef struct { | |||
107 | spinlock_t lock; | 93 | spinlock_t lock; |
108 | unsigned long sparc64_ctx_val; | 94 | unsigned long sparc64_ctx_val; |
109 | unsigned long huge_pte_count; | 95 | unsigned long huge_pte_count; |
96 | struct page *pgtable_page; | ||
110 | struct tsb_config tsb_block[MM_NUM_TSBS]; | 97 | struct tsb_config tsb_block[MM_NUM_TSBS]; |
111 | struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; | 98 | struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; |
112 | } mm_context_t; | 99 | } mm_context_t; |
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index a97fd085cebe..9191ca62ed9c 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h | |||
@@ -36,7 +36,7 @@ static inline void tsb_context_switch(struct mm_struct *mm) | |||
36 | { | 36 | { |
37 | __tsb_context_switch(__pa(mm->pgd), | 37 | __tsb_context_switch(__pa(mm->pgd), |
38 | &mm->context.tsb_block[0], | 38 | &mm->context.tsb_block[0], |
39 | #ifdef CONFIG_HUGETLB_PAGE | 39 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
40 | (mm->context.tsb_block[1].tsb ? | 40 | (mm->context.tsb_block[1].tsb ? |
41 | &mm->context.tsb_block[1] : | 41 | &mm->context.tsb_block[1] : |
42 | NULL) | 42 | NULL) |
diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index f0d09b401036..4b39f74d6ca0 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h | |||
@@ -3,13 +3,7 @@ | |||
3 | 3 | ||
4 | #include <linux/const.h> | 4 | #include <linux/const.h> |
5 | 5 | ||
6 | #if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) | ||
7 | #define PAGE_SHIFT 13 | 6 | #define PAGE_SHIFT 13 |
8 | #elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) | ||
9 | #define PAGE_SHIFT 16 | ||
10 | #else | ||
11 | #error No page size specified in kernel configuration | ||
12 | #endif | ||
13 | 7 | ||
14 | #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) | 8 | #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) |
15 | #define PAGE_MASK (~(PAGE_SIZE-1)) | 9 | #define PAGE_MASK (~(PAGE_SIZE-1)) |
@@ -21,15 +15,9 @@ | |||
21 | #define DCACHE_ALIASING_POSSIBLE | 15 | #define DCACHE_ALIASING_POSSIBLE |
22 | #endif | 16 | #endif |
23 | 17 | ||
24 | #if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) | ||
25 | #define HPAGE_SHIFT 22 | 18 | #define HPAGE_SHIFT 22 |
26 | #elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) | ||
27 | #define HPAGE_SHIFT 19 | ||
28 | #elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) | ||
29 | #define HPAGE_SHIFT 16 | ||
30 | #endif | ||
31 | 19 | ||
32 | #ifdef CONFIG_HUGETLB_PAGE | 20 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
33 | #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) | 21 | #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) |
34 | #define HPAGE_MASK (~(HPAGE_SIZE - 1UL)) | 22 | #define HPAGE_MASK (~(HPAGE_SIZE - 1UL)) |
35 | #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) | 23 | #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) |
@@ -38,6 +26,11 @@ | |||
38 | 26 | ||
39 | #ifndef __ASSEMBLY__ | 27 | #ifndef __ASSEMBLY__ |
40 | 28 | ||
29 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
30 | struct mm_struct; | ||
31 | extern void hugetlb_setup(struct mm_struct *mm); | ||
32 | #endif | ||
33 | |||
41 | #define WANT_PAGE_VIRTUAL | 34 | #define WANT_PAGE_VIRTUAL |
42 | 35 | ||
43 | extern void _clear_page(void *page); | 36 | extern void _clear_page(void *page); |
@@ -98,7 +91,7 @@ typedef unsigned long pgprot_t; | |||
98 | 91 | ||
99 | #endif /* (STRICT_MM_TYPECHECKS) */ | 92 | #endif /* (STRICT_MM_TYPECHECKS) */ |
100 | 93 | ||
101 | typedef struct page *pgtable_t; | 94 | typedef pte_t *pgtable_t; |
102 | 95 | ||
103 | #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? \ | 96 | #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? \ |
104 | (_AC(0x0000000070000000,UL)) : \ | 97 | (_AC(0x0000000070000000,UL)) : \ |
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 40b2d7a7023d..bcfe063bce23 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h | |||
@@ -38,51 +38,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) | |||
38 | kmem_cache_free(pgtable_cache, pmd); | 38 | kmem_cache_free(pgtable_cache, pmd); |
39 | } | 39 | } |
40 | 40 | ||
41 | static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, | 41 | extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, |
42 | unsigned long address) | 42 | unsigned long address); |
43 | { | 43 | extern pgtable_t pte_alloc_one(struct mm_struct *mm, |
44 | return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); | 44 | unsigned long address); |
45 | } | 45 | extern void pte_free_kernel(struct mm_struct *mm, pte_t *pte); |
46 | 46 | extern void pte_free(struct mm_struct *mm, pgtable_t ptepage); | |
47 | static inline pgtable_t pte_alloc_one(struct mm_struct *mm, | ||
48 | unsigned long address) | ||
49 | { | ||
50 | struct page *page; | ||
51 | pte_t *pte; | ||
52 | |||
53 | pte = pte_alloc_one_kernel(mm, address); | ||
54 | if (!pte) | ||
55 | return NULL; | ||
56 | page = virt_to_page(pte); | ||
57 | pgtable_page_ctor(page); | ||
58 | return page; | ||
59 | } | ||
60 | |||
61 | static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) | ||
62 | { | ||
63 | free_page((unsigned long)pte); | ||
64 | } | ||
65 | |||
66 | static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) | ||
67 | { | ||
68 | pgtable_page_dtor(ptepage); | ||
69 | __free_page(ptepage); | ||
70 | } | ||
71 | 47 | ||
72 | #define pmd_populate_kernel(MM, PMD, PTE) pmd_set(PMD, PTE) | 48 | #define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE) |
73 | #define pmd_populate(MM,PMD,PTE_PAGE) \ | 49 | #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) |
74 | pmd_populate_kernel(MM,PMD,page_address(PTE_PAGE)) | 50 | #define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) |
75 | #define pmd_pgtable(pmd) pmd_page(pmd) | ||
76 | 51 | ||
77 | #define check_pgt_cache() do { } while (0) | 52 | #define check_pgt_cache() do { } while (0) |
78 | 53 | ||
79 | static inline void pgtable_free(void *table, bool is_page) | 54 | extern void pgtable_free(void *table, bool is_page); |
80 | { | ||
81 | if (is_page) | ||
82 | free_page((unsigned long)table); | ||
83 | else | ||
84 | kmem_cache_free(pgtable_cache, table); | ||
85 | } | ||
86 | 55 | ||
87 | #ifdef CONFIG_SMP | 56 | #ifdef CONFIG_SMP |
88 | 57 | ||
@@ -113,11 +82,10 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, bool is | |||
113 | } | 82 | } |
114 | #endif /* !CONFIG_SMP */ | 83 | #endif /* !CONFIG_SMP */ |
115 | 84 | ||
116 | static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, | 85 | static inline void __pte_free_tlb(struct mmu_gather *tlb, pte_t *pte, |
117 | unsigned long address) | 86 | unsigned long address) |
118 | { | 87 | { |
119 | pgtable_page_dtor(ptepage); | 88 | pgtable_free_tlb(tlb, pte, true); |
120 | pgtable_free_tlb(tlb, page_address(ptepage), true); | ||
121 | } | 89 | } |
122 | 90 | ||
123 | #define __pmd_free_tlb(tlb, pmd, addr) \ | 91 | #define __pmd_free_tlb(tlb, pmd, addr) \ |
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 61210db139fb..95515f1e7cef 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h | |||
@@ -45,40 +45,59 @@ | |||
45 | 45 | ||
46 | #define vmemmap ((struct page *)VMEMMAP_BASE) | 46 | #define vmemmap ((struct page *)VMEMMAP_BASE) |
47 | 47 | ||
48 | /* XXX All of this needs to be rethought so we can take advantage | ||
49 | * XXX cheetah's full 64-bit virtual address space, ie. no more hole | ||
50 | * XXX in the middle like on spitfire. -DaveM | ||
51 | */ | ||
52 | /* | ||
53 | * Given a virtual address, the lowest PAGE_SHIFT bits determine offset | ||
54 | * into the page; the next higher PAGE_SHIFT-3 bits determine the pte# | ||
55 | * in the proper pagetable (the -3 is from the 8 byte ptes, and each page | ||
56 | * table is a single page long). The next higher PMD_BITS determine pmd# | ||
57 | * in the proper pmdtable (where we must have PMD_BITS <= (PAGE_SHIFT-2) | ||
58 | * since the pmd entries are 4 bytes, and each pmd page is a single page | ||
59 | * long). Finally, the higher few bits determine pgde#. | ||
60 | */ | ||
61 | |||
62 | /* PMD_SHIFT determines the size of the area a second-level page | 48 | /* PMD_SHIFT determines the size of the area a second-level page |
63 | * table can map | 49 | * table can map |
64 | */ | 50 | */ |
65 | #define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) | 51 | #define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4)) |
66 | #define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) | 52 | #define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) |
67 | #define PMD_MASK (~(PMD_SIZE-1)) | 53 | #define PMD_MASK (~(PMD_SIZE-1)) |
68 | #define PMD_BITS (PAGE_SHIFT - 2) | 54 | #define PMD_BITS (PAGE_SHIFT - 2) |
69 | 55 | ||
70 | /* PGDIR_SHIFT determines what a third-level page table entry can map */ | 56 | /* PGDIR_SHIFT determines what a third-level page table entry can map */ |
71 | #define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS) | 57 | #define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-4) + PMD_BITS) |
72 | #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) | 58 | #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) |
73 | #define PGDIR_MASK (~(PGDIR_SIZE-1)) | 59 | #define PGDIR_MASK (~(PGDIR_SIZE-1)) |
74 | #define PGDIR_BITS (PAGE_SHIFT - 2) | 60 | #define PGDIR_BITS (PAGE_SHIFT - 2) |
75 | 61 | ||
62 | #if (PGDIR_SHIFT + PGDIR_BITS) != 44 | ||
63 | #error Page table parameters do not cover virtual address space properly. | ||
64 | #endif | ||
65 | |||
66 | #if (PMD_SHIFT != HPAGE_SHIFT) | ||
67 | #error PMD_SHIFT must equal HPAGE_SHIFT for transparent huge pages. | ||
68 | #endif | ||
69 | |||
70 | /* PMDs point to PTE tables which are 4K aligned. */ | ||
71 | #define PMD_PADDR _AC(0xfffffffe,UL) | ||
72 | #define PMD_PADDR_SHIFT _AC(11,UL) | ||
73 | |||
74 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
75 | #define PMD_ISHUGE _AC(0x00000001,UL) | ||
76 | |||
77 | /* This is the PMD layout when PMD_ISHUGE is set. With 4MB huge | ||
78 | * pages, this frees up a bunch of bits in the layout that we can | ||
79 | * use for the protection settings and software metadata. | ||
80 | */ | ||
81 | #define PMD_HUGE_PADDR _AC(0xfffff800,UL) | ||
82 | #define PMD_HUGE_PROTBITS _AC(0x000007ff,UL) | ||
83 | #define PMD_HUGE_PRESENT _AC(0x00000400,UL) | ||
84 | #define PMD_HUGE_WRITE _AC(0x00000200,UL) | ||
85 | #define PMD_HUGE_DIRTY _AC(0x00000100,UL) | ||
86 | #define PMD_HUGE_ACCESSED _AC(0x00000080,UL) | ||
87 | #define PMD_HUGE_EXEC _AC(0x00000040,UL) | ||
88 | #define PMD_HUGE_SPLITTING _AC(0x00000020,UL) | ||
89 | #endif | ||
90 | |||
91 | /* PGDs point to PMD tables which are 8K aligned. */ | ||
92 | #define PGD_PADDR _AC(0xfffffffc,UL) | ||
93 | #define PGD_PADDR_SHIFT _AC(11,UL) | ||
94 | |||
76 | #ifndef __ASSEMBLY__ | 95 | #ifndef __ASSEMBLY__ |
77 | 96 | ||
78 | #include <linux/sched.h> | 97 | #include <linux/sched.h> |
79 | 98 | ||
80 | /* Entries per page directory level. */ | 99 | /* Entries per page directory level. */ |
81 | #define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3)) | 100 | #define PTRS_PER_PTE (1UL << (PAGE_SHIFT-4)) |
82 | #define PTRS_PER_PMD (1UL << PMD_BITS) | 101 | #define PTRS_PER_PMD (1UL << PMD_BITS) |
83 | #define PTRS_PER_PGD (1UL << PGDIR_BITS) | 102 | #define PTRS_PER_PGD (1UL << PGDIR_BITS) |
84 | 103 | ||
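For orientation (this is an annotation, not part of the patch): with the base page size pinned at 8KB, the new constants above can be checked with a few lines of stand-alone C. The 8-byte pte and 4-byte pmd/pgd entry sizes below are inferred from the ldxa/lduwa accesses and the 0x3/0x7 masks used by the TSB walkers later in this series.

/* Stand-alone sanity check of the new sparc64 page table geometry.
 * Constants copied from the hunk above; entry sizes are assumptions
 * noted in the lead-in, not definitions taken from this header.
 */
#include <stdio.h>

#define PAGE_SHIFT   13
#define HPAGE_SHIFT  22
#define PMD_SHIFT    (PAGE_SHIFT + (PAGE_SHIFT - 4))
#define PMD_BITS     (PAGE_SHIFT - 2)
#define PGDIR_SHIFT  (PAGE_SHIFT + (PAGE_SHIFT - 4) + PMD_BITS)
#define PGDIR_BITS   (PAGE_SHIFT - 2)

#define PTRS_PER_PTE (1UL << (PAGE_SHIFT - 4))
#define PTRS_PER_PMD (1UL << PMD_BITS)
#define PTRS_PER_PGD (1UL << PGDIR_BITS)

int main(void)
{
	/* 22: one PTE table maps 4MB, so PMD_SHIFT == HPAGE_SHIFT, as the
	 * #error check added above demands. */
	printf("PMD_SHIFT   = %d\n", PMD_SHIFT);
	/* 512 * 8 bytes = 4KB: a PTE table is now half of an 8KB page,
	 * hence "PMDs point to PTE tables which are 4K aligned". */
	printf("PTE table   = %lu bytes\n", PTRS_PER_PTE * 8);
	/* 2048 * 4 bytes = 8KB: PMD and PGD tables each fill one page. */
	printf("PMD table   = %lu bytes\n", PTRS_PER_PMD * 4);
	printf("PGD table   = %lu bytes\n", PTRS_PER_PGD * 4);
	/* 33 + 11 = 44 bits of virtual address covered, matching the
	 * build-time check added in this hunk. */
	printf("VA coverage = %d bits\n", PGDIR_SHIFT + PGDIR_BITS);
	return 0;
}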
@@ -160,26 +179,11 @@ | |||
160 | #define _PAGE_SZ8K_4V _AC(0x0000000000000000,UL) /* 8K Page */ | 179 | #define _PAGE_SZ8K_4V _AC(0x0000000000000000,UL) /* 8K Page */ |
161 | #define _PAGE_SZALL_4V _AC(0x0000000000000007,UL) /* All pgsz bits */ | 180 | #define _PAGE_SZALL_4V _AC(0x0000000000000007,UL) /* All pgsz bits */ |
162 | 181 | ||
163 | #if PAGE_SHIFT == 13 | ||
164 | #define _PAGE_SZBITS_4U _PAGE_SZ8K_4U | 182 | #define _PAGE_SZBITS_4U _PAGE_SZ8K_4U |
165 | #define _PAGE_SZBITS_4V _PAGE_SZ8K_4V | 183 | #define _PAGE_SZBITS_4V _PAGE_SZ8K_4V |
166 | #elif PAGE_SHIFT == 16 | ||
167 | #define _PAGE_SZBITS_4U _PAGE_SZ64K_4U | ||
168 | #define _PAGE_SZBITS_4V _PAGE_SZ64K_4V | ||
169 | #else | ||
170 | #error Wrong PAGE_SHIFT specified | ||
171 | #endif | ||
172 | 184 | ||
173 | #if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) | ||
174 | #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U | 185 | #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U |
175 | #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V | 186 | #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V |
176 | #elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) | ||
177 | #define _PAGE_SZHUGE_4U _PAGE_SZ512K_4U | ||
178 | #define _PAGE_SZHUGE_4V _PAGE_SZ512K_4V | ||
179 | #elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K) | ||
180 | #define _PAGE_SZHUGE_4U _PAGE_SZ64K_4U | ||
181 | #define _PAGE_SZHUGE_4V _PAGE_SZ64K_4V | ||
182 | #endif | ||
183 | 187 | ||
184 | /* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */ | 188 | /* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */ |
185 | #define __P000 __pgprot(0) | 189 | #define __P000 __pgprot(0) |
@@ -218,7 +222,6 @@ extern unsigned long _PAGE_CACHE; | |||
218 | 222 | ||
219 | extern unsigned long pg_iobits; | 223 | extern unsigned long pg_iobits; |
220 | extern unsigned long _PAGE_ALL_SZ_BITS; | 224 | extern unsigned long _PAGE_ALL_SZ_BITS; |
221 | extern unsigned long _PAGE_SZBITS; | ||
222 | 225 | ||
223 | extern struct page *mem_map_zero; | 226 | extern struct page *mem_map_zero; |
224 | #define ZERO_PAGE(vaddr) (mem_map_zero) | 227 | #define ZERO_PAGE(vaddr) (mem_map_zero) |
@@ -231,25 +234,25 @@ extern struct page *mem_map_zero; | |||
231 | static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) | 234 | static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) |
232 | { | 235 | { |
233 | unsigned long paddr = pfn << PAGE_SHIFT; | 236 | unsigned long paddr = pfn << PAGE_SHIFT; |
234 | unsigned long sz_bits; | 237 | |
235 | 238 | BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL); | |
236 | sz_bits = 0UL; | 239 | return __pte(paddr | pgprot_val(prot)); |
237 | if (_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL) { | ||
238 | __asm__ __volatile__( | ||
239 | "\n661: sethi %%uhi(%1), %0\n" | ||
240 | " sllx %0, 32, %0\n" | ||
241 | " .section .sun4v_2insn_patch, \"ax\"\n" | ||
242 | " .word 661b\n" | ||
243 | " mov %2, %0\n" | ||
244 | " nop\n" | ||
245 | " .previous\n" | ||
246 | : "=r" (sz_bits) | ||
247 | : "i" (_PAGE_SZBITS_4U), "i" (_PAGE_SZBITS_4V)); | ||
248 | } | ||
249 | return __pte(paddr | sz_bits | pgprot_val(prot)); | ||
250 | } | 240 | } |
251 | #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) | 241 | #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) |
252 | 242 | ||
243 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
244 | extern pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot); | ||
245 | #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) | ||
246 | |||
247 | extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); | ||
248 | |||
249 | static inline pmd_t pmd_mkhuge(pmd_t pmd) | ||
250 | { | ||
251 | /* Do nothing, mk_pmd() does this part. */ | ||
252 | return pmd; | ||
253 | } | ||
254 | #endif | ||
255 | |||
253 | /* This one can be done with two shifts. */ | 256 | /* This one can be done with two shifts. */ |
254 | static inline unsigned long pte_pfn(pte_t pte) | 257 | static inline unsigned long pte_pfn(pte_t pte) |
255 | { | 258 | { |
@@ -286,6 +289,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot) | |||
286 | * Note: We encode this into 3 sun4v 2-insn patch sequences. | 289 | * Note: We encode this into 3 sun4v 2-insn patch sequences. |
287 | */ | 290 | */ |
288 | 291 | ||
292 | BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL); | ||
289 | __asm__ __volatile__( | 293 | __asm__ __volatile__( |
290 | "\n661: sethi %%uhi(%2), %1\n" | 294 | "\n661: sethi %%uhi(%2), %1\n" |
291 | " sethi %%hi(%2), %0\n" | 295 | " sethi %%hi(%2), %0\n" |
@@ -307,10 +311,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot) | |||
307 | : "=r" (mask), "=r" (tmp) | 311 | : "=r" (mask), "=r" (tmp) |
308 | : "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U | | 312 | : "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U | |
309 | _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U | _PAGE_PRESENT_4U | | 313 | _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U | _PAGE_PRESENT_4U | |
310 | _PAGE_SZBITS_4U | _PAGE_SPECIAL), | 314 | _PAGE_SPECIAL), |
311 | "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V | | 315 | "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V | |
312 | _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V | _PAGE_PRESENT_4V | | 316 | _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V | _PAGE_PRESENT_4V | |
313 | _PAGE_SZBITS_4V | _PAGE_SPECIAL)); | 317 | _PAGE_SPECIAL)); |
314 | 318 | ||
315 | return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask)); | 319 | return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask)); |
316 | } | 320 | } |
@@ -618,19 +622,130 @@ static inline unsigned long pte_special(pte_t pte) | |||
618 | return pte_val(pte) & _PAGE_SPECIAL; | 622 | return pte_val(pte) & _PAGE_SPECIAL; |
619 | } | 623 | } |
620 | 624 | ||
621 | #define pmd_set(pmdp, ptep) \ | 625 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
622 | (pmd_val(*(pmdp)) = (__pa((unsigned long) (ptep)) >> 11UL)) | 626 | static inline int pmd_young(pmd_t pmd) |
627 | { | ||
628 | return pmd_val(pmd) & PMD_HUGE_ACCESSED; | ||
629 | } | ||
630 | |||
631 | static inline int pmd_write(pmd_t pmd) | ||
632 | { | ||
633 | return pmd_val(pmd) & PMD_HUGE_WRITE; | ||
634 | } | ||
635 | |||
636 | static inline unsigned long pmd_pfn(pmd_t pmd) | ||
637 | { | ||
638 | unsigned long val = pmd_val(pmd) & PMD_HUGE_PADDR; | ||
639 | |||
640 | return val >> (PAGE_SHIFT - PMD_PADDR_SHIFT); | ||
641 | } | ||
642 | |||
643 | static inline int pmd_large(pmd_t pmd) | ||
644 | { | ||
645 | return (pmd_val(pmd) & (PMD_ISHUGE | PMD_HUGE_PRESENT)) == | ||
646 | (PMD_ISHUGE | PMD_HUGE_PRESENT); | ||
647 | } | ||
648 | |||
649 | static inline int pmd_trans_splitting(pmd_t pmd) | ||
650 | { | ||
651 | return (pmd_val(pmd) & (PMD_ISHUGE|PMD_HUGE_SPLITTING)) == | ||
652 | (PMD_ISHUGE|PMD_HUGE_SPLITTING); | ||
653 | } | ||
654 | |||
655 | static inline int pmd_trans_huge(pmd_t pmd) | ||
656 | { | ||
657 | return pmd_val(pmd) & PMD_ISHUGE; | ||
658 | } | ||
659 | |||
660 | #define has_transparent_hugepage() 1 | ||
661 | |||
662 | static inline pmd_t pmd_mkold(pmd_t pmd) | ||
663 | { | ||
664 | pmd_val(pmd) &= ~PMD_HUGE_ACCESSED; | ||
665 | return pmd; | ||
666 | } | ||
667 | |||
668 | static inline pmd_t pmd_wrprotect(pmd_t pmd) | ||
669 | { | ||
670 | pmd_val(pmd) &= ~PMD_HUGE_WRITE; | ||
671 | return pmd; | ||
672 | } | ||
673 | |||
674 | static inline pmd_t pmd_mkdirty(pmd_t pmd) | ||
675 | { | ||
676 | pmd_val(pmd) |= PMD_HUGE_DIRTY; | ||
677 | return pmd; | ||
678 | } | ||
679 | |||
680 | static inline pmd_t pmd_mkyoung(pmd_t pmd) | ||
681 | { | ||
682 | pmd_val(pmd) |= PMD_HUGE_ACCESSED; | ||
683 | return pmd; | ||
684 | } | ||
685 | |||
686 | static inline pmd_t pmd_mkwrite(pmd_t pmd) | ||
687 | { | ||
688 | pmd_val(pmd) |= PMD_HUGE_WRITE; | ||
689 | return pmd; | ||
690 | } | ||
691 | |||
692 | static inline pmd_t pmd_mknotpresent(pmd_t pmd) | ||
693 | { | ||
694 | pmd_val(pmd) &= ~PMD_HUGE_PRESENT; | ||
695 | return pmd; | ||
696 | } | ||
697 | |||
698 | static inline pmd_t pmd_mksplitting(pmd_t pmd) | ||
699 | { | ||
700 | pmd_val(pmd) |= PMD_HUGE_SPLITTING; | ||
701 | return pmd; | ||
702 | } | ||
703 | |||
704 | extern pgprot_t pmd_pgprot(pmd_t entry); | ||
705 | #endif | ||
706 | |||
707 | static inline int pmd_present(pmd_t pmd) | ||
708 | { | ||
709 | return pmd_val(pmd) != 0U; | ||
710 | } | ||
711 | |||
712 | #define pmd_none(pmd) (!pmd_val(pmd)) | ||
713 | |||
714 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
715 | extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, | ||
716 | pmd_t *pmdp, pmd_t pmd); | ||
717 | #else | ||
718 | static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, | ||
719 | pmd_t *pmdp, pmd_t pmd) | ||
720 | { | ||
721 | *pmdp = pmd; | ||
722 | } | ||
723 | #endif | ||
724 | |||
725 | static inline void pmd_set(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) | ||
726 | { | ||
727 | unsigned long val = __pa((unsigned long) (ptep)) >> PMD_PADDR_SHIFT; | ||
728 | |||
729 | pmd_val(*pmdp) = val; | ||
730 | } | ||
731 | |||
623 | #define pud_set(pudp, pmdp) \ | 732 | #define pud_set(pudp, pmdp) \ |
624 | (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) | 733 | (pud_val(*(pudp)) = (__pa((unsigned long) (pmdp)) >> PGD_PADDR_SHIFT)) |
625 | #define __pmd_page(pmd) \ | 734 | static inline unsigned long __pmd_page(pmd_t pmd) |
626 | ((unsigned long) __va((((unsigned long)pmd_val(pmd))<<11UL))) | 735 | { |
736 | unsigned long paddr = (unsigned long) pmd_val(pmd); | ||
737 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
738 | if (pmd_val(pmd) & PMD_ISHUGE) | ||
739 | paddr &= PMD_HUGE_PADDR; | ||
740 | #endif | ||
741 | paddr <<= PMD_PADDR_SHIFT; | ||
742 | return ((unsigned long) __va(paddr)); | ||
743 | } | ||
627 | #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) | 744 | #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) |
628 | #define pud_page_vaddr(pud) \ | 745 | #define pud_page_vaddr(pud) \ |
629 | ((unsigned long) __va((((unsigned long)pud_val(pud))<<11UL))) | 746 | ((unsigned long) __va((((unsigned long)pud_val(pud))<<PGD_PADDR_SHIFT))) |
630 | #define pud_page(pud) virt_to_page((void *)pud_page_vaddr(pud)) | 747 | #define pud_page(pud) virt_to_page((void *)pud_page_vaddr(pud)) |
631 | #define pmd_none(pmd) (!pmd_val(pmd)) | ||
632 | #define pmd_bad(pmd) (0) | 748 | #define pmd_bad(pmd) (0) |
633 | #define pmd_present(pmd) (pmd_val(pmd) != 0U) | ||
634 | #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0U) | 749 | #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0U) |
635 | #define pud_none(pud) (!pud_val(pud)) | 750 | #define pud_none(pud) (!pud_val(pud)) |
636 | #define pud_bad(pud) (0) | 751 | #define pud_bad(pud) (0) |
@@ -664,6 +779,16 @@ static inline unsigned long pte_special(pte_t pte) | |||
664 | extern void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, | 779 | extern void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, |
665 | pte_t *ptep, pte_t orig, int fullmm); | 780 | pte_t *ptep, pte_t orig, int fullmm); |
666 | 781 | ||
782 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | ||
783 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | ||
784 | unsigned long addr, | ||
785 | pmd_t *pmdp) | ||
786 | { | ||
787 | pmd_t pmd = *pmdp; | ||
788 | set_pmd_at(mm, addr, pmdp, __pmd(0U)); | ||
789 | return pmd; | ||
790 | } | ||
791 | |||
667 | static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, | 792 | static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, |
668 | pte_t *ptep, pte_t pte, int fullmm) | 793 | pte_t *ptep, pte_t pte, int fullmm) |
669 | { | 794 | { |
@@ -719,6 +844,16 @@ extern void mmu_info(struct seq_file *); | |||
719 | 844 | ||
720 | struct vm_area_struct; | 845 | struct vm_area_struct; |
721 | extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); | 846 | extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); |
847 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
848 | extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, | ||
849 | pmd_t *pmd); | ||
850 | |||
851 | #define __HAVE_ARCH_PGTABLE_DEPOSIT | ||
852 | extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); | ||
853 | |||
854 | #define __HAVE_ARCH_PGTABLE_WITHDRAW | ||
855 | extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); | ||
856 | #endif | ||
722 | 857 | ||
723 | /* Encode and de-code a swap entry */ | 858 | /* Encode and de-code a swap entry */ |
724 | #define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0xffUL) | 859 | #define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0xffUL) |
diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h index 1a8afd1ad04f..b4c258de4443 100644 --- a/arch/sparc/include/asm/tsb.h +++ b/arch/sparc/include/asm/tsb.h | |||
@@ -147,20 +147,96 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; | |||
147 | brz,pn REG1, FAIL_LABEL; \ | 147 | brz,pn REG1, FAIL_LABEL; \ |
148 | sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ | 148 | sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ |
149 | srlx REG2, 64 - PAGE_SHIFT, REG2; \ | 149 | srlx REG2, 64 - PAGE_SHIFT, REG2; \ |
150 | sllx REG1, 11, REG1; \ | 150 | sllx REG1, PGD_PADDR_SHIFT, REG1; \ |
151 | andn REG2, 0x3, REG2; \ | 151 | andn REG2, 0x3, REG2; \ |
152 | lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ | 152 | lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ |
153 | brz,pn REG1, FAIL_LABEL; \ | 153 | brz,pn REG1, FAIL_LABEL; \ |
154 | sllx VADDR, 64 - PMD_SHIFT, REG2; \ | 154 | sllx VADDR, 64 - PMD_SHIFT, REG2; \ |
155 | srlx REG2, 64 - PAGE_SHIFT, REG2; \ | 155 | srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ |
156 | sllx REG1, 11, REG1; \ | 156 | sllx REG1, PMD_PADDR_SHIFT, REG1; \ |
157 | andn REG2, 0x7, REG2; \ | 157 | andn REG2, 0x7, REG2; \ |
158 | add REG1, REG2, REG1; | 158 | add REG1, REG2, REG1; |
159 | 159 | ||
160 | /* Do a user page table walk in MMU globals. Leaves physical PTE | 160 | /* This macro exists only to make the PMD translator below easier |
161 | * pointer in REG1. Jumps to FAIL_LABEL on early page table walk | 161 | * to read. It hides the ELF section switch for the sun4v code |
162 | * termination. Physical base of page tables is in PHYS_PGD which | 162 | * patching. |
163 | * will not be modified. | 163 | */ |
164 | #define OR_PTE_BIT(REG, NAME) \ | ||
165 | 661: or REG, _PAGE_##NAME##_4U, REG; \ | ||
166 | .section .sun4v_1insn_patch, "ax"; \ | ||
167 | .word 661b; \ | ||
168 | or REG, _PAGE_##NAME##_4V, REG; \ | ||
169 | .previous; | ||
170 | |||
171 | /* Load into REG the PTE value for VALID, CACHE, and SZHUGE. */ | ||
172 | #define BUILD_PTE_VALID_SZHUGE_CACHE(REG) \ | ||
173 | 661: sethi %uhi(_PAGE_VALID|_PAGE_SZHUGE_4U), REG; \ | ||
174 | .section .sun4v_1insn_patch, "ax"; \ | ||
175 | .word 661b; \ | ||
176 | sethi %uhi(_PAGE_VALID), REG; \ | ||
177 | .previous; \ | ||
178 | sllx REG, 32, REG; \ | ||
179 | 661: or REG, _PAGE_CP_4U|_PAGE_CV_4U, REG; \ | ||
180 | .section .sun4v_1insn_patch, "ax"; \ | ||
181 | .word 661b; \ | ||
182 | or REG, _PAGE_CP_4V|_PAGE_CV_4V|_PAGE_SZHUGE_4V, REG; \ | ||
183 | .previous; | ||
184 | |||
185 | /* PMD has been loaded into REG1, interpret the value, seeing | ||
186 | * if it is a HUGE PMD or a normal one. If it is not valid | ||
187 | * then jump to FAIL_LABEL. If it is a HUGE PMD, and it | ||
188 | * translates to a valid PTE, branch to PTE_LABEL. | ||
189 | * | ||
190 | * We translate the PMD by hand, one bit at a time, | ||
191 | * constructing the huge PTE. | ||
192 | * | ||
193 | * So we construct the PTE in REG2 as follows: | ||
194 | * | ||
195 | * 1) Extract the PMD PFN from REG1 and place it into REG2. | ||
196 | * | ||
197 | * 2) Translate PMD protection bits in REG1 into REG2, one bit | ||
198 | * at a time using andcc tests on REG1 and OR's into REG2. | ||
199 | * | ||
200 | * Only two bits to be concerned with here, EXEC and WRITE. | ||
201 | * Now REG1 is freed up and we can use it as a temporary. | ||
202 | * | ||
203 | * 3) Construct the VALID, CACHE, and page size PTE bits in | ||
204 | * REG1, OR with REG2 to form final PTE. | ||
205 | */ | ||
206 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
207 | #define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \ | ||
208 | brz,pn REG1, FAIL_LABEL; \ | ||
209 | andcc REG1, PMD_ISHUGE, %g0; \ | ||
210 | be,pt %xcc, 700f; \ | ||
211 | and REG1, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED, REG2; \ | ||
212 | cmp REG2, PMD_HUGE_PRESENT|PMD_HUGE_ACCESSED; \ | ||
213 | bne,pn %xcc, FAIL_LABEL; \ | ||
214 | andn REG1, PMD_HUGE_PROTBITS, REG2; \ | ||
215 | sllx REG2, PMD_PADDR_SHIFT, REG2; \ | ||
216 | /* REG2 now holds PFN << PAGE_SHIFT */ \ | ||
217 | andcc REG1, PMD_HUGE_EXEC, %g0; \ | ||
218 | bne,a,pt %xcc, 1f; \ | ||
219 | OR_PTE_BIT(REG2, EXEC); \ | ||
220 | 1: andcc REG1, PMD_HUGE_WRITE, %g0; \ | ||
221 | bne,a,pt %xcc, 1f; \ | ||
222 | OR_PTE_BIT(REG2, W); \ | ||
223 | /* REG1 can now be clobbered, build final PTE */ \ | ||
224 | 1: BUILD_PTE_VALID_SZHUGE_CACHE(REG1); \ | ||
225 | ba,pt %xcc, PTE_LABEL; \ | ||
226 | or REG1, REG2, REG1; \ | ||
227 | 700: | ||
228 | #else | ||
229 | #define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \ | ||
230 | brz,pn REG1, FAIL_LABEL; \ | ||
231 | nop; | ||
232 | #endif | ||
233 | |||
234 | /* Do a user page table walk in MMU globals. Leaves final, | ||
235 | * valid, PTE value in REG1. Jumps to FAIL_LABEL on early | ||
236 | * page table walk termination or if the PTE is not valid. | ||
237 | * | ||
238 | * Physical base of page tables is in PHYS_PGD which will not | ||
239 | * be modified. | ||
164 | * | 240 | * |
165 | * VADDR will not be clobbered, but REG1 and REG2 will. | 241 | * VADDR will not be clobbered, but REG1 and REG2 will. |
166 | */ | 242 | */ |
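The comment above walks through the huge-PMD-to-PTE translation step by step; the same decision logic, minus the sun4u/sun4v instruction patching, reads as follows in plain C. This is a readability sketch only: the struct and function names are invented, and the final VALID/cacheability/size bits are left as a comment because their encodings are patched per CPU type.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* PMD layout constants as added to pgtable_64.h earlier in this patch. */
#define PMD_PADDR_SHIFT    11
#define PMD_ISHUGE         0x00000001UL
#define PMD_HUGE_PROTBITS  0x000007ffUL
#define PMD_HUGE_PRESENT   0x00000400UL
#define PMD_HUGE_WRITE     0x00000200UL
#define PMD_HUGE_ACCESSED  0x00000080UL
#define PMD_HUGE_EXEC      0x00000040UL

struct huge_xlate {
	bool     fault;		/* would branch to FAIL_LABEL */
	bool     is_huge;	/* took the huge PMD path     */
	uint64_t paddr;		/* PFN << PAGE_SHIFT          */
	bool     exec, write;	/* protection bits carried over */
};

static struct huge_xlate check_pmd_huge(uint32_t pmd)
{
	struct huge_xlate x = { 0 };

	if (pmd == 0) {				/* brz REG1, FAIL_LABEL */
		x.fault = true;
		return x;
	}
	if (!(pmd & PMD_ISHUGE))		/* normal PMD: keep walking */
		return x;
	x.is_huge = true;
	if ((pmd & (PMD_HUGE_PRESENT | PMD_HUGE_ACCESSED)) !=
	    (PMD_HUGE_PRESENT | PMD_HUGE_ACCESSED)) {
		x.fault = true;			/* not present or not young */
		return x;
	}
	/* Strip the low protection/software bits, then shift up to form the
	 * physical address of the 4MB page (PFN << PAGE_SHIFT). */
	x.paddr = (uint64_t)(pmd & ~(uint32_t)PMD_HUGE_PROTBITS) << PMD_PADDR_SHIFT;
	x.exec  = pmd & PMD_HUGE_EXEC;
	x.write = pmd & PMD_HUGE_WRITE;
	/* The assembly finishes by ORing in the VALID, cache and 4MB size
	 * bits, whose values differ between sun4u and sun4v. */
	return x;
}

int main(void)
{
	uint32_t pmd = PMD_ISHUGE | PMD_HUGE_PRESENT | PMD_HUGE_ACCESSED |
		       PMD_HUGE_WRITE | 0x00010000U;	/* some PFN bits */
	struct huge_xlate x = check_pmd_huge(pmd);

	printf("fault=%d huge=%d paddr=%#llx write=%d exec=%d\n",
	       x.fault, x.is_huge, (unsigned long long)x.paddr,
	       x.write, x.exec);
	return 0;
}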
@@ -172,15 +248,19 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; | |||
172 | brz,pn REG1, FAIL_LABEL; \ | 248 | brz,pn REG1, FAIL_LABEL; \ |
173 | sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ | 249 | sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \ |
174 | srlx REG2, 64 - PAGE_SHIFT, REG2; \ | 250 | srlx REG2, 64 - PAGE_SHIFT, REG2; \ |
175 | sllx REG1, 11, REG1; \ | 251 | sllx REG1, PGD_PADDR_SHIFT, REG1; \ |
176 | andn REG2, 0x3, REG2; \ | 252 | andn REG2, 0x3, REG2; \ |
177 | lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ | 253 | lduwa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \ |
178 | brz,pn REG1, FAIL_LABEL; \ | 254 | USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, 800f) \ |
179 | sllx VADDR, 64 - PMD_SHIFT, REG2; \ | 255 | sllx VADDR, 64 - PMD_SHIFT, REG2; \ |
180 | srlx REG2, 64 - PAGE_SHIFT, REG2; \ | 256 | srlx REG2, 64 - (PAGE_SHIFT - 1), REG2; \ |
181 | sllx REG1, 11, REG1; \ | 257 | sllx REG1, PMD_PADDR_SHIFT, REG1; \ |
182 | andn REG2, 0x7, REG2; \ | 258 | andn REG2, 0x7, REG2; \ |
183 | add REG1, REG2, REG1; | 259 | add REG1, REG2, REG1; \ |
260 | ldxa [REG1] ASI_PHYS_USE_EC, REG1; \ | ||
261 | brgez,pn REG1, FAIL_LABEL; \ | ||
262 | nop; \ | ||
263 | 800: | ||
184 | 264 | ||
185 | /* Lookup a OBP mapping on VADDR in the prom_trans[] table at TL>0. | 265 | /* Lookup a OBP mapping on VADDR in the prom_trans[] table at TL>0. |
186 | * If no entry is found, FAIL_LABEL will be branched to. On success | 266 | * If no entry is found, FAIL_LABEL will be branched to. On success |
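The rewritten walk comment says the macro now leaves a final, valid PTE in REG1; the index arithmetic it performs along the way is easier to see in C. The sketch below (not from the patch) computes the same byte offsets with ordinary shifts and masks; the assembly gets the entry-size scaling (4-byte pgd/pmd entries, 8-byte ptes) for free out of its paired sllx/srlx and the 0x3/0x7 masks.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   13
#define PMD_SHIFT    (PAGE_SHIFT + (PAGE_SHIFT - 4))	/* 22 */
#define PMD_BITS     (PAGE_SHIFT - 2)			/* 11 */
#define PGDIR_SHIFT  (PMD_SHIFT + PMD_BITS)		/* 33 */
#define PGDIR_BITS   (PAGE_SHIFT - 2)			/* 11 */

#define PTRS_PER_PTE (1UL << (PAGE_SHIFT - 4))		/* 512 */
#define PTRS_PER_PMD (1UL << PMD_BITS)			/* 2048 */
#define PTRS_PER_PGD (1UL << PGDIR_BITS)		/* 2048 */

int main(void)
{
	uint64_t va = 0x123456000ULL;	/* arbitrary user address */

	/* Byte offset into each table level: index times entry size. */
	uint64_t pgd_off = ((va >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) * 4;
	uint64_t pmd_off = ((va >> PMD_SHIFT)   & (PTRS_PER_PMD - 1)) * 4;
	uint64_t pte_off = ((va >> PAGE_SHIFT)  & (PTRS_PER_PTE - 1)) * 8;

	printf("pgd_off=%#llx pmd_off=%#llx pte_off=%#llx\n",
	       (unsigned long long)pgd_off,
	       (unsigned long long)pmd_off,
	       (unsigned long long)pte_off);
	return 0;
}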
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index acc8c838ff72..75b31bcdeadf 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c | |||
@@ -779,7 +779,7 @@ static int __pci_mmap_make_offset(struct pci_dev *pdev, | |||
779 | static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma, | 779 | static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma, |
780 | enum pci_mmap_state mmap_state) | 780 | enum pci_mmap_state mmap_state) |
781 | { | 781 | { |
782 | vma->vm_flags |= (VM_IO | VM_RESERVED); | 782 | vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; |
783 | } | 783 | } |
784 | 784 | ||
785 | /* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci | 785 | /* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci |
diff --git a/arch/sparc/kernel/sun4v_tlb_miss.S b/arch/sparc/kernel/sun4v_tlb_miss.S index e1fbf8c75787..bde867fd71e8 100644 --- a/arch/sparc/kernel/sun4v_tlb_miss.S +++ b/arch/sparc/kernel/sun4v_tlb_miss.S | |||
@@ -176,7 +176,7 @@ sun4v_tsb_miss_common: | |||
176 | 176 | ||
177 | sub %g2, TRAP_PER_CPU_FAULT_INFO, %g2 | 177 | sub %g2, TRAP_PER_CPU_FAULT_INFO, %g2 |
178 | 178 | ||
179 | #ifdef CONFIG_HUGETLB_PAGE | 179 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
180 | mov SCRATCHPAD_UTSBREG2, %g5 | 180 | mov SCRATCHPAD_UTSBREG2, %g5 |
181 | ldxa [%g5] ASI_SCRATCHPAD, %g5 | 181 | ldxa [%g5] ASI_SCRATCHPAD, %g5 |
182 | cmp %g5, -1 | 182 | cmp %g5, -1 |
diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index db15d123f054..d4bdc7a62375 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S | |||
@@ -49,7 +49,7 @@ tsb_miss_page_table_walk: | |||
49 | /* Before committing to a full page table walk, | 49 | /* Before committing to a full page table walk, |
50 | * check the huge page TSB. | 50 | * check the huge page TSB. |
51 | */ | 51 | */ |
52 | #ifdef CONFIG_HUGETLB_PAGE | 52 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
53 | 53 | ||
54 | 661: ldx [%g7 + TRAP_PER_CPU_TSB_HUGE], %g5 | 54 | 661: ldx [%g7 + TRAP_PER_CPU_TSB_HUGE], %g5 |
55 | nop | 55 | nop |
@@ -110,12 +110,9 @@ tsb_miss_page_table_walk: | |||
110 | tsb_miss_page_table_walk_sun4v_fastpath: | 110 | tsb_miss_page_table_walk_sun4v_fastpath: |
111 | USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault) | 111 | USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault) |
112 | 112 | ||
113 | /* Load and check PTE. */ | 113 | /* Valid PTE is now in %g5. */ |
114 | ldxa [%g5] ASI_PHYS_USE_EC, %g5 | ||
115 | brgez,pn %g5, tsb_do_fault | ||
116 | nop | ||
117 | 114 | ||
118 | #ifdef CONFIG_HUGETLB_PAGE | 115 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
119 | 661: sethi %uhi(_PAGE_SZALL_4U), %g7 | 116 | 661: sethi %uhi(_PAGE_SZALL_4U), %g7 |
120 | sllx %g7, 32, %g7 | 117 | sllx %g7, 32, %g7 |
121 | .section .sun4v_2insn_patch, "ax" | 118 | .section .sun4v_2insn_patch, "ax" |
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 77ac917be152..e98bfda205a2 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c | |||
@@ -265,6 +265,7 @@ good_area: | |||
265 | } | 265 | } |
266 | if (fault & VM_FAULT_RETRY) { | 266 | if (fault & VM_FAULT_RETRY) { |
267 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 267 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
268 | flags |= FAULT_FLAG_TRIED; | ||
268 | 269 | ||
269 | /* No need to up_read(&mm->mmap_sem) as we would | 270 | /* No need to up_read(&mm->mmap_sem) as we would |
270 | * have already released it in __lock_page_or_retry | 271 | * have already released it in __lock_page_or_retry |
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 1fe0429b6314..2976dba1ebaf 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c | |||
@@ -452,6 +452,7 @@ good_area: | |||
452 | } | 452 | } |
453 | if (fault & VM_FAULT_RETRY) { | 453 | if (fault & VM_FAULT_RETRY) { |
454 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 454 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
455 | flags |= FAULT_FLAG_TRIED; | ||
455 | 456 | ||
456 | /* No need to up_read(&mm->mmap_sem) as we would | 457 | /* No need to up_read(&mm->mmap_sem) as we would |
457 | * have already released it in __lock_page_or_retry | 458 | * have already released it in __lock_page_or_retry |
@@ -464,13 +465,13 @@ good_area: | |||
464 | up_read(&mm->mmap_sem); | 465 | up_read(&mm->mmap_sem); |
465 | 466 | ||
466 | mm_rss = get_mm_rss(mm); | 467 | mm_rss = get_mm_rss(mm); |
467 | #ifdef CONFIG_HUGETLB_PAGE | 468 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
468 | mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE)); | 469 | mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE)); |
469 | #endif | 470 | #endif |
470 | if (unlikely(mm_rss > | 471 | if (unlikely(mm_rss > |
471 | mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit)) | 472 | mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit)) |
472 | tsb_grow(mm, MM_TSB_BASE, mm_rss); | 473 | tsb_grow(mm, MM_TSB_BASE, mm_rss); |
473 | #ifdef CONFIG_HUGETLB_PAGE | 474 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
474 | mm_rss = mm->context.huge_pte_count; | 475 | mm_rss = mm->context.huge_pte_count; |
475 | if (unlikely(mm_rss > | 476 | if (unlikely(mm_rss > |
476 | mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit)) | 477 | mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit)) |
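Since the series fixes the sizes at an 8KB base page and a 4MB huge page, the (HPAGE_SIZE / PAGE_SIZE) correction above is a constant; a trivial stand-alone check:

#include <stdio.h>

int main(void)
{
	unsigned long page_size  = 1UL << 13;	/* PAGE_SHIFT, 8KB  */
	unsigned long hpage_size = 1UL << 22;	/* HPAGE_SHIFT, 4MB */

	/* Base pages of RSS charged per huge mapping, subtracted above
	 * before sizing the base page TSB. */
	printf("%lu\n", hpage_size / page_size);	/* prints 512 */
	return 0;
}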
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 07e14535375c..f76f83d5ac63 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c | |||
@@ -303,53 +303,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
303 | { | 303 | { |
304 | return NULL; | 304 | return NULL; |
305 | } | 305 | } |
306 | |||
307 | static void context_reload(void *__data) | ||
308 | { | ||
309 | struct mm_struct *mm = __data; | ||
310 | |||
311 | if (mm == current->mm) | ||
312 | load_secondary_context(mm); | ||
313 | } | ||
314 | |||
315 | void hugetlb_prefault_arch_hook(struct mm_struct *mm) | ||
316 | { | ||
317 | struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE]; | ||
318 | |||
319 | if (likely(tp->tsb != NULL)) | ||
320 | return; | ||
321 | |||
322 | tsb_grow(mm, MM_TSB_HUGE, 0); | ||
323 | tsb_context_switch(mm); | ||
324 | smp_tsb_sync(mm); | ||
325 | |||
326 | /* On UltraSPARC-III+ and later, configure the second half of | ||
327 | * the Data-TLB for huge pages. | ||
328 | */ | ||
329 | if (tlb_type == cheetah_plus) { | ||
330 | unsigned long ctx; | ||
331 | |||
332 | spin_lock(&ctx_alloc_lock); | ||
333 | ctx = mm->context.sparc64_ctx_val; | ||
334 | ctx &= ~CTX_PGSZ_MASK; | ||
335 | ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; | ||
336 | ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; | ||
337 | |||
338 | if (ctx != mm->context.sparc64_ctx_val) { | ||
339 | /* When changing the page size fields, we | ||
340 | * must perform a context flush so that no | ||
341 | * stale entries match. This flush must | ||
342 | * occur with the original context register | ||
343 | * settings. | ||
344 | */ | ||
345 | do_flush_tlb_mm(mm); | ||
346 | |||
347 | /* Reload the context register of all processors | ||
348 | * also executing in this address space. | ||
349 | */ | ||
350 | mm->context.sparc64_ctx_val = ctx; | ||
351 | on_each_cpu(context_reload, mm, 0); | ||
352 | } | ||
353 | spin_unlock(&ctx_alloc_lock); | ||
354 | } | ||
355 | } | ||
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 7a9b788c6ced..9e28a118e6a4 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c | |||
@@ -276,7 +276,6 @@ static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long | |||
276 | } | 276 | } |
277 | 277 | ||
278 | unsigned long _PAGE_ALL_SZ_BITS __read_mostly; | 278 | unsigned long _PAGE_ALL_SZ_BITS __read_mostly; |
279 | unsigned long _PAGE_SZBITS __read_mostly; | ||
280 | 279 | ||
281 | static void flush_dcache(unsigned long pfn) | 280 | static void flush_dcache(unsigned long pfn) |
282 | { | 281 | { |
@@ -307,12 +306,24 @@ static void flush_dcache(unsigned long pfn) | |||
307 | } | 306 | } |
308 | } | 307 | } |
309 | 308 | ||
309 | /* mm->context.lock must be held */ | ||
310 | static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index, | ||
311 | unsigned long tsb_hash_shift, unsigned long address, | ||
312 | unsigned long tte) | ||
313 | { | ||
314 | struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb; | ||
315 | unsigned long tag; | ||
316 | |||
317 | tsb += ((address >> tsb_hash_shift) & | ||
318 | (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); | ||
319 | tag = (address >> 22UL); | ||
320 | tsb_insert(tsb, tag, tte); | ||
321 | } | ||
322 | |||
310 | void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) | 323 | void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) |
311 | { | 324 | { |
325 | unsigned long tsb_index, tsb_hash_shift, flags; | ||
312 | struct mm_struct *mm; | 326 | struct mm_struct *mm; |
313 | struct tsb *tsb; | ||
314 | unsigned long tag, flags; | ||
315 | unsigned long tsb_index, tsb_hash_shift; | ||
316 | pte_t pte = *ptep; | 327 | pte_t pte = *ptep; |
317 | 328 | ||
318 | if (tlb_type != hypervisor) { | 329 | if (tlb_type != hypervisor) { |
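__update_mmu_tsb_insert() factors out the slot-and-tag computation so that the huge page path added later in this file (update_mmu_cache_pmd()) can reuse it with HPAGE_SHIFT as the hash shift. A user-space rendition of just that computation (the names are invented, and tsb_insert() is reduced to a plain store for the sketch):

#include <stdint.h>
#include <stdio.h>

struct tsb_entry { uint64_t tag, tte; };	/* stand-in for struct tsb */

static void tsb_slot_insert(struct tsb_entry *tsb, unsigned long nentries,
			    unsigned long hash_shift,
			    uint64_t address, uint64_t tte)
{
	/* nentries is assumed to be a power of two, as the mask implies,
	 * so the AND is a cheap modulo picking the slot. */
	struct tsb_entry *ent = &tsb[(address >> hash_shift) & (nentries - 1)];

	ent->tag = address >> 22;	/* same fixed tag shift as the patch */
	ent->tte = tte;
}

int main(void)
{
	static struct tsb_entry tsb[512];
	uint64_t addr = 0x123456000ULL;

	/* The base page TSB hashes at PAGE_SHIFT (13); the huge page TSB is
	 * fed HPAGE_SHIFT (22) by update_mmu_cache_pmd() further down. */
	tsb_slot_insert(tsb, 512, 13, addr, 0x80000000deadbeefULL);
	printf("slot %llu, tag %#llx\n",
	       (unsigned long long)((addr >> 13) & 511),
	       (unsigned long long)(addr >> 22));
	return 0;
}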
@@ -329,7 +340,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * | |||
329 | 340 | ||
330 | spin_lock_irqsave(&mm->context.lock, flags); | 341 | spin_lock_irqsave(&mm->context.lock, flags); |
331 | 342 | ||
332 | #ifdef CONFIG_HUGETLB_PAGE | 343 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
333 | if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) { | 344 | if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) { |
334 | if ((tlb_type == hypervisor && | 345 | if ((tlb_type == hypervisor && |
335 | (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) || | 346 | (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) || |
@@ -341,11 +352,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * | |||
341 | } | 352 | } |
342 | #endif | 353 | #endif |
343 | 354 | ||
344 | tsb = mm->context.tsb_block[tsb_index].tsb; | 355 | __update_mmu_tsb_insert(mm, tsb_index, tsb_hash_shift, |
345 | tsb += ((address >> tsb_hash_shift) & | 356 | address, pte_val(pte)); |
346 | (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); | ||
347 | tag = (address >> 22UL); | ||
348 | tsb_insert(tsb, tag, pte_val(pte)); | ||
349 | 357 | ||
350 | spin_unlock_irqrestore(&mm->context.lock, flags); | 358 | spin_unlock_irqrestore(&mm->context.lock, flags); |
351 | } | 359 | } |
@@ -2275,8 +2283,7 @@ static void __init sun4u_pgprot_init(void) | |||
2275 | __ACCESS_BITS_4U | _PAGE_E_4U); | 2283 | __ACCESS_BITS_4U | _PAGE_E_4U); |
2276 | 2284 | ||
2277 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2285 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2278 | kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4U) ^ | 2286 | kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; |
2279 | 0xfffff80000000000UL; | ||
2280 | #else | 2287 | #else |
2281 | kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ | 2288 | kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ |
2282 | 0xfffff80000000000UL; | 2289 | 0xfffff80000000000UL; |
@@ -2287,7 +2294,6 @@ static void __init sun4u_pgprot_init(void) | |||
2287 | for (i = 1; i < 4; i++) | 2294 | for (i = 1; i < 4; i++) |
2288 | kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; | 2295 | kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; |
2289 | 2296 | ||
2290 | _PAGE_SZBITS = _PAGE_SZBITS_4U; | ||
2291 | _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U | | 2297 | _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U | |
2292 | _PAGE_SZ64K_4U | _PAGE_SZ8K_4U | | 2298 | _PAGE_SZ64K_4U | _PAGE_SZ8K_4U | |
2293 | _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U); | 2299 | _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U); |
@@ -2324,8 +2330,7 @@ static void __init sun4v_pgprot_init(void) | |||
2324 | _PAGE_CACHE = _PAGE_CACHE_4V; | 2330 | _PAGE_CACHE = _PAGE_CACHE_4V; |
2325 | 2331 | ||
2326 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2332 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2327 | kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^ | 2333 | kern_linear_pte_xor[0] = _PAGE_VALID ^ 0xfffff80000000000UL; |
2328 | 0xfffff80000000000UL; | ||
2329 | #else | 2334 | #else |
2330 | kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ | 2335 | kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ |
2331 | 0xfffff80000000000UL; | 2336 | 0xfffff80000000000UL; |
@@ -2339,7 +2344,6 @@ static void __init sun4v_pgprot_init(void) | |||
2339 | pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V | | 2344 | pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V | |
2340 | __ACCESS_BITS_4V | _PAGE_E_4V); | 2345 | __ACCESS_BITS_4V | _PAGE_E_4V); |
2341 | 2346 | ||
2342 | _PAGE_SZBITS = _PAGE_SZBITS_4V; | ||
2343 | _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V | | 2347 | _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V | |
2344 | _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V | | 2348 | _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V | |
2345 | _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | | 2349 | _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | |
@@ -2472,3 +2476,281 @@ void __flush_tlb_all(void) | |||
2472 | __asm__ __volatile__("wrpr %0, 0, %%pstate" | 2476 | __asm__ __volatile__("wrpr %0, 0, %%pstate" |
2473 | : : "r" (pstate)); | 2477 | : : "r" (pstate)); |
2474 | } | 2478 | } |
2479 | |||
2480 | static pte_t *get_from_cache(struct mm_struct *mm) | ||
2481 | { | ||
2482 | struct page *page; | ||
2483 | pte_t *ret; | ||
2484 | |||
2485 | spin_lock(&mm->page_table_lock); | ||
2486 | page = mm->context.pgtable_page; | ||
2487 | ret = NULL; | ||
2488 | if (page) { | ||
2489 | void *p = page_address(page); | ||
2490 | |||
2491 | mm->context.pgtable_page = NULL; | ||
2492 | |||
2493 | ret = (pte_t *) (p + (PAGE_SIZE / 2)); | ||
2494 | } | ||
2495 | spin_unlock(&mm->page_table_lock); | ||
2496 | |||
2497 | return ret; | ||
2498 | } | ||
2499 | |||
2500 | static struct page *__alloc_for_cache(struct mm_struct *mm) | ||
2501 | { | ||
2502 | struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | | ||
2503 | __GFP_REPEAT | __GFP_ZERO); | ||
2504 | |||
2505 | if (page) { | ||
2506 | spin_lock(&mm->page_table_lock); | ||
2507 | if (!mm->context.pgtable_page) { | ||
2508 | atomic_set(&page->_count, 2); | ||
2509 | mm->context.pgtable_page = page; | ||
2510 | } | ||
2511 | spin_unlock(&mm->page_table_lock); | ||
2512 | } | ||
2513 | return page; | ||
2514 | } | ||
2515 | |||
2516 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, | ||
2517 | unsigned long address) | ||
2518 | { | ||
2519 | struct page *page; | ||
2520 | pte_t *pte; | ||
2521 | |||
2522 | pte = get_from_cache(mm); | ||
2523 | if (pte) | ||
2524 | return pte; | ||
2525 | |||
2526 | page = __alloc_for_cache(mm); | ||
2527 | if (page) | ||
2528 | pte = (pte_t *) page_address(page); | ||
2529 | |||
2530 | return pte; | ||
2531 | } | ||
2532 | |||
2533 | pgtable_t pte_alloc_one(struct mm_struct *mm, | ||
2534 | unsigned long address) | ||
2535 | { | ||
2536 | struct page *page; | ||
2537 | pte_t *pte; | ||
2538 | |||
2539 | pte = get_from_cache(mm); | ||
2540 | if (pte) | ||
2541 | return pte; | ||
2542 | |||
2543 | page = __alloc_for_cache(mm); | ||
2544 | if (page) { | ||
2545 | pgtable_page_ctor(page); | ||
2546 | pte = (pte_t *) page_address(page); | ||
2547 | } | ||
2548 | |||
2549 | return pte; | ||
2550 | } | ||
2551 | |||
2552 | void pte_free_kernel(struct mm_struct *mm, pte_t *pte) | ||
2553 | { | ||
2554 | struct page *page = virt_to_page(pte); | ||
2555 | if (put_page_testzero(page)) | ||
2556 | free_hot_cold_page(page, 0); | ||
2557 | } | ||
2558 | |||
2559 | static void __pte_free(pgtable_t pte) | ||
2560 | { | ||
2561 | struct page *page = virt_to_page(pte); | ||
2562 | if (put_page_testzero(page)) { | ||
2563 | pgtable_page_dtor(page); | ||
2564 | free_hot_cold_page(page, 0); | ||
2565 | } | ||
2566 | } | ||
2567 | |||
2568 | void pte_free(struct mm_struct *mm, pgtable_t pte) | ||
2569 | { | ||
2570 | __pte_free(pte); | ||
2571 | } | ||
2572 | |||
2573 | void pgtable_free(void *table, bool is_page) | ||
2574 | { | ||
2575 | if (is_page) | ||
2576 | __pte_free(table); | ||
2577 | else | ||
2578 | kmem_cache_free(pgtable_cache, table); | ||
2579 | } | ||
2580 | |||
2581 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
2582 | static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot, bool for_modify) | ||
2583 | { | ||
2584 | if (pgprot_val(pgprot) & _PAGE_VALID) | ||
2585 | pmd_val(pmd) |= PMD_HUGE_PRESENT; | ||
2586 | if (tlb_type == hypervisor) { | ||
2587 | if (pgprot_val(pgprot) & _PAGE_WRITE_4V) | ||
2588 | pmd_val(pmd) |= PMD_HUGE_WRITE; | ||
2589 | if (pgprot_val(pgprot) & _PAGE_EXEC_4V) | ||
2590 | pmd_val(pmd) |= PMD_HUGE_EXEC; | ||
2591 | |||
2592 | if (!for_modify) { | ||
2593 | if (pgprot_val(pgprot) & _PAGE_ACCESSED_4V) | ||
2594 | pmd_val(pmd) |= PMD_HUGE_ACCESSED; | ||
2595 | if (pgprot_val(pgprot) & _PAGE_MODIFIED_4V) | ||
2596 | pmd_val(pmd) |= PMD_HUGE_DIRTY; | ||
2597 | } | ||
2598 | } else { | ||
2599 | if (pgprot_val(pgprot) & _PAGE_WRITE_4U) | ||
2600 | pmd_val(pmd) |= PMD_HUGE_WRITE; | ||
2601 | if (pgprot_val(pgprot) & _PAGE_EXEC_4U) | ||
2602 | pmd_val(pmd) |= PMD_HUGE_EXEC; | ||
2603 | |||
2604 | if (!for_modify) { | ||
2605 | if (pgprot_val(pgprot) & _PAGE_ACCESSED_4U) | ||
2606 | pmd_val(pmd) |= PMD_HUGE_ACCESSED; | ||
2607 | if (pgprot_val(pgprot) & _PAGE_MODIFIED_4U) | ||
2608 | pmd_val(pmd) |= PMD_HUGE_DIRTY; | ||
2609 | } | ||
2610 | } | ||
2611 | |||
2612 | return pmd; | ||
2613 | } | ||
2614 | |||
2615 | pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) | ||
2616 | { | ||
2617 | pmd_t pmd; | ||
2618 | |||
2619 | pmd_val(pmd) = (page_nr << ((PAGE_SHIFT - PMD_PADDR_SHIFT))); | ||
2620 | pmd_val(pmd) |= PMD_ISHUGE; | ||
2621 | pmd = pmd_set_protbits(pmd, pgprot, false); | ||
2622 | return pmd; | ||
2623 | } | ||
2624 | |||
2625 | pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) | ||
2626 | { | ||
2627 | pmd_val(pmd) &= ~(PMD_HUGE_PRESENT | | ||
2628 | PMD_HUGE_WRITE | | ||
2629 | PMD_HUGE_EXEC); | ||
2630 | pmd = pmd_set_protbits(pmd, newprot, true); | ||
2631 | return pmd; | ||
2632 | } | ||
2633 | |||
2634 | pgprot_t pmd_pgprot(pmd_t entry) | ||
2635 | { | ||
2636 | unsigned long pte = 0; | ||
2637 | |||
2638 | if (pmd_val(entry) & PMD_HUGE_PRESENT) | ||
2639 | pte |= _PAGE_VALID; | ||
2640 | |||
2641 | if (tlb_type == hypervisor) { | ||
2642 | if (pmd_val(entry) & PMD_HUGE_PRESENT) | ||
2643 | pte |= _PAGE_PRESENT_4V; | ||
2644 | if (pmd_val(entry) & PMD_HUGE_EXEC) | ||
2645 | pte |= _PAGE_EXEC_4V; | ||
2646 | if (pmd_val(entry) & PMD_HUGE_WRITE) | ||
2647 | pte |= _PAGE_W_4V; | ||
2648 | if (pmd_val(entry) & PMD_HUGE_ACCESSED) | ||
2649 | pte |= _PAGE_ACCESSED_4V; | ||
2650 | if (pmd_val(entry) & PMD_HUGE_DIRTY) | ||
2651 | pte |= _PAGE_MODIFIED_4V; | ||
2652 | pte |= _PAGE_CP_4V|_PAGE_CV_4V; | ||
2653 | } else { | ||
2654 | if (pmd_val(entry) & PMD_HUGE_PRESENT) | ||
2655 | pte |= _PAGE_PRESENT_4U; | ||
2656 | if (pmd_val(entry) & PMD_HUGE_EXEC) | ||
2657 | pte |= _PAGE_EXEC_4U; | ||
2658 | if (pmd_val(entry) & PMD_HUGE_WRITE) | ||
2659 | pte |= _PAGE_W_4U; | ||
2660 | if (pmd_val(entry) & PMD_HUGE_ACCESSED) | ||
2661 | pte |= _PAGE_ACCESSED_4U; | ||
2662 | if (pmd_val(entry) & PMD_HUGE_DIRTY) | ||
2663 | pte |= _PAGE_MODIFIED_4U; | ||
2664 | pte |= _PAGE_CP_4U|_PAGE_CV_4U; | ||
2665 | } | ||
2666 | |||
2667 | return __pgprot(pte); | ||
2668 | } | ||
2669 | |||
2670 | void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, | ||
2671 | pmd_t *pmd) | ||
2672 | { | ||
2673 | unsigned long pte, flags; | ||
2674 | struct mm_struct *mm; | ||
2675 | pmd_t entry = *pmd; | ||
2676 | pgprot_t prot; | ||
2677 | |||
2678 | if (!pmd_large(entry) || !pmd_young(entry)) | ||
2679 | return; | ||
2680 | |||
2681 | pte = (pmd_val(entry) & ~PMD_HUGE_PROTBITS); | ||
2682 | pte <<= PMD_PADDR_SHIFT; | ||
2683 | pte |= _PAGE_VALID; | ||
2684 | |||
2685 | prot = pmd_pgprot(entry); | ||
2686 | |||
2687 | if (tlb_type == hypervisor) | ||
2688 | pgprot_val(prot) |= _PAGE_SZHUGE_4V; | ||
2689 | else | ||
2690 | pgprot_val(prot) |= _PAGE_SZHUGE_4U; | ||
2691 | |||
2692 | pte |= pgprot_val(prot); | ||
2693 | |||
2694 | mm = vma->vm_mm; | ||
2695 | |||
2696 | spin_lock_irqsave(&mm->context.lock, flags); | ||
2697 | |||
2698 | if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) | ||
2699 | __update_mmu_tsb_insert(mm, MM_TSB_HUGE, HPAGE_SHIFT, | ||
2700 | addr, pte); | ||
2701 | |||
2702 | spin_unlock_irqrestore(&mm->context.lock, flags); | ||
2703 | } | ||
2704 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
2705 | |||
2706 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
2707 | static void context_reload(void *__data) | ||
2708 | { | ||
2709 | struct mm_struct *mm = __data; | ||
2710 | |||
2711 | if (mm == current->mm) | ||
2712 | load_secondary_context(mm); | ||
2713 | } | ||
2714 | |||
2715 | void hugetlb_setup(struct mm_struct *mm) | ||
2716 | { | ||
2717 | struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE]; | ||
2718 | |||
2719 | if (likely(tp->tsb != NULL)) | ||
2720 | return; | ||
2721 | |||
2722 | tsb_grow(mm, MM_TSB_HUGE, 0); | ||
2723 | tsb_context_switch(mm); | ||
2724 | smp_tsb_sync(mm); | ||
2725 | |||
2726 | /* On UltraSPARC-III+ and later, configure the second half of | ||
2727 | * the Data-TLB for huge pages. | ||
2728 | */ | ||
2729 | if (tlb_type == cheetah_plus) { | ||
2730 | unsigned long ctx; | ||
2731 | |||
2732 | spin_lock(&ctx_alloc_lock); | ||
2733 | ctx = mm->context.sparc64_ctx_val; | ||
2734 | ctx &= ~CTX_PGSZ_MASK; | ||
2735 | ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; | ||
2736 | ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; | ||
2737 | |||
2738 | if (ctx != mm->context.sparc64_ctx_val) { | ||
2739 | /* When changing the page size fields, we | ||
2740 | * must perform a context flush so that no | ||
2741 | * stale entries match. This flush must | ||
2742 | * occur with the original context register | ||
2743 | * settings. | ||
2744 | */ | ||
2745 | do_flush_tlb_mm(mm); | ||
2746 | |||
2747 | /* Reload the context register of all processors | ||
2748 | * also executing in this address space. | ||
2749 | */ | ||
2750 | mm->context.sparc64_ctx_val = ctx; | ||
2751 | on_each_cpu(context_reload, mm, 0); | ||
2752 | } | ||
2753 | spin_unlock(&ctx_alloc_lock); | ||
2754 | } | ||
2755 | } | ||
2756 | #endif | ||
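The init_64.c hunk above folds generic protection bits into the compact PMD_HUGE_* encoding and expands them back (pmd_set_protbits() / pmd_pgprot()). Below is a minimal user-space sketch of that round trip; the PROT_*/HUGE_* constants and the helper names are invented for the demo and are not the real sparc64 definitions.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical "pte-style" protection bits (not the real sparc64 values). */
#define PROT_VALID    (1u << 0)
#define PROT_WRITE    (1u << 1)
#define PROT_EXEC     (1u << 2)
#define PROT_ACCESSED (1u << 3)
#define PROT_DIRTY    (1u << 4)

/* Hypothetical compact huge-PMD bits. */
#define HUGE_PRESENT  (1u << 0)
#define HUGE_WRITE    (1u << 1)
#define HUGE_EXEC     (1u << 2)
#define HUGE_ACCESSED (1u << 3)
#define HUGE_DIRTY    (1u << 4)

/* Fold protection bits into the compact huge-PMD encoding. */
static uint32_t demo_set_protbits(uint32_t pmd, uint32_t prot)
{
	if (prot & PROT_VALID)
		pmd |= HUGE_PRESENT;
	if (prot & PROT_WRITE)
		pmd |= HUGE_WRITE;
	if (prot & PROT_EXEC)
		pmd |= HUGE_EXEC;
	if (prot & PROT_ACCESSED)
		pmd |= HUGE_ACCESSED;
	if (prot & PROT_DIRTY)
		pmd |= HUGE_DIRTY;
	return pmd;
}

/* Expand the compact encoding back into protection bits. */
static uint32_t demo_pgprot(uint32_t pmd)
{
	uint32_t prot = 0;

	if (pmd & HUGE_PRESENT)
		prot |= PROT_VALID;
	if (pmd & HUGE_WRITE)
		prot |= PROT_WRITE;
	if (pmd & HUGE_EXEC)
		prot |= PROT_EXEC;
	if (pmd & HUGE_ACCESSED)
		prot |= PROT_ACCESSED;
	if (pmd & HUGE_DIRTY)
		prot |= PROT_DIRTY;
	return prot;
}

int main(void)
{
	uint32_t prot = PROT_VALID | PROT_WRITE | PROT_ACCESSED;
	uint32_t pmd = demo_set_protbits(0, prot);

	assert(demo_pgprot(pmd) == prot);	/* the round trip is lossless */
	printf("pmd=%#x prot=%#x\n", (unsigned)pmd, (unsigned)demo_pgprot(pmd));
	return 0;
}
```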
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index b1f279cd00bf..3e8fec391fe0 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c | |||
@@ -43,16 +43,37 @@ void flush_tlb_pending(void) | |||
43 | put_cpu_var(tlb_batch); | 43 | put_cpu_var(tlb_batch); |
44 | } | 44 | } |
45 | 45 | ||
46 | void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, | 46 | static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr, |
47 | pte_t *ptep, pte_t orig, int fullmm) | 47 | bool exec) |
48 | { | 48 | { |
49 | struct tlb_batch *tb = &get_cpu_var(tlb_batch); | 49 | struct tlb_batch *tb = &get_cpu_var(tlb_batch); |
50 | unsigned long nr; | 50 | unsigned long nr; |
51 | 51 | ||
52 | vaddr &= PAGE_MASK; | 52 | vaddr &= PAGE_MASK; |
53 | if (pte_exec(orig)) | 53 | if (exec) |
54 | vaddr |= 0x1UL; | 54 | vaddr |= 0x1UL; |
55 | 55 | ||
56 | nr = tb->tlb_nr; | ||
57 | |||
58 | if (unlikely(nr != 0 && mm != tb->mm)) { | ||
59 | flush_tlb_pending(); | ||
60 | nr = 0; | ||
61 | } | ||
62 | |||
63 | if (nr == 0) | ||
64 | tb->mm = mm; | ||
65 | |||
66 | tb->vaddrs[nr] = vaddr; | ||
67 | tb->tlb_nr = ++nr; | ||
68 | if (nr >= TLB_BATCH_NR) | ||
69 | flush_tlb_pending(); | ||
70 | |||
71 | put_cpu_var(tlb_batch); | ||
72 | } | ||
73 | |||
74 | void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, | ||
75 | pte_t *ptep, pte_t orig, int fullmm) | ||
76 | { | ||
56 | if (tlb_type != hypervisor && | 77 | if (tlb_type != hypervisor && |
57 | pte_dirty(orig)) { | 78 | pte_dirty(orig)) { |
58 | unsigned long paddr, pfn = pte_pfn(orig); | 79 | unsigned long paddr, pfn = pte_pfn(orig); |
@@ -77,26 +98,91 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, | |||
77 | } | 98 | } |
78 | 99 | ||
79 | no_cache_flush: | 100 | no_cache_flush: |
101 | if (!fullmm) | ||
102 | tlb_batch_add_one(mm, vaddr, pte_exec(orig)); | ||
103 | } | ||
104 | |||
105 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
106 | static void tlb_batch_pmd_scan(struct mm_struct *mm, unsigned long vaddr, | ||
107 | pmd_t pmd, bool exec) | ||
108 | { | ||
109 | unsigned long end; | ||
110 | pte_t *pte; | ||
111 | |||
112 | pte = pte_offset_map(&pmd, vaddr); | ||
113 | end = vaddr + HPAGE_SIZE; | ||
114 | while (vaddr < end) { | ||
115 | if (pte_val(*pte) & _PAGE_VALID) | ||
116 | tlb_batch_add_one(mm, vaddr, exec); | ||
117 | pte++; | ||
118 | vaddr += PAGE_SIZE; | ||
119 | } | ||
120 | pte_unmap(pte); | ||
121 | } | ||
80 | 122 | ||
81 | if (fullmm) { | 123 | void set_pmd_at(struct mm_struct *mm, unsigned long addr, |
82 | put_cpu_var(tlb_batch); | 124 | pmd_t *pmdp, pmd_t pmd) |
125 | { | ||
126 | pmd_t orig = *pmdp; | ||
127 | |||
128 | *pmdp = pmd; | ||
129 | |||
130 | if (mm == &init_mm) | ||
83 | return; | 131 | return; |
132 | |||
133 | if ((pmd_val(pmd) ^ pmd_val(orig)) & PMD_ISHUGE) { | ||
134 | if (pmd_val(pmd) & PMD_ISHUGE) | ||
135 | mm->context.huge_pte_count++; | ||
136 | else | ||
137 | mm->context.huge_pte_count--; | ||
138 | if (mm->context.huge_pte_count == 1) | ||
139 | hugetlb_setup(mm); | ||
84 | } | 140 | } |
85 | 141 | ||
86 | nr = tb->tlb_nr; | 142 | if (!pmd_none(orig)) { |
143 | bool exec = ((pmd_val(orig) & PMD_HUGE_EXEC) != 0); | ||
87 | 144 | ||
88 | if (unlikely(nr != 0 && mm != tb->mm)) { | 145 | addr &= HPAGE_MASK; |
89 | flush_tlb_pending(); | 146 | if (pmd_val(orig) & PMD_ISHUGE) |
90 | nr = 0; | 147 | tlb_batch_add_one(mm, addr, exec); |
148 | else | ||
149 | tlb_batch_pmd_scan(mm, addr, orig, exec); | ||
91 | } | 150 | } |
151 | } | ||
92 | 152 | ||
93 | if (nr == 0) | 153 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) |
94 | tb->mm = mm; | 154 | { |
155 | struct list_head *lh = (struct list_head *) pgtable; | ||
95 | 156 | ||
96 | tb->vaddrs[nr] = vaddr; | 157 | assert_spin_locked(&mm->page_table_lock); |
97 | tb->tlb_nr = ++nr; | ||
98 | if (nr >= TLB_BATCH_NR) | ||
99 | flush_tlb_pending(); | ||
100 | 158 | ||
101 | put_cpu_var(tlb_batch); | 159 | /* FIFO */ |
160 | if (!mm->pmd_huge_pte) | ||
161 | INIT_LIST_HEAD(lh); | ||
162 | else | ||
163 | list_add(lh, (struct list_head *) mm->pmd_huge_pte); | ||
164 | mm->pmd_huge_pte = pgtable; | ||
165 | } | ||
166 | |||
167 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) | ||
168 | { | ||
169 | struct list_head *lh; | ||
170 | pgtable_t pgtable; | ||
171 | |||
172 | assert_spin_locked(&mm->page_table_lock); | ||
173 | |||
174 | /* FIFO */ | ||
175 | pgtable = mm->pmd_huge_pte; | ||
176 | lh = (struct list_head *) pgtable; | ||
177 | if (list_empty(lh)) | ||
178 | mm->pmd_huge_pte = NULL; | ||
179 | else { | ||
180 | mm->pmd_huge_pte = (pgtable_t) lh->next; | ||
181 | list_del(lh); | ||
182 | } | ||
183 | pte_val(pgtable[0]) = 0; | ||
184 | pte_val(pgtable[1]) = 0; | ||
185 | |||
186 | return pgtable; | ||
102 | } | 187 | } |
188 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
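The new tlb_batch_add_one() above queues virtual addresses per mm and flushes the pending batch either when a different mm shows up or when the batch fills. The standalone sketch below models only that batching rule; TLB_BATCH_NR, the mm identifiers, and the printf stand-in for the real flush are assumptions for illustration.

```c
#include <stdio.h>

#define TLB_BATCH_NR 8UL		/* assumed batch size for the demo */

struct tlb_batch {
	unsigned long mm;		/* id of the owner of the queued addresses */
	unsigned long nr;
	unsigned long vaddrs[TLB_BATCH_NR];
};

static struct tlb_batch batch;		/* stands in for the per-cpu variable */

static void flush_tlb_pending(void)
{
	if (!batch.nr)
		return;
	printf("flush %lu entries for mm %lu\n", batch.nr, batch.mm);
	batch.nr = 0;
}

/* Mirrors the shape of tlb_batch_add_one(): switch owners by flushing,
 * queue the address, and flush eagerly once the batch is full. */
static void batch_add_one(unsigned long mm, unsigned long vaddr)
{
	if (batch.nr != 0 && batch.mm != mm)
		flush_tlb_pending();
	if (batch.nr == 0)
		batch.mm = mm;
	batch.vaddrs[batch.nr++] = vaddr;
	if (batch.nr >= TLB_BATCH_NR)
		flush_tlb_pending();
}

int main(void)
{
	for (unsigned long v = 0; v < 10; v++)
		batch_add_one(1, v << 13);	/* fills once, flushes at 8 */
	batch_add_one(2, 0x2000);		/* owner change forces a flush */
	flush_tlb_pending();			/* drain the last entry */
	return 0;
}
```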
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index c52add79b83d..7f6474347491 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c | |||
@@ -78,7 +78,7 @@ void flush_tsb_user(struct tlb_batch *tb) | |||
78 | base = __pa(base); | 78 | base = __pa(base); |
79 | __flush_tsb_one(tb, PAGE_SHIFT, base, nentries); | 79 | __flush_tsb_one(tb, PAGE_SHIFT, base, nentries); |
80 | 80 | ||
81 | #ifdef CONFIG_HUGETLB_PAGE | 81 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
82 | if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { | 82 | if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { |
83 | base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; | 83 | base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; |
84 | nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; | 84 | nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; |
@@ -90,29 +90,12 @@ void flush_tsb_user(struct tlb_batch *tb) | |||
90 | spin_unlock_irqrestore(&mm->context.lock, flags); | 90 | spin_unlock_irqrestore(&mm->context.lock, flags); |
91 | } | 91 | } |
92 | 92 | ||
93 | #if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) | ||
94 | #define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K | 93 | #define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K |
95 | #define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K | 94 | #define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K |
96 | #elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) | ||
97 | #define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_64K | ||
98 | #define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_64K | ||
99 | #else | ||
100 | #error Broken base page size setting... | ||
101 | #endif | ||
102 | 95 | ||
103 | #ifdef CONFIG_HUGETLB_PAGE | 96 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
104 | #if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) | ||
105 | #define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_64K | ||
106 | #define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_64K | ||
107 | #elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) | ||
108 | #define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_512K | ||
109 | #define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_512K | ||
110 | #elif defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) | ||
111 | #define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB | 97 | #define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB |
112 | #define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB | 98 | #define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB |
113 | #else | ||
114 | #error Broken huge page size setting... | ||
115 | #endif | ||
116 | #endif | 99 | #endif |
117 | 100 | ||
118 | static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes) | 101 | static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes) |
@@ -207,7 +190,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign | |||
207 | case MM_TSB_BASE: | 190 | case MM_TSB_BASE: |
208 | hp->pgsz_idx = HV_PGSZ_IDX_BASE; | 191 | hp->pgsz_idx = HV_PGSZ_IDX_BASE; |
209 | break; | 192 | break; |
210 | #ifdef CONFIG_HUGETLB_PAGE | 193 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
211 | case MM_TSB_HUGE: | 194 | case MM_TSB_HUGE: |
212 | hp->pgsz_idx = HV_PGSZ_IDX_HUGE; | 195 | hp->pgsz_idx = HV_PGSZ_IDX_HUGE; |
213 | break; | 196 | break; |
@@ -222,7 +205,7 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsign | |||
222 | case MM_TSB_BASE: | 205 | case MM_TSB_BASE: |
223 | hp->pgsz_mask = HV_PGSZ_MASK_BASE; | 206 | hp->pgsz_mask = HV_PGSZ_MASK_BASE; |
224 | break; | 207 | break; |
225 | #ifdef CONFIG_HUGETLB_PAGE | 208 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
226 | case MM_TSB_HUGE: | 209 | case MM_TSB_HUGE: |
227 | hp->pgsz_mask = HV_PGSZ_MASK_HUGE; | 210 | hp->pgsz_mask = HV_PGSZ_MASK_HUGE; |
228 | break; | 211 | break; |
@@ -444,7 +427,7 @@ retry_tsb_alloc: | |||
444 | 427 | ||
445 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | 428 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) |
446 | { | 429 | { |
447 | #ifdef CONFIG_HUGETLB_PAGE | 430 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
448 | unsigned long huge_pte_count; | 431 | unsigned long huge_pte_count; |
449 | #endif | 432 | #endif |
450 | unsigned int i; | 433 | unsigned int i; |
@@ -453,7 +436,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |||
453 | 436 | ||
454 | mm->context.sparc64_ctx_val = 0UL; | 437 | mm->context.sparc64_ctx_val = 0UL; |
455 | 438 | ||
456 | #ifdef CONFIG_HUGETLB_PAGE | 439 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
457 | /* We reset it to zero because the fork() page copying | 440 | /* We reset it to zero because the fork() page copying |
458 | * will re-increment the counters as the parent PTEs are | 441 | * will re-increment the counters as the parent PTEs are |
459 | * copied into the child address space. | 442 | * copied into the child address space. |
@@ -462,6 +445,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |||
462 | mm->context.huge_pte_count = 0; | 445 | mm->context.huge_pte_count = 0; |
463 | #endif | 446 | #endif |
464 | 447 | ||
448 | mm->context.pgtable_page = NULL; | ||
449 | |||
465 | /* copy_mm() copies over the parent's mm_struct before calling | 450 | /* copy_mm() copies over the parent's mm_struct before calling |
466 | * us, so we need to zero out the TSB pointer or else tsb_grow() | 451 | * us, so we need to zero out the TSB pointer or else tsb_grow() |
467 | * will be confused and think there is an older TSB to free up. | 452 | * will be confused and think there is an older TSB to free up. |
@@ -474,7 +459,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |||
474 | */ | 459 | */ |
475 | tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm)); | 460 | tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm)); |
476 | 461 | ||
477 | #ifdef CONFIG_HUGETLB_PAGE | 462 | #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) |
478 | if (unlikely(huge_pte_count)) | 463 | if (unlikely(huge_pte_count)) |
479 | tsb_grow(mm, MM_TSB_HUGE, huge_pte_count); | 464 | tsb_grow(mm, MM_TSB_HUGE, huge_pte_count); |
480 | #endif | 465 | #endif |
@@ -500,10 +485,17 @@ static void tsb_destroy_one(struct tsb_config *tp) | |||
500 | void destroy_context(struct mm_struct *mm) | 485 | void destroy_context(struct mm_struct *mm) |
501 | { | 486 | { |
502 | unsigned long flags, i; | 487 | unsigned long flags, i; |
488 | struct page *page; | ||
503 | 489 | ||
504 | for (i = 0; i < MM_NUM_TSBS; i++) | 490 | for (i = 0; i < MM_NUM_TSBS; i++) |
505 | tsb_destroy_one(&mm->context.tsb_block[i]); | 491 | tsb_destroy_one(&mm->context.tsb_block[i]); |
506 | 492 | ||
493 | page = mm->context.pgtable_page; | ||
494 | if (page && put_page_testzero(page)) { | ||
495 | pgtable_page_dtor(page); | ||
496 | free_hot_cold_page(page, 0); | ||
497 | } | ||
498 | |||
507 | spin_lock_irqsave(&ctx_alloc_lock, flags); | 499 | spin_lock_irqsave(&ctx_alloc_lock, flags); |
508 | 500 | ||
509 | if (CTX_VALID(mm->context)) { | 501 | if (CTX_VALID(mm->context)) { |
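destroy_context() now drops the mm's reference on the shared pgtable_page and only destructs and frees the page when put_page_testzero() reports the last reference. A hedged sketch of that drop-and-maybe-free pattern with C11 atomics; the struct and helpers are invented for the demo, while the kernel works on struct page and its page allocator.

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a refcounted page shared by two "half-page" users. */
struct shared_page {
	atomic_int refcount;
	char payload[64];
};

static struct shared_page *page_get(struct shared_page *p)
{
	atomic_fetch_add(&p->refcount, 1);
	return p;
}

/* Returns 1 when the caller dropped the last reference, like put_page_testzero(). */
static int page_put_testzero(struct shared_page *p)
{
	return atomic_fetch_sub(&p->refcount, 1) == 1;
}

int main(void)
{
	struct shared_page *p = calloc(1, sizeof(*p));

	atomic_init(&p->refcount, 1);	/* first user */
	page_get(p);			/* second user shares the page */

	if (page_put_testzero(p))	/* first teardown: still shared */
		puts("unexpected free");
	if (page_put_testzero(p)) {	/* last teardown actually frees */
		puts("last reference dropped, freeing page");
		free(p);
	}
	return 0;
}
```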
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index c9a3c1fe7297..dc46490adca0 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig | |||
@@ -7,12 +7,15 @@ config TILE | |||
7 | select HAVE_DMA_API_DEBUG | 7 | select HAVE_DMA_API_DEBUG |
8 | select HAVE_KVM if !TILEGX | 8 | select HAVE_KVM if !TILEGX |
9 | select GENERIC_FIND_FIRST_BIT | 9 | select GENERIC_FIND_FIRST_BIT |
10 | select SYSCTL_EXCEPTION_TRACE | ||
10 | select USE_GENERIC_SMP_HELPERS | 11 | select USE_GENERIC_SMP_HELPERS |
11 | select CC_OPTIMIZE_FOR_SIZE | 12 | select CC_OPTIMIZE_FOR_SIZE |
13 | select HAVE_DEBUG_KMEMLEAK | ||
12 | select HAVE_GENERIC_HARDIRQS | 14 | select HAVE_GENERIC_HARDIRQS |
13 | select GENERIC_IRQ_PROBE | 15 | select GENERIC_IRQ_PROBE |
14 | select GENERIC_PENDING_IRQ if SMP | 16 | select GENERIC_PENDING_IRQ if SMP |
15 | select GENERIC_IRQ_SHOW | 17 | select GENERIC_IRQ_SHOW |
18 | select HAVE_DEBUG_BUGVERBOSE | ||
16 | select HAVE_SYSCALL_WRAPPERS if TILEGX | 19 | select HAVE_SYSCALL_WRAPPERS if TILEGX |
17 | select SYS_HYPERVISOR | 20 | select SYS_HYPERVISOR |
18 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 21 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h index b2042380a5aa..0f885af2b621 100644 --- a/arch/tile/include/asm/hugetlb.h +++ b/arch/tile/include/asm/hugetlb.h | |||
@@ -106,6 +106,10 @@ static inline void arch_release_hugepage(struct page *page) | |||
106 | { | 106 | { |
107 | } | 107 | } |
108 | 108 | ||
109 | static inline void arch_clear_hugepage_flags(struct page *page) | ||
110 | { | ||
111 | } | ||
112 | |||
109 | #ifdef CONFIG_HUGETLB_SUPER_PAGES | 113 | #ifdef CONFIG_HUGETLB_SUPER_PAGES |
110 | static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, | 114 | static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, |
111 | struct page *page, int writable) | 115 | struct page *page, int writable) |
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 758b6038c2b7..3cfa98bf9125 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c | |||
@@ -36,19 +36,14 @@ static void sim_notify_exec(const char *binary_name) | |||
36 | } while (c); | 36 | } while (c); |
37 | } | 37 | } |
38 | 38 | ||
39 | static int notify_exec(void) | 39 | static int notify_exec(struct mm_struct *mm) |
40 | { | 40 | { |
41 | int retval = 0; /* failure */ | 41 | int retval = 0; /* failure */ |
42 | struct vm_area_struct *vma = current->mm->mmap; | 42 | |
43 | while (vma) { | 43 | if (mm->exe_file) { |
44 | if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) | ||
45 | break; | ||
46 | vma = vma->vm_next; | ||
47 | } | ||
48 | if (vma) { | ||
49 | char *buf = (char *) __get_free_page(GFP_KERNEL); | 44 | char *buf = (char *) __get_free_page(GFP_KERNEL); |
50 | if (buf) { | 45 | if (buf) { |
51 | char *path = d_path(&vma->vm_file->f_path, | 46 | char *path = d_path(&mm->exe_file->f_path, |
52 | buf, PAGE_SIZE); | 47 | buf, PAGE_SIZE); |
53 | if (!IS_ERR(path)) { | 48 | if (!IS_ERR(path)) { |
54 | sim_notify_exec(path); | 49 | sim_notify_exec(path); |
@@ -106,16 +101,16 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, | |||
106 | unsigned long vdso_base; | 101 | unsigned long vdso_base; |
107 | int retval = 0; | 102 | int retval = 0; |
108 | 103 | ||
104 | down_write(&mm->mmap_sem); | ||
105 | |||
109 | /* | 106 | /* |
110 | * Notify the simulator that an exec just occurred. | 107 | * Notify the simulator that an exec just occurred. |
111 | * If we can't find the filename of the mapping, just use | 108 | * If we can't find the filename of the mapping, just use |
112 | * whatever was passed as the linux_binprm filename. | 109 | * whatever was passed as the linux_binprm filename. |
113 | */ | 110 | */ |
114 | if (!notify_exec()) | 111 | if (!notify_exec(mm)) |
115 | sim_notify_exec(bprm->filename); | 112 | sim_notify_exec(bprm->filename); |
116 | 113 | ||
117 | down_write(&mm->mmap_sem); | ||
118 | |||
119 | /* | 114 | /* |
120 | * MAYWRITE to allow gdb to COW and set breakpoints | 115 | * MAYWRITE to allow gdb to COW and set breakpoints |
121 | */ | 116 | */ |
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 84ce7abbf5af..fe811fa5f1b9 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c | |||
@@ -454,6 +454,7 @@ good_area: | |||
454 | tsk->min_flt++; | 454 | tsk->min_flt++; |
455 | if (fault & VM_FAULT_RETRY) { | 455 | if (fault & VM_FAULT_RETRY) { |
456 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 456 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
457 | flags |= FAULT_FLAG_TRIED; | ||
457 | 458 | ||
458 | /* | 459 | /* |
459 | * No need to up_read(&mm->mmap_sem) as we would | 460 | * No need to up_read(&mm->mmap_sem) as we would |
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index cb837c223922..648121b037d5 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common | |||
@@ -7,6 +7,7 @@ config UML | |||
7 | bool | 7 | bool |
8 | default y | 8 | default y |
9 | select HAVE_GENERIC_HARDIRQS | 9 | select HAVE_GENERIC_HARDIRQS |
10 | select HAVE_UID16 | ||
10 | select GENERIC_IRQ_SHOW | 11 | select GENERIC_IRQ_SHOW |
11 | select GENERIC_CPU_DEVICES | 12 | select GENERIC_CPU_DEVICES |
12 | select GENERIC_IO | 13 | select GENERIC_IO |
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 0353b98ae35a..0f00e9c82080 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c | |||
@@ -89,6 +89,7 @@ good_area: | |||
89 | current->min_flt++; | 89 | current->min_flt++; |
90 | if (fault & VM_FAULT_RETRY) { | 90 | if (fault & VM_FAULT_RETRY) { |
91 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 91 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
92 | flags |= FAULT_FLAG_TRIED; | ||
92 | 93 | ||
93 | goto retry; | 94 | goto retry; |
94 | } | 95 | } |
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index b6f0458c3143..b008586dad75 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c | |||
@@ -380,7 +380,7 @@ int vectors_user_mapping(void) | |||
380 | return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, | 380 | return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, |
381 | VM_READ | VM_EXEC | | 381 | VM_READ | VM_EXEC | |
382 | VM_MAYREAD | VM_MAYEXEC | | 382 | VM_MAYREAD | VM_MAYEXEC | |
383 | VM_RESERVED, | 383 | VM_DONTEXPAND | VM_DONTDUMP, |
384 | NULL); | 384 | NULL); |
385 | } | 385 | } |
386 | 386 | ||
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b72777ff32a9..1ae94bcae5d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -10,6 +10,7 @@ config X86_32 | |||
10 | def_bool y | 10 | def_bool y |
11 | depends on !64BIT | 11 | depends on !64BIT |
12 | select CLKSRC_I8253 | 12 | select CLKSRC_I8253 |
13 | select HAVE_UID16 | ||
13 | 14 | ||
14 | config X86_64 | 15 | config X86_64 |
15 | def_bool y | 16 | def_bool y |
@@ -46,6 +47,7 @@ config X86 | |||
46 | select HAVE_FUNCTION_GRAPH_FP_TEST | 47 | select HAVE_FUNCTION_GRAPH_FP_TEST |
47 | select HAVE_FUNCTION_TRACE_MCOUNT_TEST | 48 | select HAVE_FUNCTION_TRACE_MCOUNT_TEST |
48 | select HAVE_SYSCALL_TRACEPOINTS | 49 | select HAVE_SYSCALL_TRACEPOINTS |
50 | select SYSCTL_EXCEPTION_TRACE | ||
49 | select HAVE_KVM | 51 | select HAVE_KVM |
50 | select HAVE_ARCH_KGDB | 52 | select HAVE_ARCH_KGDB |
51 | select HAVE_ARCH_TRACEHOOK | 53 | select HAVE_ARCH_TRACEHOOK |
@@ -65,6 +67,7 @@ config X86 | |||
65 | select HAVE_PERF_EVENTS_NMI | 67 | select HAVE_PERF_EVENTS_NMI |
66 | select HAVE_PERF_REGS | 68 | select HAVE_PERF_REGS |
67 | select HAVE_PERF_USER_STACK_DUMP | 69 | select HAVE_PERF_USER_STACK_DUMP |
70 | select HAVE_DEBUG_KMEMLEAK | ||
68 | select ANON_INODES | 71 | select ANON_INODES |
69 | select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 | 72 | select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 |
70 | select HAVE_CMPXCHG_LOCAL if !M386 | 73 | select HAVE_CMPXCHG_LOCAL if !M386 |
@@ -85,6 +88,7 @@ config X86 | |||
85 | select IRQ_FORCED_THREADING | 88 | select IRQ_FORCED_THREADING |
86 | select USE_GENERIC_SMP_HELPERS if SMP | 89 | select USE_GENERIC_SMP_HELPERS if SMP |
87 | select HAVE_BPF_JIT if X86_64 | 90 | select HAVE_BPF_JIT if X86_64 |
91 | select HAVE_ARCH_TRANSPARENT_HUGEPAGE | ||
88 | select CLKEVT_I8253 | 92 | select CLKEVT_I8253 |
89 | select ARCH_HAVE_NMI_SAFE_CMPXCHG | 93 | select ARCH_HAVE_NMI_SAFE_CMPXCHG |
90 | select GENERIC_IOMAP | 94 | select GENERIC_IOMAP |
@@ -2168,6 +2172,7 @@ config IA32_EMULATION | |||
2168 | bool "IA32 Emulation" | 2172 | bool "IA32 Emulation" |
2169 | depends on X86_64 | 2173 | depends on X86_64 |
2170 | select COMPAT_BINFMT_ELF | 2174 | select COMPAT_BINFMT_ELF |
2175 | select HAVE_UID16 | ||
2171 | ---help--- | 2176 | ---help--- |
2172 | Include code to run legacy 32-bit programs under a | 2177 | Include code to run legacy 32-bit programs under a |
2173 | 64-bit kernel. You should likely turn this on, unless you're | 2178 | 64-bit kernel. You should likely turn this on, unless you're |
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 250b8774c158..b6c3b821acf6 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h | |||
@@ -240,30 +240,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) | |||
240 | return c; | 240 | return c; |
241 | } | 241 | } |
242 | 242 | ||
243 | |||
244 | /* | ||
245 | * atomic_dec_if_positive - decrement by 1 if old value positive | ||
246 | * @v: pointer of type atomic_t | ||
247 | * | ||
248 | * The function returns the old value of *v minus 1, even if | ||
249 | * the atomic variable, v, was not decremented. | ||
250 | */ | ||
251 | static inline int atomic_dec_if_positive(atomic_t *v) | ||
252 | { | ||
253 | int c, old, dec; | ||
254 | c = atomic_read(v); | ||
255 | for (;;) { | ||
256 | dec = c - 1; | ||
257 | if (unlikely(dec < 0)) | ||
258 | break; | ||
259 | old = atomic_cmpxchg((v), c, dec); | ||
260 | if (likely(old == c)) | ||
261 | break; | ||
262 | c = old; | ||
263 | } | ||
264 | return dec; | ||
265 | } | ||
266 | |||
267 | /** | 243 | /** |
268 | * atomic_inc_short - increment of a short integer | 244 | * atomic_inc_short - increment of a short integer |
269 | * @v: pointer to type int | 245 | * @v: pointer to type int |
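The x86-private atomic_dec_if_positive() removed above survives as a generic helper elsewhere in this series; its compare-and-swap retry loop is the interesting part. Below is a user-space sketch of the same loop using C11 atomics, not the kernel's implementation.

```c
#include <stdatomic.h>
#include <stdio.h>

/* Decrement *v only if the result stays non-negative; return old-1 either
 * way, mirroring the semantics the removed x86 helper documented. */
static int dec_if_positive(atomic_int *v)
{
	int c = atomic_load(v);

	for (;;) {
		int dec = c - 1;

		if (dec < 0)
			break;		/* would go negative: leave *v alone */
		/* On failure, compare_exchange refreshes c with the current
		 * value, so the loop simply retries with fresh data. */
		if (atomic_compare_exchange_weak(v, &c, dec))
			break;
	}
	return c - 1;
}

int main(void)
{
	atomic_int v;

	atomic_init(&v, 1);
	printf("%d\n", dec_if_positive(&v));	/* 0: decremented */
	printf("%d\n", dec_if_positive(&v));	/* -1: left at 0  */
	printf("%d\n", atomic_load(&v));	/* 0              */
	return 0;
}
```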
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 439a9acc132d..bdd35dbd0605 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h | |||
@@ -90,4 +90,8 @@ static inline void arch_release_hugepage(struct page *page) | |||
90 | { | 90 | { |
91 | } | 91 | } |
92 | 92 | ||
93 | static inline void arch_clear_hugepage_flags(struct page *page) | ||
94 | { | ||
95 | } | ||
96 | |||
93 | #endif /* _ASM_X86_HUGETLB_H */ | 97 | #endif /* _ASM_X86_HUGETLB_H */ |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fc9948465293..a1f780d45f76 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -146,8 +146,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) | |||
146 | 146 | ||
147 | static inline int pmd_large(pmd_t pte) | 147 | static inline int pmd_large(pmd_t pte) |
148 | { | 148 | { |
149 | return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == | 149 | return pmd_flags(pte) & _PAGE_PSE; |
150 | (_PAGE_PSE | _PAGE_PRESENT); | ||
151 | } | 150 | } |
152 | 151 | ||
153 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 152 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -415,7 +414,13 @@ static inline int pte_hidden(pte_t pte) | |||
415 | 414 | ||
416 | static inline int pmd_present(pmd_t pmd) | 415 | static inline int pmd_present(pmd_t pmd) |
417 | { | 416 | { |
418 | return pmd_flags(pmd) & _PAGE_PRESENT; | 417 | /* |
418 | * Checking for _PAGE_PSE is needed too because | ||
419 | * split_huge_page will temporarily clear the present bit (but | ||
420 | * the _PAGE_PSE flag will remain set at all times while the | ||
421 | * _PAGE_PRESENT bit is clear). | ||
422 | */ | ||
423 | return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); | ||
419 | } | 424 | } |
420 | 425 | ||
421 | static inline int pmd_none(pmd_t pmd) | 426 | static inline int pmd_none(pmd_t pmd) |
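The comment added to pmd_present() explains the key subtlety: split_huge_page() temporarily clears _PAGE_PRESENT while _PAGE_PSE stays set, so presence must be judged from PRESENT, PROTNONE, or PSE, and pmd_large() can rely on PSE alone. A tiny sketch of that flag logic follows; the bit values are invented for the demo, not the real x86 definitions.

```c
#include <assert.h>
#include <stdint.h>

/* Invented flag values for the demo, not the real x86 ones. */
#define DEMO_PRESENT  (1u << 0)
#define DEMO_PSE      (1u << 7)
#define DEMO_PROTNONE (1u << 8)

static int demo_pmd_large(uint32_t flags)
{
	return !!(flags & DEMO_PSE);
}

static int demo_pmd_present(uint32_t flags)
{
	/* A splitting huge PMD has PRESENT clear but PSE still set, so it
	 * must keep counting as present; PROTNONE covers PROT_NONE ranges. */
	return !!(flags & (DEMO_PRESENT | DEMO_PROTNONE | DEMO_PSE));
}

int main(void)
{
	uint32_t splitting = DEMO_PSE;		/* mid split_huge_page */
	uint32_t normal    = DEMO_PRESENT;
	uint32_t none      = 0;

	assert(demo_pmd_present(splitting) && demo_pmd_large(splitting));
	assert(demo_pmd_present(normal) && !demo_pmd_large(normal));
	assert(!demo_pmd_present(none));
	return 0;
}
```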
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 0c92113c4cb6..8faa215a503e 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h | |||
@@ -71,6 +71,7 @@ do { \ | |||
71 | * tables contain all the necessary information. | 71 | * tables contain all the necessary information. |
72 | */ | 72 | */ |
73 | #define update_mmu_cache(vma, address, ptep) do { } while (0) | 73 | #define update_mmu_cache(vma, address, ptep) do { } while (0) |
74 | #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) | ||
74 | 75 | ||
75 | #endif /* !__ASSEMBLY__ */ | 76 | #endif /* !__ASSEMBLY__ */ |
76 | 77 | ||
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 8251be02301e..47356f9df82e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -143,6 +143,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } | |||
143 | #define pte_unmap(pte) ((void)(pte))/* NOP */ | 143 | #define pte_unmap(pte) ((void)(pte))/* NOP */ |
144 | 144 | ||
145 | #define update_mmu_cache(vma, address, ptep) do { } while (0) | 145 | #define update_mmu_cache(vma, address, ptep) do { } while (0) |
146 | #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) | ||
146 | 147 | ||
147 | /* Encode and de-code a swap entry */ | 148 | /* Encode and de-code a swap entry */ |
148 | #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE | 149 | #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a530b230e7d7..8e13ecb41bee 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -1220,6 +1220,7 @@ good_area: | |||
1220 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | 1220 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk |
1221 | * of starvation. */ | 1221 | * of starvation. */ |
1222 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 1222 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
1223 | flags |= FAULT_FLAG_TRIED; | ||
1223 | goto retry; | 1224 | goto retry; |
1224 | } | 1225 | } |
1225 | } | 1226 | } |
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index b91e48512425..937bff5cdaa7 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c | |||
@@ -71,7 +71,6 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
71 | struct address_space *mapping = vma->vm_file->f_mapping; | 71 | struct address_space *mapping = vma->vm_file->f_mapping; |
72 | pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + | 72 | pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + |
73 | vma->vm_pgoff; | 73 | vma->vm_pgoff; |
74 | struct prio_tree_iter iter; | ||
75 | struct vm_area_struct *svma; | 74 | struct vm_area_struct *svma; |
76 | unsigned long saddr; | 75 | unsigned long saddr; |
77 | pte_t *spte = NULL; | 76 | pte_t *spte = NULL; |
@@ -81,7 +80,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
81 | return (pte_t *)pmd_alloc(mm, pud, addr); | 80 | return (pte_t *)pmd_alloc(mm, pud, addr); |
82 | 81 | ||
83 | mutex_lock(&mapping->i_mmap_mutex); | 82 | mutex_lock(&mapping->i_mmap_mutex); |
84 | vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { | 83 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { |
85 | if (svma == vma) | 84 | if (svma == vma) |
86 | continue; | 85 | continue; |
87 | 86 | ||
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3d68ef6d2266..0eb572eda406 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -664,20 +664,20 @@ static void free_pfn_range(u64 paddr, unsigned long size) | |||
664 | } | 664 | } |
665 | 665 | ||
666 | /* | 666 | /* |
667 | * track_pfn_vma_copy is called when vma that is covering the pfnmap gets | 667 | * track_pfn_copy is called when vma that is covering the pfnmap gets |
668 | * copied through copy_page_range(). | 668 | * copied through copy_page_range(). |
669 | * | 669 | * |
670 | * If the vma has a linear pfn mapping for the entire range, we get the prot | 670 | * If the vma has a linear pfn mapping for the entire range, we get the prot |
671 | * from pte and reserve the entire vma range with single reserve_pfn_range call. | 671 | * from pte and reserve the entire vma range with single reserve_pfn_range call. |
672 | */ | 672 | */ |
673 | int track_pfn_vma_copy(struct vm_area_struct *vma) | 673 | int track_pfn_copy(struct vm_area_struct *vma) |
674 | { | 674 | { |
675 | resource_size_t paddr; | 675 | resource_size_t paddr; |
676 | unsigned long prot; | 676 | unsigned long prot; |
677 | unsigned long vma_size = vma->vm_end - vma->vm_start; | 677 | unsigned long vma_size = vma->vm_end - vma->vm_start; |
678 | pgprot_t pgprot; | 678 | pgprot_t pgprot; |
679 | 679 | ||
680 | if (is_linear_pfn_mapping(vma)) { | 680 | if (vma->vm_flags & VM_PAT) { |
681 | /* | 681 | /* |
682 | * reserve the whole chunk covered by vma. We need the | 682 | * reserve the whole chunk covered by vma. We need the |
683 | * starting address and protection from pte. | 683 | * starting address and protection from pte. |
@@ -694,31 +694,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) | |||
694 | } | 694 | } |
695 | 695 | ||
696 | /* | 696 | /* |
697 | * track_pfn_vma_new is called when a _new_ pfn mapping is being established | ||
698 | * for physical range indicated by pfn and size. | ||
699 | * | ||
700 | * prot is passed in as a parameter for the new mapping. If the vma has a | 697 | * prot is passed in as a parameter for the new mapping. If the vma has a |
701 | * linear pfn mapping for the entire range reserve the entire vma range with | 698 | * linear pfn mapping for the entire range reserve the entire vma range with |
702 | * single reserve_pfn_range call. | 699 | * single reserve_pfn_range call. |
703 | */ | 700 | */ |
704 | int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, | 701 | int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, |
705 | unsigned long pfn, unsigned long size) | 702 | unsigned long pfn, unsigned long addr, unsigned long size) |
706 | { | 703 | { |
704 | resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; | ||
707 | unsigned long flags; | 705 | unsigned long flags; |
708 | resource_size_t paddr; | ||
709 | unsigned long vma_size = vma->vm_end - vma->vm_start; | ||
710 | 706 | ||
711 | if (is_linear_pfn_mapping(vma)) { | 707 | /* reserve the whole chunk starting from paddr */ |
712 | /* reserve the whole chunk starting from vm_pgoff */ | 708 | if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { |
713 | paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; | 709 | int ret; |
714 | return reserve_pfn_range(paddr, vma_size, prot, 0); | 710 | |
711 | ret = reserve_pfn_range(paddr, size, prot, 0); | ||
712 | if (!ret) | ||
713 | vma->vm_flags |= VM_PAT; | ||
714 | return ret; | ||
715 | } | 715 | } |
716 | 716 | ||
717 | if (!pat_enabled) | 717 | if (!pat_enabled) |
718 | return 0; | 718 | return 0; |
719 | 719 | ||
720 | /* for vm_insert_pfn and friends, we set prot based on lookup */ | 720 | /* |
721 | flags = lookup_memtype(pfn << PAGE_SHIFT); | 721 | * For anything smaller than the vma size we set prot based on the |
722 | * lookup. | ||
723 | */ | ||
724 | flags = lookup_memtype(paddr); | ||
725 | |||
726 | /* Check memtype for the remaining pages */ | ||
727 | while (size > PAGE_SIZE) { | ||
728 | size -= PAGE_SIZE; | ||
729 | paddr += PAGE_SIZE; | ||
730 | if (flags != lookup_memtype(paddr)) | ||
731 | return -EINVAL; | ||
732 | } | ||
733 | |||
734 | *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | | ||
735 | flags); | ||
736 | |||
737 | return 0; | ||
738 | } | ||
739 | |||
740 | int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, | ||
741 | unsigned long pfn) | ||
742 | { | ||
743 | unsigned long flags; | ||
744 | |||
745 | if (!pat_enabled) | ||
746 | return 0; | ||
747 | |||
748 | /* Set prot based on lookup */ | ||
749 | flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); | ||
722 | *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | | 750 | *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | |
723 | flags); | 751 | flags); |
724 | 752 | ||
@@ -726,22 +754,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, | |||
726 | } | 754 | } |
727 | 755 | ||
728 | /* | 756 | /* |
729 | * untrack_pfn_vma is called while unmapping a pfnmap for a region. | 757 | * untrack_pfn is called while unmapping a pfnmap for a region. |
730 | * untrack can be called for a specific region indicated by pfn and size or | 758 | * untrack can be called for a specific region indicated by pfn and size or |
731 | * can be for the entire vma (in which case size can be zero). | 759 | * can be for the entire vma (in which case pfn, size are zero). |
732 | */ | 760 | */ |
733 | void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, | 761 | void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, |
734 | unsigned long size) | 762 | unsigned long size) |
735 | { | 763 | { |
736 | resource_size_t paddr; | 764 | resource_size_t paddr; |
737 | unsigned long vma_size = vma->vm_end - vma->vm_start; | 765 | unsigned long prot; |
738 | 766 | ||
739 | if (is_linear_pfn_mapping(vma)) { | 767 | if (!(vma->vm_flags & VM_PAT)) |
740 | /* free the whole chunk starting from vm_pgoff */ | ||
741 | paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; | ||
742 | free_pfn_range(paddr, vma_size); | ||
743 | return; | 768 | return; |
769 | |||
770 | /* free the chunk starting from pfn or the whole chunk */ | ||
771 | paddr = (resource_size_t)pfn << PAGE_SHIFT; | ||
772 | if (!paddr && !size) { | ||
773 | if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { | ||
774 | WARN_ON_ONCE(1); | ||
775 | return; | ||
776 | } | ||
777 | |||
778 | size = vma->vm_end - vma->vm_start; | ||
744 | } | 779 | } |
780 | free_pfn_range(paddr, size); | ||
781 | vma->vm_flags &= ~VM_PAT; | ||
745 | } | 782 | } |
746 | 783 | ||
747 | pgprot_t pgprot_writecombine(pgprot_t prot) | 784 | pgprot_t pgprot_writecombine(pgprot_t prot) |
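For a remap that covers less than the whole VMA, the reworked track_pfn_remap() looks up the memtype of the first page and then walks the remaining pages, rejecting the mapping if any page disagrees. The sketch below shows only that uniformity walk; lookup_memtype() here is a toy stand-in and its 1 MiB threshold is made up.

```c
#include <errno.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Toy memtype lookup: pretend everything below 1 MiB is uncached (1) and
 * everything above is write-back (0). Purely illustrative. */
static int lookup_memtype(unsigned long paddr)
{
	return paddr < (1UL << 20);
}

/* Walk the range and make sure every page reports the same memtype, the way
 * track_pfn_remap() does for partial-VMA remaps. */
static int check_uniform_memtype(unsigned long paddr, unsigned long size,
				 int *memtype)
{
	int flags = lookup_memtype(paddr);

	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (lookup_memtype(paddr) != flags)
			return -EINVAL;
	}
	*memtype = flags;
	return 0;
}

int main(void)
{
	int memtype;

	if (check_uniform_memtype(0x10000, 16 * PAGE_SIZE, &memtype) == 0)
		printf("uniform range, memtype=%d\n", memtype);
	if (check_uniform_memtype((1UL << 20) - 2 * PAGE_SIZE,
				  8 * PAGE_SIZE, &memtype) != 0)
		printf("mixed range rejected\n");
	return 0;
}
```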
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 8acaddd0fb21..415f6c4ced36 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/debugfs.h> | 12 | #include <linux/debugfs.h> |
13 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/rbtree.h> | 15 | #include <linux/rbtree_augmented.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/gfp.h> | 17 | #include <linux/gfp.h> |
18 | 18 | ||
@@ -54,29 +54,24 @@ static u64 get_subtree_max_end(struct rb_node *node) | |||
54 | return ret; | 54 | return ret; |
55 | } | 55 | } |
56 | 56 | ||
57 | /* Update 'subtree_max_end' for a node, based on node and its children */ | 57 | static u64 compute_subtree_max_end(struct memtype *data) |
58 | static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) | ||
59 | { | 58 | { |
60 | struct memtype *data; | 59 | u64 max_end = data->end, child_max_end; |
61 | u64 max_end, child_max_end; | ||
62 | |||
63 | if (!node) | ||
64 | return; | ||
65 | 60 | ||
66 | data = container_of(node, struct memtype, rb); | 61 | child_max_end = get_subtree_max_end(data->rb.rb_right); |
67 | max_end = data->end; | ||
68 | |||
69 | child_max_end = get_subtree_max_end(node->rb_right); | ||
70 | if (child_max_end > max_end) | 62 | if (child_max_end > max_end) |
71 | max_end = child_max_end; | 63 | max_end = child_max_end; |
72 | 64 | ||
73 | child_max_end = get_subtree_max_end(node->rb_left); | 65 | child_max_end = get_subtree_max_end(data->rb.rb_left); |
74 | if (child_max_end > max_end) | 66 | if (child_max_end > max_end) |
75 | max_end = child_max_end; | 67 | max_end = child_max_end; |
76 | 68 | ||
77 | data->subtree_max_end = max_end; | 69 | return max_end; |
78 | } | 70 | } |
79 | 71 | ||
72 | RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, | ||
73 | u64, subtree_max_end, compute_subtree_max_end) | ||
74 | |||
80 | /* Find the first (lowest start addr) overlapping range from rb tree */ | 75 | /* Find the first (lowest start addr) overlapping range from rb tree */ |
81 | static struct memtype *memtype_rb_lowest_match(struct rb_root *root, | 76 | static struct memtype *memtype_rb_lowest_match(struct rb_root *root, |
82 | u64 start, u64 end) | 77 | u64 start, u64 end) |
@@ -179,15 +174,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) | |||
179 | struct memtype *data = container_of(*node, struct memtype, rb); | 174 | struct memtype *data = container_of(*node, struct memtype, rb); |
180 | 175 | ||
181 | parent = *node; | 176 | parent = *node; |
177 | if (data->subtree_max_end < newdata->end) | ||
178 | data->subtree_max_end = newdata->end; | ||
182 | if (newdata->start <= data->start) | 179 | if (newdata->start <= data->start) |
183 | node = &((*node)->rb_left); | 180 | node = &((*node)->rb_left); |
184 | else if (newdata->start > data->start) | 181 | else if (newdata->start > data->start) |
185 | node = &((*node)->rb_right); | 182 | node = &((*node)->rb_right); |
186 | } | 183 | } |
187 | 184 | ||
185 | newdata->subtree_max_end = newdata->end; | ||
188 | rb_link_node(&newdata->rb, parent, node); | 186 | rb_link_node(&newdata->rb, parent, node); |
189 | rb_insert_color(&newdata->rb, root); | 187 | rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); |
190 | rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); | ||
191 | } | 188 | } |
192 | 189 | ||
193 | int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) | 190 | int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) |
@@ -209,16 +206,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) | |||
209 | 206 | ||
210 | struct memtype *rbt_memtype_erase(u64 start, u64 end) | 207 | struct memtype *rbt_memtype_erase(u64 start, u64 end) |
211 | { | 208 | { |
212 | struct rb_node *deepest; | ||
213 | struct memtype *data; | 209 | struct memtype *data; |
214 | 210 | ||
215 | data = memtype_rb_exact_match(&memtype_rbroot, start, end); | 211 | data = memtype_rb_exact_match(&memtype_rbroot, start, end); |
216 | if (!data) | 212 | if (!data) |
217 | goto out; | 213 | goto out; |
218 | 214 | ||
219 | deepest = rb_augment_erase_begin(&data->rb); | 215 | rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); |
220 | rb_erase(&data->rb, &memtype_rbroot); | ||
221 | rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); | ||
222 | out: | 216 | out: |
223 | return data; | 217 | return data; |
224 | } | 218 | } |
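The pat_rbtree.c conversion replaces the hand-rolled augment callback with RB_DECLARE_CALLBACKS() driven by compute_subtree_max_end(), which keeps each node's subtree_max_end equal to the largest end address anywhere in its subtree. The sketch below shows that computation on a plain three-node tree; the real code stores nodes in a struct rb_node and lets the augmented-rbtree callbacks refresh the field on insertions, erases, and rotations.

```c
#include <stdint.h>
#include <stdio.h>

/* Simplified interval node; the kernel version hangs the same fields off
 * struct rb_node inside struct memtype. */
struct interval {
	uint64_t start, end;
	uint64_t subtree_max_end;
	struct interval *left, *right;
};

static uint64_t get_subtree_max_end(struct interval *node)
{
	return node ? node->subtree_max_end : 0;
}

/* Same job as compute_subtree_max_end(): the largest end address found in
 * this node or anywhere below it. */
static uint64_t compute_subtree_max_end(struct interval *node)
{
	uint64_t max_end = node->end, child;

	child = get_subtree_max_end(node->left);
	if (child > max_end)
		max_end = child;
	child = get_subtree_max_end(node->right);
	if (child > max_end)
		max_end = child;
	return max_end;
}

int main(void)
{
	struct interval a = { .start = 0x1000, .end = 0x2000 };
	struct interval b = { .start = 0x3000, .end = 0x9000 };
	struct interval root = { .start = 0x2000, .end = 0x4000,
				 .left = &a, .right = &b };

	/* Fix up bottom-up, the way the augmented callbacks do after insert. */
	a.subtree_max_end = compute_subtree_max_end(&a);
	b.subtree_max_end = compute_subtree_max_end(&b);
	root.subtree_max_end = compute_subtree_max_end(&root);

	printf("root covers ends up to %#llx\n",
	       (unsigned long long)root.subtree_max_end);	/* 0x9000 */
	return 0;
}
```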
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5a16824cc2b3..fd28d86fe3d2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -2451,8 +2451,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, | |||
2451 | 2451 | ||
2452 | prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); | 2452 | prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); |
2453 | 2453 | ||
2454 | BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == | 2454 | BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); |
2455 | (VM_PFNMAP | VM_RESERVED | VM_IO))); | ||
2456 | 2455 | ||
2457 | rmd.mfn = mfn; | 2456 | rmd.mfn = mfn; |
2458 | rmd.prot = prot; | 2457 | rmd.prot = prot; |
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 5a74c53bc69c..2c2f710ed1dc 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c | |||
@@ -126,6 +126,7 @@ good_area: | |||
126 | current->min_flt++; | 126 | current->min_flt++; |
127 | if (fault & VM_FAULT_RETRY) { | 127 | if (fault & VM_FAULT_RETRY) { |
128 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | 128 | flags &= ~FAULT_FLAG_ALLOW_RETRY; |
129 | flags |= FAULT_FLAG_TRIED; | ||
129 | 130 | ||
130 | /* No need to up_read(&mm->mmap_sem) as we would | 131 | /* No need to up_read(&mm->mmap_sem) as we would |
131 | * have already released it in __lock_page_or_retry | 132 | * have already released it in __lock_page_or_retry |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7dda4f790f00..86c88216a503 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -248,26 +248,23 @@ static bool pages_correctly_reserved(unsigned long start_pfn, | |||
248 | static int | 248 | static int |
249 | memory_block_action(unsigned long phys_index, unsigned long action) | 249 | memory_block_action(unsigned long phys_index, unsigned long action) |
250 | { | 250 | { |
251 | unsigned long start_pfn, start_paddr; | 251 | unsigned long start_pfn; |
252 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; | 252 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; |
253 | struct page *first_page; | 253 | struct page *first_page; |
254 | int ret; | 254 | int ret; |
255 | 255 | ||
256 | first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); | 256 | first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); |
257 | start_pfn = page_to_pfn(first_page); | ||
257 | 258 | ||
258 | switch (action) { | 259 | switch (action) { |
259 | case MEM_ONLINE: | 260 | case MEM_ONLINE: |
260 | start_pfn = page_to_pfn(first_page); | ||
261 | |||
262 | if (!pages_correctly_reserved(start_pfn, nr_pages)) | 261 | if (!pages_correctly_reserved(start_pfn, nr_pages)) |
263 | return -EBUSY; | 262 | return -EBUSY; |
264 | 263 | ||
265 | ret = online_pages(start_pfn, nr_pages); | 264 | ret = online_pages(start_pfn, nr_pages); |
266 | break; | 265 | break; |
267 | case MEM_OFFLINE: | 266 | case MEM_OFFLINE: |
268 | start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; | 267 | ret = offline_pages(start_pfn, nr_pages); |
269 | ret = remove_memory(start_paddr, | ||
270 | nr_pages << PAGE_SHIFT); | ||
271 | break; | 268 | break; |
272 | default: | 269 | default: |
273 | WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " | 270 | WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " |
@@ -278,13 +275,11 @@ memory_block_action(unsigned long phys_index, unsigned long action) | |||
278 | return ret; | 275 | return ret; |
279 | } | 276 | } |
280 | 277 | ||
281 | static int memory_block_change_state(struct memory_block *mem, | 278 | static int __memory_block_change_state(struct memory_block *mem, |
282 | unsigned long to_state, unsigned long from_state_req) | 279 | unsigned long to_state, unsigned long from_state_req) |
283 | { | 280 | { |
284 | int ret = 0; | 281 | int ret = 0; |
285 | 282 | ||
286 | mutex_lock(&mem->state_mutex); | ||
287 | |||
288 | if (mem->state != from_state_req) { | 283 | if (mem->state != from_state_req) { |
289 | ret = -EINVAL; | 284 | ret = -EINVAL; |
290 | goto out; | 285 | goto out; |
@@ -312,10 +307,20 @@ static int memory_block_change_state(struct memory_block *mem, | |||
312 | break; | 307 | break; |
313 | } | 308 | } |
314 | out: | 309 | out: |
315 | mutex_unlock(&mem->state_mutex); | ||
316 | return ret; | 310 | return ret; |
317 | } | 311 | } |
318 | 312 | ||
313 | static int memory_block_change_state(struct memory_block *mem, | ||
314 | unsigned long to_state, unsigned long from_state_req) | ||
315 | { | ||
316 | int ret; | ||
317 | |||
318 | mutex_lock(&mem->state_mutex); | ||
319 | ret = __memory_block_change_state(mem, to_state, from_state_req); | ||
320 | mutex_unlock(&mem->state_mutex); | ||
321 | |||
322 | return ret; | ||
323 | } | ||
319 | static ssize_t | 324 | static ssize_t |
320 | store_mem_state(struct device *dev, | 325 | store_mem_state(struct device *dev, |
321 | struct device_attribute *attr, const char *buf, size_t count) | 326 | struct device_attribute *attr, const char *buf, size_t count) |
@@ -656,6 +661,21 @@ int unregister_memory_section(struct mem_section *section) | |||
656 | } | 661 | } |
657 | 662 | ||
658 | /* | 663 | /* |
664 | * offline one memory block. If the memory block has been offlined, do nothing. | ||
665 | */ | ||
666 | int offline_memory_block(struct memory_block *mem) | ||
667 | { | ||
668 | int ret = 0; | ||
669 | |||
670 | mutex_lock(&mem->state_mutex); | ||
671 | if (mem->state != MEM_OFFLINE) | ||
672 | ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | ||
673 | mutex_unlock(&mem->state_mutex); | ||
674 | |||
675 | return ret; | ||
676 | } | ||
677 | |||
678 | /* | ||
659 | * Initialize the sysfs support for memory devices... | 679 | * Initialize the sysfs support for memory devices... |
660 | */ | 680 | */ |
661 | int __init memory_dev_init(void) | 681 | int __init memory_dev_init(void) |
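The memory.c change splits the state transition into an unlocked __memory_block_change_state() plus a locking wrapper, so the new offline_memory_block() can take state_mutex once for its own check-then-act and call the unlocked helper without re-acquiring the lock. A hedged pthread sketch of that locked/unlocked helper pattern; the types and return codes are simplified for the demo.

```c
#include <pthread.h>
#include <stdio.h>

enum state { MEM_ONLINE, MEM_OFFLINE };

struct memory_block {
	pthread_mutex_t state_mutex;
	enum state state;
};

/* Caller must hold mem->state_mutex (the double-underscore convention). */
static int __change_state(struct memory_block *mem, enum state to, enum state from)
{
	if (mem->state != from)
		return -1;
	mem->state = to;
	return 0;
}

/* Thin wrapper that only adds the locking. */
static int change_state(struct memory_block *mem, enum state to, enum state from)
{
	int ret;

	pthread_mutex_lock(&mem->state_mutex);
	ret = __change_state(mem, to, from);
	pthread_mutex_unlock(&mem->state_mutex);
	return ret;
}

/* Needs the lock across its own check, so it calls the unlocked helper
 * rather than going through the wrapper and re-taking state_mutex. */
static int offline_block(struct memory_block *mem)
{
	int ret = 0;

	pthread_mutex_lock(&mem->state_mutex);
	if (mem->state != MEM_OFFLINE)
		ret = __change_state(mem, MEM_OFFLINE, MEM_ONLINE);
	pthread_mutex_unlock(&mem->state_mutex);
	return ret;
}

int main(void)
{
	static struct memory_block mem = {
		.state_mutex = PTHREAD_MUTEX_INITIALIZER,
		.state = MEM_ONLINE,
	};

	printf("offline: %d\n", offline_block(&mem));		/* 0 */
	printf("offline again: %d\n", offline_block(&mem));	/* 0, no-op */
	printf("online: %d\n", change_state(&mem, MEM_ONLINE, MEM_OFFLINE));
	return 0;
}
```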
diff --git a/drivers/char/mbcs.c b/drivers/char/mbcs.c index 0c7d340b9ab9..f74e892711dd 100644 --- a/drivers/char/mbcs.c +++ b/drivers/char/mbcs.c | |||
@@ -507,7 +507,7 @@ static int mbcs_gscr_mmap(struct file *fp, struct vm_area_struct *vma) | |||
507 | 507 | ||
508 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 508 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
509 | 509 | ||
510 | /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ | 510 | /* Remap-pfn-range will mark the range VM_IO */ |
511 | if (remap_pfn_range(vma, | 511 | if (remap_pfn_range(vma, |
512 | vma->vm_start, | 512 | vma->vm_start, |
513 | __pa(soft->gscr_addr) >> PAGE_SHIFT, | 513 | __pa(soft->gscr_addr) >> PAGE_SHIFT, |
diff --git a/drivers/char/mem.c b/drivers/char/mem.c index e5eedfa24c91..0537903c985b 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c | |||
@@ -322,7 +322,7 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma) | |||
322 | 322 | ||
323 | vma->vm_ops = &mmap_mem_ops; | 323 | vma->vm_ops = &mmap_mem_ops; |
324 | 324 | ||
325 | /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ | 325 | /* Remap-pfn-range will mark the range VM_IO */ |
326 | if (remap_pfn_range(vma, | 326 | if (remap_pfn_range(vma, |
327 | vma->vm_start, | 327 | vma->vm_start, |
328 | vma->vm_pgoff, | 328 | vma->vm_pgoff, |
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index 845f97fd1832..e1f60f968fdd 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c | |||
@@ -286,7 +286,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, | |||
286 | atomic_set(&vdata->refcnt, 1); | 286 | atomic_set(&vdata->refcnt, 1); |
287 | vma->vm_private_data = vdata; | 287 | vma->vm_private_data = vdata; |
288 | 288 | ||
289 | vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND); | 289 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; |
290 | if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED) | 290 | if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED) |
291 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 291 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
292 | vma->vm_ops = &mspec_vm_ops; | 292 | vma->vm_ops = &mspec_vm_ops; |
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 92177d5aedee..24efae464e2c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c | |||
@@ -706,7 +706,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) | |||
706 | goto out_unlock; | 706 | goto out_unlock; |
707 | } | 707 | } |
708 | 708 | ||
709 | vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; | 709 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; |
710 | vma->vm_ops = obj->dev->driver->gem_vm_ops; | 710 | vma->vm_ops = obj->dev->driver->gem_vm_ops; |
711 | vma->vm_private_data = map->handle; | 711 | vma->vm_private_data = map->handle; |
712 | vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); | 712 | vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); |
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index 23a824e6a22a..db7bd292410b 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c | |||
@@ -514,8 +514,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) | |||
514 | 514 | ||
515 | vma->vm_ops = &drm_vm_dma_ops; | 515 | vma->vm_ops = &drm_vm_dma_ops; |
516 | 516 | ||
517 | vma->vm_flags |= VM_RESERVED; /* Don't swap */ | 517 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
518 | vma->vm_flags |= VM_DONTEXPAND; | ||
519 | 518 | ||
520 | drm_vm_open_locked(dev, vma); | 519 | drm_vm_open_locked(dev, vma); |
521 | return 0; | 520 | return 0; |
@@ -643,21 +642,16 @@ int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) | |||
643 | case _DRM_SHM: | 642 | case _DRM_SHM: |
644 | vma->vm_ops = &drm_vm_shm_ops; | 643 | vma->vm_ops = &drm_vm_shm_ops; |
645 | vma->vm_private_data = (void *)map; | 644 | vma->vm_private_data = (void *)map; |
646 | /* Don't let this area swap. Change when | ||
647 | DRM_KERNEL advisory is supported. */ | ||
648 | vma->vm_flags |= VM_RESERVED; | ||
649 | break; | 645 | break; |
650 | case _DRM_SCATTER_GATHER: | 646 | case _DRM_SCATTER_GATHER: |
651 | vma->vm_ops = &drm_vm_sg_ops; | 647 | vma->vm_ops = &drm_vm_sg_ops; |
652 | vma->vm_private_data = (void *)map; | 648 | vma->vm_private_data = (void *)map; |
653 | vma->vm_flags |= VM_RESERVED; | ||
654 | vma->vm_page_prot = drm_dma_prot(map->type, vma); | 649 | vma->vm_page_prot = drm_dma_prot(map->type, vma); |
655 | break; | 650 | break; |
656 | default: | 651 | default: |
657 | return -EINVAL; /* This should never happen. */ | 652 | return -EINVAL; /* This should never happen. */ |
658 | } | 653 | } |
659 | vma->vm_flags |= VM_RESERVED; /* Don't swap */ | 654 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
660 | vma->vm_flags |= VM_DONTEXPAND; | ||
661 | 655 | ||
662 | drm_vm_open_locked(dev, vma); | 656 | drm_vm_open_locked(dev, vma); |
663 | return 0; | 657 | return 0; |
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index fcdbe46914f7..d2545560664f 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c | |||
@@ -500,7 +500,7 @@ static int exynos_drm_gem_mmap_buffer(struct file *filp, | |||
500 | 500 | ||
501 | DRM_DEBUG_KMS("%s\n", __FILE__); | 501 | DRM_DEBUG_KMS("%s\n", __FILE__); |
502 | 502 | ||
503 | vma->vm_flags |= (VM_IO | VM_RESERVED); | 503 | vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; |
504 | 504 | ||
505 | update_vm_cache_attr(exynos_gem_obj, vma); | 505 | update_vm_cache_attr(exynos_gem_obj, vma); |
506 | 506 | ||
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 884ba73ac6ce..afded54dbb10 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c | |||
@@ -178,8 +178,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma) | |||
178 | */ | 178 | */ |
179 | vma->vm_ops = &psbfb_vm_ops; | 179 | vma->vm_ops = &psbfb_vm_ops; |
180 | vma->vm_private_data = (void *)psbfb; | 180 | vma->vm_private_data = (void *)psbfb; |
181 | vma->vm_flags |= VM_RESERVED | VM_IO | | 181 | vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; |
182 | VM_MIXEDMAP | VM_DONTEXPAND; | ||
183 | return 0; | 182 | return 0; |
184 | } | 183 | } |
185 | 184 | ||
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index a877813571a4..3ba72dbdc4bd 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c | |||
@@ -285,7 +285,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, | |||
285 | */ | 285 | */ |
286 | 286 | ||
287 | vma->vm_private_data = bo; | 287 | vma->vm_private_data = bo; |
288 | vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; | 288 | vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; |
289 | return 0; | 289 | return 0; |
290 | out_unref: | 290 | out_unref: |
291 | ttm_bo_unref(&bo); | 291 | ttm_bo_unref(&bo); |
@@ -300,7 +300,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo) | |||
300 | 300 | ||
301 | vma->vm_ops = &ttm_bo_vm_ops; | 301 | vma->vm_ops = &ttm_bo_vm_ops; |
302 | vma->vm_private_data = ttm_bo_reference(bo); | 302 | vma->vm_private_data = ttm_bo_reference(bo); |
303 | vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; | 303 | vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; |
304 | return 0; | 304 | return 0; |
305 | } | 305 | } |
306 | EXPORT_SYMBOL(ttm_fbdev_mmap); | 306 | EXPORT_SYMBOL(ttm_fbdev_mmap); |
diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c index 67df842fbb33..69a2b16f42a6 100644 --- a/drivers/gpu/drm/udl/udl_fb.c +++ b/drivers/gpu/drm/udl/udl_fb.c | |||
@@ -243,7 +243,7 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma) | |||
243 | size = 0; | 243 | size = 0; |
244 | } | 244 | } |
245 | 245 | ||
246 | vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ | 246 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ |
247 | return 0; | 247 | return 0; |
248 | } | 248 | } |
249 | 249 | ||
diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c index 45ee89b65c23..1a1d5d99fcf9 100644 --- a/drivers/infiniband/hw/ehca/ehca_uverbs.c +++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c | |||
@@ -117,7 +117,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, | |||
117 | physical = galpas->user.fw_handle; | 117 | physical = galpas->user.fw_handle; |
118 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 118 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
119 | ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); | 119 | ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); |
120 | /* VM_IO | VM_RESERVED are set by remap_pfn_range() */ | 120 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ |
121 | ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, | 121 | ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, |
122 | vma->vm_page_prot); | 122 | vma->vm_page_prot); |
123 | if (unlikely(ret)) { | 123 | if (unlikely(ret)) { |
@@ -139,7 +139,7 @@ static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue, | |||
139 | u64 start, ofs; | 139 | u64 start, ofs; |
140 | struct page *page; | 140 | struct page *page; |
141 | 141 | ||
142 | vma->vm_flags |= VM_RESERVED; | 142 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
143 | start = vma->vm_start; | 143 | start = vma->vm_start; |
144 | for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) { | 144 | for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) { |
145 | u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); | 145 | u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); |
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 736d9edbdbe7..3eb7e454849b 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c | |||
@@ -1225,7 +1225,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, | |||
1225 | 1225 | ||
1226 | vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; | 1226 | vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; |
1227 | vma->vm_ops = &ipath_file_vm_ops; | 1227 | vma->vm_ops = &ipath_file_vm_ops; |
1228 | vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; | 1228 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
1229 | ret = 1; | 1229 | ret = 1; |
1230 | 1230 | ||
1231 | bail: | 1231 | bail: |
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index faa44cb08071..959a5c4ff812 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c | |||
@@ -971,7 +971,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, | |||
971 | 971 | ||
972 | vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; | 972 | vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; |
973 | vma->vm_ops = &qib_file_vm_ops; | 973 | vma->vm_ops = &qib_file_vm_ops; |
974 | vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; | 974 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
975 | ret = 1; | 975 | ret = 1; |
976 | 976 | ||
977 | bail: | 977 | bail: |
diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c index 7bc775219f97..e5a76da86081 100644 --- a/drivers/media/pci/meye/meye.c +++ b/drivers/media/pci/meye/meye.c | |||
@@ -1647,7 +1647,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma) | |||
1647 | 1647 | ||
1648 | vma->vm_ops = &meye_vm_ops; | 1648 | vma->vm_ops = &meye_vm_ops; |
1649 | vma->vm_flags &= ~VM_IO; /* not I/O memory */ | 1649 | vma->vm_flags &= ~VM_IO; /* not I/O memory */ |
1650 | vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ | 1650 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
1651 | vma->vm_private_data = (void *) (offset / gbufsize); | 1651 | vma->vm_private_data = (void *) (offset / gbufsize); |
1652 | meye_vm_open(vma); | 1652 | meye_vm_open(vma); |
1653 | 1653 | ||
diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c index 66ac21d466af..134016f0e660 100644 --- a/drivers/media/platform/omap/omap_vout.c +++ b/drivers/media/platform/omap/omap_vout.c | |||
@@ -911,7 +911,7 @@ static int omap_vout_mmap(struct file *file, struct vm_area_struct *vma) | |||
911 | 911 | ||
912 | q->bufs[i]->baddr = vma->vm_start; | 912 | q->bufs[i]->baddr = vma->vm_start; |
913 | 913 | ||
914 | vma->vm_flags |= VM_RESERVED; | 914 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
915 | vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); | 915 | vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); |
916 | vma->vm_ops = &omap_vout_vm_ops; | 916 | vma->vm_ops = &omap_vout_vm_ops; |
917 | vma->vm_private_data = (void *) vout; | 917 | vma->vm_private_data = (void *) vout; |
diff --git a/drivers/media/platform/vino.c b/drivers/media/platform/vino.c index 790d96cffeea..70b0bf4b2900 100644 --- a/drivers/media/platform/vino.c +++ b/drivers/media/platform/vino.c | |||
@@ -3950,7 +3950,7 @@ found: | |||
3950 | 3950 | ||
3951 | fb->map_count = 1; | 3951 | fb->map_count = 1; |
3952 | 3952 | ||
3953 | vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; | 3953 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
3954 | vma->vm_flags &= ~VM_IO; | 3954 | vma->vm_flags &= ~VM_IO; |
3955 | vma->vm_private_data = fb; | 3955 | vma->vm_private_data = fb; |
3956 | vma->vm_file = file; | 3956 | vma->vm_file = file; |
diff --git a/drivers/media/usb/sn9c102/sn9c102_core.c b/drivers/media/usb/sn9c102/sn9c102_core.c index 19ea780b16ff..5bfc8e2f018f 100644 --- a/drivers/media/usb/sn9c102/sn9c102_core.c +++ b/drivers/media/usb/sn9c102/sn9c102_core.c | |||
@@ -2126,8 +2126,7 @@ static int sn9c102_mmap(struct file* filp, struct vm_area_struct *vma) | |||
2126 | return -EINVAL; | 2126 | return -EINVAL; |
2127 | } | 2127 | } |
2128 | 2128 | ||
2129 | vma->vm_flags |= VM_IO; | 2129 | vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; |
2130 | vma->vm_flags |= VM_RESERVED; | ||
2131 | 2130 | ||
2132 | pos = cam->frame[i].bufmem; | 2131 | pos = cam->frame[i].bufmem; |
2133 | while (size > 0) { /* size is page-aligned */ | 2132 | while (size > 0) { /* size is page-aligned */ |
diff --git a/drivers/media/usb/usbvision/usbvision-video.c b/drivers/media/usb/usbvision/usbvision-video.c index f67018ed3795..5c36a57e6590 100644 --- a/drivers/media/usb/usbvision/usbvision-video.c +++ b/drivers/media/usb/usbvision/usbvision-video.c | |||
@@ -1108,8 +1108,7 @@ static int usbvision_mmap(struct file *file, struct vm_area_struct *vma) | |||
1108 | } | 1108 | } |
1109 | 1109 | ||
1110 | /* VM_IO is eventually going to replace PageReserved altogether */ | 1110 | /* VM_IO is eventually going to replace PageReserved altogether */ |
1111 | vma->vm_flags |= VM_IO; | 1111 | vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; |
1112 | vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ | ||
1113 | 1112 | ||
1114 | pos = usbvision->frame[i].data; | 1113 | pos = usbvision->frame[i].data; |
1115 | while (size > 0) { | 1114 | while (size > 0) { |
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f300deafd268..828e7c10bd70 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c | |||
@@ -582,7 +582,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, | |||
582 | map->count = 1; | 582 | map->count = 1; |
583 | map->q = q; | 583 | map->q = q; |
584 | vma->vm_ops = &videobuf_vm_ops; | 584 | vma->vm_ops = &videobuf_vm_ops; |
585 | vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; | 585 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
586 | vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ | 586 | vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ |
587 | vma->vm_private_data = map; | 587 | vma->vm_private_data = map; |
588 | dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", | 588 | dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", |
diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c index df142580e44c..2ff7fcc77b11 100644 --- a/drivers/media/v4l2-core/videobuf-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf-vmalloc.c | |||
@@ -270,7 +270,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, | |||
270 | } | 270 | } |
271 | 271 | ||
272 | vma->vm_ops = &videobuf_vm_ops; | 272 | vma->vm_ops = &videobuf_vm_ops; |
273 | vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; | 273 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
274 | vma->vm_private_data = map; | 274 | vma->vm_private_data = map; |
275 | 275 | ||
276 | dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", | 276 | dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", |
diff --git a/drivers/media/v4l2-core/videobuf2-memops.c b/drivers/media/v4l2-core/videobuf2-memops.c index 504cd4cbe29e..051ea3571b20 100644 --- a/drivers/media/v4l2-core/videobuf2-memops.c +++ b/drivers/media/v4l2-core/videobuf2-memops.c | |||
@@ -163,7 +163,7 @@ int vb2_mmap_pfn_range(struct vm_area_struct *vma, unsigned long paddr, | |||
163 | return ret; | 163 | return ret; |
164 | } | 164 | } |
165 | 165 | ||
166 | vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; | 166 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
167 | vma->vm_private_data = priv; | 167 | vma->vm_private_data = priv; |
168 | vma->vm_ops = vm_ops; | 168 | vma->vm_ops = vm_ops; |
169 | 169 | ||
diff --git a/drivers/misc/carma/carma-fpga.c b/drivers/misc/carma/carma-fpga.c index 0c43297ed9ac..8835eabb3b87 100644 --- a/drivers/misc/carma/carma-fpga.c +++ b/drivers/misc/carma/carma-fpga.c | |||
@@ -1243,8 +1243,6 @@ static int data_mmap(struct file *filp, struct vm_area_struct *vma) | |||
1243 | return -EINVAL; | 1243 | return -EINVAL; |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | /* IO memory (stop cacheing) */ | ||
1247 | vma->vm_flags |= VM_IO | VM_RESERVED; | ||
1248 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 1246 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
1249 | 1247 | ||
1250 | return io_remap_pfn_range(vma, vma->vm_start, addr, vsize, | 1248 | return io_remap_pfn_range(vma, vma->vm_start, addr, vsize, |
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index ecafa4ba238b..492c8cac69ac 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c | |||
@@ -108,9 +108,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
108 | vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) | 108 | vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) |
109 | return -EINVAL; | 109 | return -EINVAL; |
110 | 110 | ||
111 | vma->vm_flags |= | 111 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED | |
112 | (VM_IO | VM_DONTCOPY | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP | | 112 | VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; |
113 | VM_RESERVED); | ||
114 | vma->vm_page_prot = PAGE_SHARED; | 113 | vma->vm_page_prot = PAGE_SHARED; |
115 | vma->vm_ops = &gru_vm_ops; | 114 | vma->vm_ops = &gru_vm_ops; |
116 | 115 | ||
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index a6e74514e662..73ae81a629f2 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c | |||
@@ -1182,7 +1182,7 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma) | |||
1182 | return -EINVAL; | 1182 | return -EINVAL; |
1183 | if (set_vm_offset(vma, off) < 0) | 1183 | if (set_vm_offset(vma, off) < 0) |
1184 | return -EINVAL; | 1184 | return -EINVAL; |
1185 | vma->vm_flags |= VM_IO | VM_RESERVED; | 1185 | vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; |
1186 | 1186 | ||
1187 | #ifdef pgprot_noncached | 1187 | #ifdef pgprot_noncached |
1188 | if (file->f_flags & O_DSYNC || off >= __pa(high_memory)) | 1188 | if (file->f_flags & O_DSYNC || off >= __pa(high_memory)) |
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 575730744fdb..b9adff543f5f 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c | |||
@@ -1056,8 +1056,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); | |||
1056 | * until the request succeeds or until the allocation size falls below | 1056 | * until the request succeeds or until the allocation size falls below |
1057 | * the system page size. This attempts to make sure it does not adversely | 1057 | * the system page size. This attempts to make sure it does not adversely |
1058 | * impact system performance, so when allocating more than one page, we | 1058 | * impact system performance, so when allocating more than one page, we |
1059 | * ask the memory allocator to avoid re-trying, swapping, writing back | 1059 | * ask the memory allocator to avoid re-trying. |
1060 | * or performing I/O. | ||
1061 | * | 1060 | * |
1062 | * Note, this function also makes sure that the allocated buffer is aligned to | 1061 | * Note, this function also makes sure that the allocated buffer is aligned to |
1063 | * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. | 1062 | * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. |
@@ -1071,8 +1070,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); | |||
1071 | */ | 1070 | */ |
1072 | void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) | 1071 | void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) |
1073 | { | 1072 | { |
1074 | gfp_t flags = __GFP_NOWARN | __GFP_WAIT | | 1073 | gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY; |
1075 | __GFP_NORETRY | __GFP_NO_KSWAPD; | ||
1076 | size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); | 1074 | size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); |
1077 | void *kbuf; | 1075 | void *kbuf; |
1078 | 1076 | ||
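The gfp change above drops __GFP_NO_KSWAPD, which this series retires, but the opportunistic idea behind mtd_kmalloc_up_to() stays the same with the remaining flags: ask quietly, refuse to retry, and shrink the request on failure. A condensed sketch of that idea, not the exact mtdcore.c body:

#include <linux/kernel.h>
#include <linux/slab.h>

static void *demo_kmalloc_up_to(size_t *size, size_t min_alloc)
{
	gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
	void *kbuf;

	while (*size > min_alloc) {
		kbuf = kmalloc(*size, flags);
		if (kbuf)
			return kbuf;
		/* Back off to a smaller buffer rather than pressuring reclaim. */
		*size = max_t(size_t, *size >> 1, min_alloc);
	}
	/* Final small attempt may block and retry like a normal allocation. */
	return kmalloc(*size, GFP_KERNEL);
}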
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index f34b5b29fb95..d93b2b6b1f7a 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c | |||
@@ -216,7 +216,7 @@ static inline unsigned long fast_get_dcookie(struct path *path) | |||
216 | } | 216 | } |
217 | 217 | ||
218 | 218 | ||
219 | /* Look up the dcookie for the task's first VM_EXECUTABLE mapping, | 219 | /* Look up the dcookie for the task's mm->exe_file, |
220 | * which corresponds loosely to "application name". This is | 220 | * which corresponds loosely to "application name". This is |
221 | * not strictly necessary but allows oprofile to associate | 221 | * not strictly necessary but allows oprofile to associate |
222 | * shared-library samples with particular applications | 222 | * shared-library samples with particular applications |
@@ -224,21 +224,10 @@ static inline unsigned long fast_get_dcookie(struct path *path) | |||
224 | static unsigned long get_exec_dcookie(struct mm_struct *mm) | 224 | static unsigned long get_exec_dcookie(struct mm_struct *mm) |
225 | { | 225 | { |
226 | unsigned long cookie = NO_COOKIE; | 226 | unsigned long cookie = NO_COOKIE; |
227 | struct vm_area_struct *vma; | ||
228 | |||
229 | if (!mm) | ||
230 | goto out; | ||
231 | 227 | ||
232 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 228 | if (mm && mm->exe_file) |
233 | if (!vma->vm_file) | 229 | cookie = fast_get_dcookie(&mm->exe_file->f_path); |
234 | continue; | ||
235 | if (!(vma->vm_flags & VM_EXECUTABLE)) | ||
236 | continue; | ||
237 | cookie = fast_get_dcookie(&vma->vm_file->f_path); | ||
238 | break; | ||
239 | } | ||
240 | 230 | ||
241 | out: | ||
242 | return cookie; | 231 | return cookie; |
243 | } | 232 | } |
244 | 233 | ||
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9c5c5f2b3962..be2c9a6561ff 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c | |||
@@ -1257,7 +1257,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) | |||
1257 | } | 1257 | } |
1258 | 1258 | ||
1259 | sfp->mmap_called = 1; | 1259 | sfp->mmap_called = 1; |
1260 | vma->vm_flags |= VM_RESERVED; | 1260 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
1261 | vma->vm_private_data = sfp; | 1261 | vma->vm_private_data = sfp; |
1262 | vma->vm_ops = &sg_mmap_vm_ops; | 1262 | vma->vm_ops = &sg_mmap_vm_ops; |
1263 | return 0; | 1263 | return 0; |
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 94a740d2883d..634b9ae713e0 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c | |||
@@ -332,7 +332,6 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) | |||
332 | if (vma->vm_file) | 332 | if (vma->vm_file) |
333 | fput(vma->vm_file); | 333 | fput(vma->vm_file); |
334 | vma->vm_file = asma->file; | 334 | vma->vm_file = asma->file; |
335 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
336 | 335 | ||
337 | out: | 336 | out: |
338 | mutex_unlock(&ashmem_mutex); | 337 | mutex_unlock(&ashmem_mutex); |
diff --git a/drivers/staging/omapdrm/omap_gem_dmabuf.c b/drivers/staging/omapdrm/omap_gem_dmabuf.c index 42728e0cc194..c6f3ef6f57b9 100644 --- a/drivers/staging/omapdrm/omap_gem_dmabuf.c +++ b/drivers/staging/omapdrm/omap_gem_dmabuf.c | |||
@@ -160,7 +160,7 @@ static int omap_gem_dmabuf_mmap(struct dma_buf *buffer, | |||
160 | goto out_unlock; | 160 | goto out_unlock; |
161 | } | 161 | } |
162 | 162 | ||
163 | vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; | 163 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; |
164 | vma->vm_ops = obj->dev->driver->gem_vm_ops; | 164 | vma->vm_ops = obj->dev->driver->gem_vm_ops; |
165 | vma->vm_private_data = obj; | 165 | vma->vm_private_data = obj; |
166 | vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); | 166 | vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); |
diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c index bddea1d3b2c3..701a11ac676d 100644 --- a/drivers/staging/tidspbridge/rmgr/drv_interface.c +++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c | |||
@@ -261,7 +261,7 @@ static int bridge_mmap(struct file *filp, struct vm_area_struct *vma) | |||
261 | { | 261 | { |
262 | u32 status; | 262 | u32 status; |
263 | 263 | ||
264 | vma->vm_flags |= VM_RESERVED | VM_IO; | 264 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ |
265 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 265 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
266 | 266 | ||
267 | dev_dbg(bridge, "%s: vm filp %p start %lx end %lx page_prot %ulx " | 267 | dev_dbg(bridge, "%s: vm filp %p start %lx end %lx page_prot %ulx " |
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a783d533a1a6..5110f367f1f1 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c | |||
@@ -653,8 +653,6 @@ static int uio_mmap_physical(struct vm_area_struct *vma) | |||
653 | if (mi < 0) | 653 | if (mi < 0) |
654 | return -EINVAL; | 654 | return -EINVAL; |
655 | 655 | ||
656 | vma->vm_flags |= VM_IO | VM_RESERVED; | ||
657 | |||
658 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 656 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
659 | 657 | ||
660 | return remap_pfn_range(vma, | 658 | return remap_pfn_range(vma, |
@@ -666,7 +664,7 @@ static int uio_mmap_physical(struct vm_area_struct *vma) | |||
666 | 664 | ||
667 | static int uio_mmap_logical(struct vm_area_struct *vma) | 665 | static int uio_mmap_logical(struct vm_area_struct *vma) |
668 | { | 666 | { |
669 | vma->vm_flags |= VM_RESERVED; | 667 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
670 | vma->vm_ops = &uio_vm_ops; | 668 | vma->vm_ops = &uio_vm_ops; |
671 | uio_vma_open(vma); | 669 | uio_vma_open(vma); |
672 | return 0; | 670 | return 0; |
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 91cd85076a44..9a62e89d6dc0 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c | |||
@@ -1247,7 +1247,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) | |||
1247 | { | 1247 | { |
1248 | /* don't do anything here: "fault" will set up page table entries */ | 1248 | /* don't do anything here: "fault" will set up page table entries */ |
1249 | vma->vm_ops = &mon_bin_vm_ops; | 1249 | vma->vm_ops = &mon_bin_vm_ops; |
1250 | vma->vm_flags |= VM_RESERVED; | 1250 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
1251 | vma->vm_private_data = filp->private_data; | 1251 | vma->vm_private_data = filp->private_data; |
1252 | mon_bin_vma_open(vma); | 1252 | mon_bin_vma_open(vma); |
1253 | return 0; | 1253 | return 0; |
diff --git a/drivers/video/68328fb.c b/drivers/video/68328fb.c index a425d65d5ba2..fa44fbed397d 100644 --- a/drivers/video/68328fb.c +++ b/drivers/video/68328fb.c | |||
@@ -400,7 +400,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma) | |||
400 | #ifndef MMU | 400 | #ifndef MMU |
401 | /* this is uClinux (no MMU) specific code */ | 401 | /* this is uClinux (no MMU) specific code */ |
402 | 402 | ||
403 | vma->vm_flags |= VM_RESERVED; | 403 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
404 | vma->vm_start = videomemory; | 404 | vma->vm_start = videomemory; |
405 | 405 | ||
406 | return 0; | 406 | return 0; |
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index 3f2e8c13f1ca..868932f904ef 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c | |||
@@ -1942,8 +1942,7 @@ static int atyfb_mmap(struct fb_info *info, struct vm_area_struct *vma) | |||
1942 | off = vma->vm_pgoff << PAGE_SHIFT; | 1942 | off = vma->vm_pgoff << PAGE_SHIFT; |
1943 | size = vma->vm_end - vma->vm_start; | 1943 | size = vma->vm_end - vma->vm_start; |
1944 | 1944 | ||
1945 | /* To stop the swapper from even considering these pages. */ | 1945 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ |
1946 | vma->vm_flags |= (VM_IO | VM_RESERVED); | ||
1947 | 1946 | ||
1948 | if (((vma->vm_pgoff == 0) && (size == info->fix.smem_len)) || | 1947 | if (((vma->vm_pgoff == 0) && (size == info->fix.smem_len)) || |
1949 | ((off == info->fix.smem_len) && (size == PAGE_SIZE))) | 1948 | ((off == info->fix.smem_len) && (size == PAGE_SIZE))) |
diff --git a/drivers/video/fb-puv3.c b/drivers/video/fb-puv3.c index 60a787fa32cf..7d106f1f4906 100644 --- a/drivers/video/fb-puv3.c +++ b/drivers/video/fb-puv3.c | |||
@@ -653,9 +653,8 @@ int unifb_mmap(struct fb_info *info, | |||
653 | vma->vm_page_prot)) | 653 | vma->vm_page_prot)) |
654 | return -EAGAIN; | 654 | return -EAGAIN; |
655 | 655 | ||
656 | vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ | 656 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ |
657 | return 0; | 657 | return 0; |
658 | |||
659 | } | 658 | } |
660 | 659 | ||
661 | static struct fb_ops unifb_ops = { | 660 | static struct fb_ops unifb_ops = { |
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 64cda560c488..88cad6b8b479 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c | |||
@@ -166,7 +166,7 @@ static const struct address_space_operations fb_deferred_io_aops = { | |||
166 | static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) | 166 | static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) |
167 | { | 167 | { |
168 | vma->vm_ops = &fb_deferred_io_vm_ops; | 168 | vma->vm_ops = &fb_deferred_io_vm_ops; |
169 | vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND ); | 169 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
170 | if (!(info->flags & FBINFO_VIRTFB)) | 170 | if (!(info->flags & FBINFO_VIRTFB)) |
171 | vma->vm_flags |= VM_IO; | 171 | vma->vm_flags |= VM_IO; |
172 | vma->vm_private_data = info; | 172 | vma->vm_private_data = info; |
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 0dff12a1daef..3ff0105a496a 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c | |||
@@ -1410,8 +1410,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) | |||
1410 | return -EINVAL; | 1410 | return -EINVAL; |
1411 | off += start; | 1411 | off += start; |
1412 | vma->vm_pgoff = off >> PAGE_SHIFT; | 1412 | vma->vm_pgoff = off >> PAGE_SHIFT; |
1413 | /* This is an IO map - tell maydump to skip this VMA */ | 1413 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by io_remap_pfn_range()*/ |
1414 | vma->vm_flags |= VM_IO | VM_RESERVED; | ||
1415 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 1414 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
1416 | fb_pgprotect(file, vma, off); | 1415 | fb_pgprotect(file, vma, off); |
1417 | if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, | 1416 | if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, |
diff --git a/drivers/video/gbefb.c b/drivers/video/gbefb.c index 7e7b7a9ba274..05e2a8a99d8f 100644 --- a/drivers/video/gbefb.c +++ b/drivers/video/gbefb.c | |||
@@ -1024,7 +1024,7 @@ static int gbefb_mmap(struct fb_info *info, | |||
1024 | pgprot_val(vma->vm_page_prot) = | 1024 | pgprot_val(vma->vm_page_prot) = |
1025 | pgprot_fb(pgprot_val(vma->vm_page_prot)); | 1025 | pgprot_fb(pgprot_val(vma->vm_page_prot)); |
1026 | 1026 | ||
1027 | vma->vm_flags |= VM_IO | VM_RESERVED; | 1027 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ |
1028 | 1028 | ||
1029 | /* look for the starting tile */ | 1029 | /* look for the starting tile */ |
1030 | tile = &gbe_tiles.cpu[offset >> TILE_SHIFT]; | 1030 | tile = &gbe_tiles.cpu[offset >> TILE_SHIFT]; |
diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c index 3c39aa8de928..15373f4aee19 100644 --- a/drivers/video/omap2/omapfb/omapfb-main.c +++ b/drivers/video/omap2/omapfb/omapfb-main.c | |||
@@ -1128,7 +1128,7 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma) | |||
1128 | DBG("user mmap region start %lx, len %d, off %lx\n", start, len, off); | 1128 | DBG("user mmap region start %lx, len %d, off %lx\n", start, len, off); |
1129 | 1129 | ||
1130 | vma->vm_pgoff = off >> PAGE_SHIFT; | 1130 | vma->vm_pgoff = off >> PAGE_SHIFT; |
1131 | vma->vm_flags |= VM_IO | VM_RESERVED; | 1131 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ |
1132 | vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); | 1132 | vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); |
1133 | vma->vm_ops = &mmap_user_ops; | 1133 | vma->vm_ops = &mmap_user_ops; |
1134 | vma->vm_private_data = rg; | 1134 | vma->vm_private_data = rg; |
diff --git a/drivers/video/sbuslib.c b/drivers/video/sbuslib.c index 3c1de981a18c..296afae442f4 100644 --- a/drivers/video/sbuslib.c +++ b/drivers/video/sbuslib.c | |||
@@ -57,9 +57,8 @@ int sbusfb_mmap_helper(struct sbus_mmap_map *map, | |||
57 | 57 | ||
58 | off = vma->vm_pgoff << PAGE_SHIFT; | 58 | off = vma->vm_pgoff << PAGE_SHIFT; |
59 | 59 | ||
60 | /* To stop the swapper from even considering these pages */ | 60 | /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ |
61 | vma->vm_flags |= (VM_IO | VM_RESERVED); | 61 | |
62 | |||
63 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 62 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
64 | 63 | ||
65 | /* Each page, see which map applies */ | 64 | /* Each page, see which map applies */ |
diff --git a/drivers/video/smscufx.c b/drivers/video/smscufx.c index 5533a32c6ca1..97bd6620c364 100644 --- a/drivers/video/smscufx.c +++ b/drivers/video/smscufx.c | |||
@@ -803,7 +803,6 @@ static int ufx_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) | |||
803 | size = 0; | 803 | size = 0; |
804 | } | 804 | } |
805 | 805 | ||
806 | vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ | ||
807 | return 0; | 806 | return 0; |
808 | } | 807 | } |
809 | 808 | ||
diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index 8af64148294b..f45eba3d6150 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c | |||
@@ -345,7 +345,6 @@ static int dlfb_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) | |||
345 | size = 0; | 345 | size = 0; |
346 | } | 346 | } |
347 | 347 | ||
348 | vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ | ||
349 | return 0; | 348 | return 0; |
350 | } | 349 | } |
351 | 350 | ||
diff --git a/drivers/video/vermilion/vermilion.c b/drivers/video/vermilion/vermilion.c index 970e43d13f52..89aef343e295 100644 --- a/drivers/video/vermilion/vermilion.c +++ b/drivers/video/vermilion/vermilion.c | |||
@@ -1018,7 +1018,6 @@ static int vmlfb_mmap(struct fb_info *info, struct vm_area_struct *vma) | |||
1018 | offset += vinfo->vram_start; | 1018 | offset += vinfo->vram_start; |
1019 | pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; | 1019 | pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; |
1020 | pgprot_val(vma->vm_page_prot) &= ~_PAGE_PWT; | 1020 | pgprot_val(vma->vm_page_prot) &= ~_PAGE_PWT; |
1021 | vma->vm_flags |= VM_RESERVED | VM_IO; | ||
1022 | if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT, | 1021 | if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT, |
1023 | size, vma->vm_page_prot)) | 1022 | size, vma->vm_page_prot)) |
1024 | return -EAGAIN; | 1023 | return -EAGAIN; |
diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c index 501a922aa9dc..c7f692525b88 100644 --- a/drivers/video/vfb.c +++ b/drivers/video/vfb.c | |||
@@ -439,7 +439,6 @@ static int vfb_mmap(struct fb_info *info, | |||
439 | size = 0; | 439 | size = 0; |
440 | } | 440 | } |
441 | 441 | ||
442 | vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ | ||
443 | return 0; | 442 | return 0; |
444 | 443 | ||
445 | } | 444 | } |
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 934985d14c24..4097987b330e 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c | |||
@@ -535,7 +535,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) | |||
535 | 535 | ||
536 | vma->vm_private_data = vm_priv; | 536 | vma->vm_private_data = vm_priv; |
537 | 537 | ||
538 | vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; | 538 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
539 | 539 | ||
540 | vma->vm_ops = &gntalloc_vmops; | 540 | vma->vm_ops = &gntalloc_vmops; |
541 | 541 | ||
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 5df9fd847b2e..610bfc6be177 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c | |||
@@ -720,7 +720,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) | |||
720 | 720 | ||
721 | vma->vm_ops = &gntdev_vmops; | 721 | vma->vm_ops = &gntdev_vmops; |
722 | 722 | ||
723 | vma->vm_flags |= VM_RESERVED|VM_DONTEXPAND; | 723 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
724 | 724 | ||
725 | if (use_ptemod) | 725 | if (use_ptemod) |
726 | vma->vm_flags |= VM_DONTCOPY; | 726 | vma->vm_flags |= VM_DONTCOPY; |
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ef6389580b8c..8adb9cc267f9 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c | |||
@@ -455,7 +455,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) | |||
455 | { | 455 | { |
456 | /* DONTCOPY is essential for Xen because copy_page_range doesn't know | 456 | /* DONTCOPY is essential for Xen because copy_page_range doesn't know |
457 | * how to recreate these mappings */ | 457 | * how to recreate these mappings */ |
458 | vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; | 458 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | |
459 | VM_DONTEXPAND | VM_DONTDUMP; | ||
459 | vma->vm_ops = &privcmd_vm_ops; | 460 | vma->vm_ops = &privcmd_vm_ops; |
460 | vma->vm_private_data = NULL; | 461 | vma->vm_private_data = NULL; |
461 | 462 | ||
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index dd6f7ee1e312..c2483e97beee 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c | |||
@@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, | |||
738 | static const struct vm_operations_struct v9fs_file_vm_ops = { | 738 | static const struct vm_operations_struct v9fs_file_vm_ops = { |
739 | .fault = filemap_fault, | 739 | .fault = filemap_fault, |
740 | .page_mkwrite = v9fs_vm_page_mkwrite, | 740 | .page_mkwrite = v9fs_vm_page_mkwrite, |
741 | .remap_pages = generic_file_remap_pages, | ||
741 | }; | 742 | }; |
742 | 743 | ||
743 | 744 | ||
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 28a64e769527..e800dec958c3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
@@ -1123,7 +1123,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, | |||
1123 | if (always_dump_vma(vma)) | 1123 | if (always_dump_vma(vma)) |
1124 | goto whole; | 1124 | goto whole; |
1125 | 1125 | ||
1126 | if (vma->vm_flags & VM_NODUMP) | 1126 | if (vma->vm_flags & VM_DONTDUMP) |
1127 | return 0; | 1127 | return 0; |
1128 | 1128 | ||
1129 | /* Hugetlb memory check */ | 1129 | /* Hugetlb memory check */ |
@@ -1135,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, | |||
1135 | } | 1135 | } |
1136 | 1136 | ||
1137 | /* Do not dump I/O mapped devices or special mappings */ | 1137 | /* Do not dump I/O mapped devices or special mappings */ |
1138 | if (vma->vm_flags & (VM_IO | VM_RESERVED)) | 1138 | if (vma->vm_flags & VM_IO) |
1139 | return 0; | 1139 | return 0; |
1140 | 1140 | ||
1141 | /* By default, dump shared memory if mapped from an anonymous file. */ | 1141 | /* By default, dump shared memory if mapped from an anonymous file. */ |
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d812b32282..262db114ff01 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c | |||
@@ -1205,7 +1205,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) | |||
1205 | int dump_ok; | 1205 | int dump_ok; |
1206 | 1206 | ||
1207 | /* Do not dump I/O mapped devices or special mappings */ | 1207 | /* Do not dump I/O mapped devices or special mappings */ |
1208 | if (vma->vm_flags & (VM_IO | VM_RESERVED)) { | 1208 | if (vma->vm_flags & VM_IO) { |
1209 | kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); | 1209 | kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); |
1210 | return 0; | 1210 | return 0; |
1211 | } | 1211 | } |
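Both ELF core-dump paths above now derive the dump decision from the split-out flags instead of VM_RESERVED: VM_DONTDUMP (the renamed VM_NODUMP, also set by madvise(MADV_DONTDUMP)) is the explicit opt-out, and VM_IO alone is enough to exclude device mappings. The test reduces to roughly this sketch:

#include <linux/mm.h>

static bool demo_skip_vma_in_coredump(const struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_DONTDUMP)	/* explicit per-vma opt-out */
		return true;
	if (vma->vm_flags & VM_IO)		/* never dump device mappings */
		return true;
	return false;
}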
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285c6e4d..f6b40e86121b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -1599,6 +1599,7 @@ out: | |||
1599 | static const struct vm_operations_struct btrfs_file_vm_ops = { | 1599 | static const struct vm_operations_struct btrfs_file_vm_ops = { |
1600 | .fault = filemap_fault, | 1600 | .fault = filemap_fault, |
1601 | .page_mkwrite = btrfs_page_mkwrite, | 1601 | .page_mkwrite = btrfs_page_mkwrite, |
1602 | .remap_pages = generic_file_remap_pages, | ||
1602 | }; | 1603 | }; |
1603 | 1604 | ||
1604 | static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) | 1605 | static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) |
@@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) | |||
1610 | 1611 | ||
1611 | file_accessed(filp); | 1612 | file_accessed(filp); |
1612 | vma->vm_ops = &btrfs_file_vm_ops; | 1613 | vma->vm_ops = &btrfs_file_vm_ops; |
1613 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1614 | 1614 | ||
1615 | return 0; | 1615 | return 0; |
1616 | } | 1616 | } |
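The filesystem hunks follow one recipe: drop VM_CAN_NONLINEAR from ->mmap and instead advertise nonlinear-mapping support by giving the vm_operations a .remap_pages method, normally generic_file_remap_pages. A sketch of the resulting pair for a hypothetical filesystem (the demo_* names are assumptions, not part of this commit):

#include <linux/fs.h>
#include <linux/mm.h>

static int demo_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);

static const struct vm_operations_struct demo_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= demo_page_mkwrite,		/* fs-specific */
	.remap_pages	= generic_file_remap_pages,	/* replaces VM_CAN_NONLINEAR */
};

static int demo_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &demo_file_vm_ops;
	return 0;	/* no vm_flags bit to set any more */
}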
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 22b6e4583faa..6690269f5dde 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -1224,6 +1224,7 @@ out: | |||
1224 | static struct vm_operations_struct ceph_vmops = { | 1224 | static struct vm_operations_struct ceph_vmops = { |
1225 | .fault = filemap_fault, | 1225 | .fault = filemap_fault, |
1226 | .page_mkwrite = ceph_page_mkwrite, | 1226 | .page_mkwrite = ceph_page_mkwrite, |
1227 | .remap_pages = generic_file_remap_pages, | ||
1227 | }; | 1228 | }; |
1228 | 1229 | ||
1229 | int ceph_mmap(struct file *file, struct vm_area_struct *vma) | 1230 | int ceph_mmap(struct file *file, struct vm_area_struct *vma) |
@@ -1234,6 +1235,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) | |||
1234 | return -ENOEXEC; | 1235 | return -ENOEXEC; |
1235 | file_accessed(file); | 1236 | file_accessed(file); |
1236 | vma->vm_ops = &ceph_vmops; | 1237 | vma->vm_ops = &ceph_vmops; |
1237 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1238 | return 0; | 1238 | return 0; |
1239 | } | 1239 | } |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7d7bbdc4c8e7..edb25b4bbb95 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -3003,6 +3003,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
3003 | static struct vm_operations_struct cifs_file_vm_ops = { | 3003 | static struct vm_operations_struct cifs_file_vm_ops = { |
3004 | .fault = filemap_fault, | 3004 | .fault = filemap_fault, |
3005 | .page_mkwrite = cifs_page_mkwrite, | 3005 | .page_mkwrite = cifs_page_mkwrite, |
3006 | .remap_pages = generic_file_remap_pages, | ||
3006 | }; | 3007 | }; |
3007 | 3008 | ||
3008 | int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) | 3009 | int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) |
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
@@ -603,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) | |||
603 | * process cleanup to remove whatever mess we made. | 603 | * process cleanup to remove whatever mess we made. |
604 | */ | 604 | */ |
605 | if (length != move_page_tables(vma, old_start, | 605 | if (length != move_page_tables(vma, old_start, |
606 | vma, new_start, length)) | 606 | vma, new_start, length, false)) |
607 | return -ENOMEM; | 607 | return -ENOMEM; |
608 | 608 | ||
609 | lru_add_drain(); | 609 | lru_add_drain(); |
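The extra false in the shift_arg_pages() hunk is the sixth argument this series adds to move_page_tables(); it tells the mover whether it has to take the anon_vma and i_mmap locks against concurrent rmap walks while the ptes are in flight. A sketch of the assumed new prototype and the exec-style call (the parameter name need_rmap_locks comes from the MM series, not from this hunk):

#include <linux/mm.h>

unsigned long move_page_tables(struct vm_area_struct *vma,
			       unsigned long old_addr,
			       struct vm_area_struct *new_vma,
			       unsigned long new_addr, unsigned long len,
			       bool need_rmap_locks);

static int demo_shift_within_vma(struct vm_area_struct *vma,
				 unsigned long old_start,
				 unsigned long new_start,
				 unsigned long length)
{
	/* exec moves ptes downwards inside one vma and passes false here. */
	if (length != move_page_tables(vma, old_start, vma, new_start,
				       length, false))
		return -ENOMEM;
	return 0;
}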
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ca6f07afe601..bf3966bccd34 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, | |||
207 | static const struct vm_operations_struct ext4_file_vm_ops = { | 207 | static const struct vm_operations_struct ext4_file_vm_ops = { |
208 | .fault = filemap_fault, | 208 | .fault = filemap_fault, |
209 | .page_mkwrite = ext4_page_mkwrite, | 209 | .page_mkwrite = ext4_page_mkwrite, |
210 | .remap_pages = generic_file_remap_pages, | ||
210 | }; | 211 | }; |
211 | 212 | ||
212 | static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) | 213 | static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) |
@@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
217 | return -ENOEXEC; | 218 | return -ENOEXEC; |
218 | file_accessed(file); | 219 | file_accessed(file); |
219 | vma->vm_ops = &ext4_file_vm_ops; | 220 | vma->vm_ops = &ext4_file_vm_ops; |
220 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
221 | return 0; | 221 | return 0; |
222 | } | 222 | } |
223 | 223 | ||
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8e1d7b9e4a33..401b6c6248ae 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -439,8 +439,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, | |||
439 | * setting I_SYNC flag and calling inode_sync_complete() to clear it. | 439 | * setting I_SYNC flag and calling inode_sync_complete() to clear it. |
440 | */ | 440 | */ |
441 | static int | 441 | static int |
442 | __writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, | 442 | __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) |
443 | struct writeback_control *wbc) | ||
444 | { | 443 | { |
445 | struct address_space *mapping = inode->i_mapping; | 444 | struct address_space *mapping = inode->i_mapping; |
446 | long nr_to_write = wbc->nr_to_write; | 445 | long nr_to_write = wbc->nr_to_write; |
@@ -527,7 +526,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, | |||
527 | inode->i_state |= I_SYNC; | 526 | inode->i_state |= I_SYNC; |
528 | spin_unlock(&inode->i_lock); | 527 | spin_unlock(&inode->i_lock); |
529 | 528 | ||
530 | ret = __writeback_single_inode(inode, wb, wbc); | 529 | ret = __writeback_single_inode(inode, wbc); |
531 | 530 | ||
532 | spin_lock(&wb->list_lock); | 531 | spin_lock(&wb->list_lock); |
533 | spin_lock(&inode->i_lock); | 532 | spin_lock(&inode->i_lock); |
@@ -670,7 +669,7 @@ static long writeback_sb_inodes(struct super_block *sb, | |||
670 | * We use I_SYNC to pin the inode in memory. While it is set | 669 | * We use I_SYNC to pin the inode in memory. While it is set |
671 | * evict_inode() will wait so the inode cannot be freed. | 670 | * evict_inode() will wait so the inode cannot be freed. |
672 | */ | 671 | */ |
673 | __writeback_single_inode(inode, wb, &wbc); | 672 | __writeback_single_inode(inode, &wbc); |
674 | 673 | ||
675 | work->nr_pages -= write_chunk - wbc.nr_to_write; | 674 | work->nr_pages -= write_chunk - wbc.nr_to_write; |
676 | wrote += write_chunk - wbc.nr_to_write; | 675 | wrote += write_chunk - wbc.nr_to_write; |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aba15f1b7ad2..78d2837bc940 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
@@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = { | |||
1379 | .close = fuse_vma_close, | 1379 | .close = fuse_vma_close, |
1380 | .fault = filemap_fault, | 1380 | .fault = filemap_fault, |
1381 | .page_mkwrite = fuse_page_mkwrite, | 1381 | .page_mkwrite = fuse_page_mkwrite, |
1382 | .remap_pages = generic_file_remap_pages, | ||
1382 | }; | 1383 | }; |
1383 | 1384 | ||
1384 | static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) | 1385 | static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) |
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 30e21997a1a1..0def0504afc1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
@@ -492,6 +492,7 @@ out: | |||
492 | static const struct vm_operations_struct gfs2_vm_ops = { | 492 | static const struct vm_operations_struct gfs2_vm_ops = { |
493 | .fault = filemap_fault, | 493 | .fault = filemap_fault, |
494 | .page_mkwrite = gfs2_page_mkwrite, | 494 | .page_mkwrite = gfs2_page_mkwrite, |
495 | .remap_pages = generic_file_remap_pages, | ||
495 | }; | 496 | }; |
496 | 497 | ||
497 | /** | 498 | /** |
@@ -526,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) | |||
526 | return error; | 527 | return error; |
527 | } | 528 | } |
528 | vma->vm_ops = &gfs2_vm_ops; | 529 | vma->vm_ops = &gfs2_vm_ops; |
529 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
530 | 530 | ||
531 | return 0; | 531 | return 0; |
532 | } | 532 | } |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9460120a5170..c5bc355d8243 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
110 | * way when do_mmap_pgoff unwinds (may be important on powerpc | 110 | * way when do_mmap_pgoff unwinds (may be important on powerpc |
111 | * and ia64). | 111 | * and ia64). |
112 | */ | 112 | */ |
113 | vma->vm_flags |= VM_HUGETLB | VM_RESERVED; | 113 | vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; |
114 | vma->vm_ops = &hugetlb_vm_ops; | 114 | vma->vm_ops = &hugetlb_vm_ops; |
115 | 115 | ||
116 | if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) | 116 | if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) |
@@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode) | |||
397 | } | 397 | } |
398 | 398 | ||
399 | static inline void | 399 | static inline void |
400 | hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) | 400 | hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) |
401 | { | 401 | { |
402 | struct vm_area_struct *vma; | 402 | struct vm_area_struct *vma; |
403 | struct prio_tree_iter iter; | ||
404 | 403 | ||
405 | vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { | 404 | vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { |
406 | unsigned long v_offset; | 405 | unsigned long v_offset; |
407 | 406 | ||
408 | /* | 407 | /* |
409 | * Can the expression below overflow on 32-bit arches? | 408 | * Can the expression below overflow on 32-bit arches? |
410 | * No, because the prio_tree returns us only those vmas | 409 | * No, because the interval tree returns us only those vmas |
411 | * which overlap the truncated area starting at pgoff, | 410 | * which overlap the truncated area starting at pgoff, |
412 | * and no vma on a 32-bit arch can span beyond the 4GB. | 411 | * and no vma on a 32-bit arch can span beyond the 4GB. |
413 | */ | 412 | */ |
@@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) | |||
432 | 431 | ||
433 | i_size_write(inode, offset); | 432 | i_size_write(inode, offset); |
434 | mutex_lock(&mapping->i_mmap_mutex); | 433 | mutex_lock(&mapping->i_mmap_mutex); |
435 | if (!prio_tree_empty(&mapping->i_mmap)) | 434 | if (!RB_EMPTY_ROOT(&mapping->i_mmap)) |
436 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); | 435 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); |
437 | mutex_unlock(&mapping->i_mmap_mutex); | 436 | mutex_unlock(&mapping->i_mmap_mutex); |
438 | truncate_hugepages(inode, offset); | 437 | truncate_hugepages(inode, offset); |
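With the prio_tree gone, address_space's i_mmap is a plain rb_root backing an interval tree: the emptiness test becomes RB_EMPTY_ROOT() and range lookups use vma_interval_tree_foreach(), as the truncate path above now does. A sketch of that walk, assuming the caller holds i_mmap_mutex as in the hunk:

#include <linux/fs.h>
#include <linux/mm.h>

static void demo_walk_mapping_range(struct address_space *mapping,
				    pgoff_t start, pgoff_t last)
{
	struct vm_area_struct *vma;

	if (RB_EMPTY_ROOT(&mapping->i_mmap))
		return;

	vma_interval_tree_foreach(vma, &mapping->i_mmap, start, last) {
		/* Each vma returned overlaps [start, last] in file-page units. */
		;
	}
}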
diff --git a/fs/inode.c b/fs/inode.c index ac8d904b3f16..b03c71957246 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping) | |||
348 | mutex_init(&mapping->i_mmap_mutex); | 348 | mutex_init(&mapping->i_mmap_mutex); |
349 | INIT_LIST_HEAD(&mapping->private_list); | 349 | INIT_LIST_HEAD(&mapping->private_list); |
350 | spin_lock_init(&mapping->private_lock); | 350 | spin_lock_init(&mapping->private_lock); |
351 | INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); | 351 | mapping->i_mmap = RB_ROOT; |
352 | INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); | 352 | INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); |
353 | } | 353 | } |
354 | EXPORT_SYMBOL(address_space_init_once); | 354 | EXPORT_SYMBOL(address_space_init_once); |
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 1ea349fff68b..ae81b01e6fd7 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c | |||
@@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, | |||
394 | } | 394 | } |
395 | 395 | ||
396 | /* Trivial function to remove the last node in the tree. Which by definition | 396 | /* Trivial function to remove the last node in the tree. Which by definition |
397 | has no right-hand -- so can be removed just by making its only child (if | 397 | has no right-hand child — so can be removed just by making its left-hand |
398 | any) take its place under its parent. */ | 398 | child (if any) take its place under its parent. Since this is only done |
399 | when we're consuming the whole tree, there's no need to use rb_erase() | ||
400 | and let it worry about adjusting colours and balancing the tree. That | ||
401 | would just be a waste of time. */ | ||
399 | static void eat_last(struct rb_root *root, struct rb_node *node) | 402 | static void eat_last(struct rb_root *root, struct rb_node *node) |
400 | { | 403 | { |
401 | struct rb_node *parent = rb_parent(node); | 404 | struct rb_node *parent = rb_parent(node); |
@@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node) | |||
412 | link = &parent->rb_right; | 415 | link = &parent->rb_right; |
413 | 416 | ||
414 | *link = node->rb_left; | 417 | *link = node->rb_left; |
415 | /* Colour doesn't matter now. Only the parent pointer. */ | ||
416 | if (node->rb_left) | 418 | if (node->rb_left) |
417 | node->rb_left->rb_parent_color = node->rb_parent_color; | 419 | node->rb_left->__rb_parent_color = node->__rb_parent_color; |
418 | } | 420 | } |
419 | 421 | ||
420 | /* We put this in reverse order, so we can just use eat_last */ | 422 | /* We put the version tree in reverse order, so we can use the same eat_last() |
423 | function that we use to consume the tmpnode tree (tn_root). */ | ||
421 | static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) | 424 | static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) |
422 | { | 425 | { |
423 | struct rb_node **link = &ver_root->rb_node; | 426 | struct rb_node **link = &ver_root->rb_node; |
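The expanded comment spells out why eat_last() is sufficient here: the caller only ever detaches the rightmost node while tearing the whole tree down, so rb_erase()'s recolouring and rebalancing would be wasted work. The pattern it describes looks roughly like this sketch (demo_consume_tree() is hypothetical; eat_last() is the static helper shown above):

#include <linux/rbtree.h>

static void demo_consume_tree(struct rb_root *root,
			      void (*process)(struct rb_node *))
{
	struct rb_node *node;

	/*
	 * rb_last() is the rightmost node and so never has a right-hand
	 * child, which is exactly the case eat_last() can unlink without
	 * rebalancing.
	 */
	while ((node = rb_last(root)) != NULL) {
		eat_last(root, node);
		process(node);
	}
}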
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6a7fcab7ecb3..f692be97676d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
@@ -578,6 +578,7 @@ out: | |||
578 | static const struct vm_operations_struct nfs_file_vm_ops = { | 578 | static const struct vm_operations_struct nfs_file_vm_ops = { |
579 | .fault = filemap_fault, | 579 | .fault = filemap_fault, |
580 | .page_mkwrite = nfs_vm_page_mkwrite, | 580 | .page_mkwrite = nfs_vm_page_mkwrite, |
581 | .remap_pages = generic_file_remap_pages, | ||
581 | }; | 582 | }; |
582 | 583 | ||
583 | static int nfs_need_sync_write(struct file *filp, struct inode *inode) | 584 | static int nfs_need_sync_write(struct file *filp, struct inode *inode) |
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 5b387a4c293e..16f35f7423c5 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c | |||
@@ -135,13 +135,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
135 | static const struct vm_operations_struct nilfs_file_vm_ops = { | 135 | static const struct vm_operations_struct nilfs_file_vm_ops = { |
136 | .fault = filemap_fault, | 136 | .fault = filemap_fault, |
137 | .page_mkwrite = nilfs_page_mkwrite, | 137 | .page_mkwrite = nilfs_page_mkwrite, |
138 | .remap_pages = generic_file_remap_pages, | ||
138 | }; | 139 | }; |
139 | 140 | ||
140 | static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) | 141 | static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) |
141 | { | 142 | { |
142 | file_accessed(file); | 143 | file_accessed(file); |
143 | vma->vm_ops = &nilfs_file_vm_ops; | 144 | vma->vm_ops = &nilfs_file_vm_ops; |
144 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
145 | return 0; | 145 | return 0; |
146 | } | 146 | } |
147 | 147 | ||
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index d150372fd81d..47a87dda54ce 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
@@ -173,6 +173,7 @@ out: | |||
173 | static const struct vm_operations_struct ocfs2_file_vm_ops = { | 173 | static const struct vm_operations_struct ocfs2_file_vm_ops = { |
174 | .fault = ocfs2_fault, | 174 | .fault = ocfs2_fault, |
175 | .page_mkwrite = ocfs2_page_mkwrite, | 175 | .page_mkwrite = ocfs2_page_mkwrite, |
176 | .remap_pages = generic_file_remap_pages, | ||
176 | }; | 177 | }; |
177 | 178 | ||
178 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | 179 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) |
@@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | |||
188 | ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); | 189 | ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); |
189 | out: | 190 | out: |
190 | vma->vm_ops = &ocfs2_file_vm_ops; | 191 | vma->vm_ops = &ocfs2_file_vm_ops; |
191 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
192 | return 0; | 192 | return 0; |
193 | } | 193 | } |
194 | 194 | ||
diff --git a/fs/proc/base.c b/fs/proc/base.c index d295af993677..ef5c84be66f9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -873,111 +873,6 @@ static const struct file_operations proc_environ_operations = { | |||
873 | .release = mem_release, | 873 | .release = mem_release, |
874 | }; | 874 | }; |
875 | 875 | ||
876 | static ssize_t oom_adjust_read(struct file *file, char __user *buf, | ||
877 | size_t count, loff_t *ppos) | ||
878 | { | ||
879 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); | ||
880 | char buffer[PROC_NUMBUF]; | ||
881 | size_t len; | ||
882 | int oom_adjust = OOM_DISABLE; | ||
883 | unsigned long flags; | ||
884 | |||
885 | if (!task) | ||
886 | return -ESRCH; | ||
887 | |||
888 | if (lock_task_sighand(task, &flags)) { | ||
889 | oom_adjust = task->signal->oom_adj; | ||
890 | unlock_task_sighand(task, &flags); | ||
891 | } | ||
892 | |||
893 | put_task_struct(task); | ||
894 | |||
895 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); | ||
896 | |||
897 | return simple_read_from_buffer(buf, count, ppos, buffer, len); | ||
898 | } | ||
899 | |||
900 | static ssize_t oom_adjust_write(struct file *file, const char __user *buf, | ||
901 | size_t count, loff_t *ppos) | ||
902 | { | ||
903 | struct task_struct *task; | ||
904 | char buffer[PROC_NUMBUF]; | ||
905 | int oom_adjust; | ||
906 | unsigned long flags; | ||
907 | int err; | ||
908 | |||
909 | memset(buffer, 0, sizeof(buffer)); | ||
910 | if (count > sizeof(buffer) - 1) | ||
911 | count = sizeof(buffer) - 1; | ||
912 | if (copy_from_user(buffer, buf, count)) { | ||
913 | err = -EFAULT; | ||
914 | goto out; | ||
915 | } | ||
916 | |||
917 | err = kstrtoint(strstrip(buffer), 0, &oom_adjust); | ||
918 | if (err) | ||
919 | goto out; | ||
920 | if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && | ||
921 | oom_adjust != OOM_DISABLE) { | ||
922 | err = -EINVAL; | ||
923 | goto out; | ||
924 | } | ||
925 | |||
926 | task = get_proc_task(file->f_path.dentry->d_inode); | ||
927 | if (!task) { | ||
928 | err = -ESRCH; | ||
929 | goto out; | ||
930 | } | ||
931 | |||
932 | task_lock(task); | ||
933 | if (!task->mm) { | ||
934 | err = -EINVAL; | ||
935 | goto err_task_lock; | ||
936 | } | ||
937 | |||
938 | if (!lock_task_sighand(task, &flags)) { | ||
939 | err = -ESRCH; | ||
940 | goto err_task_lock; | ||
941 | } | ||
942 | |||
943 | if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { | ||
944 | err = -EACCES; | ||
945 | goto err_sighand; | ||
946 | } | ||
947 | |||
948 | /* | ||
949 | * Warn that /proc/pid/oom_adj is deprecated, see | ||
950 | * Documentation/feature-removal-schedule.txt. | ||
951 | */ | ||
952 | printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", | ||
953 | current->comm, task_pid_nr(current), task_pid_nr(task), | ||
954 | task_pid_nr(task)); | ||
955 | task->signal->oom_adj = oom_adjust; | ||
956 | /* | ||
957 | * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum | ||
958 | * value is always attainable. | ||
959 | */ | ||
960 | if (task->signal->oom_adj == OOM_ADJUST_MAX) | ||
961 | task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; | ||
962 | else | ||
963 | task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / | ||
964 | -OOM_DISABLE; | ||
965 | trace_oom_score_adj_update(task); | ||
966 | err_sighand: | ||
967 | unlock_task_sighand(task, &flags); | ||
968 | err_task_lock: | ||
969 | task_unlock(task); | ||
970 | put_task_struct(task); | ||
971 | out: | ||
972 | return err < 0 ? err : count; | ||
973 | } | ||
974 | |||
975 | static const struct file_operations proc_oom_adjust_operations = { | ||
976 | .read = oom_adjust_read, | ||
977 | .write = oom_adjust_write, | ||
978 | .llseek = generic_file_llseek, | ||
979 | }; | ||
980 | |||
981 | static ssize_t oom_score_adj_read(struct file *file, char __user *buf, | 876 | static ssize_t oom_score_adj_read(struct file *file, char __user *buf, |
982 | size_t count, loff_t *ppos) | 877 | size_t count, loff_t *ppos) |
983 | { | 878 | { |
@@ -1051,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, | |||
1051 | if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) | 946 | if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) |
1052 | task->signal->oom_score_adj_min = oom_score_adj; | 947 | task->signal->oom_score_adj_min = oom_score_adj; |
1053 | trace_oom_score_adj_update(task); | 948 | trace_oom_score_adj_update(task); |
1054 | /* | 949 | |
1055 | * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is | ||
1056 | * always attainable. | ||
1057 | */ | ||
1058 | if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
1059 | task->signal->oom_adj = OOM_DISABLE; | ||
1060 | else | ||
1061 | task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / | ||
1062 | OOM_SCORE_ADJ_MAX; | ||
1063 | err_sighand: | 950 | err_sighand: |
1064 | unlock_task_sighand(task, &flags); | 951 | unlock_task_sighand(task, &flags); |
1065 | err_task_lock: | 952 | err_task_lock: |
@@ -2710,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
2710 | REG("cgroup", S_IRUGO, proc_cgroup_operations), | 2597 | REG("cgroup", S_IRUGO, proc_cgroup_operations), |
2711 | #endif | 2598 | #endif |
2712 | INF("oom_score", S_IRUGO, proc_oom_score), | 2599 | INF("oom_score", S_IRUGO, proc_oom_score), |
2713 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), | ||
2714 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), | 2600 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), |
2715 | #ifdef CONFIG_AUDITSYSCALL | 2601 | #ifdef CONFIG_AUDITSYSCALL |
2716 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), | 2602 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
@@ -3077,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = { | |||
3077 | REG("cgroup", S_IRUGO, proc_cgroup_operations), | 2963 | REG("cgroup", S_IRUGO, proc_cgroup_operations), |
3078 | #endif | 2964 | #endif |
3079 | INF("oom_score", S_IRUGO, proc_oom_score), | 2965 | INF("oom_score", S_IRUGO, proc_oom_score), |
3080 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), | ||
3081 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), | 2966 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), |
3082 | #ifdef CONFIG_AUDITSYSCALL | 2967 | #ifdef CONFIG_AUDITSYSCALL |
3083 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), | 2968 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
diff --git a/fs/proc/page.c b/fs/proc/page.c index 7fcd0d60a968..b8730d9ebaee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c | |||
@@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page) | |||
115 | u |= 1 << KPF_COMPOUND_TAIL; | 115 | u |= 1 << KPF_COMPOUND_TAIL; |
116 | if (PageHuge(page)) | 116 | if (PageHuge(page)) |
117 | u |= 1 << KPF_HUGE; | 117 | u |= 1 << KPF_HUGE; |
118 | else if (PageTransCompound(page)) | 118 | /* |
119 | * PageTransCompound can be true for non-huge compound pages (slab | ||
120 | * pages or pages allocated by drivers with __GFP_COMP) because it | ||
121 | * just checks PG_head/PG_tail, so we need to check PageLRU to make | ||
122 | * sure a given page is a thp, not a non-huge compound page. | ||
123 | */ | ||
124 | else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) | ||
119 | u |= 1 << KPF_THP; | 125 | u |= 1 << KPF_THP; |
120 | 126 | ||
121 | /* | 127 | /* |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dcd56f84db7e..a781bdf06694 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c | |||
@@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) | |||
142 | } | 142 | } |
143 | 143 | ||
144 | rb_link_node(node, parent, p); | 144 | rb_link_node(node, parent, p); |
145 | rb_insert_color(node, &head->parent->root); | ||
145 | return 0; | 146 | return 0; |
146 | } | 147 | } |
147 | 148 | ||
@@ -168,10 +169,8 @@ static void init_header(struct ctl_table_header *head, | |||
168 | head->node = node; | 169 | head->node = node; |
169 | if (node) { | 170 | if (node) { |
170 | struct ctl_table *entry; | 171 | struct ctl_table *entry; |
171 | for (entry = table; entry->procname; entry++, node++) { | 172 | for (entry = table; entry->procname; entry++, node++) |
172 | rb_init_node(&node->node); | ||
173 | node->header = head; | 173 | node->header = head; |
174 | } | ||
175 | } | 174 | } |
176 | } | 175 | } |
177 | 176 | ||
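The one-line fix above restores the standard two-step rbtree insertion: rb_link_node() only splices the node in, rb_insert_color() then recolours and rebalances. A generic sketch of the idiom, where struct my_node and my_cmp() are placeholders rather than kernel code:

static void my_rb_insert(struct rb_root *root, struct my_node *new)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;

	while (*p) {
		parent = *p;
		if (my_cmp(new, rb_entry(parent, struct my_node, rb)) < 0)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&new->rb, parent, p);
	rb_insert_color(&new->rb, root);	/* the call this hunk adds back */
}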
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4540b8f76f16..79827ce03e3b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) | |||
54 | "VmPTE:\t%8lu kB\n" | 54 | "VmPTE:\t%8lu kB\n" |
55 | "VmSwap:\t%8lu kB\n", | 55 | "VmSwap:\t%8lu kB\n", |
56 | hiwater_vm << (PAGE_SHIFT-10), | 56 | hiwater_vm << (PAGE_SHIFT-10), |
57 | (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), | 57 | total_vm << (PAGE_SHIFT-10), |
58 | mm->locked_vm << (PAGE_SHIFT-10), | 58 | mm->locked_vm << (PAGE_SHIFT-10), |
59 | mm->pinned_vm << (PAGE_SHIFT-10), | 59 | mm->pinned_vm << (PAGE_SHIFT-10), |
60 | hiwater_rss << (PAGE_SHIFT-10), | 60 | hiwater_rss << (PAGE_SHIFT-10), |
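VmSize is now reported straight from total_vm (reserved_vm is gone); the << (PAGE_SHIFT-10) shift is simply the page-count-to-kB conversion. A quick worked instance, assuming 4 KiB pages (PAGE_SHIFT == 12):

/* pages << (12 - 10) multiplies by 4: 2560 pages -> 10240 kB (10 MiB) */
unsigned long vm_size_kb = total_vm << (PAGE_SHIFT - 10);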
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index ff48c5a85309..5bc77817f382 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c | |||
@@ -1536,6 +1536,7 @@ out_unlock: | |||
1536 | static const struct vm_operations_struct ubifs_file_vm_ops = { | 1536 | static const struct vm_operations_struct ubifs_file_vm_ops = { |
1537 | .fault = filemap_fault, | 1537 | .fault = filemap_fault, |
1538 | .page_mkwrite = ubifs_vm_page_mkwrite, | 1538 | .page_mkwrite = ubifs_vm_page_mkwrite, |
1539 | .remap_pages = generic_file_remap_pages, | ||
1539 | }; | 1540 | }; |
1540 | 1541 | ||
1541 | static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) | 1542 | static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1eaeb8be3aae..aa473fa640a2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -940,7 +940,6 @@ xfs_file_mmap( | |||
940 | struct vm_area_struct *vma) | 940 | struct vm_area_struct *vma) |
941 | { | 941 | { |
942 | vma->vm_ops = &xfs_file_vm_ops; | 942 | vma->vm_ops = &xfs_file_vm_ops; |
943 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
944 | 943 | ||
945 | file_accessed(filp); | 944 | file_accessed(filp); |
946 | return 0; | 945 | return 0; |
@@ -1443,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = { | |||
1443 | static const struct vm_operations_struct xfs_file_vm_ops = { | 1442 | static const struct vm_operations_struct xfs_file_vm_ops = { |
1444 | .fault = filemap_fault, | 1443 | .fault = filemap_fault, |
1445 | .page_mkwrite = xfs_vm_page_mkwrite, | 1444 | .page_mkwrite = xfs_vm_page_mkwrite, |
1445 | .remap_pages = generic_file_remap_pages, | ||
1446 | }; | 1446 | }; |
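Both the ubifs and xfs hunks follow the same pattern: nonlinear-mapping support is no longer announced by setting the removed VM_CAN_NONLINEAR flag in ->mmap(), it is advertised by wiring ->remap_pages in the vm_ops. A minimal sketch for a hypothetical filesystem ("examplefs" and its mmap handler are illustrative names):

static const struct vm_operations_struct examplefs_file_vm_ops = {
	.fault		= filemap_fault,
	.remap_pages	= generic_file_remap_pages,	/* nonlinear support */
};

static int examplefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &examplefs_file_vm_ops;		/* no VM_CAN_NONLINEAR */
	file_accessed(file);
	return 0;
}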
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ff4947b7a976..b36ce40bd1c6 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -87,7 +87,7 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, | |||
87 | pmd_t *pmdp) | 87 | pmd_t *pmdp) |
88 | { | 88 | { |
89 | pmd_t pmd = *pmdp; | 89 | pmd_t pmd = *pmdp; |
90 | pmd_clear(mm, address, pmdp); | 90 | pmd_clear(pmdp); |
91 | return pmd; | 91 | return pmd; |
92 | } | 92 | } |
93 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 93 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
@@ -162,6 +162,19 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma, | |||
162 | unsigned long address, pmd_t *pmdp); | 162 | unsigned long address, pmd_t *pmdp); |
163 | #endif | 163 | #endif |
164 | 164 | ||
165 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT | ||
166 | extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); | ||
167 | #endif | ||
168 | |||
169 | #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW | ||
170 | extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); | ||
171 | #endif | ||
172 | |||
173 | #ifndef __HAVE_ARCH_PMDP_INVALIDATE | ||
174 | extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | ||
175 | pmd_t *pmdp); | ||
176 | #endif | ||
177 | |||
165 | #ifndef __HAVE_ARCH_PTE_SAME | 178 | #ifndef __HAVE_ARCH_PTE_SAME |
166 | static inline int pte_same(pte_t pte_a, pte_t pte_b) | 179 | static inline int pte_same(pte_t pte_a, pte_t pte_b) |
167 | { | 180 | { |
@@ -381,48 +394,59 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, | |||
381 | 394 | ||
382 | #ifndef __HAVE_PFNMAP_TRACKING | 395 | #ifndef __HAVE_PFNMAP_TRACKING |
383 | /* | 396 | /* |
384 | * Interface that can be used by architecture code to keep track of | 397 | * Interfaces that can be used by architecture code to keep track of |
385 | * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) | 398 | * memory type of pfn mappings specified by the remap_pfn_range, |
386 | * | 399 | * vm_insert_pfn. |
387 | * track_pfn_vma_new is called when a _new_ pfn mapping is being established | 400 | */ |
388 | * for physical range indicated by pfn and size. | 401 | |
402 | /* | ||
403 | * track_pfn_remap is called when a _new_ pfn mapping is being established | ||
404 | * by remap_pfn_range() for physical range indicated by pfn and size. | ||
389 | */ | 405 | */ |
390 | static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, | 406 | static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, |
391 | unsigned long pfn, unsigned long size) | 407 | unsigned long pfn, unsigned long addr, |
408 | unsigned long size) | ||
392 | { | 409 | { |
393 | return 0; | 410 | return 0; |
394 | } | 411 | } |
395 | 412 | ||
396 | /* | 413 | /* |
397 | * Interface that can be used by architecture code to keep track of | 414 | * track_pfn_insert is called when a _new_ single pfn is established |
398 | * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) | 415 | * by vm_insert_pfn(). |
399 | * | 416 | */ |
400 | * track_pfn_vma_copy is called when vma that is covering the pfnmap gets | 417 | static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, |
418 | unsigned long pfn) | ||
419 | { | ||
420 | return 0; | ||
421 | } | ||
422 | |||
423 | /* | ||
424 | * track_pfn_copy is called when vma that is covering the pfnmap gets | ||
401 | * copied through copy_page_range(). | 425 | * copied through copy_page_range(). |
402 | */ | 426 | */ |
403 | static inline int track_pfn_vma_copy(struct vm_area_struct *vma) | 427 | static inline int track_pfn_copy(struct vm_area_struct *vma) |
404 | { | 428 | { |
405 | return 0; | 429 | return 0; |
406 | } | 430 | } |
407 | 431 | ||
408 | /* | 432 | /* |
409 | * Interface that can be used by architecture code to keep track of | ||
410 | * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) | ||
411 | * | ||
412 | * untrack_pfn_vma is called while unmapping a pfnmap for a region. | 433 | * untrack_pfn_vma is called while unmapping a pfnmap for a region. |
413 | * untrack can be called for a specific region indicated by pfn and size or | 434 | * untrack can be called for a specific region indicated by pfn and size or |
414 | * can be for the entire vma (in which case size can be zero). | 435 | * can be for the entire vma (in which case pfn, size are zero). |
415 | */ | 436 | */ |
416 | static inline void untrack_pfn_vma(struct vm_area_struct *vma, | 437 | static inline void untrack_pfn(struct vm_area_struct *vma, |
417 | unsigned long pfn, unsigned long size) | 438 | unsigned long pfn, unsigned long size) |
418 | { | 439 | { |
419 | } | 440 | } |
420 | #else | 441 | #else |
421 | extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, | 442 | extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, |
422 | unsigned long pfn, unsigned long size); | 443 | unsigned long pfn, unsigned long addr, |
423 | extern int track_pfn_vma_copy(struct vm_area_struct *vma); | 444 | unsigned long size); |
424 | extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, | 445 | extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, |
425 | unsigned long size); | 446 | unsigned long pfn); |
447 | extern int track_pfn_copy(struct vm_area_struct *vma); | ||
448 | extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, | ||
449 | unsigned long size); | ||
426 | #endif | 450 | #endif |
427 | 451 | ||
428 | #ifdef CONFIG_MMU | 452 | #ifdef CONFIG_MMU |
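The renamed PAT-tracking hooks split the old track_pfn_vma_new() into a whole-range variant (track_pfn_remap, called for remap_pfn_range()) and a single-pfn variant (track_pfn_insert, called for vm_insert_pfn()), with track_pfn_copy()/untrack_pfn() covering fork and unmap. A hedged sketch of how the remap-side pair is meant to bracket a mapping attempt; do_the_mapping() is a placeholder, not a real kernel function:

static int example_pfn_remap(struct vm_area_struct *vma, unsigned long addr,
			     unsigned long pfn, unsigned long size)
{
	pgprot_t prot = vma->vm_page_prot;
	int err;

	err = track_pfn_remap(vma, &prot, pfn, addr, size);	/* reserve */
	if (err)
		return err;

	err = do_the_mapping(vma, addr, pfn, size, prot);	/* placeholder */
	if (err)
		untrack_pfn(vma, pfn, size);	/* undo the reservation */
	return err;
}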
diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 70cfcb2d63c4..5b08a8540ecf 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h | |||
@@ -86,6 +86,31 @@ static inline int atomic_dec_unless_positive(atomic_t *p) | |||
86 | } | 86 | } |
87 | #endif | 87 | #endif |
88 | 88 | ||
89 | /* | ||
90 | * atomic_dec_if_positive - decrement by 1 if old value positive | ||
91 | * @v: pointer of type atomic_t | ||
92 | * | ||
93 | * The function returns the old value of *v minus 1, even if | ||
94 | * the atomic variable, v, was not decremented. | ||
95 | */ | ||
96 | #ifndef atomic_dec_if_positive | ||
97 | static inline int atomic_dec_if_positive(atomic_t *v) | ||
98 | { | ||
99 | int c, old, dec; | ||
100 | c = atomic_read(v); | ||
101 | for (;;) { | ||
102 | dec = c - 1; | ||
103 | if (unlikely(dec < 0)) | ||
104 | break; | ||
105 | old = atomic_cmpxchg((v), c, dec); | ||
106 | if (likely(old == c)) | ||
107 | break; | ||
108 | c = old; | ||
109 | } | ||
110 | return dec; | ||
111 | } | ||
112 | #endif | ||
113 | |||
89 | #ifndef CONFIG_ARCH_HAS_ATOMIC_OR | 114 | #ifndef CONFIG_ARCH_HAS_ATOMIC_OR |
90 | static inline void atomic_or(int i, atomic_t *v) | 115 | static inline void atomic_or(int i, atomic_t *v) |
91 | { | 116 | { |
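A usage sketch for the new generic atomic_dec_if_positive(): because the return value is the old value minus one even when nothing was decremented, a negative result means the counter was already at zero. "tokens" is an arbitrary example counter:

static bool take_token(atomic_t *tokens)
{
	/* >= 0: we consumed one token; < 0: the pool was already empty. */
	return atomic_dec_if_positive(tokens) >= 0;
}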
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ef658147e4e8..6ecb6dc2f303 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -22,8 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, | |||
22 | extern int fragmentation_index(struct zone *zone, unsigned int order); | 22 | extern int fragmentation_index(struct zone *zone, unsigned int order); |
23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, | 23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, |
24 | int order, gfp_t gfp_mask, nodemask_t *mask, | 24 | int order, gfp_t gfp_mask, nodemask_t *mask, |
25 | bool sync, bool *contended); | 25 | bool sync, bool *contended, struct page **page); |
26 | extern int compact_pgdat(pg_data_t *pgdat, int order); | 26 | extern int compact_pgdat(pg_data_t *pgdat, int order); |
27 | extern void reset_isolation_suitable(pg_data_t *pgdat); | ||
27 | extern unsigned long compaction_suitable(struct zone *zone, int order); | 28 | extern unsigned long compaction_suitable(struct zone *zone, int order); |
28 | 29 | ||
29 | /* Do not skip compaction more than 64 times */ | 30 | /* Do not skip compaction more than 64 times */ |
@@ -61,10 +62,20 @@ static inline bool compaction_deferred(struct zone *zone, int order) | |||
61 | return zone->compact_considered < defer_limit; | 62 | return zone->compact_considered < defer_limit; |
62 | } | 63 | } |
63 | 64 | ||
65 | /* Returns true if restarting compaction after many failures */ | ||
66 | static inline bool compaction_restarting(struct zone *zone, int order) | ||
67 | { | ||
68 | if (order < zone->compact_order_failed) | ||
69 | return false; | ||
70 | |||
71 | return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && | ||
72 | zone->compact_considered >= 1UL << zone->compact_defer_shift; | ||
73 | } | ||
74 | |||
64 | #else | 75 | #else |
65 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, | 76 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, |
66 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 77 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
67 | bool sync, bool *contended) | 78 | bool sync, bool *contended, struct page **page) |
68 | { | 79 | { |
69 | return COMPACT_CONTINUE; | 80 | return COMPACT_CONTINUE; |
70 | } | 81 | } |
@@ -74,6 +85,10 @@ static inline int compact_pgdat(pg_data_t *pgdat, int order) | |||
74 | return COMPACT_CONTINUE; | 85 | return COMPACT_CONTINUE; |
75 | } | 86 | } |
76 | 87 | ||
88 | static inline void reset_isolation_suitable(pg_data_t *pgdat) | ||
89 | { | ||
90 | } | ||
91 | |||
77 | static inline unsigned long compaction_suitable(struct zone *zone, int order) | 92 | static inline unsigned long compaction_suitable(struct zone *zone, int order) |
78 | { | 93 | { |
79 | return COMPACT_SKIPPED; | 94 | return COMPACT_SKIPPED; |
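compaction_restarting() reports the moment a zone comes back after the maximum deferral period, which is when any cached "skip these pageblocks" state should be thrown away via the new reset_isolation_suitable() hook. A sketch of how the two additions are meant to pair up (illustrative, not the exact call site in mm/compaction.c):

static void maybe_reset_skip_hints(struct zone *zone, int order)
{
	if (compaction_restarting(zone, order))
		reset_isolation_suitable(zone->zone_pgdat);
}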
diff --git a/include/linux/fs.h b/include/linux/fs.h index ca6d8c806f47..c617ed024df8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -401,7 +401,7 @@ struct inodes_stat_t { | |||
401 | #include <linux/cache.h> | 401 | #include <linux/cache.h> |
402 | #include <linux/list.h> | 402 | #include <linux/list.h> |
403 | #include <linux/radix-tree.h> | 403 | #include <linux/radix-tree.h> |
404 | #include <linux/prio_tree.h> | 404 | #include <linux/rbtree.h> |
405 | #include <linux/init.h> | 405 | #include <linux/init.h> |
406 | #include <linux/pid.h> | 406 | #include <linux/pid.h> |
407 | #include <linux/bug.h> | 407 | #include <linux/bug.h> |
@@ -669,7 +669,7 @@ struct address_space { | |||
669 | struct radix_tree_root page_tree; /* radix tree of all pages */ | 669 | struct radix_tree_root page_tree; /* radix tree of all pages */ |
670 | spinlock_t tree_lock; /* and lock protecting it */ | 670 | spinlock_t tree_lock; /* and lock protecting it */ |
671 | unsigned int i_mmap_writable;/* count VM_SHARED mappings */ | 671 | unsigned int i_mmap_writable;/* count VM_SHARED mappings */ |
672 | struct prio_tree_root i_mmap; /* tree of private and shared mappings */ | 672 | struct rb_root i_mmap; /* tree of private and shared mappings */ |
673 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ | 673 | struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ |
674 | struct mutex i_mmap_mutex; /* protect tree, count, list */ | 674 | struct mutex i_mmap_mutex; /* protect tree, count, list */ |
675 | /* Protected by tree_lock together with the radix tree */ | 675 | /* Protected by tree_lock together with the radix tree */ |
@@ -741,7 +741,7 @@ int mapping_tagged(struct address_space *mapping, int tag); | |||
741 | */ | 741 | */ |
742 | static inline int mapping_mapped(struct address_space *mapping) | 742 | static inline int mapping_mapped(struct address_space *mapping) |
743 | { | 743 | { |
744 | return !prio_tree_empty(&mapping->i_mmap) || | 744 | return !RB_EMPTY_ROOT(&mapping->i_mmap) || |
745 | !list_empty(&mapping->i_mmap_nonlinear); | 745 | !list_empty(&mapping->i_mmap_nonlinear); |
746 | } | 746 | } |
747 | 747 | ||
@@ -2552,6 +2552,8 @@ extern int sb_min_blocksize(struct super_block *, int); | |||
2552 | 2552 | ||
2553 | extern int generic_file_mmap(struct file *, struct vm_area_struct *); | 2553 | extern int generic_file_mmap(struct file *, struct vm_area_struct *); |
2554 | extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); | 2554 | extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); |
2555 | extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, | ||
2556 | unsigned long size, pgoff_t pgoff); | ||
2555 | extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); | 2557 | extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); |
2556 | int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); | 2558 | int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); |
2557 | extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); | 2559 | extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); |
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4883f393f50a..02c1c9710be0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -30,12 +30,7 @@ struct vm_area_struct; | |||
30 | #define ___GFP_HARDWALL 0x20000u | 30 | #define ___GFP_HARDWALL 0x20000u |
31 | #define ___GFP_THISNODE 0x40000u | 31 | #define ___GFP_THISNODE 0x40000u |
32 | #define ___GFP_RECLAIMABLE 0x80000u | 32 | #define ___GFP_RECLAIMABLE 0x80000u |
33 | #ifdef CONFIG_KMEMCHECK | ||
34 | #define ___GFP_NOTRACK 0x200000u | 33 | #define ___GFP_NOTRACK 0x200000u |
35 | #else | ||
36 | #define ___GFP_NOTRACK 0 | ||
37 | #endif | ||
38 | #define ___GFP_NO_KSWAPD 0x400000u | ||
39 | #define ___GFP_OTHER_NODE 0x800000u | 34 | #define ___GFP_OTHER_NODE 0x800000u |
40 | #define ___GFP_WRITE 0x1000000u | 35 | #define ___GFP_WRITE 0x1000000u |
41 | 36 | ||
@@ -90,7 +85,6 @@ struct vm_area_struct; | |||
90 | #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ | 85 | #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ |
91 | #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ | 86 | #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ |
92 | 87 | ||
93 | #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) | ||
94 | #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ | 88 | #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ |
95 | #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ | 89 | #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ |
96 | 90 | ||
@@ -120,8 +114,7 @@ struct vm_area_struct; | |||
120 | __GFP_MOVABLE) | 114 | __GFP_MOVABLE) |
121 | #define GFP_IOFS (__GFP_IO | __GFP_FS) | 115 | #define GFP_IOFS (__GFP_IO | __GFP_FS) |
122 | #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ | 116 | #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ |
123 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ | 117 | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) |
124 | __GFP_NO_KSWAPD) | ||
125 | 118 | ||
126 | #ifdef CONFIG_NUMA | 119 | #ifdef CONFIG_NUMA |
127 | #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) | 120 | #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4c59b1131187..b31cb7da0346 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -11,8 +11,7 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
11 | extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 11 | extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
12 | unsigned long address, pmd_t *pmd, | 12 | unsigned long address, pmd_t *pmd, |
13 | pmd_t orig_pmd); | 13 | pmd_t orig_pmd); |
14 | extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); | 14 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
15 | extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, | ||
16 | unsigned long addr, | 15 | unsigned long addr, |
17 | pmd_t *pmd, | 16 | pmd_t *pmd, |
18 | unsigned int flags); | 17 | unsigned int flags); |
diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h new file mode 100644 index 000000000000..724556aa3c95 --- /dev/null +++ b/include/linux/interval_tree.h | |||
@@ -0,0 +1,27 @@ | |||
1 | #ifndef _LINUX_INTERVAL_TREE_H | ||
2 | #define _LINUX_INTERVAL_TREE_H | ||
3 | |||
4 | #include <linux/rbtree.h> | ||
5 | |||
6 | struct interval_tree_node { | ||
7 | struct rb_node rb; | ||
8 | unsigned long start; /* Start of interval */ | ||
9 | unsigned long last; /* Last location _in_ interval */ | ||
10 | unsigned long __subtree_last; | ||
11 | }; | ||
12 | |||
13 | extern void | ||
14 | interval_tree_insert(struct interval_tree_node *node, struct rb_root *root); | ||
15 | |||
16 | extern void | ||
17 | interval_tree_remove(struct interval_tree_node *node, struct rb_root *root); | ||
18 | |||
19 | extern struct interval_tree_node * | ||
20 | interval_tree_iter_first(struct rb_root *root, | ||
21 | unsigned long start, unsigned long last); | ||
22 | |||
23 | extern struct interval_tree_node * | ||
24 | interval_tree_iter_next(struct interval_tree_node *node, | ||
25 | unsigned long start, unsigned long last); | ||
26 | |||
27 | #endif /* _LINUX_INTERVAL_TREE_H */ | ||
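A minimal usage sketch for the non-generic interval tree declared above; the node values are arbitrary and pr_info() stands in for real work. The query range [15, 34] overlaps both example intervals:

static void interval_tree_demo(void)
{
	struct rb_root root = RB_ROOT;
	struct interval_tree_node a = { .start = 10, .last = 19 };
	struct interval_tree_node b = { .start = 30, .last = 39 };
	struct interval_tree_node *it;

	interval_tree_insert(&a, &root);
	interval_tree_insert(&b, &root);

	for (it = interval_tree_iter_first(&root, 15, 34); it;
	     it = interval_tree_iter_next(it, 15, 34))
		pr_info("overlaps [15,34]: [%lu,%lu]\n", it->start, it->last);

	interval_tree_remove(&a, &root);
	interval_tree_remove(&b, &root);
}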
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h new file mode 100644 index 000000000000..58370e1862ad --- /dev/null +++ b/include/linux/interval_tree_generic.h | |||
@@ -0,0 +1,191 @@ | |||
1 | /* | ||
2 | Interval Trees | ||
3 | (C) 2012 Michel Lespinasse <walken@google.com> | ||
4 | |||
5 | This program is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation; either version 2 of the License, or | ||
8 | (at your option) any later version. | ||
9 | |||
10 | This program is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | GNU General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with this program; if not, write to the Free Software | ||
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | |||
19 | include/linux/interval_tree_generic.h | ||
20 | */ | ||
21 | |||
22 | #include <linux/rbtree_augmented.h> | ||
23 | |||
24 | /* | ||
25 | * Template for implementing interval trees | ||
26 | * | ||
27 | * ITSTRUCT: struct type of the interval tree nodes | ||
28 | * ITRB: name of struct rb_node field within ITSTRUCT | ||
29 | * ITTYPE: type of the interval endpoints | ||
30 | * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree | ||
31 | * ITSTART(n): start endpoint of ITSTRUCT node n | ||
32 | * ITLAST(n): last endpoint of ITSTRUCT node n | ||
33 | * ITSTATIC: 'static' or empty | ||
34 | * ITPREFIX: prefix to use for the inline tree definitions | ||
35 | * | ||
36 | * Note - before using this, please consider if non-generic version | ||
37 | * (interval_tree.h) would work for you... | ||
38 | */ | ||
39 | |||
40 | #define INTERVAL_TREE_DEFINE(ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, \ | ||
41 | ITSTART, ITLAST, ITSTATIC, ITPREFIX) \ | ||
42 | \ | ||
43 | /* Callbacks for augmented rbtree insert and remove */ \ | ||
44 | \ | ||
45 | static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \ | ||
46 | { \ | ||
47 | ITTYPE max = ITLAST(node), subtree_last; \ | ||
48 | if (node->ITRB.rb_left) { \ | ||
49 | subtree_last = rb_entry(node->ITRB.rb_left, \ | ||
50 | ITSTRUCT, ITRB)->ITSUBTREE; \ | ||
51 | if (max < subtree_last) \ | ||
52 | max = subtree_last; \ | ||
53 | } \ | ||
54 | if (node->ITRB.rb_right) { \ | ||
55 | subtree_last = rb_entry(node->ITRB.rb_right, \ | ||
56 | ITSTRUCT, ITRB)->ITSUBTREE; \ | ||
57 | if (max < subtree_last) \ | ||
58 | max = subtree_last; \ | ||
59 | } \ | ||
60 | return max; \ | ||
61 | } \ | ||
62 | \ | ||
63 | RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ | ||
64 | ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \ | ||
65 | \ | ||
66 | /* Insert / remove interval nodes from the tree */ \ | ||
67 | \ | ||
68 | ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \ | ||
69 | { \ | ||
70 | struct rb_node **link = &root->rb_node, *rb_parent = NULL; \ | ||
71 | ITTYPE start = ITSTART(node), last = ITLAST(node); \ | ||
72 | ITSTRUCT *parent; \ | ||
73 | \ | ||
74 | while (*link) { \ | ||
75 | rb_parent = *link; \ | ||
76 | parent = rb_entry(rb_parent, ITSTRUCT, ITRB); \ | ||
77 | if (parent->ITSUBTREE < last) \ | ||
78 | parent->ITSUBTREE = last; \ | ||
79 | if (start < ITSTART(parent)) \ | ||
80 | link = &parent->ITRB.rb_left; \ | ||
81 | else \ | ||
82 | link = &parent->ITRB.rb_right; \ | ||
83 | } \ | ||
84 | \ | ||
85 | node->ITSUBTREE = last; \ | ||
86 | rb_link_node(&node->ITRB, rb_parent, link); \ | ||
87 | rb_insert_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \ | ||
88 | } \ | ||
89 | \ | ||
90 | ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, struct rb_root *root) \ | ||
91 | { \ | ||
92 | rb_erase_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \ | ||
93 | } \ | ||
94 | \ | ||
95 | /* \ | ||
96 | * Iterate over intervals intersecting [start;last] \ | ||
97 | * \ | ||
98 | * Note that a node's interval intersects [start;last] iff: \ | ||
99 | * Cond1: ITSTART(node) <= last \ | ||
100 | * and \ | ||
101 | * Cond2: start <= ITLAST(node) \ | ||
102 | */ \ | ||
103 | \ | ||
104 | static ITSTRUCT * \ | ||
105 | ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ | ||
106 | { \ | ||
107 | while (true) { \ | ||
108 | /* \ | ||
109 | * Loop invariant: start <= node->ITSUBTREE \ | ||
110 | * (Cond2 is satisfied by one of the subtree nodes) \ | ||
111 | */ \ | ||
112 | if (node->ITRB.rb_left) { \ | ||
113 | ITSTRUCT *left = rb_entry(node->ITRB.rb_left, \ | ||
114 | ITSTRUCT, ITRB); \ | ||
115 | if (start <= left->ITSUBTREE) { \ | ||
116 | /* \ | ||
117 | * Some nodes in left subtree satisfy Cond2. \ | ||
118 | * Iterate to find the leftmost such node N. \ | ||
119 | * If it also satisfies Cond1, that's the \ | ||
120 | * match we are looking for. Otherwise, there \ | ||
121 | * is no matching interval as nodes to the \ | ||
122 | * right of N can't satisfy Cond1 either. \ | ||
123 | */ \ | ||
124 | node = left; \ | ||
125 | continue; \ | ||
126 | } \ | ||
127 | } \ | ||
128 | if (ITSTART(node) <= last) { /* Cond1 */ \ | ||
129 | if (start <= ITLAST(node)) /* Cond2 */ \ | ||
130 | return node; /* node is leftmost match */ \ | ||
131 | if (node->ITRB.rb_right) { \ | ||
132 | node = rb_entry(node->ITRB.rb_right, \ | ||
133 | ITSTRUCT, ITRB); \ | ||
134 | if (start <= node->ITSUBTREE) \ | ||
135 | continue; \ | ||
136 | } \ | ||
137 | } \ | ||
138 | return NULL; /* No match */ \ | ||
139 | } \ | ||
140 | } \ | ||
141 | \ | ||
142 | ITSTATIC ITSTRUCT * \ | ||
143 | ITPREFIX ## _iter_first(struct rb_root *root, ITTYPE start, ITTYPE last) \ | ||
144 | { \ | ||
145 | ITSTRUCT *node; \ | ||
146 | \ | ||
147 | if (!root->rb_node) \ | ||
148 | return NULL; \ | ||
149 | node = rb_entry(root->rb_node, ITSTRUCT, ITRB); \ | ||
150 | if (node->ITSUBTREE < start) \ | ||
151 | return NULL; \ | ||
152 | return ITPREFIX ## _subtree_search(node, start, last); \ | ||
153 | } \ | ||
154 | \ | ||
155 | ITSTATIC ITSTRUCT * \ | ||
156 | ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ | ||
157 | { \ | ||
158 | struct rb_node *rb = node->ITRB.rb_right, *prev; \ | ||
159 | \ | ||
160 | while (true) { \ | ||
161 | /* \ | ||
162 | * Loop invariants: \ | ||
163 | * Cond1: ITSTART(node) <= last \ | ||
164 | * rb == node->ITRB.rb_right \ | ||
165 | * \ | ||
166 | * First, search right subtree if suitable \ | ||
167 | */ \ | ||
168 | if (rb) { \ | ||
169 | ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); \ | ||
170 | if (start <= right->ITSUBTREE) \ | ||
171 | return ITPREFIX ## _subtree_search(right, \ | ||
172 | start, last); \ | ||
173 | } \ | ||
174 | \ | ||
175 | /* Move up the tree until we come from a node's left child */ \ | ||
176 | do { \ | ||
177 | rb = rb_parent(&node->ITRB); \ | ||
178 | if (!rb) \ | ||
179 | return NULL; \ | ||
180 | prev = &node->ITRB; \ | ||
181 | node = rb_entry(rb, ITSTRUCT, ITRB); \ | ||
182 | rb = node->ITRB.rb_right; \ | ||
183 | } while (prev == rb); \ | ||
184 | \ | ||
185 | /* Check if the node intersects [start;last] */ \ | ||
186 | if (last < ITSTART(node)) /* !Cond1 */ \ | ||
187 | return NULL; \ | ||
188 | else if (start <= ITLAST(node)) /* Cond2 */ \ | ||
189 | return node; \ | ||
190 | } \ | ||
191 | } | ||
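Putting the template to work for the interval_tree_node type from include/linux/interval_tree.h above takes one invocation: START/LAST are small accessor macros, and the ITSTATIC argument is left empty so the generated functions get external linkage. This is a sketch of the expected instantiation, not a quote of lib/interval_tree.c:

#define START(node) ((node)->start)
#define LAST(node)  ((node)->last)

/* Generates interval_tree_insert/_remove/_iter_first/_iter_next. */
INTERVAL_TREE_DEFINE(struct interval_tree_node, rb,
		     unsigned long, __subtree_last,
		     START, LAST,, interval_tree)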
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 19dc455b4f3d..569d67d4243e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -70,8 +70,7 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, | |||
70 | * @p_end: ptr to ulong for end pfn of the range, can be %NULL | 70 | * @p_end: ptr to ulong for end pfn of the range, can be %NULL |
71 | * @p_nid: ptr to int for nid of the range, can be %NULL | 71 | * @p_nid: ptr to int for nid of the range, can be %NULL |
72 | * | 72 | * |
73 | * Walks over configured memory ranges. Available after early_node_map is | 73 | * Walks over configured memory ranges. |
74 | * populated. | ||
75 | */ | 74 | */ |
76 | #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ | 75 | #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ |
77 | for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ | 76 | for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ |
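A typical walk with this iterator, as a sketch: MAX_NUMNODES is conventionally passed to mean "any node", and start_pfn/end_pfn/this_nid are plain out-parameters:

static void dump_mem_pfn_ranges(void)
{
	unsigned long start_pfn, end_pfn;
	int i, this_nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &this_nid)
		pr_info("node %d: pfns [%lu, %lu)\n",
			this_nid, start_pfn, end_pfn);
}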
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8d9489fdab2e..fd0e6d53836e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -84,14 +84,14 @@ extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); | |||
84 | extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); | 84 | extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); |
85 | 85 | ||
86 | static inline | 86 | static inline |
87 | int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) | 87 | bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) |
88 | { | 88 | { |
89 | struct mem_cgroup *memcg; | 89 | struct mem_cgroup *task_memcg; |
90 | int match; | 90 | bool match; |
91 | 91 | ||
92 | rcu_read_lock(); | 92 | rcu_read_lock(); |
93 | memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner)); | 93 | task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
94 | match = __mem_cgroup_same_or_subtree(cgroup, memcg); | 94 | match = __mem_cgroup_same_or_subtree(memcg, task_memcg); |
95 | rcu_read_unlock(); | 95 | rcu_read_unlock(); |
96 | return match; | 96 | return match; |
97 | } | 97 | } |
@@ -258,10 +258,10 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm | |||
258 | return NULL; | 258 | return NULL; |
259 | } | 259 | } |
260 | 260 | ||
261 | static inline int mm_match_cgroup(struct mm_struct *mm, | 261 | static inline bool mm_match_cgroup(struct mm_struct *mm, |
262 | struct mem_cgroup *memcg) | 262 | struct mem_cgroup *memcg) |
263 | { | 263 | { |
264 | return 1; | 264 | return true; |
265 | } | 265 | } |
266 | 266 | ||
267 | static inline int task_in_mem_cgroup(struct task_struct *task, | 267 | static inline int task_in_mem_cgroup(struct task_struct *task, |
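With the boolean signature, callers can use mm_match_cgroup() as a straight filter: it answers whether the mm's owning task sits in memcg or one of its descendants. A hedged caller-side sketch (the helper name is illustrative, and a NULL memcg here means "no filtering"):

static bool mm_in_memcg_scope(struct mm_struct *mm, struct mem_cgroup *memcg)
{
	return !memcg || mm_match_cgroup(mm, memcg);
}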
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 910550f3b70e..95573ec4ee6c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -10,6 +10,7 @@ struct page; | |||
10 | struct zone; | 10 | struct zone; |
11 | struct pglist_data; | 11 | struct pglist_data; |
12 | struct mem_section; | 12 | struct mem_section; |
13 | struct memory_block; | ||
13 | 14 | ||
14 | #ifdef CONFIG_MEMORY_HOTPLUG | 15 | #ifdef CONFIG_MEMORY_HOTPLUG |
15 | 16 | ||
@@ -233,6 +234,8 @@ static inline int is_mem_section_removable(unsigned long pfn, | |||
233 | extern int mem_online_node(int nid); | 234 | extern int mem_online_node(int nid); |
234 | extern int add_memory(int nid, u64 start, u64 size); | 235 | extern int add_memory(int nid, u64 start, u64 size); |
235 | extern int arch_add_memory(int nid, u64 start, u64 size); | 236 | extern int arch_add_memory(int nid, u64 start, u64 size); |
237 | extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); | ||
238 | extern int offline_memory_block(struct memory_block *mem); | ||
236 | extern int remove_memory(u64 start, u64 size); | 239 | extern int remove_memory(u64 start, u64 size); |
237 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | 240 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, |
238 | int nr_pages); | 241 | int nr_pages); |
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 95b738c7abff..cec569325608 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h | |||
@@ -188,7 +188,7 @@ struct sp_node { | |||
188 | 188 | ||
189 | struct shared_policy { | 189 | struct shared_policy { |
190 | struct rb_root root; | 190 | struct rb_root root; |
191 | spinlock_t lock; | 191 | struct mutex mutex; |
192 | }; | 192 | }; |
193 | 193 | ||
194 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); | 194 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); |
@@ -239,7 +239,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, | |||
239 | /* Check if a vma is migratable */ | 239 | /* Check if a vma is migratable */ |
240 | static inline int vma_migratable(struct vm_area_struct *vma) | 240 | static inline int vma_migratable(struct vm_area_struct *vma) |
241 | { | 241 | { |
242 | if (vma->vm_flags & (VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) | 242 | if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) |
243 | return 0; | 243 | return 0; |
244 | /* | 244 | /* |
245 | * Migration allocates pages in the highest zone. If we cannot | 245 | * Migration allocates pages in the highest zone. If we cannot |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 311be906b57d..fa0680402738 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/list.h> | 10 | #include <linux/list.h> |
11 | #include <linux/mmzone.h> | 11 | #include <linux/mmzone.h> |
12 | #include <linux/rbtree.h> | 12 | #include <linux/rbtree.h> |
13 | #include <linux/prio_tree.h> | ||
14 | #include <linux/atomic.h> | 13 | #include <linux/atomic.h> |
15 | #include <linux/debug_locks.h> | 14 | #include <linux/debug_locks.h> |
16 | #include <linux/mm_types.h> | 15 | #include <linux/mm_types.h> |
@@ -21,6 +20,7 @@ | |||
21 | 20 | ||
22 | struct mempolicy; | 21 | struct mempolicy; |
23 | struct anon_vma; | 22 | struct anon_vma; |
23 | struct anon_vma_chain; | ||
24 | struct file_ra_state; | 24 | struct file_ra_state; |
25 | struct user_struct; | 25 | struct user_struct; |
26 | struct writeback_control; | 26 | struct writeback_control; |
@@ -70,6 +70,8 @@ extern unsigned int kobjsize(const void *objp); | |||
70 | /* | 70 | /* |
71 | * vm_flags in vm_area_struct, see mm_types.h. | 71 | * vm_flags in vm_area_struct, see mm_types.h. |
72 | */ | 72 | */ |
73 | #define VM_NONE 0x00000000 | ||
74 | |||
73 | #define VM_READ 0x00000001 /* currently active flags */ | 75 | #define VM_READ 0x00000001 /* currently active flags */ |
74 | #define VM_WRITE 0x00000002 | 76 | #define VM_WRITE 0x00000002 |
75 | #define VM_EXEC 0x00000004 | 77 | #define VM_EXEC 0x00000004 |
@@ -82,16 +84,9 @@ extern unsigned int kobjsize(const void *objp); | |||
82 | #define VM_MAYSHARE 0x00000080 | 84 | #define VM_MAYSHARE 0x00000080 |
83 | 85 | ||
84 | #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ | 86 | #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ |
85 | #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) | ||
86 | #define VM_GROWSUP 0x00000200 | ||
87 | #else | ||
88 | #define VM_GROWSUP 0x00000000 | ||
89 | #define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ | ||
90 | #endif | ||
91 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ | 87 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ |
92 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ | 88 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ |
93 | 89 | ||
94 | #define VM_EXECUTABLE 0x00001000 | ||
95 | #define VM_LOCKED 0x00002000 | 90 | #define VM_LOCKED 0x00002000 |
96 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ | 91 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ |
97 | 92 | ||
@@ -101,25 +96,34 @@ extern unsigned int kobjsize(const void *objp); | |||
101 | 96 | ||
102 | #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ | 97 | #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ |
103 | #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ | 98 | #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ |
104 | #define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ | ||
105 | #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ | 99 | #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ |
106 | #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ | 100 | #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ |
107 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ | 101 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ |
108 | #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ | 102 | #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ |
109 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | 103 | #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ |
110 | #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ | 104 | #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ |
111 | #else | ||
112 | #define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ | ||
113 | #endif | ||
114 | #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ | ||
115 | #define VM_NODUMP 0x04000000 /* Do not include in the core dump */ | ||
116 | 105 | ||
117 | #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ | ||
118 | #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ | 106 | #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ |
119 | #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ | 107 | #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ |
120 | #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ | 108 | #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ |
121 | #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ | 109 | #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ |
122 | 110 | ||
111 | #if defined(CONFIG_X86) | ||
112 | # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ | ||
113 | #elif defined(CONFIG_PPC) | ||
114 | # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ | ||
115 | #elif defined(CONFIG_PARISC) | ||
116 | # define VM_GROWSUP VM_ARCH_1 | ||
117 | #elif defined(CONFIG_IA64) | ||
118 | # define VM_GROWSUP VM_ARCH_1 | ||
119 | #elif !defined(CONFIG_MMU) | ||
120 | # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ | ||
121 | #endif | ||
122 | |||
123 | #ifndef VM_GROWSUP | ||
124 | # define VM_GROWSUP VM_NONE | ||
125 | #endif | ||
126 | |||
123 | /* Bits set in the VMA until the stack is in its final location */ | 127 | /* Bits set in the VMA until the stack is in its final location */ |
124 | #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) | 128 | #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) |
125 | 129 | ||
@@ -143,7 +147,7 @@ extern unsigned int kobjsize(const void *objp); | |||
143 | * Special vmas that are non-mergable, non-mlock()able. | 147 | * Special vmas that are non-mergable, non-mlock()able. |
144 | * Note: mm/huge_memory.c VM_NO_THP depends on this definition. | 148 | * Note: mm/huge_memory.c VM_NO_THP depends on this definition. |
145 | */ | 149 | */ |
146 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) | 150 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP) |
147 | 151 | ||
148 | /* | 152 | /* |
149 | * mapping from the currently active vm_flags protection bits (the | 153 | * mapping from the currently active vm_flags protection bits (the |
@@ -157,24 +161,7 @@ extern pgprot_t protection_map[16]; | |||
157 | #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ | 161 | #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ |
158 | #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ | 162 | #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ |
159 | #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ | 163 | #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ |
160 | 164 | #define FAULT_FLAG_TRIED 0x40 /* second try */ | |
161 | /* | ||
162 | * This interface is used by x86 PAT code to identify a pfn mapping that is | ||
163 | * linear over entire vma. This is to optimize PAT code that deals with | ||
164 | * marking the physical region with a particular prot. This is not for generic | ||
165 | * mm use. Note also that this check will not work if the pfn mapping is | ||
166 | * linear for a vma starting at physical address 0. In which case PAT code | ||
167 | * falls back to slow path of reserving physical range page by page. | ||
168 | */ | ||
169 | static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) | ||
170 | { | ||
171 | return !!(vma->vm_flags & VM_PFN_AT_MMAP); | ||
172 | } | ||
173 | |||
174 | static inline int is_pfn_mapping(struct vm_area_struct *vma) | ||
175 | { | ||
176 | return !!(vma->vm_flags & VM_PFNMAP); | ||
177 | } | ||
178 | 165 | ||
179 | /* | 166 | /* |
180 | * vm_fault is filled by the the pagefault handler and passed to the vma's | 167 | * vm_fault is filled by the the pagefault handler and passed to the vma's |
@@ -182,8 +169,7 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma) | |||
182 | * of VM_FAULT_xxx flags that give details about how the fault was handled. | 169 | * of VM_FAULT_xxx flags that give details about how the fault was handled. |
183 | * | 170 | * |
184 | * pgoff should be used in favour of virtual_address, if possible. If pgoff | 171 | * pgoff should be used in favour of virtual_address, if possible. If pgoff |
185 | * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear | 172 | * is used, one may implement ->remap_pages to get nonlinear mapping support. |
186 | * mapping support. | ||
187 | */ | 173 | */ |
188 | struct vm_fault { | 174 | struct vm_fault { |
189 | unsigned int flags; /* FAULT_FLAG_xxx flags */ | 175 | unsigned int flags; /* FAULT_FLAG_xxx flags */ |
@@ -241,6 +227,9 @@ struct vm_operations_struct { | |||
241 | int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, | 227 | int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, |
242 | const nodemask_t *to, unsigned long flags); | 228 | const nodemask_t *to, unsigned long flags); |
243 | #endif | 229 | #endif |
230 | /* called by sys_remap_file_pages() to populate non-linear mapping */ | ||
231 | int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, | ||
232 | unsigned long size, pgoff_t pgoff); | ||
244 | }; | 233 | }; |
245 | 234 | ||
246 | struct mmu_gather; | 235 | struct mmu_gather; |
@@ -249,6 +238,18 @@ struct inode; | |||
249 | #define page_private(page) ((page)->private) | 238 | #define page_private(page) ((page)->private) |
250 | #define set_page_private(page, v) ((page)->private = (v)) | 239 | #define set_page_private(page, v) ((page)->private = (v)) |
251 | 240 | ||
241 | /* It's valid only if the page is free path or free_list */ | ||
242 | static inline void set_freepage_migratetype(struct page *page, int migratetype) | ||
243 | { | ||
244 | page->index = migratetype; | ||
245 | } | ||
246 | |||
247 | /* It's valid only if the page is free path or free_list */ | ||
248 | static inline int get_freepage_migratetype(struct page *page) | ||
249 | { | ||
250 | return page->index; | ||
251 | } | ||
252 | |||
252 | /* | 253 | /* |
253 | * FIXME: take this include out, include page-flags.h in | 254 | * FIXME: take this include out, include page-flags.h in |
254 | * files which need it (119 of them) | 255 | * files which need it (119 of them) |
@@ -454,6 +455,7 @@ void put_pages_list(struct list_head *pages); | |||
454 | 455 | ||
455 | void split_page(struct page *page, unsigned int order); | 456 | void split_page(struct page *page, unsigned int order); |
456 | int split_free_page(struct page *page); | 457 | int split_free_page(struct page *page); |
458 | int capture_free_page(struct page *page, int alloc_order, int migratetype); | ||
457 | 459 | ||
458 | /* | 460 | /* |
459 | * Compound pages have a destructor function. Provide a | 461 | * Compound pages have a destructor function. Provide a |
@@ -1071,7 +1073,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); | |||
1071 | 1073 | ||
1072 | extern unsigned long move_page_tables(struct vm_area_struct *vma, | 1074 | extern unsigned long move_page_tables(struct vm_area_struct *vma, |
1073 | unsigned long old_addr, struct vm_area_struct *new_vma, | 1075 | unsigned long old_addr, struct vm_area_struct *new_vma, |
1074 | unsigned long new_addr, unsigned long len); | 1076 | unsigned long new_addr, unsigned long len, |
1077 | bool need_rmap_locks); | ||
1075 | extern unsigned long do_mremap(unsigned long addr, | 1078 | extern unsigned long do_mremap(unsigned long addr, |
1076 | unsigned long old_len, unsigned long new_len, | 1079 | unsigned long old_len, unsigned long new_len, |
1077 | unsigned long flags, unsigned long new_addr); | 1080 | unsigned long flags, unsigned long new_addr); |
@@ -1366,24 +1369,45 @@ extern void zone_pcp_reset(struct zone *zone); | |||
1366 | extern atomic_long_t mmap_pages_allocated; | 1369 | extern atomic_long_t mmap_pages_allocated; |
1367 | extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); | 1370 | extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); |
1368 | 1371 | ||
1369 | /* prio_tree.c */ | 1372 | /* interval_tree.c */ |
1370 | void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); | 1373 | void vma_interval_tree_insert(struct vm_area_struct *node, |
1371 | void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); | 1374 | struct rb_root *root); |
1372 | void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); | 1375 | void vma_interval_tree_insert_after(struct vm_area_struct *node, |
1373 | struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, | 1376 | struct vm_area_struct *prev, |
1374 | struct prio_tree_iter *iter); | 1377 | struct rb_root *root); |
1375 | 1378 | void vma_interval_tree_remove(struct vm_area_struct *node, | |
1376 | #define vma_prio_tree_foreach(vma, iter, root, begin, end) \ | 1379 | struct rb_root *root); |
1377 | for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ | 1380 | struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, |
1378 | (vma = vma_prio_tree_next(vma, iter)); ) | 1381 | unsigned long start, unsigned long last); |
1382 | struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, | ||
1383 | unsigned long start, unsigned long last); | ||
1384 | |||
1385 | #define vma_interval_tree_foreach(vma, root, start, last) \ | ||
1386 | for (vma = vma_interval_tree_iter_first(root, start, last); \ | ||
1387 | vma; vma = vma_interval_tree_iter_next(vma, start, last)) | ||
1379 | 1388 | ||
1380 | static inline void vma_nonlinear_insert(struct vm_area_struct *vma, | 1389 | static inline void vma_nonlinear_insert(struct vm_area_struct *vma, |
1381 | struct list_head *list) | 1390 | struct list_head *list) |
1382 | { | 1391 | { |
1383 | vma->shared.vm_set.parent = NULL; | 1392 | list_add_tail(&vma->shared.nonlinear, list); |
1384 | list_add_tail(&vma->shared.vm_set.list, list); | ||
1385 | } | 1393 | } |
1386 | 1394 | ||
1395 | void anon_vma_interval_tree_insert(struct anon_vma_chain *node, | ||
1396 | struct rb_root *root); | ||
1397 | void anon_vma_interval_tree_remove(struct anon_vma_chain *node, | ||
1398 | struct rb_root *root); | ||
1399 | struct anon_vma_chain *anon_vma_interval_tree_iter_first( | ||
1400 | struct rb_root *root, unsigned long start, unsigned long last); | ||
1401 | struct anon_vma_chain *anon_vma_interval_tree_iter_next( | ||
1402 | struct anon_vma_chain *node, unsigned long start, unsigned long last); | ||
1403 | #ifdef CONFIG_DEBUG_VM_RB | ||
1404 | void anon_vma_interval_tree_verify(struct anon_vma_chain *node); | ||
1405 | #endif | ||
1406 | |||
1407 | #define anon_vma_interval_tree_foreach(avc, root, start, last) \ | ||
1408 | for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ | ||
1409 | avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) | ||
1410 | |||
1387 | /* mmap.c */ | 1411 | /* mmap.c */ |
1388 | extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); | 1412 | extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); |
1389 | extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, | 1413 | extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, |
@@ -1400,15 +1424,13 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, | |||
1400 | struct rb_node **, struct rb_node *); | 1424 | struct rb_node **, struct rb_node *); |
1401 | extern void unlink_file_vma(struct vm_area_struct *); | 1425 | extern void unlink_file_vma(struct vm_area_struct *); |
1402 | extern struct vm_area_struct *copy_vma(struct vm_area_struct **, | 1426 | extern struct vm_area_struct *copy_vma(struct vm_area_struct **, |
1403 | unsigned long addr, unsigned long len, pgoff_t pgoff); | 1427 | unsigned long addr, unsigned long len, pgoff_t pgoff, |
1428 | bool *need_rmap_locks); | ||
1404 | extern void exit_mmap(struct mm_struct *); | 1429 | extern void exit_mmap(struct mm_struct *); |
1405 | 1430 | ||
1406 | extern int mm_take_all_locks(struct mm_struct *mm); | 1431 | extern int mm_take_all_locks(struct mm_struct *mm); |
1407 | extern void mm_drop_all_locks(struct mm_struct *mm); | 1432 | extern void mm_drop_all_locks(struct mm_struct *mm); |
1408 | 1433 | ||
1409 | /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */ | ||
1410 | extern void added_exe_file_vma(struct mm_struct *mm); | ||
1411 | extern void removed_exe_file_vma(struct mm_struct *mm); | ||
1412 | extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); | 1434 | extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); |
1413 | extern struct file *get_mm_exe_file(struct mm_struct *mm); | 1435 | extern struct file *get_mm_exe_file(struct mm_struct *mm); |
1414 | 1436 | ||
@@ -1662,5 +1684,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } | |||
1662 | static inline bool page_is_guard(struct page *page) { return false; } | 1684 | static inline bool page_is_guard(struct page *page) { return false; } |
1663 | #endif /* CONFIG_DEBUG_PAGEALLOC */ | 1685 | #endif /* CONFIG_DEBUG_PAGEALLOC */ |
1664 | 1686 | ||
1687 | extern void reset_zone_present_pages(void); | ||
1688 | extern void fixup_zone_present_pages(int nid, unsigned long start_pfn, | ||
1689 | unsigned long end_pfn); | ||
1690 | |||
1665 | #endif /* __KERNEL__ */ | 1691 | #endif /* __KERNEL__ */ |
1666 | #endif /* _LINUX_MM_H */ | 1692 | #endif /* _LINUX_MM_H */ |
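The vma_interval_tree_foreach() macro is the drop-in replacement for the old vma_prio_tree_foreach() walk over address_space->i_mmap. A short sketch of visiting every vma that maps a given file-page range (the function name is illustrative; callers still hold i_mmap_mutex as before):

static void walk_file_range_vmas(struct address_space *mapping,
				 pgoff_t first, pgoff_t last)
{
	struct vm_area_struct *vma;

	vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last)
		pr_info("vma %lx-%lx maps pgoffs %lu-%lu\n",
			vma->vm_start, vma->vm_end,
			(unsigned long)first, (unsigned long)last);
}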
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bf7867200b95..31f8a3af7d94 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -6,7 +6,6 @@ | |||
6 | #include <linux/threads.h> | 6 | #include <linux/threads.h> |
7 | #include <linux/list.h> | 7 | #include <linux/list.h> |
8 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
9 | #include <linux/prio_tree.h> | ||
10 | #include <linux/rbtree.h> | 9 | #include <linux/rbtree.h> |
11 | #include <linux/rwsem.h> | 10 | #include <linux/rwsem.h> |
12 | #include <linux/completion.h> | 11 | #include <linux/completion.h> |
@@ -240,18 +239,15 @@ struct vm_area_struct { | |||
240 | 239 | ||
241 | /* | 240 | /* |
242 | * For areas with an address space and backing store, | 241 | * For areas with an address space and backing store, |
243 | * linkage into the address_space->i_mmap prio tree, or | 242 | * linkage into the address_space->i_mmap interval tree, or |
244 | * linkage to the list of like vmas hanging off its node, or | ||
245 | * linkage of vma in the address_space->i_mmap_nonlinear list. | 243 | * linkage of vma in the address_space->i_mmap_nonlinear list. |
246 | */ | 244 | */ |
247 | union { | 245 | union { |
248 | struct { | 246 | struct { |
249 | struct list_head list; | 247 | struct rb_node rb; |
250 | void *parent; /* aligns with prio_tree_node parent */ | 248 | unsigned long rb_subtree_last; |
251 | struct vm_area_struct *head; | 249 | } linear; |
252 | } vm_set; | 250 | struct list_head nonlinear; |
253 | |||
254 | struct raw_prio_tree_node prio_tree_node; | ||
255 | } shared; | 251 | } shared; |
256 | 252 | ||
257 | /* | 253 | /* |
@@ -349,7 +345,6 @@ struct mm_struct { | |||
349 | unsigned long shared_vm; /* Shared pages (files) */ | 345 | unsigned long shared_vm; /* Shared pages (files) */ |
350 | unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ | 346 | unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ |
351 | unsigned long stack_vm; /* VM_GROWSUP/DOWN */ | 347 | unsigned long stack_vm; /* VM_GROWSUP/DOWN */ |
352 | unsigned long reserved_vm; /* VM_RESERVED|VM_IO pages */ | ||
353 | unsigned long def_flags; | 348 | unsigned long def_flags; |
354 | unsigned long nr_ptes; /* Page table pages */ | 349 | unsigned long nr_ptes; /* Page table pages */ |
355 | unsigned long start_code, end_code, start_data, end_data; | 350 | unsigned long start_code, end_code, start_data, end_data; |
@@ -394,7 +389,6 @@ struct mm_struct { | |||
394 | 389 | ||
395 | /* store ref to file /proc/<pid>/exe symlink points to */ | 390 | /* store ref to file /proc/<pid>/exe symlink points to */ |
396 | struct file *exe_file; | 391 | struct file *exe_file; |
397 | unsigned long num_exe_file_vmas; | ||
398 | #ifdef CONFIG_MMU_NOTIFIER | 392 | #ifdef CONFIG_MMU_NOTIFIER |
399 | struct mmu_notifier_mm *mmu_notifier_mm; | 393 | struct mmu_notifier_mm *mmu_notifier_mm; |
400 | #endif | 394 | #endif |
diff --git a/include/linux/mman.h b/include/linux/mman.h index 8b74e9b1d0ad..77cec2f45cb7 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h | |||
@@ -86,7 +86,6 @@ calc_vm_flag_bits(unsigned long flags) | |||
86 | { | 86 | { |
87 | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | | 87 | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | |
88 | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | | 88 | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | |
89 | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | | ||
90 | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); | 89 | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); |
91 | } | 90 | } |
92 | #endif /* __KERNEL__ */ | 91 | #endif /* __KERNEL__ */ |
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 1d1b1e13f79f..bc823c4c028b 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/list.h> | 4 | #include <linux/list.h> |
5 | #include <linux/spinlock.h> | 5 | #include <linux/spinlock.h> |
6 | #include <linux/mm_types.h> | 6 | #include <linux/mm_types.h> |
7 | #include <linux/srcu.h> | ||
7 | 8 | ||
8 | struct mmu_notifier; | 9 | struct mmu_notifier; |
9 | struct mmu_notifier_ops; | 10 | struct mmu_notifier_ops; |
@@ -245,50 +246,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
245 | __mmu_notifier_mm_destroy(mm); | 246 | __mmu_notifier_mm_destroy(mm); |
246 | } | 247 | } |
247 | 248 | ||
248 | /* | ||
249 | * These two macros will sometime replace ptep_clear_flush. | ||
250 | * ptep_clear_flush is implemented as macro itself, so this also is | ||
251 | * implemented as a macro until ptep_clear_flush is converted to an | ||
252 | * inline function, to diminish the risk of compilation failure. The | ||
253 | * invalidate_page method over time can be moved outside the PT lock | ||
254 | * and these two macros can be later removed. | ||
255 | */ | ||
256 | #define ptep_clear_flush_notify(__vma, __address, __ptep) \ | ||
257 | ({ \ | ||
258 | pte_t __pte; \ | ||
259 | struct vm_area_struct *___vma = __vma; \ | ||
260 | unsigned long ___address = __address; \ | ||
261 | __pte = ptep_clear_flush(___vma, ___address, __ptep); \ | ||
262 | mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \ | ||
263 | __pte; \ | ||
264 | }) | ||
265 | |||
266 | #define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ | ||
267 | ({ \ | ||
268 | pmd_t __pmd; \ | ||
269 | struct vm_area_struct *___vma = __vma; \ | ||
270 | unsigned long ___address = __address; \ | ||
271 | VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ | ||
272 | mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ | ||
273 | (__address)+HPAGE_PMD_SIZE);\ | ||
274 | __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ | ||
275 | mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ | ||
276 | (__address)+HPAGE_PMD_SIZE); \ | ||
277 | __pmd; \ | ||
278 | }) | ||
279 | |||
280 | #define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ | ||
281 | ({ \ | ||
282 | struct vm_area_struct *___vma = __vma; \ | ||
283 | unsigned long ___address = __address; \ | ||
284 | VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ | ||
285 | mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ | ||
286 | (__address)+HPAGE_PMD_SIZE);\ | ||
287 | pmdp_splitting_flush(___vma, ___address, __pmdp); \ | ||
288 | mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ | ||
289 | (__address)+HPAGE_PMD_SIZE); \ | ||
290 | }) | ||
291 | |||
292 | #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ | 249 | #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ |
293 | ({ \ | 250 | ({ \ |
294 | int __young; \ | 251 | int __young; \ |
@@ -311,14 +268,24 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
311 | __young; \ | 268 | __young; \ |
312 | }) | 269 | }) |
313 | 270 | ||
271 | /* | ||
272 | * set_pte_at_notify() sets the pte _after_ running the notifier. | ||
273 | * This is safe to start by updating the secondary MMUs, because the primary MMU | ||
274 | * pte invalidate must have already happened with a ptep_clear_flush() before | ||
275 | * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is | ||
276 | * required when we change both the protection of the mapping from read-only to | ||
277 | * read-write and the pfn (like during copy on write page faults). Otherwise the | ||
278 | * old page would remain mapped readonly in the secondary MMUs after the new | ||
279 | * page is already writable by some CPU through the primary MMU. | ||
280 | */ | ||
314 | #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ | 281 | #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ |
315 | ({ \ | 282 | ({ \ |
316 | struct mm_struct *___mm = __mm; \ | 283 | struct mm_struct *___mm = __mm; \ |
317 | unsigned long ___address = __address; \ | 284 | unsigned long ___address = __address; \ |
318 | pte_t ___pte = __pte; \ | 285 | pte_t ___pte = __pte; \ |
319 | \ | 286 | \ |
320 | set_pte_at(___mm, ___address, __ptep, ___pte); \ | ||
321 | mmu_notifier_change_pte(___mm, ___address, ___pte); \ | 287 | mmu_notifier_change_pte(___mm, ___address, ___pte); \ |
288 | set_pte_at(___mm, ___address, __ptep, ___pte); \ | ||
322 | }) | 289 | }) |
323 | 290 | ||
324 | #else /* CONFIG_MMU_NOTIFIER */ | 291 | #else /* CONFIG_MMU_NOTIFIER */ |
@@ -369,9 +336,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
369 | 336 | ||
370 | #define ptep_clear_flush_young_notify ptep_clear_flush_young | 337 | #define ptep_clear_flush_young_notify ptep_clear_flush_young |
371 | #define pmdp_clear_flush_young_notify pmdp_clear_flush_young | 338 | #define pmdp_clear_flush_young_notify pmdp_clear_flush_young |
372 | #define ptep_clear_flush_notify ptep_clear_flush | ||
373 | #define pmdp_clear_flush_notify pmdp_clear_flush | ||
374 | #define pmdp_splitting_flush_notify pmdp_splitting_flush | ||
375 | #define set_pte_at_notify set_pte_at | 339 | #define set_pte_at_notify set_pte_at |
376 | 340 | ||
377 | #endif /* CONFIG_MMU_NOTIFIER */ | 341 | #endif /* CONFIG_MMU_NOTIFIER */ |
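
The swapped order inside set_pte_at_notify() is easiest to see from the copy-on-write path; a simplified sketch of the caller's sequence (not the exact do_wp_page() code):

-----------------------------------------------------------------------
	/* 1. zap the primary MMU pte, so no CPU can write the old page */
	ptep_clear_flush(vma, address, page_table);
	/* 2. the notifier retargets secondary MMUs to the new page, and
	 *    only then is the new pte installed in the primary MMU */
	set_pte_at_notify(mm, address, page_table, entry);
-----------------------------------------------------------------------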
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2daa54f55db7..50aaca81f63d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -142,6 +142,7 @@ enum zone_stat_item { | |||
142 | NUMA_OTHER, /* allocation from other node */ | 142 | NUMA_OTHER, /* allocation from other node */ |
143 | #endif | 143 | #endif |
144 | NR_ANON_TRANSPARENT_HUGEPAGES, | 144 | NR_ANON_TRANSPARENT_HUGEPAGES, |
145 | NR_FREE_CMA_PAGES, | ||
145 | NR_VM_ZONE_STAT_ITEMS }; | 146 | NR_VM_ZONE_STAT_ITEMS }; |
146 | 147 | ||
147 | /* | 148 | /* |
@@ -217,6 +218,8 @@ struct lruvec { | |||
217 | #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) | 218 | #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) |
218 | /* Isolate for asynchronous migration */ | 219 | /* Isolate for asynchronous migration */ |
219 | #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) | 220 | #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) |
221 | /* Isolate unevictable pages */ | ||
222 | #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) | ||
220 | 223 | ||
221 | /* LRU Isolation modes. */ | 224 | /* LRU Isolation modes. */ |
222 | typedef unsigned __bitwise__ isolate_mode_t; | 225 | typedef unsigned __bitwise__ isolate_mode_t; |
@@ -369,8 +372,12 @@ struct zone { | |||
369 | spinlock_t lock; | 372 | spinlock_t lock; |
370 | int all_unreclaimable; /* All pages pinned */ | 373 | int all_unreclaimable; /* All pages pinned */ |
371 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 374 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
372 | /* pfn where the last incremental compaction isolated free pages */ | 375 | /* Set to true when the PG_migrate_skip bits should be cleared */ |
376 | bool compact_blockskip_flush; | ||
377 | |||
378 | /* pfns where compaction scanners should start */ | ||
373 | unsigned long compact_cached_free_pfn; | 379 | unsigned long compact_cached_free_pfn; |
380 | unsigned long compact_cached_migrate_pfn; | ||
374 | #endif | 381 | #endif |
375 | #ifdef CONFIG_MEMORY_HOTPLUG | 382 | #ifdef CONFIG_MEMORY_HOTPLUG |
376 | /* see spanned/present_pages for more description */ | 383 | /* see spanned/present_pages for more description */ |
@@ -704,6 +711,7 @@ typedef struct pglist_data { | |||
704 | unsigned long node_spanned_pages; /* total size of physical page | 711 | unsigned long node_spanned_pages; /* total size of physical page |
705 | range, including holes */ | 712 | range, including holes */ |
706 | int node_id; | 713 | int node_id; |
714 | nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */ | ||
707 | wait_queue_head_t kswapd_wait; | 715 | wait_queue_head_t kswapd_wait; |
708 | wait_queue_head_t pfmemalloc_wait; | 716 | wait_queue_head_t pfmemalloc_wait; |
709 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ | 717 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ |
diff --git a/include/linux/oom.h b/include/linux/oom.h index 49a3031fda50..d36a8221f58b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -2,17 +2,6 @@ | |||
2 | #define __INCLUDE_LINUX_OOM_H | 2 | #define __INCLUDE_LINUX_OOM_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * /proc/<pid>/oom_adj is deprecated, see | ||
6 | * Documentation/feature-removal-schedule.txt. | ||
7 | * | ||
8 | * /proc/<pid>/oom_adj set to -17 protects from the oom-killer | ||
9 | */ | ||
10 | #define OOM_DISABLE (-17) | ||
11 | /* inclusive */ | ||
12 | #define OOM_ADJUST_MIN (-16) | ||
13 | #define OOM_ADJUST_MAX 15 | ||
14 | |||
15 | /* | ||
16 | * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for | 5 | * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for |
17 | * pid. | 6 | * pid. |
18 | */ | 7 | */ |
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 105077aa7685..76a9539cfd3f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h | |||
@@ -6,6 +6,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count); | |||
6 | void set_pageblock_migratetype(struct page *page, int migratetype); | 6 | void set_pageblock_migratetype(struct page *page, int migratetype); |
7 | int move_freepages_block(struct zone *zone, struct page *page, | 7 | int move_freepages_block(struct zone *zone, struct page *page, |
8 | int migratetype); | 8 | int migratetype); |
9 | int move_freepages(struct zone *zone, | ||
10 | struct page *start_page, struct page *end_page, | ||
11 | int migratetype); | ||
12 | |||
9 | /* | 13 | /* |
10 | * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. | 14 | * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. |
11 | * If specified range includes migrate types other than MOVABLE or CMA, | 15 | * If specified range includes migrate types other than MOVABLE or CMA, |
@@ -37,6 +41,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); | |||
37 | */ | 41 | */ |
38 | int set_migratetype_isolate(struct page *page); | 42 | int set_migratetype_isolate(struct page *page); |
39 | void unset_migratetype_isolate(struct page *page, unsigned migratetype); | 43 | void unset_migratetype_isolate(struct page *page, unsigned migratetype); |
40 | 44 | struct page *alloc_migrate_target(struct page *page, unsigned long private, | |
45 | int **resultp); | ||
41 | 46 | ||
42 | #endif | 47 | #endif |
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 19ef95d293ae..eed27f4f4c3e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h | |||
@@ -30,6 +30,9 @@ enum pageblock_bits { | |||
30 | PB_migrate, | 30 | PB_migrate, |
31 | PB_migrate_end = PB_migrate + 3 - 1, | 31 | PB_migrate_end = PB_migrate + 3 - 1, |
32 | /* 3 bits required for migrate types */ | 32 | /* 3 bits required for migrate types */ |
33 | #ifdef CONFIG_COMPACTION | ||
34 | PB_migrate_skip,/* If set the block is skipped by compaction */ | ||
35 | #endif /* CONFIG_COMPACTION */ | ||
33 | NR_PAGEBLOCK_BITS | 36 | NR_PAGEBLOCK_BITS |
34 | }; | 37 | }; |
35 | 38 | ||
@@ -65,10 +68,22 @@ unsigned long get_pageblock_flags_group(struct page *page, | |||
65 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | 68 | void set_pageblock_flags_group(struct page *page, unsigned long flags, |
66 | int start_bitidx, int end_bitidx); | 69 | int start_bitidx, int end_bitidx); |
67 | 70 | ||
71 | #ifdef CONFIG_COMPACTION | ||
72 | #define get_pageblock_skip(page) \ | ||
73 | get_pageblock_flags_group(page, PB_migrate_skip, \ | ||
74 | PB_migrate_skip + 1) | ||
75 | #define clear_pageblock_skip(page) \ | ||
76 | set_pageblock_flags_group(page, 0, PB_migrate_skip, \ | ||
77 | PB_migrate_skip + 1) | ||
78 | #define set_pageblock_skip(page) \ | ||
79 | set_pageblock_flags_group(page, 1, PB_migrate_skip, \ | ||
80 | PB_migrate_skip + 1) | ||
81 | #endif /* CONFIG_COMPACTION */ | ||
82 | |||
68 | #define get_pageblock_flags(page) \ | 83 | #define get_pageblock_flags(page) \ |
69 | get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) | 84 | get_pageblock_flags_group(page, 0, PB_migrate_end) |
70 | #define set_pageblock_flags(page, flags) \ | 85 | #define set_pageblock_flags(page, flags) \ |
71 | set_pageblock_flags_group(page, flags, \ | 86 | set_pageblock_flags_group(page, flags, \ |
72 | 0, NR_PAGEBLOCK_BITS-1) | 87 | 0, PB_migrate_end) |
73 | 88 | ||
74 | #endif /* PAGEBLOCK_FLAGS_H */ | 89 | #endif /* PAGEBLOCK_FLAGS_H */ |
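
The new PB_migrate_skip bit lets the compaction scanners remember pageblocks that recently yielded nothing; a rough illustration of the intended check (the real logic lives in mm/compaction.c, this helper name is made up):

-----------------------------------------------------------------------
static bool pageblock_worth_scanning(struct page *page)
{
	/* skip blocks marked unprofitable until compact_blockskip_flush
	 * causes the bits to be cleared again */
	return !get_pageblock_skip(page);
}
-----------------------------------------------------------------------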
diff --git a/include/linux/prio_tree.h b/include/linux/prio_tree.h deleted file mode 100644 index db04abb557e0..000000000000 --- a/include/linux/prio_tree.h +++ /dev/null | |||
@@ -1,120 +0,0 @@ | |||
1 | #ifndef _LINUX_PRIO_TREE_H | ||
2 | #define _LINUX_PRIO_TREE_H | ||
3 | |||
4 | /* | ||
5 | * K&R 2nd ed. A8.3 somewhat obliquely hints that initial sequences of struct | ||
6 | * fields with identical types should end up at the same location. We'll use | ||
7 | * this until we can scrap struct raw_prio_tree_node. | ||
8 | * | ||
9 | * Note: all this could be done more elegantly by using unnamed union/struct | ||
10 | * fields. However, gcc 2.95.3 and apparently also gcc 3.0.4 don't support this | ||
11 | * language extension. | ||
12 | */ | ||
13 | |||
14 | struct raw_prio_tree_node { | ||
15 | struct prio_tree_node *left; | ||
16 | struct prio_tree_node *right; | ||
17 | struct prio_tree_node *parent; | ||
18 | }; | ||
19 | |||
20 | struct prio_tree_node { | ||
21 | struct prio_tree_node *left; | ||
22 | struct prio_tree_node *right; | ||
23 | struct prio_tree_node *parent; | ||
24 | unsigned long start; | ||
25 | unsigned long last; /* last location _in_ interval */ | ||
26 | }; | ||
27 | |||
28 | struct prio_tree_root { | ||
29 | struct prio_tree_node *prio_tree_node; | ||
30 | unsigned short index_bits; | ||
31 | unsigned short raw; | ||
32 | /* | ||
33 | * 0: nodes are of type struct prio_tree_node | ||
34 | * 1: nodes are of type raw_prio_tree_node | ||
35 | */ | ||
36 | }; | ||
37 | |||
38 | struct prio_tree_iter { | ||
39 | struct prio_tree_node *cur; | ||
40 | unsigned long mask; | ||
41 | unsigned long value; | ||
42 | int size_level; | ||
43 | |||
44 | struct prio_tree_root *root; | ||
45 | pgoff_t r_index; | ||
46 | pgoff_t h_index; | ||
47 | }; | ||
48 | |||
49 | static inline void prio_tree_iter_init(struct prio_tree_iter *iter, | ||
50 | struct prio_tree_root *root, pgoff_t r_index, pgoff_t h_index) | ||
51 | { | ||
52 | iter->root = root; | ||
53 | iter->r_index = r_index; | ||
54 | iter->h_index = h_index; | ||
55 | iter->cur = NULL; | ||
56 | } | ||
57 | |||
58 | #define __INIT_PRIO_TREE_ROOT(ptr, _raw) \ | ||
59 | do { \ | ||
60 | (ptr)->prio_tree_node = NULL; \ | ||
61 | (ptr)->index_bits = 1; \ | ||
62 | (ptr)->raw = (_raw); \ | ||
63 | } while (0) | ||
64 | |||
65 | #define INIT_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 0) | ||
66 | #define INIT_RAW_PRIO_TREE_ROOT(ptr) __INIT_PRIO_TREE_ROOT(ptr, 1) | ||
67 | |||
68 | #define INIT_PRIO_TREE_NODE(ptr) \ | ||
69 | do { \ | ||
70 | (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \ | ||
71 | } while (0) | ||
72 | |||
73 | #define INIT_PRIO_TREE_ITER(ptr) \ | ||
74 | do { \ | ||
75 | (ptr)->cur = NULL; \ | ||
76 | (ptr)->mask = 0UL; \ | ||
77 | (ptr)->value = 0UL; \ | ||
78 | (ptr)->size_level = 0; \ | ||
79 | } while (0) | ||
80 | |||
81 | #define prio_tree_entry(ptr, type, member) \ | ||
82 | ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) | ||
83 | |||
84 | static inline int prio_tree_empty(const struct prio_tree_root *root) | ||
85 | { | ||
86 | return root->prio_tree_node == NULL; | ||
87 | } | ||
88 | |||
89 | static inline int prio_tree_root(const struct prio_tree_node *node) | ||
90 | { | ||
91 | return node->parent == node; | ||
92 | } | ||
93 | |||
94 | static inline int prio_tree_left_empty(const struct prio_tree_node *node) | ||
95 | { | ||
96 | return node->left == node; | ||
97 | } | ||
98 | |||
99 | static inline int prio_tree_right_empty(const struct prio_tree_node *node) | ||
100 | { | ||
101 | return node->right == node; | ||
102 | } | ||
103 | |||
104 | |||
105 | struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, | ||
106 | struct prio_tree_node *old, struct prio_tree_node *node); | ||
107 | struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, | ||
108 | struct prio_tree_node *node); | ||
109 | void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node); | ||
110 | struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter); | ||
111 | |||
112 | #define raw_prio_tree_replace(root, old, node) \ | ||
113 | prio_tree_replace(root, (struct prio_tree_node *) (old), \ | ||
114 | (struct prio_tree_node *) (node)) | ||
115 | #define raw_prio_tree_insert(root, node) \ | ||
116 | prio_tree_insert(root, (struct prio_tree_node *) (node)) | ||
117 | #define raw_prio_tree_remove(root, node) \ | ||
118 | prio_tree_remove(root, (struct prio_tree_node *) (node)) | ||
119 | |||
120 | #endif /* _LINUX_PRIO_TREE_H */ | ||
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 033b507b33b1..0022c1bb1e26 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h | |||
@@ -23,72 +23,7 @@ | |||
23 | I know it's not the cleaner way, but in C (not in C++) to get | 23 | I know it's not the cleaner way, but in C (not in C++) to get |
24 | performances and genericity... | 24 | performances and genericity... |
25 | 25 | ||
26 | Some example of insert and search follows here. The search is a plain | 26 | See Documentation/rbtree.txt for documentation and samples. |
27 | normal search over an ordered tree. The insert instead must be implemented | ||
28 | in two steps: First, the code must insert the element in order as a red leaf | ||
29 | in the tree, and then the support library function rb_insert_color() must | ||
30 | be called. Such function will do the not trivial work to rebalance the | ||
31 | rbtree, if necessary. | ||
32 | |||
33 | ----------------------------------------------------------------------- | ||
34 | static inline struct page * rb_search_page_cache(struct inode * inode, | ||
35 | unsigned long offset) | ||
36 | { | ||
37 | struct rb_node * n = inode->i_rb_page_cache.rb_node; | ||
38 | struct page * page; | ||
39 | |||
40 | while (n) | ||
41 | { | ||
42 | page = rb_entry(n, struct page, rb_page_cache); | ||
43 | |||
44 | if (offset < page->offset) | ||
45 | n = n->rb_left; | ||
46 | else if (offset > page->offset) | ||
47 | n = n->rb_right; | ||
48 | else | ||
49 | return page; | ||
50 | } | ||
51 | return NULL; | ||
52 | } | ||
53 | |||
54 | static inline struct page * __rb_insert_page_cache(struct inode * inode, | ||
55 | unsigned long offset, | ||
56 | struct rb_node * node) | ||
57 | { | ||
58 | struct rb_node ** p = &inode->i_rb_page_cache.rb_node; | ||
59 | struct rb_node * parent = NULL; | ||
60 | struct page * page; | ||
61 | |||
62 | while (*p) | ||
63 | { | ||
64 | parent = *p; | ||
65 | page = rb_entry(parent, struct page, rb_page_cache); | ||
66 | |||
67 | if (offset < page->offset) | ||
68 | p = &(*p)->rb_left; | ||
69 | else if (offset > page->offset) | ||
70 | p = &(*p)->rb_right; | ||
71 | else | ||
72 | return page; | ||
73 | } | ||
74 | |||
75 | rb_link_node(node, parent, p); | ||
76 | |||
77 | return NULL; | ||
78 | } | ||
79 | |||
80 | static inline struct page * rb_insert_page_cache(struct inode * inode, | ||
81 | unsigned long offset, | ||
82 | struct rb_node * node) | ||
83 | { | ||
84 | struct page * ret; | ||
85 | if ((ret = __rb_insert_page_cache(inode, offset, node))) | ||
86 | goto out; | ||
87 | rb_insert_color(node, &inode->i_rb_page_cache); | ||
88 | out: | ||
89 | return ret; | ||
90 | } | ||
91 | ----------------------------------------------------------------------- | ||
92 | */ | 27 | */ |
93 | 28 | ||
94 | #ifndef _LINUX_RBTREE_H | 29 | #ifndef _LINUX_RBTREE_H |
@@ -97,63 +32,35 @@ static inline struct page * rb_insert_page_cache(struct inode * inode, | |||
97 | #include <linux/kernel.h> | 32 | #include <linux/kernel.h> |
98 | #include <linux/stddef.h> | 33 | #include <linux/stddef.h> |
99 | 34 | ||
100 | struct rb_node | 35 | struct rb_node { |
101 | { | 36 | unsigned long __rb_parent_color; |
102 | unsigned long rb_parent_color; | ||
103 | #define RB_RED 0 | ||
104 | #define RB_BLACK 1 | ||
105 | struct rb_node *rb_right; | 37 | struct rb_node *rb_right; |
106 | struct rb_node *rb_left; | 38 | struct rb_node *rb_left; |
107 | } __attribute__((aligned(sizeof(long)))); | 39 | } __attribute__((aligned(sizeof(long)))); |
108 | /* The alignment might seem pointless, but allegedly CRIS needs it */ | 40 | /* The alignment might seem pointless, but allegedly CRIS needs it */ |
109 | 41 | ||
110 | struct rb_root | 42 | struct rb_root { |
111 | { | ||
112 | struct rb_node *rb_node; | 43 | struct rb_node *rb_node; |
113 | }; | 44 | }; |
114 | 45 | ||
115 | 46 | ||
116 | #define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) | 47 | #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) |
117 | #define rb_color(r) ((r)->rb_parent_color & 1) | ||
118 | #define rb_is_red(r) (!rb_color(r)) | ||
119 | #define rb_is_black(r) rb_color(r) | ||
120 | #define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) | ||
121 | #define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) | ||
122 | |||
123 | static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) | ||
124 | { | ||
125 | rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; | ||
126 | } | ||
127 | static inline void rb_set_color(struct rb_node *rb, int color) | ||
128 | { | ||
129 | rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; | ||
130 | } | ||
131 | 48 | ||
132 | #define RB_ROOT (struct rb_root) { NULL, } | 49 | #define RB_ROOT (struct rb_root) { NULL, } |
133 | #define rb_entry(ptr, type, member) container_of(ptr, type, member) | 50 | #define rb_entry(ptr, type, member) container_of(ptr, type, member) |
134 | 51 | ||
135 | #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) | 52 | #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) |
136 | #define RB_EMPTY_NODE(node) (rb_parent(node) == node) | 53 | |
137 | #define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) | 54 | /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ |
55 | #define RB_EMPTY_NODE(node) \ | ||
56 | ((node)->__rb_parent_color == (unsigned long)(node)) | ||
57 | #define RB_CLEAR_NODE(node) \ | ||
58 | ((node)->__rb_parent_color = (unsigned long)(node)) | ||
138 | 59 | ||
139 | static inline void rb_init_node(struct rb_node *rb) | ||
140 | { | ||
141 | rb->rb_parent_color = 0; | ||
142 | rb->rb_right = NULL; | ||
143 | rb->rb_left = NULL; | ||
144 | RB_CLEAR_NODE(rb); | ||
145 | } | ||
146 | 60 | ||
147 | extern void rb_insert_color(struct rb_node *, struct rb_root *); | 61 | extern void rb_insert_color(struct rb_node *, struct rb_root *); |
148 | extern void rb_erase(struct rb_node *, struct rb_root *); | 62 | extern void rb_erase(struct rb_node *, struct rb_root *); |
149 | 63 | ||
150 | typedef void (*rb_augment_f)(struct rb_node *node, void *data); | ||
151 | |||
152 | extern void rb_augment_insert(struct rb_node *node, | ||
153 | rb_augment_f func, void *data); | ||
154 | extern struct rb_node *rb_augment_erase_begin(struct rb_node *node); | ||
155 | extern void rb_augment_erase_end(struct rb_node *node, | ||
156 | rb_augment_f func, void *data); | ||
157 | 64 | ||
158 | /* Find logical next and previous nodes in a tree */ | 65 | /* Find logical next and previous nodes in a tree */ |
159 | extern struct rb_node *rb_next(const struct rb_node *); | 66 | extern struct rb_node *rb_next(const struct rb_node *); |
@@ -168,7 +75,7 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, | |||
168 | static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, | 75 | static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, |
169 | struct rb_node ** rb_link) | 76 | struct rb_node ** rb_link) |
170 | { | 77 | { |
171 | node->rb_parent_color = (unsigned long )parent; | 78 | node->__rb_parent_color = (unsigned long)parent; |
172 | node->rb_left = node->rb_right = NULL; | 79 | node->rb_left = node->rb_right = NULL; |
173 | 80 | ||
174 | *rb_link = node; | 81 | *rb_link = node; |
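
The insert/search sample removed above now lives in Documentation/rbtree.txt; for reference, the canonical insert pattern it documents boils down to the following (struct thing and its key are illustrative):

-----------------------------------------------------------------------
struct thing {
	struct rb_node node;
	unsigned long key;
};

static void thing_insert(struct rb_root *root, struct thing *item)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;

	while (*new) {
		struct thing *this = rb_entry(*new, struct thing, node);

		parent = *new;
		if (item->key < this->key)
			new = &(*new)->rb_left;
		else
			new = &(*new)->rb_right;
	}
	rb_link_node(&item->node, parent, new);	/* add as a red leaf ... */
	rb_insert_color(&item->node, root);	/* ... then rebalance */
}
-----------------------------------------------------------------------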
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h new file mode 100644 index 000000000000..214caa33433b --- /dev/null +++ b/include/linux/rbtree_augmented.h | |||
@@ -0,0 +1,223 @@ | |||
1 | /* | ||
2 | Red Black Trees | ||
3 | (C) 1999 Andrea Arcangeli <andrea@suse.de> | ||
4 | (C) 2002 David Woodhouse <dwmw2@infradead.org> | ||
5 | (C) 2012 Michel Lespinasse <walken@google.com> | ||
6 | |||
7 | This program is free software; you can redistribute it and/or modify | ||
8 | it under the terms of the GNU General Public License as published by | ||
9 | the Free Software Foundation; either version 2 of the License, or | ||
10 | (at your option) any later version. | ||
11 | |||
12 | This program is distributed in the hope that it will be useful, | ||
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | GNU General Public License for more details. | ||
16 | |||
17 | You should have received a copy of the GNU General Public License | ||
18 | along with this program; if not, write to the Free Software | ||
19 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
20 | |||
21 | linux/include/linux/rbtree_augmented.h | ||
22 | */ | ||
23 | |||
24 | #ifndef _LINUX_RBTREE_AUGMENTED_H | ||
25 | #define _LINUX_RBTREE_AUGMENTED_H | ||
26 | |||
27 | #include <linux/rbtree.h> | ||
28 | |||
29 | /* | ||
30 | * Please note - only struct rb_augment_callbacks and the prototypes for | ||
31 | * rb_insert_augmented() and rb_erase_augmented() are intended to be public. | ||
32 | * The rest are implementation details you are not expected to depend on. | ||
33 | * | ||
34 | * See Documentation/rbtree.txt for documentation and samples. | ||
35 | */ | ||
36 | |||
37 | struct rb_augment_callbacks { | ||
38 | void (*propagate)(struct rb_node *node, struct rb_node *stop); | ||
39 | void (*copy)(struct rb_node *old, struct rb_node *new); | ||
40 | void (*rotate)(struct rb_node *old, struct rb_node *new); | ||
41 | }; | ||
42 | |||
43 | extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, | ||
44 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); | ||
45 | static inline void | ||
46 | rb_insert_augmented(struct rb_node *node, struct rb_root *root, | ||
47 | const struct rb_augment_callbacks *augment) | ||
48 | { | ||
49 | __rb_insert_augmented(node, root, augment->rotate); | ||
50 | } | ||
51 | |||
52 | #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ | ||
53 | rbtype, rbaugmented, rbcompute) \ | ||
54 | static inline void \ | ||
55 | rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ | ||
56 | { \ | ||
57 | while (rb != stop) { \ | ||
58 | rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ | ||
59 | rbtype augmented = rbcompute(node); \ | ||
60 | if (node->rbaugmented == augmented) \ | ||
61 | break; \ | ||
62 | node->rbaugmented = augmented; \ | ||
63 | rb = rb_parent(&node->rbfield); \ | ||
64 | } \ | ||
65 | } \ | ||
66 | static inline void \ | ||
67 | rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ | ||
68 | { \ | ||
69 | rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ | ||
70 | rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ | ||
71 | new->rbaugmented = old->rbaugmented; \ | ||
72 | } \ | ||
73 | static void \ | ||
74 | rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ | ||
75 | { \ | ||
76 | rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ | ||
77 | rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ | ||
78 | new->rbaugmented = old->rbaugmented; \ | ||
79 | old->rbaugmented = rbcompute(old); \ | ||
80 | } \ | ||
81 | rbstatic const struct rb_augment_callbacks rbname = { \ | ||
82 | rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ | ||
83 | }; | ||
84 | |||
85 | |||
86 | #define RB_RED 0 | ||
87 | #define RB_BLACK 1 | ||
88 | |||
89 | #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) | ||
90 | |||
91 | #define __rb_color(pc) ((pc) & 1) | ||
92 | #define __rb_is_black(pc) __rb_color(pc) | ||
93 | #define __rb_is_red(pc) (!__rb_color(pc)) | ||
94 | #define rb_color(rb) __rb_color((rb)->__rb_parent_color) | ||
95 | #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) | ||
96 | #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) | ||
97 | |||
98 | static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) | ||
99 | { | ||
100 | rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; | ||
101 | } | ||
102 | |||
103 | static inline void rb_set_parent_color(struct rb_node *rb, | ||
104 | struct rb_node *p, int color) | ||
105 | { | ||
106 | rb->__rb_parent_color = (unsigned long)p | color; | ||
107 | } | ||
108 | |||
109 | static inline void | ||
110 | __rb_change_child(struct rb_node *old, struct rb_node *new, | ||
111 | struct rb_node *parent, struct rb_root *root) | ||
112 | { | ||
113 | if (parent) { | ||
114 | if (parent->rb_left == old) | ||
115 | parent->rb_left = new; | ||
116 | else | ||
117 | parent->rb_right = new; | ||
118 | } else | ||
119 | root->rb_node = new; | ||
120 | } | ||
121 | |||
122 | extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, | ||
123 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); | ||
124 | |||
125 | static __always_inline void | ||
126 | rb_erase_augmented(struct rb_node *node, struct rb_root *root, | ||
127 | const struct rb_augment_callbacks *augment) | ||
128 | { | ||
129 | struct rb_node *child = node->rb_right, *tmp = node->rb_left; | ||
130 | struct rb_node *parent, *rebalance; | ||
131 | unsigned long pc; | ||
132 | |||
133 | if (!tmp) { | ||
134 | /* | ||
135 | * Case 1: node to erase has no more than 1 child (easy!) | ||
136 | * | ||
137 | * Note that if there is one child it must be red due to 5) | ||
138 | * and node must be black due to 4). We adjust colors locally | ||
139 | * so as to bypass __rb_erase_color() later on. | ||
140 | */ | ||
141 | pc = node->__rb_parent_color; | ||
142 | parent = __rb_parent(pc); | ||
143 | __rb_change_child(node, child, parent, root); | ||
144 | if (child) { | ||
145 | child->__rb_parent_color = pc; | ||
146 | rebalance = NULL; | ||
147 | } else | ||
148 | rebalance = __rb_is_black(pc) ? parent : NULL; | ||
149 | tmp = parent; | ||
150 | } else if (!child) { | ||
151 | /* Still case 1, but this time the child is node->rb_left */ | ||
152 | tmp->__rb_parent_color = pc = node->__rb_parent_color; | ||
153 | parent = __rb_parent(pc); | ||
154 | __rb_change_child(node, tmp, parent, root); | ||
155 | rebalance = NULL; | ||
156 | tmp = parent; | ||
157 | } else { | ||
158 | struct rb_node *successor = child, *child2; | ||
159 | tmp = child->rb_left; | ||
160 | if (!tmp) { | ||
161 | /* | ||
162 | * Case 2: node's successor is its right child | ||
163 | * | ||
164 | * (n) (s) | ||
165 | * / \ / \ | ||
166 | * (x) (s) -> (x) (c) | ||
167 | * \ | ||
168 | * (c) | ||
169 | */ | ||
170 | parent = successor; | ||
171 | child2 = successor->rb_right; | ||
172 | augment->copy(node, successor); | ||
173 | } else { | ||
174 | /* | ||
175 | * Case 3: node's successor is leftmost under | ||
176 | * node's right child subtree | ||
177 | * | ||
178 | * (n) (s) | ||
179 | * / \ / \ | ||
180 | * (x) (y) -> (x) (y) | ||
181 | * / / | ||
182 | * (p) (p) | ||
183 | * / / | ||
184 | * (s) (c) | ||
185 | * \ | ||
186 | * (c) | ||
187 | */ | ||
188 | do { | ||
189 | parent = successor; | ||
190 | successor = tmp; | ||
191 | tmp = tmp->rb_left; | ||
192 | } while (tmp); | ||
193 | parent->rb_left = child2 = successor->rb_right; | ||
194 | successor->rb_right = child; | ||
195 | rb_set_parent(child, successor); | ||
196 | augment->copy(node, successor); | ||
197 | augment->propagate(parent, successor); | ||
198 | } | ||
199 | |||
200 | successor->rb_left = tmp = node->rb_left; | ||
201 | rb_set_parent(tmp, successor); | ||
202 | |||
203 | pc = node->__rb_parent_color; | ||
204 | tmp = __rb_parent(pc); | ||
205 | __rb_change_child(node, successor, tmp, root); | ||
206 | if (child2) { | ||
207 | successor->__rb_parent_color = pc; | ||
208 | rb_set_parent_color(child2, parent, RB_BLACK); | ||
209 | rebalance = NULL; | ||
210 | } else { | ||
211 | unsigned long pc2 = successor->__rb_parent_color; | ||
212 | successor->__rb_parent_color = pc; | ||
213 | rebalance = __rb_is_black(pc2) ? parent : NULL; | ||
214 | } | ||
215 | tmp = successor; | ||
216 | } | ||
217 | |||
218 | augment->propagate(tmp, NULL); | ||
219 | if (rebalance) | ||
220 | __rb_erase_color(rebalance, root, augment->rotate); | ||
221 | } | ||
222 | |||
223 | #endif /* _LINUX_RBTREE_AUGMENTED_H */ | ||
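
How RB_DECLARE_CALLBACKS() is meant to be used is clearer with a concrete, purely illustrative augmented node that tracks the maximum 'last' value in each subtree, much like the new interval tree code does:

-----------------------------------------------------------------------
struct itnode {				/* illustrative node type */
	struct rb_node rb;
	unsigned long start, last;
	unsigned long subtree_last;	/* max of 'last' in this subtree */
};

static inline unsigned long compute_subtree_last(struct itnode *n)
{
	unsigned long max = n->last, v;

	if (n->rb.rb_left) {
		v = rb_entry(n->rb.rb_left, struct itnode, rb)->subtree_last;
		if (v > max)
			max = v;
	}
	if (n->rb.rb_right) {
		v = rb_entry(n->rb.rb_right, struct itnode, rb)->subtree_last;
		if (v > max)
			max = v;
	}
	return max;
}

RB_DECLARE_CALLBACKS(static, itnode_cb, struct itnode, rb,
		     unsigned long, subtree_last, compute_subtree_last)

/* after rb_link_node(), call rb_insert_augmented(&n->rb, root, &itnode_cb)
 * instead of rb_insert_color(), and rb_erase_augmented() instead of
 * rb_erase(), so subtree_last stays correct across rotations. */
-----------------------------------------------------------------------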
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3fce545df394..bfe1f4780644 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -37,14 +37,14 @@ struct anon_vma { | |||
37 | atomic_t refcount; | 37 | atomic_t refcount; |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * NOTE: the LSB of the head.next is set by | 40 | * NOTE: the LSB of the rb_root.rb_node is set by |
41 | * mm_take_all_locks() _after_ taking the above lock. So the | 41 | * mm_take_all_locks() _after_ taking the above lock. So the |
42 | * head must only be read/written after taking the above lock | 42 | * rb_root must only be read/written after taking the above lock |
43 | * to be sure to see a valid next pointer. The LSB bit itself | 43 | * to be sure to see a valid next pointer. The LSB bit itself |
44 | * is serialized by a system wide lock only visible to | 44 | * is serialized by a system wide lock only visible to |
45 | * mm_take_all_locks() (mm_all_locks_mutex). | 45 | * mm_take_all_locks() (mm_all_locks_mutex). |
46 | */ | 46 | */ |
47 | struct list_head head; /* Chain of private "related" vmas */ | 47 | struct rb_root rb_root; /* Interval tree of private "related" vmas */ |
48 | }; | 48 | }; |
49 | 49 | ||
50 | /* | 50 | /* |
@@ -57,14 +57,29 @@ struct anon_vma { | |||
57 | * with a VMA, or the VMAs associated with an anon_vma. | 57 | * with a VMA, or the VMAs associated with an anon_vma. |
58 | * The "same_vma" list contains the anon_vma_chains linking | 58 | * The "same_vma" list contains the anon_vma_chains linking |
59 | * all the anon_vmas associated with this VMA. | 59 | * all the anon_vmas associated with this VMA. |
60 | * The "same_anon_vma" list contains the anon_vma_chains | 60 | * The "rb" field indexes on an interval tree the anon_vma_chains |
61 | * which link all the VMAs associated with this anon_vma. | 61 | * which link all the VMAs associated with this anon_vma. |
62 | */ | 62 | */ |
63 | struct anon_vma_chain { | 63 | struct anon_vma_chain { |
64 | struct vm_area_struct *vma; | 64 | struct vm_area_struct *vma; |
65 | struct anon_vma *anon_vma; | 65 | struct anon_vma *anon_vma; |
66 | struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ | 66 | struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ |
67 | struct list_head same_anon_vma; /* locked by anon_vma->mutex */ | 67 | struct rb_node rb; /* locked by anon_vma->mutex */ |
68 | unsigned long rb_subtree_last; | ||
69 | #ifdef CONFIG_DEBUG_VM_RB | ||
70 | unsigned long cached_vma_start, cached_vma_last; | ||
71 | #endif | ||
72 | }; | ||
73 | |||
74 | enum ttu_flags { | ||
75 | TTU_UNMAP = 0, /* unmap mode */ | ||
76 | TTU_MIGRATION = 1, /* migration mode */ | ||
77 | TTU_MUNLOCK = 2, /* munlock mode */ | ||
78 | TTU_ACTION_MASK = 0xff, | ||
79 | |||
80 | TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ | ||
81 | TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ | ||
82 | TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ | ||
68 | }; | 83 | }; |
69 | 84 | ||
70 | #ifdef CONFIG_MMU | 85 | #ifdef CONFIG_MMU |
@@ -120,7 +135,6 @@ void anon_vma_init(void); /* create anon_vma_cachep */ | |||
120 | int anon_vma_prepare(struct vm_area_struct *); | 135 | int anon_vma_prepare(struct vm_area_struct *); |
121 | void unlink_anon_vmas(struct vm_area_struct *); | 136 | void unlink_anon_vmas(struct vm_area_struct *); |
122 | int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); | 137 | int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); |
123 | void anon_vma_moveto_tail(struct vm_area_struct *); | ||
124 | int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); | 138 | int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); |
125 | 139 | ||
126 | static inline void anon_vma_merge(struct vm_area_struct *vma, | 140 | static inline void anon_vma_merge(struct vm_area_struct *vma, |
@@ -161,16 +175,6 @@ int page_referenced(struct page *, int is_locked, | |||
161 | int page_referenced_one(struct page *, struct vm_area_struct *, | 175 | int page_referenced_one(struct page *, struct vm_area_struct *, |
162 | unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); | 176 | unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); |
163 | 177 | ||
164 | enum ttu_flags { | ||
165 | TTU_UNMAP = 0, /* unmap mode */ | ||
166 | TTU_MIGRATION = 1, /* migration mode */ | ||
167 | TTU_MUNLOCK = 2, /* munlock mode */ | ||
168 | TTU_ACTION_MASK = 0xff, | ||
169 | |||
170 | TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ | ||
171 | TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ | ||
172 | TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ | ||
173 | }; | ||
174 | #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) | 178 | #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) |
175 | 179 | ||
176 | int try_to_unmap(struct page *, enum ttu_flags flags); | 180 | int try_to_unmap(struct page *, enum ttu_flags flags); |
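
With anon_vma->rb_root in place, reverse-map walkers find the vmas covering a page through the interval tree; roughly (simplified from mm/rmap.c, using the anon_vma_interval_tree_foreach() helper added elsewhere in this series):

-----------------------------------------------------------------------
	struct anon_vma_chain *avc;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		/* ... page_referenced_one(), try_to_unmap_one(), ... */
	}
-----------------------------------------------------------------------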
diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c5612f0374b..c2070e92a9d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -671,7 +671,6 @@ struct signal_struct { | |||
671 | struct rw_semaphore group_rwsem; | 671 | struct rw_semaphore group_rwsem; |
672 | #endif | 672 | #endif |
673 | 673 | ||
674 | int oom_adj; /* OOM kill score adjustment (bit shift) */ | ||
675 | int oom_score_adj; /* OOM kill score adjustment */ | 674 | int oom_score_adj; /* OOM kill score adjustment */ |
676 | int oom_score_adj_min; /* OOM kill score adjustment minimum value. | 675 | int oom_score_adj_min; /* OOM kill score adjustment minimum value. |
677 | * Only settable by CAP_SYS_RESOURCE. */ | 676 | * Only settable by CAP_SYS_RESOURCE. */ |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 388e70601413..68df9c17fbbb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -281,7 +281,7 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) | |||
281 | } | 281 | } |
282 | #endif | 282 | #endif |
283 | 283 | ||
284 | extern int page_evictable(struct page *page, struct vm_area_struct *vma); | 284 | extern int page_evictable(struct page *page); |
285 | extern void check_move_unevictable_pages(struct page **, int nr_pages); | 285 | extern void check_move_unevictable_pages(struct page **, int nr_pages); |
286 | 286 | ||
287 | extern unsigned long scan_unevictable_pages; | 287 | extern unsigned long scan_unevictable_pages; |
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 5088727478fd..a520fd70a59f 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h | |||
@@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) | |||
39 | 39 | ||
40 | static inline void timerqueue_init(struct timerqueue_node *node) | 40 | static inline void timerqueue_init(struct timerqueue_node *node) |
41 | { | 41 | { |
42 | rb_init_node(&node->node); | 42 | RB_CLEAR_NODE(&node->node); |
43 | } | 43 | } |
44 | 44 | ||
45 | static inline void timerqueue_init_head(struct timerqueue_head *head) | 45 | static inline void timerqueue_init_head(struct timerqueue_head *head) |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 57f7b1091511..3d3114594370 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -52,7 +52,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
52 | UNEVICTABLE_PGMUNLOCKED, | 52 | UNEVICTABLE_PGMUNLOCKED, |
53 | UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ | 53 | UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ |
54 | UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ | 54 | UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ |
55 | UNEVICTABLE_MLOCKFREED, | ||
56 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 55 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
57 | THP_FAULT_ALLOC, | 56 | THP_FAULT_ALLOC, |
58 | THP_FAULT_FALLBACK, | 57 | THP_FAULT_FALLBACK, |
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ad2cfd53dadc..92a86b2cce33 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -198,6 +198,8 @@ extern void __dec_zone_state(struct zone *, enum zone_stat_item); | |||
198 | void refresh_cpu_vm_stats(int); | 198 | void refresh_cpu_vm_stats(int); |
199 | void refresh_zone_stat_thresholds(void); | 199 | void refresh_zone_stat_thresholds(void); |
200 | 200 | ||
201 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); | ||
202 | |||
201 | int calculate_pressure_threshold(struct zone *zone); | 203 | int calculate_pressure_threshold(struct zone *zone); |
202 | int calculate_normal_threshold(struct zone *zone); | 204 | int calculate_normal_threshold(struct zone *zone); |
203 | void set_pgdat_percpu_threshold(pg_data_t *pgdat, | 205 | void set_pgdat_percpu_threshold(pg_data_t *pgdat, |
@@ -251,8 +253,18 @@ static inline void __dec_zone_page_state(struct page *page, | |||
251 | static inline void refresh_cpu_vm_stats(int cpu) { } | 253 | static inline void refresh_cpu_vm_stats(int cpu) { } |
252 | static inline void refresh_zone_stat_thresholds(void) { } | 254 | static inline void refresh_zone_stat_thresholds(void) { } |
253 | 255 | ||
256 | static inline void drain_zonestat(struct zone *zone, | ||
257 | struct per_cpu_pageset *pset) { } | ||
254 | #endif /* CONFIG_SMP */ | 258 | #endif /* CONFIG_SMP */ |
255 | 259 | ||
260 | static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, | ||
261 | int migratetype) | ||
262 | { | ||
263 | __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); | ||
264 | if (is_migrate_cma(migratetype)) | ||
265 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); | ||
266 | } | ||
267 | |||
256 | extern const char * const vmstat_text[]; | 268 | extern const char * const vmstat_text[]; |
257 | 269 | ||
258 | #endif /* _LINUX_VMSTAT_H */ | 270 | #endif /* _LINUX_VMSTAT_H */ |
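
The new __mod_zone_freepage_state() helper exists so callers that adjust NR_FREE_PAGES for a whole buddy block keep NR_FREE_CMA_PAGES in step; a typical, illustrative call when returning a block to the free lists:

-----------------------------------------------------------------------
	__mod_zone_freepage_state(zone, 1 << order,
				  get_pageblock_migratetype(page));
-----------------------------------------------------------------------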
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..9391706e9254 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h | |||
@@ -36,7 +36,6 @@ | |||
36 | {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ | 36 | {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ |
37 | {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ | 37 | {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ |
38 | {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ | 38 | {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ |
39 | {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ | ||
40 | {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ | 39 | {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ |
41 | ) : "GFP_NOWAIT" | 40 | ) : "GFP_NOWAIT" |
42 | 41 | ||
diff --git a/init/Kconfig b/init/Kconfig index ed6334dd5e71..4c93533da42c 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -1125,10 +1125,12 @@ menuconfig EXPERT | |||
1125 | environments which can tolerate a "non-standard" kernel. | 1125 | environments which can tolerate a "non-standard" kernel. |
1126 | Only use this if you really know what you are doing. | 1126 | Only use this if you really know what you are doing. |
1127 | 1127 | ||
1128 | config HAVE_UID16 | ||
1129 | bool | ||
1130 | |||
1128 | config UID16 | 1131 | config UID16 |
1129 | bool "Enable 16-bit UID system calls" if EXPERT | 1132 | bool "Enable 16-bit UID system calls" if EXPERT |
1130 | depends on ARM || BLACKFIN || CRIS || FRV || H8300 || X86_32 || M68K || (S390 && !64BIT) || SUPERH || SPARC32 || (SPARC64 && COMPAT) || UML || (X86_64 && IA32_EMULATION) \ | 1133 | depends on HAVE_UID16 |
1131 | || AARCH32_EMULATION | ||
1132 | default y | 1134 | default y |
1133 | help | 1135 | help |
1134 | This enables the legacy 16-bit UID syscall wrappers. | 1136 | This enables the legacy 16-bit UID syscall wrappers. |
@@ -1150,6 +1152,11 @@ config SYSCTL_SYSCALL | |||
1150 | 1152 | ||
1151 | If unsure say N here. | 1153 | If unsure say N here. |
1152 | 1154 | ||
1155 | config SYSCTL_EXCEPTION_TRACE | ||
1156 | bool | ||
1157 | help | ||
1158 | Enable support for /proc/sys/debug/exception-trace. | ||
1159 | |||
1153 | config KALLSYMS | 1160 | config KALLSYMS |
1154 | bool "Load all symbols for debugging/ksymoops" if EXPERT | 1161 | bool "Load all symbols for debugging/ksymoops" if EXPERT |
1155 | default y | 1162 | default y |
diff --git a/init/main.c b/init/main.c index db34c0ec4711..313360fe1118 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -86,7 +86,6 @@ extern void init_IRQ(void); | |||
86 | extern void fork_init(unsigned long); | 86 | extern void fork_init(unsigned long); |
87 | extern void mca_init(void); | 87 | extern void mca_init(void); |
88 | extern void sbus_init(void); | 88 | extern void sbus_init(void); |
89 | extern void prio_tree_init(void); | ||
90 | extern void radix_tree_init(void); | 89 | extern void radix_tree_init(void); |
91 | #ifndef CONFIG_DEBUG_RODATA | 90 | #ifndef CONFIG_DEBUG_RODATA |
92 | static inline void mark_rodata_ro(void) { } | 91 | static inline void mark_rodata_ro(void) { } |
@@ -547,7 +546,6 @@ asmlinkage void __init start_kernel(void) | |||
547 | /* init some links before init_ISA_irqs() */ | 546 | /* init some links before init_ISA_irqs() */ |
548 | early_irq_init(); | 547 | early_irq_init(); |
549 | init_IRQ(); | 548 | init_IRQ(); |
550 | prio_tree_init(); | ||
551 | init_timers(); | 549 | init_timers(); |
552 | hrtimers_init(); | 550 | hrtimers_init(); |
553 | softirq_init(); | 551 | softirq_init(); |
diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 6d255e535d03..6b97e2466fad 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c | |||
@@ -142,7 +142,6 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) | |||
142 | leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC); | 142 | leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC); |
143 | if (!leaf) | 143 | if (!leaf) |
144 | return -ENOMEM; | 144 | return -ENOMEM; |
145 | rb_init_node(&leaf->rb_node); | ||
146 | INIT_LIST_HEAD(&leaf->msg_list); | 145 | INIT_LIST_HEAD(&leaf->msg_list); |
147 | info->qsize += sizeof(*leaf); | 146 | info->qsize += sizeof(*leaf); |
148 | } | 147 | } |
@@ -1013,7 +1012,6 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, | |||
1013 | 1012 | ||
1014 | if (!info->node_cache && new_leaf) { | 1013 | if (!info->node_cache && new_leaf) { |
1015 | /* Save our speculative allocation into the cache */ | 1014 | /* Save our speculative allocation into the cache */ |
1016 | rb_init_node(&new_leaf->rb_node); | ||
1017 | INIT_LIST_HEAD(&new_leaf->msg_list); | 1015 | INIT_LIST_HEAD(&new_leaf->msg_list); |
1018 | info->node_cache = new_leaf; | 1016 | info->node_cache = new_leaf; |
1019 | info->qsize += sizeof(*new_leaf); | 1017 | info->qsize += sizeof(*new_leaf); |
@@ -1121,7 +1119,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, | |||
1121 | 1119 | ||
1122 | if (!info->node_cache && new_leaf) { | 1120 | if (!info->node_cache && new_leaf) { |
1123 | /* Save our speculative allocation into the cache */ | 1121 | /* Save our speculative allocation into the cache */ |
1124 | rb_init_node(&new_leaf->rb_node); | ||
1125 | INIT_LIST_HEAD(&new_leaf->msg_list); | 1122 | INIT_LIST_HEAD(&new_leaf->msg_list); |
1126 | info->node_cache = new_leaf; | 1123 | info->node_cache = new_leaf; |
1127 | info->qsize += sizeof(*new_leaf); | 1124 | info->qsize += sizeof(*new_leaf); |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 29e090cc0e46..f4a7756f999c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -1151,7 +1151,6 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
1151 | const struct cred *cred; | 1151 | const struct cred *cred; |
1152 | char name[sizeof(tsk->comm)]; | 1152 | char name[sizeof(tsk->comm)]; |
1153 | struct mm_struct *mm = tsk->mm; | 1153 | struct mm_struct *mm = tsk->mm; |
1154 | struct vm_area_struct *vma; | ||
1155 | char *tty; | 1154 | char *tty; |
1156 | 1155 | ||
1157 | if (!ab) | 1156 | if (!ab) |
@@ -1191,16 +1190,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
1191 | 1190 | ||
1192 | if (mm) { | 1191 | if (mm) { |
1193 | down_read(&mm->mmap_sem); | 1192 | down_read(&mm->mmap_sem); |
1194 | vma = mm->mmap; | 1193 | if (mm->exe_file) |
1195 | while (vma) { | 1194 | audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); |
1196 | if ((vma->vm_flags & VM_EXECUTABLE) && | ||
1197 | vma->vm_file) { | ||
1198 | audit_log_d_path(ab, " exe=", | ||
1199 | &vma->vm_file->f_path); | ||
1200 | break; | ||
1201 | } | ||
1202 | vma = vma->vm_next; | ||
1203 | } | ||
1204 | up_read(&mm->mmap_sem); | 1195 | up_read(&mm->mmap_sem); |
1205 | } | 1196 | } |
1206 | audit_log_task_context(ab); | 1197 | audit_log_task_context(ab); |
diff --git a/kernel/cpu.c b/kernel/cpu.c index f560598807c1..42bd331ee0ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -80,6 +80,10 @@ void put_online_cpus(void) | |||
80 | if (cpu_hotplug.active_writer == current) | 80 | if (cpu_hotplug.active_writer == current) |
81 | return; | 81 | return; |
82 | mutex_lock(&cpu_hotplug.lock); | 82 | mutex_lock(&cpu_hotplug.lock); |
83 | |||
84 | if (WARN_ON(!cpu_hotplug.refcount)) | ||
85 | cpu_hotplug.refcount++; /* try to fix things up */ | ||
86 | |||
83 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) | 87 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) |
84 | wake_up_process(cpu_hotplug.active_writer); | 88 | wake_up_process(cpu_hotplug.active_writer); |
85 | mutex_unlock(&cpu_hotplug.lock); | 89 | mutex_unlock(&cpu_hotplug.lock); |
diff --git a/kernel/events/core.c b/kernel/events/core.c index f16f3c58f11a..cda3ebd49e86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -3671,7 +3671,7 @@ unlock: | |||
3671 | atomic_inc(&event->mmap_count); | 3671 | atomic_inc(&event->mmap_count); |
3672 | mutex_unlock(&event->mmap_mutex); | 3672 | mutex_unlock(&event->mmap_mutex); |
3673 | 3673 | ||
3674 | vma->vm_flags |= VM_RESERVED; | 3674 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
3675 | vma->vm_ops = &perf_mmap_vmops; | 3675 | vma->vm_ops = &perf_mmap_vmops; |
3676 | 3676 | ||
3677 | return ret; | 3677 | return ret; |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28ab..98256bc71ee1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -141,10 +141,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
141 | spinlock_t *ptl; | 141 | spinlock_t *ptl; |
142 | pte_t *ptep; | 142 | pte_t *ptep; |
143 | int err; | 143 | int err; |
144 | /* For mmu_notifiers */ | ||
145 | const unsigned long mmun_start = addr; | ||
146 | const unsigned long mmun_end = addr + PAGE_SIZE; | ||
144 | 147 | ||
145 | /* For try_to_free_swap() and munlock_vma_page() below */ | 148 | /* For try_to_free_swap() and munlock_vma_page() below */ |
146 | lock_page(page); | 149 | lock_page(page); |
147 | 150 | ||
151 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
148 | err = -EAGAIN; | 152 | err = -EAGAIN; |
149 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 153 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
150 | if (!ptep) | 154 | if (!ptep) |
@@ -173,6 +177,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
173 | 177 | ||
174 | err = 0; | 178 | err = 0; |
175 | unlock: | 179 | unlock: |
180 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
176 | unlock_page(page); | 181 | unlock_page(page); |
177 | return err; | 182 | return err; |
178 | } | 183 | } |
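
The uprobes change above follows the standard bracketing pattern for page-table updates that secondary MMUs must observe:

-----------------------------------------------------------------------
	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... modify the page tables covering [start, end) ... */
	mmu_notifier_invalidate_range_end(mm, start, end);
-----------------------------------------------------------------------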
@@ -735,7 +740,6 @@ static struct map_info * | |||
735 | build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | 740 | build_map_info(struct address_space *mapping, loff_t offset, bool is_register) |
736 | { | 741 | { |
737 | unsigned long pgoff = offset >> PAGE_SHIFT; | 742 | unsigned long pgoff = offset >> PAGE_SHIFT; |
738 | struct prio_tree_iter iter; | ||
739 | struct vm_area_struct *vma; | 743 | struct vm_area_struct *vma; |
740 | struct map_info *curr = NULL; | 744 | struct map_info *curr = NULL; |
741 | struct map_info *prev = NULL; | 745 | struct map_info *prev = NULL; |
@@ -744,7 +748,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
744 | 748 | ||
745 | again: | 749 | again: |
746 | mutex_lock(&mapping->i_mmap_mutex); | 750 | mutex_lock(&mapping->i_mmap_mutex); |
747 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 751 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
748 | if (!valid_vma(vma, is_register)) | 752 | if (!valid_vma(vma, is_register)) |
749 | continue; | 753 | continue; |
750 | 754 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index a2b1efc20928..1cd7d581b3b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -423,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
423 | mapping->i_mmap_writable++; | 423 | mapping->i_mmap_writable++; |
424 | flush_dcache_mmap_lock(mapping); | 424 | flush_dcache_mmap_lock(mapping); |
425 | /* insert tmp into the share list, just after mpnt */ | 425 | /* insert tmp into the share list, just after mpnt */ |
426 | vma_prio_tree_add(tmp, mpnt); | 426 | if (unlikely(tmp->vm_flags & VM_NONLINEAR)) |
427 | vma_nonlinear_insert(tmp, | ||
428 | &mapping->i_mmap_nonlinear); | ||
429 | else | ||
430 | vma_interval_tree_insert_after(tmp, mpnt, | ||
431 | &mapping->i_mmap); | ||
427 | flush_dcache_mmap_unlock(mapping); | 432 | flush_dcache_mmap_unlock(mapping); |
428 | mutex_unlock(&mapping->i_mmap_mutex); | 433 | mutex_unlock(&mapping->i_mmap_mutex); |
429 | } | 434 | } |
@@ -622,26 +627,6 @@ void mmput(struct mm_struct *mm) | |||
622 | } | 627 | } |
623 | EXPORT_SYMBOL_GPL(mmput); | 628 | EXPORT_SYMBOL_GPL(mmput); |
624 | 629 | ||
625 | /* | ||
626 | * We added or removed a vma mapping the executable. The vmas are only mapped | ||
627 | * during exec and are not mapped with the mmap system call. | ||
628 | * Callers must hold down_write() on the mm's mmap_sem for these | ||
629 | */ | ||
630 | void added_exe_file_vma(struct mm_struct *mm) | ||
631 | { | ||
632 | mm->num_exe_file_vmas++; | ||
633 | } | ||
634 | |||
635 | void removed_exe_file_vma(struct mm_struct *mm) | ||
636 | { | ||
637 | mm->num_exe_file_vmas--; | ||
638 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { | ||
639 | fput(mm->exe_file); | ||
640 | mm->exe_file = NULL; | ||
641 | } | ||
642 | |||
643 | } | ||
644 | |||
645 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | 630 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) |
646 | { | 631 | { |
647 | if (new_exe_file) | 632 | if (new_exe_file) |
@@ -649,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | |||
649 | if (mm->exe_file) | 634 | if (mm->exe_file) |
650 | fput(mm->exe_file); | 635 | fput(mm->exe_file); |
651 | mm->exe_file = new_exe_file; | 636 | mm->exe_file = new_exe_file; |
652 | mm->num_exe_file_vmas = 0; | ||
653 | } | 637 | } |
654 | 638 | ||
655 | struct file *get_mm_exe_file(struct mm_struct *mm) | 639 | struct file *get_mm_exe_file(struct mm_struct *mm) |
656 | { | 640 | { |
657 | struct file *exe_file; | 641 | struct file *exe_file; |
658 | 642 | ||
659 | /* We need mmap_sem to protect against races with removal of | 643 | /* We need mmap_sem to protect against races with removal of exe_file */ |
660 | * VM_EXECUTABLE vmas */ | ||
661 | down_read(&mm->mmap_sem); | 644 | down_read(&mm->mmap_sem); |
662 | exe_file = mm->exe_file; | 645 | exe_file = mm->exe_file; |
663 | if (exe_file) | 646 | if (exe_file) |
@@ -1078,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1078 | init_rwsem(&sig->group_rwsem); | 1061 | init_rwsem(&sig->group_rwsem); |
1079 | #endif | 1062 | #endif |
1080 | 1063 | ||
1081 | sig->oom_adj = current->signal->oom_adj; | ||
1082 | sig->oom_score_adj = current->signal->oom_score_adj; | 1064 | sig->oom_score_adj = current->signal->oom_score_adj; |
1083 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | 1065 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
1084 | 1066 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c2a2f8084bad..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1549,8 +1549,7 @@ static struct ctl_table fs_table[] = { | |||
1549 | }; | 1549 | }; |
1550 | 1550 | ||
1551 | static struct ctl_table debug_table[] = { | 1551 | static struct ctl_table debug_table[] = { |
1552 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ | 1552 | #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE |
1553 | defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64) | ||
1554 | { | 1553 | { |
1555 | .procname = "exception-trace", | 1554 | .procname = "exception-trace", |
1556 | .data = &show_unhandled_signals, | 1555 | .data = &show_unhandled_signals, |
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7fba3a98967f..28e9d6c98941 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -450,12 +450,12 @@ config SLUB_STATS | |||
450 | out which slabs are relevant to a particular load. | 450 | out which slabs are relevant to a particular load. |
451 | Try running: slabinfo -DA | 451 | Try running: slabinfo -DA |
452 | 452 | ||
453 | config HAVE_DEBUG_KMEMLEAK | ||
454 | bool | ||
455 | |||
453 | config DEBUG_KMEMLEAK | 456 | config DEBUG_KMEMLEAK |
454 | bool "Kernel memory leak detector" | 457 | bool "Kernel memory leak detector" |
455 | depends on DEBUG_KERNEL && EXPERIMENTAL && \ | 458 | depends on DEBUG_KERNEL && EXPERIMENTAL && HAVE_DEBUG_KMEMLEAK |
456 | (X86 || ARM || PPC || MIPS || S390 || SPARC64 || SUPERH || \ | ||
457 | MICROBLAZE || TILE || ARM64) | ||
458 | |||
459 | select DEBUG_FS | 459 | select DEBUG_FS |
460 | select STACKTRACE if STACKTRACE_SUPPORT | 460 | select STACKTRACE if STACKTRACE_SUPPORT |
461 | select KALLSYMS | 461 | select KALLSYMS |
@@ -751,12 +751,12 @@ config DEBUG_HIGHMEM | |||
751 | This option enables additional error checking for high memory systems. | 751 | This option enables additional error checking for high memory systems. |
752 | Disable for production systems. | 752 | Disable for production systems. |
753 | 753 | ||
754 | config HAVE_DEBUG_BUGVERBOSE | ||
755 | bool | ||
756 | |||
754 | config DEBUG_BUGVERBOSE | 757 | config DEBUG_BUGVERBOSE |
755 | bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT | 758 | bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT |
756 | depends on BUG | 759 | depends on BUG && (GENERIC_BUG || HAVE_DEBUG_BUGVERBOSE) |
757 | depends on ARM || AVR32 || M32R || M68K || SPARC32 || SPARC64 || \ | ||
758 | FRV || SUPERH || GENERIC_BUG || BLACKFIN || MN10300 || \ | ||
759 | TILE || ARM64 | ||
760 | default y | 760 | default y |
761 | help | 761 | help |
762 | Say Y here to make BUG() panics output the file name and line number | 762 | Say Y here to make BUG() panics output the file name and line number |
@@ -798,6 +798,15 @@ config DEBUG_VM | |||
798 | 798 | ||
799 | If unsure, say N. | 799 | If unsure, say N. |
800 | 800 | ||
801 | config DEBUG_VM_RB | ||
802 | bool "Debug VM red-black trees" | ||
803 | depends on DEBUG_VM | ||
804 | help | ||
805 | Enable this to turn on more extended checks in the virtual-memory | ||
806 | system that may impact performance. | ||
807 | |||
808 | If unsure, say N. | ||
809 | |||
801 | config DEBUG_VIRTUAL | 810 | config DEBUG_VIRTUAL |
802 | bool "Debug VM translations" | 811 | bool "Debug VM translations" |
803 | depends on DEBUG_KERNEL && X86 | 812 | depends on DEBUG_KERNEL && X86 |
@@ -1282,6 +1291,19 @@ config LATENCYTOP | |||
1282 | source mm/Kconfig.debug | 1291 | source mm/Kconfig.debug |
1283 | source kernel/trace/Kconfig | 1292 | source kernel/trace/Kconfig |
1284 | 1293 | ||
1294 | config RBTREE_TEST | ||
1295 | tristate "Red-Black tree test" | ||
1296 | depends on m && DEBUG_KERNEL | ||
1297 | help | ||
1298 | A benchmark measuring the performance of the rbtree library. | ||
1299 | Also includes rbtree invariant checks. | ||
1300 | |||
1301 | config INTERVAL_TREE_TEST | ||
1302 | tristate "Interval tree test" | ||
1303 | depends on m && DEBUG_KERNEL | ||
1304 | help | ||
1305 | A benchmark measuring the performance of the interval tree library | ||
1306 | |||
1285 | config PROVIDE_OHCI1394_DMA_INIT | 1307 | config PROVIDE_OHCI1394_DMA_INIT |
1286 | bool "Remote debugging over FireWire early on boot" | 1308 | bool "Remote debugging over FireWire early on boot" |
1287 | depends on PCI && X86 | 1309 | depends on PCI && X86 |
diff --git a/lib/Makefile b/lib/Makefile index 42d283edc4d3..3128e357e286 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
@@ -9,7 +9,7 @@ endif | |||
9 | 9 | ||
10 | lib-y := ctype.o string.o vsprintf.o cmdline.o \ | 10 | lib-y := ctype.o string.o vsprintf.o cmdline.o \ |
11 | rbtree.o radix-tree.o dump_stack.o timerqueue.o\ | 11 | rbtree.o radix-tree.o dump_stack.o timerqueue.o\ |
12 | idr.o int_sqrt.o extable.o prio_tree.o \ | 12 | idr.o int_sqrt.o extable.o \ |
13 | sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ | 13 | sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ |
14 | proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ | 14 | proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ |
15 | is_single_threaded.o plist.o decompress.o | 15 | is_single_threaded.o plist.o decompress.o |
@@ -140,6 +140,11 @@ $(foreach file, $(libfdt_files), \ | |||
140 | $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt)) | 140 | $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt)) |
141 | lib-$(CONFIG_LIBFDT) += $(libfdt_files) | 141 | lib-$(CONFIG_LIBFDT) += $(libfdt_files) |
142 | 142 | ||
143 | obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o | ||
144 | obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o | ||
145 | |||
146 | interval_tree_test-objs := interval_tree_test_main.o interval_tree.o | ||
147 | |||
143 | hostprogs-y := gen_crc32table | 148 | hostprogs-y := gen_crc32table |
144 | clean-files := crc32table.h | 149 | clean-files := crc32table.h |
145 | 150 | ||
diff --git a/lib/interval_tree.c b/lib/interval_tree.c new file mode 100644 index 000000000000..e6eb406f2d65 --- /dev/null +++ b/lib/interval_tree.c | |||
@@ -0,0 +1,10 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/interval_tree.h> | ||
3 | #include <linux/interval_tree_generic.h> | ||
4 | |||
5 | #define START(node) ((node)->start) | ||
6 | #define LAST(node) ((node)->last) | ||
7 | |||
8 | INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, | ||
9 | unsigned long, __subtree_last, | ||
10 | START, LAST,, interval_tree) | ||
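lib/interval_tree.c above is a single instantiation of the interval_tree_generic.h template: the macro takes the node type, its rb_node field, the index type, the augmented subtree field, the two interval accessors, optional function attributes, and a prefix for the generated functions. The interval_tree_insert/remove/iter_first/iter_next calls used by the test module below all come from this instantiation. A hedged sketch of how a subsystem with its own range type would instantiate the same template (struct, field, accessor and prefix names here are made up):

#include <linux/rbtree.h>
#include <linux/interval_tree_generic.h>

struct my_range {
	struct rb_node rb;
	unsigned long start, last;	/* closed interval [start, last] */
	unsigned long subtree_last;	/* maintained by the generated code */
};

#define MY_START(r)	((r)->start)
#define MY_LAST(r)	((r)->last)

/* node type, rb field, index type, subtree field,
 * start accessor, last accessor, attributes, function prefix */
INTERVAL_TREE_DEFINE(struct my_range, rb, unsigned long, subtree_last,
		     MY_START, MY_LAST, static, my_range_it)

/* The macro emits my_range_it_insert(), my_range_it_remove(),
 * my_range_it_iter_first() and my_range_it_iter_next(). */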
diff --git a/lib/interval_tree_test_main.c b/lib/interval_tree_test_main.c new file mode 100644 index 000000000000..b25903987f7a --- /dev/null +++ b/lib/interval_tree_test_main.c | |||
@@ -0,0 +1,105 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/interval_tree.h> | ||
3 | #include <linux/random.h> | ||
4 | #include <asm/timex.h> | ||
5 | |||
6 | #define NODES 100 | ||
7 | #define PERF_LOOPS 100000 | ||
8 | #define SEARCHES 100 | ||
9 | #define SEARCH_LOOPS 10000 | ||
10 | |||
11 | static struct rb_root root = RB_ROOT; | ||
12 | static struct interval_tree_node nodes[NODES]; | ||
13 | static u32 queries[SEARCHES]; | ||
14 | |||
15 | static struct rnd_state rnd; | ||
16 | |||
17 | static inline unsigned long | ||
18 | search(unsigned long query, struct rb_root *root) | ||
19 | { | ||
20 | struct interval_tree_node *node; | ||
21 | unsigned long results = 0; | ||
22 | |||
23 | for (node = interval_tree_iter_first(root, query, query); node; | ||
24 | node = interval_tree_iter_next(node, query, query)) | ||
25 | results++; | ||
26 | return results; | ||
27 | } | ||
28 | |||
29 | static void init(void) | ||
30 | { | ||
31 | int i; | ||
32 | for (i = 0; i < NODES; i++) { | ||
33 | u32 a = prandom32(&rnd), b = prandom32(&rnd); | ||
34 | if (a <= b) { | ||
35 | nodes[i].start = a; | ||
36 | nodes[i].last = b; | ||
37 | } else { | ||
38 | nodes[i].start = b; | ||
39 | nodes[i].last = a; | ||
40 | } | ||
41 | } | ||
42 | for (i = 0; i < SEARCHES; i++) | ||
43 | queries[i] = prandom32(&rnd); | ||
44 | } | ||
45 | |||
46 | static int interval_tree_test_init(void) | ||
47 | { | ||
48 | int i, j; | ||
49 | unsigned long results; | ||
50 | cycles_t time1, time2, time; | ||
51 | |||
52 | printk(KERN_ALERT "interval tree insert/remove"); | ||
53 | |||
54 | prandom32_seed(&rnd, 3141592653589793238ULL); | ||
55 | init(); | ||
56 | |||
57 | time1 = get_cycles(); | ||
58 | |||
59 | for (i = 0; i < PERF_LOOPS; i++) { | ||
60 | for (j = 0; j < NODES; j++) | ||
61 | interval_tree_insert(nodes + j, &root); | ||
62 | for (j = 0; j < NODES; j++) | ||
63 | interval_tree_remove(nodes + j, &root); | ||
64 | } | ||
65 | |||
66 | time2 = get_cycles(); | ||
67 | time = time2 - time1; | ||
68 | |||
69 | time = div_u64(time, PERF_LOOPS); | ||
70 | printk(" -> %llu cycles\n", (unsigned long long)time); | ||
71 | |||
72 | printk(KERN_ALERT "interval tree search"); | ||
73 | |||
74 | for (j = 0; j < NODES; j++) | ||
75 | interval_tree_insert(nodes + j, &root); | ||
76 | |||
77 | time1 = get_cycles(); | ||
78 | |||
79 | results = 0; | ||
80 | for (i = 0; i < SEARCH_LOOPS; i++) | ||
81 | for (j = 0; j < SEARCHES; j++) | ||
82 | results += search(queries[j], &root); | ||
83 | |||
84 | time2 = get_cycles(); | ||
85 | time = time2 - time1; | ||
86 | |||
87 | time = div_u64(time, SEARCH_LOOPS); | ||
88 | results = div_u64(results, SEARCH_LOOPS); | ||
89 | printk(" -> %llu cycles (%lu results)\n", | ||
90 | (unsigned long long)time, results); | ||
91 | |||
92 | return -EAGAIN; /* Failing init will directly unload the module */ | ||
93 | } | ||
94 | |||
95 | static void interval_tree_test_exit(void) | ||
96 | { | ||
97 | printk(KERN_ALERT "test exit\n"); | ||
98 | } | ||
99 | |||
100 | module_init(interval_tree_test_init) | ||
101 | module_exit(interval_tree_test_exit) | ||
102 | |||
103 | MODULE_LICENSE("GPL"); | ||
104 | MODULE_AUTHOR("Michel Lespinasse"); | ||
105 | MODULE_DESCRIPTION("Interval Tree test"); | ||
diff --git a/lib/prio_tree.c b/lib/prio_tree.c deleted file mode 100644 index 8d443af03b4c..000000000000 --- a/lib/prio_tree.c +++ /dev/null | |||
@@ -1,466 +0,0 @@ | |||
1 | /* | ||
2 | * lib/prio_tree.c - priority search tree | ||
3 | * | ||
4 | * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | * | ||
8 | * Based on the radix priority search tree proposed by Edward M. McCreight | ||
9 | * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 | ||
10 | * | ||
11 | * 02Feb2004 Initial version | ||
12 | */ | ||
13 | |||
14 | #include <linux/init.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/prio_tree.h> | ||
17 | |||
18 | /* | ||
19 | * A clever mix of heap and radix trees forms a radix priority search tree (PST) | ||
20 | * which is useful for storing intervals, e.g, we can consider a vma as a closed | ||
21 | * interval of file pages [offset_begin, offset_end], and store all vmas that | ||
22 | * map a file in a PST. Then, using the PST, we can answer a stabbing query, | ||
23 | * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a | ||
24 | * given input interval X (a set of consecutive file pages), in "O(log n + m)" | ||
25 | * time where 'log n' is the height of the PST, and 'm' is the number of stored | ||
26 | * intervals (vmas) that overlap (map) with the input interval X (the set of | ||
27 | * consecutive file pages). | ||
28 | * | ||
29 | * In our implementation, we store closed intervals of the form [radix_index, | ||
30 | * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST | ||
31 | * is designed for storing intervals with unique radix indices, i.e., each | ||
32 | * interval have different radix_index. However, this limitation can be easily | ||
33 | * overcome by using the size, i.e., heap_index - radix_index, as part of the | ||
34 | * index, so we index the tree using [(radix_index,size), heap_index]. | ||
35 | * | ||
36 | * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit | ||
37 | * machine, the maximum height of a PST can be 64. We can use a balanced version | ||
38 | * of the priority search tree to optimize the tree height, but the balanced | ||
39 | * tree proposed by McCreight is too complex and memory-hungry for our purpose. | ||
40 | */ | ||
41 | |||
42 | /* | ||
43 | * The following macros are used for implementing prio_tree for i_mmap | ||
44 | */ | ||
45 | |||
46 | #define RADIX_INDEX(vma) ((vma)->vm_pgoff) | ||
47 | #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) | ||
48 | /* avoid overflow */ | ||
49 | #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) | ||
50 | |||
51 | |||
52 | static void get_index(const struct prio_tree_root *root, | ||
53 | const struct prio_tree_node *node, | ||
54 | unsigned long *radix, unsigned long *heap) | ||
55 | { | ||
56 | if (root->raw) { | ||
57 | struct vm_area_struct *vma = prio_tree_entry( | ||
58 | node, struct vm_area_struct, shared.prio_tree_node); | ||
59 | |||
60 | *radix = RADIX_INDEX(vma); | ||
61 | *heap = HEAP_INDEX(vma); | ||
62 | } | ||
63 | else { | ||
64 | *radix = node->start; | ||
65 | *heap = node->last; | ||
66 | } | ||
67 | } | ||
68 | |||
69 | static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; | ||
70 | |||
71 | void __init prio_tree_init(void) | ||
72 | { | ||
73 | unsigned int i; | ||
74 | |||
75 | for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++) | ||
76 | index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1; | ||
77 | index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Maximum heap_index that can be stored in a PST with index_bits bits | ||
82 | */ | ||
83 | static inline unsigned long prio_tree_maxindex(unsigned int bits) | ||
84 | { | ||
85 | return index_bits_to_maxindex[bits - 1]; | ||
86 | } | ||
87 | |||
88 | static void prio_set_parent(struct prio_tree_node *parent, | ||
89 | struct prio_tree_node *child, bool left) | ||
90 | { | ||
91 | if (left) | ||
92 | parent->left = child; | ||
93 | else | ||
94 | parent->right = child; | ||
95 | |||
96 | child->parent = parent; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * Extend a priority search tree so that it can store a node with heap_index | ||
101 | * max_heap_index. In the worst case, this algorithm takes O((log n)^2). | ||
102 | * However, this function is used rarely and the common case performance is | ||
103 | * not bad. | ||
104 | */ | ||
105 | static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root, | ||
106 | struct prio_tree_node *node, unsigned long max_heap_index) | ||
107 | { | ||
108 | struct prio_tree_node *prev; | ||
109 | |||
110 | if (max_heap_index > prio_tree_maxindex(root->index_bits)) | ||
111 | root->index_bits++; | ||
112 | |||
113 | prev = node; | ||
114 | INIT_PRIO_TREE_NODE(node); | ||
115 | |||
116 | while (max_heap_index > prio_tree_maxindex(root->index_bits)) { | ||
117 | struct prio_tree_node *tmp = root->prio_tree_node; | ||
118 | |||
119 | root->index_bits++; | ||
120 | |||
121 | if (prio_tree_empty(root)) | ||
122 | continue; | ||
123 | |||
124 | prio_tree_remove(root, root->prio_tree_node); | ||
125 | INIT_PRIO_TREE_NODE(tmp); | ||
126 | |||
127 | prio_set_parent(prev, tmp, true); | ||
128 | prev = tmp; | ||
129 | } | ||
130 | |||
131 | if (!prio_tree_empty(root)) | ||
132 | prio_set_parent(prev, root->prio_tree_node, true); | ||
133 | |||
134 | root->prio_tree_node = node; | ||
135 | return node; | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * Replace a prio_tree_node with a new node and return the old node | ||
140 | */ | ||
141 | struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, | ||
142 | struct prio_tree_node *old, struct prio_tree_node *node) | ||
143 | { | ||
144 | INIT_PRIO_TREE_NODE(node); | ||
145 | |||
146 | if (prio_tree_root(old)) { | ||
147 | BUG_ON(root->prio_tree_node != old); | ||
148 | /* | ||
149 | * We can reduce root->index_bits here. However, it is complex | ||
150 | * and does not help much to improve performance (IMO). | ||
151 | */ | ||
152 | root->prio_tree_node = node; | ||
153 | } else | ||
154 | prio_set_parent(old->parent, node, old->parent->left == old); | ||
155 | |||
156 | if (!prio_tree_left_empty(old)) | ||
157 | prio_set_parent(node, old->left, true); | ||
158 | |||
159 | if (!prio_tree_right_empty(old)) | ||
160 | prio_set_parent(node, old->right, false); | ||
161 | |||
162 | return old; | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Insert a prio_tree_node @node into a radix priority search tree @root. The | ||
167 | * algorithm typically takes O(log n) time where 'log n' is the number of bits | ||
168 | * required to represent the maximum heap_index. In the worst case, the algo | ||
169 | * can take O((log n)^2) - check prio_tree_expand. | ||
170 | * | ||
171 | * If a prior node with same radix_index and heap_index is already found in | ||
172 | * the tree, then returns the address of the prior node. Otherwise, inserts | ||
173 | * @node into the tree and returns @node. | ||
174 | */ | ||
175 | struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, | ||
176 | struct prio_tree_node *node) | ||
177 | { | ||
178 | struct prio_tree_node *cur, *res = node; | ||
179 | unsigned long radix_index, heap_index; | ||
180 | unsigned long r_index, h_index, index, mask; | ||
181 | int size_flag = 0; | ||
182 | |||
183 | get_index(root, node, &radix_index, &heap_index); | ||
184 | |||
185 | if (prio_tree_empty(root) || | ||
186 | heap_index > prio_tree_maxindex(root->index_bits)) | ||
187 | return prio_tree_expand(root, node, heap_index); | ||
188 | |||
189 | cur = root->prio_tree_node; | ||
190 | mask = 1UL << (root->index_bits - 1); | ||
191 | |||
192 | while (mask) { | ||
193 | get_index(root, cur, &r_index, &h_index); | ||
194 | |||
195 | if (r_index == radix_index && h_index == heap_index) | ||
196 | return cur; | ||
197 | |||
198 | if (h_index < heap_index || | ||
199 | (h_index == heap_index && r_index > radix_index)) { | ||
200 | struct prio_tree_node *tmp = node; | ||
201 | node = prio_tree_replace(root, cur, node); | ||
202 | cur = tmp; | ||
203 | /* swap indices */ | ||
204 | index = r_index; | ||
205 | r_index = radix_index; | ||
206 | radix_index = index; | ||
207 | index = h_index; | ||
208 | h_index = heap_index; | ||
209 | heap_index = index; | ||
210 | } | ||
211 | |||
212 | if (size_flag) | ||
213 | index = heap_index - radix_index; | ||
214 | else | ||
215 | index = radix_index; | ||
216 | |||
217 | if (index & mask) { | ||
218 | if (prio_tree_right_empty(cur)) { | ||
219 | INIT_PRIO_TREE_NODE(node); | ||
220 | prio_set_parent(cur, node, false); | ||
221 | return res; | ||
222 | } else | ||
223 | cur = cur->right; | ||
224 | } else { | ||
225 | if (prio_tree_left_empty(cur)) { | ||
226 | INIT_PRIO_TREE_NODE(node); | ||
227 | prio_set_parent(cur, node, true); | ||
228 | return res; | ||
229 | } else | ||
230 | cur = cur->left; | ||
231 | } | ||
232 | |||
233 | mask >>= 1; | ||
234 | |||
235 | if (!mask) { | ||
236 | mask = 1UL << (BITS_PER_LONG - 1); | ||
237 | size_flag = 1; | ||
238 | } | ||
239 | } | ||
240 | /* Should not reach here */ | ||
241 | BUG(); | ||
242 | return NULL; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Remove a prio_tree_node @node from a radix priority search tree @root. The | ||
247 | * algorithm takes O(log n) time where 'log n' is the number of bits required | ||
248 | * to represent the maximum heap_index. | ||
249 | */ | ||
250 | void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node) | ||
251 | { | ||
252 | struct prio_tree_node *cur; | ||
253 | unsigned long r_index, h_index_right, h_index_left; | ||
254 | |||
255 | cur = node; | ||
256 | |||
257 | while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) { | ||
258 | if (!prio_tree_left_empty(cur)) | ||
259 | get_index(root, cur->left, &r_index, &h_index_left); | ||
260 | else { | ||
261 | cur = cur->right; | ||
262 | continue; | ||
263 | } | ||
264 | |||
265 | if (!prio_tree_right_empty(cur)) | ||
266 | get_index(root, cur->right, &r_index, &h_index_right); | ||
267 | else { | ||
268 | cur = cur->left; | ||
269 | continue; | ||
270 | } | ||
271 | |||
272 | /* both h_index_left and h_index_right cannot be 0 */ | ||
273 | if (h_index_left >= h_index_right) | ||
274 | cur = cur->left; | ||
275 | else | ||
276 | cur = cur->right; | ||
277 | } | ||
278 | |||
279 | if (prio_tree_root(cur)) { | ||
280 | BUG_ON(root->prio_tree_node != cur); | ||
281 | __INIT_PRIO_TREE_ROOT(root, root->raw); | ||
282 | return; | ||
283 | } | ||
284 | |||
285 | if (cur->parent->right == cur) | ||
286 | cur->parent->right = cur->parent; | ||
287 | else | ||
288 | cur->parent->left = cur->parent; | ||
289 | |||
290 | while (cur != node) | ||
291 | cur = prio_tree_replace(root, cur->parent, cur); | ||
292 | } | ||
293 | |||
294 | static void iter_walk_down(struct prio_tree_iter *iter) | ||
295 | { | ||
296 | iter->mask >>= 1; | ||
297 | if (iter->mask) { | ||
298 | if (iter->size_level) | ||
299 | iter->size_level++; | ||
300 | return; | ||
301 | } | ||
302 | |||
303 | if (iter->size_level) { | ||
304 | BUG_ON(!prio_tree_left_empty(iter->cur)); | ||
305 | BUG_ON(!prio_tree_right_empty(iter->cur)); | ||
306 | iter->size_level++; | ||
307 | iter->mask = ULONG_MAX; | ||
308 | } else { | ||
309 | iter->size_level = 1; | ||
310 | iter->mask = 1UL << (BITS_PER_LONG - 1); | ||
311 | } | ||
312 | } | ||
313 | |||
314 | static void iter_walk_up(struct prio_tree_iter *iter) | ||
315 | { | ||
316 | if (iter->mask == ULONG_MAX) | ||
317 | iter->mask = 1UL; | ||
318 | else if (iter->size_level == 1) | ||
319 | iter->mask = 1UL; | ||
320 | else | ||
321 | iter->mask <<= 1; | ||
322 | if (iter->size_level) | ||
323 | iter->size_level--; | ||
324 | if (!iter->size_level && (iter->value & iter->mask)) | ||
325 | iter->value ^= iter->mask; | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Following functions help to enumerate all prio_tree_nodes in the tree that | ||
330 | * overlap with the input interval X [radix_index, heap_index]. The enumeration | ||
331 | * takes O(log n + m) time where 'log n' is the height of the tree (which is | ||
332 | * proportional to # of bits required to represent the maximum heap_index) and | ||
333 | * 'm' is the number of prio_tree_nodes that overlap the interval X. | ||
334 | */ | ||
335 | |||
336 | static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter, | ||
337 | unsigned long *r_index, unsigned long *h_index) | ||
338 | { | ||
339 | if (prio_tree_left_empty(iter->cur)) | ||
340 | return NULL; | ||
341 | |||
342 | get_index(iter->root, iter->cur->left, r_index, h_index); | ||
343 | |||
344 | if (iter->r_index <= *h_index) { | ||
345 | iter->cur = iter->cur->left; | ||
346 | iter_walk_down(iter); | ||
347 | return iter->cur; | ||
348 | } | ||
349 | |||
350 | return NULL; | ||
351 | } | ||
352 | |||
353 | static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter, | ||
354 | unsigned long *r_index, unsigned long *h_index) | ||
355 | { | ||
356 | unsigned long value; | ||
357 | |||
358 | if (prio_tree_right_empty(iter->cur)) | ||
359 | return NULL; | ||
360 | |||
361 | if (iter->size_level) | ||
362 | value = iter->value; | ||
363 | else | ||
364 | value = iter->value | iter->mask; | ||
365 | |||
366 | if (iter->h_index < value) | ||
367 | return NULL; | ||
368 | |||
369 | get_index(iter->root, iter->cur->right, r_index, h_index); | ||
370 | |||
371 | if (iter->r_index <= *h_index) { | ||
372 | iter->cur = iter->cur->right; | ||
373 | iter_walk_down(iter); | ||
374 | return iter->cur; | ||
375 | } | ||
376 | |||
377 | return NULL; | ||
378 | } | ||
379 | |||
380 | static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter) | ||
381 | { | ||
382 | iter->cur = iter->cur->parent; | ||
383 | iter_walk_up(iter); | ||
384 | return iter->cur; | ||
385 | } | ||
386 | |||
387 | static inline int overlap(struct prio_tree_iter *iter, | ||
388 | unsigned long r_index, unsigned long h_index) | ||
389 | { | ||
390 | return iter->h_index >= r_index && iter->r_index <= h_index; | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * prio_tree_first: | ||
395 | * | ||
396 | * Get the first prio_tree_node that overlaps with the interval [radix_index, | ||
397 | * heap_index]. Note that always radix_index <= heap_index. We do a pre-order | ||
398 | * traversal of the tree. | ||
399 | */ | ||
400 | static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter) | ||
401 | { | ||
402 | struct prio_tree_root *root; | ||
403 | unsigned long r_index, h_index; | ||
404 | |||
405 | INIT_PRIO_TREE_ITER(iter); | ||
406 | |||
407 | root = iter->root; | ||
408 | if (prio_tree_empty(root)) | ||
409 | return NULL; | ||
410 | |||
411 | get_index(root, root->prio_tree_node, &r_index, &h_index); | ||
412 | |||
413 | if (iter->r_index > h_index) | ||
414 | return NULL; | ||
415 | |||
416 | iter->mask = 1UL << (root->index_bits - 1); | ||
417 | iter->cur = root->prio_tree_node; | ||
418 | |||
419 | while (1) { | ||
420 | if (overlap(iter, r_index, h_index)) | ||
421 | return iter->cur; | ||
422 | |||
423 | if (prio_tree_left(iter, &r_index, &h_index)) | ||
424 | continue; | ||
425 | |||
426 | if (prio_tree_right(iter, &r_index, &h_index)) | ||
427 | continue; | ||
428 | |||
429 | break; | ||
430 | } | ||
431 | return NULL; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * prio_tree_next: | ||
436 | * | ||
437 | * Get the next prio_tree_node that overlaps with the input interval in iter | ||
438 | */ | ||
439 | struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter) | ||
440 | { | ||
441 | unsigned long r_index, h_index; | ||
442 | |||
443 | if (iter->cur == NULL) | ||
444 | return prio_tree_first(iter); | ||
445 | |||
446 | repeat: | ||
447 | while (prio_tree_left(iter, &r_index, &h_index)) | ||
448 | if (overlap(iter, r_index, h_index)) | ||
449 | return iter->cur; | ||
450 | |||
451 | while (!prio_tree_right(iter, &r_index, &h_index)) { | ||
452 | while (!prio_tree_root(iter->cur) && | ||
453 | iter->cur->parent->right == iter->cur) | ||
454 | prio_tree_parent(iter); | ||
455 | |||
456 | if (prio_tree_root(iter->cur)) | ||
457 | return NULL; | ||
458 | |||
459 | prio_tree_parent(iter); | ||
460 | } | ||
461 | |||
462 | if (overlap(iter, r_index, h_index)) | ||
463 | return iter->cur; | ||
464 | |||
465 | goto repeat; | ||
466 | } | ||
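The stabbing query this deleted header comment describes, reporting every stored interval that overlaps a query interval X in O(log n + m) time, is exactly what the interval tree added earlier in this series answers, which is why the whole file can be removed. A hedged sketch of the equivalent query against the replacement API (the helper name is invented; interval_tree_iter_first() and interval_tree_iter_next() are used the same way by lib/interval_tree_test_main.c above):

#include <linux/interval_tree.h>

/* Count every stored interval overlapping the closed query [start, last]. */
static unsigned long count_overlaps(struct rb_root *root,
				    unsigned long start, unsigned long last)
{
	struct interval_tree_node *node;
	unsigned long hits = 0;

	for (node = interval_tree_iter_first(root, start, last); node;
	     node = interval_tree_iter_next(node, start, last))
		hits++;		/* node->start <= last && node->last >= start */
	return hits;
}

The enumeration still costs O(log n + m) for m reported overlaps, so nothing the PST provided is lost; only the dedicated index structure goes away.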
diff --git a/lib/rbtree.c b/lib/rbtree.c index d4175565dc2c..4f56a11d67fa 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c | |||
@@ -2,7 +2,8 @@ | |||
2 | Red Black Trees | 2 | Red Black Trees |
3 | (C) 1999 Andrea Arcangeli <andrea@suse.de> | 3 | (C) 1999 Andrea Arcangeli <andrea@suse.de> |
4 | (C) 2002 David Woodhouse <dwmw2@infradead.org> | 4 | (C) 2002 David Woodhouse <dwmw2@infradead.org> |
5 | 5 | (C) 2012 Michel Lespinasse <walken@google.com> | |
6 | |||
6 | This program is free software; you can redistribute it and/or modify | 7 | This program is free software; you can redistribute it and/or modify |
7 | it under the terms of the GNU General Public License as published by | 8 | it under the terms of the GNU General Public License as published by |
8 | the Free Software Foundation; either version 2 of the License, or | 9 | the Free Software Foundation; either version 2 of the License, or |
@@ -20,339 +21,382 @@ | |||
20 | linux/lib/rbtree.c | 21 | linux/lib/rbtree.c |
21 | */ | 22 | */ |
22 | 23 | ||
23 | #include <linux/rbtree.h> | 24 | #include <linux/rbtree_augmented.h> |
24 | #include <linux/export.h> | 25 | #include <linux/export.h> |
25 | 26 | ||
26 | static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) | 27 | /* |
27 | { | 28 | * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree |
28 | struct rb_node *right = node->rb_right; | 29 | * |
29 | struct rb_node *parent = rb_parent(node); | 30 | * 1) A node is either red or black |
30 | 31 | * 2) The root is black | |
31 | if ((node->rb_right = right->rb_left)) | 32 | * 3) All leaves (NULL) are black |
32 | rb_set_parent(right->rb_left, node); | 33 | * 4) Both children of every red node are black |
33 | right->rb_left = node; | 34 | * 5) Every simple path from root to leaves contains the same number |
34 | 35 | * of black nodes. | |
35 | rb_set_parent(right, parent); | 36 | * |
37 | * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two | ||
38 | * consecutive red nodes in a path and every red node is therefore followed by | ||
39 | * a black. So if B is the number of black nodes on every simple path (as per | ||
40 | * 5), then the longest possible path due to 4 is 2B. | ||
41 | * | ||
42 | * We shall indicate color with case, where black nodes are uppercase and red | ||
43 | * nodes will be lowercase. Unknown color nodes shall be drawn as red within | ||
44 | * parentheses and have some accompanying text comment. | ||
45 | */ | ||
36 | 46 | ||
37 | if (parent) | 47 | static inline void rb_set_black(struct rb_node *rb) |
38 | { | 48 | { |
39 | if (node == parent->rb_left) | 49 | rb->__rb_parent_color |= RB_BLACK; |
40 | parent->rb_left = right; | ||
41 | else | ||
42 | parent->rb_right = right; | ||
43 | } | ||
44 | else | ||
45 | root->rb_node = right; | ||
46 | rb_set_parent(node, right); | ||
47 | } | 50 | } |
48 | 51 | ||
49 | static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) | 52 | static inline struct rb_node *rb_red_parent(struct rb_node *red) |
50 | { | 53 | { |
51 | struct rb_node *left = node->rb_left; | 54 | return (struct rb_node *)red->__rb_parent_color; |
52 | struct rb_node *parent = rb_parent(node); | 55 | } |
53 | |||
54 | if ((node->rb_left = left->rb_right)) | ||
55 | rb_set_parent(left->rb_right, node); | ||
56 | left->rb_right = node; | ||
57 | |||
58 | rb_set_parent(left, parent); | ||
59 | 56 | ||
60 | if (parent) | 57 | /* |
61 | { | 58 | * Helper function for rotations: |
62 | if (node == parent->rb_right) | 59 | * - old's parent and color get assigned to new |
63 | parent->rb_right = left; | 60 | * - old gets assigned new as a parent and 'color' as a color. |
64 | else | 61 | */ |
65 | parent->rb_left = left; | 62 | static inline void |
66 | } | 63 | __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, |
67 | else | 64 | struct rb_root *root, int color) |
68 | root->rb_node = left; | 65 | { |
69 | rb_set_parent(node, left); | 66 | struct rb_node *parent = rb_parent(old); |
67 | new->__rb_parent_color = old->__rb_parent_color; | ||
68 | rb_set_parent_color(old, new, color); | ||
69 | __rb_change_child(old, new, parent, root); | ||
70 | } | 70 | } |
71 | 71 | ||
72 | void rb_insert_color(struct rb_node *node, struct rb_root *root) | 72 | static __always_inline void |
73 | __rb_insert(struct rb_node *node, struct rb_root *root, | ||
74 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) | ||
73 | { | 75 | { |
74 | struct rb_node *parent, *gparent; | 76 | struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; |
75 | 77 | ||
76 | while ((parent = rb_parent(node)) && rb_is_red(parent)) | 78 | while (true) { |
77 | { | 79 | /* |
78 | gparent = rb_parent(parent); | 80 | * Loop invariant: node is red |
79 | 81 | * | |
80 | if (parent == gparent->rb_left) | 82 | * If there is a black parent, we are done. |
81 | { | 83 | * Otherwise, take some corrective action as we don't |
82 | { | 84 | * want a red root or two consecutive red nodes. |
83 | register struct rb_node *uncle = gparent->rb_right; | 85 | */ |
84 | if (uncle && rb_is_red(uncle)) | 86 | if (!parent) { |
85 | { | 87 | rb_set_parent_color(node, NULL, RB_BLACK); |
86 | rb_set_black(uncle); | 88 | break; |
87 | rb_set_black(parent); | 89 | } else if (rb_is_black(parent)) |
88 | rb_set_red(gparent); | 90 | break; |
89 | node = gparent; | 91 | |
90 | continue; | 92 | gparent = rb_red_parent(parent); |
91 | } | 93 | |
94 | tmp = gparent->rb_right; | ||
95 | if (parent != tmp) { /* parent == gparent->rb_left */ | ||
96 | if (tmp && rb_is_red(tmp)) { | ||
97 | /* | ||
98 | * Case 1 - color flips | ||
99 | * | ||
100 | * G g | ||
101 | * / \ / \ | ||
102 | * p u --> P U | ||
103 | * / / | ||
104 | * n N | ||
105 | * | ||
106 | * However, since g's parent might be red, and | ||
107 | * 4) does not allow this, we need to recurse | ||
108 | * at g. | ||
109 | */ | ||
110 | rb_set_parent_color(tmp, gparent, RB_BLACK); | ||
111 | rb_set_parent_color(parent, gparent, RB_BLACK); | ||
112 | node = gparent; | ||
113 | parent = rb_parent(node); | ||
114 | rb_set_parent_color(node, parent, RB_RED); | ||
115 | continue; | ||
92 | } | 116 | } |
93 | 117 | ||
94 | if (parent->rb_right == node) | 118 | tmp = parent->rb_right; |
95 | { | 119 | if (node == tmp) { |
96 | register struct rb_node *tmp; | 120 | /* |
97 | __rb_rotate_left(parent, root); | 121 | * Case 2 - left rotate at parent |
98 | tmp = parent; | 122 | * |
123 | * G G | ||
124 | * / \ / \ | ||
125 | * p U --> n U | ||
126 | * \ / | ||
127 | * n p | ||
128 | * | ||
129 | * This still leaves us in violation of 4), the | ||
130 | * continuation into Case 3 will fix that. | ||
131 | */ | ||
132 | parent->rb_right = tmp = node->rb_left; | ||
133 | node->rb_left = parent; | ||
134 | if (tmp) | ||
135 | rb_set_parent_color(tmp, parent, | ||
136 | RB_BLACK); | ||
137 | rb_set_parent_color(parent, node, RB_RED); | ||
138 | augment_rotate(parent, node); | ||
99 | parent = node; | 139 | parent = node; |
100 | node = tmp; | 140 | tmp = node->rb_right; |
101 | } | 141 | } |
102 | 142 | ||
103 | rb_set_black(parent); | 143 | /* |
104 | rb_set_red(gparent); | 144 | * Case 3 - right rotate at gparent |
105 | __rb_rotate_right(gparent, root); | 145 | * |
146 | * G P | ||
147 | * / \ / \ | ||
148 | * p U --> n g | ||
149 | * / \ | ||
150 | * n U | ||
151 | */ | ||
152 | gparent->rb_left = tmp; /* == parent->rb_right */ | ||
153 | parent->rb_right = gparent; | ||
154 | if (tmp) | ||
155 | rb_set_parent_color(tmp, gparent, RB_BLACK); | ||
156 | __rb_rotate_set_parents(gparent, parent, root, RB_RED); | ||
157 | augment_rotate(gparent, parent); | ||
158 | break; | ||
106 | } else { | 159 | } else { |
107 | { | 160 | tmp = gparent->rb_left; |
108 | register struct rb_node *uncle = gparent->rb_left; | 161 | if (tmp && rb_is_red(tmp)) { |
109 | if (uncle && rb_is_red(uncle)) | 162 | /* Case 1 - color flips */ |
110 | { | 163 | rb_set_parent_color(tmp, gparent, RB_BLACK); |
111 | rb_set_black(uncle); | 164 | rb_set_parent_color(parent, gparent, RB_BLACK); |
112 | rb_set_black(parent); | 165 | node = gparent; |
113 | rb_set_red(gparent); | 166 | parent = rb_parent(node); |
114 | node = gparent; | 167 | rb_set_parent_color(node, parent, RB_RED); |
115 | continue; | 168 | continue; |
116 | } | ||
117 | } | 169 | } |
118 | 170 | ||
119 | if (parent->rb_left == node) | 171 | tmp = parent->rb_left; |
120 | { | 172 | if (node == tmp) { |
121 | register struct rb_node *tmp; | 173 | /* Case 2 - right rotate at parent */ |
122 | __rb_rotate_right(parent, root); | 174 | parent->rb_left = tmp = node->rb_right; |
123 | tmp = parent; | 175 | node->rb_right = parent; |
176 | if (tmp) | ||
177 | rb_set_parent_color(tmp, parent, | ||
178 | RB_BLACK); | ||
179 | rb_set_parent_color(parent, node, RB_RED); | ||
180 | augment_rotate(parent, node); | ||
124 | parent = node; | 181 | parent = node; |
125 | node = tmp; | 182 | tmp = node->rb_left; |
126 | } | 183 | } |
127 | 184 | ||
128 | rb_set_black(parent); | 185 | /* Case 3 - left rotate at gparent */ |
129 | rb_set_red(gparent); | 186 | gparent->rb_right = tmp; /* == parent->rb_left */ |
130 | __rb_rotate_left(gparent, root); | 187 | parent->rb_left = gparent; |
188 | if (tmp) | ||
189 | rb_set_parent_color(tmp, gparent, RB_BLACK); | ||
190 | __rb_rotate_set_parents(gparent, parent, root, RB_RED); | ||
191 | augment_rotate(gparent, parent); | ||
192 | break; | ||
131 | } | 193 | } |
132 | } | 194 | } |
133 | |||
134 | rb_set_black(root->rb_node); | ||
135 | } | 195 | } |
136 | EXPORT_SYMBOL(rb_insert_color); | ||
137 | 196 | ||
138 | static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, | 197 | __always_inline void |
139 | struct rb_root *root) | 198 | __rb_erase_color(struct rb_node *parent, struct rb_root *root, |
199 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) | ||
140 | { | 200 | { |
141 | struct rb_node *other; | 201 | struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; |
142 | 202 | ||
143 | while ((!node || rb_is_black(node)) && node != root->rb_node) | 203 | while (true) { |
144 | { | 204 | /* |
145 | if (parent->rb_left == node) | 205 | * Loop invariants: |
146 | { | 206 | * - node is black (or NULL on first iteration) |
147 | other = parent->rb_right; | 207 | * - node is not the root (parent is not NULL) |
148 | if (rb_is_red(other)) | 208 | * - All leaf paths going through parent and node have a |
149 | { | 209 | * black node count that is 1 lower than other leaf paths. |
150 | rb_set_black(other); | 210 | */ |
151 | rb_set_red(parent); | 211 | sibling = parent->rb_right; |
152 | __rb_rotate_left(parent, root); | 212 | if (node != sibling) { /* node == parent->rb_left */ |
153 | other = parent->rb_right; | 213 | if (rb_is_red(sibling)) { |
214 | /* | ||
215 | * Case 1 - left rotate at parent | ||
216 | * | ||
217 | * P S | ||
218 | * / \ / \ | ||
219 | * N s --> p Sr | ||
220 | * / \ / \ | ||
221 | * Sl Sr N Sl | ||
222 | */ | ||
223 | parent->rb_right = tmp1 = sibling->rb_left; | ||
224 | sibling->rb_left = parent; | ||
225 | rb_set_parent_color(tmp1, parent, RB_BLACK); | ||
226 | __rb_rotate_set_parents(parent, sibling, root, | ||
227 | RB_RED); | ||
228 | augment_rotate(parent, sibling); | ||
229 | sibling = tmp1; | ||
154 | } | 230 | } |
155 | if ((!other->rb_left || rb_is_black(other->rb_left)) && | 231 | tmp1 = sibling->rb_right; |
156 | (!other->rb_right || rb_is_black(other->rb_right))) | 232 | if (!tmp1 || rb_is_black(tmp1)) { |
157 | { | 233 | tmp2 = sibling->rb_left; |
158 | rb_set_red(other); | 234 | if (!tmp2 || rb_is_black(tmp2)) { |
159 | node = parent; | 235 | /* |
160 | parent = rb_parent(node); | 236 | * Case 2 - sibling color flip |
161 | } | 237 | * (p could be either color here) |
162 | else | 238 | * |
163 | { | 239 | * (p) (p) |
164 | if (!other->rb_right || rb_is_black(other->rb_right)) | 240 | * / \ / \ |
165 | { | 241 | * N S --> N s |
166 | rb_set_black(other->rb_left); | 242 | * / \ / \ |
167 | rb_set_red(other); | 243 | * Sl Sr Sl Sr |
168 | __rb_rotate_right(other, root); | 244 | * |
169 | other = parent->rb_right; | 245 | * This leaves us violating 5) which |
246 | * can be fixed by flipping p to black | ||
247 | * if it was red, or by recursing at p. | ||
248 | * p is red when coming from Case 1. | ||
249 | */ | ||
250 | rb_set_parent_color(sibling, parent, | ||
251 | RB_RED); | ||
252 | if (rb_is_red(parent)) | ||
253 | rb_set_black(parent); | ||
254 | else { | ||
255 | node = parent; | ||
256 | parent = rb_parent(node); | ||
257 | if (parent) | ||
258 | continue; | ||
259 | } | ||
260 | break; | ||
170 | } | 261 | } |
171 | rb_set_color(other, rb_color(parent)); | 262 | /* |
172 | rb_set_black(parent); | 263 | * Case 3 - right rotate at sibling |
173 | rb_set_black(other->rb_right); | 264 | * (p could be either color here) |
174 | __rb_rotate_left(parent, root); | 265 | * |
175 | node = root->rb_node; | 266 | * (p) (p) |
176 | break; | 267 | * / \ / \ |
177 | } | 268 | * N S --> N Sl |
178 | } | 269 | * / \ \ |
179 | else | 270 | * sl Sr s |
180 | { | 271 | * \ |
181 | other = parent->rb_left; | 272 | * Sr |
182 | if (rb_is_red(other)) | 273 | */ |
183 | { | 274 | sibling->rb_left = tmp1 = tmp2->rb_right; |
184 | rb_set_black(other); | 275 | tmp2->rb_right = sibling; |
185 | rb_set_red(parent); | 276 | parent->rb_right = tmp2; |
186 | __rb_rotate_right(parent, root); | 277 | if (tmp1) |
187 | other = parent->rb_left; | 278 | rb_set_parent_color(tmp1, sibling, |
279 | RB_BLACK); | ||
280 | augment_rotate(sibling, tmp2); | ||
281 | tmp1 = sibling; | ||
282 | sibling = tmp2; | ||
188 | } | 283 | } |
189 | if ((!other->rb_left || rb_is_black(other->rb_left)) && | 284 | /* |
190 | (!other->rb_right || rb_is_black(other->rb_right))) | 285 | * Case 4 - left rotate at parent + color flips |
191 | { | 286 | * (p and sl could be either color here. |
192 | rb_set_red(other); | 287 | * After rotation, p becomes black, s acquires |
193 | node = parent; | 288 | * p's color, and sl keeps its color) |
194 | parent = rb_parent(node); | 289 | * |
290 | * (p) (s) | ||
291 | * / \ / \ | ||
292 | * N S --> P Sr | ||
293 | * / \ / \ | ||
294 | * (sl) sr N (sl) | ||
295 | */ | ||
296 | parent->rb_right = tmp2 = sibling->rb_left; | ||
297 | sibling->rb_left = parent; | ||
298 | rb_set_parent_color(tmp1, sibling, RB_BLACK); | ||
299 | if (tmp2) | ||
300 | rb_set_parent(tmp2, parent); | ||
301 | __rb_rotate_set_parents(parent, sibling, root, | ||
302 | RB_BLACK); | ||
303 | augment_rotate(parent, sibling); | ||
304 | break; | ||
305 | } else { | ||
306 | sibling = parent->rb_left; | ||
307 | if (rb_is_red(sibling)) { | ||
308 | /* Case 1 - right rotate at parent */ | ||
309 | parent->rb_left = tmp1 = sibling->rb_right; | ||
310 | sibling->rb_right = parent; | ||
311 | rb_set_parent_color(tmp1, parent, RB_BLACK); | ||
312 | __rb_rotate_set_parents(parent, sibling, root, | ||
313 | RB_RED); | ||
314 | augment_rotate(parent, sibling); | ||
315 | sibling = tmp1; | ||
195 | } | 316 | } |
196 | else | 317 | tmp1 = sibling->rb_left; |
197 | { | 318 | if (!tmp1 || rb_is_black(tmp1)) { |
198 | if (!other->rb_left || rb_is_black(other->rb_left)) | 319 | tmp2 = sibling->rb_right; |
199 | { | 320 | if (!tmp2 || rb_is_black(tmp2)) { |
200 | rb_set_black(other->rb_right); | 321 | /* Case 2 - sibling color flip */ |
201 | rb_set_red(other); | 322 | rb_set_parent_color(sibling, parent, |
202 | __rb_rotate_left(other, root); | 323 | RB_RED); |
203 | other = parent->rb_left; | 324 | if (rb_is_red(parent)) |
325 | rb_set_black(parent); | ||
326 | else { | ||
327 | node = parent; | ||
328 | parent = rb_parent(node); | ||
329 | if (parent) | ||
330 | continue; | ||
331 | } | ||
332 | break; | ||
204 | } | 333 | } |
205 | rb_set_color(other, rb_color(parent)); | 334 | /* Case 3 - right rotate at sibling */ |
206 | rb_set_black(parent); | 335 | sibling->rb_right = tmp1 = tmp2->rb_left; |
207 | rb_set_black(other->rb_left); | 336 | tmp2->rb_left = sibling; |
208 | __rb_rotate_right(parent, root); | 337 | parent->rb_left = tmp2; |
209 | node = root->rb_node; | 338 | if (tmp1) |
210 | break; | 339 | rb_set_parent_color(tmp1, sibling, |
340 | RB_BLACK); | ||
341 | augment_rotate(sibling, tmp2); | ||
342 | tmp1 = sibling; | ||
343 | sibling = tmp2; | ||
211 | } | 344 | } |
345 | /* Case 4 - left rotate at parent + color flips */ | ||
346 | parent->rb_left = tmp2 = sibling->rb_right; | ||
347 | sibling->rb_right = parent; | ||
348 | rb_set_parent_color(tmp1, sibling, RB_BLACK); | ||
349 | if (tmp2) | ||
350 | rb_set_parent(tmp2, parent); | ||
351 | __rb_rotate_set_parents(parent, sibling, root, | ||
352 | RB_BLACK); | ||
353 | augment_rotate(parent, sibling); | ||
354 | break; | ||
212 | } | 355 | } |
213 | } | 356 | } |
214 | if (node) | ||
215 | rb_set_black(node); | ||
216 | } | 357 | } |
358 | EXPORT_SYMBOL(__rb_erase_color); | ||
217 | 359 | ||
218 | void rb_erase(struct rb_node *node, struct rb_root *root) | 360 | /* |
219 | { | 361 | * Non-augmented rbtree manipulation functions. |
220 | struct rb_node *child, *parent; | 362 | * |
221 | int color; | 363 | * We use dummy augmented callbacks here, and have the compiler optimize them |
222 | 364 | * out of the rb_insert_color() and rb_erase() function definitions. | |
223 | if (!node->rb_left) | 365 | */ |
224 | child = node->rb_right; | ||
225 | else if (!node->rb_right) | ||
226 | child = node->rb_left; | ||
227 | else | ||
228 | { | ||
229 | struct rb_node *old = node, *left; | ||
230 | |||
231 | node = node->rb_right; | ||
232 | while ((left = node->rb_left) != NULL) | ||
233 | node = left; | ||
234 | |||
235 | if (rb_parent(old)) { | ||
236 | if (rb_parent(old)->rb_left == old) | ||
237 | rb_parent(old)->rb_left = node; | ||
238 | else | ||
239 | rb_parent(old)->rb_right = node; | ||
240 | } else | ||
241 | root->rb_node = node; | ||
242 | |||
243 | child = node->rb_right; | ||
244 | parent = rb_parent(node); | ||
245 | color = rb_color(node); | ||
246 | |||
247 | if (parent == old) { | ||
248 | parent = node; | ||
249 | } else { | ||
250 | if (child) | ||
251 | rb_set_parent(child, parent); | ||
252 | parent->rb_left = child; | ||
253 | |||
254 | node->rb_right = old->rb_right; | ||
255 | rb_set_parent(old->rb_right, node); | ||
256 | } | ||
257 | |||
258 | node->rb_parent_color = old->rb_parent_color; | ||
259 | node->rb_left = old->rb_left; | ||
260 | rb_set_parent(old->rb_left, node); | ||
261 | 366 | ||
262 | goto color; | 367 | static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} |
263 | } | 368 | static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} |
369 | static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} | ||
264 | 370 | ||
265 | parent = rb_parent(node); | 371 | static const struct rb_augment_callbacks dummy_callbacks = { |
266 | color = rb_color(node); | 372 | dummy_propagate, dummy_copy, dummy_rotate |
267 | 373 | }; | |
268 | if (child) | ||
269 | rb_set_parent(child, parent); | ||
270 | if (parent) | ||
271 | { | ||
272 | if (parent->rb_left == node) | ||
273 | parent->rb_left = child; | ||
274 | else | ||
275 | parent->rb_right = child; | ||
276 | } | ||
277 | else | ||
278 | root->rb_node = child; | ||
279 | 374 | ||
280 | color: | 375 | void rb_insert_color(struct rb_node *node, struct rb_root *root) |
281 | if (color == RB_BLACK) | ||
282 | __rb_erase_color(child, parent, root); | ||
283 | } | ||
284 | EXPORT_SYMBOL(rb_erase); | ||
285 | |||
286 | static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) | ||
287 | { | 376 | { |
288 | struct rb_node *parent; | 377 | __rb_insert(node, root, dummy_rotate); |
289 | |||
290 | up: | ||
291 | func(node, data); | ||
292 | parent = rb_parent(node); | ||
293 | if (!parent) | ||
294 | return; | ||
295 | |||
296 | if (node == parent->rb_left && parent->rb_right) | ||
297 | func(parent->rb_right, data); | ||
298 | else if (parent->rb_left) | ||
299 | func(parent->rb_left, data); | ||
300 | |||
301 | node = parent; | ||
302 | goto up; | ||
303 | } | 378 | } |
379 | EXPORT_SYMBOL(rb_insert_color); | ||
304 | 380 | ||
305 | /* | 381 | void rb_erase(struct rb_node *node, struct rb_root *root) |
306 | * after inserting @node into the tree, update the tree to account for | ||
307 | * both the new entry and any damage done by rebalance | ||
308 | */ | ||
309 | void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) | ||
310 | { | 382 | { |
311 | if (node->rb_left) | 383 | rb_erase_augmented(node, root, &dummy_callbacks); |
312 | node = node->rb_left; | ||
313 | else if (node->rb_right) | ||
314 | node = node->rb_right; | ||
315 | |||
316 | rb_augment_path(node, func, data); | ||
317 | } | 384 | } |
318 | EXPORT_SYMBOL(rb_augment_insert); | 385 | EXPORT_SYMBOL(rb_erase); |
319 | 386 | ||
320 | /* | 387 | /* |
321 | * before removing the node, find the deepest node on the rebalance path | 388 | * Augmented rbtree manipulation functions. |
322 | * that will still be there after @node gets removed | 389 | * |
390 | * This instantiates the same __always_inline functions as in the non-augmented | ||
391 | * case, but this time with user-defined callbacks. | ||
323 | */ | 392 | */ |
324 | struct rb_node *rb_augment_erase_begin(struct rb_node *node) | ||
325 | { | ||
326 | struct rb_node *deepest; | ||
327 | |||
328 | if (!node->rb_right && !node->rb_left) | ||
329 | deepest = rb_parent(node); | ||
330 | else if (!node->rb_right) | ||
331 | deepest = node->rb_left; | ||
332 | else if (!node->rb_left) | ||
333 | deepest = node->rb_right; | ||
334 | else { | ||
335 | deepest = rb_next(node); | ||
336 | if (deepest->rb_right) | ||
337 | deepest = deepest->rb_right; | ||
338 | else if (rb_parent(deepest) != node) | ||
339 | deepest = rb_parent(deepest); | ||
340 | } | ||
341 | |||
342 | return deepest; | ||
343 | } | ||
344 | EXPORT_SYMBOL(rb_augment_erase_begin); | ||
345 | 393 | ||
346 | /* | 394 | void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, |
347 | * after removal, update the tree to account for the removed entry | 395 | void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) |
348 | * and any rebalance damage. | ||
349 | */ | ||
350 | void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) | ||
351 | { | 396 | { |
352 | if (node) | 397 | __rb_insert(node, root, augment_rotate); |
353 | rb_augment_path(node, func, data); | ||
354 | } | 398 | } |
355 | EXPORT_SYMBOL(rb_augment_erase_end); | 399 | EXPORT_SYMBOL(__rb_insert_augmented); |
356 | 400 | ||
357 | /* | 401 | /* |
358 | * This function returns the first node (in sort order) of the tree. | 402 | * This function returns the first node (in sort order) of the tree. |
@@ -387,11 +431,13 @@ struct rb_node *rb_next(const struct rb_node *node) | |||
387 | { | 431 | { |
388 | struct rb_node *parent; | 432 | struct rb_node *parent; |
389 | 433 | ||
390 | if (rb_parent(node) == node) | 434 | if (RB_EMPTY_NODE(node)) |
391 | return NULL; | 435 | return NULL; |
392 | 436 | ||
393 | /* If we have a right-hand child, go down and then left as far | 437 | /* |
394 | as we can. */ | 438 | * If we have a right-hand child, go down and then left as far |
439 | * as we can. | ||
440 | */ | ||
395 | if (node->rb_right) { | 441 | if (node->rb_right) { |
396 | node = node->rb_right; | 442 | node = node->rb_right; |
397 | while (node->rb_left) | 443 | while (node->rb_left) |
@@ -399,12 +445,13 @@ struct rb_node *rb_next(const struct rb_node *node) | |||
399 | return (struct rb_node *)node; | 445 | return (struct rb_node *)node; |
400 | } | 446 | } |
401 | 447 | ||
402 | /* No right-hand children. Everything down and left is | 448 | /* |
403 | smaller than us, so any 'next' node must be in the general | 449 | * No right-hand children. Everything down and left is smaller than us, |
404 | direction of our parent. Go up the tree; any time the | 450 | * so any 'next' node must be in the general direction of our parent. |
405 | ancestor is a right-hand child of its parent, keep going | 451 | * Go up the tree; any time the ancestor is a right-hand child of its |
406 | up. First time it's a left-hand child of its parent, said | 452 | * parent, keep going up. First time it's a left-hand child of its |
407 | parent is our 'next' node. */ | 453 | * parent, said parent is our 'next' node. |
454 | */ | ||
408 | while ((parent = rb_parent(node)) && node == parent->rb_right) | 455 | while ((parent = rb_parent(node)) && node == parent->rb_right) |
409 | node = parent; | 456 | node = parent; |
410 | 457 | ||
@@ -416,11 +463,13 @@ struct rb_node *rb_prev(const struct rb_node *node) | |||
416 | { | 463 | { |
417 | struct rb_node *parent; | 464 | struct rb_node *parent; |
418 | 465 | ||
419 | if (rb_parent(node) == node) | 466 | if (RB_EMPTY_NODE(node)) |
420 | return NULL; | 467 | return NULL; |
421 | 468 | ||
422 | /* If we have a left-hand child, go down and then right as far | 469 | /* |
423 | as we can. */ | 470 | * If we have a left-hand child, go down and then right as far |
471 | * as we can. | ||
472 | */ | ||
424 | if (node->rb_left) { | 473 | if (node->rb_left) { |
425 | node = node->rb_left; | 474 | node = node->rb_left; |
426 | while (node->rb_right) | 475 | while (node->rb_right) |
@@ -428,8 +477,10 @@ struct rb_node *rb_prev(const struct rb_node *node) | |||
428 | return (struct rb_node *)node; | 477 | return (struct rb_node *)node; |
429 | } | 478 | } |
430 | 479 | ||
431 | /* No left-hand children. Go up till we find an ancestor which | 480 | /* |
432 | is a right-hand child of its parent */ | 481 | * No left-hand children. Go up till we find an ancestor which |
482 | * is a right-hand child of its parent. | ||
483 | */ | ||
433 | while ((parent = rb_parent(node)) && node == parent->rb_left) | 484 | while ((parent = rb_parent(node)) && node == parent->rb_left) |
434 | node = parent; | 485 | node = parent; |
435 | 486 | ||
@@ -443,14 +494,7 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, | |||
443 | struct rb_node *parent = rb_parent(victim); | 494 | struct rb_node *parent = rb_parent(victim); |
444 | 495 | ||
445 | /* Set the surrounding nodes to point to the replacement */ | 496 | /* Set the surrounding nodes to point to the replacement */ |
446 | if (parent) { | 497 | __rb_change_child(victim, new, parent, root); |
447 | if (victim == parent->rb_left) | ||
448 | parent->rb_left = new; | ||
449 | else | ||
450 | parent->rb_right = new; | ||
451 | } else { | ||
452 | root->rb_node = new; | ||
453 | } | ||
454 | if (victim->rb_left) | 498 | if (victim->rb_left) |
455 | rb_set_parent(victim->rb_left, new); | 499 | rb_set_parent(victim->rb_left, new); |
456 | if (victim->rb_right) | 500 | if (victim->rb_right) |
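Before the test module, one step of the argument in the rewritten file's header comment is worth spelling out. Properties 4) and 5) bound every root-to-leaf path by 2B nodes when each path carries B black nodes; the missing half, standard red-black reasoning rather than anything this commit adds, is that the B black levels alone force the tree to be large:

	n >= 2^B - 1   =>   B <= log2(n + 1)   =>   h <= 2B <= 2 * log2(n + 1)

so an n-node tree has height at most 2 * log2(n + 1), which is the O(log n) behaviour the rbtree and interval tree benchmarks below measure in practice.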
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c new file mode 100644 index 000000000000..268b23951fec --- /dev/null +++ b/lib/rbtree_test.c | |||
@@ -0,0 +1,234 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/rbtree_augmented.h> | ||
3 | #include <linux/random.h> | ||
4 | #include <asm/timex.h> | ||
5 | |||
6 | #define NODES 100 | ||
7 | #define PERF_LOOPS 100000 | ||
8 | #define CHECK_LOOPS 100 | ||
9 | |||
10 | struct test_node { | ||
11 | struct rb_node rb; | ||
12 | u32 key; | ||
13 | |||
14 | /* following fields used for testing augmented rbtree functionality */ | ||
15 | u32 val; | ||
16 | u32 augmented; | ||
17 | }; | ||
18 | |||
19 | static struct rb_root root = RB_ROOT; | ||
20 | static struct test_node nodes[NODES]; | ||
21 | |||
22 | static struct rnd_state rnd; | ||
23 | |||
24 | static void insert(struct test_node *node, struct rb_root *root) | ||
25 | { | ||
26 | struct rb_node **new = &root->rb_node, *parent = NULL; | ||
27 | u32 key = node->key; | ||
28 | |||
29 | while (*new) { | ||
30 | parent = *new; | ||
31 | if (key < rb_entry(parent, struct test_node, rb)->key) | ||
32 | new = &parent->rb_left; | ||
33 | else | ||
34 | new = &parent->rb_right; | ||
35 | } | ||
36 | |||
37 | rb_link_node(&node->rb, parent, new); | ||
38 | rb_insert_color(&node->rb, root); | ||
39 | } | ||
40 | |||
41 | static inline void erase(struct test_node *node, struct rb_root *root) | ||
42 | { | ||
43 | rb_erase(&node->rb, root); | ||
44 | } | ||
45 | |||
46 | static inline u32 augment_recompute(struct test_node *node) | ||
47 | { | ||
48 | u32 max = node->val, child_augmented; | ||
49 | if (node->rb.rb_left) { | ||
50 | child_augmented = rb_entry(node->rb.rb_left, struct test_node, | ||
51 | rb)->augmented; | ||
52 | if (max < child_augmented) | ||
53 | max = child_augmented; | ||
54 | } | ||
55 | if (node->rb.rb_right) { | ||
56 | child_augmented = rb_entry(node->rb.rb_right, struct test_node, | ||
57 | rb)->augmented; | ||
58 | if (max < child_augmented) | ||
59 | max = child_augmented; | ||
60 | } | ||
61 | return max; | ||
62 | } | ||
63 | |||
64 | RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb, | ||
65 | u32, augmented, augment_recompute) | ||
66 | |||
67 | static void insert_augmented(struct test_node *node, struct rb_root *root) | ||
68 | { | ||
69 | struct rb_node **new = &root->rb_node, *rb_parent = NULL; | ||
70 | u32 key = node->key; | ||
71 | u32 val = node->val; | ||
72 | struct test_node *parent; | ||
73 | |||
74 | while (*new) { | ||
75 | rb_parent = *new; | ||
76 | parent = rb_entry(rb_parent, struct test_node, rb); | ||
77 | if (parent->augmented < val) | ||
78 | parent->augmented = val; | ||
79 | if (key < parent->key) | ||
80 | new = &parent->rb.rb_left; | ||
81 | else | ||
82 | new = &parent->rb.rb_right; | ||
83 | } | ||
84 | |||
85 | node->augmented = val; | ||
86 | rb_link_node(&node->rb, rb_parent, new); | ||
87 | rb_insert_augmented(&node->rb, root, &augment_callbacks); | ||
88 | } | ||
89 | |||
90 | static void erase_augmented(struct test_node *node, struct rb_root *root) | ||
91 | { | ||
92 | rb_erase_augmented(&node->rb, root, &augment_callbacks); | ||
93 | } | ||
94 | |||
95 | static void init(void) | ||
96 | { | ||
97 | int i; | ||
98 | for (i = 0; i < NODES; i++) { | ||
99 | nodes[i].key = prandom32(&rnd); | ||
100 | nodes[i].val = prandom32(&rnd); | ||
101 | } | ||
102 | } | ||
103 | |||
104 | static bool is_red(struct rb_node *rb) | ||
105 | { | ||
106 | return !(rb->__rb_parent_color & 1); | ||
107 | } | ||
108 | |||
109 | static int black_path_count(struct rb_node *rb) | ||
110 | { | ||
111 | int count; | ||
112 | for (count = 0; rb; rb = rb_parent(rb)) | ||
113 | count += !is_red(rb); | ||
114 | return count; | ||
115 | } | ||
116 | |||
117 | static void check(int nr_nodes) | ||
118 | { | ||
119 | struct rb_node *rb; | ||
120 | int count = 0; | ||
121 | int blacks; | ||
122 | u32 prev_key = 0; | ||
123 | |||
124 | for (rb = rb_first(&root); rb; rb = rb_next(rb)) { | ||
125 | struct test_node *node = rb_entry(rb, struct test_node, rb); | ||
126 | WARN_ON_ONCE(node->key < prev_key); | ||
127 | WARN_ON_ONCE(is_red(rb) && | ||
128 | (!rb_parent(rb) || is_red(rb_parent(rb)))); | ||
129 | if (!count) | ||
130 | blacks = black_path_count(rb); | ||
131 | else | ||
132 | WARN_ON_ONCE((!rb->rb_left || !rb->rb_right) && | ||
133 | blacks != black_path_count(rb)); | ||
134 | prev_key = node->key; | ||
135 | count++; | ||
136 | } | ||
137 | WARN_ON_ONCE(count != nr_nodes); | ||
138 | } | ||
139 | |||
140 | static void check_augmented(int nr_nodes) | ||
141 | { | ||
142 | struct rb_node *rb; | ||
143 | |||
144 | check(nr_nodes); | ||
145 | for (rb = rb_first(&root); rb; rb = rb_next(rb)) { | ||
146 | struct test_node *node = rb_entry(rb, struct test_node, rb); | ||
147 | WARN_ON_ONCE(node->augmented != augment_recompute(node)); | ||
148 | } | ||
149 | } | ||
150 | |||
151 | static int rbtree_test_init(void) | ||
152 | { | ||
153 | int i, j; | ||
154 | cycles_t time1, time2, time; | ||
155 | |||
156 | printk(KERN_ALERT "rbtree testing"); | ||
157 | |||
158 | prandom32_seed(&rnd, 3141592653589793238ULL); | ||
159 | init(); | ||
160 | |||
161 | time1 = get_cycles(); | ||
162 | |||
163 | for (i = 0; i < PERF_LOOPS; i++) { | ||
164 | for (j = 0; j < NODES; j++) | ||
165 | insert(nodes + j, &root); | ||
166 | for (j = 0; j < NODES; j++) | ||
167 | erase(nodes + j, &root); | ||
168 | } | ||
169 | |||
170 | time2 = get_cycles(); | ||
171 | time = time2 - time1; | ||
172 | |||
173 | time = div_u64(time, PERF_LOOPS); | ||
174 | printk(" -> %llu cycles\n", (unsigned long long)time); | ||
175 | |||
176 | for (i = 0; i < CHECK_LOOPS; i++) { | ||
177 | init(); | ||
178 | for (j = 0; j < NODES; j++) { | ||
179 | check(j); | ||
180 | insert(nodes + j, &root); | ||
181 | } | ||
182 | for (j = 0; j < NODES; j++) { | ||
183 | check(NODES - j); | ||
184 | erase(nodes + j, &root); | ||
185 | } | ||
186 | check(0); | ||
187 | } | ||
188 | |||
189 | printk(KERN_ALERT "augmented rbtree testing"); | ||
190 | |||
191 | init(); | ||
192 | |||
193 | time1 = get_cycles(); | ||
194 | |||
195 | for (i = 0; i < PERF_LOOPS; i++) { | ||
196 | for (j = 0; j < NODES; j++) | ||
197 | insert_augmented(nodes + j, &root); | ||
198 | for (j = 0; j < NODES; j++) | ||
199 | erase_augmented(nodes + j, &root); | ||
200 | } | ||
201 | |||
202 | time2 = get_cycles(); | ||
203 | time = time2 - time1; | ||
204 | |||
205 | time = div_u64(time, PERF_LOOPS); | ||
206 | printk(" -> %llu cycles\n", (unsigned long long)time); | ||
207 | |||
208 | for (i = 0; i < CHECK_LOOPS; i++) { | ||
209 | init(); | ||
210 | for (j = 0; j < NODES; j++) { | ||
211 | check_augmented(j); | ||
212 | insert_augmented(nodes + j, &root); | ||
213 | } | ||
214 | for (j = 0; j < NODES; j++) { | ||
215 | check_augmented(NODES - j); | ||
216 | erase_augmented(nodes + j, &root); | ||
217 | } | ||
218 | check_augmented(0); | ||
219 | } | ||
220 | |||
221 | return -EAGAIN; /* Failing init will directly unload the module */ | ||
222 | } | ||
223 | |||
224 | static void rbtree_test_exit(void) | ||
225 | { | ||
226 | printk(KERN_ALERT "test exit\n"); | ||
227 | } | ||
228 | |||
229 | module_init(rbtree_test_init) | ||
230 | module_exit(rbtree_test_exit) | ||
231 | |||
232 | MODULE_LICENSE("GPL"); | ||
233 | MODULE_AUTHOR("Michel Lespinasse"); | ||
234 | MODULE_DESCRIPTION("Red Black Tree test"); | ||
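
Since the test above only verifies the augmented invariant (node->augmented equals the largest val anywhere in that node's subtree), it may help to see what the invariant buys a user. Below is a minimal sketch of an O(log n) query built on it; find_val_geq() is a made-up name and not part of the module, but it reuses struct test_node and the rbtree accessors from the file above.

    /* Return some node whose val is >= threshold, or NULL if none exists.
     * Each step descends into a child whose subtree maximum (the augmented
     * field) is still large enough, so the walk is O(tree height). */
    static struct test_node *find_val_geq(struct rb_root *root, u32 threshold)
    {
            struct rb_node *rb = root->rb_node;

            while (rb) {
                    struct test_node *node = rb_entry(rb, struct test_node, rb);

                    if (node->augmented < threshold)
                            return NULL;    /* nothing below here is big enough */
                    if (node->val >= threshold)
                            return node;
                    if (rb->rb_left && rb_entry(rb->rb_left, struct test_node,
                                                rb)->augmented >= threshold)
                            rb = rb->rb_left;
                    else
                            rb = rb->rb_right;
            }
            return NULL;
    }

The interval tree added elsewhere in this series relies on essentially the same descend-by-subtree-maximum trick, with interval endpoints in place of val.
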
diff --git a/mm/Kconfig b/mm/Kconfig index d5c8019c6627..a3f8dddaaab3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS | |||
191 | # support for memory compaction | 191 | # support for memory compaction |
192 | config COMPACTION | 192 | config COMPACTION |
193 | bool "Allow for memory compaction" | 193 | bool "Allow for memory compaction" |
194 | def_bool y | ||
194 | select MIGRATION | 195 | select MIGRATION |
195 | depends on MMU | 196 | depends on MMU |
196 | help | 197 | help |
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS | |||
318 | 319 | ||
319 | config TRANSPARENT_HUGEPAGE | 320 | config TRANSPARENT_HUGEPAGE |
320 | bool "Transparent Hugepage Support" | 321 | bool "Transparent Hugepage Support" |
321 | depends on X86 && MMU | 322 | depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE |
322 | select COMPACTION | 323 | select COMPACTION |
323 | help | 324 | help |
324 | Transparent Hugepages allows the kernel to use huge pages and | 325 | Transparent Hugepages allows the kernel to use huge pages and |
diff --git a/mm/Makefile b/mm/Makefile index 92753e2d82da..6b025f80af34 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -14,9 +14,9 @@ endif | |||
14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | 14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
15 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o $(mmu-y) | 19 | compaction.o interval_tree.o $(mmu-y) |
20 | 20 | ||
21 | obj-y += init-mm.o | 21 | obj-y += init-mm.o |
22 | 22 | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185b3b28..434be4ae7a04 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
198 | int order = ilog2(BITS_PER_LONG); | 198 | int order = ilog2(BITS_PER_LONG); |
199 | 199 | ||
200 | __free_pages_bootmem(pfn_to_page(start), order); | 200 | __free_pages_bootmem(pfn_to_page(start), order); |
201 | fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), | ||
202 | start, start + BITS_PER_LONG); | ||
201 | count += BITS_PER_LONG; | 203 | count += BITS_PER_LONG; |
202 | start += BITS_PER_LONG; | 204 | start += BITS_PER_LONG; |
203 | } else { | 205 | } else { |
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
208 | if (vec & 1) { | 210 | if (vec & 1) { |
209 | page = pfn_to_page(start + off); | 211 | page = pfn_to_page(start + off); |
210 | __free_pages_bootmem(page, 0); | 212 | __free_pages_bootmem(page, 0); |
213 | fixup_zone_present_pages( | ||
214 | page_to_nid(page), | ||
215 | start + off, start + off + 1); | ||
211 | count++; | 216 | count++; |
212 | } | 217 | } |
213 | vec >>= 1; | 218 | vec >>= 1; |
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
221 | pages = bdata->node_low_pfn - bdata->node_min_pfn; | 226 | pages = bdata->node_low_pfn - bdata->node_min_pfn; |
222 | pages = bootmem_bootmap_pages(pages); | 227 | pages = bootmem_bootmap_pages(pages); |
223 | count += pages; | 228 | count += pages; |
224 | while (pages--) | 229 | while (pages--) { |
230 | fixup_zone_present_pages(page_to_nid(page), | ||
231 | page_to_pfn(page), page_to_pfn(page) + 1); | ||
225 | __free_pages_bootmem(page++, 0); | 232 | __free_pages_bootmem(page++, 0); |
233 | } | ||
226 | 234 | ||
227 | bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); | 235 | bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); |
228 | 236 | ||
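
The loop being patched frees boot memory at two granularities: when a whole (inverted) bitmap word is all ones, BITS_PER_LONG consecutive pages go back to the buddy allocator as one order-ilog2(BITS_PER_LONG) block, otherwise free pages are released one at a time; the new fixup_zone_present_pages() calls account for each freed range in the owning zone. A rough, self-contained model of the two-granularity part (toy names, plain C, 64-bit longs assumed, alignment checks omitted):

    #include <stdio.h>

    #define TOY_BITS_PER_LONG 64

    /* vec has one bit per page; a set bit means "free", mirroring the
     * inverted bootmem map word the kernel loop operates on. */
    static void toy_free_word(unsigned long start_pfn, unsigned long vec)
    {
            if (vec == ~0UL) {
                    /* every page in the word is free: release one big block */
                    printf("free pfn %lu..%lu as a single order-6 block\n",
                           start_pfn, start_pfn + TOY_BITS_PER_LONG - 1);
                    return;
            }
            /* otherwise release free pages one at a time */
            for (int off = 0; off < TOY_BITS_PER_LONG; off++, vec >>= 1)
                    if (vec & 1)
                            printf("free single pfn %lu\n", start_pfn + off);
    }
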
diff --git a/mm/compaction.c b/mm/compaction.c index 7fcd3a52e68d..2c4ce17651d8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype) | |||
50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | 50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
51 | } | 51 | } |
52 | 52 | ||
53 | #ifdef CONFIG_COMPACTION | ||
54 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | ||
55 | static inline bool isolation_suitable(struct compact_control *cc, | ||
56 | struct page *page) | ||
57 | { | ||
58 | if (cc->ignore_skip_hint) | ||
59 | return true; | ||
60 | |||
61 | return !get_pageblock_skip(page); | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * This function is called to clear all cached information on pageblocks that | ||
66 | * should be skipped for page isolation when the migrate and free page scanner | ||
67 | * meet. | ||
68 | */ | ||
69 | static void __reset_isolation_suitable(struct zone *zone) | ||
70 | { | ||
71 | unsigned long start_pfn = zone->zone_start_pfn; | ||
72 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
73 | unsigned long pfn; | ||
74 | |||
75 | zone->compact_cached_migrate_pfn = start_pfn; | ||
76 | zone->compact_cached_free_pfn = end_pfn; | ||
77 | zone->compact_blockskip_flush = false; | ||
78 | |||
79 | /* Walk the zone and mark every pageblock as suitable for isolation */ | ||
80 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
81 | struct page *page; | ||
82 | |||
83 | cond_resched(); | ||
84 | |||
85 | if (!pfn_valid(pfn)) | ||
86 | continue; | ||
87 | |||
88 | page = pfn_to_page(pfn); | ||
89 | if (zone != page_zone(page)) | ||
90 | continue; | ||
91 | |||
92 | clear_pageblock_skip(page); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | void reset_isolation_suitable(pg_data_t *pgdat) | ||
97 | { | ||
98 | int zoneid; | ||
99 | |||
100 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | ||
101 | struct zone *zone = &pgdat->node_zones[zoneid]; | ||
102 | if (!populated_zone(zone)) | ||
103 | continue; | ||
104 | |||
105 | /* Only flush if a full compaction finished recently */ | ||
106 | if (zone->compact_blockskip_flush) | ||
107 | __reset_isolation_suitable(zone); | ||
108 | } | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * If no pages were isolated then mark this pageblock to be skipped in the | ||
113 | * future. The information is later cleared by __reset_isolation_suitable(). | ||
114 | */ | ||
115 | static void update_pageblock_skip(struct compact_control *cc, | ||
116 | struct page *page, unsigned long nr_isolated, | ||
117 | bool migrate_scanner) | ||
118 | { | ||
119 | struct zone *zone = cc->zone; | ||
120 | if (!page) | ||
121 | return; | ||
122 | |||
123 | if (!nr_isolated) { | ||
124 | unsigned long pfn = page_to_pfn(page); | ||
125 | set_pageblock_skip(page); | ||
126 | |||
127 | /* Update where compaction should restart */ | ||
128 | if (migrate_scanner) { | ||
129 | if (!cc->finished_update_migrate && | ||
130 | pfn > zone->compact_cached_migrate_pfn) | ||
131 | zone->compact_cached_migrate_pfn = pfn; | ||
132 | } else { | ||
133 | if (!cc->finished_update_free && | ||
134 | pfn < zone->compact_cached_free_pfn) | ||
135 | zone->compact_cached_free_pfn = pfn; | ||
136 | } | ||
137 | } | ||
138 | } | ||
139 | #else | ||
140 | static inline bool isolation_suitable(struct compact_control *cc, | ||
141 | struct page *page) | ||
142 | { | ||
143 | return true; | ||
144 | } | ||
145 | |||
146 | static void update_pageblock_skip(struct compact_control *cc, | ||
147 | struct page *page, unsigned long nr_isolated, | ||
148 | bool migrate_scanner) | ||
149 | { | ||
150 | } | ||
151 | #endif /* CONFIG_COMPACTION */ | ||
152 | |||
153 | static inline bool should_release_lock(spinlock_t *lock) | ||
154 | { | ||
155 | return need_resched() || spin_is_contended(lock); | ||
156 | } | ||
157 | |||
53 | /* | 158 | /* |
54 | * Compaction requires the taking of some coarse locks that are potentially | 159 | * Compaction requires the taking of some coarse locks that are potentially |
55 | * very heavily contended. Check if the process needs to be scheduled or | 160 | * very heavily contended. Check if the process needs to be scheduled or |
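
The helpers above only consult, set, and clear a per-pageblock skip bit; the bit itself is kept in the zone's pageblock flags alongside the migratetype bits. As a rough, self-contained model of the lifecycle (toy names and a plain byte array instead of the real pageblock flags; 4KB pages and order-9 pageblocks assumed):

    #include <string.h>

    #define TOY_PAGEBLOCK_ORDER     9       /* 2MB pageblocks with 4KB pages */
    #define TOY_NR_PAGEBLOCKS       1024

    static unsigned char toy_skip[TOY_NR_PAGEBLOCKS];
    static unsigned long toy_zone_start_pfn;

    static unsigned long toy_block(unsigned long pfn)
    {
            return (pfn - toy_zone_start_pfn) >> TOY_PAGEBLOCK_ORDER;
    }

    /* update_pageblock_skip(): nothing was isolated here, skip it next time */
    static void toy_mark_skip(unsigned long pfn)
    {
            toy_skip[toy_block(pfn)] = 1;
    }

    /* isolation_suitable(): should this pageblock be scanned at all? */
    static int toy_should_scan(unsigned long pfn, int ignore_skip_hint)
    {
            return ignore_skip_hint || !toy_skip[toy_block(pfn)];
    }

    /* __reset_isolation_suitable(): forget all hints once the scanners meet */
    static void toy_reset_skip(void)
    {
            memset(toy_skip, 0, sizeof(toy_skip));
    }

The real code also folds in the cached scanner positions (compact_cached_migrate_pfn / compact_cached_free_pfn), which __reset_isolation_suitable() re-initialises to the zone boundaries.
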
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype) | |||
62 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | 167 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, |
63 | bool locked, struct compact_control *cc) | 168 | bool locked, struct compact_control *cc) |
64 | { | 169 | { |
65 | if (need_resched() || spin_is_contended(lock)) { | 170 | if (should_release_lock(lock)) { |
66 | if (locked) { | 171 | if (locked) { |
67 | spin_unlock_irqrestore(lock, *flags); | 172 | spin_unlock_irqrestore(lock, *flags); |
68 | locked = false; | 173 | locked = false; |
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
70 | 175 | ||
71 | /* async aborts if taking too long or contended */ | 176 | /* async aborts if taking too long or contended */ |
72 | if (!cc->sync) { | 177 | if (!cc->sync) { |
73 | if (cc->contended) | 178 | cc->contended = true; |
74 | *cc->contended = true; | ||
75 | return false; | 179 | return false; |
76 | } | 180 | } |
77 | 181 | ||
78 | cond_resched(); | 182 | cond_resched(); |
79 | if (fatal_signal_pending(current)) | ||
80 | return false; | ||
81 | } | 183 | } |
82 | 184 | ||
83 | if (!locked) | 185 | if (!locked) |
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, | |||
91 | return compact_checklock_irqsave(lock, flags, false, cc); | 193 | return compact_checklock_irqsave(lock, flags, false, cc); |
92 | } | 194 | } |
93 | 195 | ||
196 | /* Returns true if the page is within a block suitable for migration to */ | ||
197 | static bool suitable_migration_target(struct page *page) | ||
198 | { | ||
199 | int migratetype = get_pageblock_migratetype(page); | ||
200 | |||
201 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
202 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
203 | return false; | ||
204 | |||
205 | /* If the page is a large free page, then allow migration */ | ||
206 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
207 | return true; | ||
208 | |||
209 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
210 | if (migrate_async_suitable(migratetype)) | ||
211 | return true; | ||
212 | |||
213 | /* Otherwise skip the block */ | ||
214 | return false; | ||
215 | } | ||
216 | |||
217 | static void compact_capture_page(struct compact_control *cc) | ||
218 | { | ||
219 | unsigned long flags; | ||
220 | int mtype, mtype_low, mtype_high; | ||
221 | |||
222 | if (!cc->page || *cc->page) | ||
223 | return; | ||
224 | |||
225 | /* | ||
226 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
227 | * regardless of the migratetype of the freelist it is captured from. | ||
228 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
229 | * allocation is typically at least a pageblock size and overall | ||
230 | * fragmentation is not impaired. Other allocation types must | ||
231 | * capture pages from their own migratelist because otherwise they | ||
232 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
233 | * difficult-to-move pages, making fragmentation worse overall. | ||
234 | */ | ||
235 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
236 | mtype_low = 0; | ||
237 | mtype_high = MIGRATE_PCPTYPES; | ||
238 | } else { | ||
239 | mtype_low = cc->migratetype; | ||
240 | mtype_high = cc->migratetype + 1; | ||
241 | } | ||
242 | |||
243 | /* Speculatively examine the free lists without zone lock */ | ||
244 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
245 | int order; | ||
246 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
247 | struct page *page; | ||
248 | struct free_area *area; | ||
249 | area = &(cc->zone->free_area[order]); | ||
250 | if (list_empty(&area->free_list[mtype])) | ||
251 | continue; | ||
252 | |||
253 | /* Take the lock and attempt capture of the page */ | ||
254 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
255 | return; | ||
256 | if (!list_empty(&area->free_list[mtype])) { | ||
257 | page = list_entry(area->free_list[mtype].next, | ||
258 | struct page, lru); | ||
259 | if (capture_free_page(page, cc->order, mtype)) { | ||
260 | spin_unlock_irqrestore(&cc->zone->lock, | ||
261 | flags); | ||
262 | *cc->page = page; | ||
263 | return; | ||
264 | } | ||
265 | } | ||
266 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
267 | } | ||
268 | } | ||
269 | } | ||
270 | |||
94 | /* | 271 | /* |
95 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 272 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
96 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 273 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
97 | * pages inside of the pageblock (even though it may still end up isolating | 274 | * pages inside of the pageblock (even though it may still end up isolating |
98 | * some pages). | 275 | * some pages). |
99 | */ | 276 | */ |
100 | static unsigned long isolate_freepages_block(unsigned long blockpfn, | 277 | static unsigned long isolate_freepages_block(struct compact_control *cc, |
278 | unsigned long blockpfn, | ||
101 | unsigned long end_pfn, | 279 | unsigned long end_pfn, |
102 | struct list_head *freelist, | 280 | struct list_head *freelist, |
103 | bool strict) | 281 | bool strict) |
104 | { | 282 | { |
105 | int nr_scanned = 0, total_isolated = 0; | 283 | int nr_scanned = 0, total_isolated = 0; |
106 | struct page *cursor; | 284 | struct page *cursor, *valid_page = NULL; |
285 | unsigned long nr_strict_required = end_pfn - blockpfn; | ||
286 | unsigned long flags; | ||
287 | bool locked = false; | ||
107 | 288 | ||
108 | cursor = pfn_to_page(blockpfn); | 289 | cursor = pfn_to_page(blockpfn); |
109 | 290 | ||
110 | /* Isolate free pages. This assumes the block is valid */ | 291 | /* Isolate free pages. */ |
111 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { | 292 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { |
112 | int isolated, i; | 293 | int isolated, i; |
113 | struct page *page = cursor; | 294 | struct page *page = cursor; |
114 | 295 | ||
115 | if (!pfn_valid_within(blockpfn)) { | ||
116 | if (strict) | ||
117 | return 0; | ||
118 | continue; | ||
119 | } | ||
120 | nr_scanned++; | 296 | nr_scanned++; |
297 | if (!pfn_valid_within(blockpfn)) | ||
298 | continue; | ||
299 | if (!valid_page) | ||
300 | valid_page = page; | ||
301 | if (!PageBuddy(page)) | ||
302 | continue; | ||
121 | 303 | ||
122 | if (!PageBuddy(page)) { | 304 | /* |
123 | if (strict) | 305 | * The zone lock must be held to isolate freepages. |
124 | return 0; | 306 | * Unfortunately this is a very coarse lock and can be |
307 | * heavily contended if there are parallel allocations | ||
308 | * or parallel compactions. For async compaction do not | ||
309 | * spin on the lock and we acquire the lock as late as | ||
310 | * possible. | ||
311 | */ | ||
312 | locked = compact_checklock_irqsave(&cc->zone->lock, &flags, | ||
313 | locked, cc); | ||
314 | if (!locked) | ||
315 | break; | ||
316 | |||
317 | /* Recheck this is a suitable migration target under lock */ | ||
318 | if (!strict && !suitable_migration_target(page)) | ||
319 | break; | ||
320 | |||
321 | /* Recheck this is a buddy page under lock */ | ||
322 | if (!PageBuddy(page)) | ||
125 | continue; | 323 | continue; |
126 | } | ||
127 | 324 | ||
128 | /* Found a free page, break it into order-0 pages */ | 325 | /* Found a free page, break it into order-0 pages */ |
129 | isolated = split_free_page(page); | 326 | isolated = split_free_page(page); |
130 | if (!isolated && strict) | 327 | if (!isolated && strict) |
131 | return 0; | 328 | break; |
132 | total_isolated += isolated; | 329 | total_isolated += isolated; |
133 | for (i = 0; i < isolated; i++) { | 330 | for (i = 0; i < isolated; i++) { |
134 | list_add(&page->lru, freelist); | 331 | list_add(&page->lru, freelist); |
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, | |||
143 | } | 340 | } |
144 | 341 | ||
145 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | 342 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); |
343 | |||
344 | /* | ||
345 | * If strict isolation is requested by CMA then check that all the | ||
346 | * pages requested were isolated. If there were any failures, 0 is | ||
347 | * returned and CMA will fail. | ||
348 | */ | ||
349 | if (strict && nr_strict_required != total_isolated) | ||
350 | total_isolated = 0; | ||
351 | |||
352 | if (locked) | ||
353 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
354 | |||
355 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
356 | if (blockpfn == end_pfn) | ||
357 | update_pageblock_skip(cc, valid_page, total_isolated, false); | ||
358 | |||
146 | return total_isolated; | 359 | return total_isolated; |
147 | } | 360 | } |
148 | 361 | ||
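
The restructured isolate_freepages_block() is an instance of a common pattern: filter candidates without the contended lock, acquire the lock as late as possible, then recheck anything that could have changed before it was taken. A minimal stand-alone sketch of that pattern (pthreads standing in for zone->lock, a boolean flag standing in for PageBuddy(); not kernel code):

    #include <pthread.h>
    #include <stdbool.h>

    struct toy_page {
            bool free;              /* plays the role of PageBuddy() */
            int payload;
    };

    static pthread_mutex_t coarse_lock = PTHREAD_MUTEX_INITIALIZER;

    static bool toy_isolate(struct toy_page *p, int *out)
    {
            if (!p->free)                           /* cheap lockless filter */
                    return false;

            pthread_mutex_lock(&coarse_lock);       /* lock as late as possible */
            if (!p->free) {                         /* recheck under the lock */
                    pthread_mutex_unlock(&coarse_lock);
                    return false;
            }
            p->free = false;                        /* "split" the page off */
            *out = p->payload;
            pthread_mutex_unlock(&coarse_lock);
            return true;
    }

In the kernel version, compact_checklock_irqsave() additionally drops the lock and aborts async compaction when contention is detected, which is what feeds the new cc->contended flag.
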
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, | |||
160 | * a free page). | 373 | * a free page). |
161 | */ | 374 | */ |
162 | unsigned long | 375 | unsigned long |
163 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | 376 | isolate_freepages_range(struct compact_control *cc, |
377 | unsigned long start_pfn, unsigned long end_pfn) | ||
164 | { | 378 | { |
165 | unsigned long isolated, pfn, block_end_pfn, flags; | 379 | unsigned long isolated, pfn, block_end_pfn; |
166 | struct zone *zone = NULL; | ||
167 | LIST_HEAD(freelist); | 380 | LIST_HEAD(freelist); |
168 | 381 | ||
169 | if (pfn_valid(start_pfn)) | ||
170 | zone = page_zone(pfn_to_page(start_pfn)); | ||
171 | |||
172 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { | 382 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { |
173 | if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) | 383 | if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) |
174 | break; | 384 | break; |
175 | 385 | ||
176 | /* | 386 | /* |
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | |||
180 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 390 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
181 | block_end_pfn = min(block_end_pfn, end_pfn); | 391 | block_end_pfn = min(block_end_pfn, end_pfn); |
182 | 392 | ||
183 | spin_lock_irqsave(&zone->lock, flags); | 393 | isolated = isolate_freepages_block(cc, pfn, block_end_pfn, |
184 | isolated = isolate_freepages_block(pfn, block_end_pfn, | ||
185 | &freelist, true); | 394 | &freelist, true); |
186 | spin_unlock_irqrestore(&zone->lock, flags); | ||
187 | 395 | ||
188 | /* | 396 | /* |
189 | * In strict mode, isolate_freepages_block() returns 0 if | 397 | * In strict mode, isolate_freepages_block() returns 0 if |
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone) | |||
253 | * @cc: Compaction control structure. | 461 | * @cc: Compaction control structure. |
254 | * @low_pfn: The first PFN of the range. | 462 | * @low_pfn: The first PFN of the range. |
255 | * @end_pfn: The one-past-the-last PFN of the range. | 463 | * @end_pfn: The one-past-the-last PFN of the range. |
464 | * @unevictable: true if unevictable pages may be isolated | ||
256 | * | 465 | * |
257 | * Isolate all pages that can be migrated from the range specified by | 466 | * Isolate all pages that can be migrated from the range specified by |
258 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal | 467 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal |
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone) | |||
268 | */ | 477 | */ |
269 | unsigned long | 478 | unsigned long |
270 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 479 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
271 | unsigned long low_pfn, unsigned long end_pfn) | 480 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable) |
272 | { | 481 | { |
273 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 482 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
274 | unsigned long nr_scanned = 0, nr_isolated = 0; | 483 | unsigned long nr_scanned = 0, nr_isolated = 0; |
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
276 | isolate_mode_t mode = 0; | 485 | isolate_mode_t mode = 0; |
277 | struct lruvec *lruvec; | 486 | struct lruvec *lruvec; |
278 | unsigned long flags; | 487 | unsigned long flags; |
279 | bool locked; | 488 | bool locked = false; |
489 | struct page *page = NULL, *valid_page = NULL; | ||
280 | 490 | ||
281 | /* | 491 | /* |
282 | * Ensure that there are not too many pages isolated from the LRU | 492 | * Ensure that there are not too many pages isolated from the LRU |
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
296 | 506 | ||
297 | /* Time to isolate some pages for migration */ | 507 | /* Time to isolate some pages for migration */ |
298 | cond_resched(); | 508 | cond_resched(); |
299 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
300 | locked = true; | ||
301 | for (; low_pfn < end_pfn; low_pfn++) { | 509 | for (; low_pfn < end_pfn; low_pfn++) { |
302 | struct page *page; | ||
303 | |||
304 | /* give a chance to irqs before checking need_resched() */ | 510 | /* give a chance to irqs before checking need_resched() */ |
305 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { | 511 | if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { |
306 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 512 | if (should_release_lock(&zone->lru_lock)) { |
307 | locked = false; | 513 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
514 | locked = false; | ||
515 | } | ||
308 | } | 516 | } |
309 | 517 | ||
310 | /* Check if it is ok to still hold the lock */ | ||
311 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | ||
312 | locked, cc); | ||
313 | if (!locked) | ||
314 | break; | ||
315 | |||
316 | /* | 518 | /* |
317 | * migrate_pfn does not necessarily start aligned to a | 519 | * migrate_pfn does not necessarily start aligned to a |
318 | * pageblock. Ensure that pfn_valid is called when moving | 520 | * pageblock. Ensure that pfn_valid is called when moving |
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
340 | if (page_zone(page) != zone) | 542 | if (page_zone(page) != zone) |
341 | continue; | 543 | continue; |
342 | 544 | ||
545 | if (!valid_page) | ||
546 | valid_page = page; | ||
547 | |||
548 | /* If isolation recently failed, do not retry */ | ||
549 | pageblock_nr = low_pfn >> pageblock_order; | ||
550 | if (!isolation_suitable(cc, page)) | ||
551 | goto next_pageblock; | ||
552 | |||
343 | /* Skip if free */ | 553 | /* Skip if free */ |
344 | if (PageBuddy(page)) | 554 | if (PageBuddy(page)) |
345 | continue; | 555 | continue; |
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
349 | * migration is optimistic to see if the minimum amount of work | 559 | * migration is optimistic to see if the minimum amount of work |
350 | * satisfies the allocation | 560 | * satisfies the allocation |
351 | */ | 561 | */ |
352 | pageblock_nr = low_pfn >> pageblock_order; | ||
353 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 562 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
354 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 563 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
355 | low_pfn += pageblock_nr_pages; | 564 | cc->finished_update_migrate = true; |
356 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | 565 | goto next_pageblock; |
357 | last_pageblock_nr = pageblock_nr; | ||
358 | continue; | ||
359 | } | 566 | } |
360 | 567 | ||
568 | /* Check may be lockless but that's ok as we recheck later */ | ||
361 | if (!PageLRU(page)) | 569 | if (!PageLRU(page)) |
362 | continue; | 570 | continue; |
363 | 571 | ||
364 | /* | 572 | /* |
365 | * PageLRU is set, and lru_lock excludes isolation, | 573 | * PageLRU is set. lru_lock normally excludes isolation |
366 | * splitting and collapsing (collapsing has already | 574 | * splitting and collapsing (collapsing has already happened |
367 | * happened if PageLRU is set). | 575 | * if PageLRU is set) but the lock is not necessarily taken |
576 | * here and it is wasteful to take it just to check transhuge. | ||
577 | * Check TransHuge without lock and skip the whole pageblock if | ||
578 | * it's either a transhuge or hugetlbfs page, as calling | ||
579 | * compound_order() without preventing THP from splitting the | ||
580 | * page underneath us may return surprising results. | ||
368 | */ | 581 | */ |
369 | if (PageTransHuge(page)) { | 582 | if (PageTransHuge(page)) { |
583 | if (!locked) | ||
584 | goto next_pageblock; | ||
585 | low_pfn += (1 << compound_order(page)) - 1; | ||
586 | continue; | ||
587 | } | ||
588 | |||
589 | /* Check if it is ok to still hold the lock */ | ||
590 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | ||
591 | locked, cc); | ||
592 | if (!locked || fatal_signal_pending(current)) | ||
593 | break; | ||
594 | |||
595 | /* Recheck PageLRU and PageTransHuge under lock */ | ||
596 | if (!PageLRU(page)) | ||
597 | continue; | ||
598 | if (PageTransHuge(page)) { | ||
370 | low_pfn += (1 << compound_order(page)) - 1; | 599 | low_pfn += (1 << compound_order(page)) - 1; |
371 | continue; | 600 | continue; |
372 | } | 601 | } |
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
374 | if (!cc->sync) | 603 | if (!cc->sync) |
375 | mode |= ISOLATE_ASYNC_MIGRATE; | 604 | mode |= ISOLATE_ASYNC_MIGRATE; |
376 | 605 | ||
606 | if (unevictable) | ||
607 | mode |= ISOLATE_UNEVICTABLE; | ||
608 | |||
377 | lruvec = mem_cgroup_page_lruvec(page, zone); | 609 | lruvec = mem_cgroup_page_lruvec(page, zone); |
378 | 610 | ||
379 | /* Try isolate the page */ | 611 | /* Try isolate the page */ |
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
383 | VM_BUG_ON(PageTransCompound(page)); | 615 | VM_BUG_ON(PageTransCompound(page)); |
384 | 616 | ||
385 | /* Successfully isolated */ | 617 | /* Successfully isolated */ |
618 | cc->finished_update_migrate = true; | ||
386 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 619 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
387 | list_add(&page->lru, migratelist); | 620 | list_add(&page->lru, migratelist); |
388 | cc->nr_migratepages++; | 621 | cc->nr_migratepages++; |
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
393 | ++low_pfn; | 626 | ++low_pfn; |
394 | break; | 627 | break; |
395 | } | 628 | } |
629 | |||
630 | continue; | ||
631 | |||
632 | next_pageblock: | ||
633 | low_pfn += pageblock_nr_pages; | ||
634 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
635 | last_pageblock_nr = pageblock_nr; | ||
396 | } | 636 | } |
397 | 637 | ||
398 | acct_isolated(zone, locked, cc); | 638 | acct_isolated(zone, locked, cc); |
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
400 | if (locked) | 640 | if (locked) |
401 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 641 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
402 | 642 | ||
643 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
644 | if (low_pfn == end_pfn) | ||
645 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | ||
646 | |||
403 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 647 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
404 | 648 | ||
405 | return low_pfn; | 649 | return low_pfn; |
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
407 | 651 | ||
408 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 652 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
409 | #ifdef CONFIG_COMPACTION | 653 | #ifdef CONFIG_COMPACTION |
410 | |||
411 | /* Returns true if the page is within a block suitable for migration to */ | ||
412 | static bool suitable_migration_target(struct page *page) | ||
413 | { | ||
414 | |||
415 | int migratetype = get_pageblock_migratetype(page); | ||
416 | |||
417 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
418 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
419 | return false; | ||
420 | |||
421 | /* If the page is a large free page, then allow migration */ | ||
422 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
423 | return true; | ||
424 | |||
425 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
426 | if (migrate_async_suitable(migratetype)) | ||
427 | return true; | ||
428 | |||
429 | /* Otherwise skip the block */ | ||
430 | return false; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
435 | * point for full compaction of a zone. Compaction searches for free pages from | ||
436 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
437 | * page block. | ||
438 | */ | ||
439 | static unsigned long start_free_pfn(struct zone *zone) | ||
440 | { | ||
441 | unsigned long free_pfn; | ||
442 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
443 | free_pfn &= ~(pageblock_nr_pages-1); | ||
444 | return free_pfn; | ||
445 | } | ||
446 | |||
447 | /* | 654 | /* |
448 | * Based on information in the current compact_control, find blocks | 655 | * Based on information in the current compact_control, find blocks |
449 | * suitable for isolating free pages from and then isolate them. | 656 | * suitable for isolating free pages from and then isolate them. |
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone, | |||
453 | { | 660 | { |
454 | struct page *page; | 661 | struct page *page; |
455 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | 662 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; |
456 | unsigned long flags; | ||
457 | int nr_freepages = cc->nr_freepages; | 663 | int nr_freepages = cc->nr_freepages; |
458 | struct list_head *freelist = &cc->freepages; | 664 | struct list_head *freelist = &cc->freepages; |
459 | 665 | ||
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone, | |||
501 | if (!suitable_migration_target(page)) | 707 | if (!suitable_migration_target(page)) |
502 | continue; | 708 | continue; |
503 | 709 | ||
504 | /* | 710 | /* If isolation recently failed, do not retry */ |
505 | * Found a block suitable for isolating free pages from. Now | 711 | if (!isolation_suitable(cc, page)) |
506 | * we disabled interrupts, double check things are ok and | 712 | continue; |
507 | * isolate the pages. This is to minimise the time IRQs | ||
508 | * are disabled | ||
509 | */ | ||
510 | isolated = 0; | ||
511 | 713 | ||
512 | /* | 714 | /* Found a block suitable for isolating free pages from */ |
513 | * The zone lock must be held to isolate freepages. This | 715 | isolated = 0; |
514 | * unfortunately this is a very coarse lock and can be | 716 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); |
515 | * heavily contended if there are parallel allocations | 717 | isolated = isolate_freepages_block(cc, pfn, end_pfn, |
516 | * or parallel compactions. For async compaction do not | 718 | freelist, false); |
517 | * spin on the lock | 719 | nr_freepages += isolated; |
518 | */ | ||
519 | if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) | ||
520 | break; | ||
521 | if (suitable_migration_target(page)) { | ||
522 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | ||
523 | isolated = isolate_freepages_block(pfn, end_pfn, | ||
524 | freelist, false); | ||
525 | nr_freepages += isolated; | ||
526 | } | ||
527 | spin_unlock_irqrestore(&zone->lock, flags); | ||
528 | 720 | ||
529 | /* | 721 | /* |
530 | * Record the highest PFN we isolated pages from. When next | 722 | * Record the highest PFN we isolated pages from. When next |
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone, | |||
532 | * page migration may have returned some pages to the allocator | 724 | * page migration may have returned some pages to the allocator |
533 | */ | 725 | */ |
534 | if (isolated) { | 726 | if (isolated) { |
727 | cc->finished_update_free = true; | ||
535 | high_pfn = max(high_pfn, pfn); | 728 | high_pfn = max(high_pfn, pfn); |
536 | |||
537 | /* | ||
538 | * If the free scanner has wrapped, update | ||
539 | * compact_cached_free_pfn to point to the highest | ||
540 | * pageblock with free pages. This reduces excessive | ||
541 | * scanning of full pageblocks near the end of the | ||
542 | * zone | ||
543 | */ | ||
544 | if (cc->order > 0 && cc->wrapped) | ||
545 | zone->compact_cached_free_pfn = high_pfn; | ||
546 | } | 729 | } |
547 | } | 730 | } |
548 | 731 | ||
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone, | |||
551 | 734 | ||
552 | cc->free_pfn = high_pfn; | 735 | cc->free_pfn = high_pfn; |
553 | cc->nr_freepages = nr_freepages; | 736 | cc->nr_freepages = nr_freepages; |
554 | |||
555 | /* If compact_cached_free_pfn is reset then set it now */ | ||
556 | if (cc->order > 0 && !cc->wrapped && | ||
557 | zone->compact_cached_free_pfn == start_free_pfn(zone)) | ||
558 | zone->compact_cached_free_pfn = high_pfn; | ||
559 | } | 737 | } |
560 | 738 | ||
561 | /* | 739 | /* |
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
633 | } | 811 | } |
634 | 812 | ||
635 | /* Perform the isolation */ | 813 | /* Perform the isolation */ |
636 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); | 814 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); |
637 | if (!low_pfn) | 815 | if (!low_pfn || cc->contended) |
638 | return ISOLATE_ABORT; | 816 | return ISOLATE_ABORT; |
639 | 817 | ||
640 | cc->migrate_pfn = low_pfn; | 818 | cc->migrate_pfn = low_pfn; |
@@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
645 | static int compact_finished(struct zone *zone, | 823 | static int compact_finished(struct zone *zone, |
646 | struct compact_control *cc) | 824 | struct compact_control *cc) |
647 | { | 825 | { |
648 | unsigned int order; | ||
649 | unsigned long watermark; | 826 | unsigned long watermark; |
650 | 827 | ||
651 | if (fatal_signal_pending(current)) | 828 | if (fatal_signal_pending(current)) |
652 | return COMPACT_PARTIAL; | 829 | return COMPACT_PARTIAL; |
653 | 830 | ||
654 | /* | 831 | /* Compaction run completes if the migrate and free scanner meet */ |
655 | * A full (order == -1) compaction run starts at the beginning and | ||
656 | * end of a zone; it completes when the migrate and free scanner meet. | ||
657 | * A partial (order > 0) compaction can start with the free scanner | ||
658 | * at a random point in the zone, and may have to restart. | ||
659 | */ | ||
660 | if (cc->free_pfn <= cc->migrate_pfn) { | 832 | if (cc->free_pfn <= cc->migrate_pfn) { |
661 | if (cc->order > 0 && !cc->wrapped) { | 833 | /* |
662 | /* We started partway through; restart at the end. */ | 834 | * Mark that the PG_migrate_skip information should be cleared |
663 | unsigned long free_pfn = start_free_pfn(zone); | 835 | * by kswapd when it goes to sleep. kswapd does not set the |
664 | zone->compact_cached_free_pfn = free_pfn; | 836 | * flag itself as the decision to be clear should be directly |
665 | cc->free_pfn = free_pfn; | 837 | * based on an allocation request. |
666 | cc->wrapped = 1; | 838 | */ |
667 | return COMPACT_CONTINUE; | 839 | if (!current_is_kswapd()) |
668 | } | 840 | zone->compact_blockskip_flush = true; |
669 | return COMPACT_COMPLETE; | ||
670 | } | ||
671 | 841 | ||
672 | /* We wrapped around and ended up where we started. */ | ||
673 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
674 | return COMPACT_COMPLETE; | 842 | return COMPACT_COMPLETE; |
843 | } | ||
675 | 844 | ||
676 | /* | 845 | /* |
677 | * order == -1 is expected when compacting via | 846 | * order == -1 is expected when compacting via |
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone, | |||
688 | return COMPACT_CONTINUE; | 857 | return COMPACT_CONTINUE; |
689 | 858 | ||
690 | /* Direct compactor: Is a suitable page free? */ | 859 | /* Direct compactor: Is a suitable page free? */ |
691 | for (order = cc->order; order < MAX_ORDER; order++) { | 860 | if (cc->page) { |
692 | /* Job done if page is free of the right migratetype */ | 861 | /* Was a suitable page captured? */ |
693 | if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) | 862 | if (*cc->page) |
694 | return COMPACT_PARTIAL; | ||
695 | |||
696 | /* Job done if allocation would set block type */ | ||
697 | if (order >= pageblock_order && zone->free_area[order].nr_free) | ||
698 | return COMPACT_PARTIAL; | 863 | return COMPACT_PARTIAL; |
864 | } else { | ||
865 | unsigned int order; | ||
866 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
867 | struct free_area *area = &zone->free_area[cc->order]; | ||
868 | /* Job done if page is free of the right migratetype */ | ||
869 | if (!list_empty(&area->free_list[cc->migratetype])) | ||
870 | return COMPACT_PARTIAL; | ||
871 | |||
872 | /* Job done if allocation would set block type */ | ||
873 | if (cc->order >= pageblock_order && area->nr_free) | ||
874 | return COMPACT_PARTIAL; | ||
875 | } | ||
699 | } | 876 | } |
700 | 877 | ||
701 | return COMPACT_CONTINUE; | 878 | return COMPACT_CONTINUE; |
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
754 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 931 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
755 | { | 932 | { |
756 | int ret; | 933 | int ret; |
934 | unsigned long start_pfn = zone->zone_start_pfn; | ||
935 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
757 | 936 | ||
758 | ret = compaction_suitable(zone, cc->order); | 937 | ret = compaction_suitable(zone, cc->order); |
759 | switch (ret) { | 938 | switch (ret) { |
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
766 | ; | 945 | ; |
767 | } | 946 | } |
768 | 947 | ||
769 | /* Setup to move all movable pages to the end of the zone */ | 948 | /* |
770 | cc->migrate_pfn = zone->zone_start_pfn; | 949 | * Setup to move all movable pages to the end of the zone. Used cached |
771 | 950 | * information on where the scanners should start but check that it | |
772 | if (cc->order > 0) { | 951 | * is initialised by ensuring the values are within zone boundaries. |
773 | /* Incremental compaction. Start where the last one stopped. */ | 952 | */ |
774 | cc->free_pfn = zone->compact_cached_free_pfn; | 953 | cc->migrate_pfn = zone->compact_cached_migrate_pfn; |
775 | cc->start_free_pfn = cc->free_pfn; | 954 | cc->free_pfn = zone->compact_cached_free_pfn; |
776 | } else { | 955 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { |
777 | /* Order == -1 starts at the end of the zone. */ | 956 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); |
778 | cc->free_pfn = start_free_pfn(zone); | 957 | zone->compact_cached_free_pfn = cc->free_pfn; |
958 | } | ||
959 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | ||
960 | cc->migrate_pfn = start_pfn; | ||
961 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | ||
779 | } | 962 | } |
780 | 963 | ||
964 | /* | ||
965 | * Clear pageblock skip if there were failures recently and compaction | ||
966 | * is about to be retried after being deferred. kswapd does not do | ||
967 | * this reset as it'll reset the cached information when going to sleep. | ||
968 | */ | ||
969 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
970 | __reset_isolation_suitable(zone); | ||
971 | |||
781 | migrate_prep_local(); | 972 | migrate_prep_local(); |
782 | 973 | ||
783 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 974 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
787 | switch (isolate_migratepages(zone, cc)) { | 978 | switch (isolate_migratepages(zone, cc)) { |
788 | case ISOLATE_ABORT: | 979 | case ISOLATE_ABORT: |
789 | ret = COMPACT_PARTIAL; | 980 | ret = COMPACT_PARTIAL; |
981 | putback_lru_pages(&cc->migratepages); | ||
982 | cc->nr_migratepages = 0; | ||
790 | goto out; | 983 | goto out; |
791 | case ISOLATE_NONE: | 984 | case ISOLATE_NONE: |
792 | continue; | 985 | continue; |
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
817 | goto out; | 1010 | goto out; |
818 | } | 1011 | } |
819 | } | 1012 | } |
1013 | |||
1014 | /* Capture a page now if it is a suitable size */ | ||
1015 | compact_capture_page(cc); | ||
820 | } | 1016 | } |
821 | 1017 | ||
822 | out: | 1018 | out: |
@@ -829,8 +1025,10 @@ out: | |||
829 | 1025 | ||
830 | static unsigned long compact_zone_order(struct zone *zone, | 1026 | static unsigned long compact_zone_order(struct zone *zone, |
831 | int order, gfp_t gfp_mask, | 1027 | int order, gfp_t gfp_mask, |
832 | bool sync, bool *contended) | 1028 | bool sync, bool *contended, |
1029 | struct page **page) | ||
833 | { | 1030 | { |
1031 | unsigned long ret; | ||
834 | struct compact_control cc = { | 1032 | struct compact_control cc = { |
835 | .nr_freepages = 0, | 1033 | .nr_freepages = 0, |
836 | .nr_migratepages = 0, | 1034 | .nr_migratepages = 0, |
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
838 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1036 | .migratetype = allocflags_to_migratetype(gfp_mask), |
839 | .zone = zone, | 1037 | .zone = zone, |
840 | .sync = sync, | 1038 | .sync = sync, |
841 | .contended = contended, | 1039 | .page = page, |
842 | }; | 1040 | }; |
843 | INIT_LIST_HEAD(&cc.freepages); | 1041 | INIT_LIST_HEAD(&cc.freepages); |
844 | INIT_LIST_HEAD(&cc.migratepages); | 1042 | INIT_LIST_HEAD(&cc.migratepages); |
845 | 1043 | ||
846 | return compact_zone(zone, &cc); | 1044 | ret = compact_zone(zone, &cc); |
1045 | |||
1046 | VM_BUG_ON(!list_empty(&cc.freepages)); | ||
1047 | VM_BUG_ON(!list_empty(&cc.migratepages)); | ||
1048 | |||
1049 | *contended = cc.contended; | ||
1050 | return ret; | ||
847 | } | 1051 | } |
848 | 1052 | ||
849 | int sysctl_extfrag_threshold = 500; | 1053 | int sysctl_extfrag_threshold = 500; |
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500; | |||
855 | * @gfp_mask: The GFP mask of the current allocation | 1059 | * @gfp_mask: The GFP mask of the current allocation |
856 | * @nodemask: The allowed nodes to allocate from | 1060 | * @nodemask: The allowed nodes to allocate from |
857 | * @sync: Whether migration is synchronous or not | 1061 | * @sync: Whether migration is synchronous or not |
1062 | * @contended: Return value that is true if compaction was aborted due to lock contention | ||
1063 | * @page: Optionally capture a free page of the requested order during compaction | ||
858 | * | 1064 | * |
859 | * This is the main entry point for direct page compaction. | 1065 | * This is the main entry point for direct page compaction. |
860 | */ | 1066 | */ |
861 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1067 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
862 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1068 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
863 | bool sync, bool *contended) | 1069 | bool sync, bool *contended, struct page **page) |
864 | { | 1070 | { |
865 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1071 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
866 | int may_enter_fs = gfp_mask & __GFP_FS; | 1072 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
868 | struct zoneref *z; | 1074 | struct zoneref *z; |
869 | struct zone *zone; | 1075 | struct zone *zone; |
870 | int rc = COMPACT_SKIPPED; | 1076 | int rc = COMPACT_SKIPPED; |
1077 | int alloc_flags = 0; | ||
871 | 1078 | ||
872 | /* | 1079 | /* Check if the GFP flags allow compaction */ |
873 | * Check whether it is worth even starting compaction. The order check is | ||
874 | * made because an assumption is made that the page allocator can satisfy | ||
875 | * the "cheaper" orders without taking special steps | ||
876 | */ | ||
877 | if (!order || !may_enter_fs || !may_perform_io) | 1080 | if (!order || !may_enter_fs || !may_perform_io) |
878 | return rc; | 1081 | return rc; |
879 | 1082 | ||
880 | count_vm_event(COMPACTSTALL); | 1083 | count_vm_event(COMPACTSTALL); |
881 | 1084 | ||
1085 | #ifdef CONFIG_CMA | ||
1086 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
1087 | alloc_flags |= ALLOC_CMA; | ||
1088 | #endif | ||
882 | /* Compact each zone in the list */ | 1089 | /* Compact each zone in the list */ |
883 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1090 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
884 | nodemask) { | 1091 | nodemask) { |
885 | int status; | 1092 | int status; |
886 | 1093 | ||
887 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1094 | status = compact_zone_order(zone, order, gfp_mask, sync, |
888 | contended); | 1095 | contended, page); |
889 | rc = max(status, rc); | 1096 | rc = max(status, rc); |
890 | 1097 | ||
891 | /* If a normal allocation would succeed, stop compacting */ | 1098 | /* If a normal allocation would succeed, stop compacting */ |
892 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | 1099 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, |
1100 | alloc_flags)) | ||
893 | break; | 1101 | break; |
894 | } | 1102 | } |
895 | 1103 | ||
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) | |||
940 | struct compact_control cc = { | 1148 | struct compact_control cc = { |
941 | .order = order, | 1149 | .order = order, |
942 | .sync = false, | 1150 | .sync = false, |
1151 | .page = NULL, | ||
943 | }; | 1152 | }; |
944 | 1153 | ||
945 | return __compact_pgdat(pgdat, &cc); | 1154 | return __compact_pgdat(pgdat, &cc); |
@@ -950,6 +1159,7 @@ static int compact_node(int nid) | |||
950 | struct compact_control cc = { | 1159 | struct compact_control cc = { |
951 | .order = -1, | 1160 | .order = -1, |
952 | .sync = true, | 1161 | .sync = true, |
1162 | .page = NULL, | ||
953 | }; | 1163 | }; |
954 | 1164 | ||
955 | return __compact_pgdat(NODE_DATA(nid), &cc); | 1165 | return __compact_pgdat(NODE_DATA(nid), &cc); |
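
try_to_compact_pages() now reports two extra things to its caller: whether async compaction aborted on lock contention (*contended) and whether compact_capture_page() managed to grab a page of the requested order (*page). The sketch below is not the real page allocator path, only the calling convention implied by the new signature; compact_then_allocate() is a made-up name.

    static struct page *compact_then_allocate(struct zonelist *zonelist, int order,
                                              gfp_t gfp_mask, nodemask_t *nodemask)
    {
            bool contended = false;
            struct page *captured = NULL;
            unsigned long rc;

            rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
                                      false /* async */, &contended, &captured);

            if (captured)           /* a suitable page was captured directly */
                    return captured;
            if (contended || rc == COMPACT_SKIPPED)
                    return NULL;    /* back off; nothing worth retrying yet */

            /* otherwise the caller would retry its freelists here */
            return NULL;
    }
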
diff --git a/mm/filemap.c b/mm/filemap.c index 384344575c37..83efee76a5c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1607 | * Do we have something in the page cache already? | 1607 | * Do we have something in the page cache already? |
1608 | */ | 1608 | */ |
1609 | page = find_get_page(mapping, offset); | 1609 | page = find_get_page(mapping, offset); |
1610 | if (likely(page)) { | 1610 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
1611 | /* | 1611 | /* |
1612 | * We found the page, so try async readahead before | 1612 | * We found the page, so try async readahead before |
1613 | * waiting for the lock. | 1613 | * waiting for the lock. |
1614 | */ | 1614 | */ |
1615 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1615 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1616 | } else { | 1616 | } else if (!page) { |
1617 | /* No page in the page cache at all */ | 1617 | /* No page in the page cache at all */ |
1618 | do_sync_mmap_readahead(vma, ra, file, offset); | 1618 | do_sync_mmap_readahead(vma, ra, file, offset); |
1619 | count_vm_event(PGMAJFAULT); | 1619 | count_vm_event(PGMAJFAULT); |
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); | |||
1737 | const struct vm_operations_struct generic_file_vm_ops = { | 1737 | const struct vm_operations_struct generic_file_vm_ops = { |
1738 | .fault = filemap_fault, | 1738 | .fault = filemap_fault, |
1739 | .page_mkwrite = filemap_page_mkwrite, | 1739 | .page_mkwrite = filemap_page_mkwrite, |
1740 | .remap_pages = generic_file_remap_pages, | ||
1740 | }; | 1741 | }; |
1741 | 1742 | ||
1742 | /* This is used for a general mmap of a disk file */ | 1743 | /* This is used for a general mmap of a disk file */ |
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
1749 | return -ENOEXEC; | 1750 | return -ENOEXEC; |
1750 | file_accessed(file); | 1751 | file_accessed(file); |
1751 | vma->vm_ops = &generic_file_vm_ops; | 1752 | vma->vm_ops = &generic_file_vm_ops; |
1752 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1753 | return 0; | 1753 | return 0; |
1754 | } | 1754 | } |
1755 | 1755 | ||
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 13e013b1270c..a912da6ddfd4 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, | |||
167 | { | 167 | { |
168 | struct vm_area_struct *vma; | 168 | struct vm_area_struct *vma; |
169 | struct mm_struct *mm; | 169 | struct mm_struct *mm; |
170 | struct prio_tree_iter iter; | ||
171 | unsigned long address; | 170 | unsigned long address; |
172 | pte_t *pte; | 171 | pte_t *pte; |
173 | pte_t pteval; | 172 | pte_t pteval; |
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, | |||
184 | 183 | ||
185 | retry: | 184 | retry: |
186 | mutex_lock(&mapping->i_mmap_mutex); | 185 | mutex_lock(&mapping->i_mmap_mutex); |
187 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 186 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
188 | mm = vma->vm_mm; | 187 | mm = vma->vm_mm; |
189 | address = vma->vm_start + | 188 | address = vma->vm_start + |
190 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 189 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
@@ -193,11 +192,13 @@ retry: | |||
193 | if (pte) { | 192 | if (pte) { |
194 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
195 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
196 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush(vma, address, pte); |
197 | page_remove_rmap(page); | 196 | page_remove_rmap(page); |
198 | dec_mm_counter(mm, MM_FILEPAGES); | 197 | dec_mm_counter(mm, MM_FILEPAGES); |
199 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
200 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
200 | /* must invalidate_page _before_ freeing the page */ | ||
201 | mmu_notifier_invalidate_page(mm, address); | ||
201 | page_cache_release(page); | 202 | page_cache_release(page); |
202 | } | 203 | } |
203 | } | 204 | } |
@@ -305,6 +306,7 @@ out: | |||
305 | static const struct vm_operations_struct xip_file_vm_ops = { | 306 | static const struct vm_operations_struct xip_file_vm_ops = { |
306 | .fault = xip_file_fault, | 307 | .fault = xip_file_fault, |
307 | .page_mkwrite = filemap_page_mkwrite, | 308 | .page_mkwrite = filemap_page_mkwrite, |
309 | .remap_pages = generic_file_remap_pages, | ||
308 | }; | 310 | }; |
309 | 311 | ||
310 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | 312 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) |
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
313 | 315 | ||
314 | file_accessed(file); | 316 | file_accessed(file); |
315 | vma->vm_ops = &xip_file_vm_ops; | 317 | vma->vm_ops = &xip_file_vm_ops; |
316 | vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; | 318 | vma->vm_flags |= VM_MIXEDMAP; |
317 | return 0; | 319 | return 0; |
318 | } | 320 | } |
319 | EXPORT_SYMBOL_GPL(xip_file_mmap); | 321 | EXPORT_SYMBOL_GPL(xip_file_mmap); |
diff --git a/mm/fremap.c b/mm/fremap.c index 048659c0c03d..3899a86851ce 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * | 5 | * |
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 |
7 | */ | 7 | */ |
8 | #include <linux/export.h> | ||
8 | #include <linux/backing-dev.h> | 9 | #include <linux/backing-dev.h> |
9 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 11 | #include <linux/swap.h> |
@@ -80,9 +81,10 @@ out: | |||
80 | return err; | 81 | return err; |
81 | } | 82 | } |
82 | 83 | ||
83 | static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | 84 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, |
84 | unsigned long addr, unsigned long size, pgoff_t pgoff) | 85 | unsigned long size, pgoff_t pgoff) |
85 | { | 86 | { |
87 | struct mm_struct *mm = vma->vm_mm; | ||
86 | int err; | 88 | int err; |
87 | 89 | ||
88 | do { | 90 | do { |
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | |||
95 | pgoff++; | 97 | pgoff++; |
96 | } while (size); | 98 | } while (size); |
97 | 99 | ||
98 | return 0; | 100 | return 0; |
99 | |||
100 | } | 101 | } |
102 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
101 | 103 | ||
102 | /** | 104 | /** |
103 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma | 105 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma |
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
167 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) | 169 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) |
168 | goto out; | 170 | goto out; |
169 | 171 | ||
170 | if (!(vma->vm_flags & VM_CAN_NONLINEAR)) | 172 | if (!vma->vm_ops->remap_pages) |
171 | goto out; | 173 | goto out; |
172 | 174 | ||
173 | if (start < vma->vm_start || start + size > vma->vm_end) | 175 | if (start < vma->vm_start || start + size > vma->vm_end) |
@@ -212,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
212 | mutex_lock(&mapping->i_mmap_mutex); | 214 | mutex_lock(&mapping->i_mmap_mutex); |
213 | flush_dcache_mmap_lock(mapping); | 215 | flush_dcache_mmap_lock(mapping); |
214 | vma->vm_flags |= VM_NONLINEAR; | 216 | vma->vm_flags |= VM_NONLINEAR; |
215 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 217 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
216 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 218 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
217 | flush_dcache_mmap_unlock(mapping); | 219 | flush_dcache_mmap_unlock(mapping); |
218 | mutex_unlock(&mapping->i_mmap_mutex); | 220 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
228 | } | 230 | } |
229 | 231 | ||
230 | mmu_notifier_invalidate_range_start(mm, start, start + size); | 232 | mmu_notifier_invalidate_range_start(mm, start, start + size); |
231 | err = populate_range(mm, vma, start, size, pgoff); | 233 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); |
232 | mmu_notifier_invalidate_range_end(mm, start, start + size); | 234 | mmu_notifier_invalidate_range_end(mm, start, start + size); |
233 | if (!err && !(flags & MAP_NONBLOCK)) { | 235 | if (!err && !(flags & MAP_NONBLOCK)) { |
234 | if (vma->vm_flags & VM_LOCKED) { | 236 | if (vma->vm_flags & VM_LOCKED) { |
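With the VM_CAN_NONLINEAR flag gone, sys_remap_file_pages() decides whether a mapping supports nonlinear remapping by looking for a ->remap_pages method in vm_operations_struct, and the old populate_range() body is exported as generic_file_remap_pages() for filesystems to plug in. A hedged sketch of a file mmap handler after the conversion; the example_* names are hypothetical, the pattern mirrors xip_file_mmap() above:

        #include <linux/fs.h>
        #include <linux/mm.h>

        /* Hypothetical filesystem wiring after the VM_CAN_NONLINEAR removal. */
        static const struct vm_operations_struct example_file_vm_ops = {
                .fault          = filemap_fault,
                .page_mkwrite   = filemap_page_mkwrite,
                .remap_pages    = generic_file_remap_pages,     /* was: VM_CAN_NONLINEAR */
        };

        static int example_file_mmap(struct file *file, struct vm_area_struct *vma)
        {
                file_accessed(file);
                vma->vm_ops = &example_file_vm_ops;
                /* no vm_flags bit to set any more */
                return 0;
        }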
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 141dbb695097..a863af26c79c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void) | |||
102 | unsigned long recommended_min; | 102 | unsigned long recommended_min; |
103 | extern int min_free_kbytes; | 103 | extern int min_free_kbytes; |
104 | 104 | ||
105 | if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, | 105 | if (!khugepaged_enabled()) |
106 | &transparent_hugepage_flags) && | ||
107 | !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
108 | &transparent_hugepage_flags)) | ||
109 | return 0; | 106 | return 0; |
110 | 107 | ||
111 | for_each_populated_zone(zone) | 108 | for_each_populated_zone(zone) |
@@ -139,12 +136,6 @@ static int start_khugepaged(void) | |||
139 | { | 136 | { |
140 | int err = 0; | 137 | int err = 0; |
141 | if (khugepaged_enabled()) { | 138 | if (khugepaged_enabled()) { |
142 | int wakeup; | ||
143 | if (unlikely(!mm_slot_cache || !mm_slots_hash)) { | ||
144 | err = -ENOMEM; | ||
145 | goto out; | ||
146 | } | ||
147 | mutex_lock(&khugepaged_mutex); | ||
148 | if (!khugepaged_thread) | 139 | if (!khugepaged_thread) |
149 | khugepaged_thread = kthread_run(khugepaged, NULL, | 140 | khugepaged_thread = kthread_run(khugepaged, NULL, |
150 | "khugepaged"); | 141 | "khugepaged"); |
@@ -154,16 +145,16 @@ static int start_khugepaged(void) | |||
154 | err = PTR_ERR(khugepaged_thread); | 145 | err = PTR_ERR(khugepaged_thread); |
155 | khugepaged_thread = NULL; | 146 | khugepaged_thread = NULL; |
156 | } | 147 | } |
157 | wakeup = !list_empty(&khugepaged_scan.mm_head); | 148 | |
158 | mutex_unlock(&khugepaged_mutex); | 149 | if (!list_empty(&khugepaged_scan.mm_head)) |
159 | if (wakeup) | ||
160 | wake_up_interruptible(&khugepaged_wait); | 150 | wake_up_interruptible(&khugepaged_wait); |
161 | 151 | ||
162 | set_recommended_min_free_kbytes(); | 152 | set_recommended_min_free_kbytes(); |
163 | } else | 153 | } else if (khugepaged_thread) { |
164 | /* wakeup to exit */ | 154 | kthread_stop(khugepaged_thread); |
165 | wake_up_interruptible(&khugepaged_wait); | 155 | khugepaged_thread = NULL; |
166 | out: | 156 | } |
157 | |||
167 | return err; | 158 | return err; |
168 | } | 159 | } |
169 | 160 | ||
@@ -224,18 +215,16 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
224 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 215 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
225 | 216 | ||
226 | if (ret > 0) { | 217 | if (ret > 0) { |
227 | int err = start_khugepaged(); | 218 | int err; |
219 | |||
220 | mutex_lock(&khugepaged_mutex); | ||
221 | err = start_khugepaged(); | ||
222 | mutex_unlock(&khugepaged_mutex); | ||
223 | |||
228 | if (err) | 224 | if (err) |
229 | ret = err; | 225 | ret = err; |
230 | } | 226 | } |
231 | 227 | ||
232 | if (ret > 0 && | ||
233 | (test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
234 | &transparent_hugepage_flags) || | ||
235 | test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
236 | &transparent_hugepage_flags))) | ||
237 | set_recommended_min_free_kbytes(); | ||
238 | |||
239 | return ret; | 228 | return ret; |
240 | } | 229 | } |
241 | static struct kobj_attribute enabled_attr = | 230 | static struct kobj_attribute enabled_attr = |
@@ -570,8 +559,6 @@ static int __init hugepage_init(void) | |||
570 | 559 | ||
571 | start_khugepaged(); | 560 | start_khugepaged(); |
572 | 561 | ||
573 | set_recommended_min_free_kbytes(); | ||
574 | |||
575 | return 0; | 562 | return 0; |
576 | out: | 563 | out: |
577 | hugepage_exit_sysfs(hugepage_kobj); | 564 | hugepage_exit_sysfs(hugepage_kobj); |
@@ -611,19 +598,6 @@ out: | |||
611 | } | 598 | } |
612 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 599 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
613 | 600 | ||
614 | static void prepare_pmd_huge_pte(pgtable_t pgtable, | ||
615 | struct mm_struct *mm) | ||
616 | { | ||
617 | assert_spin_locked(&mm->page_table_lock); | ||
618 | |||
619 | /* FIFO */ | ||
620 | if (!mm->pmd_huge_pte) | ||
621 | INIT_LIST_HEAD(&pgtable->lru); | ||
622 | else | ||
623 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
624 | mm->pmd_huge_pte = pgtable; | ||
625 | } | ||
626 | |||
627 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 601 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
628 | { | 602 | { |
629 | if (likely(vma->vm_flags & VM_WRITE)) | 603 | if (likely(vma->vm_flags & VM_WRITE)) |
@@ -665,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
665 | */ | 639 | */ |
666 | page_add_new_anon_rmap(page, vma, haddr); | 640 | page_add_new_anon_rmap(page, vma, haddr); |
667 | set_pmd_at(mm, haddr, pmd, entry); | 641 | set_pmd_at(mm, haddr, pmd, entry); |
668 | prepare_pmd_huge_pte(pgtable, mm); | 642 | pgtable_trans_huge_deposit(mm, pgtable); |
669 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 643 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
670 | mm->nr_ptes++; | 644 | mm->nr_ptes++; |
671 | spin_unlock(&mm->page_table_lock); | 645 | spin_unlock(&mm->page_table_lock); |
@@ -791,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
791 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 765 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
792 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 766 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
793 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 767 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
794 | prepare_pmd_huge_pte(pgtable, dst_mm); | 768 | pgtable_trans_huge_deposit(dst_mm, pgtable); |
795 | dst_mm->nr_ptes++; | 769 | dst_mm->nr_ptes++; |
796 | 770 | ||
797 | ret = 0; | 771 | ret = 0; |
@@ -802,25 +776,6 @@ out: | |||
802 | return ret; | 776 | return ret; |
803 | } | 777 | } |
804 | 778 | ||
805 | /* no "address" argument so destroys page coloring of some arch */ | ||
806 | pgtable_t get_pmd_huge_pte(struct mm_struct *mm) | ||
807 | { | ||
808 | pgtable_t pgtable; | ||
809 | |||
810 | assert_spin_locked(&mm->page_table_lock); | ||
811 | |||
812 | /* FIFO */ | ||
813 | pgtable = mm->pmd_huge_pte; | ||
814 | if (list_empty(&pgtable->lru)) | ||
815 | mm->pmd_huge_pte = NULL; | ||
816 | else { | ||
817 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
818 | struct page, lru); | ||
819 | list_del(&pgtable->lru); | ||
820 | } | ||
821 | return pgtable; | ||
822 | } | ||
823 | |||
824 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 779 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
825 | struct vm_area_struct *vma, | 780 | struct vm_area_struct *vma, |
826 | unsigned long address, | 781 | unsigned long address, |
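prepare_pmd_huge_pte() and get_pmd_huge_pte() are deleted above in favour of the pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() pair used by the remaining hunks in this file. The removed bodies document the contract: a preallocated PTE page is parked on mm->pmd_huge_pte under mm->page_table_lock and handed back FIFO when the huge PMD is later split or zapped. The following is only a reconstruction of that contract from the deleted helpers (the *_sketch names are placeholders); the generic implementations added elsewhere in this series may differ in detail, e.g. to allow architecture overrides:

        #include <linux/list.h>
        #include <linux/mm.h>

        /* Park a preallocated PTE page for a huge PMD (FIFO, as the removed code did). */
        static void deposit_sketch(struct mm_struct *mm, pgtable_t pgtable)
        {
                assert_spin_locked(&mm->page_table_lock);

                if (!mm->pmd_huge_pte)
                        INIT_LIST_HEAD(&pgtable->lru);
                else
                        list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
                mm->pmd_huge_pte = pgtable;
        }

        /* Take one back when the huge PMD is zapped or split. */
        static pgtable_t withdraw_sketch(struct mm_struct *mm)
        {
                pgtable_t pgtable = mm->pmd_huge_pte;

                assert_spin_locked(&mm->page_table_lock);

                if (list_empty(&pgtable->lru))
                        mm->pmd_huge_pte = NULL;
                else {
                        mm->pmd_huge_pte = list_entry(pgtable->lru.next,
                                                      struct page, lru);
                        list_del(&pgtable->lru);
                }
                return pgtable;
        }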
@@ -832,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
832 | pmd_t _pmd; | 787 | pmd_t _pmd; |
833 | int ret = 0, i; | 788 | int ret = 0, i; |
834 | struct page **pages; | 789 | struct page **pages; |
790 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
791 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
835 | 792 | ||
836 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | 793 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, |
837 | GFP_KERNEL); | 794 | GFP_KERNEL); |
@@ -868,15 +825,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
868 | cond_resched(); | 825 | cond_resched(); |
869 | } | 826 | } |
870 | 827 | ||
828 | mmun_start = haddr; | ||
829 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
830 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
831 | |||
871 | spin_lock(&mm->page_table_lock); | 832 | spin_lock(&mm->page_table_lock); |
872 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 833 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
873 | goto out_free_pages; | 834 | goto out_free_pages; |
874 | VM_BUG_ON(!PageHead(page)); | 835 | VM_BUG_ON(!PageHead(page)); |
875 | 836 | ||
876 | pmdp_clear_flush_notify(vma, haddr, pmd); | 837 | pmdp_clear_flush(vma, haddr, pmd); |
877 | /* leave pmd empty until pte is filled */ | 838 | /* leave pmd empty until pte is filled */ |
878 | 839 | ||
879 | pgtable = get_pmd_huge_pte(mm); | 840 | pgtable = pgtable_trans_huge_withdraw(mm); |
880 | pmd_populate(mm, &_pmd, pgtable); | 841 | pmd_populate(mm, &_pmd, pgtable); |
881 | 842 | ||
882 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 843 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
@@ -896,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
896 | page_remove_rmap(page); | 857 | page_remove_rmap(page); |
897 | spin_unlock(&mm->page_table_lock); | 858 | spin_unlock(&mm->page_table_lock); |
898 | 859 | ||
860 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
861 | |||
899 | ret |= VM_FAULT_WRITE; | 862 | ret |= VM_FAULT_WRITE; |
900 | put_page(page); | 863 | put_page(page); |
901 | 864 | ||
@@ -904,6 +867,7 @@ out: | |||
904 | 867 | ||
905 | out_free_pages: | 868 | out_free_pages: |
906 | spin_unlock(&mm->page_table_lock); | 869 | spin_unlock(&mm->page_table_lock); |
870 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
907 | mem_cgroup_uncharge_start(); | 871 | mem_cgroup_uncharge_start(); |
908 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 872 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
909 | mem_cgroup_uncharge_page(pages[i]); | 873 | mem_cgroup_uncharge_page(pages[i]); |
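The do_huge_pmd_wp_page_fallback() changes above show the mmu-notifier pattern applied across this series: the _notify variants of the page-table primitives go away (pmdp_clear_flush_notify becomes pmdp_clear_flush) and the caller instead brackets the whole operation, error paths included, with mmu_notifier_invalidate_range_start()/_end() over an explicit [mmun_start, mmun_end) range computed before any locks are taken. A compressed sketch of the shape; replace_huge_pmd_sketch() is a made-up name and the body is illustrative only:

        #include <asm/pgtable.h>
        #include <linux/huge_mm.h>
        #include <linux/mm.h>
        #include <linux/mmu_notifier.h>

        /* Sketch of the caller-side notifier bracketing used throughout this series. */
        static void replace_huge_pmd_sketch(struct mm_struct *mm,
                                            struct vm_area_struct *vma,
                                            pmd_t *pmd, unsigned long haddr)
        {
                const unsigned long mmun_start = haddr;                 /* For mmu_notifiers */
                const unsigned long mmun_end = haddr + HPAGE_PMD_SIZE;  /* For mmu_notifiers */

                mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
                spin_lock(&mm->page_table_lock);
                pmdp_clear_flush(vma, haddr, pmd);      /* notifier-free primitive */
                /* ... repopulate the range under the lock ... */
                spin_unlock(&mm->page_table_lock);
                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        }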
@@ -920,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
920 | int ret = 0; | 884 | int ret = 0; |
921 | struct page *page, *new_page; | 885 | struct page *page, *new_page; |
922 | unsigned long haddr; | 886 | unsigned long haddr; |
887 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
888 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
923 | 889 | ||
924 | VM_BUG_ON(!vma->anon_vma); | 890 | VM_BUG_ON(!vma->anon_vma); |
925 | spin_lock(&mm->page_table_lock); | 891 | spin_lock(&mm->page_table_lock); |
@@ -934,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
934 | entry = pmd_mkyoung(orig_pmd); | 900 | entry = pmd_mkyoung(orig_pmd); |
935 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 901 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
936 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | 902 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) |
937 | update_mmu_cache(vma, address, entry); | 903 | update_mmu_cache_pmd(vma, address, pmd); |
938 | ret |= VM_FAULT_WRITE; | 904 | ret |= VM_FAULT_WRITE; |
939 | goto out_unlock; | 905 | goto out_unlock; |
940 | } | 906 | } |
@@ -970,38 +936,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
970 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 936 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); |
971 | __SetPageUptodate(new_page); | 937 | __SetPageUptodate(new_page); |
972 | 938 | ||
939 | mmun_start = haddr; | ||
940 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
941 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
942 | |||
973 | spin_lock(&mm->page_table_lock); | 943 | spin_lock(&mm->page_table_lock); |
974 | put_page(page); | 944 | put_page(page); |
975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 945 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
976 | spin_unlock(&mm->page_table_lock); | 946 | spin_unlock(&mm->page_table_lock); |
977 | mem_cgroup_uncharge_page(new_page); | 947 | mem_cgroup_uncharge_page(new_page); |
978 | put_page(new_page); | 948 | put_page(new_page); |
979 | goto out; | 949 | goto out_mn; |
980 | } else { | 950 | } else { |
981 | pmd_t entry; | 951 | pmd_t entry; |
982 | VM_BUG_ON(!PageHead(page)); | 952 | VM_BUG_ON(!PageHead(page)); |
983 | entry = mk_pmd(new_page, vma->vm_page_prot); | 953 | entry = mk_pmd(new_page, vma->vm_page_prot); |
984 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 954 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
985 | entry = pmd_mkhuge(entry); | 955 | entry = pmd_mkhuge(entry); |
986 | pmdp_clear_flush_notify(vma, haddr, pmd); | 956 | pmdp_clear_flush(vma, haddr, pmd); |
987 | page_add_new_anon_rmap(new_page, vma, haddr); | 957 | page_add_new_anon_rmap(new_page, vma, haddr); |
988 | set_pmd_at(mm, haddr, pmd, entry); | 958 | set_pmd_at(mm, haddr, pmd, entry); |
989 | update_mmu_cache(vma, address, entry); | 959 | update_mmu_cache_pmd(vma, address, pmd); |
990 | page_remove_rmap(page); | 960 | page_remove_rmap(page); |
991 | put_page(page); | 961 | put_page(page); |
992 | ret |= VM_FAULT_WRITE; | 962 | ret |= VM_FAULT_WRITE; |
993 | } | 963 | } |
994 | out_unlock: | ||
995 | spin_unlock(&mm->page_table_lock); | 964 | spin_unlock(&mm->page_table_lock); |
965 | out_mn: | ||
966 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
996 | out: | 967 | out: |
997 | return ret; | 968 | return ret; |
969 | out_unlock: | ||
970 | spin_unlock(&mm->page_table_lock); | ||
971 | return ret; | ||
998 | } | 972 | } |
999 | 973 | ||
1000 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | 974 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
1001 | unsigned long addr, | 975 | unsigned long addr, |
1002 | pmd_t *pmd, | 976 | pmd_t *pmd, |
1003 | unsigned int flags) | 977 | unsigned int flags) |
1004 | { | 978 | { |
979 | struct mm_struct *mm = vma->vm_mm; | ||
1005 | struct page *page = NULL; | 980 | struct page *page = NULL; |
1006 | 981 | ||
1007 | assert_spin_locked(&mm->page_table_lock); | 982 | assert_spin_locked(&mm->page_table_lock); |
@@ -1024,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
1024 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 999 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); |
1025 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | 1000 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); |
1026 | } | 1001 | } |
1002 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1003 | if (page->mapping && trylock_page(page)) { | ||
1004 | lru_add_drain(); | ||
1005 | if (page->mapping) | ||
1006 | mlock_vma_page(page); | ||
1007 | unlock_page(page); | ||
1008 | } | ||
1009 | } | ||
1027 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1010 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1028 | VM_BUG_ON(!PageCompound(page)); | 1011 | VM_BUG_ON(!PageCompound(page)); |
1029 | if (flags & FOLL_GET) | 1012 | if (flags & FOLL_GET) |
@@ -1041,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1041 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1024 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1042 | struct page *page; | 1025 | struct page *page; |
1043 | pgtable_t pgtable; | 1026 | pgtable_t pgtable; |
1044 | pgtable = get_pmd_huge_pte(tlb->mm); | 1027 | pmd_t orig_pmd; |
1045 | page = pmd_page(*pmd); | 1028 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
1046 | pmd_clear(pmd); | 1029 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1030 | page = pmd_page(orig_pmd); | ||
1047 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1031 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1048 | page_remove_rmap(page); | 1032 | page_remove_rmap(page); |
1049 | VM_BUG_ON(page_mapcount(page) < 0); | 1033 | VM_BUG_ON(page_mapcount(page) < 0); |
@@ -1207,7 +1191,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1207 | struct mm_struct *mm = vma->vm_mm; | 1191 | struct mm_struct *mm = vma->vm_mm; |
1208 | pmd_t *pmd; | 1192 | pmd_t *pmd; |
1209 | int ret = 0; | 1193 | int ret = 0; |
1194 | /* For mmu_notifiers */ | ||
1195 | const unsigned long mmun_start = address; | ||
1196 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | ||
1210 | 1197 | ||
1198 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1211 | spin_lock(&mm->page_table_lock); | 1199 | spin_lock(&mm->page_table_lock); |
1212 | pmd = page_check_address_pmd(page, mm, address, | 1200 | pmd = page_check_address_pmd(page, mm, address, |
1213 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | 1201 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); |
@@ -1219,10 +1207,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1219 | * and it won't wait on the anon_vma->root->mutex to | 1207 | * and it won't wait on the anon_vma->root->mutex to |
1220 | * serialize against split_huge_page*. | 1208 | * serialize against split_huge_page*. |
1221 | */ | 1209 | */ |
1222 | pmdp_splitting_flush_notify(vma, address, pmd); | 1210 | pmdp_splitting_flush(vma, address, pmd); |
1223 | ret = 1; | 1211 | ret = 1; |
1224 | } | 1212 | } |
1225 | spin_unlock(&mm->page_table_lock); | 1213 | spin_unlock(&mm->page_table_lock); |
1214 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1226 | 1215 | ||
1227 | return ret; | 1216 | return ret; |
1228 | } | 1217 | } |
@@ -1358,11 +1347,11 @@ static int __split_huge_page_map(struct page *page, | |||
1358 | pmd = page_check_address_pmd(page, mm, address, | 1347 | pmd = page_check_address_pmd(page, mm, address, |
1359 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | 1348 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); |
1360 | if (pmd) { | 1349 | if (pmd) { |
1361 | pgtable = get_pmd_huge_pte(mm); | 1350 | pgtable = pgtable_trans_huge_withdraw(mm); |
1362 | pmd_populate(mm, &_pmd, pgtable); | 1351 | pmd_populate(mm, &_pmd, pgtable); |
1363 | 1352 | ||
1364 | for (i = 0, haddr = address; i < HPAGE_PMD_NR; | 1353 | haddr = address; |
1365 | i++, haddr += PAGE_SIZE) { | 1354 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1366 | pte_t *pte, entry; | 1355 | pte_t *pte, entry; |
1367 | BUG_ON(PageCompound(page+i)); | 1356 | BUG_ON(PageCompound(page+i)); |
1368 | entry = mk_pte(page + i, vma->vm_page_prot); | 1357 | entry = mk_pte(page + i, vma->vm_page_prot); |
@@ -1406,8 +1395,7 @@ static int __split_huge_page_map(struct page *page, | |||
1406 | * SMP TLB and finally we write the non-huge version | 1395 | * SMP TLB and finally we write the non-huge version |
1407 | * of the pmd entry with pmd_populate. | 1396 | * of the pmd entry with pmd_populate. |
1408 | */ | 1397 | */ |
1409 | set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); | 1398 | pmdp_invalidate(vma, address, pmd); |
1410 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
1411 | pmd_populate(mm, pmd, pgtable); | 1399 | pmd_populate(mm, pmd, pgtable); |
1412 | ret = 1; | 1400 | ret = 1; |
1413 | } | 1401 | } |
@@ -1421,18 +1409,17 @@ static void __split_huge_page(struct page *page, | |||
1421 | struct anon_vma *anon_vma) | 1409 | struct anon_vma *anon_vma) |
1422 | { | 1410 | { |
1423 | int mapcount, mapcount2; | 1411 | int mapcount, mapcount2; |
1412 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1424 | struct anon_vma_chain *avc; | 1413 | struct anon_vma_chain *avc; |
1425 | 1414 | ||
1426 | BUG_ON(!PageHead(page)); | 1415 | BUG_ON(!PageHead(page)); |
1427 | BUG_ON(PageTail(page)); | 1416 | BUG_ON(PageTail(page)); |
1428 | 1417 | ||
1429 | mapcount = 0; | 1418 | mapcount = 0; |
1430 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1419 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1431 | struct vm_area_struct *vma = avc->vma; | 1420 | struct vm_area_struct *vma = avc->vma; |
1432 | unsigned long addr = vma_address(page, vma); | 1421 | unsigned long addr = vma_address(page, vma); |
1433 | BUG_ON(is_vma_temporary_stack(vma)); | 1422 | BUG_ON(is_vma_temporary_stack(vma)); |
1434 | if (addr == -EFAULT) | ||
1435 | continue; | ||
1436 | mapcount += __split_huge_page_splitting(page, vma, addr); | 1423 | mapcount += __split_huge_page_splitting(page, vma, addr); |
1437 | } | 1424 | } |
1438 | /* | 1425 | /* |
@@ -1453,12 +1440,10 @@ static void __split_huge_page(struct page *page, | |||
1453 | __split_huge_page_refcount(page); | 1440 | __split_huge_page_refcount(page); |
1454 | 1441 | ||
1455 | mapcount2 = 0; | 1442 | mapcount2 = 0; |
1456 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1443 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1457 | struct vm_area_struct *vma = avc->vma; | 1444 | struct vm_area_struct *vma = avc->vma; |
1458 | unsigned long addr = vma_address(page, vma); | 1445 | unsigned long addr = vma_address(page, vma); |
1459 | BUG_ON(is_vma_temporary_stack(vma)); | 1446 | BUG_ON(is_vma_temporary_stack(vma)); |
1460 | if (addr == -EFAULT) | ||
1461 | continue; | ||
1462 | mapcount2 += __split_huge_page_map(page, vma, addr); | 1447 | mapcount2 += __split_huge_page_map(page, vma, addr); |
1463 | } | 1448 | } |
1464 | if (mapcount != mapcount2) | 1449 | if (mapcount != mapcount2) |
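The anon_vma side of rmap gets the same treatment as the i_mmap tree: the list walks over anon_vma->head become anon_vma_interval_tree_foreach() over anon_vma->rb_root, narrowed to the page's pgoff, and the old "addr == -EFAULT" escape hatch is dropped because every VMA the lookup returns is guaranteed to cover the page. A sketch of the new walk, assuming the helpers used by __split_huge_page() above; walk_anon_mappers() is a made-up name and vma_address() is the mm-internal helper that function already relies on:

        #include <linux/mm.h>
        #include <linux/pagemap.h>
        #include <linux/rmap.h>

        /* Sketch only: visit each anon VMA still mapping 'page'. */
        static void walk_anon_mappers(struct page *page, struct anon_vma *anon_vma)
        {
                pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
                struct anon_vma_chain *avc;

                anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                        struct vm_area_struct *vma = avc->vma;
                        unsigned long addr = vma_address(page, vma);
                        /* addr is valid by construction; no -EFAULT check needed */
                }
        }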
@@ -1491,12 +1476,13 @@ out: | |||
1491 | return ret; | 1476 | return ret; |
1492 | } | 1477 | } |
1493 | 1478 | ||
1494 | #define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ | 1479 | #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) |
1495 | VM_HUGETLB|VM_SHARED|VM_MAYSHARE) | ||
1496 | 1480 | ||
1497 | int hugepage_madvise(struct vm_area_struct *vma, | 1481 | int hugepage_madvise(struct vm_area_struct *vma, |
1498 | unsigned long *vm_flags, int advice) | 1482 | unsigned long *vm_flags, int advice) |
1499 | { | 1483 | { |
1484 | struct mm_struct *mm = vma->vm_mm; | ||
1485 | |||
1500 | switch (advice) { | 1486 | switch (advice) { |
1501 | case MADV_HUGEPAGE: | 1487 | case MADV_HUGEPAGE: |
1502 | /* | 1488 | /* |
@@ -1504,6 +1490,8 @@ int hugepage_madvise(struct vm_area_struct *vma, | |||
1504 | */ | 1490 | */ |
1505 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) | 1491 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) |
1506 | return -EINVAL; | 1492 | return -EINVAL; |
1493 | if (mm->def_flags & VM_NOHUGEPAGE) | ||
1494 | return -EINVAL; | ||
1507 | *vm_flags &= ~VM_NOHUGEPAGE; | 1495 | *vm_flags &= ~VM_NOHUGEPAGE; |
1508 | *vm_flags |= VM_HUGEPAGE; | 1496 | *vm_flags |= VM_HUGEPAGE; |
1509 | /* | 1497 | /* |
@@ -1655,11 +1643,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | |||
1655 | if (vma->vm_ops) | 1643 | if (vma->vm_ops) |
1656 | /* khugepaged not yet working on file or special mappings */ | 1644 | /* khugepaged not yet working on file or special mappings */ |
1657 | return 0; | 1645 | return 0; |
1658 | /* | 1646 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
1659 | * If is_pfn_mapping() is true is_learn_pfn_mapping() must be | ||
1660 | * true too, verify it here. | ||
1661 | */ | ||
1662 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1663 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 1647 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
1664 | hend = vma->vm_end & HPAGE_PMD_MASK; | 1648 | hend = vma->vm_end & HPAGE_PMD_MASK; |
1665 | if (hstart < hend) | 1649 | if (hstart < hend) |
@@ -1833,28 +1817,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
1833 | } | 1817 | } |
1834 | } | 1818 | } |
1835 | 1819 | ||
1836 | static void collapse_huge_page(struct mm_struct *mm, | 1820 | static void khugepaged_alloc_sleep(void) |
1837 | unsigned long address, | ||
1838 | struct page **hpage, | ||
1839 | struct vm_area_struct *vma, | ||
1840 | int node) | ||
1841 | { | 1821 | { |
1842 | pgd_t *pgd; | 1822 | wait_event_freezable_timeout(khugepaged_wait, false, |
1843 | pud_t *pud; | 1823 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
1844 | pmd_t *pmd, _pmd; | 1824 | } |
1845 | pte_t *pte; | ||
1846 | pgtable_t pgtable; | ||
1847 | struct page *new_page; | ||
1848 | spinlock_t *ptl; | ||
1849 | int isolated; | ||
1850 | unsigned long hstart, hend; | ||
1851 | 1825 | ||
1852 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 1826 | #ifdef CONFIG_NUMA |
1853 | #ifndef CONFIG_NUMA | 1827 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
1854 | up_read(&mm->mmap_sem); | 1828 | { |
1855 | VM_BUG_ON(!*hpage); | 1829 | if (IS_ERR(*hpage)) { |
1856 | new_page = *hpage; | 1830 | if (!*wait) |
1857 | #else | 1831 | return false; |
1832 | |||
1833 | *wait = false; | ||
1834 | *hpage = NULL; | ||
1835 | khugepaged_alloc_sleep(); | ||
1836 | } else if (*hpage) { | ||
1837 | put_page(*hpage); | ||
1838 | *hpage = NULL; | ||
1839 | } | ||
1840 | |||
1841 | return true; | ||
1842 | } | ||
1843 | |||
1844 | static struct page | ||
1845 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | ||
1846 | struct vm_area_struct *vma, unsigned long address, | ||
1847 | int node) | ||
1848 | { | ||
1858 | VM_BUG_ON(*hpage); | 1849 | VM_BUG_ON(*hpage); |
1859 | /* | 1850 | /* |
1860 | * Allocate the page while the vma is still valid and under | 1851 | * Allocate the page while the vma is still valid and under |
@@ -1866,7 +1857,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1866 | * mmap_sem in read mode is good idea also to allow greater | 1857 | * mmap_sem in read mode is good idea also to allow greater |
1867 | * scalability. | 1858 | * scalability. |
1868 | */ | 1859 | */ |
1869 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 1860 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, |
1870 | node, __GFP_OTHER_NODE); | 1861 | node, __GFP_OTHER_NODE); |
1871 | 1862 | ||
1872 | /* | 1863 | /* |
@@ -1874,20 +1865,85 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1874 | * preparation for taking it in write mode. | 1865 | * preparation for taking it in write mode. |
1875 | */ | 1866 | */ |
1876 | up_read(&mm->mmap_sem); | 1867 | up_read(&mm->mmap_sem); |
1877 | if (unlikely(!new_page)) { | 1868 | if (unlikely(!*hpage)) { |
1878 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 1869 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
1879 | *hpage = ERR_PTR(-ENOMEM); | 1870 | *hpage = ERR_PTR(-ENOMEM); |
1880 | return; | 1871 | return NULL; |
1881 | } | 1872 | } |
1882 | #endif | ||
1883 | 1873 | ||
1884 | count_vm_event(THP_COLLAPSE_ALLOC); | 1874 | count_vm_event(THP_COLLAPSE_ALLOC); |
1885 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1875 | return *hpage; |
1886 | #ifdef CONFIG_NUMA | 1876 | } |
1887 | put_page(new_page); | 1877 | #else |
1878 | static struct page *khugepaged_alloc_hugepage(bool *wait) | ||
1879 | { | ||
1880 | struct page *hpage; | ||
1881 | |||
1882 | do { | ||
1883 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
1884 | if (!hpage) { | ||
1885 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
1886 | if (!*wait) | ||
1887 | return NULL; | ||
1888 | |||
1889 | *wait = false; | ||
1890 | khugepaged_alloc_sleep(); | ||
1891 | } else | ||
1892 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
1893 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); | ||
1894 | |||
1895 | return hpage; | ||
1896 | } | ||
1897 | |||
1898 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | ||
1899 | { | ||
1900 | if (!*hpage) | ||
1901 | *hpage = khugepaged_alloc_hugepage(wait); | ||
1902 | |||
1903 | if (unlikely(!*hpage)) | ||
1904 | return false; | ||
1905 | |||
1906 | return true; | ||
1907 | } | ||
1908 | |||
1909 | static struct page | ||
1910 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | ||
1911 | struct vm_area_struct *vma, unsigned long address, | ||
1912 | int node) | ||
1913 | { | ||
1914 | up_read(&mm->mmap_sem); | ||
1915 | VM_BUG_ON(!*hpage); | ||
1916 | return *hpage; | ||
1917 | } | ||
1888 | #endif | 1918 | #endif |
1919 | |||
1920 | static void collapse_huge_page(struct mm_struct *mm, | ||
1921 | unsigned long address, | ||
1922 | struct page **hpage, | ||
1923 | struct vm_area_struct *vma, | ||
1924 | int node) | ||
1925 | { | ||
1926 | pgd_t *pgd; | ||
1927 | pud_t *pud; | ||
1928 | pmd_t *pmd, _pmd; | ||
1929 | pte_t *pte; | ||
1930 | pgtable_t pgtable; | ||
1931 | struct page *new_page; | ||
1932 | spinlock_t *ptl; | ||
1933 | int isolated; | ||
1934 | unsigned long hstart, hend; | ||
1935 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1936 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1937 | |||
1938 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1939 | |||
1940 | /* release the mmap_sem read lock. */ | ||
1941 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); | ||
1942 | if (!new_page) | ||
1943 | return; | ||
1944 | |||
1945 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) | ||
1889 | return; | 1946 | return; |
1890 | } | ||
1891 | 1947 | ||
1892 | /* | 1948 | /* |
1893 | * Prevent all access to pagetables with the exception of | 1949 | * Prevent all access to pagetables with the exception of |
@@ -1912,11 +1968,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1912 | goto out; | 1968 | goto out; |
1913 | if (is_vma_temporary_stack(vma)) | 1969 | if (is_vma_temporary_stack(vma)) |
1914 | goto out; | 1970 | goto out; |
1915 | /* | 1971 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
1916 | * If is_pfn_mapping() is true is_learn_pfn_mapping() must be | ||
1917 | * true too, verify it here. | ||
1918 | */ | ||
1919 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1920 | 1972 | ||
1921 | pgd = pgd_offset(mm, address); | 1973 | pgd = pgd_offset(mm, address); |
1922 | if (!pgd_present(*pgd)) | 1974 | if (!pgd_present(*pgd)) |
@@ -1936,6 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1936 | pte = pte_offset_map(pmd, address); | 1988 | pte = pte_offset_map(pmd, address); |
1937 | ptl = pte_lockptr(mm, pmd); | 1989 | ptl = pte_lockptr(mm, pmd); |
1938 | 1990 | ||
1991 | mmun_start = address; | ||
1992 | mmun_end = address + HPAGE_PMD_SIZE; | ||
1993 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1939 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | 1994 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ |
1940 | /* | 1995 | /* |
1941 | * After this gup_fast can't run anymore. This also removes | 1996 | * After this gup_fast can't run anymore. This also removes |
@@ -1943,8 +1998,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1943 | * huge and small TLB entries for the same virtual address | 1998 | * huge and small TLB entries for the same virtual address |
1944 | * to avoid the risk of CPU bugs in that area. | 1999 | * to avoid the risk of CPU bugs in that area. |
1945 | */ | 2000 | */ |
1946 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | 2001 | _pmd = pmdp_clear_flush(vma, address, pmd); |
1947 | spin_unlock(&mm->page_table_lock); | 2002 | spin_unlock(&mm->page_table_lock); |
2003 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1948 | 2004 | ||
1949 | spin_lock(ptl); | 2005 | spin_lock(ptl); |
1950 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 2006 | isolated = __collapse_huge_page_isolate(vma, address, pte); |
@@ -1970,8 +2026,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1970 | pte_unmap(pte); | 2026 | pte_unmap(pte); |
1971 | __SetPageUptodate(new_page); | 2027 | __SetPageUptodate(new_page); |
1972 | pgtable = pmd_pgtable(_pmd); | 2028 | pgtable = pmd_pgtable(_pmd); |
1973 | VM_BUG_ON(page_count(pgtable) != 1); | ||
1974 | VM_BUG_ON(page_mapcount(pgtable) != 0); | ||
1975 | 2029 | ||
1976 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | 2030 | _pmd = mk_pmd(new_page, vma->vm_page_prot); |
1977 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | 2031 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); |
@@ -1988,13 +2042,12 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1988 | BUG_ON(!pmd_none(*pmd)); | 2042 | BUG_ON(!pmd_none(*pmd)); |
1989 | page_add_new_anon_rmap(new_page, vma, address); | 2043 | page_add_new_anon_rmap(new_page, vma, address); |
1990 | set_pmd_at(mm, address, pmd, _pmd); | 2044 | set_pmd_at(mm, address, pmd, _pmd); |
1991 | update_mmu_cache(vma, address, _pmd); | 2045 | update_mmu_cache_pmd(vma, address, pmd); |
1992 | prepare_pmd_huge_pte(pgtable, mm); | 2046 | pgtable_trans_huge_deposit(mm, pgtable); |
1993 | spin_unlock(&mm->page_table_lock); | 2047 | spin_unlock(&mm->page_table_lock); |
1994 | 2048 | ||
1995 | #ifndef CONFIG_NUMA | ||
1996 | *hpage = NULL; | 2049 | *hpage = NULL; |
1997 | #endif | 2050 | |
1998 | khugepaged_pages_collapsed++; | 2051 | khugepaged_pages_collapsed++; |
1999 | out_up_write: | 2052 | out_up_write: |
2000 | up_write(&mm->mmap_sem); | 2053 | up_write(&mm->mmap_sem); |
@@ -2002,9 +2055,6 @@ out_up_write: | |||
2002 | 2055 | ||
2003 | out: | 2056 | out: |
2004 | mem_cgroup_uncharge_page(new_page); | 2057 | mem_cgroup_uncharge_page(new_page); |
2005 | #ifdef CONFIG_NUMA | ||
2006 | put_page(new_page); | ||
2007 | #endif | ||
2008 | goto out_up_write; | 2058 | goto out_up_write; |
2009 | } | 2059 | } |
2010 | 2060 | ||
@@ -2154,12 +2204,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2154 | goto skip; | 2204 | goto skip; |
2155 | if (is_vma_temporary_stack(vma)) | 2205 | if (is_vma_temporary_stack(vma)) |
2156 | goto skip; | 2206 | goto skip; |
2157 | /* | 2207 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
2158 | * If is_pfn_mapping() is true is_learn_pfn_mapping() | ||
2159 | * must be true too, verify it here. | ||
2160 | */ | ||
2161 | VM_BUG_ON(is_linear_pfn_mapping(vma) || | ||
2162 | vma->vm_flags & VM_NO_THP); | ||
2163 | 2208 | ||
2164 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2209 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2165 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2210 | hend = vma->vm_end & HPAGE_PMD_MASK; |
@@ -2234,32 +2279,23 @@ static int khugepaged_has_work(void) | |||
2234 | static int khugepaged_wait_event(void) | 2279 | static int khugepaged_wait_event(void) |
2235 | { | 2280 | { |
2236 | return !list_empty(&khugepaged_scan.mm_head) || | 2281 | return !list_empty(&khugepaged_scan.mm_head) || |
2237 | !khugepaged_enabled(); | 2282 | kthread_should_stop(); |
2238 | } | 2283 | } |
2239 | 2284 | ||
2240 | static void khugepaged_do_scan(struct page **hpage) | 2285 | static void khugepaged_do_scan(void) |
2241 | { | 2286 | { |
2287 | struct page *hpage = NULL; | ||
2242 | unsigned int progress = 0, pass_through_head = 0; | 2288 | unsigned int progress = 0, pass_through_head = 0; |
2243 | unsigned int pages = khugepaged_pages_to_scan; | 2289 | unsigned int pages = khugepaged_pages_to_scan; |
2290 | bool wait = true; | ||
2244 | 2291 | ||
2245 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | 2292 | barrier(); /* write khugepaged_pages_to_scan to local stack */ |
2246 | 2293 | ||
2247 | while (progress < pages) { | 2294 | while (progress < pages) { |
2248 | cond_resched(); | 2295 | if (!khugepaged_prealloc_page(&hpage, &wait)) |
2249 | |||
2250 | #ifndef CONFIG_NUMA | ||
2251 | if (!*hpage) { | ||
2252 | *hpage = alloc_hugepage(khugepaged_defrag()); | ||
2253 | if (unlikely(!*hpage)) { | ||
2254 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2255 | break; | ||
2256 | } | ||
2257 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2258 | } | ||
2259 | #else | ||
2260 | if (IS_ERR(*hpage)) | ||
2261 | break; | 2296 | break; |
2262 | #endif | 2297 | |
2298 | cond_resched(); | ||
2263 | 2299 | ||
2264 | if (unlikely(kthread_should_stop() || freezing(current))) | 2300 | if (unlikely(kthread_should_stop() || freezing(current))) |
2265 | break; | 2301 | break; |
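The CONFIG_NUMA split that used to be open-coded inside this loop now hides behind khugepaged_prealloc_page(): without NUMA it really preallocates (and keeps reusing) one huge page per pass, with NUMA it only clears a stale ERR_PTR and rate-limits retries, leaving the actual allocation to khugepaged_alloc_page() once a target node is known. The per-pass protocol, condensed from this hunk and the end of khugepaged_do_scan() just below; scan_pass_sketch() is a placeholder name and the bookkeeping is simplified:

        #include <linux/err.h>
        #include <linux/freezer.h>
        #include <linux/kthread.h>
        #include <linux/mm.h>
        #include <linux/sched.h>

        /* Sketch: one khugepaged scan pass with the new preallocation protocol. */
        static void scan_pass_sketch(unsigned int pages)
        {
                struct page *hpage = NULL;
                bool wait = true;
                unsigned int progress = 0;

                while (progress < pages) {
                        if (!khugepaged_prealloc_page(&hpage, &wait))
                                break;          /* allocation keeps failing: give up this pass */
                        cond_resched();
                        if (unlikely(kthread_should_stop() || freezing(current)))
                                break;
                        /* ... khugepaged_scan_mm_slot(pages - progress, &hpage) ... */
                        progress = pages;       /* placeholder for the real bookkeeping */
                }
                if (!IS_ERR_OR_NULL(hpage))     /* drop an unused preallocation */
                        put_page(hpage);
        }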
@@ -2270,73 +2306,32 @@ static void khugepaged_do_scan(struct page **hpage) | |||
2270 | if (khugepaged_has_work() && | 2306 | if (khugepaged_has_work() && |
2271 | pass_through_head < 2) | 2307 | pass_through_head < 2) |
2272 | progress += khugepaged_scan_mm_slot(pages - progress, | 2308 | progress += khugepaged_scan_mm_slot(pages - progress, |
2273 | hpage); | 2309 | &hpage); |
2274 | else | 2310 | else |
2275 | progress = pages; | 2311 | progress = pages; |
2276 | spin_unlock(&khugepaged_mm_lock); | 2312 | spin_unlock(&khugepaged_mm_lock); |
2277 | } | 2313 | } |
2278 | } | ||
2279 | 2314 | ||
2280 | static void khugepaged_alloc_sleep(void) | 2315 | if (!IS_ERR_OR_NULL(hpage)) |
2281 | { | 2316 | put_page(hpage); |
2282 | wait_event_freezable_timeout(khugepaged_wait, false, | ||
2283 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | ||
2284 | } | 2317 | } |
2285 | 2318 | ||
2286 | #ifndef CONFIG_NUMA | 2319 | static void khugepaged_wait_work(void) |
2287 | static struct page *khugepaged_alloc_hugepage(void) | ||
2288 | { | 2320 | { |
2289 | struct page *hpage; | 2321 | try_to_freeze(); |
2290 | |||
2291 | do { | ||
2292 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
2293 | if (!hpage) { | ||
2294 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2295 | khugepaged_alloc_sleep(); | ||
2296 | } else | ||
2297 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2298 | } while (unlikely(!hpage) && | ||
2299 | likely(khugepaged_enabled())); | ||
2300 | return hpage; | ||
2301 | } | ||
2302 | #endif | ||
2303 | 2322 | ||
2304 | static void khugepaged_loop(void) | 2323 | if (khugepaged_has_work()) { |
2305 | { | 2324 | if (!khugepaged_scan_sleep_millisecs) |
2306 | struct page *hpage; | 2325 | return; |
2307 | 2326 | ||
2308 | #ifdef CONFIG_NUMA | 2327 | wait_event_freezable_timeout(khugepaged_wait, |
2309 | hpage = NULL; | 2328 | kthread_should_stop(), |
2310 | #endif | 2329 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); |
2311 | while (likely(khugepaged_enabled())) { | 2330 | return; |
2312 | #ifndef CONFIG_NUMA | ||
2313 | hpage = khugepaged_alloc_hugepage(); | ||
2314 | if (unlikely(!hpage)) | ||
2315 | break; | ||
2316 | #else | ||
2317 | if (IS_ERR(hpage)) { | ||
2318 | khugepaged_alloc_sleep(); | ||
2319 | hpage = NULL; | ||
2320 | } | ||
2321 | #endif | ||
2322 | |||
2323 | khugepaged_do_scan(&hpage); | ||
2324 | #ifndef CONFIG_NUMA | ||
2325 | if (hpage) | ||
2326 | put_page(hpage); | ||
2327 | #endif | ||
2328 | try_to_freeze(); | ||
2329 | if (unlikely(kthread_should_stop())) | ||
2330 | break; | ||
2331 | if (khugepaged_has_work()) { | ||
2332 | if (!khugepaged_scan_sleep_millisecs) | ||
2333 | continue; | ||
2334 | wait_event_freezable_timeout(khugepaged_wait, false, | ||
2335 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); | ||
2336 | } else if (khugepaged_enabled()) | ||
2337 | wait_event_freezable(khugepaged_wait, | ||
2338 | khugepaged_wait_event()); | ||
2339 | } | 2331 | } |
2332 | |||
2333 | if (khugepaged_enabled()) | ||
2334 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); | ||
2340 | } | 2335 | } |
2341 | 2336 | ||
2342 | static int khugepaged(void *none) | 2337 | static int khugepaged(void *none) |
@@ -2346,20 +2341,9 @@ static int khugepaged(void *none) | |||
2346 | set_freezable(); | 2341 | set_freezable(); |
2347 | set_user_nice(current, 19); | 2342 | set_user_nice(current, 19); |
2348 | 2343 | ||
2349 | /* serialize with start_khugepaged() */ | 2344 | while (!kthread_should_stop()) { |
2350 | mutex_lock(&khugepaged_mutex); | 2345 | khugepaged_do_scan(); |
2351 | 2346 | khugepaged_wait_work(); | |
2352 | for (;;) { | ||
2353 | mutex_unlock(&khugepaged_mutex); | ||
2354 | VM_BUG_ON(khugepaged_thread != current); | ||
2355 | khugepaged_loop(); | ||
2356 | VM_BUG_ON(khugepaged_thread != current); | ||
2357 | |||
2358 | mutex_lock(&khugepaged_mutex); | ||
2359 | if (!khugepaged_enabled()) | ||
2360 | break; | ||
2361 | if (unlikely(kthread_should_stop())) | ||
2362 | break; | ||
2363 | } | 2347 | } |
2364 | 2348 | ||
2365 | spin_lock(&khugepaged_mm_lock); | 2349 | spin_lock(&khugepaged_mm_lock); |
@@ -2368,10 +2352,6 @@ static int khugepaged(void *none) | |||
2368 | if (mm_slot) | 2352 | if (mm_slot) |
2369 | collect_mm_slot(mm_slot); | 2353 | collect_mm_slot(mm_slot); |
2370 | spin_unlock(&khugepaged_mm_lock); | 2354 | spin_unlock(&khugepaged_mm_lock); |
2371 | |||
2372 | khugepaged_thread = NULL; | ||
2373 | mutex_unlock(&khugepaged_mutex); | ||
2374 | |||
2375 | return 0; | 2355 | return 0; |
2376 | } | 2356 | } |
2377 | 2357 | ||
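Taken together, the huge_memory.c hunks simplify the daemon's lifecycle: enabled_store() now calls start_khugepaged() under khugepaged_mutex, and disabling the feature stops the thread with kthread_stop() instead of the old "wakeup to exit" handshake, so the thread body shrinks to a plain scan/wait loop. A sketch of that body, condensed from the khugepaged() hunk above (the real function also collects its mm_slot under khugepaged_mm_lock before returning; khugepaged_do_scan() and khugepaged_wait_work() are the static helpers defined earlier in this file):

        #include <linux/freezer.h>
        #include <linux/kthread.h>
        #include <linux/sched.h>

        /* Sketch of the simplified daemon body; cleanup details omitted. */
        static int khugepaged_body_sketch(void *none)
        {
                set_freezable();
                set_user_nice(current, 19);

                while (!kthread_should_stop()) {
                        khugepaged_do_scan();   /* one batch of scanning/collapsing */
                        khugepaged_wait_work(); /* sleep until more work or kthread_stop() */
                }
                return 0;
        }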
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc727122dd44..59a0059b39e2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | 31 | #include <linux/hugetlb_cgroup.h> |
32 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
34 | #include "internal.h" | 33 | #include "internal.h" |
35 | 34 | ||
36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 35 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page) | |||
637 | h->surplus_huge_pages--; | 636 | h->surplus_huge_pages--; |
638 | h->surplus_huge_pages_node[nid]--; | 637 | h->surplus_huge_pages_node[nid]--; |
639 | } else { | 638 | } else { |
639 | arch_clear_hugepage_flags(page); | ||
640 | enqueue_huge_page(h, page); | 640 | enqueue_huge_page(h, page); |
641 | } | 641 | } |
642 | spin_unlock(&hugetlb_lock); | 642 | spin_unlock(&hugetlb_lock); |
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
671 | } | 671 | } |
672 | } | 672 | } |
673 | 673 | ||
674 | /* | ||
675 | * PageHuge() only returns true for hugetlbfs pages, but not for normal or | ||
676 | * transparent huge pages. See the PageTransHuge() documentation for more | ||
677 | * details. | ||
678 | */ | ||
674 | int PageHuge(struct page *page) | 679 | int PageHuge(struct page *page) |
675 | { | 680 | { |
676 | compound_page_dtor *dtor; | 681 | compound_page_dtor *dtor; |
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
2355 | struct page *page; | 2360 | struct page *page; |
2356 | struct hstate *h = hstate_vma(vma); | 2361 | struct hstate *h = hstate_vma(vma); |
2357 | unsigned long sz = huge_page_size(h); | 2362 | unsigned long sz = huge_page_size(h); |
2363 | const unsigned long mmun_start = start; /* For mmu_notifiers */ | ||
2364 | const unsigned long mmun_end = end; /* For mmu_notifiers */ | ||
2358 | 2365 | ||
2359 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2366 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2360 | BUG_ON(start & ~huge_page_mask(h)); | 2367 | BUG_ON(start & ~huge_page_mask(h)); |
2361 | BUG_ON(end & ~huge_page_mask(h)); | 2368 | BUG_ON(end & ~huge_page_mask(h)); |
2362 | 2369 | ||
2363 | tlb_start_vma(tlb, vma); | 2370 | tlb_start_vma(tlb, vma); |
2364 | mmu_notifier_invalidate_range_start(mm, start, end); | 2371 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2365 | again: | 2372 | again: |
2366 | spin_lock(&mm->page_table_lock); | 2373 | spin_lock(&mm->page_table_lock); |
2367 | for (address = start; address < end; address += sz) { | 2374 | for (address = start; address < end; address += sz) { |
@@ -2425,7 +2432,7 @@ again: | |||
2425 | if (address < end && !ref_page) | 2432 | if (address < end && !ref_page) |
2426 | goto again; | 2433 | goto again; |
2427 | } | 2434 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | 2435 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2429 | tlb_end_vma(tlb, vma); | 2436 | tlb_end_vma(tlb, vma); |
2430 | } | 2437 | } |
2431 | 2438 | ||
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2473 | struct hstate *h = hstate_vma(vma); | 2480 | struct hstate *h = hstate_vma(vma); |
2474 | struct vm_area_struct *iter_vma; | 2481 | struct vm_area_struct *iter_vma; |
2475 | struct address_space *mapping; | 2482 | struct address_space *mapping; |
2476 | struct prio_tree_iter iter; | ||
2477 | pgoff_t pgoff; | 2483 | pgoff_t pgoff; |
2478 | 2484 | ||
2479 | /* | 2485 | /* |
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2481 | * from page cache lookup which is in HPAGE_SIZE units. | 2487 | * from page cache lookup which is in HPAGE_SIZE units. |
2482 | */ | 2488 | */ |
2483 | address = address & huge_page_mask(h); | 2489 | address = address & huge_page_mask(h); |
2484 | pgoff = vma_hugecache_offset(h, vma, address); | 2490 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + |
2491 | vma->vm_pgoff; | ||
2485 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; | 2492 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; |
2486 | 2493 | ||
2487 | /* | 2494 | /* |
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2490 | * __unmap_hugepage_range() is called as the lock is already held | 2497 | * __unmap_hugepage_range() is called as the lock is already held |
2491 | */ | 2498 | */ |
2492 | mutex_lock(&mapping->i_mmap_mutex); | 2499 | mutex_lock(&mapping->i_mmap_mutex); |
2493 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2500 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { |
2494 | /* Do not unmap the current VMA */ | 2501 | /* Do not unmap the current VMA */ |
2495 | if (iter_vma == vma) | 2502 | if (iter_vma == vma) |
2496 | continue; | 2503 | continue; |
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2525 | struct page *old_page, *new_page; | 2532 | struct page *old_page, *new_page; |
2526 | int avoidcopy; | 2533 | int avoidcopy; |
2527 | int outside_reserve = 0; | 2534 | int outside_reserve = 0; |
2535 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2536 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2528 | 2537 | ||
2529 | old_page = pte_page(pte); | 2538 | old_page = pte_page(pte); |
2530 | 2539 | ||
@@ -2611,6 +2620,9 @@ retry_avoidcopy: | |||
2611 | pages_per_huge_page(h)); | 2620 | pages_per_huge_page(h)); |
2612 | __SetPageUptodate(new_page); | 2621 | __SetPageUptodate(new_page); |
2613 | 2622 | ||
2623 | mmun_start = address & huge_page_mask(h); | ||
2624 | mmun_end = mmun_start + huge_page_size(h); | ||
2625 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2614 | /* | 2626 | /* |
2615 | * Retake the page_table_lock to check for racing updates | 2627 | * Retake the page_table_lock to check for racing updates |
2616 | * before the page tables are altered | 2628 | * before the page tables are altered |
@@ -2619,9 +2631,6 @@ retry_avoidcopy: | |||
2619 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2631 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2620 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2632 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
2621 | /* Break COW */ | 2633 | /* Break COW */ |
2622 | mmu_notifier_invalidate_range_start(mm, | ||
2623 | address & huge_page_mask(h), | ||
2624 | (address & huge_page_mask(h)) + huge_page_size(h)); | ||
2625 | huge_ptep_clear_flush(vma, address, ptep); | 2634 | huge_ptep_clear_flush(vma, address, ptep); |
2626 | set_huge_pte_at(mm, address, ptep, | 2635 | set_huge_pte_at(mm, address, ptep, |
2627 | make_huge_pte(vma, new_page, 1)); | 2636 | make_huge_pte(vma, new_page, 1)); |
@@ -2629,10 +2638,11 @@ retry_avoidcopy: | |||
2629 | hugepage_add_new_anon_rmap(new_page, vma, address); | 2638 | hugepage_add_new_anon_rmap(new_page, vma, address); |
2630 | /* Make the old page be freed below */ | 2639 | /* Make the old page be freed below */ |
2631 | new_page = old_page; | 2640 | new_page = old_page; |
2632 | mmu_notifier_invalidate_range_end(mm, | ||
2633 | address & huge_page_mask(h), | ||
2634 | (address & huge_page_mask(h)) + huge_page_size(h)); | ||
2635 | } | 2641 | } |
2642 | spin_unlock(&mm->page_table_lock); | ||
2643 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2644 | /* Caller expects lock to be held */ | ||
2645 | spin_lock(&mm->page_table_lock); | ||
2636 | page_cache_release(new_page); | 2646 | page_cache_release(new_page); |
2637 | page_cache_release(old_page); | 2647 | page_cache_release(old_page); |
2638 | return 0; | 2648 | return 0; |
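One subtle piece of the hugetlb conversion: the i_mmap interval tree is keyed on vma->vm_pgoff, which is always stored in PAGE_SIZE units, so unmap_ref_private() now computes its lookup offset in base pages rather than via vma_hugecache_offset(), which counts in huge pages. A worked example of the new arithmetic, assuming 4 KB base pages and 2 MB huge pages (the addresses are made up):

        /*
         * vma->vm_start = 0x60000000, vma->vm_pgoff = 0, faulting
         * address = 0x60400000 (start of the third huge page).
         */
        address = address & huge_page_mask(h);  /* 0x60400000, already aligned */
        pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
                                                /* = 0x400000 >> 12 = 1024 */
        /* vma_hugecache_offset() would have returned 2 (huge-page units). */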
diff --git a/mm/internal.h b/mm/internal.h index b8c91b342e24..a4fa284f6bc2 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -118,26 +118,27 @@ struct compact_control { | |||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
123 | bool sync; /* Synchronous migration */ | 122 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | 123 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
125 | incremental, once free_pfn | 124 | bool finished_update_free; /* True when the zone cached pfns are |
126 | and migrate_pfn meet, we restart | 125 | * no longer being updated |
127 | from the top of the zone; | 126 | */ |
128 | remember we wrapped around. */ | 127 | bool finished_update_migrate; |
129 | 128 | ||
130 | int order; /* order a direct compactor needs */ | 129 | int order; /* order a direct compactor needs */ |
131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 130 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
132 | struct zone *zone; | 131 | struct zone *zone; |
133 | bool *contended; /* True if a lock was contended */ | 132 | bool contended; /* True if a lock was contended */ |
133 | struct page **page; /* Page captured of requested size */ | ||
134 | }; | 134 | }; |
135 | 135 | ||
136 | unsigned long | 136 | unsigned long |
137 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); | 137 | isolate_freepages_range(struct compact_control *cc, |
138 | unsigned long start_pfn, unsigned long end_pfn); | ||
138 | unsigned long | 139 | unsigned long |
139 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 140 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
140 | unsigned long low_pfn, unsigned long end_pfn); | 141 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable); |
141 | 142 | ||
142 | #endif | 143 | #endif |
143 | 144 | ||
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
167 | } | 168 | } |
168 | 169 | ||
169 | /* | 170 | /* |
170 | * Called only in fault path via page_evictable() for a new page | 171 | * Called only in fault path, to determine if a new page is being |
171 | * to determine if it's being mapped into a LOCKED vma. | 172 | * mapped into a LOCKED vma. If it is, mark page as mlocked. |
172 | * If so, mark page as mlocked. | ||
173 | */ | 173 | */ |
174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | 174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
175 | struct page *page) | 175 | struct page *page) |
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | |||
180 | return 0; | 180 | return 0; |
181 | 181 | ||
182 | if (!TestSetPageMlocked(page)) { | 182 | if (!TestSetPageMlocked(page)) { |
183 | inc_zone_page_state(page, NR_MLOCK); | 183 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
184 | hpage_nr_pages(page)); | ||
184 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 185 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
185 | } | 186 | } |
186 | return 1; | 187 | return 1; |
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page); | |||
201 | * If called for a page that is still mapped by mlocked vmas, all we do | 202 | * If called for a page that is still mapped by mlocked vmas, all we do |
202 | * is revert to lazy LRU behaviour -- semantics are not broken. | 203 | * is revert to lazy LRU behaviour -- semantics are not broken. |
203 | */ | 204 | */ |
204 | extern void __clear_page_mlock(struct page *page); | 205 | extern void clear_page_mlock(struct page *page); |
205 | static inline void clear_page_mlock(struct page *page) | ||
206 | { | ||
207 | if (unlikely(TestClearPageMlocked(page))) | ||
208 | __clear_page_mlock(page); | ||
209 | } | ||
210 | 206 | ||
211 | /* | 207 | /* |
212 | * mlock_migrate_page - called only from migrate_page_copy() to | 208 | * mlock_migrate_page - called only from migrate_page_copy() to |
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
340 | #define ZONE_RECLAIM_FULL -1 | 336 | #define ZONE_RECLAIM_FULL -1 |
341 | #define ZONE_RECLAIM_SOME 0 | 337 | #define ZONE_RECLAIM_SOME 0 |
342 | #define ZONE_RECLAIM_SUCCESS 1 | 338 | #define ZONE_RECLAIM_SUCCESS 1 |
343 | #endif | ||
344 | 339 | ||
345 | extern int hwpoison_filter(struct page *p); | 340 | extern int hwpoison_filter(struct page *p); |
346 | 341 | ||
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | |||
356 | unsigned long, unsigned long); | 351 | unsigned long, unsigned long); |
357 | 352 | ||
358 | extern void set_pageblock_order(void); | 353 | extern void set_pageblock_order(void); |
354 | unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
355 | struct list_head *page_list); | ||
356 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | ||
357 | #define ALLOC_WMARK_MIN WMARK_MIN | ||
358 | #define ALLOC_WMARK_LOW WMARK_LOW | ||
359 | #define ALLOC_WMARK_HIGH WMARK_HIGH | ||
360 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
361 | |||
362 | /* Mask to get the watermark bits */ | ||
363 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
364 | |||
365 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
366 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
367 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
368 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ | ||
369 | |||
370 | #endif /* __MM_INTERNAL_H */ | ||
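The ALLOC_* definitions moved into mm/internal.h above pack a watermark index into the low two bits of the allocation flags and keep the remaining bits as independent booleans. A minimal userspace sketch of that packing follows; the WMARK_MIN/LOW/HIGH values of 0/1/2 are assumed to mirror the kernel's watermark enum, and everything else is local to this example.

/* Standalone sketch of the ALLOC_* flag packing added to mm/internal.h.
 * WMARK_* values are assumed to be 0/1/2 as in the kernel's enum; all
 * other names are local to this example. */
#include <stdio.h>

#define WMARK_MIN  0
#define WMARK_LOW  1
#define WMARK_HIGH 2

#define ALLOC_WMARK_MIN     WMARK_MIN
#define ALLOC_WMARK_LOW     WMARK_LOW
#define ALLOC_WMARK_HIGH    WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04    /* don't check watermarks at all */
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER        0x10
#define ALLOC_HIGH          0x20
#define ALLOC_CPUSET        0x40
#define ALLOC_CMA           0x80

int main(void)
{
	int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET | ALLOC_CMA;

	/* The low bits index zone->watermark[]; the high bits are booleans. */
	printf("watermark index: %d\n", alloc_flags & ALLOC_WMARK_MASK);
	printf("skip watermarks: %d\n", !!(alloc_flags & ALLOC_NO_WATERMARKS));
	printf("cpuset check:    %d\n", !!(alloc_flags & ALLOC_CPUSET));
	return 0;
}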
diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000000..4a5822a586e6 --- /dev/null +++ b/mm/interval_tree.c | |||
@@ -0,0 +1,112 @@ | |||
1 | /* | ||
2 | * mm/interval_tree.c - interval tree for mapping->i_mmap | ||
3 | * | ||
4 | * Copyright (C) 2012, Michel Lespinasse <walken@google.com> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/fs.h> | ||
11 | #include <linux/rmap.h> | ||
12 | #include <linux/interval_tree_generic.h> | ||
13 | |||
14 | static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) | ||
15 | { | ||
16 | return v->vm_pgoff; | ||
17 | } | ||
18 | |||
19 | static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) | ||
20 | { | ||
21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; | ||
22 | } | ||
23 | |||
24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, | ||
25 | unsigned long, shared.linear.rb_subtree_last, | ||
26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) | ||
27 | |||
28 | /* Insert node immediately after prev in the interval tree */ | ||
29 | void vma_interval_tree_insert_after(struct vm_area_struct *node, | ||
30 | struct vm_area_struct *prev, | ||
31 | struct rb_root *root) | ||
32 | { | ||
33 | struct rb_node **link; | ||
34 | struct vm_area_struct *parent; | ||
35 | unsigned long last = vma_last_pgoff(node); | ||
36 | |||
37 | VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); | ||
38 | |||
39 | if (!prev->shared.linear.rb.rb_right) { | ||
40 | parent = prev; | ||
41 | link = &prev->shared.linear.rb.rb_right; | ||
42 | } else { | ||
43 | parent = rb_entry(prev->shared.linear.rb.rb_right, | ||
44 | struct vm_area_struct, shared.linear.rb); | ||
45 | if (parent->shared.linear.rb_subtree_last < last) | ||
46 | parent->shared.linear.rb_subtree_last = last; | ||
47 | while (parent->shared.linear.rb.rb_left) { | ||
48 | parent = rb_entry(parent->shared.linear.rb.rb_left, | ||
49 | struct vm_area_struct, shared.linear.rb); | ||
50 | if (parent->shared.linear.rb_subtree_last < last) | ||
51 | parent->shared.linear.rb_subtree_last = last; | ||
52 | } | ||
53 | link = &parent->shared.linear.rb.rb_left; | ||
54 | } | ||
55 | |||
56 | node->shared.linear.rb_subtree_last = last; | ||
57 | rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); | ||
58 | rb_insert_augmented(&node->shared.linear.rb, root, | ||
59 | &vma_interval_tree_augment); | ||
60 | } | ||
61 | |||
62 | static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) | ||
63 | { | ||
64 | return vma_start_pgoff(avc->vma); | ||
65 | } | ||
66 | |||
67 | static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) | ||
68 | { | ||
69 | return vma_last_pgoff(avc->vma); | ||
70 | } | ||
71 | |||
72 | INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, | ||
73 | avc_start_pgoff, avc_last_pgoff, | ||
74 | static inline, __anon_vma_interval_tree) | ||
75 | |||
76 | void anon_vma_interval_tree_insert(struct anon_vma_chain *node, | ||
77 | struct rb_root *root) | ||
78 | { | ||
79 | #ifdef CONFIG_DEBUG_VM_RB | ||
80 | node->cached_vma_start = avc_start_pgoff(node); | ||
81 | node->cached_vma_last = avc_last_pgoff(node); | ||
82 | #endif | ||
83 | __anon_vma_interval_tree_insert(node, root); | ||
84 | } | ||
85 | |||
86 | void anon_vma_interval_tree_remove(struct anon_vma_chain *node, | ||
87 | struct rb_root *root) | ||
88 | { | ||
89 | __anon_vma_interval_tree_remove(node, root); | ||
90 | } | ||
91 | |||
92 | struct anon_vma_chain * | ||
93 | anon_vma_interval_tree_iter_first(struct rb_root *root, | ||
94 | unsigned long first, unsigned long last) | ||
95 | { | ||
96 | return __anon_vma_interval_tree_iter_first(root, first, last); | ||
97 | } | ||
98 | |||
99 | struct anon_vma_chain * | ||
100 | anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, | ||
101 | unsigned long first, unsigned long last) | ||
102 | { | ||
103 | return __anon_vma_interval_tree_iter_next(node, first, last); | ||
104 | } | ||
105 | |||
106 | #ifdef CONFIG_DEBUG_VM_RB | ||
107 | void anon_vma_interval_tree_verify(struct anon_vma_chain *node) | ||
108 | { | ||
109 | WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); | ||
110 | WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); | ||
111 | } | ||
112 | #endif | ||
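The new mm/interval_tree.c keys each VMA by the range of file page offsets it maps. A standalone sketch of the start/last computation and of the overlap query the tree answers is below; the struct only mimics the vm_area_struct fields involved, and PAGE_SHIFT is assumed to be 12 for the example.

/* Minimal userspace sketch of the pgoff interval used by mm/interval_tree.c. */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT 12

struct vma_like {
	unsigned long vm_start, vm_end;	/* virtual address range */
	unsigned long vm_pgoff;		/* file offset, in pages */
};

static unsigned long start_pgoff(const struct vma_like *v)
{
	return v->vm_pgoff;
}

static unsigned long last_pgoff(const struct vma_like *v)
{
	/* last page actually covered, hence the trailing -1 */
	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

/* Does the vma map any page in [first, last]?  This is the predicate the
 * interval tree answers in O(log n) instead of walking every vma. */
static bool overlaps(const struct vma_like *v, unsigned long first,
		     unsigned long last)
{
	return start_pgoff(v) <= last && last_pgoff(v) >= first;
}

int main(void)
{
	struct vma_like v = { 0x100000, 0x104000, 8 };	/* maps pages 8..11 */

	printf("pgoff range: %lu..%lu\n", start_pgoff(&v), last_pgoff(&v));
	printf("overlaps 10..20: %d\n", overlaps(&v, 10, 20));
	printf("overlaps 12..20: %d\n", overlaps(&v, 12, 20));
	return 0;
}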
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 0de83b4541e9..a217cc544060 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -29,7 +29,7 @@ | |||
29 | * - kmemleak_lock (rwlock): protects the object_list modifications and | 29 | * - kmemleak_lock (rwlock): protects the object_list modifications and |
30 | * accesses to the object_tree_root. The object_list is the main list | 30 | * accesses to the object_tree_root. The object_list is the main list |
31 | * holding the metadata (struct kmemleak_object) for the allocated memory | 31 | * holding the metadata (struct kmemleak_object) for the allocated memory |
32 | * blocks. The object_tree_root is a priority search tree used to look-up | 32 | * blocks. The object_tree_root is a red black tree used to look-up |
33 | * metadata based on a pointer to the corresponding memory block. The | 33 | * metadata based on a pointer to the corresponding memory block. The |
34 | * kmemleak_object structures are added to the object_list and | 34 | * kmemleak_object structures are added to the object_list and |
35 | * object_tree_root in the create_object() function called from the | 35 | * object_tree_root in the create_object() function called from the |
@@ -71,7 +71,7 @@ | |||
71 | #include <linux/delay.h> | 71 | #include <linux/delay.h> |
72 | #include <linux/export.h> | 72 | #include <linux/export.h> |
73 | #include <linux/kthread.h> | 73 | #include <linux/kthread.h> |
74 | #include <linux/prio_tree.h> | 74 | #include <linux/rbtree.h> |
75 | #include <linux/fs.h> | 75 | #include <linux/fs.h> |
76 | #include <linux/debugfs.h> | 76 | #include <linux/debugfs.h> |
77 | #include <linux/seq_file.h> | 77 | #include <linux/seq_file.h> |
@@ -132,7 +132,7 @@ struct kmemleak_scan_area { | |||
132 | * Structure holding the metadata for each allocated memory block. | 132 | * Structure holding the metadata for each allocated memory block. |
133 | * Modifications to such objects should be made while holding the | 133 | * Modifications to such objects should be made while holding the |
134 | * object->lock. Insertions or deletions from object_list, gray_list or | 134 | * object->lock. Insertions or deletions from object_list, gray_list or |
135 | * tree_node are already protected by the corresponding locks or mutex (see | 135 | * rb_node are already protected by the corresponding locks or mutex (see |
136 | * the notes on locking above). These objects are reference-counted | 136 | * the notes on locking above). These objects are reference-counted |
137 | * (use_count) and freed using the RCU mechanism. | 137 | * (use_count) and freed using the RCU mechanism. |
138 | */ | 138 | */ |
@@ -141,7 +141,7 @@ struct kmemleak_object { | |||
141 | unsigned long flags; /* object status flags */ | 141 | unsigned long flags; /* object status flags */ |
142 | struct list_head object_list; | 142 | struct list_head object_list; |
143 | struct list_head gray_list; | 143 | struct list_head gray_list; |
144 | struct prio_tree_node tree_node; | 144 | struct rb_node rb_node; |
145 | struct rcu_head rcu; /* object_list lockless traversal */ | 145 | struct rcu_head rcu; /* object_list lockless traversal */ |
146 | /* object usage count; object freed when use_count == 0 */ | 146 | /* object usage count; object freed when use_count == 0 */ |
147 | atomic_t use_count; | 147 | atomic_t use_count; |
@@ -182,9 +182,9 @@ struct kmemleak_object { | |||
182 | static LIST_HEAD(object_list); | 182 | static LIST_HEAD(object_list); |
183 | /* the list of gray-colored objects (see color_gray comment below) */ | 183 | /* the list of gray-colored objects (see color_gray comment below) */ |
184 | static LIST_HEAD(gray_list); | 184 | static LIST_HEAD(gray_list); |
185 | /* prio search tree for object boundaries */ | 185 | /* search tree for object boundaries */ |
186 | static struct prio_tree_root object_tree_root; | 186 | static struct rb_root object_tree_root = RB_ROOT; |
187 | /* rw_lock protecting the access to object_list and prio_tree_root */ | 187 | /* rw_lock protecting the access to object_list and object_tree_root */ |
188 | static DEFINE_RWLOCK(kmemleak_lock); | 188 | static DEFINE_RWLOCK(kmemleak_lock); |
189 | 189 | ||
190 | /* allocation caches for kmemleak internal data */ | 190 | /* allocation caches for kmemleak internal data */ |
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
380 | trace.entries = object->trace; | 380 | trace.entries = object->trace; |
381 | 381 | ||
382 | pr_notice("Object 0x%08lx (size %zu):\n", | 382 | pr_notice("Object 0x%08lx (size %zu):\n", |
383 | object->tree_node.start, object->size); | 383 | object->pointer, object->size); |
384 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", | 384 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", |
385 | object->comm, object->pid, object->jiffies); | 385 | object->comm, object->pid, object->jiffies); |
386 | pr_notice(" min_count = %d\n", object->min_count); | 386 | pr_notice(" min_count = %d\n", object->min_count); |
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object) | |||
392 | } | 392 | } |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * Look-up a memory block metadata (kmemleak_object) in the priority search | 395 | * Look-up a memory block metadata (kmemleak_object) in the object search |
396 | * tree based on a pointer value. If alias is 0, only values pointing to the | 396 | * tree based on a pointer value. If alias is 0, only values pointing to the |
397 | * beginning of the memory block are allowed. The kmemleak_lock must be held | 397 | * beginning of the memory block are allowed. The kmemleak_lock must be held |
398 | * when calling this function. | 398 | * when calling this function. |
399 | */ | 399 | */ |
400 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) | 400 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) |
401 | { | 401 | { |
402 | struct prio_tree_node *node; | 402 | struct rb_node *rb = object_tree_root.rb_node; |
403 | struct prio_tree_iter iter; | 403 | |
404 | struct kmemleak_object *object; | 404 | while (rb) { |
405 | 405 | struct kmemleak_object *object = | |
406 | prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); | 406 | rb_entry(rb, struct kmemleak_object, rb_node); |
407 | node = prio_tree_next(&iter); | 407 | if (ptr < object->pointer) |
408 | if (node) { | 408 | rb = object->rb_node.rb_left; |
409 | object = prio_tree_entry(node, struct kmemleak_object, | 409 | else if (object->pointer + object->size <= ptr) |
410 | tree_node); | 410 | rb = object->rb_node.rb_right; |
411 | if (!alias && object->pointer != ptr) { | 411 | else if (object->pointer == ptr || alias) |
412 | return object; | ||
413 | else { | ||
412 | kmemleak_warn("Found object by alias at 0x%08lx\n", | 414 | kmemleak_warn("Found object by alias at 0x%08lx\n", |
413 | ptr); | 415 | ptr); |
414 | dump_object_info(object); | 416 | dump_object_info(object); |
415 | object = NULL; | 417 | break; |
416 | } | 418 | } |
417 | } else | 419 | } |
418 | object = NULL; | 420 | return NULL; |
419 | |||
420 | return object; | ||
421 | } | 421 | } |
422 | 422 | ||
423 | /* | 423 | /* |
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object) | |||
471 | } | 471 | } |
472 | 472 | ||
473 | /* | 473 | /* |
474 | * Look up an object in the prio search tree and increase its use_count. | 474 | * Look up an object in the object search tree and increase its use_count. |
475 | */ | 475 | */ |
476 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | 476 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) |
477 | { | 477 | { |
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
516 | int min_count, gfp_t gfp) | 516 | int min_count, gfp_t gfp) |
517 | { | 517 | { |
518 | unsigned long flags; | 518 | unsigned long flags; |
519 | struct kmemleak_object *object; | 519 | struct kmemleak_object *object, *parent; |
520 | struct prio_tree_node *node; | 520 | struct rb_node **link, *rb_parent; |
521 | 521 | ||
522 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); | 522 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); |
523 | if (!object) { | 523 | if (!object) { |
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
560 | /* kernel backtrace */ | 560 | /* kernel backtrace */ |
561 | object->trace_len = __save_stack_trace(object->trace); | 561 | object->trace_len = __save_stack_trace(object->trace); |
562 | 562 | ||
563 | INIT_PRIO_TREE_NODE(&object->tree_node); | ||
564 | object->tree_node.start = ptr; | ||
565 | object->tree_node.last = ptr + size - 1; | ||
566 | |||
567 | write_lock_irqsave(&kmemleak_lock, flags); | 563 | write_lock_irqsave(&kmemleak_lock, flags); |
568 | 564 | ||
569 | min_addr = min(min_addr, ptr); | 565 | min_addr = min(min_addr, ptr); |
570 | max_addr = max(max_addr, ptr + size); | 566 | max_addr = max(max_addr, ptr + size); |
571 | node = prio_tree_insert(&object_tree_root, &object->tree_node); | 567 | link = &object_tree_root.rb_node; |
572 | /* | 568 | rb_parent = NULL; |
573 | * The code calling the kernel does not yet have the pointer to the | 569 | while (*link) { |
574 | * memory block to be able to free it. However, we still hold the | 570 | rb_parent = *link; |
575 | * kmemleak_lock here in case parts of the kernel started freeing | 571 | parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); |
576 | * random memory blocks. | 572 | if (ptr + size <= parent->pointer) |
577 | */ | 573 | link = &parent->rb_node.rb_left; |
578 | if (node != &object->tree_node) { | 574 | else if (parent->pointer + parent->size <= ptr) |
579 | kmemleak_stop("Cannot insert 0x%lx into the object search tree " | 575 | link = &parent->rb_node.rb_right; |
580 | "(already existing)\n", ptr); | 576 | else { |
581 | object = lookup_object(ptr, 1); | 577 | kmemleak_stop("Cannot insert 0x%lx into the object " |
582 | spin_lock(&object->lock); | 578 | "search tree (overlaps existing)\n", |
583 | dump_object_info(object); | 579 | ptr); |
584 | spin_unlock(&object->lock); | 580 | kmem_cache_free(object_cache, object); |
585 | 581 | object = parent; | |
586 | goto out; | 582 | spin_lock(&object->lock); |
583 | dump_object_info(object); | ||
584 | spin_unlock(&object->lock); | ||
585 | goto out; | ||
586 | } | ||
587 | } | 587 | } |
588 | rb_link_node(&object->rb_node, rb_parent, link); | ||
589 | rb_insert_color(&object->rb_node, &object_tree_root); | ||
590 | |||
588 | list_add_tail_rcu(&object->object_list, &object_list); | 591 | list_add_tail_rcu(&object->object_list, &object_list); |
589 | out: | 592 | out: |
590 | write_unlock_irqrestore(&kmemleak_lock, flags); | 593 | write_unlock_irqrestore(&kmemleak_lock, flags); |
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object) | |||
600 | unsigned long flags; | 603 | unsigned long flags; |
601 | 604 | ||
602 | write_lock_irqsave(&kmemleak_lock, flags); | 605 | write_lock_irqsave(&kmemleak_lock, flags); |
603 | prio_tree_remove(&object_tree_root, &object->tree_node); | 606 | rb_erase(&object->rb_node, &object_tree_root); |
604 | list_del_rcu(&object->object_list); | 607 | list_del_rcu(&object->object_list); |
605 | write_unlock_irqrestore(&kmemleak_lock, flags); | 608 | write_unlock_irqrestore(&kmemleak_lock, flags); |
606 | 609 | ||
@@ -1766,7 +1769,6 @@ void __init kmemleak_init(void) | |||
1766 | 1769 | ||
1767 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | 1770 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); |
1768 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | 1771 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); |
1769 | INIT_PRIO_TREE_ROOT(&object_tree_root); | ||
1770 | 1772 | ||
1771 | if (crt_early_log >= ARRAY_SIZE(early_log)) | 1773 | if (crt_early_log >= ARRAY_SIZE(early_log)) |
1772 | pr_warning("Early log buffer exceeded (%d), please increase " | 1774 | pr_warning("Early log buffer exceeded (%d), please increase " |
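With the prio tree gone, kmemleak looks objects up by walking a tree ordered on non-overlapping [pointer, pointer + size) ranges. The sketch below reproduces that three-way comparison with a plain unbalanced binary search tree standing in for struct rb_node; it illustrates the lookup logic only and is not the kernel implementation.

/* Userspace sketch of the range lookup kmemleak now does on its rbtree. */
#include <stdio.h>
#include <stdlib.h>

struct object {
	unsigned long pointer, size;
	struct object *left, *right;
};

static struct object *lookup(struct object *root, unsigned long ptr)
{
	while (root) {
		if (ptr < root->pointer)
			root = root->left;	/* block starts above ptr */
		else if (root->pointer + root->size <= ptr)
			root = root->right;	/* block ends below ptr */
		else
			return root;		/* ptr falls inside the block */
	}
	return NULL;
}

static struct object *insert(struct object *root, struct object *node)
{
	if (!root)
		return node;
	if (node->pointer + node->size <= root->pointer)
		root->left = insert(root->left, node);
	else if (root->pointer + root->size <= node->pointer)
		root->right = insert(root->right, node);
	/* overlapping ranges are not inserted; kmemleak reports this as an error */
	return root;
}

int main(void)
{
	struct object a = { 0x1000, 64 }, b = { 0x2000, 128 };
	struct object *root = NULL;

	root = insert(root, &a);
	root = insert(root, &b);
	printf("0x2010 -> %p (expect &b=%p)\n", (void *)lookup(root, 0x2010), (void *)&b);
	printf("0x1800 -> %p (expect NULL)\n", (void *)lookup(root, 0x1800));
	return 0;
}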
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
709 | spinlock_t *ptl; | 709 | spinlock_t *ptl; |
710 | int swapped; | 710 | int swapped; |
711 | int err = -EFAULT; | 711 | int err = -EFAULT; |
712 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
713 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
712 | 714 | ||
713 | addr = page_address_in_vma(page, vma); | 715 | addr = page_address_in_vma(page, vma); |
714 | if (addr == -EFAULT) | 716 | if (addr == -EFAULT) |
715 | goto out; | 717 | goto out; |
716 | 718 | ||
717 | BUG_ON(PageTransCompound(page)); | 719 | BUG_ON(PageTransCompound(page)); |
720 | |||
721 | mmun_start = addr; | ||
722 | mmun_end = addr + PAGE_SIZE; | ||
723 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
724 | |||
718 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 725 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
719 | if (!ptep) | 726 | if (!ptep) |
720 | goto out; | 727 | goto out_mn; |
721 | 728 | ||
722 | if (pte_write(*ptep) || pte_dirty(*ptep)) { | 729 | if (pte_write(*ptep) || pte_dirty(*ptep)) { |
723 | pte_t entry; | 730 | pte_t entry; |
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
752 | 759 | ||
753 | out_unlock: | 760 | out_unlock: |
754 | pte_unmap_unlock(ptep, ptl); | 761 | pte_unmap_unlock(ptep, ptl); |
762 | out_mn: | ||
763 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
755 | out: | 764 | out: |
756 | return err; | 765 | return err; |
757 | } | 766 | } |
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
776 | spinlock_t *ptl; | 785 | spinlock_t *ptl; |
777 | unsigned long addr; | 786 | unsigned long addr; |
778 | int err = -EFAULT; | 787 | int err = -EFAULT; |
788 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
789 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
779 | 790 | ||
780 | addr = page_address_in_vma(page, vma); | 791 | addr = page_address_in_vma(page, vma); |
781 | if (addr == -EFAULT) | 792 | if (addr == -EFAULT) |
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
794 | if (!pmd_present(*pmd)) | 805 | if (!pmd_present(*pmd)) |
795 | goto out; | 806 | goto out; |
796 | 807 | ||
808 | mmun_start = addr; | ||
809 | mmun_end = addr + PAGE_SIZE; | ||
810 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
811 | |||
797 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | 812 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); |
798 | if (!pte_same(*ptep, orig_pte)) { | 813 | if (!pte_same(*ptep, orig_pte)) { |
799 | pte_unmap_unlock(ptep, ptl); | 814 | pte_unmap_unlock(ptep, ptl); |
800 | goto out; | 815 | goto out_mn; |
801 | } | 816 | } |
802 | 817 | ||
803 | get_page(kpage); | 818 | get_page(kpage); |
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
814 | 829 | ||
815 | pte_unmap_unlock(ptep, ptl); | 830 | pte_unmap_unlock(ptep, ptl); |
816 | err = 0; | 831 | err = 0; |
832 | out_mn: | ||
833 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
817 | out: | 834 | out: |
818 | return err; | 835 | return err; |
819 | } | 836 | } |
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1469 | */ | 1486 | */ |
1470 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1487 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1471 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1488 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1472 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1489 | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) |
1473 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) | ||
1474 | return 0; /* just ignore the advice */ | 1490 | return 0; /* just ignore the advice */ |
1475 | 1491 | ||
1492 | #ifdef VM_SAO | ||
1493 | if (*vm_flags & VM_SAO) | ||
1494 | return 0; | ||
1495 | #endif | ||
1496 | |||
1476 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1497 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
1477 | err = __ksm_enter(mm); | 1498 | err = __ksm_enter(mm); |
1478 | if (err) | 1499 | if (err) |
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page, | |||
1582 | SetPageSwapBacked(new_page); | 1603 | SetPageSwapBacked(new_page); |
1583 | __set_page_locked(new_page); | 1604 | __set_page_locked(new_page); |
1584 | 1605 | ||
1585 | if (page_evictable(new_page, vma)) | 1606 | if (!mlocked_vma_newpage(vma, new_page)) |
1586 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | 1607 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); |
1587 | else | 1608 | else |
1588 | add_page_to_unevictable_list(new_page); | 1609 | add_page_to_unevictable_list(new_page); |
@@ -1614,7 +1635,8 @@ again: | |||
1614 | struct vm_area_struct *vma; | 1635 | struct vm_area_struct *vma; |
1615 | 1636 | ||
1616 | anon_vma_lock(anon_vma); | 1637 | anon_vma_lock(anon_vma); |
1617 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1638 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1639 | 0, ULONG_MAX) { | ||
1618 | vma = vmac->vma; | 1640 | vma = vmac->vma; |
1619 | if (rmap_item->address < vma->vm_start || | 1641 | if (rmap_item->address < vma->vm_start || |
1620 | rmap_item->address >= vma->vm_end) | 1642 | rmap_item->address >= vma->vm_end) |
@@ -1667,7 +1689,8 @@ again: | |||
1667 | struct vm_area_struct *vma; | 1689 | struct vm_area_struct *vma; |
1668 | 1690 | ||
1669 | anon_vma_lock(anon_vma); | 1691 | anon_vma_lock(anon_vma); |
1670 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1692 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1693 | 0, ULONG_MAX) { | ||
1671 | vma = vmac->vma; | 1694 | vma = vmac->vma; |
1672 | if (rmap_item->address < vma->vm_start || | 1695 | if (rmap_item->address < vma->vm_start || |
1673 | rmap_item->address >= vma->vm_end) | 1696 | rmap_item->address >= vma->vm_end) |
@@ -1719,7 +1742,8 @@ again: | |||
1719 | struct vm_area_struct *vma; | 1742 | struct vm_area_struct *vma; |
1720 | 1743 | ||
1721 | anon_vma_lock(anon_vma); | 1744 | anon_vma_lock(anon_vma); |
1722 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1745 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1746 | 0, ULONG_MAX) { | ||
1723 | vma = vmac->vma; | 1747 | vma = vmac->vma; |
1724 | if (rmap_item->address < vma->vm_start || | 1748 | if (rmap_item->address < vma->vm_start || |
1725 | rmap_item->address >= vma->vm_end) | 1749 | rmap_item->address >= vma->vm_end) |
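write_protect_page() and replace_page() now bracket the PTE update with mmu_notifier_invalidate_range_start()/_end(), and every error path taken after the start call has to reach the matching end call. The sketch below shows that goto-based unwind with stub notifier functions standing in for the real hooks; nothing here is the kernel API.

/* Sketch of the start/end bracketing KSM now does around a PTE update. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static void invalidate_range_start(unsigned long start, unsigned long end)
{
	printf("invalidate start [%#lx, %#lx)\n", start, end);
}

static void invalidate_range_end(unsigned long start, unsigned long end)
{
	printf("invalidate end   [%#lx, %#lx)\n", start, end);
}

/* Returns 0 on success, -1 if the PTE could not be located (the analogue
 * of page_check_address() failing in write_protect_page()). */
static int write_protect(unsigned long addr, int pte_found)
{
	unsigned long mmun_start = addr;
	unsigned long mmun_end = addr + PAGE_SIZE;
	int err = -1;

	invalidate_range_start(mmun_start, mmun_end);

	if (!pte_found)
		goto out_mn;	/* error path still has to call _end */

	printf("write-protecting pte at %#lx\n", addr);
	err = 0;

out_mn:
	invalidate_range_end(mmun_start, mmun_end);
	return err;
}

int main(void)
{
	write_protect(0x7f0000000000UL, 1);
	write_protect(0x7f0000001000UL, 0);
	return 0;
}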
diff --git a/mm/madvise.c b/mm/madvise.c index 14d260fa0d17..03dfa5c7adb3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
69 | new_flags &= ~VM_DONTCOPY; | 69 | new_flags &= ~VM_DONTCOPY; |
70 | break; | 70 | break; |
71 | case MADV_DONTDUMP: | 71 | case MADV_DONTDUMP: |
72 | new_flags |= VM_NODUMP; | 72 | new_flags |= VM_DONTDUMP; |
73 | break; | 73 | break; |
74 | case MADV_DODUMP: | 74 | case MADV_DODUMP: |
75 | new_flags &= ~VM_NODUMP; | 75 | if (new_flags & VM_SPECIAL) { |
76 | error = -EINVAL; | ||
77 | goto out; | ||
78 | } | ||
79 | new_flags &= ~VM_DONTDUMP; | ||
76 | break; | 80 | break; |
77 | case MADV_MERGEABLE: | 81 | case MADV_MERGEABLE: |
78 | case MADV_UNMERGEABLE: | 82 | case MADV_UNMERGEABLE: |
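MADV_DODUMP now refuses to clear VM_DONTDUMP on special mappings instead of silently succeeding. A tiny sketch of the flag logic, with illustrative flag values and a stand-in for the kernel's VM_SPECIAL mask:

/* Sketch of the new MADV_DODUMP rule: special mappings keep VM_DONTDUMP. */
#include <stdio.h>

#define VM_IO       0x1
#define VM_DONTDUMP 0x2
#define VM_SPECIAL  VM_IO	/* stand-in for the kernel's VM_SPECIAL mask */

static int madv_dodump(unsigned long *flags)
{
	if (*flags & VM_SPECIAL)
		return -1;		/* -EINVAL in the kernel */
	*flags &= ~VM_DONTDUMP;
	return 0;
}

int main(void)
{
	unsigned long normal = VM_DONTDUMP;
	unsigned long special = VM_IO | VM_DONTDUMP;

	printf("normal:  ret=%d flags=%#lx\n", madv_dodump(&normal), normal);
	printf("special: ret=%d flags=%#lx\n", madv_dodump(&special), special);
	return 0;
}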
diff --git a/mm/memblock.c b/mm/memblock.c index 82aa349d2f7a..931eef145af5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0; | |||
41 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 41 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
42 | 42 | ||
43 | /* inline so we don't get a warning when pr_debug is compiled out */ | 43 | /* inline so we don't get a warning when pr_debug is compiled out */ |
44 | static inline const char *memblock_type_name(struct memblock_type *type) | 44 | static __init_memblock const char * |
45 | memblock_type_name(struct memblock_type *type) | ||
45 | { | 46 | { |
46 | if (type == &memblock.memory) | 47 | if (type == &memblock.memory) |
47 | return "memory"; | 48 | return "memory"; |
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | |||
756 | return ret; | 757 | return ret; |
757 | 758 | ||
758 | for (i = start_rgn; i < end_rgn; i++) | 759 | for (i = start_rgn; i < end_rgn; i++) |
759 | type->regions[i].nid = nid; | 760 | memblock_set_region_node(&type->regions[i], nid); |
760 | 761 | ||
761 | memblock_merge_regions(type); | 762 | memblock_merge_regions(type); |
762 | return 0; | 763 | return 0; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a72f2ffdc3d0..7acf43bf04a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/oom.h> | 51 | #include <linux/oom.h> |
52 | #include "internal.h" | 52 | #include "internal.h" |
53 | #include <net/sock.h> | 53 | #include <net/sock.h> |
54 | #include <net/ip.h> | ||
54 | #include <net/tcp_memcontrol.h> | 55 | #include <net/tcp_memcontrol.h> |
55 | 56 | ||
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
@@ -326,7 +327,7 @@ struct mem_cgroup { | |||
326 | struct mem_cgroup_stat_cpu nocpu_base; | 327 | struct mem_cgroup_stat_cpu nocpu_base; |
327 | spinlock_t pcp_counter_lock; | 328 | spinlock_t pcp_counter_lock; |
328 | 329 | ||
329 | #ifdef CONFIG_INET | 330 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
330 | struct tcp_memcontrol tcp_mem; | 331 | struct tcp_memcontrol tcp_mem; |
331 | #endif | 332 | #endif |
332 | }; | 333 | }; |
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | |||
411 | return container_of(s, struct mem_cgroup, css); | 412 | return container_of(s, struct mem_cgroup, css); |
412 | } | 413 | } |
413 | 414 | ||
415 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||
416 | { | ||
417 | return (memcg == root_mem_cgroup); | ||
418 | } | ||
419 | |||
414 | /* Writing them here to avoid exposing memcg's inner layout */ | 420 | /* Writing them here to avoid exposing memcg's inner layout */ |
415 | #ifdef CONFIG_MEMCG_KMEM | 421 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
416 | #include <net/sock.h> | ||
417 | #include <net/ip.h> | ||
418 | 422 | ||
419 | static bool mem_cgroup_is_root(struct mem_cgroup *memcg); | ||
420 | void sock_update_memcg(struct sock *sk) | 423 | void sock_update_memcg(struct sock *sk) |
421 | { | 424 | { |
422 | if (mem_cgroup_sockets_enabled) { | 425 | if (mem_cgroup_sockets_enabled) { |
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk) | |||
461 | } | 464 | } |
462 | } | 465 | } |
463 | 466 | ||
464 | #ifdef CONFIG_INET | ||
465 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | 467 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) |
466 | { | 468 | { |
467 | if (!memcg || mem_cgroup_is_root(memcg)) | 469 | if (!memcg || mem_cgroup_is_root(memcg)) |
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
470 | return &memcg->tcp_mem.cg_proto; | 472 | return &memcg->tcp_mem.cg_proto; |
471 | } | 473 | } |
472 | EXPORT_SYMBOL(tcp_proto_cgroup); | 474 | EXPORT_SYMBOL(tcp_proto_cgroup); |
473 | #endif /* CONFIG_INET */ | ||
474 | #endif /* CONFIG_MEMCG_KMEM */ | ||
475 | 475 | ||
476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) | ||
477 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 476 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
478 | { | 477 | { |
479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 478 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) |
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
1016 | iter != NULL; \ | 1015 | iter != NULL; \ |
1017 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1016 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
1018 | 1017 | ||
1019 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||
1020 | { | ||
1021 | return (memcg == root_mem_cgroup); | ||
1022 | } | ||
1023 | |||
1024 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1018 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
1025 | { | 1019 | { |
1026 | struct mem_cgroup *memcg; | 1020 | struct mem_cgroup *memcg; |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a6610..6c5899b9034a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
400 | struct vm_area_struct *vma; | 400 | struct vm_area_struct *vma; |
401 | struct task_struct *tsk; | 401 | struct task_struct *tsk; |
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | ||
403 | 404 | ||
404 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma(page); |
405 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
406 | return; | 407 | return; |
407 | 408 | ||
409 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
408 | read_lock(&tasklist_lock); | 410 | read_lock(&tasklist_lock); |
409 | for_each_process (tsk) { | 411 | for_each_process (tsk) { |
410 | struct anon_vma_chain *vmac; | 412 | struct anon_vma_chain *vmac; |
411 | 413 | ||
412 | if (!task_early_kill(tsk)) | 414 | if (!task_early_kill(tsk)) |
413 | continue; | 415 | continue; |
414 | list_for_each_entry(vmac, &av->head, same_anon_vma) { | 416 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
417 | pgoff, pgoff) { | ||
415 | vma = vmac->vma; | 418 | vma = vmac->vma; |
416 | if (!page_mapped_in_vma(page, vma)) | 419 | if (!page_mapped_in_vma(page, vma)) |
417 | continue; | 420 | continue; |
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
431 | { | 434 | { |
432 | struct vm_area_struct *vma; | 435 | struct vm_area_struct *vma; |
433 | struct task_struct *tsk; | 436 | struct task_struct *tsk; |
434 | struct prio_tree_iter iter; | ||
435 | struct address_space *mapping = page->mapping; | 437 | struct address_space *mapping = page->mapping; |
436 | 438 | ||
437 | mutex_lock(&mapping->i_mmap_mutex); | 439 | mutex_lock(&mapping->i_mmap_mutex); |
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
442 | if (!task_early_kill(tsk)) | 444 | if (!task_early_kill(tsk)) |
443 | continue; | 445 | continue; |
444 | 446 | ||
445 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, | 447 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
446 | pgoff) { | 448 | pgoff) { |
447 | /* | 449 | /* |
448 | * Send early kill signal to tasks where a vma covers | 450 | * Send early kill signal to tasks where a vma covers |
diff --git a/mm/memory.c b/mm/memory.c index 57361708d1a5..fb135ba4aba9 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
712 | add_taint(TAINT_BAD_PAGE); | 712 | add_taint(TAINT_BAD_PAGE); |
713 | } | 713 | } |
714 | 714 | ||
715 | static inline int is_cow_mapping(vm_flags_t flags) | 715 | static inline bool is_cow_mapping(vm_flags_t flags) |
716 | { | 716 | { |
717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
718 | } | 718 | } |
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1039 | unsigned long next; | 1039 | unsigned long next; |
1040 | unsigned long addr = vma->vm_start; | 1040 | unsigned long addr = vma->vm_start; |
1041 | unsigned long end = vma->vm_end; | 1041 | unsigned long end = vma->vm_end; |
1042 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1043 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1044 | bool is_cow; | ||
1042 | int ret; | 1045 | int ret; |
1043 | 1046 | ||
1044 | /* | 1047 | /* |
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1047 | * readonly mappings. The tradeoff is that copy_page_range is more | 1050 | * readonly mappings. The tradeoff is that copy_page_range is more |
1048 | * efficient than faulting. | 1051 | * efficient than faulting. |
1049 | */ | 1052 | */ |
1050 | if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { | 1053 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | |
1054 | VM_PFNMAP | VM_MIXEDMAP))) { | ||
1051 | if (!vma->anon_vma) | 1055 | if (!vma->anon_vma) |
1052 | return 0; | 1056 | return 0; |
1053 | } | 1057 | } |
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1055 | if (is_vm_hugetlb_page(vma)) | 1059 | if (is_vm_hugetlb_page(vma)) |
1056 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1060 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
1057 | 1061 | ||
1058 | if (unlikely(is_pfn_mapping(vma))) { | 1062 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
1059 | /* | 1063 | /* |
1060 | * We do not free on error cases below as remove_vma | 1064 | * We do not free on error cases below as remove_vma |
1061 | * gets called on error from higher level routine | 1065 | * gets called on error from higher level routine |
1062 | */ | 1066 | */ |
1063 | ret = track_pfn_vma_copy(vma); | 1067 | ret = track_pfn_copy(vma); |
1064 | if (ret) | 1068 | if (ret) |
1065 | return ret; | 1069 | return ret; |
1066 | } | 1070 | } |
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1071 | * parent mm. And a permission downgrade will only happen if | 1075 | * parent mm. And a permission downgrade will only happen if |
1072 | * is_cow_mapping() returns true. | 1076 | * is_cow_mapping() returns true. |
1073 | */ | 1077 | */ |
1074 | if (is_cow_mapping(vma->vm_flags)) | 1078 | is_cow = is_cow_mapping(vma->vm_flags); |
1075 | mmu_notifier_invalidate_range_start(src_mm, addr, end); | 1079 | mmun_start = addr; |
1080 | mmun_end = end; | ||
1081 | if (is_cow) | ||
1082 | mmu_notifier_invalidate_range_start(src_mm, mmun_start, | ||
1083 | mmun_end); | ||
1076 | 1084 | ||
1077 | ret = 0; | 1085 | ret = 0; |
1078 | dst_pgd = pgd_offset(dst_mm, addr); | 1086 | dst_pgd = pgd_offset(dst_mm, addr); |
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1088 | } | 1096 | } |
1089 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 1097 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
1090 | 1098 | ||
1091 | if (is_cow_mapping(vma->vm_flags)) | 1099 | if (is_cow) |
1092 | mmu_notifier_invalidate_range_end(src_mm, | 1100 | mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); |
1093 | vma->vm_start, end); | ||
1094 | return ret; | 1101 | return ret; |
1095 | } | 1102 | } |
1096 | 1103 | ||
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1327 | if (vma->vm_file) | 1334 | if (vma->vm_file) |
1328 | uprobe_munmap(vma, start, end); | 1335 | uprobe_munmap(vma, start, end); |
1329 | 1336 | ||
1330 | if (unlikely(is_pfn_mapping(vma))) | 1337 | if (unlikely(vma->vm_flags & VM_PFNMAP)) |
1331 | untrack_pfn_vma(vma, 0, 0); | 1338 | untrack_pfn(vma, 0, 0); |
1332 | 1339 | ||
1333 | if (start != end) { | 1340 | if (start != end) { |
1334 | if (unlikely(is_vm_hugetlb_page(vma))) { | 1341 | if (unlikely(is_vm_hugetlb_page(vma))) { |
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1521 | spin_unlock(&mm->page_table_lock); | 1528 | spin_unlock(&mm->page_table_lock); |
1522 | wait_split_huge_page(vma->anon_vma, pmd); | 1529 | wait_split_huge_page(vma->anon_vma, pmd); |
1523 | } else { | 1530 | } else { |
1524 | page = follow_trans_huge_pmd(mm, address, | 1531 | page = follow_trans_huge_pmd(vma, address, |
1525 | pmd, flags); | 1532 | pmd, flags); |
1526 | spin_unlock(&mm->page_table_lock); | 1533 | spin_unlock(&mm->page_table_lock); |
1527 | goto out; | 1534 | goto out; |
@@ -1576,12 +1583,12 @@ split_fallthrough: | |||
1576 | if (page->mapping && trylock_page(page)) { | 1583 | if (page->mapping && trylock_page(page)) { |
1577 | lru_add_drain(); /* push cached pages to LRU */ | 1584 | lru_add_drain(); /* push cached pages to LRU */ |
1578 | /* | 1585 | /* |
1579 | * Because we lock page here and migration is | 1586 | * Because we lock page here, and migration is |
1580 | * blocked by the pte's page reference, we need | 1587 | * blocked by the pte's page reference, and we |
1581 | * only check for file-cache page truncation. | 1588 | * know the page is still mapped, we don't even |
1589 | * need to check for file-cache page truncation. | ||
1582 | */ | 1590 | */ |
1583 | if (page->mapping) | 1591 | mlock_vma_page(page); |
1584 | mlock_vma_page(page); | ||
1585 | unlock_page(page); | 1592 | unlock_page(page); |
1586 | } | 1593 | } |
1587 | } | 1594 | } |
@@ -2085,6 +2092,11 @@ out: | |||
2085 | * ask for a shared writable mapping! | 2092 | * ask for a shared writable mapping! |
2086 | * | 2093 | * |
2087 | * The page does not need to be reserved. | 2094 | * The page does not need to be reserved. |
2095 | * | ||
2096 | * Usually this function is called from f_op->mmap() handler | ||
2097 | * under mm->mmap_sem write-lock, so it can change vma->vm_flags. | ||
2098 | * Caller must set VM_MIXEDMAP on vma if it wants to call this | ||
2099 | * function from other places, for example from page-fault handler. | ||
2088 | */ | 2100 | */ |
2089 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 2101 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
2090 | struct page *page) | 2102 | struct page *page) |
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
2093 | return -EFAULT; | 2105 | return -EFAULT; |
2094 | if (!page_count(page)) | 2106 | if (!page_count(page)) |
2095 | return -EINVAL; | 2107 | return -EINVAL; |
2096 | vma->vm_flags |= VM_INSERTPAGE; | 2108 | if (!(vma->vm_flags & VM_MIXEDMAP)) { |
2109 | BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); | ||
2110 | BUG_ON(vma->vm_flags & VM_PFNMAP); | ||
2111 | vma->vm_flags |= VM_MIXEDMAP; | ||
2112 | } | ||
2097 | return insert_page(vma, addr, page, vma->vm_page_prot); | 2113 | return insert_page(vma, addr, page, vma->vm_page_prot); |
2098 | } | 2114 | } |
2099 | EXPORT_SYMBOL(vm_insert_page); | 2115 | EXPORT_SYMBOL(vm_insert_page); |
@@ -2132,7 +2148,7 @@ out: | |||
2132 | * @addr: target user address of this page | 2148 | * @addr: target user address of this page |
2133 | * @pfn: source kernel pfn | 2149 | * @pfn: source kernel pfn |
2134 | * | 2150 | * |
2135 | * Similar to vm_inert_page, this allows drivers to insert individual pages | 2151 | * Similar to vm_insert_page, this allows drivers to insert individual pages |
2136 | * they've allocated into a user vma. Same comments apply. | 2152 | * they've allocated into a user vma. Same comments apply. |
2137 | * | 2153 | * |
2138 | * This function should only be called from a vm_ops->fault handler, and | 2154 | * This function should only be called from a vm_ops->fault handler, and |
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
2162 | 2178 | ||
2163 | if (addr < vma->vm_start || addr >= vma->vm_end) | 2179 | if (addr < vma->vm_start || addr >= vma->vm_end) |
2164 | return -EFAULT; | 2180 | return -EFAULT; |
2165 | if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) | 2181 | if (track_pfn_insert(vma, &pgprot, pfn)) |
2166 | return -EINVAL; | 2182 | return -EINVAL; |
2167 | 2183 | ||
2168 | ret = insert_pfn(vma, addr, pfn, pgprot); | 2184 | ret = insert_pfn(vma, addr, pfn, pgprot); |
2169 | 2185 | ||
2170 | if (ret) | ||
2171 | untrack_pfn_vma(vma, pfn, PAGE_SIZE); | ||
2172 | |||
2173 | return ret; | 2186 | return ret; |
2174 | } | 2187 | } |
2175 | EXPORT_SYMBOL(vm_insert_pfn); | 2188 | EXPORT_SYMBOL(vm_insert_pfn); |
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2290 | * rest of the world about it: | 2303 | * rest of the world about it: |
2291 | * VM_IO tells people not to look at these pages | 2304 | * VM_IO tells people not to look at these pages |
2292 | * (accesses can have side effects). | 2305 | * (accesses can have side effects). |
2293 | * VM_RESERVED is specified all over the place, because | ||
2294 | * in 2.4 it kept swapout's vma scan off this vma; but | ||
2295 | * in 2.6 the LRU scan won't even find its pages, so this | ||
2296 | * flag means no more than count its pages in reserved_vm, | ||
2297 | * and omit it from core dump, even when VM_IO turned off. | ||
2298 | * VM_PFNMAP tells the core MM that the base pages are just | 2306 | * VM_PFNMAP tells the core MM that the base pages are just |
2299 | * raw PFN mappings, and do not have a "struct page" associated | 2307 | * raw PFN mappings, and do not have a "struct page" associated |
2300 | * with them. | 2308 | * with them. |
2309 | * VM_DONTEXPAND | ||
2310 | * Disable vma merging and expanding with mremap(). | ||
2311 | * VM_DONTDUMP | ||
2312 | * Omit vma from core dump, even when VM_IO turned off. | ||
2301 | * | 2313 | * |
2302 | * There's a horrible special case to handle copy-on-write | 2314 | * There's a horrible special case to handle copy-on-write |
2303 | * behaviour that some programs depend on. We mark the "original" | 2315 | * behaviour that some programs depend on. We mark the "original" |
2304 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". | 2316 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". |
2317 | * See vm_normal_page() for details. | ||
2305 | */ | 2318 | */ |
2306 | if (addr == vma->vm_start && end == vma->vm_end) { | 2319 | if (is_cow_mapping(vma->vm_flags)) { |
2320 | if (addr != vma->vm_start || end != vma->vm_end) | ||
2321 | return -EINVAL; | ||
2307 | vma->vm_pgoff = pfn; | 2322 | vma->vm_pgoff = pfn; |
2308 | vma->vm_flags |= VM_PFN_AT_MMAP; | 2323 | } |
2309 | } else if (is_cow_mapping(vma->vm_flags)) | ||
2310 | return -EINVAL; | ||
2311 | |||
2312 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | ||
2313 | 2324 | ||
2314 | err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); | 2325 | err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); |
2315 | if (err) { | 2326 | if (err) |
2316 | /* | ||
2317 | * To indicate that track_pfn related cleanup is not | ||
2318 | * needed from higher level routine calling unmap_vmas | ||
2319 | */ | ||
2320 | vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); | ||
2321 | vma->vm_flags &= ~VM_PFN_AT_MMAP; | ||
2322 | return -EINVAL; | 2327 | return -EINVAL; |
2323 | } | 2328 | |
2329 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; | ||
2324 | 2330 | ||
2325 | BUG_ON(addr >= end); | 2331 | BUG_ON(addr >= end); |
2326 | pfn -= addr >> PAGE_SHIFT; | 2332 | pfn -= addr >> PAGE_SHIFT; |
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2335 | } while (pgd++, addr = next, addr != end); | 2341 | } while (pgd++, addr = next, addr != end); |
2336 | 2342 | ||
2337 | if (err) | 2343 | if (err) |
2338 | untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); | 2344 | untrack_pfn(vma, pfn, PAGE_ALIGN(size)); |
2339 | 2345 | ||
2340 | return err; | 2346 | return err; |
2341 | } | 2347 | } |
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2516 | spinlock_t *ptl, pte_t orig_pte) | 2522 | spinlock_t *ptl, pte_t orig_pte) |
2517 | __releases(ptl) | 2523 | __releases(ptl) |
2518 | { | 2524 | { |
2519 | struct page *old_page, *new_page; | 2525 | struct page *old_page, *new_page = NULL; |
2520 | pte_t entry; | 2526 | pte_t entry; |
2521 | int ret = 0; | 2527 | int ret = 0; |
2522 | int page_mkwrite = 0; | 2528 | int page_mkwrite = 0; |
2523 | struct page *dirty_page = NULL; | 2529 | struct page *dirty_page = NULL; |
2530 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2531 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2532 | bool mmun_called = false; /* For mmu_notifiers */ | ||
2524 | 2533 | ||
2525 | old_page = vm_normal_page(vma, address, orig_pte); | 2534 | old_page = vm_normal_page(vma, address, orig_pte); |
2526 | if (!old_page) { | 2535 | if (!old_page) { |
@@ -2698,6 +2707,11 @@ gotten: | |||
2698 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2707 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
2699 | goto oom_free_new; | 2708 | goto oom_free_new; |
2700 | 2709 | ||
2710 | mmun_start = address & PAGE_MASK; | ||
2711 | mmun_end = (address & PAGE_MASK) + PAGE_SIZE; | ||
2712 | mmun_called = true; | ||
2713 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2714 | |||
2701 | /* | 2715 | /* |
2702 | * Re-check the pte - we dropped the lock | 2716 | * Re-check the pte - we dropped the lock |
2703 | */ | 2717 | */ |
@@ -2764,6 +2778,8 @@ gotten: | |||
2764 | page_cache_release(new_page); | 2778 | page_cache_release(new_page); |
2765 | unlock: | 2779 | unlock: |
2766 | pte_unmap_unlock(page_table, ptl); | 2780 | pte_unmap_unlock(page_table, ptl); |
2781 | if (mmun_called) | ||
2782 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2767 | if (old_page) { | 2783 | if (old_page) { |
2768 | /* | 2784 | /* |
2769 | * Don't let another task, with possibly unlocked vma, | 2785 | * Don't let another task, with possibly unlocked vma, |
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, | |||
2801 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); | 2817 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); |
2802 | } | 2818 | } |
2803 | 2819 | ||
2804 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | 2820 | static inline void unmap_mapping_range_tree(struct rb_root *root, |
2805 | struct zap_details *details) | 2821 | struct zap_details *details) |
2806 | { | 2822 | { |
2807 | struct vm_area_struct *vma; | 2823 | struct vm_area_struct *vma; |
2808 | struct prio_tree_iter iter; | ||
2809 | pgoff_t vba, vea, zba, zea; | 2824 | pgoff_t vba, vea, zba, zea; |
2810 | 2825 | ||
2811 | vma_prio_tree_foreach(vma, &iter, root, | 2826 | vma_interval_tree_foreach(vma, root, |
2812 | details->first_index, details->last_index) { | 2827 | details->first_index, details->last_index) { |
2813 | 2828 | ||
2814 | vba = vma->vm_pgoff; | 2829 | vba = vma->vm_pgoff; |
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, | |||
2839 | * across *all* the pages in each nonlinear VMA, not just the pages | 2854 | * across *all* the pages in each nonlinear VMA, not just the pages |
2840 | * whose virtual address lies outside the file truncation point. | 2855 | * whose virtual address lies outside the file truncation point. |
2841 | */ | 2856 | */ |
2842 | list_for_each_entry(vma, head, shared.vm_set.list) { | 2857 | list_for_each_entry(vma, head, shared.nonlinear) { |
2843 | details->nonlinear_vma = vma; | 2858 | details->nonlinear_vma = vma; |
2844 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | 2859 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); |
2845 | } | 2860 | } |
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2883 | 2898 | ||
2884 | 2899 | ||
2885 | mutex_lock(&mapping->i_mmap_mutex); | 2900 | mutex_lock(&mapping->i_mmap_mutex); |
2886 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) | 2901 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
2887 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2902 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2888 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2903 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
2889 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2904 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
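remap_pfn_range() now allows a copy-on-write mapping to be remapped only over the whole VMA and marks the VMA VM_DONTEXPAND | VM_DONTDUMP rather than VM_RESERVED. The COW test it relies on is the is_cow_mapping() helper from the hunks above; a standalone sketch with illustrative flag values:

/* Sketch of the is_cow_mapping() test used by remap_pfn_range(): a mapping
 * is COW when it may be written but is not shared. */
#include <stdio.h>
#include <stdbool.h>

#define VM_SHARED   0x1
#define VM_MAYWRITE 0x2

static bool is_cow_mapping(unsigned long flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

int main(void)
{
	printf("private+write : %d\n", is_cow_mapping(VM_MAYWRITE));              /* 1 */
	printf("shared+write  : %d\n", is_cow_mapping(VM_SHARED | VM_MAYWRITE));  /* 0 */
	printf("private+ro    : %d\n", is_cow_mapping(0));                        /* 0 */
	return 0;
}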
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a5b90d0cfd7..56b758ae57d2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, | |||
106 | void __ref put_page_bootmem(struct page *page) | 106 | void __ref put_page_bootmem(struct page *page) |
107 | { | 107 | { |
108 | unsigned long type; | 108 | unsigned long type; |
109 | struct zone *zone; | ||
109 | 110 | ||
110 | type = (unsigned long) page->lru.next; | 111 | type = (unsigned long) page->lru.next; |
111 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) | |||
116 | set_page_private(page, 0); | 117 | set_page_private(page, 0); |
117 | INIT_LIST_HEAD(&page->lru); | 118 | INIT_LIST_HEAD(&page->lru); |
118 | __free_pages_bootmem(page, 0); | 119 | __free_pages_bootmem(page, 0); |
120 | |||
121 | zone = page_zone(page); | ||
122 | zone_span_writelock(zone); | ||
123 | zone->present_pages++; | ||
124 | zone_span_writeunlock(zone); | ||
125 | totalram_pages++; | ||
119 | } | 126 | } |
120 | 127 | ||
121 | } | 128 | } |
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
362 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | 369 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); |
363 | BUG_ON(nr_pages % PAGES_PER_SECTION); | 370 | BUG_ON(nr_pages % PAGES_PER_SECTION); |
364 | 371 | ||
372 | release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); | ||
373 | |||
365 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | 374 | sections_to_remove = nr_pages / PAGES_PER_SECTION; |
366 | for (i = 0; i < sections_to_remove; i++) { | 375 | for (i = 0; i < sections_to_remove; i++) { |
367 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; | 376 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; |
368 | release_mem_region(pfn << PAGE_SHIFT, | ||
369 | PAGES_PER_SECTION << PAGE_SHIFT); | ||
370 | ret = __remove_section(zone, __pfn_to_section(pfn)); | 377 | ret = __remove_section(zone, __pfn_to_section(pfn)); |
371 | if (ret) | 378 | if (ret) |
372 | break; | 379 | break; |
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) | |||
756 | return 0; | 763 | return 0; |
757 | } | 764 | } |
758 | 765 | ||
759 | static struct page * | ||
760 | hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) | ||
761 | { | ||
762 | /* This should be improooooved!! */ | ||
763 | return alloc_page(GFP_HIGHUSER_MOVABLE); | ||
764 | } | ||
765 | |||
766 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | 766 | #define NR_OFFLINE_AT_ONCE_PAGES (256) |
767 | static int | 767 | static int |
768 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 768 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
813 | putback_lru_pages(&source); | 813 | putback_lru_pages(&source); |
814 | goto out; | 814 | goto out; |
815 | } | 815 | } |
816 | /* this function returns # of failed pages */ | 816 | |
817 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, | 817 | /* |
818 | * alloc_migrate_target should be improooooved!! | ||
819 | * migrate_pages returns # of failed pages. | ||
820 | */ | ||
821 | ret = migrate_pages(&source, alloc_migrate_target, 0, | ||
818 | true, MIGRATE_SYNC); | 822 | true, MIGRATE_SYNC); |
819 | if (ret) | 823 | if (ret) |
820 | putback_lru_pages(&source); | 824 | putback_lru_pages(&source); |
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
870 | return offlined; | 874 | return offlined; |
871 | } | 875 | } |
872 | 876 | ||
873 | static int __ref offline_pages(unsigned long start_pfn, | 877 | static int __ref __offline_pages(unsigned long start_pfn, |
874 | unsigned long end_pfn, unsigned long timeout) | 878 | unsigned long end_pfn, unsigned long timeout) |
875 | { | 879 | { |
876 | unsigned long pfn, nr_pages, expire; | 880 | unsigned long pfn, nr_pages, expire; |
@@ -970,8 +974,13 @@ repeat: | |||
970 | 974 | ||
971 | init_per_zone_wmark_min(); | 975 | init_per_zone_wmark_min(); |
972 | 976 | ||
973 | if (!populated_zone(zone)) | 977 | if (!populated_zone(zone)) { |
974 | zone_pcp_reset(zone); | 978 | zone_pcp_reset(zone); |
979 | mutex_lock(&zonelists_mutex); | ||
980 | build_all_zonelists(NULL, NULL); | ||
981 | mutex_unlock(&zonelists_mutex); | ||
982 | } else | ||
983 | zone_pcp_update(zone); | ||
975 | 984 | ||
976 | if (!node_present_pages(node)) { | 985 | if (!node_present_pages(node)) { |
977 | node_clear_state(node, N_HIGH_MEMORY); | 986 | node_clear_state(node, N_HIGH_MEMORY); |
@@ -998,15 +1007,55 @@ out: | |||
998 | return ret; | 1007 | return ret; |
999 | } | 1008 | } |
1000 | 1009 | ||
1010 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1011 | { | ||
1012 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | ||
1013 | } | ||
1014 | |||
1001 | int remove_memory(u64 start, u64 size) | 1015 | int remove_memory(u64 start, u64 size) |
1002 | { | 1016 | { |
1017 | struct memory_block *mem = NULL; | ||
1018 | struct mem_section *section; | ||
1003 | unsigned long start_pfn, end_pfn; | 1019 | unsigned long start_pfn, end_pfn; |
1020 | unsigned long pfn, section_nr; | ||
1021 | int ret; | ||
1004 | 1022 | ||
1005 | start_pfn = PFN_DOWN(start); | 1023 | start_pfn = PFN_DOWN(start); |
1006 | end_pfn = start_pfn + PFN_DOWN(size); | 1024 | end_pfn = start_pfn + PFN_DOWN(size); |
1007 | return offline_pages(start_pfn, end_pfn, 120 * HZ); | 1025 | |
1026 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | ||
1027 | section_nr = pfn_to_section_nr(pfn); | ||
1028 | if (!present_section_nr(section_nr)) | ||
1029 | continue; | ||
1030 | |||
1031 | section = __nr_to_section(section_nr); | ||
1032 | /* same memblock? */ | ||
1033 | if (mem) | ||
1034 | if ((section_nr >= mem->start_section_nr) && | ||
1035 | (section_nr <= mem->end_section_nr)) | ||
1036 | continue; | ||
1037 | |||
1038 | mem = find_memory_block_hinted(section, mem); | ||
1039 | if (!mem) | ||
1040 | continue; | ||
1041 | |||
1042 | ret = offline_memory_block(mem); | ||
1043 | if (ret) { | ||
1044 | kobject_put(&mem->dev.kobj); | ||
1045 | return ret; | ||
1046 | } | ||
1047 | } | ||
1048 | |||
1049 | if (mem) | ||
1050 | kobject_put(&mem->dev.kobj); | ||
1051 | |||
1052 | return 0; | ||
1008 | } | 1053 | } |
1009 | #else | 1054 | #else |
1055 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1056 | { | ||
1057 | return -EINVAL; | ||
1058 | } | ||
1010 | int remove_memory(u64 start, u64 size) | 1059 | int remove_memory(u64 start, u64 size) |
1011 | { | 1060 | { |
1012 | return -EINVAL; | 1061 | return -EINVAL; |
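The reworked remove_memory() steps through the pfn range one section at a time, skips absent sections, and takes care not to offline the same memory block twice. The sketch below shows just that per-block dedup; the section and block sizes are invented for the example and the offline call is a stub.

/* Sketch of the section walk the new remove_memory() does: step through the
 * pfn range one section at a time and offline each memory block only once. */
#include <stdio.h>

#define PAGES_PER_SECTION   32768UL	/* e.g. 128MB of 4K pages */
#define SECTIONS_PER_BLOCK  2UL

static void offline_block(unsigned long first_section)
{
	printf("offlining block starting at section %lu\n", first_section);
}

int main(void)
{
	unsigned long start_pfn = 0, end_pfn = 8 * PAGES_PER_SECTION;
	unsigned long pfn, section_nr;
	long cur_block = -1;	/* first section of the block handled last */

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn / PAGES_PER_SECTION;

		/* same memory block as last time? then it is already done */
		if (cur_block >= 0 &&
		    section_nr < (unsigned long)cur_block + SECTIONS_PER_BLOCK)
			continue;

		cur_block = section_nr - (section_nr % SECTIONS_PER_BLOCK);
		offline_block(cur_block);
	}
	return 0;
}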
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ada3be6e252..0b78fb9ea65b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
607 | return first; | 607 | return first; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* | ||
611 | * Apply policy to a single VMA | ||
612 | * This must be called with the mmap_sem held for writing. | ||
613 | */ | ||
614 | static int vma_replace_policy(struct vm_area_struct *vma, | ||
615 | struct mempolicy *pol) | ||
616 | { | ||
617 | int err; | ||
618 | struct mempolicy *old; | ||
619 | struct mempolicy *new; | ||
620 | |||
621 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
622 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
623 | vma->vm_ops, vma->vm_file, | ||
624 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
625 | |||
626 | new = mpol_dup(pol); | ||
627 | if (IS_ERR(new)) | ||
628 | return PTR_ERR(new); | ||
629 | |||
630 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
631 | err = vma->vm_ops->set_policy(vma, new); | ||
632 | if (err) | ||
633 | goto err_out; | ||
634 | } | ||
635 | |||
636 | old = vma->vm_policy; | ||
637 | vma->vm_policy = new; /* protected by mmap_sem */ | ||
638 | mpol_put(old); | ||
639 | |||
640 | return 0; | ||
641 | err_out: | ||
642 | mpol_put(new); | ||
643 | return err; | ||
644 | } | ||
645 | |||
610 | /* Step 2: apply policy to a range and do splits. */ | 646 | /* Step 2: apply policy to a range and do splits. */ |
611 | static int mbind_range(struct mm_struct *mm, unsigned long start, | 647 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
612 | unsigned long end, struct mempolicy *new_pol) | 648 | unsigned long end, struct mempolicy *new_pol) |
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
655 | if (err) | 691 | if (err) |
656 | goto out; | 692 | goto out; |
657 | } | 693 | } |
658 | 694 | err = vma_replace_policy(vma, new_pol); | |
659 | /* | 695 | if (err) |
660 | * Apply policy to a single VMA. The reference counting of | 696 | goto out; |
661 | * policy for vma_policy linkages has already been handled by | ||
662 | * vma_merge and split_vma as necessary. If this is a shared | ||
663 | * policy then ->set_policy will increment the reference count | ||
664 | * for an sp node. | ||
665 | */ | ||
666 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
667 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
668 | vma->vm_ops, vma->vm_file, | ||
669 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
670 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
671 | err = vma->vm_ops->set_policy(vma, new_pol); | ||
672 | if (err) | ||
673 | goto out; | ||
674 | } | ||
675 | } | 697 | } |
676 | 698 | ||
677 | out: | 699 | out: |
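The two mempolicy.c hunks above hoist the per-VMA policy swap out of mbind_range() into vma_replace_policy(): the helper duplicates the incoming policy with mpol_dup(), gives a shared mapping's ->set_policy hook the chance to reject it, and only then publishes the copy, relying on mmap_sem held for writing. A minimal sketch of the caller shape this produces; the wrapper function is illustrative and not part of the patch:

        static int apply_range_policy(struct mm_struct *mm, unsigned long start,
                                      unsigned long end, struct mempolicy *pol)
        {
                struct vm_area_struct *vma;
                int err = 0;

                down_write(&mm->mmap_sem);      /* vma_replace_policy() requires it */
                for (vma = find_vma(mm, start); vma && vma->vm_start < end;
                     vma = vma->vm_next) {
                        err = vma_replace_policy(vma, pol); /* dups pol, drops old */
                        if (err)
                                break;
                }
                up_write(&mm->mmap_sem);
                return err;
        }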
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
924 | nodemask_t nmask; | 946 | nodemask_t nmask; |
925 | LIST_HEAD(pagelist); | 947 | LIST_HEAD(pagelist); |
926 | int err = 0; | 948 | int err = 0; |
927 | struct vm_area_struct *vma; | ||
928 | 949 | ||
929 | nodes_clear(nmask); | 950 | nodes_clear(nmask); |
930 | node_set(source, nmask); | 951 | node_set(source, nmask); |
931 | 952 | ||
932 | vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | 953 | /* |
954 | * This does not "check" the range but isolates all pages that | ||
955 | * need migration. Between passing in the full user address | ||
956 | * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. | ||
957 | */ | ||
958 | VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); | ||
959 | check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | ||
933 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 960 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
934 | if (IS_ERR(vma)) | ||
935 | return PTR_ERR(vma); | ||
936 | 961 | ||
937 | if (!list_empty(&pagelist)) { | 962 | if (!list_empty(&pagelist)) { |
938 | err = migrate_pages(&pagelist, new_node_page, dest, | 963 | err = migrate_pages(&pagelist, new_node_page, dest, |
@@ -1530,8 +1555,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1530 | addr); | 1555 | addr); |
1531 | if (vpol) | 1556 | if (vpol) |
1532 | pol = vpol; | 1557 | pol = vpol; |
1533 | } else if (vma->vm_policy) | 1558 | } else if (vma->vm_policy) { |
1534 | pol = vma->vm_policy; | 1559 | pol = vma->vm_policy; |
1560 | |||
1561 | /* | ||
1562 | * shmem_alloc_page() passes MPOL_F_SHARED policy with | ||
1563 | * a pseudo vma whose vma->vm_ops=NULL. Take a reference | ||
1564 | * count on these policies which will be dropped by | ||
1565 | * mpol_cond_put() later | ||
1566 | */ | ||
1567 | if (mpol_needs_cond_ref(pol)) | ||
1568 | mpol_get(pol); | ||
1569 | } | ||
1535 | } | 1570 | } |
1536 | if (!pol) | 1571 | if (!pol) |
1537 | pol = &default_policy; | 1572 | pol = &default_policy; |
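The get_vma_policy() hunk above closes a reference-count gap for the pseudo-VMAs built by shmem_alloc_page(): they carry an MPOL_F_SHARED policy in vma->vm_policy but have no vm_ops, so the shared policy used to be returned without a reference while the caller still dropped one later. After the change every lookup pairs up as in this sketch (real mempolicy helpers, illustrative surroundings):

        struct mempolicy *pol;

        pol = get_vma_policy(current, vma, addr); /* refs the policy iff MPOL_F_SHARED */
        /* ... choose a node / allocate the page under pol ... */
        mpol_cond_put(pol);                       /* drops that conditional reference  */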
@@ -2061,7 +2096,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
2061 | */ | 2096 | */ |
2062 | 2097 | ||
2063 | /* lookup first element intersecting start-end */ | 2098 | /* lookup first element intersecting start-end */ |
2064 | /* Caller holds sp->lock */ | 2099 | /* Caller holds sp->mutex */ |
2065 | static struct sp_node * | 2100 | static struct sp_node * |
2066 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) | 2101 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) |
2067 | { | 2102 | { |
@@ -2125,36 +2160,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | |||
2125 | 2160 | ||
2126 | if (!sp->root.rb_node) | 2161 | if (!sp->root.rb_node) |
2127 | return NULL; | 2162 | return NULL; |
2128 | spin_lock(&sp->lock); | 2163 | mutex_lock(&sp->mutex); |
2129 | sn = sp_lookup(sp, idx, idx+1); | 2164 | sn = sp_lookup(sp, idx, idx+1); |
2130 | if (sn) { | 2165 | if (sn) { |
2131 | mpol_get(sn->policy); | 2166 | mpol_get(sn->policy); |
2132 | pol = sn->policy; | 2167 | pol = sn->policy; |
2133 | } | 2168 | } |
2134 | spin_unlock(&sp->lock); | 2169 | mutex_unlock(&sp->mutex); |
2135 | return pol; | 2170 | return pol; |
2136 | } | 2171 | } |
2137 | 2172 | ||
2173 | static void sp_free(struct sp_node *n) | ||
2174 | { | ||
2175 | mpol_put(n->policy); | ||
2176 | kmem_cache_free(sn_cache, n); | ||
2177 | } | ||
2178 | |||
2138 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2179 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2139 | { | 2180 | { |
2140 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); | 2181 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
2141 | rb_erase(&n->nd, &sp->root); | 2182 | rb_erase(&n->nd, &sp->root); |
2142 | mpol_put(n->policy); | 2183 | sp_free(n); |
2143 | kmem_cache_free(sn_cache, n); | ||
2144 | } | 2184 | } |
2145 | 2185 | ||
2146 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | 2186 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
2147 | struct mempolicy *pol) | 2187 | struct mempolicy *pol) |
2148 | { | 2188 | { |
2149 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 2189 | struct sp_node *n; |
2190 | struct mempolicy *newpol; | ||
2150 | 2191 | ||
2192 | n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | ||
2151 | if (!n) | 2193 | if (!n) |
2152 | return NULL; | 2194 | return NULL; |
2195 | |||
2196 | newpol = mpol_dup(pol); | ||
2197 | if (IS_ERR(newpol)) { | ||
2198 | kmem_cache_free(sn_cache, n); | ||
2199 | return NULL; | ||
2200 | } | ||
2201 | newpol->flags |= MPOL_F_SHARED; | ||
2202 | |||
2153 | n->start = start; | 2203 | n->start = start; |
2154 | n->end = end; | 2204 | n->end = end; |
2155 | mpol_get(pol); | 2205 | n->policy = newpol; |
2156 | pol->flags |= MPOL_F_SHARED; /* for unref */ | 2206 | |
2157 | n->policy = pol; | ||
2158 | return n; | 2207 | return n; |
2159 | } | 2208 | } |
2160 | 2209 | ||
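Three related changes sit in the hunk above: the shared_policy lock becomes a mutex, sp_free() folds the recurring mpol_put() plus kmem_cache_free() pair into one helper, and sp_alloc() now takes a private copy of the policy via mpol_dup() instead of grabbing a reference and mutating the caller's mempolicy. The mutex is what makes that workable, because mpol_dup() and the GFP_KERNEL slab allocation may sleep. Roughly, the insert path now looks like this sketch (sp, start, end, pol as in shared_policy_replace(); error handling trimmed):

        mutex_lock(&sp->mutex);         /* sleeping allocations are allowed here */
        n = sp_alloc(start, end, pol);  /* private copy, marked MPOL_F_SHARED    */
        if (!n) {
                mutex_unlock(&sp->mutex);
                return -ENOMEM;
        }
        sp_insert(sp, n);               /* the tree owns n; sp_free() is for error paths */
        mutex_unlock(&sp->mutex);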
@@ -2162,10 +2211,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | |||
2162 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | 2211 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, |
2163 | unsigned long end, struct sp_node *new) | 2212 | unsigned long end, struct sp_node *new) |
2164 | { | 2213 | { |
2165 | struct sp_node *n, *new2 = NULL; | 2214 | struct sp_node *n; |
2215 | int ret = 0; | ||
2166 | 2216 | ||
2167 | restart: | 2217 | mutex_lock(&sp->mutex); |
2168 | spin_lock(&sp->lock); | ||
2169 | n = sp_lookup(sp, start, end); | 2218 | n = sp_lookup(sp, start, end); |
2170 | /* Take care of old policies in the same range. */ | 2219 | /* Take care of old policies in the same range. */ |
2171 | while (n && n->start < end) { | 2220 | while (n && n->start < end) { |
@@ -2178,16 +2227,14 @@ restart: | |||
2178 | } else { | 2227 | } else { |
2179 | /* Old policy spanning whole new range. */ | 2228 | /* Old policy spanning whole new range. */ |
2180 | if (n->end > end) { | 2229 | if (n->end > end) { |
2230 | struct sp_node *new2; | ||
2231 | new2 = sp_alloc(end, n->end, n->policy); | ||
2181 | if (!new2) { | 2232 | if (!new2) { |
2182 | spin_unlock(&sp->lock); | 2233 | ret = -ENOMEM; |
2183 | new2 = sp_alloc(end, n->end, n->policy); | 2234 | goto out; |
2184 | if (!new2) | ||
2185 | return -ENOMEM; | ||
2186 | goto restart; | ||
2187 | } | 2235 | } |
2188 | n->end = start; | 2236 | n->end = start; |
2189 | sp_insert(sp, new2); | 2237 | sp_insert(sp, new2); |
2190 | new2 = NULL; | ||
2191 | break; | 2238 | break; |
2192 | } else | 2239 | } else |
2193 | n->end = start; | 2240 | n->end = start; |
@@ -2198,12 +2245,9 @@ restart: | |||
2198 | } | 2245 | } |
2199 | if (new) | 2246 | if (new) |
2200 | sp_insert(sp, new); | 2247 | sp_insert(sp, new); |
2201 | spin_unlock(&sp->lock); | 2248 | out: |
2202 | if (new2) { | 2249 | mutex_unlock(&sp->mutex); |
2203 | mpol_put(new2->policy); | 2250 | return ret; |
2204 | kmem_cache_free(sn_cache, new2); | ||
2205 | } | ||
2206 | return 0; | ||
2207 | } | 2251 | } |
2208 | 2252 | ||
2209 | /** | 2253 | /** |
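With the mutex in place, shared_policy_replace() above can drop its restart loop: the old code had to release the spinlock, allocate new2, and re-walk the tree, then remember to free an unused spare on the way out. The awkward case, an old node spanning the whole new range, is now handled inline, approximately as below (n is the node returned by sp_lookup(); ret and the out: label mirror the rewritten exit path):

        if (n->start < start && n->end > end) {        /* old node spans [start, end) */
                struct sp_node *new2 = sp_alloc(end, n->end, n->policy); /* may sleep */
                if (!new2) {
                        ret = -ENOMEM;
                        goto out;               /* out: releases sp->mutex */
                }
                n->end = start;                 /* keep the head of the old node */
                sp_insert(sp, new2);            /* re-add its tail as a new node */
        }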
@@ -2221,7 +2265,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
2221 | int ret; | 2265 | int ret; |
2222 | 2266 | ||
2223 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 2267 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
2224 | spin_lock_init(&sp->lock); | 2268 | mutex_init(&sp->mutex); |
2225 | 2269 | ||
2226 | if (mpol) { | 2270 | if (mpol) { |
2227 | struct vm_area_struct pvma; | 2271 | struct vm_area_struct pvma; |
@@ -2275,7 +2319,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
2275 | } | 2319 | } |
2276 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); | 2320 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); |
2277 | if (err && new) | 2321 | if (err && new) |
2278 | kmem_cache_free(sn_cache, new); | 2322 | sp_free(new); |
2279 | return err; | 2323 | return err; |
2280 | } | 2324 | } |
2281 | 2325 | ||
@@ -2287,16 +2331,14 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2287 | 2331 | ||
2288 | if (!p->root.rb_node) | 2332 | if (!p->root.rb_node) |
2289 | return; | 2333 | return; |
2290 | spin_lock(&p->lock); | 2334 | mutex_lock(&p->mutex); |
2291 | next = rb_first(&p->root); | 2335 | next = rb_first(&p->root); |
2292 | while (next) { | 2336 | while (next) { |
2293 | n = rb_entry(next, struct sp_node, nd); | 2337 | n = rb_entry(next, struct sp_node, nd); |
2294 | next = rb_next(&n->nd); | 2338 | next = rb_next(&n->nd); |
2295 | rb_erase(&n->nd, &p->root); | 2339 | sp_delete(p, n); |
2296 | mpol_put(n->policy); | ||
2297 | kmem_cache_free(sn_cache, n); | ||
2298 | } | 2340 | } |
2299 | spin_unlock(&p->lock); | 2341 | mutex_unlock(&p->mutex); |
2300 | } | 2342 | } |
2301 | 2343 | ||
2302 | /* assumes fs == KERNEL_DS */ | 2344 | /* assumes fs == KERNEL_DS */ |
diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8aa8e9..f0b9ce572fc7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock); | |||
51 | /* | 51 | /* |
52 | * LRU accounting for clear_page_mlock() | 52 | * LRU accounting for clear_page_mlock() |
53 | */ | 53 | */ |
54 | void __clear_page_mlock(struct page *page) | 54 | void clear_page_mlock(struct page *page) |
55 | { | 55 | { |
56 | VM_BUG_ON(!PageLocked(page)); | 56 | if (!TestClearPageMlocked(page)) |
57 | |||
58 | if (!page->mapping) { /* truncated ? */ | ||
59 | return; | 57 | return; |
60 | } | ||
61 | 58 | ||
62 | dec_zone_page_state(page, NR_MLOCK); | 59 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
60 | -hpage_nr_pages(page)); | ||
63 | count_vm_event(UNEVICTABLE_PGCLEARED); | 61 | count_vm_event(UNEVICTABLE_PGCLEARED); |
64 | if (!isolate_lru_page(page)) { | 62 | if (!isolate_lru_page(page)) { |
65 | putback_lru_page(page); | 63 | putback_lru_page(page); |
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page) | |||
81 | BUG_ON(!PageLocked(page)); | 79 | BUG_ON(!PageLocked(page)); |
82 | 80 | ||
83 | if (!TestSetPageMlocked(page)) { | 81 | if (!TestSetPageMlocked(page)) { |
84 | inc_zone_page_state(page, NR_MLOCK); | 82 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
83 | hpage_nr_pages(page)); | ||
85 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 84 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
86 | if (!isolate_lru_page(page)) | 85 | if (!isolate_lru_page(page)) |
87 | putback_lru_page(page); | 86 | putback_lru_page(page); |
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page) | |||
108 | BUG_ON(!PageLocked(page)); | 107 | BUG_ON(!PageLocked(page)); |
109 | 108 | ||
110 | if (TestClearPageMlocked(page)) { | 109 | if (TestClearPageMlocked(page)) { |
111 | dec_zone_page_state(page, NR_MLOCK); | 110 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
111 | -hpage_nr_pages(page)); | ||
112 | if (!isolate_lru_page(page)) { | 112 | if (!isolate_lru_page(page)) { |
113 | int ret = SWAP_AGAIN; | 113 | int ret = SWAP_AGAIN; |
114 | 114 | ||
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) |
228 | goto no_mlock; | 228 | goto no_mlock; |
229 | 229 | ||
230 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | 230 | if (!((vma->vm_flags & VM_DONTEXPAND) || |
231 | is_vm_hugetlb_page(vma) || | 231 | is_vm_hugetlb_page(vma) || |
232 | vma == get_gate_vma(current->mm))) { | 232 | vma == get_gate_vma(current->mm))) { |
233 | 233 | ||
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); | 290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); |
291 | if (page && !IS_ERR(page)) { | 291 | if (page && !IS_ERR(page)) { |
292 | lock_page(page); | 292 | lock_page(page); |
293 | /* | 293 | munlock_vma_page(page); |
294 | * Like in __mlock_vma_pages_range(), | ||
295 | * because we lock page here and migration is | ||
296 | * blocked by the elevated reference, we need | ||
297 | * only check for file-cache page truncation. | ||
298 | */ | ||
299 | if (page->mapping) | ||
300 | munlock_vma_page(page); | ||
301 | unlock_page(page); | 294 | unlock_page(page); |
302 | put_page(page); | 295 | put_page(page); |
303 | } | 296 | } |
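The mlock.c hunks move the NR_MLOCK accounting from inc/dec to mod_zone_page_state() with hpage_nr_pages(), so a transparent huge page shifts the counter by its full tail count rather than by one, and clear_page_mlock() now does its own TestClearPageMlocked() instead of leaning on callers to check page->mapping for truncation. The corrected pattern, in outline (hpage_nr_pages() is 1 for a normal page and HPAGE_PMD_NR, 512 with 4 KiB base pages and 2 MiB THP, for a huge page):

        if (TestClearPageMlocked(page))         /* only account pages that were mlocked */
                mod_zone_page_state(page_zone(page), NR_MLOCK,
                                    -hpage_nr_pages(page));  /* -1 or -HPAGE_PMD_NR */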
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, | |||
51 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 51 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
52 | unsigned long start, unsigned long end); | 52 | unsigned long start, unsigned long end); |
53 | 53 | ||
54 | /* | ||
55 | * WARNING: the debugging will use recursive algorithms so never enable this | ||
56 | * unless you know what you are doing. | ||
57 | */ | ||
58 | #undef DEBUG_MM_RB | ||
59 | |||
60 | /* description of effects of mapping type and prot in current implementation. | 54 | /* description of effects of mapping type and prot in current implementation. |
61 | * this is due to the limited x86 page protection hardware. The expected | 55 | * this is due to the limited x86 page protection hardware. The expected |
62 | * behavior is in parens: | 56 | * behavior is in parens: |
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
199 | 193 | ||
200 | flush_dcache_mmap_lock(mapping); | 194 | flush_dcache_mmap_lock(mapping); |
201 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 195 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
202 | list_del_init(&vma->shared.vm_set.list); | 196 | list_del_init(&vma->shared.nonlinear); |
203 | else | 197 | else |
204 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 198 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
205 | flush_dcache_mmap_unlock(mapping); | 199 | flush_dcache_mmap_unlock(mapping); |
206 | } | 200 | } |
207 | 201 | ||
208 | /* | 202 | /* |
209 | * Unlink a file-based vm structure from its prio_tree, to hide | 203 | * Unlink a file-based vm structure from its interval tree, to hide |
210 | * vma from rmap and vmtruncate before freeing its page tables. | 204 | * vma from rmap and vmtruncate before freeing its page tables. |
211 | */ | 205 | */ |
212 | void unlink_file_vma(struct vm_area_struct *vma) | 206 | void unlink_file_vma(struct vm_area_struct *vma) |
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |||
231 | might_sleep(); | 225 | might_sleep(); |
232 | if (vma->vm_ops && vma->vm_ops->close) | 226 | if (vma->vm_ops && vma->vm_ops->close) |
233 | vma->vm_ops->close(vma); | 227 | vma->vm_ops->close(vma); |
234 | if (vma->vm_file) { | 228 | if (vma->vm_file) |
235 | fput(vma->vm_file); | 229 | fput(vma->vm_file); |
236 | if (vma->vm_flags & VM_EXECUTABLE) | ||
237 | removed_exe_file_vma(vma->vm_mm); | ||
238 | } | ||
239 | mpol_put(vma_policy(vma)); | 230 | mpol_put(vma_policy(vma)); |
240 | kmem_cache_free(vm_area_cachep, vma); | 231 | kmem_cache_free(vm_area_cachep, vma); |
241 | return next; | 232 | return next; |
@@ -306,7 +297,7 @@ out: | |||
306 | return retval; | 297 | return retval; |
307 | } | 298 | } |
308 | 299 | ||
309 | #ifdef DEBUG_MM_RB | 300 | #ifdef CONFIG_DEBUG_VM_RB |
310 | static int browse_rb(struct rb_root *root) | 301 | static int browse_rb(struct rb_root *root) |
311 | { | 302 | { |
312 | int i = 0, j; | 303 | int i = 0, j; |
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm) | |||
340 | { | 331 | { |
341 | int bug = 0; | 332 | int bug = 0; |
342 | int i = 0; | 333 | int i = 0; |
343 | struct vm_area_struct *tmp = mm->mmap; | 334 | struct vm_area_struct *vma = mm->mmap; |
344 | while (tmp) { | 335 | while (vma) { |
345 | tmp = tmp->vm_next; | 336 | struct anon_vma_chain *avc; |
337 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
338 | anon_vma_interval_tree_verify(avc); | ||
339 | vma = vma->vm_next; | ||
346 | i++; | 340 | i++; |
347 | } | 341 | } |
348 | if (i != mm->map_count) | 342 | if (i != mm->map_count) |
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm) | |||
356 | #define validate_mm(mm) do { } while (0) | 350 | #define validate_mm(mm) do { } while (0) |
357 | #endif | 351 | #endif |
358 | 352 | ||
359 | static struct vm_area_struct * | 353 | /* |
360 | find_vma_prepare(struct mm_struct *mm, unsigned long addr, | 354 | * vma has some anon_vma assigned, and is already inserted on that |
361 | struct vm_area_struct **pprev, struct rb_node ***rb_link, | 355 | * anon_vma's interval trees. |
362 | struct rb_node ** rb_parent) | 356 | * |
357 | * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the | ||
358 | * vma must be removed from the anon_vma's interval trees using | ||
359 | * anon_vma_interval_tree_pre_update_vma(). | ||
360 | * | ||
361 | * After the update, the vma will be reinserted using | ||
362 | * anon_vma_interval_tree_post_update_vma(). | ||
363 | * | ||
364 | * The entire update must be protected by exclusive mmap_sem and by | ||
365 | * the root anon_vma's mutex. | ||
366 | */ | ||
367 | static inline void | ||
368 | anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) | ||
363 | { | 369 | { |
364 | struct vm_area_struct * vma; | 370 | struct anon_vma_chain *avc; |
365 | struct rb_node ** __rb_link, * __rb_parent, * rb_prev; | 371 | |
372 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
373 | anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); | ||
374 | } | ||
375 | |||
376 | static inline void | ||
377 | anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) | ||
378 | { | ||
379 | struct anon_vma_chain *avc; | ||
380 | |||
381 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
382 | anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); | ||
383 | } | ||
384 | |||
385 | static int find_vma_links(struct mm_struct *mm, unsigned long addr, | ||
386 | unsigned long end, struct vm_area_struct **pprev, | ||
387 | struct rb_node ***rb_link, struct rb_node **rb_parent) | ||
388 | { | ||
389 | struct rb_node **__rb_link, *__rb_parent, *rb_prev; | ||
366 | 390 | ||
367 | __rb_link = &mm->mm_rb.rb_node; | 391 | __rb_link = &mm->mm_rb.rb_node; |
368 | rb_prev = __rb_parent = NULL; | 392 | rb_prev = __rb_parent = NULL; |
369 | vma = NULL; | ||
370 | 393 | ||
371 | while (*__rb_link) { | 394 | while (*__rb_link) { |
372 | struct vm_area_struct *vma_tmp; | 395 | struct vm_area_struct *vma_tmp; |
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
375 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); | 398 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); |
376 | 399 | ||
377 | if (vma_tmp->vm_end > addr) { | 400 | if (vma_tmp->vm_end > addr) { |
378 | vma = vma_tmp; | 401 | /* Fail if an existing vma overlaps the area */ |
379 | if (vma_tmp->vm_start <= addr) | 402 | if (vma_tmp->vm_start < end) |
380 | break; | 403 | return -ENOMEM; |
381 | __rb_link = &__rb_parent->rb_left; | 404 | __rb_link = &__rb_parent->rb_left; |
382 | } else { | 405 | } else { |
383 | rb_prev = __rb_parent; | 406 | rb_prev = __rb_parent; |
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
390 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); | 413 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); |
391 | *rb_link = __rb_link; | 414 | *rb_link = __rb_link; |
392 | *rb_parent = __rb_parent; | 415 | *rb_parent = __rb_parent; |
393 | return vma; | 416 | return 0; |
394 | } | 417 | } |
395 | 418 | ||
396 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 419 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
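find_vma_prepare() becomes find_vma_links() in the hunks above: rather than returning an overlapping vma for every caller to re-test, it fails with -ENOMEM as soon as an existing vma intersects [addr, end) and fills in prev/rb_link/rb_parent only on success. The call sites later in this diff (mmap_region(), do_brk(), insert_vm_struct(), copy_vma()) all reduce to the shape sketched here; the munmap-and-retry branch is the mmap_region()/do_brk() flavour:

        struct vm_area_struct *prev;
        struct rb_node **rb_link, *rb_parent;

        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
                if (do_munmap(mm, addr, len))   /* clear the overlap, then retry */
                        return -ENOMEM;
                goto munmap_back;               /* label as in mmap_region()/do_brk() */
        }
        vma_link(mm, vma, prev, rb_link, rb_parent);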
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
417 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 440 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
418 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 441 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
419 | else | 442 | else |
420 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 443 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
421 | flush_dcache_mmap_unlock(mapping); | 444 | flush_dcache_mmap_unlock(mapping); |
422 | } | 445 | } |
423 | } | 446 | } |
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
455 | 478 | ||
456 | /* | 479 | /* |
457 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the | 480 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
458 | * mm's list and rbtree. It has already been inserted into the prio_tree. | 481 | * mm's list and rbtree. It has already been inserted into the interval tree. |
459 | */ | 482 | */ |
460 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 483 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
461 | { | 484 | { |
462 | struct vm_area_struct *__vma, *prev; | 485 | struct vm_area_struct *prev; |
463 | struct rb_node **rb_link, *rb_parent; | 486 | struct rb_node **rb_link, *rb_parent; |
464 | 487 | ||
465 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); | 488 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
466 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); | 489 | &prev, &rb_link, &rb_parent)) |
490 | BUG(); | ||
467 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 491 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
468 | mm->map_count++; | 492 | mm->map_count++; |
469 | } | 493 | } |
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
496 | struct vm_area_struct *next = vma->vm_next; | 520 | struct vm_area_struct *next = vma->vm_next; |
497 | struct vm_area_struct *importer = NULL; | 521 | struct vm_area_struct *importer = NULL; |
498 | struct address_space *mapping = NULL; | 522 | struct address_space *mapping = NULL; |
499 | struct prio_tree_root *root = NULL; | 523 | struct rb_root *root = NULL; |
500 | struct anon_vma *anon_vma = NULL; | 524 | struct anon_vma *anon_vma = NULL; |
501 | struct file *file = vma->vm_file; | 525 | struct file *file = vma->vm_file; |
502 | long adjust_next = 0; | 526 | long adjust_next = 0; |
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
559 | mutex_lock(&mapping->i_mmap_mutex); | 583 | mutex_lock(&mapping->i_mmap_mutex); |
560 | if (insert) { | 584 | if (insert) { |
561 | /* | 585 | /* |
562 | * Put into prio_tree now, so instantiated pages | 586 | * Put into interval tree now, so instantiated pages |
563 | * are visible to arm/parisc __flush_dcache_page | 587 | * are visible to arm/parisc __flush_dcache_page |
564 | * throughout; but we cannot insert into address | 588 | * throughout; but we cannot insert into address |
565 | * space until vma start or end is updated. | 589 | * space until vma start or end is updated. |
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end); | |||
570 | 594 | ||
571 | vma_adjust_trans_huge(vma, start, end, adjust_next); | 595 | vma_adjust_trans_huge(vma, start, end, adjust_next); |
572 | 596 | ||
573 | /* | 597 | anon_vma = vma->anon_vma; |
574 | * When changing only vma->vm_end, we don't really need anon_vma | 598 | if (!anon_vma && adjust_next) |
575 | * lock. This is a fairly rare case by itself, but the anon_vma | 599 | anon_vma = next->anon_vma; |
576 | * lock may be shared between many sibling processes. Skipping | 600 | if (anon_vma) { |
577 | * the lock for brk adjustments makes a difference sometimes. | 601 | VM_BUG_ON(adjust_next && next->anon_vma && |
578 | */ | 602 | anon_vma != next->anon_vma); |
579 | if (vma->anon_vma && (importer || start != vma->vm_start)) { | ||
580 | anon_vma = vma->anon_vma; | ||
581 | anon_vma_lock(anon_vma); | 603 | anon_vma_lock(anon_vma); |
604 | anon_vma_interval_tree_pre_update_vma(vma); | ||
605 | if (adjust_next) | ||
606 | anon_vma_interval_tree_pre_update_vma(next); | ||
582 | } | 607 | } |
583 | 608 | ||
584 | if (root) { | 609 | if (root) { |
585 | flush_dcache_mmap_lock(mapping); | 610 | flush_dcache_mmap_lock(mapping); |
586 | vma_prio_tree_remove(vma, root); | 611 | vma_interval_tree_remove(vma, root); |
587 | if (adjust_next) | 612 | if (adjust_next) |
588 | vma_prio_tree_remove(next, root); | 613 | vma_interval_tree_remove(next, root); |
589 | } | 614 | } |
590 | 615 | ||
591 | vma->vm_start = start; | 616 | vma->vm_start = start; |
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
598 | 623 | ||
599 | if (root) { | 624 | if (root) { |
600 | if (adjust_next) | 625 | if (adjust_next) |
601 | vma_prio_tree_insert(next, root); | 626 | vma_interval_tree_insert(next, root); |
602 | vma_prio_tree_insert(vma, root); | 627 | vma_interval_tree_insert(vma, root); |
603 | flush_dcache_mmap_unlock(mapping); | 628 | flush_dcache_mmap_unlock(mapping); |
604 | } | 629 | } |
605 | 630 | ||
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end); | |||
620 | __insert_vm_struct(mm, insert); | 645 | __insert_vm_struct(mm, insert); |
621 | } | 646 | } |
622 | 647 | ||
623 | if (anon_vma) | 648 | if (anon_vma) { |
649 | anon_vma_interval_tree_post_update_vma(vma); | ||
650 | if (adjust_next) | ||
651 | anon_vma_interval_tree_post_update_vma(next); | ||
624 | anon_vma_unlock(anon_vma); | 652 | anon_vma_unlock(anon_vma); |
653 | } | ||
625 | if (mapping) | 654 | if (mapping) |
626 | mutex_unlock(&mapping->i_mmap_mutex); | 655 | mutex_unlock(&mapping->i_mmap_mutex); |
627 | 656 | ||
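The vma_adjust() hunks above spell out the protocol behind the new anon_vma_interval_tree_pre/post_update_vma() helpers: a vma that is already linked into its anon_vma interval trees must be taken out before vm_start/vm_end/vm_pgoff change and re-inserted afterwards, all under the root anon_vma mutex with mmap_sem held exclusively. Reduced to a single vma, it is the same sequence the stack-growth paths further down also adopt (new_start and new_pgoff are placeholders):

        anon_vma_lock(vma->anon_vma);
        anon_vma_interval_tree_pre_update_vma(vma);     /* remove from the trees     */
        vma->vm_start = new_start;                      /* the interval may change   */
        vma->vm_pgoff = new_pgoff;
        anon_vma_interval_tree_post_update_vma(vma);    /* re-insert with new extent */
        anon_vma_unlock(vma->anon_vma);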
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
636 | if (file) { | 665 | if (file) { |
637 | uprobe_munmap(next, next->vm_start, next->vm_end); | 666 | uprobe_munmap(next, next->vm_start, next->vm_end); |
638 | fput(file); | 667 | fput(file); |
639 | if (next->vm_flags & VM_EXECUTABLE) | ||
640 | removed_exe_file_vma(mm); | ||
641 | } | 668 | } |
642 | if (next->anon_vma) | 669 | if (next->anon_vma) |
643 | anon_vma_merge(vma, next); | 670 | anon_vma_merge(vma, next); |
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
669 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 696 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
670 | struct file *file, unsigned long vm_flags) | 697 | struct file *file, unsigned long vm_flags) |
671 | { | 698 | { |
672 | /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ | 699 | if (vma->vm_flags ^ vm_flags) |
673 | if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) | ||
674 | return 0; | 700 | return 0; |
675 | if (vma->vm_file != file) | 701 | if (vma->vm_file != file) |
676 | return 0; | 702 | return 0; |
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
951 | mm->exec_vm += pages; | 977 | mm->exec_vm += pages; |
952 | } else if (flags & stack_flags) | 978 | } else if (flags & stack_flags) |
953 | mm->stack_vm += pages; | 979 | mm->stack_vm += pages; |
954 | if (flags & (VM_RESERVED|VM_IO)) | ||
955 | mm->reserved_vm += pages; | ||
956 | } | 980 | } |
957 | #endif /* CONFIG_PROC_FS */ | 981 | #endif /* CONFIG_PROC_FS */ |
958 | 982 | ||
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) | |||
1190 | return 0; | 1214 | return 0; |
1191 | 1215 | ||
1192 | /* Specialty mapping? */ | 1216 | /* Specialty mapping? */ |
1193 | if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) | 1217 | if (vm_flags & VM_PFNMAP) |
1194 | return 0; | 1218 | return 0; |
1195 | 1219 | ||
1196 | /* Can the mapping track the dirty pages? */ | 1220 | /* Can the mapping track the dirty pages? */ |
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1229 | /* Clear old maps */ | 1253 | /* Clear old maps */ |
1230 | error = -ENOMEM; | 1254 | error = -ENOMEM; |
1231 | munmap_back: | 1255 | munmap_back: |
1232 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 1256 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
1233 | if (vma && vma->vm_start < addr + len) { | ||
1234 | if (do_munmap(mm, addr, len)) | 1257 | if (do_munmap(mm, addr, len)) |
1235 | return -ENOMEM; | 1258 | return -ENOMEM; |
1236 | goto munmap_back; | 1259 | goto munmap_back; |
@@ -1305,8 +1328,6 @@ munmap_back: | |||
1305 | error = file->f_op->mmap(file, vma); | 1328 | error = file->f_op->mmap(file, vma); |
1306 | if (error) | 1329 | if (error) |
1307 | goto unmap_and_free_vma; | 1330 | goto unmap_and_free_vma; |
1308 | if (vm_flags & VM_EXECUTABLE) | ||
1309 | added_exe_file_vma(mm); | ||
1310 | 1331 | ||
1311 | /* Can addr have changed?? | 1332 | /* Can addr have changed?? |
1312 | * | 1333 | * |
@@ -1757,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1757 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 1778 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
1758 | error = acct_stack_growth(vma, size, grow); | 1779 | error = acct_stack_growth(vma, size, grow); |
1759 | if (!error) { | 1780 | if (!error) { |
1781 | anon_vma_interval_tree_pre_update_vma(vma); | ||
1760 | vma->vm_end = address; | 1782 | vma->vm_end = address; |
1783 | anon_vma_interval_tree_post_update_vma(vma); | ||
1761 | perf_event_mmap(vma); | 1784 | perf_event_mmap(vma); |
1762 | } | 1785 | } |
1763 | } | 1786 | } |
1764 | } | 1787 | } |
1765 | vma_unlock_anon_vma(vma); | 1788 | vma_unlock_anon_vma(vma); |
1766 | khugepaged_enter_vma_merge(vma); | 1789 | khugepaged_enter_vma_merge(vma); |
1790 | validate_mm(vma->vm_mm); | ||
1767 | return error; | 1791 | return error; |
1768 | } | 1792 | } |
1769 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 1793 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
@@ -1807,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1807 | if (grow <= vma->vm_pgoff) { | 1831 | if (grow <= vma->vm_pgoff) { |
1808 | error = acct_stack_growth(vma, size, grow); | 1832 | error = acct_stack_growth(vma, size, grow); |
1809 | if (!error) { | 1833 | if (!error) { |
1834 | anon_vma_interval_tree_pre_update_vma(vma); | ||
1810 | vma->vm_start = address; | 1835 | vma->vm_start = address; |
1811 | vma->vm_pgoff -= grow; | 1836 | vma->vm_pgoff -= grow; |
1837 | anon_vma_interval_tree_post_update_vma(vma); | ||
1812 | perf_event_mmap(vma); | 1838 | perf_event_mmap(vma); |
1813 | } | 1839 | } |
1814 | } | 1840 | } |
1815 | } | 1841 | } |
1816 | vma_unlock_anon_vma(vma); | 1842 | vma_unlock_anon_vma(vma); |
1817 | khugepaged_enter_vma_merge(vma); | 1843 | khugepaged_enter_vma_merge(vma); |
1844 | validate_mm(vma->vm_mm); | ||
1818 | return error; | 1845 | return error; |
1819 | } | 1846 | } |
1820 | 1847 | ||
@@ -1988,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1988 | if (anon_vma_clone(new, vma)) | 2015 | if (anon_vma_clone(new, vma)) |
1989 | goto out_free_mpol; | 2016 | goto out_free_mpol; |
1990 | 2017 | ||
1991 | if (new->vm_file) { | 2018 | if (new->vm_file) |
1992 | get_file(new->vm_file); | 2019 | get_file(new->vm_file); |
1993 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1994 | added_exe_file_vma(mm); | ||
1995 | } | ||
1996 | 2020 | ||
1997 | if (new->vm_ops && new->vm_ops->open) | 2021 | if (new->vm_ops && new->vm_ops->open) |
1998 | new->vm_ops->open(new); | 2022 | new->vm_ops->open(new); |
@@ -2010,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
2010 | /* Clean everything up if vma_adjust failed. */ | 2034 | /* Clean everything up if vma_adjust failed. */ |
2011 | if (new->vm_ops && new->vm_ops->close) | 2035 | if (new->vm_ops && new->vm_ops->close) |
2012 | new->vm_ops->close(new); | 2036 | new->vm_ops->close(new); |
2013 | if (new->vm_file) { | 2037 | if (new->vm_file) |
2014 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2015 | removed_exe_file_vma(mm); | ||
2016 | fput(new->vm_file); | 2038 | fput(new->vm_file); |
2017 | } | ||
2018 | unlink_anon_vmas(new); | 2039 | unlink_anon_vmas(new); |
2019 | out_free_mpol: | 2040 | out_free_mpol: |
2020 | mpol_put(pol); | 2041 | mpol_put(pol); |
@@ -2199,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2199 | * Clear old maps. this also does some error checking for us | 2220 | * Clear old maps. this also does some error checking for us |
2200 | */ | 2221 | */ |
2201 | munmap_back: | 2222 | munmap_back: |
2202 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2223 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
2203 | if (vma && vma->vm_start < addr + len) { | ||
2204 | if (do_munmap(mm, addr, len)) | 2224 | if (do_munmap(mm, addr, len)) |
2205 | return -ENOMEM; | 2225 | return -ENOMEM; |
2206 | goto munmap_back; | 2226 | goto munmap_back; |
@@ -2314,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm) | |||
2314 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2334 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2315 | * then i_mmap_mutex is taken here. | 2335 | * then i_mmap_mutex is taken here. |
2316 | */ | 2336 | */ |
2317 | int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | 2337 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
2318 | { | 2338 | { |
2319 | struct vm_area_struct * __vma, * prev; | 2339 | struct vm_area_struct *prev; |
2320 | struct rb_node ** rb_link, * rb_parent; | 2340 | struct rb_node **rb_link, *rb_parent; |
2321 | 2341 | ||
2322 | /* | 2342 | /* |
2323 | * The vm_pgoff of a purely anonymous vma should be irrelevant | 2343 | * The vm_pgoff of a purely anonymous vma should be irrelevant |
@@ -2335,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2335 | BUG_ON(vma->anon_vma); | 2355 | BUG_ON(vma->anon_vma); |
2336 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | 2356 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; |
2337 | } | 2357 | } |
2338 | __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); | 2358 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
2339 | if (__vma && __vma->vm_start < vma->vm_end) | 2359 | &prev, &rb_link, &rb_parent)) |
2340 | return -ENOMEM; | 2360 | return -ENOMEM; |
2341 | if ((vma->vm_flags & VM_ACCOUNT) && | 2361 | if ((vma->vm_flags & VM_ACCOUNT) && |
2342 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2362 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
@@ -2351,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2351 | * prior to moving page table entries, to effect an mremap move. | 2371 | * prior to moving page table entries, to effect an mremap move. |
2352 | */ | 2372 | */ |
2353 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | 2373 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, |
2354 | unsigned long addr, unsigned long len, pgoff_t pgoff) | 2374 | unsigned long addr, unsigned long len, pgoff_t pgoff, |
2375 | bool *need_rmap_locks) | ||
2355 | { | 2376 | { |
2356 | struct vm_area_struct *vma = *vmap; | 2377 | struct vm_area_struct *vma = *vmap; |
2357 | unsigned long vma_start = vma->vm_start; | 2378 | unsigned long vma_start = vma->vm_start; |
@@ -2370,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2370 | faulted_in_anon_vma = false; | 2391 | faulted_in_anon_vma = false; |
2371 | } | 2392 | } |
2372 | 2393 | ||
2373 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2394 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) |
2395 | return NULL; /* should never get here */ | ||
2374 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2396 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
2375 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | 2397 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); |
2376 | if (new_vma) { | 2398 | if (new_vma) { |
@@ -2392,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2392 | * linear if there are no pages mapped yet. | 2414 | * linear if there are no pages mapped yet. |
2393 | */ | 2415 | */ |
2394 | VM_BUG_ON(faulted_in_anon_vma); | 2416 | VM_BUG_ON(faulted_in_anon_vma); |
2395 | *vmap = new_vma; | 2417 | *vmap = vma = new_vma; |
2396 | } else | 2418 | } |
2397 | anon_vma_moveto_tail(new_vma); | 2419 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
2398 | } else { | 2420 | } else { |
2399 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2421 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2400 | if (new_vma) { | 2422 | if (new_vma) { |
2401 | *new_vma = *vma; | 2423 | *new_vma = *vma; |
2424 | new_vma->vm_start = addr; | ||
2425 | new_vma->vm_end = addr + len; | ||
2426 | new_vma->vm_pgoff = pgoff; | ||
2402 | pol = mpol_dup(vma_policy(vma)); | 2427 | pol = mpol_dup(vma_policy(vma)); |
2403 | if (IS_ERR(pol)) | 2428 | if (IS_ERR(pol)) |
2404 | goto out_free_vma; | 2429 | goto out_free_vma; |
2430 | vma_set_policy(new_vma, pol); | ||
2405 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); | 2431 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2406 | if (anon_vma_clone(new_vma, vma)) | 2432 | if (anon_vma_clone(new_vma, vma)) |
2407 | goto out_free_mempol; | 2433 | goto out_free_mempol; |
2408 | vma_set_policy(new_vma, pol); | 2434 | if (new_vma->vm_file) |
2409 | new_vma->vm_start = addr; | ||
2410 | new_vma->vm_end = addr + len; | ||
2411 | new_vma->vm_pgoff = pgoff; | ||
2412 | if (new_vma->vm_file) { | ||
2413 | get_file(new_vma->vm_file); | 2435 | get_file(new_vma->vm_file); |
2414 | |||
2415 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2416 | added_exe_file_vma(mm); | ||
2417 | } | ||
2418 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2436 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
2419 | new_vma->vm_ops->open(new_vma); | 2437 | new_vma->vm_ops->open(new_vma); |
2420 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2438 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
2439 | *need_rmap_locks = false; | ||
2421 | } | 2440 | } |
2422 | } | 2441 | } |
2423 | return new_vma; | 2442 | return new_vma; |
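copy_vma() above gains a need_rmap_locks output: when the copied or merged vma ends up with a vm_pgoff no larger than the original's, rmap walkers could reach pages at their old location while the PTEs are being moved, so the mremap path must hold the rmap locks across the move. A sketch of the expected caller in mm/mremap.c; argument names are illustrative, and the matching move_page_tables() flag is added by a companion patch in this series:

        bool need_rmap_locks;
        struct vm_area_struct *new_vma;

        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks);
        if (!new_vma)
                return -ENOMEM;
        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                                     need_rmap_locks);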
@@ -2535,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); | |||
2535 | 2554 | ||
2536 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | 2555 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) |
2537 | { | 2556 | { |
2538 | if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { | 2557 | if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2539 | /* | 2558 | /* |
2540 | * The LSB of head.next can't change from under us | 2559 | * The LSB of head.next can't change from under us |
2541 | * because we hold the mm_all_locks_mutex. | 2560 | * because we hold the mm_all_locks_mutex. |
@@ -2551,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2551 | * anon_vma->root->mutex. | 2570 | * anon_vma->root->mutex. |
2552 | */ | 2571 | */ |
2553 | if (__test_and_set_bit(0, (unsigned long *) | 2572 | if (__test_and_set_bit(0, (unsigned long *) |
2554 | &anon_vma->root->head.next)) | 2573 | &anon_vma->root->rb_root.rb_node)) |
2555 | BUG(); | 2574 | BUG(); |
2556 | } | 2575 | } |
2557 | } | 2576 | } |
@@ -2592,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2592 | * A single task can't take more than one mm_take_all_locks() in a row | 2611 | * A single task can't take more than one mm_take_all_locks() in a row |
2593 | * or it would deadlock. | 2612 | * or it would deadlock. |
2594 | * | 2613 | * |
2595 | * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in | 2614 | * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in |
2596 | * mapping->flags avoid to take the same lock twice, if more than one | 2615 | * mapping->flags avoid to take the same lock twice, if more than one |
2597 | * vma in this mm is backed by the same anon_vma or address_space. | 2616 | * vma in this mm is backed by the same anon_vma or address_space. |
2598 | * | 2617 | * |
@@ -2639,13 +2658,13 @@ out_unlock: | |||
2639 | 2658 | ||
2640 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | 2659 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) |
2641 | { | 2660 | { |
2642 | if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { | 2661 | if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2643 | /* | 2662 | /* |
2644 | * The LSB of head.next can't change to 0 from under | 2663 | * The LSB of head.next can't change to 0 from under |
2645 | * us because we hold the mm_all_locks_mutex. | 2664 | * us because we hold the mm_all_locks_mutex. |
2646 | * | 2665 | * |
2647 | * We must however clear the bitflag before unlocking | 2666 | * We must however clear the bitflag before unlocking |
2648 | * the vma so the users using the anon_vma->head will | 2667 | * the vma so the users using the anon_vma->rb_root will |
2649 | * never see our bitflag. | 2668 | * never see our bitflag. |
2650 | * | 2669 | * |
2651 | * No need of atomic instructions here, head.next | 2670 | * No need of atomic instructions here, head.next |
@@ -2653,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2653 | * anon_vma->root->mutex. | 2672 | * anon_vma->root->mutex. |
2654 | */ | 2673 | */ |
2655 | if (!__test_and_clear_bit(0, (unsigned long *) | 2674 | if (!__test_and_clear_bit(0, (unsigned long *) |
2656 | &anon_vma->root->head.next)) | 2675 | &anon_vma->root->rb_root.rb_node)) |
2657 | BUG(); | 2676 | BUG(); |
2658 | anon_vma_unlock(anon_vma); | 2677 | anon_vma_unlock(anon_vma); |
2659 | } | 2678 | } |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b60822d9f..479a1e751a73 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -14,10 +14,14 @@ | |||
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
17 | #include <linux/srcu.h> | ||
17 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
20 | 21 | ||
22 | /* global SRCU for all MMs */ | ||
23 | static struct srcu_struct srcu; | ||
24 | |||
21 | /* | 25 | /* |
22 | * This function can't run concurrently against mmu_notifier_register | 26 | * This function can't run concurrently against mmu_notifier_register |
23 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | 27 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap |
@@ -25,8 +29,8 @@ | |||
25 | * in parallel despite there being no task using this mm any more, | 29 | * in parallel despite there being no task using this mm any more, |
26 | * through the vmas outside of the exit_mmap context, such as with | 30 | * through the vmas outside of the exit_mmap context, such as with |
27 | * vmtruncate. This serializes against mmu_notifier_unregister with | 31 | * vmtruncate. This serializes against mmu_notifier_unregister with |
28 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | 32 | * the mmu_notifier_mm->lock in addition to SRCU and it serializes |
29 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | 33 | * against the other mmu notifiers with SRCU. struct mmu_notifier_mm |
30 | * can't go away from under us as exit_mmap holds an mm_count pin | 34 | * can't go away from under us as exit_mmap holds an mm_count pin |
31 | * itself. | 35 | * itself. |
32 | */ | 36 | */ |
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
34 | { | 38 | { |
35 | struct mmu_notifier *mn; | 39 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | 40 | struct hlist_node *n; |
41 | int id; | ||
37 | 42 | ||
38 | /* | 43 | /* |
39 | * RCU here will block mmu_notifier_unregister until | 44 | * SRCU here will block mmu_notifier_unregister until |
40 | * ->release returns. | 45 | * ->release returns. |
41 | */ | 46 | */ |
42 | rcu_read_lock(); | 47 | id = srcu_read_lock(&srcu); |
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | 48 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) |
44 | /* | 49 | /* |
45 | * if ->release runs before mmu_notifier_unregister it | 50 | * if ->release runs before mmu_notifier_unregister it |
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
50 | */ | 55 | */ |
51 | if (mn->ops->release) | 56 | if (mn->ops->release) |
52 | mn->ops->release(mn, mm); | 57 | mn->ops->release(mn, mm); |
53 | rcu_read_unlock(); | 58 | srcu_read_unlock(&srcu, id); |
54 | 59 | ||
55 | spin_lock(&mm->mmu_notifier_mm->lock); | 60 | spin_lock(&mm->mmu_notifier_mm->lock); |
56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 61 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
68 | spin_unlock(&mm->mmu_notifier_mm->lock); | 73 | spin_unlock(&mm->mmu_notifier_mm->lock); |
69 | 74 | ||
70 | /* | 75 | /* |
71 | * synchronize_rcu here prevents mmu_notifier_release to | 76 | * synchronize_srcu here prevents mmu_notifier_release to |
72 | * return to exit_mmap (which would proceed freeing all pages | 77 | * return to exit_mmap (which would proceed freeing all pages |
73 | * in the mm) until the ->release method returns, if it was | 78 | * in the mm) until the ->release method returns, if it was |
74 | * invoked by mmu_notifier_unregister. | 79 | * invoked by mmu_notifier_unregister. |
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
76 | * The mmu_notifier_mm can't go away from under us because one | 81 | * The mmu_notifier_mm can't go away from under us because one |
77 | * mm_count is hold by exit_mmap. | 82 | * mm_count is hold by exit_mmap. |
78 | */ | 83 | */ |
79 | synchronize_rcu(); | 84 | synchronize_srcu(&srcu); |
80 | } | 85 | } |
81 | 86 | ||
82 | /* | 87 | /* |
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
89 | { | 94 | { |
90 | struct mmu_notifier *mn; | 95 | struct mmu_notifier *mn; |
91 | struct hlist_node *n; | 96 | struct hlist_node *n; |
92 | int young = 0; | 97 | int young = 0, id; |
93 | 98 | ||
94 | rcu_read_lock(); | 99 | id = srcu_read_lock(&srcu); |
95 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 100 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
96 | if (mn->ops->clear_flush_young) | 101 | if (mn->ops->clear_flush_young) |
97 | young |= mn->ops->clear_flush_young(mn, mm, address); | 102 | young |= mn->ops->clear_flush_young(mn, mm, address); |
98 | } | 103 | } |
99 | rcu_read_unlock(); | 104 | srcu_read_unlock(&srcu, id); |
100 | 105 | ||
101 | return young; | 106 | return young; |
102 | } | 107 | } |
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
106 | { | 111 | { |
107 | struct mmu_notifier *mn; | 112 | struct mmu_notifier *mn; |
108 | struct hlist_node *n; | 113 | struct hlist_node *n; |
109 | int young = 0; | 114 | int young = 0, id; |
110 | 115 | ||
111 | rcu_read_lock(); | 116 | id = srcu_read_lock(&srcu); |
112 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 117 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
113 | if (mn->ops->test_young) { | 118 | if (mn->ops->test_young) { |
114 | young = mn->ops->test_young(mn, mm, address); | 119 | young = mn->ops->test_young(mn, mm, address); |
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
116 | break; | 121 | break; |
117 | } | 122 | } |
118 | } | 123 | } |
119 | rcu_read_unlock(); | 124 | srcu_read_unlock(&srcu, id); |
120 | 125 | ||
121 | return young; | 126 | return young; |
122 | } | 127 | } |
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | |||
126 | { | 131 | { |
127 | struct mmu_notifier *mn; | 132 | struct mmu_notifier *mn; |
128 | struct hlist_node *n; | 133 | struct hlist_node *n; |
134 | int id; | ||
129 | 135 | ||
130 | rcu_read_lock(); | 136 | id = srcu_read_lock(&srcu); |
131 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
132 | if (mn->ops->change_pte) | 138 | if (mn->ops->change_pte) |
133 | mn->ops->change_pte(mn, mm, address, pte); | 139 | mn->ops->change_pte(mn, mm, address, pte); |
134 | /* | ||
135 | * Some drivers don't have change_pte, | ||
136 | * so we must call invalidate_page in that case. | ||
137 | */ | ||
138 | else if (mn->ops->invalidate_page) | ||
139 | mn->ops->invalidate_page(mn, mm, address); | ||
140 | } | 140 | } |
141 | rcu_read_unlock(); | 141 | srcu_read_unlock(&srcu, id); |
142 | } | 142 | } |
143 | 143 | ||
144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | 144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, |
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, | |||
146 | { | 146 | { |
147 | struct mmu_notifier *mn; | 147 | struct mmu_notifier *mn; |
148 | struct hlist_node *n; | 148 | struct hlist_node *n; |
149 | int id; | ||
149 | 150 | ||
150 | rcu_read_lock(); | 151 | id = srcu_read_lock(&srcu); |
151 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 152 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
152 | if (mn->ops->invalidate_page) | 153 | if (mn->ops->invalidate_page) |
153 | mn->ops->invalidate_page(mn, mm, address); | 154 | mn->ops->invalidate_page(mn, mm, address); |
154 | } | 155 | } |
155 | rcu_read_unlock(); | 156 | srcu_read_unlock(&srcu, id); |
156 | } | 157 | } |
157 | 158 | ||
158 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | 159 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, |
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
160 | { | 161 | { |
161 | struct mmu_notifier *mn; | 162 | struct mmu_notifier *mn; |
162 | struct hlist_node *n; | 163 | struct hlist_node *n; |
164 | int id; | ||
163 | 165 | ||
164 | rcu_read_lock(); | 166 | id = srcu_read_lock(&srcu); |
165 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 167 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
166 | if (mn->ops->invalidate_range_start) | 168 | if (mn->ops->invalidate_range_start) |
167 | mn->ops->invalidate_range_start(mn, mm, start, end); | 169 | mn->ops->invalidate_range_start(mn, mm, start, end); |
168 | } | 170 | } |
169 | rcu_read_unlock(); | 171 | srcu_read_unlock(&srcu, id); |
170 | } | 172 | } |
171 | 173 | ||
172 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 174 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
174 | { | 176 | { |
175 | struct mmu_notifier *mn; | 177 | struct mmu_notifier *mn; |
176 | struct hlist_node *n; | 178 | struct hlist_node *n; |
179 | int id; | ||
177 | 180 | ||
178 | rcu_read_lock(); | 181 | id = srcu_read_lock(&srcu); |
179 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 182 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
180 | if (mn->ops->invalidate_range_end) | 183 | if (mn->ops->invalidate_range_end) |
181 | mn->ops->invalidate_range_end(mn, mm, start, end); | 184 | mn->ops->invalidate_range_end(mn, mm, start, end); |
182 | } | 185 | } |
183 | rcu_read_unlock(); | 186 | srcu_read_unlock(&srcu, id); |
184 | } | 187 | } |
185 | 188 | ||
186 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 189 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
@@ -192,22 +195,29 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
192 | 195 | ||
193 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 196 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
194 | 197 | ||
195 | ret = -ENOMEM; | 198 | /* |
196 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | 199 | * Verify that mmu_notifier_init() already run and the global srcu is |
197 | if (unlikely(!mmu_notifier_mm)) | 200 | * initialized. |
198 | goto out; | 201 | */ |
202 | BUG_ON(!srcu.per_cpu_ref); | ||
199 | 203 | ||
200 | if (take_mmap_sem) | 204 | if (take_mmap_sem) |
201 | down_write(&mm->mmap_sem); | 205 | down_write(&mm->mmap_sem); |
202 | ret = mm_take_all_locks(mm); | 206 | ret = mm_take_all_locks(mm); |
203 | if (unlikely(ret)) | 207 | if (unlikely(ret)) |
204 | goto out_cleanup; | 208 | goto out; |
205 | 209 | ||
206 | if (!mm_has_notifiers(mm)) { | 210 | if (!mm_has_notifiers(mm)) { |
211 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), | ||
212 | GFP_KERNEL); | ||
213 | if (unlikely(!mmu_notifier_mm)) { | ||
214 | ret = -ENOMEM; | ||
215 | goto out_of_mem; | ||
216 | } | ||
207 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | 217 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); |
208 | spin_lock_init(&mmu_notifier_mm->lock); | 218 | spin_lock_init(&mmu_notifier_mm->lock); |
219 | |||
209 | mm->mmu_notifier_mm = mmu_notifier_mm; | 220 | mm->mmu_notifier_mm = mmu_notifier_mm; |
210 | mmu_notifier_mm = NULL; | ||
211 | } | 221 | } |
212 | atomic_inc(&mm->mm_count); | 222 | atomic_inc(&mm->mm_count); |
213 | 223 | ||
@@ -223,13 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
223 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); | 233 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); |
224 | spin_unlock(&mm->mmu_notifier_mm->lock); | 234 | spin_unlock(&mm->mmu_notifier_mm->lock); |
225 | 235 | ||
236 | out_of_mem: | ||
226 | mm_drop_all_locks(mm); | 237 | mm_drop_all_locks(mm); |
227 | out_cleanup: | 238 | out: |
228 | if (take_mmap_sem) | 239 | if (take_mmap_sem) |
229 | up_write(&mm->mmap_sem); | 240 | up_write(&mm->mmap_sem); |
230 | /* kfree() does nothing if mmu_notifier_mm is NULL */ | 241 | |
231 | kfree(mmu_notifier_mm); | ||
232 | out: | ||
233 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 242 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
234 | return ret; | 243 | return ret; |
235 | } | 244 | } |
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
274 | /* | 283 | /* |
275 | * This releases the mm_count pin automatically and frees the mm | 284 | * This releases the mm_count pin automatically and frees the mm |
276 | * structure if it was the last user of it. It serializes against | 285 | * structure if it was the last user of it. It serializes against |
277 | * running mmu notifiers with RCU and against mmu_notifier_unregister | 286 | * running mmu notifiers with SRCU and against mmu_notifier_unregister |
278 | * with the unregister lock + RCU. All sptes must be dropped before | 287 | * with the unregister lock + SRCU. All sptes must be dropped before |
279 | * calling mmu_notifier_unregister. ->release or any other notifier | 288 | * calling mmu_notifier_unregister. ->release or any other notifier |
280 | * method may be invoked concurrently with mmu_notifier_unregister, | 289 | * method may be invoked concurrently with mmu_notifier_unregister, |
281 | * and only after mmu_notifier_unregister returned we're guaranteed | 290 | * and only after mmu_notifier_unregister returned we're guaranteed |
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
287 | 296 | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 297 | if (!hlist_unhashed(&mn->hlist)) { |
289 | /* | 298 | /* |
290 | * RCU here will force exit_mmap to wait for ->release to finish | 299 | * SRCU here will force exit_mmap to wait for ->release to finish |
291 | * before freeing the pages. | 300 | * before freeing the pages. |
292 | */ | 301 | */ |
293 | rcu_read_lock(); | 302 | int id; |
294 | 303 | ||
304 | id = srcu_read_lock(&srcu); | ||
295 | /* | 305 | /* |
296 | * exit_mmap will block in mmu_notifier_release to | 306 | * exit_mmap will block in mmu_notifier_release to |
297 | * guarantee ->release is called before freeing the | 307 | * guarantee ->release is called before freeing the |
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
299 | */ | 309 | */ |
300 | if (mn->ops->release) | 310 | if (mn->ops->release) |
301 | mn->ops->release(mn, mm); | 311 | mn->ops->release(mn, mm); |
302 | rcu_read_unlock(); | 312 | srcu_read_unlock(&srcu, id); |
303 | 313 | ||
304 | spin_lock(&mm->mmu_notifier_mm->lock); | 314 | spin_lock(&mm->mmu_notifier_mm->lock); |
305 | hlist_del_rcu(&mn->hlist); | 315 | hlist_del_rcu(&mn->hlist); |
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
310 | * Wait for any running method to finish, of course including | 320 |
311 | * ->release if it was run by mmu_notifier_release instead of us. | 321 |
312 | */ | 322 | */ |
313 | synchronize_rcu(); | 323 | synchronize_srcu(&srcu); |
314 | 324 | ||
315 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 325 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
316 | 326 | ||
317 | mmdrop(mm); | 327 | mmdrop(mm); |
318 | } | 328 | } |
319 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | 329 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
330 | |||
331 | static int __init mmu_notifier_init(void) | ||
332 | { | ||
333 | return init_srcu_struct(&srcu); | ||
334 | } | ||
335 | |||
336 | module_init(mmu_notifier_init); | ||
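The register path above now checks that the global SRCU state is initialized, takes the mm locks first, and only then allocates mmu_notifier_mm when the mm has no notifier state yet, unwinding through the out_of_mem/out labels on failure. A minimal user-space sketch of that allocate-under-lock, goto-unwind shape follows; struct fake_mm, notifier_state and register_notifier are invented stand-ins, and a single pthread mutex stands in for mm_take_all_locks()/mm_drop_all_locks().

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Invented stand-ins for mm_struct and mmu_notifier_mm. */
struct notifier_state { int registered; };
struct fake_mm {
    pthread_mutex_t lock;
    struct notifier_state *state;   /* allocated lazily, like mmu_notifier_mm */
};

/* Take the lock first, allocate the per-mm state only if it is still
 * missing, and unwind through a single exit label on failure. */
static int register_notifier(struct fake_mm *mm)
{
    int ret = 0;

    pthread_mutex_lock(&mm->lock);          /* analogue of mm_take_all_locks() */
    if (!mm->state) {
        mm->state = calloc(1, sizeof(*mm->state));
        if (!mm->state) {
            ret = -1;                       /* -ENOMEM in the kernel version */
            goto out;
        }
    }
    mm->state->registered++;
out:
    pthread_mutex_unlock(&mm->lock);        /* analogue of mm_drop_all_locks() */
    return ret;
}

int main(void)
{
    struct fake_mm mm = { .lock = PTHREAD_MUTEX_INITIALIZER, .state = NULL };

    if (register_notifier(&mm) == 0)
        printf("registered, count = %d\n", mm.state->registered);
    free(mm.state);
    return 0;
}

Allocating only after the locks are held removes the earlier speculative kmalloc plus "kfree() does nothing if NULL" cleanup on the common path.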
diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e48d05..1b61c2d3307a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | 71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, |
72 | unsigned long old_addr, unsigned long old_end, | 72 | unsigned long old_addr, unsigned long old_end, |
73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, | 73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, |
74 | unsigned long new_addr) | 74 | unsigned long new_addr, bool need_rmap_locks) |
75 | { | 75 | { |
76 | struct address_space *mapping = NULL; | 76 | struct address_space *mapping = NULL; |
77 | struct anon_vma *anon_vma = NULL; | ||
77 | struct mm_struct *mm = vma->vm_mm; | 78 | struct mm_struct *mm = vma->vm_mm; |
78 | pte_t *old_pte, *new_pte, pte; | 79 | pte_t *old_pte, *new_pte, pte; |
79 | spinlock_t *old_ptl, *new_ptl; | 80 | spinlock_t *old_ptl, *new_ptl; |
80 | 81 | ||
81 | if (vma->vm_file) { | 82 | /* |
82 | /* | 83 | * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma |
83 | * Subtle point from Rajesh Venkatasubramanian: before | 84 | * locks to ensure that rmap will always observe either the old or the |
84 | * moving file-based ptes, we must lock truncate_pagecache | 85 | * new ptes. This is the easiest way to avoid races with |
85 | * out, since it might clean the dst vma before the src vma, | 86 | * truncate_pagecache(), page migration, etc... |
86 | * and we propagate stale pages into the dst afterward. | 87 | * |
87 | */ | 88 | * When need_rmap_locks is false, we use other ways to avoid |
88 | mapping = vma->vm_file->f_mapping; | 89 | * such races: |
89 | mutex_lock(&mapping->i_mmap_mutex); | 90 | * |
91 | * - During exec() shift_arg_pages(), we use a specially tagged vma | ||
92 | * which rmap call sites look for using is_vma_temporary_stack(). | ||
93 | * | ||
94 | * - During mremap(), new_vma is often known to be placed after vma | ||
95 | * in rmap traversal order. This ensures rmap will always observe | ||
96 | * either the old pte, or the new pte, or both (the page table locks | ||
97 | * serialize access to individual ptes, but only rmap traversal | ||
98 | * order guarantees that we won't miss both the old and new ptes). | ||
99 | */ | ||
100 | if (need_rmap_locks) { | ||
101 | if (vma->vm_file) { | ||
102 | mapping = vma->vm_file->f_mapping; | ||
103 | mutex_lock(&mapping->i_mmap_mutex); | ||
104 | } | ||
105 | if (vma->anon_vma) { | ||
106 | anon_vma = vma->anon_vma; | ||
107 | anon_vma_lock(anon_vma); | ||
108 | } | ||
90 | } | 109 | } |
91 | 110 | ||
92 | /* | 111 | /* |
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
114 | spin_unlock(new_ptl); | 133 | spin_unlock(new_ptl); |
115 | pte_unmap(new_pte - 1); | 134 | pte_unmap(new_pte - 1); |
116 | pte_unmap_unlock(old_pte - 1, old_ptl); | 135 | pte_unmap_unlock(old_pte - 1, old_ptl); |
136 | if (anon_vma) | ||
137 | anon_vma_unlock(anon_vma); | ||
117 | if (mapping) | 138 | if (mapping) |
118 | mutex_unlock(&mapping->i_mmap_mutex); | 139 | mutex_unlock(&mapping->i_mmap_mutex); |
119 | } | 140 | } |
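move_ptes() above now takes the i_mmap_mutex and the anon_vma lock only when need_rmap_locks is set, and releases whatever was taken in reverse order once the copy is done. A small user-space analogue of that conditional-lock pattern, with invented names (move_entries, file_lock, anon_lock) and pthread mutexes standing in for the rmap locks:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t file_lock = PTHREAD_MUTEX_INITIALIZER;  /* i_mmap_mutex analogue */
static pthread_mutex_t anon_lock = PTHREAD_MUTEX_INITIALIZER;  /* anon_vma lock analogue */

/* Copy "ptes" while optionally holding both rmap-style locks. */
static void move_entries(int *dst, const int *src, int n, int need_locks)
{
    pthread_mutex_t *file_held = NULL, *anon_held = NULL;

    if (need_locks) {
        pthread_mutex_lock(&file_lock);
        file_held = &file_lock;
        pthread_mutex_lock(&anon_lock);
        anon_held = &anon_lock;
    }

    for (int i = 0; i < n; i++)
        dst[i] = src[i];

    /* Release in reverse order of acquisition, and only what was taken. */
    if (anon_held)
        pthread_mutex_unlock(anon_held);
    if (file_held)
        pthread_mutex_unlock(file_held);
}

int main(void)
{
    int src[4] = { 1, 2, 3, 4 }, dst[4] = { 0 };

    move_entries(dst, src, 4, 1);
    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}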
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
122 | 143 | ||
123 | unsigned long move_page_tables(struct vm_area_struct *vma, | 144 | unsigned long move_page_tables(struct vm_area_struct *vma, |
124 | unsigned long old_addr, struct vm_area_struct *new_vma, | 145 | unsigned long old_addr, struct vm_area_struct *new_vma, |
125 | unsigned long new_addr, unsigned long len) | 146 | unsigned long new_addr, unsigned long len, |
147 | bool need_rmap_locks) | ||
126 | { | 148 | { |
127 | unsigned long extent, next, old_end; | 149 | unsigned long extent, next, old_end; |
128 | pmd_t *old_pmd, *new_pmd; | 150 | pmd_t *old_pmd, *new_pmd; |
129 | bool need_flush = false; | 151 | bool need_flush = false; |
152 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
153 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
130 | 154 | ||
131 | old_end = old_addr + len; | 155 | old_end = old_addr + len; |
132 | flush_cache_range(vma, old_addr, old_end); | 156 | flush_cache_range(vma, old_addr, old_end); |
133 | 157 | ||
134 | mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); | 158 | mmun_start = old_addr; |
159 | mmun_end = old_end; | ||
160 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | ||
135 | 161 | ||
136 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { | 162 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
137 | cond_resched(); | 163 | cond_resched(); |
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
169 | if (extent > LATENCY_LIMIT) | 195 | if (extent > LATENCY_LIMIT) |
170 | extent = LATENCY_LIMIT; | 196 | extent = LATENCY_LIMIT; |
171 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | 197 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, |
172 | new_vma, new_pmd, new_addr); | 198 | new_vma, new_pmd, new_addr, need_rmap_locks); |
173 | need_flush = true; | 199 | need_flush = true; |
174 | } | 200 | } |
175 | if (likely(need_flush)) | 201 | if (likely(need_flush)) |
176 | flush_tlb_range(vma, old_end-len, old_addr); | 202 | flush_tlb_range(vma, old_end-len, old_addr); |
177 | 203 | ||
178 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); | 204 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
179 | 205 | ||
180 | return len + old_addr - old_end; /* how much done */ | 206 | return len + old_addr - old_end; /* how much done */ |
181 | } | 207 | } |
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
193 | unsigned long hiwater_vm; | 219 | unsigned long hiwater_vm; |
194 | int split = 0; | 220 | int split = 0; |
195 | int err; | 221 | int err; |
222 | bool need_rmap_locks; | ||
196 | 223 | ||
197 | /* | 224 | /* |
198 | * We'd prefer to avoid failure later on in do_munmap: | 225 | * We'd prefer to avoid failure later on in do_munmap: |
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
214 | return err; | 241 | return err; |
215 | 242 | ||
216 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); | 243 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); |
217 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); | 244 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, |
245 | &need_rmap_locks); | ||
218 | if (!new_vma) | 246 | if (!new_vma) |
219 | return -ENOMEM; | 247 | return -ENOMEM; |
220 | 248 | ||
221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); | 249 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, |
250 | need_rmap_locks); | ||
222 | if (moved_len < old_len) { | 251 | if (moved_len < old_len) { |
223 | /* | 252 | /* |
224 | * Before moving the page tables from the new vma to | ||
225 | * the old vma, we need to be sure the old vma is | ||
226 | * queued after new vma in the same_anon_vma list to | ||
227 | * prevent SMP races with rmap_walk (that could lead | ||
228 | * rmap_walk to miss some page table). | ||
229 | */ | ||
230 | anon_vma_moveto_tail(vma); | ||
231 | |||
232 | /* | ||
233 | * On error, move entries back from new area to old, | 253 | * On error, move entries back from new area to old, |
234 | * which will succeed since page tables still there, | 254 | * which will succeed since page tables still there, |
235 | * and then proceed to unmap new area instead of old. | 255 | * and then proceed to unmap new area instead of old. |
236 | */ | 256 | */ |
237 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); | 257 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, |
258 | true); | ||
238 | vma = new_vma; | 259 | vma = new_vma; |
239 | old_len = new_len; | 260 | old_len = new_len; |
240 | old_addr = new_addr; | 261 | old_addr = new_addr; |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 405573010f99..714d5d650470 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, | |||
116 | return 0; | 116 | return 0; |
117 | 117 | ||
118 | __free_pages_memory(start_pfn, end_pfn); | 118 | __free_pages_memory(start_pfn, end_pfn); |
119 | fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), | ||
120 | start_pfn, end_pfn); | ||
119 | 121 | ||
120 | return end_pfn - start_pfn; | 122 | return end_pfn - start_pfn; |
121 | } | 123 | } |
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
126 | phys_addr_t start, end, size; | 128 | phys_addr_t start, end, size; |
127 | u64 i; | 129 | u64 i; |
128 | 130 | ||
131 | reset_zone_present_pages(); | ||
129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 132 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
130 | count += __free_memory_core(start, end); | 133 | count += __free_memory_core(start, end); |
131 | 134 | ||
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void) | |||
162 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 165 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
163 | * because in some case like Node0 doesn't have RAM installed | 166 | * because in some case like Node0 doesn't have RAM installed |
164 | * low ram will be on Node1 | 167 | * low ram will be on Node1 |
165 | * Use MAX_NUMNODES will make sure all ranges in early_node_map[] | ||
166 | * will be used instead of only Node0 related | ||
167 | */ | 168 | */ |
168 | return free_low_memory_core_early(MAX_NUMNODES); | 169 | return free_low_memory_core_early(MAX_NUMNODES); |
169 | } | 170 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index dee2ff89fd58..45131b41bcdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
698 | 698 | ||
699 | mutex_lock(&mapping->i_mmap_mutex); | 699 | mutex_lock(&mapping->i_mmap_mutex); |
700 | flush_dcache_mmap_lock(mapping); | 700 | flush_dcache_mmap_lock(mapping); |
701 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 701 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
702 | flush_dcache_mmap_unlock(mapping); | 702 | flush_dcache_mmap_unlock(mapping); |
703 | mutex_unlock(&mapping->i_mmap_mutex); | 703 | mutex_unlock(&mapping->i_mmap_mutex); |
704 | } | 704 | } |
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
764 | 764 | ||
765 | mutex_lock(&mapping->i_mmap_mutex); | 765 | mutex_lock(&mapping->i_mmap_mutex); |
766 | flush_dcache_mmap_lock(mapping); | 766 | flush_dcache_mmap_lock(mapping); |
767 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 767 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
768 | flush_dcache_mmap_unlock(mapping); | 768 | flush_dcache_mmap_unlock(mapping); |
769 | mutex_unlock(&mapping->i_mmap_mutex); | 769 | mutex_unlock(&mapping->i_mmap_mutex); |
770 | } | 770 | } |
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | |||
789 | kenter("%p", vma); | 789 | kenter("%p", vma); |
790 | if (vma->vm_ops && vma->vm_ops->close) | 790 | if (vma->vm_ops && vma->vm_ops->close) |
791 | vma->vm_ops->close(vma); | 791 | vma->vm_ops->close(vma); |
792 | if (vma->vm_file) { | 792 | if (vma->vm_file) |
793 | fput(vma->vm_file); | 793 | fput(vma->vm_file); |
794 | if (vma->vm_flags & VM_EXECUTABLE) | ||
795 | removed_exe_file_vma(mm); | ||
796 | } | ||
797 | put_nommu_region(vma->vm_region); | 794 | put_nommu_region(vma->vm_region); |
798 | kmem_cache_free(vm_area_cachep, vma); | 795 | kmem_cache_free(vm_area_cachep, vma); |
799 | } | 796 | } |
@@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1284 | if (file) { | 1281 | if (file) { |
1285 | region->vm_file = get_file(file); | 1282 | region->vm_file = get_file(file); |
1286 | vma->vm_file = get_file(file); | 1283 | vma->vm_file = get_file(file); |
1287 | if (vm_flags & VM_EXECUTABLE) { | ||
1288 | added_exe_file_vma(current->mm); | ||
1289 | vma->vm_mm = current->mm; | ||
1290 | } | ||
1291 | } | 1284 | } |
1292 | 1285 | ||
1293 | down_write(&nommu_region_sem); | 1286 | down_write(&nommu_region_sem); |
@@ -1440,8 +1433,6 @@ error: | |||
1440 | kmem_cache_free(vm_region_jar, region); | 1433 | kmem_cache_free(vm_region_jar, region); |
1441 | if (vma->vm_file) | 1434 | if (vma->vm_file) |
1442 | fput(vma->vm_file); | 1435 | fput(vma->vm_file); |
1443 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1444 | removed_exe_file_vma(vma->vm_mm); | ||
1445 | kmem_cache_free(vm_area_cachep, vma); | 1436 | kmem_cache_free(vm_area_cachep, vma); |
1446 | kleave(" = %d", ret); | 1437 | kleave(" = %d", ret); |
1447 | return ret; | 1438 | return ret; |
@@ -1820,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1820 | if (addr != (pfn << PAGE_SHIFT)) | 1811 | if (addr != (pfn << PAGE_SHIFT)) |
1821 | return -EINVAL; | 1812 | return -EINVAL; |
1822 | 1813 | ||
1823 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | 1814 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; |
1824 | return 0; | 1815 | return 0; |
1825 | } | 1816 | } |
1826 | EXPORT_SYMBOL(remap_pfn_range); | 1817 | EXPORT_SYMBOL(remap_pfn_range); |
@@ -1961,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1961 | } | 1952 | } |
1962 | EXPORT_SYMBOL(filemap_fault); | 1953 | EXPORT_SYMBOL(filemap_fault); |
1963 | 1954 | ||
1955 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
1956 | unsigned long size, pgoff_t pgoff) | ||
1957 | { | ||
1958 | BUG(); | ||
1959 | return 0; | ||
1960 | } | ||
1961 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
1962 | |||
1964 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 1963 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
1965 | unsigned long addr, void *buf, int len, int write) | 1964 | unsigned long addr, void *buf, int len, int write) |
1966 | { | 1965 | { |
@@ -2045,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2045 | size_t newsize) | 2044 | size_t newsize) |
2046 | { | 2045 | { |
2047 | struct vm_area_struct *vma; | 2046 | struct vm_area_struct *vma; |
2048 | struct prio_tree_iter iter; | ||
2049 | struct vm_region *region; | 2047 | struct vm_region *region; |
2050 | pgoff_t low, high; | 2048 | pgoff_t low, high; |
2051 | size_t r_size, r_top; | 2049 | size_t r_size, r_top; |
@@ -2057,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2057 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | 2055 | mutex_lock(&inode->i_mapping->i_mmap_mutex); |
2058 | 2056 | ||
2059 | /* search for VMAs that fall within the dead zone */ | 2057 | /* search for VMAs that fall within the dead zone */ |
2060 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2058 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { |
2061 | low, high) { | ||
2062 | /* found one - only interested if it's shared out of the page | 2059 | /* found one - only interested if it's shared out of the page |
2063 | * cache */ | 2060 | * cache */ |
2064 | if (vma->vm_flags & VM_SHARED) { | 2061 | if (vma->vm_flags & VM_SHARED) { |
@@ -2074,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2074 | * we don't check for any regions that start beyond the EOF as there | 2071 | * we don't check for any regions that start beyond the EOF as there |
2075 | * shouldn't be any | 2072 | * shouldn't be any |
2076 | */ | 2073 | */ |
2077 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2074 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, |
2078 | 0, ULONG_MAX) { | 2075 | 0, ULONG_MAX) { |
2079 | if (!(vma->vm_flags & VM_SHARED)) | 2076 | if (!(vma->vm_flags & VM_SHARED)) |
2080 | continue; | 2077 | continue; |
2081 | 2078 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 198600861638..79e0f3e24831 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
428 | { | 428 | { |
429 | task_lock(current); | 429 | task_lock(current); |
430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
431 | "oom_adj=%d, oom_score_adj=%d\n", | 431 | "oom_score_adj=%d\n", |
432 | current->comm, gfp_mask, order, current->signal->oom_adj, | 432 | current->comm, gfp_mask, order, |
433 | current->signal->oom_score_adj); | 433 | current->signal->oom_score_adj); |
434 | cpuset_print_task_mems_allowed(current); | 434 | cpuset_print_task_mems_allowed(current); |
435 | task_unlock(current); | 435 | task_unlock(current); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c13ea7538891..bb90971182bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page, | |||
558 | if (page_is_guard(buddy)) { | 558 | if (page_is_guard(buddy)) { |
559 | clear_page_guard_flag(buddy); | 559 | clear_page_guard_flag(buddy); |
560 | set_page_private(page, 0); | 560 | set_page_private(page, 0); |
561 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 561 | __mod_zone_freepage_state(zone, 1 << order, |
562 | migratetype); | ||
562 | } else { | 563 | } else { |
563 | list_del(&buddy->lru); | 564 | list_del(&buddy->lru); |
564 | zone->free_area[order].nr_free--; | 565 | zone->free_area[order].nr_free--; |
@@ -597,17 +598,6 @@ out: | |||
597 | zone->free_area[order].nr_free++; | 598 | zone->free_area[order].nr_free++; |
598 | } | 599 | } |
599 | 600 | ||
600 | /* | ||
601 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
602 | * Page should not be on lru, so no need to fix that up. | ||
603 | * free_pages_check() will verify... | ||
604 | */ | ||
605 | static inline void free_page_mlock(struct page *page) | ||
606 | { | ||
607 | __dec_zone_page_state(page, NR_MLOCK); | ||
608 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
609 | } | ||
610 | |||
611 | static inline int free_pages_check(struct page *page) | 601 | static inline int free_pages_check(struct page *page) |
612 | { | 602 | { |
613 | if (unlikely(page_mapcount(page) | | 603 | if (unlikely(page_mapcount(page) | |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
668 | batch_free = to_free; | 658 | batch_free = to_free; |
669 | 659 | ||
670 | do { | 660 | do { |
661 | int mt; /* migratetype of the to-be-freed page */ | ||
662 | |||
671 | page = list_entry(list->prev, struct page, lru); | 663 | page = list_entry(list->prev, struct page, lru); |
672 | /* must delete as __free_one_page list manipulates */ | 664 | /* must delete as __free_one_page list manipulates */ |
673 | list_del(&page->lru); | 665 | list_del(&page->lru); |
666 | mt = get_freepage_migratetype(page); | ||
674 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 667 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
675 | __free_one_page(page, zone, 0, page_private(page)); | 668 | __free_one_page(page, zone, 0, mt); |
676 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | 669 | trace_mm_page_pcpu_drain(page, 0, mt); |
670 | if (is_migrate_cma(mt)) | ||
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | ||
677 | } while (--to_free && --batch_free && !list_empty(list)); | 672 | } while (--to_free && --batch_free && !list_empty(list)); |
678 | } | 673 | } |
679 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 674 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
688 | zone->pages_scanned = 0; | 683 | zone->pages_scanned = 0; |
689 | 684 | ||
690 | __free_one_page(page, zone, order, migratetype); | 685 | __free_one_page(page, zone, order, migratetype); |
691 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 686 | if (unlikely(migratetype != MIGRATE_ISOLATE)) |
687 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | ||
692 | spin_unlock(&zone->lock); | 688 | spin_unlock(&zone->lock); |
693 | } | 689 | } |
694 | 690 | ||
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
721 | static void __free_pages_ok(struct page *page, unsigned int order) | 717 | static void __free_pages_ok(struct page *page, unsigned int order) |
722 | { | 718 | { |
723 | unsigned long flags; | 719 | unsigned long flags; |
724 | int wasMlocked = __TestClearPageMlocked(page); | 720 | int migratetype; |
725 | 721 | ||
726 | if (!free_pages_prepare(page, order)) | 722 | if (!free_pages_prepare(page, order)) |
727 | return; | 723 | return; |
728 | 724 | ||
729 | local_irq_save(flags); | 725 | local_irq_save(flags); |
730 | if (unlikely(wasMlocked)) | ||
731 | free_page_mlock(page); | ||
732 | __count_vm_events(PGFREE, 1 << order); | 726 | __count_vm_events(PGFREE, 1 << order); |
733 | free_one_page(page_zone(page), page, order, | 727 | migratetype = get_pageblock_migratetype(page); |
734 | get_pageblock_migratetype(page)); | 728 | set_freepage_migratetype(page, migratetype); |
729 | free_one_page(page_zone(page), page, order, migratetype); | ||
735 | local_irq_restore(flags); | 730 | local_irq_restore(flags); |
736 | } | 731 | } |
737 | 732 | ||
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page, | |||
811 | set_page_guard_flag(&page[size]); | 806 | set_page_guard_flag(&page[size]); |
812 | set_page_private(&page[size], high); | 807 | set_page_private(&page[size], high); |
813 | /* Guard pages are not available for any usage */ | 808 | /* Guard pages are not available for any usage */ |
814 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); | 809 | __mod_zone_freepage_state(zone, -(1 << high), |
810 | migratetype); | ||
815 | continue; | 811 | continue; |
816 | } | 812 | } |
817 | #endif | 813 | #endif |
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
915 | * Note that start_page and end_pages are not aligned on a pageblock | 911 | * Note that start_page and end_pages are not aligned on a pageblock |
916 | * boundary. If alignment is required, use move_freepages_block() | 912 | * boundary. If alignment is required, use move_freepages_block() |
917 | */ | 913 | */ |
918 | static int move_freepages(struct zone *zone, | 914 | int move_freepages(struct zone *zone, |
919 | struct page *start_page, struct page *end_page, | 915 | struct page *start_page, struct page *end_page, |
920 | int migratetype) | 916 | int migratetype) |
921 | { | 917 | { |
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone, | |||
951 | order = page_order(page); | 947 | order = page_order(page); |
952 | list_move(&page->lru, | 948 | list_move(&page->lru, |
953 | &zone->free_area[order].free_list[migratetype]); | 949 | &zone->free_area[order].free_list[migratetype]); |
950 | set_freepage_migratetype(page, migratetype); | ||
954 | page += 1 << order; | 951 | page += 1 << order; |
955 | pages_moved += 1 << order; | 952 | pages_moved += 1 << order; |
956 | } | 953 | } |
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1135 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1132 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) |
1136 | mt = migratetype; | 1133 | mt = migratetype; |
1137 | } | 1134 | } |
1138 | set_page_private(page, mt); | 1135 | set_freepage_migratetype(page, mt); |
1139 | list = &page->lru; | 1136 | list = &page->lru; |
1137 | if (is_migrate_cma(mt)) | ||
1138 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | ||
1139 | -(1 << order)); | ||
1140 | } | 1140 | } |
1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
1142 | spin_unlock(&zone->lock); | 1142 | spin_unlock(&zone->lock); |
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1296 | struct per_cpu_pages *pcp; | 1296 | struct per_cpu_pages *pcp; |
1297 | unsigned long flags; | 1297 | unsigned long flags; |
1298 | int migratetype; | 1298 | int migratetype; |
1299 | int wasMlocked = __TestClearPageMlocked(page); | ||
1300 | 1299 | ||
1301 | if (!free_pages_prepare(page, 0)) | 1300 | if (!free_pages_prepare(page, 0)) |
1302 | return; | 1301 | return; |
1303 | 1302 | ||
1304 | migratetype = get_pageblock_migratetype(page); | 1303 | migratetype = get_pageblock_migratetype(page); |
1305 | set_page_private(page, migratetype); | 1304 | set_freepage_migratetype(page, migratetype); |
1306 | local_irq_save(flags); | 1305 | local_irq_save(flags); |
1307 | if (unlikely(wasMlocked)) | ||
1308 | free_page_mlock(page); | ||
1309 | __count_vm_event(PGFREE); | 1306 | __count_vm_event(PGFREE); |
1310 | 1307 | ||
1311 | /* | 1308 | /* |
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order) | |||
1380 | } | 1377 | } |
1381 | 1378 | ||
1382 | /* | 1379 | /* |
1383 | * Similar to split_page except the page is already free. As this is only | 1380 | * Similar to the split_page family of functions except that the page is
1384 | * being used for migration, the migratetype of the block also changes. | 1381 | * required at the given order and is being isolated now to prevent races
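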
1385 | * As this is called with interrupts disabled, the caller is responsible | 1382 | * with parallel allocators |
1386 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
1387 | * are enabled. | ||
1388 | * | ||
1389 | * Note: this is probably too low level an operation for use in drivers. | ||
1390 | * Please consult with lkml before using this in your driver. | ||
1391 | */ | 1383 | */ |
1392 | int split_free_page(struct page *page) | 1384 | int capture_free_page(struct page *page, int alloc_order, int migratetype) |
1393 | { | 1385 | { |
1394 | unsigned int order; | 1386 | unsigned int order; |
1395 | unsigned long watermark; | 1387 | unsigned long watermark; |
1396 | struct zone *zone; | 1388 | struct zone *zone; |
1389 | int mt; | ||
1397 | 1390 | ||
1398 | BUG_ON(!PageBuddy(page)); | 1391 | BUG_ON(!PageBuddy(page)); |
1399 | 1392 | ||
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page) | |||
1409 | list_del(&page->lru); | 1402 | list_del(&page->lru); |
1410 | zone->free_area[order].nr_free--; | 1403 | zone->free_area[order].nr_free--; |
1411 | rmv_page_order(page); | 1404 | rmv_page_order(page); |
1412 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); | ||
1413 | 1405 | ||
1414 | /* Split into individual pages */ | 1406 | mt = get_pageblock_migratetype(page); |
1415 | set_page_refcounted(page); | 1407 | if (unlikely(mt != MIGRATE_ISOLATE)) |
1416 | split_page(page, order); | 1408 | __mod_zone_freepage_state(zone, -(1UL << order), mt); |
1417 | 1409 | ||
1410 | if (alloc_order != order) | ||
1411 | expand(zone, page, alloc_order, order, | ||
1412 | &zone->free_area[order], migratetype); | ||
1413 | |||
1414 | /* Set the pageblock if the captured page is at least a pageblock */ | ||
1418 | if (order >= pageblock_order - 1) { | 1415 | if (order >= pageblock_order - 1) { |
1419 | struct page *endpage = page + (1 << order) - 1; | 1416 | struct page *endpage = page + (1 << order) - 1; |
1420 | for (; page < endpage; page += pageblock_nr_pages) { | 1417 | for (; page < endpage; page += pageblock_nr_pages) { |
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page) | |||
1425 | } | 1422 | } |
1426 | } | 1423 | } |
1427 | 1424 | ||
1428 | return 1 << order; | 1425 | return 1UL << order; |
1426 | } | ||
1427 | |||
1428 | /* | ||
1429 | * Similar to split_page except the page is already free. As this is only | ||
1430 | * being used for migration, the migratetype of the block also changes. | ||
1431 | * As this is called with interrupts disabled, the caller is responsible | ||
1432 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
1433 | * are enabled. | ||
1434 | * | ||
1435 | * Note: this is probably too low level an operation for use in drivers. | ||
1436 | * Please consult with lkml before using this in your driver. | ||
1437 | */ | ||
1438 | int split_free_page(struct page *page) | ||
1439 | { | ||
1440 | unsigned int order; | ||
1441 | int nr_pages; | ||
1442 | |||
1443 | BUG_ON(!PageBuddy(page)); | ||
1444 | order = page_order(page); | ||
1445 | |||
1446 | nr_pages = capture_free_page(page, order, 0); | ||
1447 | if (!nr_pages) | ||
1448 | return 0; | ||
1449 | |||
1450 | /* Split into individual pages */ | ||
1451 | set_page_refcounted(page); | ||
1452 | split_page(page, order); | ||
1453 | return nr_pages; | ||
1429 | } | 1454 | } |
1430 | 1455 | ||
1431 | /* | 1456 | /* |
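capture_free_page() above removes a whole buddy block from the free lists and, when the block is larger than the requested alloc_order, hands the remainder back through expand(). A toy sketch of that halving step; split_down is an invented name and printf stands in for returning buddies to the free lists:

#include <stdio.h>

/* Splitting a captured 2^high block down to 2^low: each halving step
 * gives back the upper buddy (here just printed) at the reduced order. */
static void split_down(unsigned long base, int high, int low)
{
    while (high > low) {
        high--;
        printf("free buddy at pfn %lu, order %d\n",
               base + (1UL << high), high);
    }
    printf("allocated block at pfn %lu, order %d\n", base, low);
}

int main(void)
{
    split_down(0, 4, 1);   /* capture an order-4 block, keep only order-1 */
    return 0;
}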
@@ -1484,7 +1509,8 @@ again: | |||
1484 | spin_unlock(&zone->lock); | 1509 | spin_unlock(&zone->lock); |
1485 | if (!page) | 1510 | if (!page) |
1486 | goto failed; | 1511 | goto failed; |
1487 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | 1512 | __mod_zone_freepage_state(zone, -(1 << order), |
1513 | get_pageblock_migratetype(page)); | ||
1488 | } | 1514 | } |
1489 | 1515 | ||
1490 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1516 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
@@ -1501,19 +1527,6 @@ failed: | |||
1501 | return NULL; | 1527 | return NULL; |
1502 | } | 1528 | } |
1503 | 1529 | ||
1504 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | ||
1505 | #define ALLOC_WMARK_MIN WMARK_MIN | ||
1506 | #define ALLOC_WMARK_LOW WMARK_LOW | ||
1507 | #define ALLOC_WMARK_HIGH WMARK_HIGH | ||
1508 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
1509 | |||
1510 | /* Mask to get the watermark bits */ | ||
1511 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
1512 | |||
1513 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
1514 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
1515 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
1516 | |||
1517 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1530 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1518 | 1531 | ||
1519 | static struct { | 1532 | static struct { |
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1608 | min -= min / 2; | 1621 | min -= min / 2; |
1609 | if (alloc_flags & ALLOC_HARDER) | 1622 | if (alloc_flags & ALLOC_HARDER) |
1610 | min -= min / 4; | 1623 | min -= min / 4; |
1611 | 1624 | #ifdef CONFIG_CMA | |
1625 | /* If allocation can't use CMA areas don't use free CMA pages */ | ||
1626 | if (!(alloc_flags & ALLOC_CMA)) | ||
1627 | free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); | ||
1628 | #endif | ||
1612 | if (free_pages <= min + lowmem_reserve) | 1629 | if (free_pages <= min + lowmem_reserve) |
1613 | return false; | 1630 | return false; |
1614 | for (o = 0; o < order; o++) { | 1631 | for (o = 0; o < order; o++) { |
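The hunk above makes __zone_watermark_ok() discount free CMA pages whenever the caller did not pass ALLOC_CMA, so an allocation that cannot fall back to CMA areas is not fooled by free memory it cannot use. A simplified user-space sketch of that check; the ALLOC_CMA value and watermark_ok() below are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define ALLOC_CMA 0x80   /* illustrative flag value only */

/* Simplified watermark test: unusable CMA pages must not make a
 * non-CMA allocation look like it still has headroom. */
static bool watermark_ok(long free_pages, long free_cma_pages,
                         long min, long lowmem_reserve, int alloc_flags)
{
    if (!(alloc_flags & ALLOC_CMA))
        free_pages -= free_cma_pages;   /* cannot use CMA areas */
    return free_pages > min + lowmem_reserve;
}

int main(void)
{
    /* 1000 free pages, but 900 of them sit in CMA pageblocks. */
    printf("movable (CMA ok): %d\n", watermark_ok(1000, 900, 50, 10, ALLOC_CMA));
    printf("unmovable (no CMA): %d\n", watermark_ok(1000, 900, 50, 10, 0));
    return 0;
}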
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) | |||
1782 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1799 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1783 | } | 1800 | } |
1784 | 1801 | ||
1802 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | ||
1803 | { | ||
1804 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); | ||
1805 | } | ||
1806 | |||
1807 | static void __paginginit init_zone_allows_reclaim(int nid) | ||
1808 | { | ||
1809 | int i; | ||
1810 | |||
1811 | for_each_online_node(i) | ||
1812 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) { | ||
1813 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | ||
1814 | zone_reclaim_mode = 1; | ||
1815 | } | ||
1816 | } | ||
1817 | |||
1785 | #else /* CONFIG_NUMA */ | 1818 | #else /* CONFIG_NUMA */ |
1786 | 1819 | ||
1787 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1820 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1802 | static void zlc_clear_zones_full(struct zonelist *zonelist) | 1835 | static void zlc_clear_zones_full(struct zonelist *zonelist) |
1803 | { | 1836 | { |
1804 | } | 1837 | } |
1838 | |||
1839 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | ||
1840 | { | ||
1841 | return true; | ||
1842 | } | ||
1843 | |||
1844 | static inline void init_zone_allows_reclaim(int nid) | ||
1845 | { | ||
1846 | } | ||
1805 | #endif /* CONFIG_NUMA */ | 1847 | #endif /* CONFIG_NUMA */ |
1806 | 1848 | ||
1807 | /* | 1849 | /* |
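init_zone_allows_reclaim() above precomputes, per node, the set of nodes within RECLAIM_DISTANCE so the allocator fast path can answer zone_allows_reclaim() with a plain node_isset(). A runnable toy version of that precomputation, using an invented distance table and a bitmask in place of a nodemask_t:

#include <stdio.h>

#define MAX_NODES        4
#define RECLAIM_DISTANCE 30   /* threshold playing the role of the kernel constant */

/* Toy node distance table (node_distance() analogue). */
static const int node_distance[MAX_NODES][MAX_NODES] = {
    { 10, 20, 40, 40 },
    { 20, 10, 40, 40 },
    { 40, 40, 10, 20 },
    { 40, 40, 20, 10 },
};

/* Return a bitmask of nodes close enough to @nid to reclaim from directly. */
static unsigned int reclaim_nodes_for(int nid)
{
    unsigned int mask = 0;

    for (int i = 0; i < MAX_NODES; i++)
        if (node_distance[nid][i] <= RECLAIM_DISTANCE)
            mask |= 1u << i;
    return mask;
}

int main(void)
{
    for (int nid = 0; nid < MAX_NODES; nid++)
        printf("node %d may reclaim from mask 0x%x\n", nid, reclaim_nodes_for(nid));
    return 0;
}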
@@ -1886,7 +1928,8 @@ zonelist_scan: | |||
1886 | did_zlc_setup = 1; | 1928 | did_zlc_setup = 1; |
1887 | } | 1929 | } |
1888 | 1930 | ||
1889 | if (zone_reclaim_mode == 0) | 1931 | if (zone_reclaim_mode == 0 || |
1932 | !zone_allows_reclaim(preferred_zone, zone)) | ||
1890 | goto this_zone_full; | 1933 | goto this_zone_full; |
1891 | 1934 | ||
1892 | /* | 1935 | /* |
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2105 | bool *contended_compaction, bool *deferred_compaction, | 2148 | bool *contended_compaction, bool *deferred_compaction, |
2106 | unsigned long *did_some_progress) | 2149 | unsigned long *did_some_progress) |
2107 | { | 2150 | { |
2108 | struct page *page; | 2151 | struct page *page = NULL; |
2109 | 2152 | ||
2110 | if (!order) | 2153 | if (!order) |
2111 | return NULL; | 2154 | return NULL; |
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2118 | current->flags |= PF_MEMALLOC; | 2161 | current->flags |= PF_MEMALLOC; |
2119 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2162 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2120 | nodemask, sync_migration, | 2163 | nodemask, sync_migration, |
2121 | contended_compaction); | 2164 | contended_compaction, &page); |
2122 | current->flags &= ~PF_MEMALLOC; | 2165 | current->flags &= ~PF_MEMALLOC; |
2123 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
2124 | 2166 | ||
2167 | /* If compaction captured a page, prep and use it */ | ||
2168 | if (page) { | ||
2169 | prep_new_page(page, order, gfp_mask); | ||
2170 | goto got_page; | ||
2171 | } | ||
2172 | |||
2173 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
2125 | /* Page migration frees to the PCP lists but we want merging */ | 2174 | /* Page migration frees to the PCP lists but we want merging */ |
2126 | drain_pages(get_cpu()); | 2175 | drain_pages(get_cpu()); |
2127 | put_cpu(); | 2176 | put_cpu(); |
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2131 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2180 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2132 | preferred_zone, migratetype); | 2181 | preferred_zone, migratetype); |
2133 | if (page) { | 2182 | if (page) { |
2183 | got_page: | ||
2184 | preferred_zone->compact_blockskip_flush = false; | ||
2134 | preferred_zone->compact_considered = 0; | 2185 | preferred_zone->compact_considered = 0; |
2135 | preferred_zone->compact_defer_shift = 0; | 2186 | preferred_zone->compact_defer_shift = 0; |
2136 | if (order >= preferred_zone->compact_order_failed) | 2187 | if (order >= preferred_zone->compact_order_failed) |
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2315 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2366 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
2316 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2367 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2317 | } | 2368 | } |
2318 | 2369 | #ifdef CONFIG_CMA | |
2370 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2371 | alloc_flags |= ALLOC_CMA; | ||
2372 | #endif | ||
2319 | return alloc_flags; | 2373 | return alloc_flags; |
2320 | } | 2374 | } |
2321 | 2375 | ||
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2362 | goto nopage; | 2416 | goto nopage; |
2363 | 2417 | ||
2364 | restart: | 2418 | restart: |
2365 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2419 | wake_all_kswapd(order, zonelist, high_zoneidx, |
2366 | wake_all_kswapd(order, zonelist, high_zoneidx, | 2420 | zone_idx(preferred_zone)); |
2367 | zone_idx(preferred_zone)); | ||
2368 | 2421 | ||
2369 | /* | 2422 | /* |
2370 | * OK, we're below the kswapd watermark and have kicked background | 2423 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2441,7 +2494,7 @@ rebalance: | |||
2441 | * system then fail the allocation instead of entering direct reclaim. | 2494 | * system then fail the allocation instead of entering direct reclaim. |
2442 | */ | 2495 | */ |
2443 | if ((deferred_compaction || contended_compaction) && | 2496 | if ((deferred_compaction || contended_compaction) && |
2444 | (gfp_mask & __GFP_NO_KSWAPD)) | 2497 | (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) |
2445 | goto nopage; | 2498 | goto nopage; |
2446 | 2499 | ||
2447 | /* Try direct reclaim and then allocating */ | 2500 | /* Try direct reclaim and then allocating */ |
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2541 | struct page *page = NULL; | 2594 | struct page *page = NULL; |
2542 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2595 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2543 | unsigned int cpuset_mems_cookie; | 2596 | unsigned int cpuset_mems_cookie; |
2597 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | ||
2544 | 2598 | ||
2545 | gfp_mask &= gfp_allowed_mask; | 2599 | gfp_mask &= gfp_allowed_mask; |
2546 | 2600 | ||
@@ -2569,9 +2623,13 @@ retry_cpuset: | |||
2569 | if (!preferred_zone) | 2623 | if (!preferred_zone) |
2570 | goto out; | 2624 | goto out; |
2571 | 2625 | ||
2626 | #ifdef CONFIG_CMA | ||
2627 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2628 | alloc_flags |= ALLOC_CMA; | ||
2629 | #endif | ||
2572 | /* First allocation attempt */ | 2630 | /* First allocation attempt */ |
2573 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2631 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2574 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | 2632 | zonelist, high_zoneidx, alloc_flags, |
2575 | preferred_zone, migratetype); | 2633 | preferred_zone, migratetype); |
2576 | if (unlikely(!page)) | 2634 | if (unlikely(!page)) |
2577 | page = __alloc_pages_slowpath(gfp_mask, order, | 2635 | page = __alloc_pages_slowpath(gfp_mask, order, |
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter) | |||
2852 | " unevictable:%lu" | 2910 | " unevictable:%lu" |
2853 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2911 | " dirty:%lu writeback:%lu unstable:%lu\n" |
2854 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 2912 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
2855 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | 2913 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
2914 | " free_cma:%lu\n", | ||
2856 | global_page_state(NR_ACTIVE_ANON), | 2915 | global_page_state(NR_ACTIVE_ANON), |
2857 | global_page_state(NR_INACTIVE_ANON), | 2916 | global_page_state(NR_INACTIVE_ANON), |
2858 | global_page_state(NR_ISOLATED_ANON), | 2917 | global_page_state(NR_ISOLATED_ANON), |
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter) | |||
2869 | global_page_state(NR_FILE_MAPPED), | 2928 | global_page_state(NR_FILE_MAPPED), |
2870 | global_page_state(NR_SHMEM), | 2929 | global_page_state(NR_SHMEM), |
2871 | global_page_state(NR_PAGETABLE), | 2930 | global_page_state(NR_PAGETABLE), |
2872 | global_page_state(NR_BOUNCE)); | 2931 | global_page_state(NR_BOUNCE), |
2932 | global_page_state(NR_FREE_CMA_PAGES)); | ||
2873 | 2933 | ||
2874 | for_each_populated_zone(zone) { | 2934 | for_each_populated_zone(zone) { |
2875 | int i; | 2935 | int i; |
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter) | |||
2901 | " pagetables:%lukB" | 2961 | " pagetables:%lukB" |
2902 | " unstable:%lukB" | 2962 | " unstable:%lukB" |
2903 | " bounce:%lukB" | 2963 | " bounce:%lukB" |
2964 | " free_cma:%lukB" | ||
2904 | " writeback_tmp:%lukB" | 2965 | " writeback_tmp:%lukB" |
2905 | " pages_scanned:%lu" | 2966 | " pages_scanned:%lu" |
2906 | " all_unreclaimable? %s" | 2967 | " all_unreclaimable? %s" |
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter) | |||
2930 | K(zone_page_state(zone, NR_PAGETABLE)), | 2991 | K(zone_page_state(zone, NR_PAGETABLE)), |
2931 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | 2992 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), |
2932 | K(zone_page_state(zone, NR_BOUNCE)), | 2993 | K(zone_page_state(zone, NR_BOUNCE)), |
2994 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | ||
2933 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2995 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2934 | zone->pages_scanned, | 2996 | zone->pages_scanned, |
2935 | (zone->all_unreclaimable ? "yes" : "no") | 2997 | (zone->all_unreclaimable ? "yes" : "no") |
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat) | |||
3328 | j = 0; | 3390 | j = 0; |
3329 | 3391 | ||
3330 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 3392 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
3331 | int distance = node_distance(local_node, node); | ||
3332 | |||
3333 | /* | ||
3334 | * If another node is sufficiently far away then it is better | ||
3335 | * to reclaim pages in a zone before going off node. | ||
3336 | */ | ||
3337 | if (distance > RECLAIM_DISTANCE) | ||
3338 | zone_reclaim_mode = 1; | ||
3339 | |||
3340 | /* | 3393 | /* |
3341 | * We don't want to pressure a particular node. | 3394 | * We don't want to pressure a particular node. |
3342 | * So adding penalty to the first node in same | 3395 | * So adding penalty to the first node in same |
3343 | * distance group to make it round-robin. | 3396 | * distance group to make it round-robin. |
3344 | */ | 3397 | */ |
3345 | if (distance != node_distance(local_node, prev_node)) | 3398 | if (node_distance(local_node, node) != |
3399 | node_distance(local_node, prev_node)) | ||
3346 | node_load[node] = load; | 3400 | node_load[node] = load; |
3347 | 3401 | ||
3348 | prev_node = node; | 3402 | prev_node = node; |
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4438 | 4492 | ||
4439 | zone->spanned_pages = size; | 4493 | zone->spanned_pages = size; |
4440 | zone->present_pages = realsize; | 4494 | zone->present_pages = realsize; |
4441 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4442 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4443 | zone->spanned_pages; | ||
4444 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4445 | #endif | ||
4446 | #ifdef CONFIG_NUMA | 4495 | #ifdef CONFIG_NUMA |
4447 | zone->node = nid; | 4496 | zone->node = nid; |
4448 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4497 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4521 | 4570 | ||
4522 | pgdat->node_id = nid; | 4571 | pgdat->node_id = nid; |
4523 | pgdat->node_start_pfn = node_start_pfn; | 4572 | pgdat->node_start_pfn = node_start_pfn; |
4573 | init_zone_allows_reclaim(nid); | ||
4524 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4574 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
4525 | 4575 | ||
4526 | alloc_node_mem_map(pgdat); | 4576 | alloc_node_mem_map(pgdat); |
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4879 | zone_movable_pfn[i] << PAGE_SHIFT); | 4929 | zone_movable_pfn[i] << PAGE_SHIFT); |
4880 | } | 4930 | } |
4881 | 4931 | ||
4882 | /* Print out the early_node_map[] */ | 4932 | /* Print out the early node map */ |
4883 | printk("Early memory node ranges\n"); | 4933 | printk("Early memory node ranges\n"); |
4884 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4934 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4885 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 4935 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn) | |||
5619 | pageblock_nr_pages)); | 5669 | pageblock_nr_pages)); |
5620 | } | 5670 | } |
5621 | 5671 | ||
5622 | static struct page * | ||
5623 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | ||
5624 | int **resultp) | ||
5625 | { | ||
5626 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
5627 | |||
5628 | if (PageHighMem(page)) | ||
5629 | gfp_mask |= __GFP_HIGHMEM; | ||
5630 | |||
5631 | return alloc_page(gfp_mask); | ||
5632 | } | ||
5633 | |||
5634 | /* [start, end) must belong to a single zone. */ | 5672 | /* [start, end) must belong to a single zone. */ |
5635 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | 5673 | static int __alloc_contig_migrate_range(struct compact_control *cc, |
5674 | unsigned long start, unsigned long end) | ||
5636 | { | 5675 | { |
5637 | /* This function is based on compact_zone() from compaction.c. */ | 5676 | /* This function is based on compact_zone() from compaction.c. */ |
5638 | 5677 | unsigned long nr_reclaimed; | |
5639 | unsigned long pfn = start; | 5678 | unsigned long pfn = start; |
5640 | unsigned int tries = 0; | 5679 | unsigned int tries = 0; |
5641 | int ret = 0; | 5680 | int ret = 0; |
5642 | 5681 | ||
5643 | struct compact_control cc = { | ||
5644 | .nr_migratepages = 0, | ||
5645 | .order = -1, | ||
5646 | .zone = page_zone(pfn_to_page(start)), | ||
5647 | .sync = true, | ||
5648 | }; | ||
5649 | INIT_LIST_HEAD(&cc.migratepages); | ||
5650 | |||
5651 | migrate_prep_local(); | 5682 | migrate_prep_local(); |
5652 | 5683 | ||
5653 | while (pfn < end || !list_empty(&cc.migratepages)) { | 5684 | while (pfn < end || !list_empty(&cc->migratepages)) { |
5654 | if (fatal_signal_pending(current)) { | 5685 | if (fatal_signal_pending(current)) { |
5655 | ret = -EINTR; | 5686 | ret = -EINTR; |
5656 | break; | 5687 | break; |
5657 | } | 5688 | } |
5658 | 5689 | ||
5659 | if (list_empty(&cc.migratepages)) { | 5690 | if (list_empty(&cc->migratepages)) { |
5660 | cc.nr_migratepages = 0; | 5691 | cc->nr_migratepages = 0; |
5661 | pfn = isolate_migratepages_range(cc.zone, &cc, | 5692 | pfn = isolate_migratepages_range(cc->zone, cc, |
5662 | pfn, end); | 5693 | pfn, end, true); |
5663 | if (!pfn) { | 5694 | if (!pfn) { |
5664 | ret = -EINTR; | 5695 | ret = -EINTR; |
5665 | break; | 5696 | break; |
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | |||
5670 | break; | 5701 | break; |
5671 | } | 5702 | } |
5672 | 5703 | ||
5673 | ret = migrate_pages(&cc.migratepages, | 5704 | nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, |
5674 | __alloc_contig_migrate_alloc, | 5705 | &cc->migratepages); |
5706 | cc->nr_migratepages -= nr_reclaimed; | ||
5707 | |||
5708 | ret = migrate_pages(&cc->migratepages, | ||
5709 | alloc_migrate_target, | ||
5675 | 0, false, MIGRATE_SYNC); | 5710 | 0, false, MIGRATE_SYNC); |
5676 | } | 5711 | } |
5677 | 5712 | ||
5678 | putback_lru_pages(&cc.migratepages); | 5713 | putback_lru_pages(&cc->migratepages); |
5679 | return ret > 0 ? 0 : ret; | 5714 | return ret > 0 ? 0 : ret; |
5680 | } | 5715 | } |
5681 | 5716 | ||
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5754 | unsigned long outer_start, outer_end; | 5789 | unsigned long outer_start, outer_end; |
5755 | int ret = 0, order; | 5790 | int ret = 0, order; |
5756 | 5791 | ||
5792 | struct compact_control cc = { | ||
5793 | .nr_migratepages = 0, | ||
5794 | .order = -1, | ||
5795 | .zone = page_zone(pfn_to_page(start)), | ||
5796 | .sync = true, | ||
5797 | .ignore_skip_hint = true, | ||
5798 | }; | ||
5799 | INIT_LIST_HEAD(&cc.migratepages); | ||
5800 | |||
5757 | /* | 5801 | /* |
5758 | * What we do here is we mark all pageblocks in range as | 5802 | * What we do here is we mark all pageblocks in range as |
5759 | * MIGRATE_ISOLATE. Because pageblock and max order pages may | 5803 | * MIGRATE_ISOLATE. Because pageblock and max order pages may |
@@ -5783,7 +5827,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5783 | if (ret) | 5827 | if (ret) |
5784 | goto done; | 5828 | goto done; |
5785 | 5829 | ||
5786 | ret = __alloc_contig_migrate_range(start, end); | 5830 | ret = __alloc_contig_migrate_range(&cc, start, end); |
5787 | if (ret) | 5831 | if (ret) |
5788 | goto done; | 5832 | goto done; |
5789 | 5833 | ||
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5832 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | 5876 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); |
5833 | 5877 | ||
5834 | /* Grab isolated pages from freelists. */ | 5878 | /* Grab isolated pages from freelists. */ |
5835 | outer_end = isolate_freepages_range(outer_start, end); | 5879 | outer_end = isolate_freepages_range(&cc, outer_start, end); |
5836 | if (!outer_end) { | 5880 | if (!outer_end) { |
5837 | ret = -EBUSY; | 5881 | ret = -EBUSY; |
5838 | goto done; | 5882 | goto done; |
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data) | |||
5874 | local_irq_save(flags); | 5918 | local_irq_save(flags); |
5875 | if (pcp->count > 0) | 5919 | if (pcp->count > 0) |
5876 | free_pcppages_bulk(zone, pcp->count, pcp); | 5920 | free_pcppages_bulk(zone, pcp->count, pcp); |
5921 | drain_zonestat(zone, pset); | ||
5877 | setup_pageset(pset, batch); | 5922 | setup_pageset(pset, batch); |
5878 | local_irq_restore(flags); | 5923 | local_irq_restore(flags); |
5879 | } | 5924 | } |
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone) | |||
5890 | void zone_pcp_reset(struct zone *zone) | 5935 | void zone_pcp_reset(struct zone *zone) |
5891 | { | 5936 | { |
5892 | unsigned long flags; | 5937 | unsigned long flags; |
5938 | int cpu; | ||
5939 | struct per_cpu_pageset *pset; | ||
5893 | 5940 | ||
5894 | /* avoid races with drain_pages() */ | 5941 | /* avoid races with drain_pages() */ |
5895 | local_irq_save(flags); | 5942 | local_irq_save(flags); |
5896 | if (zone->pageset != &boot_pageset) { | 5943 | if (zone->pageset != &boot_pageset) { |
5944 | for_each_online_cpu(cpu) { | ||
5945 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5946 | drain_zonestat(zone, pset); | ||
5947 | } | ||
5897 | free_percpu(zone->pageset); | 5948 | free_percpu(zone->pageset); |
5898 | zone->pageset = &boot_pageset; | 5949 | zone->pageset = &boot_pageset; |
5899 | } | 5950 | } |
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page) | |||
6047 | dump_page_flags(page->flags); | 6098 | dump_page_flags(page->flags); |
6048 | mem_cgroup_print_bad_page(page); | 6099 | mem_cgroup_print_bad_page(page); |
6049 | } | 6100 | } |
6101 | |||
6102 | /* reset zone->present_pages */ | ||
6103 | void reset_zone_present_pages(void) | ||
6104 | { | ||
6105 | struct zone *z; | ||
6106 | int i, nid; | ||
6107 | |||
6108 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
6109 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6110 | z = NODE_DATA(nid)->node_zones + i; | ||
6111 | z->present_pages = 0; | ||
6112 | } | ||
6113 | } | ||
6114 | } | ||
6115 | |||
6116 | /* calculate zone's present pages in buddy system */ | ||
6117 | void fixup_zone_present_pages(int nid, unsigned long start_pfn, | ||
6118 | unsigned long end_pfn) | ||
6119 | { | ||
6120 | struct zone *z; | ||
6121 | unsigned long zone_start_pfn, zone_end_pfn; | ||
6122 | int i; | ||
6123 | |||
6124 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6125 | z = NODE_DATA(nid)->node_zones + i; | ||
6126 | zone_start_pfn = z->zone_start_pfn; | ||
6127 | zone_end_pfn = zone_start_pfn + z->spanned_pages; | ||
6128 | |||
6129 | /* if the two regions intersect */ | ||
6130 | if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) | ||
6131 | z->present_pages += min(end_pfn, zone_end_pfn) - | ||
6132 | max(start_pfn, zone_start_pfn); | ||
6133 | } | ||
6134 | } | ||
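fixup_zone_present_pages() above adds, for every zone on the node, the length of the intersection between the freed pfn range and the zone's span. The arithmetic is plain interval overlap, sketched below with an invented helper (range_overlap) and example pfn values:

#include <stdio.h>

/* Length of the overlap between [s1, e1) and [s2, e2), 0 if disjoint. */
static unsigned long range_overlap(unsigned long s1, unsigned long e1,
                                   unsigned long s2, unsigned long e2)
{
    unsigned long lo = s1 > s2 ? s1 : s2;   /* max of the starts */
    unsigned long hi = e1 < e2 ? e1 : e2;   /* min of the ends */

    return hi > lo ? hi - lo : 0;
}

int main(void)
{
    /* Freed pfns 100-300 against a zone spanning 250-1000: 50 pages counted. */
    printf("%lu\n", range_overlap(100, 300, 250, 1000));
    /* Disjoint ranges contribute nothing to present_pages. */
    printf("%lu\n", range_overlap(0, 100, 200, 300));
    return 0;
}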
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 247d1f175739..f2f5b4818e94 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page) | |||
76 | 76 | ||
77 | out: | 77 | out: |
78 | if (!ret) { | 78 | if (!ret) { |
79 | unsigned long nr_pages; | ||
80 | int migratetype = get_pageblock_migratetype(page); | ||
81 | |||
79 | set_pageblock_isolate(page); | 82 | set_pageblock_isolate(page); |
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 83 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); |
84 | |||
85 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); | ||
81 | } | 86 | } |
82 | 87 | ||
83 | spin_unlock_irqrestore(&zone->lock, flags); | 88 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -89,12 +94,14 @@ out: | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | 94 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) |
90 | { | 95 | { |
91 | struct zone *zone; | 96 | struct zone *zone; |
92 | unsigned long flags; | 97 | unsigned long flags, nr_pages; |
98 | |||
93 | zone = page_zone(page); | 99 | zone = page_zone(page); |
94 | spin_lock_irqsave(&zone->lock, flags); | 100 | spin_lock_irqsave(&zone->lock, flags); |
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 101 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
96 | goto out; | 102 | goto out; |
97 | move_freepages_block(zone, page, migratetype); | 103 | nr_pages = move_freepages_block(zone, page, migratetype); |
104 | __mod_zone_freepage_state(zone, nr_pages, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | 105 | restore_pageblock_isolate(page, migratetype); |
99 | out: | 106 | out: |
100 | spin_unlock_irqrestore(&zone->lock, flags); | 107 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
193 | continue; | 200 | continue; |
194 | } | 201 | } |
195 | page = pfn_to_page(pfn); | 202 | page = pfn_to_page(pfn); |
196 | if (PageBuddy(page)) | 203 | if (PageBuddy(page)) { |
204 | /* | ||
205 | * If a race between isolation and allocation happens, | ||
206 | * some free pages could be left on the MIGRATE_MOVABLE list | ||
207 | * even though the migration type of the page's pageblock | ||
208 | * is MIGRATE_ISOLATE. Catch such a page and move it onto | ||
209 | * the MIGRATE_ISOLATE list. | ||
210 | */ | ||
211 | if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { | ||
212 | struct page *end_page; | ||
213 | |||
214 | end_page = page + (1 << page_order(page)) - 1; | ||
215 | move_freepages(page_zone(page), page, end_page, | ||
216 | MIGRATE_ISOLATE); | ||
217 | } | ||
197 | pfn += 1 << page_order(page); | 218 | pfn += 1 << page_order(page); |
219 | } | ||
198 | else if (page_count(page) == 0 && | 220 | else if (page_count(page) == 0 && |
199 | page_private(page) == MIGRATE_ISOLATE) | 221 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) |
200 | pfn += 1; | 222 | pfn += 1; |
201 | else | 223 | else |
202 | break; | 224 | break; |
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
233 | spin_unlock_irqrestore(&zone->lock, flags); | 255 | spin_unlock_irqrestore(&zone->lock, flags); |
234 | return ret ? 0 : -EBUSY; | 256 | return ret ? 0 : -EBUSY; |
235 | } | 257 | } |
258 | |||
259 | struct page *alloc_migrate_target(struct page *page, unsigned long private, | ||
260 | int **resultp) | ||
261 | { | ||
262 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
263 | |||
264 | if (PageHighMem(page)) | ||
265 | gfp_mask |= __GFP_HIGHMEM; | ||
266 | |||
267 | return alloc_page(gfp_mask); | ||
268 | } | ||
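alloc_migrate_target() derives its gfp mask from the source page: always movable user memory, plus highmem when the page being migrated is itself a highmem page. A userspace sketch of that selection, using stand-in flag values rather than the kernel's real GFP bits (illustration only):

#include <stdbool.h>
#include <stdio.h>

/* stand-in bits, not the kernel's GFP_USER/__GFP_* values */
#define FAKE_GFP_USER     0x1u
#define FAKE_GFP_MOVABLE  0x2u
#define FAKE_GFP_HIGHMEM  0x4u

static unsigned int migrate_target_gfp(bool source_is_highmem)
{
        unsigned int gfp_mask = FAKE_GFP_USER | FAKE_GFP_MOVABLE;

        /* a highmem source page may be replaced by another highmem page */
        if (source_is_highmem)
                gfp_mask |= FAKE_GFP_HIGHMEM;

        return gfp_mask;
}

int main(void)
{
        printf("lowmem source:  %#x\n", migrate_target_gfp(false));
        printf("highmem source: %#x\n", migrate_target_gfp(true));
        return 0;
}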
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 74c0ddaa6fa0..e642627da6b7 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | |||
120 | } | 120 | } |
121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
122 | #endif | 122 | #endif |
123 | |||
124 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT | ||
125 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
126 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) | ||
127 | { | ||
128 | assert_spin_locked(&mm->page_table_lock); | ||
129 | |||
130 | /* FIFO */ | ||
131 | if (!mm->pmd_huge_pte) | ||
132 | INIT_LIST_HEAD(&pgtable->lru); | ||
133 | else | ||
134 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
135 | mm->pmd_huge_pte = pgtable; | ||
136 | } | ||
137 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
138 | #endif | ||
139 | |||
140 | #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW | ||
141 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
142 | /* no "address" argument, so this destroys page coloring on some architectures */ | ||
143 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) | ||
144 | { | ||
145 | pgtable_t pgtable; | ||
146 | |||
147 | assert_spin_locked(&mm->page_table_lock); | ||
148 | |||
149 | /* FIFO */ | ||
150 | pgtable = mm->pmd_huge_pte; | ||
151 | if (list_empty(&pgtable->lru)) | ||
152 | mm->pmd_huge_pte = NULL; | ||
153 | else { | ||
154 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
155 | struct page, lru); | ||
156 | list_del(&pgtable->lru); | ||
157 | } | ||
158 | return pgtable; | ||
159 | } | ||
160 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
161 | #endif | ||
162 | |||
163 | #ifndef __HAVE_ARCH_PMDP_INVALIDATE | ||
164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
165 | void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | ||
166 | pmd_t *pmdp) | ||
167 | { | ||
168 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); | ||
169 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
170 | } | ||
171 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
172 | #endif | ||
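The generic deposit/withdraw pair parks a preallocated page table on the mm when a huge pmd is set up and hands one back when it is split, so the split path never has to allocate. A much-simplified userspace sketch of that pairing, using a plain linked list instead of struct page and ignoring the kernel's locking and exact ordering (illustration only):

#include <assert.h>
#include <stddef.h>

struct fake_pgtable {
        struct fake_pgtable *next;
        int id;
};

static struct fake_pgtable *deposited;

static void deposit(struct fake_pgtable *pt)
{
        /* park a preallocated table so a later pmd split cannot fail */
        pt->next = deposited;
        deposited = pt;
}

static struct fake_pgtable *withdraw(void)
{
        struct fake_pgtable *pt = deposited;

        assert(pt != NULL);     /* every withdraw must match a prior deposit */
        deposited = pt->next;
        return pt;
}

int main(void)
{
        struct fake_pgtable a = { NULL, 1 };
        struct fake_pgtable b = { NULL, 2 };
        struct fake_pgtable *pt;

        deposit(&a);
        deposit(&b);
        pt = withdraw();
        assert(pt == &b);
        pt = withdraw();
        assert(pt == &a);
        return 0;
}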
diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8c..000000000000 --- a/mm/prio_tree.c +++ /dev/null | |||
@@ -1,208 +0,0 @@ | |||
1 | /* | ||
2 | * mm/prio_tree.c - priority search tree for mapping->i_mmap | ||
3 | * | ||
4 | * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | * | ||
8 | * Based on the radix priority search tree proposed by Edward M. McCreight | ||
9 | * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 | ||
10 | * | ||
11 | * 02Feb2004 Initial version | ||
12 | */ | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/prio_tree.h> | ||
16 | #include <linux/prefetch.h> | ||
17 | |||
18 | /* | ||
19 | * See lib/prio_tree.c for details on the general radix priority search tree | ||
20 | * code. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * The following #defines are mirrored from lib/prio_tree.c. They're only used | ||
25 | * for debugging, and should be removed (along with the debugging code using | ||
26 | * them) when switching also VMAs to the regular prio_tree code. | ||
27 | */ | ||
28 | |||
29 | #define RADIX_INDEX(vma) ((vma)->vm_pgoff) | ||
30 | #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) | ||
31 | /* avoid overflow */ | ||
32 | #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) | ||
33 | |||
34 | /* | ||
35 | * Radix priority search tree for address_space->i_mmap | ||
36 | * | ||
37 | * For each vma that maps a unique set of file pages, i.e., a unique [radix_index, | ||
38 | * heap_index] value, we have a corresponding priority search tree node. If | ||
39 | * multiple vmas have identical [radix_index, heap_index] value, then one of | ||
40 | * them is used as a tree node and others are stored in a vm_set list. The tree | ||
41 | * node points to the first vma (head) of the list using vm_set.head. | ||
42 | * | ||
43 | * prio_tree_root | ||
44 | * | | ||
45 | * A vm_set.head | ||
46 | * / \ / | ||
47 | * L R -> H-I-J-K-M-N-O-P-Q-S | ||
48 | * ^ ^ <-- vm_set.list --> | ||
49 | * tree nodes | ||
50 | * | ||
51 | * We need some way to identify whether a vma is a tree node, head of a vm_set | ||
52 | * list, or just a member of a vm_set list. We cannot use vm_flags to store | ||
53 | * such information. The reason is, in the above figure, it is possible that | ||
54 | * vm_flags' of R and H are covered by the different mmap_sems. When R is | ||
55 | * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold | ||
56 | * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. | ||
57 | * That's why some trick involving shared.vm_set.parent is used for identifying | ||
58 | * tree nodes and list head nodes. | ||
59 | * | ||
60 | * vma radix priority search tree node rules: | ||
61 | * | ||
62 | * vma->shared.vm_set.parent != NULL ==> a tree node | ||
63 | * vma->shared.vm_set.head != NULL ==> list of others mapping same range | ||
64 | * vma->shared.vm_set.head == NULL ==> no others map the same range | ||
65 | * | ||
66 | * vma->shared.vm_set.parent == NULL | ||
67 | * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range | ||
68 | * vma->shared.vm_set.head == NULL ==> a list node | ||
69 | */ | ||
70 | |||
71 | /* | ||
72 | * Add a new vma known to map the same set of pages as the old vma: | ||
73 | * useful for fork's dup_mmap as well as vma_prio_tree_insert below. | ||
74 | * Note that it just happens to work correctly on i_mmap_nonlinear too. | ||
75 | */ | ||
76 | void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) | ||
77 | { | ||
78 | /* Leave these BUG_ONs till prio_tree patch stabilizes */ | ||
79 | BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); | ||
80 | BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); | ||
81 | |||
82 | vma->shared.vm_set.head = NULL; | ||
83 | vma->shared.vm_set.parent = NULL; | ||
84 | |||
85 | if (!old->shared.vm_set.parent) | ||
86 | list_add(&vma->shared.vm_set.list, | ||
87 | &old->shared.vm_set.list); | ||
88 | else if (old->shared.vm_set.head) | ||
89 | list_add_tail(&vma->shared.vm_set.list, | ||
90 | &old->shared.vm_set.head->shared.vm_set.list); | ||
91 | else { | ||
92 | INIT_LIST_HEAD(&vma->shared.vm_set.list); | ||
93 | vma->shared.vm_set.head = old; | ||
94 | old->shared.vm_set.head = vma; | ||
95 | } | ||
96 | } | ||
97 | |||
98 | void vma_prio_tree_insert(struct vm_area_struct *vma, | ||
99 | struct prio_tree_root *root) | ||
100 | { | ||
101 | struct prio_tree_node *ptr; | ||
102 | struct vm_area_struct *old; | ||
103 | |||
104 | vma->shared.vm_set.head = NULL; | ||
105 | |||
106 | ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); | ||
107 | if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { | ||
108 | old = prio_tree_entry(ptr, struct vm_area_struct, | ||
109 | shared.prio_tree_node); | ||
110 | vma_prio_tree_add(vma, old); | ||
111 | } | ||
112 | } | ||
113 | |||
114 | void vma_prio_tree_remove(struct vm_area_struct *vma, | ||
115 | struct prio_tree_root *root) | ||
116 | { | ||
117 | struct vm_area_struct *node, *head, *new_head; | ||
118 | |||
119 | if (!vma->shared.vm_set.head) { | ||
120 | if (!vma->shared.vm_set.parent) | ||
121 | list_del_init(&vma->shared.vm_set.list); | ||
122 | else | ||
123 | raw_prio_tree_remove(root, &vma->shared.prio_tree_node); | ||
124 | } else { | ||
125 | /* Leave this BUG_ON till prio_tree patch stabilizes */ | ||
126 | BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); | ||
127 | if (vma->shared.vm_set.parent) { | ||
128 | head = vma->shared.vm_set.head; | ||
129 | if (!list_empty(&head->shared.vm_set.list)) { | ||
130 | new_head = list_entry( | ||
131 | head->shared.vm_set.list.next, | ||
132 | struct vm_area_struct, | ||
133 | shared.vm_set.list); | ||
134 | list_del_init(&head->shared.vm_set.list); | ||
135 | } else | ||
136 | new_head = NULL; | ||
137 | |||
138 | raw_prio_tree_replace(root, &vma->shared.prio_tree_node, | ||
139 | &head->shared.prio_tree_node); | ||
140 | head->shared.vm_set.head = new_head; | ||
141 | if (new_head) | ||
142 | new_head->shared.vm_set.head = head; | ||
143 | |||
144 | } else { | ||
145 | node = vma->shared.vm_set.head; | ||
146 | if (!list_empty(&vma->shared.vm_set.list)) { | ||
147 | new_head = list_entry( | ||
148 | vma->shared.vm_set.list.next, | ||
149 | struct vm_area_struct, | ||
150 | shared.vm_set.list); | ||
151 | list_del_init(&vma->shared.vm_set.list); | ||
152 | node->shared.vm_set.head = new_head; | ||
153 | new_head->shared.vm_set.head = node; | ||
154 | } else | ||
155 | node->shared.vm_set.head = NULL; | ||
156 | } | ||
157 | } | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Helper function to enumerate vmas that map a given file page or a set of | ||
162 | * contiguous file pages. The function returns vmas that at least map a single | ||
163 | * page in the given range of contiguous file pages. | ||
164 | */ | ||
165 | struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, | ||
166 | struct prio_tree_iter *iter) | ||
167 | { | ||
168 | struct prio_tree_node *ptr; | ||
169 | struct vm_area_struct *next; | ||
170 | |||
171 | if (!vma) { | ||
172 | /* | ||
173 | * First call is with NULL vma | ||
174 | */ | ||
175 | ptr = prio_tree_next(iter); | ||
176 | if (ptr) { | ||
177 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
178 | shared.prio_tree_node); | ||
179 | prefetch(next->shared.vm_set.head); | ||
180 | return next; | ||
181 | } else | ||
182 | return NULL; | ||
183 | } | ||
184 | |||
185 | if (vma->shared.vm_set.parent) { | ||
186 | if (vma->shared.vm_set.head) { | ||
187 | next = vma->shared.vm_set.head; | ||
188 | prefetch(next->shared.vm_set.list.next); | ||
189 | return next; | ||
190 | } | ||
191 | } else { | ||
192 | next = list_entry(vma->shared.vm_set.list.next, | ||
193 | struct vm_area_struct, shared.vm_set.list); | ||
194 | if (!next->shared.vm_set.head) { | ||
195 | prefetch(next->shared.vm_set.list.next); | ||
196 | return next; | ||
197 | } | ||
198 | } | ||
199 | |||
200 | ptr = prio_tree_next(iter); | ||
201 | if (ptr) { | ||
202 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
203 | shared.prio_tree_node); | ||
204 | prefetch(next->shared.vm_set.head); | ||
205 | return next; | ||
206 | } else | ||
207 | return NULL; | ||
208 | } | ||
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
127 | avc->vma = vma; | 127 | avc->vma = vma; |
128 | avc->anon_vma = anon_vma; | 128 | avc->anon_vma = anon_vma; |
129 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 129 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
130 | 130 | anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); | |
131 | /* | ||
132 | * It's critical to add new vmas to the tail of the anon_vma, | ||
133 | * see comment in huge_memory.c:__split_huge_page(). | ||
134 | */ | ||
135 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
136 | } | 131 | } |
137 | 132 | ||
138 | /** | 133 | /** |
@@ -269,51 +264,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |||
269 | } | 264 | } |
270 | 265 | ||
271 | /* | 266 | /* |
272 | * Some rmap walk that needs to find all ptes/hugepmds without false | ||
273 | * negatives (like migrate and split_huge_page) running concurrent | ||
274 | * with operations that copy or move pagetables (like mremap() and | ||
275 | * fork()) to be safe. They depend on the anon_vma "same_anon_vma" | ||
276 | * list to be in a certain order: the dst_vma must be placed after the | ||
277 | * src_vma in the list. This is always guaranteed by fork() but | ||
278 | * mremap() needs to call this function to enforce it in case the | ||
279 | * dst_vma isn't newly allocated and chained with the anon_vma_clone() | ||
280 | * function but just an extension of a pre-existing vma through | ||
281 | * vma_merge. | ||
282 | * | ||
283 | * NOTE: the same_anon_vma list can still be changed by other | ||
284 | * processes while mremap runs because mremap doesn't hold the | ||
285 | * anon_vma mutex to prevent modifications to the list while it | ||
286 | * runs. All we need to enforce is that the relative order of this | ||
287 | * process vmas isn't changing (we don't care about other vmas | ||
288 | * order). Each vma corresponds to an anon_vma_chain structure so | ||
289 | * there's no risk that other processes calling anon_vma_moveto_tail() | ||
290 | * and changing the same_anon_vma list under mremap() will screw with | ||
291 | * the relative order of this process's vmas in the list, because | ||
292 | * they can't alter the order of any vma that belongs to this | ||
293 | * process. And there can't be another anon_vma_moveto_tail() running | ||
294 | * concurrently with mremap() coming from this process because we hold | ||
295 | * the mmap_sem for the whole mremap(). fork() ordering dependency | ||
296 | * also shouldn't be affected because fork() only cares that the | ||
297 | * parent vmas are placed in the list before the child vmas and | ||
298 | * anon_vma_moveto_tail() won't reorder vmas from either the fork() | ||
299 | * parent or child. | ||
300 | */ | ||
301 | void anon_vma_moveto_tail(struct vm_area_struct *dst) | ||
302 | { | ||
303 | struct anon_vma_chain *pavc; | ||
304 | struct anon_vma *root = NULL; | ||
305 | |||
306 | list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { | ||
307 | struct anon_vma *anon_vma = pavc->anon_vma; | ||
308 | VM_BUG_ON(pavc->vma != dst); | ||
309 | root = lock_anon_vma_root(root, anon_vma); | ||
310 | list_del(&pavc->same_anon_vma); | ||
311 | list_add_tail(&pavc->same_anon_vma, &anon_vma->head); | ||
312 | } | ||
313 | unlock_anon_vma_root(root); | ||
314 | } | ||
315 | |||
316 | /* | ||
317 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | 267 | * Attach vma to its own anon_vma, as well as to the anon_vmas that |
318 | * the corresponding VMA in the parent process is attached to. | 268 | * the corresponding VMA in the parent process is attached to. |
319 | * Returns 0 on success, non-zero on failure. | 269 | * Returns 0 on success, non-zero on failure. |
@@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
381 | struct anon_vma *anon_vma = avc->anon_vma; | 331 | struct anon_vma *anon_vma = avc->anon_vma; |
382 | 332 | ||
383 | root = lock_anon_vma_root(root, anon_vma); | 333 | root = lock_anon_vma_root(root, anon_vma); |
384 | list_del(&avc->same_anon_vma); | 334 | anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); |
385 | 335 | ||
386 | /* | 336 | /* |
387 | * Leave empty anon_vmas on the list - we'll need | 337 | * Leave empty anon_vmas on the list - we'll need |
388 | * to free them outside the lock. | 338 | * to free them outside the lock. |
389 | */ | 339 | */ |
390 | if (list_empty(&anon_vma->head)) | 340 | if (RB_EMPTY_ROOT(&anon_vma->rb_root)) |
391 | continue; | 341 | continue; |
392 | 342 | ||
393 | list_del(&avc->same_vma); | 343 | list_del(&avc->same_vma); |
@@ -416,7 +366,7 @@ static void anon_vma_ctor(void *data) | |||
416 | 366 | ||
417 | mutex_init(&anon_vma->mutex); | 367 | mutex_init(&anon_vma->mutex); |
418 | atomic_set(&anon_vma->refcount, 0); | 368 | atomic_set(&anon_vma->refcount, 0); |
419 | INIT_LIST_HEAD(&anon_vma->head); | 369 | anon_vma->rb_root = RB_ROOT; |
420 | } | 370 | } |
421 | 371 | ||
422 | void __init anon_vma_init(void) | 372 | void __init anon_vma_init(void) |
@@ -560,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) | |||
560 | 510 | ||
561 | /* | 511 | /* |
562 | * At what user virtual address is page expected in @vma? | 512 | * At what user virtual address is page expected in @vma? |
563 | * Returns virtual address or -EFAULT if page's index/offset is not | ||
564 | * within the range mapped the @vma. | ||
565 | */ | 513 | */ |
566 | inline unsigned long | 514 | static inline unsigned long |
567 | vma_address(struct page *page, struct vm_area_struct *vma) | 515 | __vma_address(struct page *page, struct vm_area_struct *vma) |
568 | { | 516 | { |
569 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 517 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
570 | unsigned long address; | ||
571 | 518 | ||
572 | if (unlikely(is_vm_hugetlb_page(vma))) | 519 | if (unlikely(is_vm_hugetlb_page(vma))) |
573 | pgoff = page->index << huge_page_order(page_hstate(page)); | 520 | pgoff = page->index << huge_page_order(page_hstate(page)); |
574 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 521 | |
575 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | 522 | return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
576 | /* page should be within @vma mapping range */ | 523 | } |
577 | return -EFAULT; | 524 | |
578 | } | 525 | inline unsigned long |
526 | vma_address(struct page *page, struct vm_area_struct *vma) | ||
527 | { | ||
528 | unsigned long address = __vma_address(page, vma); | ||
529 | |||
530 | /* page should be within @vma mapping range */ | ||
531 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
532 | |||
579 | return address; | 533 | return address; |
580 | } | 534 | } |
581 | 535 | ||
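__vma_address() is plain linear-mapping arithmetic: the page's file offset, shifted into bytes, relative to where the vma starts mapping the file. A standalone sketch of that computation with invented numbers (illustration only; assumes 4K pages):

#include <assert.h>

#define SKETCH_PAGE_SHIFT 12    /* assume 4K pages for the example */

static unsigned long sketch_vma_address(unsigned long vm_start,
                                        unsigned long vm_pgoff,
                                        unsigned long page_pgoff)
{
        /* linear mapping: file offset relative to where the vma starts */
        return vm_start + ((page_pgoff - vm_pgoff) << SKETCH_PAGE_SHIFT);
}

int main(void)
{
        /* vma maps file pages [10, 30) starting at 0x400000; page index 12
         * therefore lands two pages past vm_start */
        assert(sketch_vma_address(0x400000, 10, 12) == 0x400000 + 2 * 4096);
        return 0;
}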
@@ -585,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
585 | */ | 539 | */ |
586 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 540 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
587 | { | 541 | { |
542 | unsigned long address; | ||
588 | if (PageAnon(page)) { | 543 | if (PageAnon(page)) { |
589 | struct anon_vma *page__anon_vma = page_anon_vma(page); | 544 | struct anon_vma *page__anon_vma = page_anon_vma(page); |
590 | /* | 545 | /* |
@@ -600,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
600 | return -EFAULT; | 555 | return -EFAULT; |
601 | } else | 556 | } else |
602 | return -EFAULT; | 557 | return -EFAULT; |
603 | return vma_address(page, vma); | 558 | address = __vma_address(page, vma); |
559 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) | ||
560 | return -EFAULT; | ||
561 | return address; | ||
604 | } | 562 | } |
605 | 563 | ||
606 | /* | 564 | /* |
@@ -674,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
674 | pte_t *pte; | 632 | pte_t *pte; |
675 | spinlock_t *ptl; | 633 | spinlock_t *ptl; |
676 | 634 | ||
677 | address = vma_address(page, vma); | 635 | address = __vma_address(page, vma); |
678 | if (address == -EFAULT) /* out of vma range */ | 636 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) |
679 | return 0; | 637 | return 0; |
680 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); | 638 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); |
681 | if (!pte) /* the page is not in this mm */ | 639 | if (!pte) /* the page is not in this mm */ |
@@ -769,6 +727,7 @@ static int page_referenced_anon(struct page *page, | |||
769 | { | 727 | { |
770 | unsigned int mapcount; | 728 | unsigned int mapcount; |
771 | struct anon_vma *anon_vma; | 729 | struct anon_vma *anon_vma; |
730 | pgoff_t pgoff; | ||
772 | struct anon_vma_chain *avc; | 731 | struct anon_vma_chain *avc; |
773 | int referenced = 0; | 732 | int referenced = 0; |
774 | 733 | ||
@@ -777,11 +736,10 @@ static int page_referenced_anon(struct page *page, | |||
777 | return referenced; | 736 | return referenced; |
778 | 737 | ||
779 | mapcount = page_mapcount(page); | 738 | mapcount = page_mapcount(page); |
780 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 739 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
740 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
781 | struct vm_area_struct *vma = avc->vma; | 741 | struct vm_area_struct *vma = avc->vma; |
782 | unsigned long address = vma_address(page, vma); | 742 | unsigned long address = vma_address(page, vma); |
783 | if (address == -EFAULT) | ||
784 | continue; | ||
785 | /* | 743 | /* |
786 | * If we are reclaiming on behalf of a cgroup, skip | 744 | * If we are reclaiming on behalf of a cgroup, skip |
787 | * counting on behalf of references from different | 745 | * counting on behalf of references from different |
@@ -820,7 +778,6 @@ static int page_referenced_file(struct page *page, | |||
820 | struct address_space *mapping = page->mapping; | 778 | struct address_space *mapping = page->mapping; |
821 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 779 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
822 | struct vm_area_struct *vma; | 780 | struct vm_area_struct *vma; |
823 | struct prio_tree_iter iter; | ||
824 | int referenced = 0; | 781 | int referenced = 0; |
825 | 782 | ||
826 | /* | 783 | /* |
@@ -846,10 +803,8 @@ static int page_referenced_file(struct page *page, | |||
846 | */ | 803 | */ |
847 | mapcount = page_mapcount(page); | 804 | mapcount = page_mapcount(page); |
848 | 805 | ||
849 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 806 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
850 | unsigned long address = vma_address(page, vma); | 807 | unsigned long address = vma_address(page, vma); |
851 | if (address == -EFAULT) | ||
852 | continue; | ||
853 | /* | 808 | /* |
854 | * If we are reclaiming on behalf of a cgroup, skip | 809 | * If we are reclaiming on behalf of a cgroup, skip |
855 | * counting on behalf of references from different | 810 | * counting on behalf of references from different |
@@ -929,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
929 | pte_t entry; | 884 | pte_t entry; |
930 | 885 | ||
931 | flush_cache_page(vma, address, pte_pfn(*pte)); | 886 | flush_cache_page(vma, address, pte_pfn(*pte)); |
932 | entry = ptep_clear_flush_notify(vma, address, pte); | 887 | entry = ptep_clear_flush(vma, address, pte); |
933 | entry = pte_wrprotect(entry); | 888 | entry = pte_wrprotect(entry); |
934 | entry = pte_mkclean(entry); | 889 | entry = pte_mkclean(entry); |
935 | set_pte_at(mm, address, pte, entry); | 890 | set_pte_at(mm, address, pte, entry); |
@@ -937,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
937 | } | 892 | } |
938 | 893 | ||
939 | pte_unmap_unlock(pte, ptl); | 894 | pte_unmap_unlock(pte, ptl); |
895 | |||
896 | if (ret) | ||
897 | mmu_notifier_invalidate_page(mm, address); | ||
940 | out: | 898 | out: |
941 | return ret; | 899 | return ret; |
942 | } | 900 | } |
@@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
945 | { | 903 | { |
946 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 904 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
947 | struct vm_area_struct *vma; | 905 | struct vm_area_struct *vma; |
948 | struct prio_tree_iter iter; | ||
949 | int ret = 0; | 906 | int ret = 0; |
950 | 907 | ||
951 | BUG_ON(PageAnon(page)); | 908 | BUG_ON(PageAnon(page)); |
952 | 909 | ||
953 | mutex_lock(&mapping->i_mmap_mutex); | 910 | mutex_lock(&mapping->i_mmap_mutex); |
954 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 911 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
955 | if (vma->vm_flags & VM_SHARED) { | 912 | if (vma->vm_flags & VM_SHARED) { |
956 | unsigned long address = vma_address(page, vma); | 913 | unsigned long address = vma_address(page, vma); |
957 | if (address == -EFAULT) | ||
958 | continue; | ||
959 | ret += page_mkclean_one(page, vma, address); | 914 | ret += page_mkclean_one(page, vma, address); |
960 | } | 915 | } |
961 | } | 916 | } |
@@ -1128,7 +1083,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
1128 | else | 1083 | else |
1129 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1084 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1130 | __page_set_anon_rmap(page, vma, address, 1); | 1085 | __page_set_anon_rmap(page, vma, address, 1); |
1131 | if (page_evictable(page, vma)) | 1086 | if (!mlocked_vma_newpage(vma, page)) |
1132 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 1087 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
1133 | else | 1088 | else |
1134 | add_page_to_unevictable_list(page); | 1089 | add_page_to_unevictable_list(page); |
@@ -1203,7 +1158,10 @@ void page_remove_rmap(struct page *page) | |||
1203 | } else { | 1158 | } else { |
1204 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1159 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
1205 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1160 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1161 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1206 | } | 1162 | } |
1163 | if (unlikely(PageMlocked(page))) | ||
1164 | clear_page_mlock(page); | ||
1207 | /* | 1165 | /* |
1208 | * It would be tidy to reset the PageAnon mapping here, | 1166 | * It would be tidy to reset the PageAnon mapping here, |
1209 | * but that might overwrite a racing page_add_anon_rmap | 1167 | * but that might overwrite a racing page_add_anon_rmap |
@@ -1213,6 +1171,7 @@ void page_remove_rmap(struct page *page) | |||
1213 | * Leaving it set also helps swapoff to reinstate ptes | 1171 | * Leaving it set also helps swapoff to reinstate ptes |
1214 | * faster for those pages still in swapcache. | 1172 | * faster for those pages still in swapcache. |
1215 | */ | 1173 | */ |
1174 | return; | ||
1216 | out: | 1175 | out: |
1217 | if (!anon) | 1176 | if (!anon) |
1218 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1177 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
@@ -1256,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1256 | 1215 | ||
1257 | /* Nuke the page table entry. */ | 1216 | /* Nuke the page table entry. */ |
1258 | flush_cache_page(vma, address, page_to_pfn(page)); | 1217 | flush_cache_page(vma, address, page_to_pfn(page)); |
1259 | pteval = ptep_clear_flush_notify(vma, address, pte); | 1218 | pteval = ptep_clear_flush(vma, address, pte); |
1260 | 1219 | ||
1261 | /* Move the dirty bit to the physical page now the pte is gone. */ | 1220 | /* Move the dirty bit to the physical page now the pte is gone. */ |
1262 | if (pte_dirty(pteval)) | 1221 | if (pte_dirty(pteval)) |
@@ -1318,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1318 | 1277 | ||
1319 | out_unmap: | 1278 | out_unmap: |
1320 | pte_unmap_unlock(pte, ptl); | 1279 | pte_unmap_unlock(pte, ptl); |
1280 | if (ret != SWAP_FAIL) | ||
1281 | mmu_notifier_invalidate_page(mm, address); | ||
1321 | out: | 1282 | out: |
1322 | return ret; | 1283 | return ret; |
1323 | 1284 | ||
@@ -1382,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1382 | spinlock_t *ptl; | 1343 | spinlock_t *ptl; |
1383 | struct page *page; | 1344 | struct page *page; |
1384 | unsigned long address; | 1345 | unsigned long address; |
1346 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1347 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1385 | unsigned long end; | 1348 | unsigned long end; |
1386 | int ret = SWAP_AGAIN; | 1349 | int ret = SWAP_AGAIN; |
1387 | int locked_vma = 0; | 1350 | int locked_vma = 0; |
@@ -1405,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1405 | if (!pmd_present(*pmd)) | 1368 | if (!pmd_present(*pmd)) |
1406 | return ret; | 1369 | return ret; |
1407 | 1370 | ||
1371 | mmun_start = address; | ||
1372 | mmun_end = end; | ||
1373 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1374 | |||
1408 | /* | 1375 | /* |
1409 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, | 1376 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
1410 | * keep the sem while scanning the cluster for mlocking pages. | 1377 | * keep the sem while scanning the cluster for mlocking pages. |
@@ -1438,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1438 | 1405 | ||
1439 | /* Nuke the page table entry. */ | 1406 | /* Nuke the page table entry. */ |
1440 | flush_cache_page(vma, address, pte_pfn(*pte)); | 1407 | flush_cache_page(vma, address, pte_pfn(*pte)); |
1441 | pteval = ptep_clear_flush_notify(vma, address, pte); | 1408 | pteval = ptep_clear_flush(vma, address, pte); |
1442 | 1409 | ||
1443 | /* If nonlinear, store the file page offset in the pte. */ | 1410 | /* If nonlinear, store the file page offset in the pte. */ |
1444 | if (page->index != linear_page_index(vma, address)) | 1411 | if (page->index != linear_page_index(vma, address)) |
@@ -1454,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1454 | (*mapcount)--; | 1421 | (*mapcount)--; |
1455 | } | 1422 | } |
1456 | pte_unmap_unlock(pte - 1, ptl); | 1423 | pte_unmap_unlock(pte - 1, ptl); |
1424 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1457 | if (locked_vma) | 1425 | if (locked_vma) |
1458 | up_read(&vma->vm_mm->mmap_sem); | 1426 | up_read(&vma->vm_mm->mmap_sem); |
1459 | return ret; | 1427 | return ret; |
@@ -1492,6 +1460,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) | |||
1492 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1460 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1493 | { | 1461 | { |
1494 | struct anon_vma *anon_vma; | 1462 | struct anon_vma *anon_vma; |
1463 | pgoff_t pgoff; | ||
1495 | struct anon_vma_chain *avc; | 1464 | struct anon_vma_chain *avc; |
1496 | int ret = SWAP_AGAIN; | 1465 | int ret = SWAP_AGAIN; |
1497 | 1466 | ||
@@ -1499,7 +1468,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1499 | if (!anon_vma) | 1468 | if (!anon_vma) |
1500 | return ret; | 1469 | return ret; |
1501 | 1470 | ||
1502 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1471 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1472 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1503 | struct vm_area_struct *vma = avc->vma; | 1473 | struct vm_area_struct *vma = avc->vma; |
1504 | unsigned long address; | 1474 | unsigned long address; |
1505 | 1475 | ||
@@ -1516,8 +1486,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1516 | continue; | 1486 | continue; |
1517 | 1487 | ||
1518 | address = vma_address(page, vma); | 1488 | address = vma_address(page, vma); |
1519 | if (address == -EFAULT) | ||
1520 | continue; | ||
1521 | ret = try_to_unmap_one(page, vma, address, flags); | 1489 | ret = try_to_unmap_one(page, vma, address, flags); |
1522 | if (ret != SWAP_AGAIN || !page_mapped(page)) | 1490 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1523 | break; | 1491 | break; |
@@ -1547,7 +1515,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1547 | struct address_space *mapping = page->mapping; | 1515 | struct address_space *mapping = page->mapping; |
1548 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1516 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1549 | struct vm_area_struct *vma; | 1517 | struct vm_area_struct *vma; |
1550 | struct prio_tree_iter iter; | ||
1551 | int ret = SWAP_AGAIN; | 1518 | int ret = SWAP_AGAIN; |
1552 | unsigned long cursor; | 1519 | unsigned long cursor; |
1553 | unsigned long max_nl_cursor = 0; | 1520 | unsigned long max_nl_cursor = 0; |
@@ -1555,10 +1522,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1555 | unsigned int mapcount; | 1522 | unsigned int mapcount; |
1556 | 1523 | ||
1557 | mutex_lock(&mapping->i_mmap_mutex); | 1524 | mutex_lock(&mapping->i_mmap_mutex); |
1558 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1525 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1559 | unsigned long address = vma_address(page, vma); | 1526 | unsigned long address = vma_address(page, vma); |
1560 | if (address == -EFAULT) | ||
1561 | continue; | ||
1562 | ret = try_to_unmap_one(page, vma, address, flags); | 1527 | ret = try_to_unmap_one(page, vma, address, flags); |
1563 | if (ret != SWAP_AGAIN || !page_mapped(page)) | 1528 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1564 | goto out; | 1529 | goto out; |
@@ -1576,7 +1541,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1576 | goto out; | 1541 | goto out; |
1577 | 1542 | ||
1578 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1543 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1579 | shared.vm_set.list) { | 1544 | shared.nonlinear) { |
1580 | cursor = (unsigned long) vma->vm_private_data; | 1545 | cursor = (unsigned long) vma->vm_private_data; |
1581 | if (cursor > max_nl_cursor) | 1546 | if (cursor > max_nl_cursor) |
1582 | max_nl_cursor = cursor; | 1547 | max_nl_cursor = cursor; |
@@ -1608,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1608 | 1573 | ||
1609 | do { | 1574 | do { |
1610 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1575 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1611 | shared.vm_set.list) { | 1576 | shared.nonlinear) { |
1612 | cursor = (unsigned long) vma->vm_private_data; | 1577 | cursor = (unsigned long) vma->vm_private_data; |
1613 | while ( cursor < max_nl_cursor && | 1578 | while ( cursor < max_nl_cursor && |
1614 | cursor < vma->vm_end - vma->vm_start) { | 1579 | cursor < vma->vm_end - vma->vm_start) { |
@@ -1631,7 +1596,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1631 | * in locked vmas). Reset cursor on all unreserved nonlinear | 1596 | * in locked vmas). Reset cursor on all unreserved nonlinear |
1632 | * vmas, now forgetting on which ones it had fallen behind. | 1597 | * vmas, now forgetting on which ones it had fallen behind. |
1633 | */ | 1598 | */ |
1634 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 1599 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
1635 | vma->vm_private_data = NULL; | 1600 | vma->vm_private_data = NULL; |
1636 | out: | 1601 | out: |
1637 | mutex_unlock(&mapping->i_mmap_mutex); | 1602 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -1716,6 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1716 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1681 | struct vm_area_struct *, unsigned long, void *), void *arg) |
1717 | { | 1682 | { |
1718 | struct anon_vma *anon_vma; | 1683 | struct anon_vma *anon_vma; |
1684 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1719 | struct anon_vma_chain *avc; | 1685 | struct anon_vma_chain *avc; |
1720 | int ret = SWAP_AGAIN; | 1686 | int ret = SWAP_AGAIN; |
1721 | 1687 | ||
@@ -1729,11 +1695,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1729 | if (!anon_vma) | 1695 | if (!anon_vma) |
1730 | return ret; | 1696 | return ret; |
1731 | anon_vma_lock(anon_vma); | 1697 | anon_vma_lock(anon_vma); |
1732 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1698 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1733 | struct vm_area_struct *vma = avc->vma; | 1699 | struct vm_area_struct *vma = avc->vma; |
1734 | unsigned long address = vma_address(page, vma); | 1700 | unsigned long address = vma_address(page, vma); |
1735 | if (address == -EFAULT) | ||
1736 | continue; | ||
1737 | ret = rmap_one(page, vma, address, arg); | 1701 | ret = rmap_one(page, vma, address, arg); |
1738 | if (ret != SWAP_AGAIN) | 1702 | if (ret != SWAP_AGAIN) |
1739 | break; | 1703 | break; |
@@ -1748,16 +1712,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | |||
1748 | struct address_space *mapping = page->mapping; | 1712 | struct address_space *mapping = page->mapping; |
1749 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1713 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1750 | struct vm_area_struct *vma; | 1714 | struct vm_area_struct *vma; |
1751 | struct prio_tree_iter iter; | ||
1752 | int ret = SWAP_AGAIN; | 1715 | int ret = SWAP_AGAIN; |
1753 | 1716 | ||
1754 | if (!mapping) | 1717 | if (!mapping) |
1755 | return ret; | 1718 | return ret; |
1756 | mutex_lock(&mapping->i_mmap_mutex); | 1719 | mutex_lock(&mapping->i_mmap_mutex); |
1757 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1720 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1758 | unsigned long address = vma_address(page, vma); | 1721 | unsigned long address = vma_address(page, vma); |
1759 | if (address == -EFAULT) | ||
1760 | continue; | ||
1761 | ret = rmap_one(page, vma, address, arg); | 1722 | ret = rmap_one(page, vma, address, arg); |
1762 | if (ret != SWAP_AGAIN) | 1723 | if (ret != SWAP_AGAIN) |
1763 | break; | 1724 | break; |
diff --git a/mm/shmem.c b/mm/shmem.c index d3752110c8c7..cc12072f8787 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | |||
1339 | { | 1339 | { |
1340 | file_accessed(file); | 1340 | file_accessed(file); |
1341 | vma->vm_ops = &shmem_vm_ops; | 1341 | vma->vm_ops = &shmem_vm_ops; |
1342 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1343 | return 0; | 1342 | return 0; |
1344 | } | 1343 | } |
1345 | 1344 | ||
@@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
2643 | .set_policy = shmem_set_policy, | 2642 | .set_policy = shmem_set_policy, |
2644 | .get_policy = shmem_get_policy, | 2643 | .get_policy = shmem_get_policy, |
2645 | #endif | 2644 | #endif |
2645 | .remap_pages = generic_file_remap_pages, | ||
2646 | }; | 2646 | }; |
2647 | 2647 | ||
2648 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 2648 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
@@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2836 | fput(vma->vm_file); | 2836 | fput(vma->vm_file); |
2837 | vma->vm_file = file; | 2837 | vma->vm_file = file; |
2838 | vma->vm_ops = &shmem_vm_ops; | 2838 | vma->vm_ops = &shmem_vm_ops; |
2839 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
2840 | return 0; | 2839 | return 0; |
2841 | } | 2840 | } |
2842 | 2841 | ||
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page) | |||
446 | } | 446 | } |
447 | EXPORT_SYMBOL(mark_page_accessed); | 447 | EXPORT_SYMBOL(mark_page_accessed); |
448 | 448 | ||
449 | /* | ||
450 | * Order of operations is important: flush the pagevec when it's already | ||
451 | * full, not when adding the last page, to make sure that last page is | ||
452 | * not added to the LRU directly when passed to this function. Because | ||
453 | * mark_page_accessed() (called after this when writing) only activates | ||
454 | * pages that are on the LRU, linear writes in subpage chunks would see | ||
455 | * every PAGEVEC_SIZE page activated, which is unexpected. | ||
456 | */ | ||
449 | void __lru_cache_add(struct page *page, enum lru_list lru) | 457 | void __lru_cache_add(struct page *page, enum lru_list lru) |
450 | { | 458 | { |
451 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; | 459 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; |
452 | 460 | ||
453 | page_cache_get(page); | 461 | page_cache_get(page); |
454 | if (!pagevec_add(pvec, page)) | 462 | if (!pagevec_space(pvec)) |
455 | __pagevec_lru_add(pvec, lru); | 463 | __pagevec_lru_add(pvec, lru); |
464 | pagevec_add(pvec, page); | ||
456 | put_cpu_var(lru_add_pvecs); | 465 | put_cpu_var(lru_add_pvecs); |
457 | } | 466 | } |
458 | EXPORT_SYMBOL(__lru_cache_add); | 467 | EXPORT_SYMBOL(__lru_cache_add); |
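A userspace sketch of the "drain only when already full" ordering described in the comment above, with a stand-in PAGEVEC_SIZE and a printf in place of the real LRU drain (illustration only):

#include <stdio.h>

#define SKETCH_PAGEVEC_SIZE 14          /* PAGEVEC_SIZE stand-in */

static int pvec[SKETCH_PAGEVEC_SIZE];
static int pvec_count;

static void drain_to_lru(void)
{
        printf("draining %d pages to the LRU\n", pvec_count);
        pvec_count = 0;
}

static void lru_cache_add(int page)
{
        /* drain only when the vector is already full, so the page being added
         * now is never pushed to the LRU by its own add */
        if (pvec_count == SKETCH_PAGEVEC_SIZE)
                drain_to_lru();
        pvec[pvec_count++] = page;
}

int main(void)
{
        int i;

        for (i = 0; i < 3 * SKETCH_PAGEVEC_SIZE; i++)
                lru_cache_add(i);
        /* the most recently added pages are still batched, not on the LRU */
        printf("%d pages still batched\n", pvec_count);
        return 0;
}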
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
742 | 751 | ||
743 | SetPageLRU(page_tail); | 752 | SetPageLRU(page_tail); |
744 | 753 | ||
745 | if (page_evictable(page_tail, NULL)) { | 754 | if (page_evictable(page_tail)) { |
746 | if (PageActive(page)) { | 755 | if (PageActive(page)) { |
747 | SetPageActive(page_tail); | 756 | SetPageActive(page_tail); |
748 | active = 1; | 757 | active = 1; |
diff --git a/mm/truncate.c b/mm/truncate.c index 75801acdaac7..d51ce92d6e83 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
107 | 107 | ||
108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
109 | 109 | ||
110 | clear_page_mlock(page); | ||
111 | ClearPageMappedToDisk(page); | 110 | ClearPageMappedToDisk(page); |
112 | delete_from_page_cache(page); | 111 | delete_from_page_cache(page); |
113 | return 0; | 112 | return 0; |
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
132 | if (page_has_private(page) && !try_to_release_page(page, 0)) | 131 | if (page_has_private(page) && !try_to_release_page(page, 0)) |
133 | return 0; | 132 | return 0; |
134 | 133 | ||
135 | clear_page_mlock(page); | ||
136 | ret = remove_mapping(mapping, page); | 134 | ret = remove_mapping(mapping, page); |
137 | 135 | ||
138 | return ret; | 136 | return ret; |
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
398 | if (PageDirty(page)) | 396 | if (PageDirty(page)) |
399 | goto failed; | 397 | goto failed; |
400 | 398 | ||
401 | clear_page_mlock(page); | ||
402 | BUG_ON(page_has_private(page)); | 399 | BUG_ON(page_has_private(page)); |
403 | __delete_from_page_cache(page); | 400 | __delete_from_page_cache(page); |
404 | spin_unlock_irq(&mapping->tree_lock); | 401 | spin_unlock_irq(&mapping->tree_lock); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1d241c..78e08300db21 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
2163 | usize -= PAGE_SIZE; | 2163 | usize -= PAGE_SIZE; |
2164 | } while (usize > 0); | 2164 | } while (usize > 0); |
2165 | 2165 | ||
2166 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 2166 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
2167 | vma->vm_flags |= VM_RESERVED; | ||
2168 | 2167 | ||
2169 | return 0; | 2168 | return 0; |
2170 | } | 2169 | } |
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p) | |||
2572 | { | 2571 | { |
2573 | struct vm_struct *v = p; | 2572 | struct vm_struct *v = p; |
2574 | 2573 | ||
2575 | seq_printf(m, "0x%p-0x%p %7ld", | 2574 | seq_printf(m, "0x%pK-0x%pK %7ld", |
2576 | v->addr, v->addr + v->size, v->size); | 2575 | v->addr, v->addr + v->size, v->size); |
2577 | 2576 | ||
2578 | if (v->caller) | 2577 | if (v->caller) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 99b434b674c0..2624edcfb420 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) | |||
553 | redo: | 553 | redo: |
554 | ClearPageUnevictable(page); | 554 | ClearPageUnevictable(page); |
555 | 555 | ||
556 | if (page_evictable(page, NULL)) { | 556 | if (page_evictable(page)) { |
557 | /* | 557 | /* |
558 | * For evictable pages, we can use the cache. | 558 | * For evictable pages, we can use the cache. |
559 | * In event of a race, worst case is we end up with an | 559 | * In event of a race, worst case is we end up with an |
@@ -587,7 +587,7 @@ redo: | |||
587 | * page is on unevictable list, it never be freed. To avoid that, | 587 | * page is on unevictable list, it never be freed. To avoid that, |
588 | * check after we added it to the list, again. | 588 | * check after we added it to the list, again. |
589 | */ | 589 | */ |
590 | if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { | 590 | if (lru == LRU_UNEVICTABLE && page_evictable(page)) { |
591 | if (!isolate_lru_page(page)) { | 591 | if (!isolate_lru_page(page)) { |
592 | put_page(page); | 592 | put_page(page); |
593 | goto redo; | 593 | goto redo; |
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page, | |||
674 | static unsigned long shrink_page_list(struct list_head *page_list, | 674 | static unsigned long shrink_page_list(struct list_head *page_list, |
675 | struct zone *zone, | 675 | struct zone *zone, |
676 | struct scan_control *sc, | 676 | struct scan_control *sc, |
677 | enum ttu_flags ttu_flags, | ||
677 | unsigned long *ret_nr_dirty, | 678 | unsigned long *ret_nr_dirty, |
678 | unsigned long *ret_nr_writeback) | 679 | unsigned long *ret_nr_writeback, |
680 | bool force_reclaim) | ||
679 | { | 681 | { |
680 | LIST_HEAD(ret_pages); | 682 | LIST_HEAD(ret_pages); |
681 | LIST_HEAD(free_pages); | 683 | LIST_HEAD(free_pages); |
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
689 | 691 | ||
690 | mem_cgroup_uncharge_start(); | 692 | mem_cgroup_uncharge_start(); |
691 | while (!list_empty(page_list)) { | 693 | while (!list_empty(page_list)) { |
692 | enum page_references references; | ||
693 | struct address_space *mapping; | 694 | struct address_space *mapping; |
694 | struct page *page; | 695 | struct page *page; |
695 | int may_enter_fs; | 696 | int may_enter_fs; |
697 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | ||
696 | 698 | ||
697 | cond_resched(); | 699 | cond_resched(); |
698 | 700 | ||
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
707 | 709 | ||
708 | sc->nr_scanned++; | 710 | sc->nr_scanned++; |
709 | 711 | ||
710 | if (unlikely(!page_evictable(page, NULL))) | 712 | if (unlikely(!page_evictable(page))) |
711 | goto cull_mlocked; | 713 | goto cull_mlocked; |
712 | 714 | ||
713 | if (!sc->may_unmap && page_mapped(page)) | 715 | if (!sc->may_unmap && page_mapped(page)) |
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
758 | wait_on_page_writeback(page); | 760 | wait_on_page_writeback(page); |
759 | } | 761 | } |
760 | 762 | ||
761 | references = page_check_references(page, sc); | 763 | if (!force_reclaim) |
764 | references = page_check_references(page, sc); | ||
765 | |||
762 | switch (references) { | 766 | switch (references) { |
763 | case PAGEREF_ACTIVATE: | 767 | case PAGEREF_ACTIVATE: |
764 | goto activate_locked; | 768 | goto activate_locked; |
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
788 | * processes. Try to unmap it here. | 792 | * processes. Try to unmap it here. |
789 | */ | 793 | */ |
790 | if (page_mapped(page) && mapping) { | 794 | if (page_mapped(page) && mapping) { |
791 | switch (try_to_unmap(page, TTU_UNMAP)) { | 795 | switch (try_to_unmap(page, ttu_flags)) { |
792 | case SWAP_FAIL: | 796 | case SWAP_FAIL: |
793 | goto activate_locked; | 797 | goto activate_locked; |
794 | case SWAP_AGAIN: | 798 | case SWAP_AGAIN: |
@@ -960,6 +964,33 @@ keep: | |||
960 | return nr_reclaimed; | 964 | return nr_reclaimed; |
961 | } | 965 | } |
962 | 966 | ||
967 | unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
968 | struct list_head *page_list) | ||
969 | { | ||
970 | struct scan_control sc = { | ||
971 | .gfp_mask = GFP_KERNEL, | ||
972 | .priority = DEF_PRIORITY, | ||
973 | .may_unmap = 1, | ||
974 | }; | ||
975 | unsigned long ret, dummy1, dummy2; | ||
976 | struct page *page, *next; | ||
977 | LIST_HEAD(clean_pages); | ||
978 | |||
979 | list_for_each_entry_safe(page, next, page_list, lru) { | ||
980 | if (page_is_file_cache(page) && !PageDirty(page)) { | ||
981 | ClearPageActive(page); | ||
982 | list_move(&page->lru, &clean_pages); | ||
983 | } | ||
984 | } | ||
985 | |||
986 | ret = shrink_page_list(&clean_pages, zone, &sc, | ||
987 | TTU_UNMAP|TTU_IGNORE_ACCESS, | ||
988 | &dummy1, &dummy2, true); | ||
989 | list_splice(&clean_pages, page_list); | ||
990 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | ||
991 | return ret; | ||
992 | } | ||
993 | |||
963 | /* | 994 | /* |
964 | * Attempt to remove the specified page from its LRU. Only take this page | 995 | * Attempt to remove the specified page from its LRU. Only take this page |
965 | * if it is of the appropriate PageActive status. Pages which are being | 996 | * if it is of the appropriate PageActive status. Pages which are being |
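reclaim_clean_pages_from_list() only considers clean page-cache pages, since those can be dropped without any writeback. A sketch of just that filtering step, using a fake page structure in place of struct page (illustration only):

#include <stdbool.h>
#include <stdio.h>

struct fake_page {
        bool file_cache;        /* stand-in for page_is_file_cache() */
        bool dirty;             /* stand-in for PageDirty() */
};

static int count_clean_reclaim_candidates(const struct fake_page *pages, int n)
{
        int i, candidates = 0;

        for (i = 0; i < n; i++)
                /* only clean page-cache pages can be dropped without writeback */
                if (pages[i].file_cache && !pages[i].dirty)
                        candidates++;

        return candidates;
}

int main(void)
{
        struct fake_page pages[] = {
                { true,  false },       /* clean file page: candidate */
                { true,  true  },       /* dirty file page: skipped */
                { false, false },       /* anonymous page: skipped */
        };

        printf("%d\n", count_clean_reclaim_candidates(pages, 3));  /* prints 1 */
        return 0;
}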
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) | |||
978 | if (!PageLRU(page)) | 1009 | if (!PageLRU(page)) |
979 | return ret; | 1010 | return ret; |
980 | 1011 | ||
981 | /* Do not give back unevictable pages for compaction */ | 1012 | /* Compaction should not handle unevictable pages but CMA can do so */ |
982 | if (PageUnevictable(page)) | 1013 | if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) |
983 | return ret; | 1014 | return ret; |
984 | 1015 | ||
985 | ret = -EBUSY; | 1016 | ret = -EBUSY; |
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1186 | 1217 | ||
1187 | VM_BUG_ON(PageLRU(page)); | 1218 | VM_BUG_ON(PageLRU(page)); |
1188 | list_del(&page->lru); | 1219 | list_del(&page->lru); |
1189 | if (unlikely(!page_evictable(page, NULL))) { | 1220 | if (unlikely(!page_evictable(page))) { |
1190 | spin_unlock_irq(&zone->lru_lock); | 1221 | spin_unlock_irq(&zone->lru_lock); |
1191 | putback_lru_page(page); | 1222 | putback_lru_page(page); |
1192 | spin_lock_irq(&zone->lru_lock); | 1223 | spin_lock_irq(&zone->lru_lock); |
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1278 | if (nr_taken == 0) | 1309 | if (nr_taken == 0) |
1279 | return 0; | 1310 | return 0; |
1280 | 1311 | ||
1281 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, | 1312 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, |
1282 | &nr_dirty, &nr_writeback); | 1313 | &nr_dirty, &nr_writeback, false); |
1283 | 1314 | ||
1284 | spin_lock_irq(&zone->lru_lock); | 1315 | spin_lock_irq(&zone->lru_lock); |
1285 | 1316 | ||
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1439 | page = lru_to_page(&l_hold); | 1470 | page = lru_to_page(&l_hold); |
1440 | list_del(&page->lru); | 1471 | list_del(&page->lru); |
1441 | 1472 | ||
1442 | if (unlikely(!page_evictable(page, NULL))) { | 1473 | if (unlikely(!page_evictable(page))) { |
1443 | putback_lru_page(page); | 1474 | putback_lru_page(page); |
1444 | continue; | 1475 | continue; |
1445 | } | 1476 | } |
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
1729 | return false; | 1760 | return false; |
1730 | } | 1761 | } |
1731 | 1762 | ||
1763 | #ifdef CONFIG_COMPACTION | ||
1764 | /* | ||
1765 | * If compaction is deferred for sc->order then scale the number of pages | ||
1766 | * reclaimed based on the number of consecutive allocation failures | ||
1767 | */ | ||
1768 | static unsigned long scale_for_compaction(unsigned long pages_for_compaction, | ||
1769 | struct lruvec *lruvec, struct scan_control *sc) | ||
1770 | { | ||
1771 | struct zone *zone = lruvec_zone(lruvec); | ||
1772 | |||
1773 | if (zone->compact_order_failed <= sc->order) | ||
1774 | pages_for_compaction <<= zone->compact_defer_shift; | ||
1775 | return pages_for_compaction; | ||
1776 | } | ||
1777 | #else | ||
1778 | static unsigned long scale_for_compaction(unsigned long pages_for_compaction, | ||
1779 | struct lruvec *lruvec, struct scan_control *sc) | ||
1780 | { | ||
1781 | return pages_for_compaction; | ||
1782 | } | ||
1783 | #endif | ||
1784 | |||
1732 | /* | 1785 | /* |
1733 | * Reclaim/compaction is used for high-order allocation requests. It reclaims | 1786 | * Reclaim/compaction is used for high-order allocation requests. It reclaims |
1734 | * order-0 pages before compacting the zone. should_continue_reclaim() returns | 1787 | * order-0 pages before compacting the zone. should_continue_reclaim() returns |
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
1776 | * inactive lists are large enough, continue reclaiming | 1829 | * inactive lists are large enough, continue reclaiming |
1777 | */ | 1830 | */ |
1778 | pages_for_compaction = (2UL << sc->order); | 1831 | pages_for_compaction = (2UL << sc->order); |
1832 | |||
1833 | pages_for_compaction = scale_for_compaction(pages_for_compaction, | ||
1834 | lruvec, sc); | ||
1779 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1835 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1780 | if (nr_swap_pages > 0) | 1836 | if (nr_swap_pages > 0) |
1781 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); | 1837 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); |
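scale_for_compaction() multiplies the reclaim target by 2^compact_defer_shift whenever compaction has already failed at this order, so consecutive failures buy progressively more reclaim before the next attempt. A standalone sketch of the scaling, with plain parameters standing in for the zone's compaction state (illustration only):

#include <stdio.h>

static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
                                          int compact_order_failed,
                                          int order,
                                          unsigned int compact_defer_shift)
{
        /* compaction already failed at (or below) this order: reclaim
         * 2^compact_defer_shift times more before trying again */
        if (compact_order_failed <= order)
                pages_for_compaction <<= compact_defer_shift;

        return pages_for_compaction;
}

int main(void)
{
        /* order-3 request, compaction failed at order 3, deferred twice */
        printf("%lu\n", scale_for_compaction(2UL << 3, 3, 3, 2));  /* prints 64 */
        return 0;
}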
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2839 | */ | 2895 | */ |
2840 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2896 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
2841 | 2897 | ||
2898 | /* | ||
2899 | * Compaction records what page blocks it recently failed to | ||
2900 | * isolate pages from and skips them in future scans. | ||
2901 | * When kswapd is going to sleep, it is reasonable to assume | ||
2902 | * that allocation and compaction may succeed, so reset the cache. | ||
2903 | */ | ||
2904 | reset_isolation_suitable(pgdat); | ||
2905 | |||
2842 | if (!kthread_should_stop()) | 2906 | if (!kthread_should_stop()) |
2843 | schedule(); | 2907 | schedule(); |
2844 | 2908 | ||
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid) | |||
3101 | if (IS_ERR(pgdat->kswapd)) { | 3165 | if (IS_ERR(pgdat->kswapd)) { |
3102 | /* failure at boot is fatal */ | 3166 | /* failure at boot is fatal */ |
3103 | BUG_ON(system_state == SYSTEM_BOOTING); | 3167 | BUG_ON(system_state == SYSTEM_BOOTING); |
3104 | printk("Failed to start kswapd on node %d\n",nid); | ||
3105 | pgdat->kswapd = NULL; | 3168 | pgdat->kswapd = NULL; |
3106 | ret = -1; | 3169 | pr_err("Failed to start kswapd on node %d\n", nid); |
3170 | ret = PTR_ERR(pgdat->kswapd); | ||
3107 | } | 3171 | } |
3108 | return ret; | 3172 | return ret; |
3109 | } | 3173 | } |
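One caution about the rewritten error path above: PTR_ERR() is taken from pgdat->kswapd only after the pointer has been overwritten with NULL, and PTR_ERR(NULL) is 0, so the failure case would report success. A hedged sketch of an ordering that preserves the errno (illustrative only, not the code in this merge):

    #include <linux/err.h>
    #include <linux/kthread.h>
    #include <linux/mmzone.h>
    #include <linux/printk.h>

    /*
     * Sketch only: read the encoded errno out of the ERR_PTR before the
     * cached pointer is cleared. threadfn stands in for mm/vmscan.c's
     * static kswapd() thread function.
     */
    static int kswapd_run_sketch(int nid, int (*threadfn)(void *data))
    {
            pg_data_t *pgdat = NODE_DATA(nid);
            int ret = 0;

            pgdat->kswapd = kthread_run(threadfn, pgdat, "kswapd%d", nid);
            if (IS_ERR(pgdat->kswapd)) {
                    ret = PTR_ERR(pgdat->kswapd);   /* capture the errno first */
                    pr_err("Failed to start kswapd on node %d\n", nid);
                    pgdat->kswapd = NULL;           /* then drop the stale pointer */
            }
            return ret;
    }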
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3350 | /* | 3414 | /* |
3351 | * page_evictable - test whether a page is evictable | 3415 | * page_evictable - test whether a page is evictable |
3352 | * @page: the page to test | 3416 | * @page: the page to test |
3353 | * @vma: the VMA in which the page is or will be mapped, may be NULL | ||
3354 | * | 3417 | * |
3355 | * Test whether page is evictable--i.e., should be placed on active/inactive | 3418 | * Test whether page is evictable--i.e., should be placed on active/inactive |
3356 | * lists vs unevictable list. The vma argument is !NULL when called from the | 3419 | * lists vs unevictable list. |
3357 | * fault path to determine how to instantate a new page. | ||
3358 | * | 3420 | * |
3359 | * Reasons page might not be evictable: | 3421 | * Reasons page might not be evictable: |
3360 | * (1) page's mapping marked unevictable | 3422 | * (1) page's mapping marked unevictable |
3361 | * (2) page is part of an mlocked VMA | 3423 | * (2) page is part of an mlocked VMA |
3362 | * | 3424 | * |
3363 | */ | 3425 | */ |
3364 | int page_evictable(struct page *page, struct vm_area_struct *vma) | 3426 | int page_evictable(struct page *page) |
3365 | { | 3427 | { |
3366 | 3428 | return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); | |
3367 | if (mapping_unevictable(page_mapping(page))) | ||
3368 | return 0; | ||
3369 | |||
3370 | if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) | ||
3371 | return 0; | ||
3372 | |||
3373 | return 1; | ||
3374 | } | 3429 | } |
3375 | 3430 | ||
3376 | #ifdef CONFIG_SHMEM | 3431 | #ifdef CONFIG_SHMEM |
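With the vma argument gone, evictability is a pure function of the page: its mapping must not be marked unevictable and the page itself must not be mlocked. A standalone truth-table check (stubbed predicates, not kernel code) showing the new one-liner matches the old branches once the vma-dependent arm is dropped:

    #include <stdbool.h>
    #include <stdio.h>

    static bool old_page_evictable(bool mapping_unevictable, bool page_mlocked)
    {
            if (mapping_unevictable)
                    return false;
            if (page_mlocked)       /* the vma && mlocked_vma_newpage() arm is gone */
                    return false;
            return true;
    }

    static bool new_page_evictable(bool mapping_unevictable, bool page_mlocked)
    {
            return !mapping_unevictable && !page_mlocked;
    }

    int main(void)
    {
            for (int m = 0; m < 2; m++)
                    for (int l = 0; l < 2; l++)
                            printf("mapping_unevictable=%d mlocked=%d -> old=%d new=%d\n",
                                   m, l, old_page_evictable(m, l), new_page_evictable(m, l));
            return 0;
    }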
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3408 | if (!PageLRU(page) || !PageUnevictable(page)) | 3463 | if (!PageLRU(page) || !PageUnevictable(page)) |
3409 | continue; | 3464 | continue; |
3410 | 3465 | ||
3411 | if (page_evictable(page, NULL)) { | 3466 | if (page_evictable(page)) { |
3412 | enum lru_list lru = page_lru_base_type(page); | 3467 | enum lru_list lru = page_lru_base_type(page); |
3413 | 3468 | ||
3414 | VM_BUG_ON(PageActive(page)); | 3469 | VM_BUG_ON(PageActive(page)); |
diff --git a/mm/vmstat.c b/mm/vmstat.c index b3e3b9d525d0..c7370579111b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu) | |||
495 | atomic_long_add(global_diff[i], &vm_stat[i]); | 495 | atomic_long_add(global_diff[i], &vm_stat[i]); |
496 | } | 496 | } |
497 | 497 | ||
498 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) | ||
499 | { | ||
500 | int i; | ||
501 | |||
502 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
503 | if (pset->vm_stat_diff[i]) { | ||
504 | int v = pset->vm_stat_diff[i]; | ||
505 | pset->vm_stat_diff[i] = 0; | ||
506 | atomic_long_add(v, &zone->vm_stat[i]); | ||
507 | atomic_long_add(v, &vm_stat[i]); | ||
508 | } | ||
509 | } | ||
498 | #endif | 510 | #endif |
499 | 511 | ||
500 | #ifdef CONFIG_NUMA | 512 | #ifdef CONFIG_NUMA |
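The new drain_zonestat() folds one CPU's pending vm_stat_diff deltas into the zone's counters and the global counters, zeroing each delta so it cannot be applied twice. A standalone analogue of that fold-and-clear pattern (C11 atomics stand in for the kernel's atomic_long_t; not kernel code):

    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_ITEMS 3

    static atomic_long zone_stat[NR_ITEMS];     /* per-zone counters */
    static atomic_long global_stat[NR_ITEMS];   /* system-wide counters */

    static void drain_cpu_diffs(int pcp_diff[NR_ITEMS])
    {
            for (int i = 0; i < NR_ITEMS; i++)
                    if (pcp_diff[i]) {
                            int v = pcp_diff[i];

                            pcp_diff[i] = 0;    /* consume the delta exactly once */
                            atomic_fetch_add(&zone_stat[i], v);
                            atomic_fetch_add(&global_stat[i], v);
                    }
    }

    int main(void)
    {
            int diffs[NR_ITEMS] = { 2, 0, -1 }; /* pending per-CPU deltas */

            drain_cpu_diffs(diffs);
            printf("zone[0]=%ld zone[2]=%ld\n",
                   atomic_load(&zone_stat[0]), atomic_load(&zone_stat[2]));
            return 0;
    }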
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = { | |||
722 | "numa_other", | 734 | "numa_other", |
723 | #endif | 735 | #endif |
724 | "nr_anon_transparent_hugepages", | 736 | "nr_anon_transparent_hugepages", |
737 | "nr_free_cma", | ||
725 | "nr_dirty_threshold", | 738 | "nr_dirty_threshold", |
726 | "nr_dirty_background_threshold", | 739 | "nr_dirty_background_threshold", |
727 | 740 | ||
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = { | |||
781 | "unevictable_pgs_munlocked", | 794 | "unevictable_pgs_munlocked", |
782 | "unevictable_pgs_cleared", | 795 | "unevictable_pgs_cleared", |
783 | "unevictable_pgs_stranded", | 796 | "unevictable_pgs_stranded", |
784 | "unevictable_pgs_mlockfreed", | ||
785 | 797 | ||
786 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 798 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
787 | "thp_fault_alloc", | 799 | "thp_fault_alloc", |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccbdfbba9e53..c1d756cc7448 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -221,7 +221,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
221 | kref_init(&req->r_kref); | 221 | kref_init(&req->r_kref); |
222 | init_completion(&req->r_completion); | 222 | init_completion(&req->r_completion); |
223 | init_completion(&req->r_safe_completion); | 223 | init_completion(&req->r_safe_completion); |
224 | rb_init_node(&req->r_node); | ||
225 | INIT_LIST_HEAD(&req->r_unsafe_item); | 224 | INIT_LIST_HEAD(&req->r_unsafe_item); |
226 | INIT_LIST_HEAD(&req->r_linger_item); | 225 | INIT_LIST_HEAD(&req->r_linger_item); |
227 | INIT_LIST_HEAD(&req->r_linger_osd); | 226 | INIT_LIST_HEAD(&req->r_linger_osd); |
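This is fallout from the rbtree library rework in the same series: rb_init_node() is gone because an rb_node needs no preparation before rb_link_node()/rb_insert_color() attach it to a tree. A hedged sketch with a made-up struct item showing the usual insertion idiom; code that must later test membership can still use RB_CLEAR_NODE()/RB_EMPTY_NODE() explicitly:

    #include <linux/rbtree.h>
    #include <linux/types.h>

    struct item {
            struct rb_node node;    /* no rb_init_node() needed before insertion */
            u64 key;
    };

    static void item_insert(struct rb_root *root, struct item *new)
    {
            struct rb_node **p = &root->rb_node, *parent = NULL;

            while (*p) {
                    struct item *cur = rb_entry(*p, struct item, node);

                    parent = *p;
                    p = cur->key < new->key ? &(*p)->rb_right : &(*p)->rb_left;
            }
            rb_link_node(&new->node, parent, p);    /* attach as a leaf */
            rb_insert_color(&new->node, root);      /* rebalance and recolor */
    }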
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 55af8c5b57e6..3a6e8731646c 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c | |||
@@ -485,7 +485,7 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) | |||
485 | return -EACCES; | 485 | return -EACCES; |
486 | } | 486 | } |
487 | 487 | ||
488 | vma->vm_flags |= VM_RESERVED; | 488 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
489 | vma->vm_ops = &sel_mmap_policy_ops; | 489 | vma->vm_ops = &sel_mmap_policy_ops; |
490 | 490 | ||
491 | return 0; | 491 | return 0; |
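VM_RESERVED is removed by this series, so every converted mmap handler in this merge spells out the two behaviors it relied on: VM_DONTEXPAND keeps mremap() from growing the mapping, and VM_DONTDUMP keeps it out of core dumps. A hedged sketch of the pattern with a hypothetical driver (all names illustrative):

    #include <linux/fs.h>
    #include <linux/mm.h>

    static const struct vm_operations_struct mydrv_vm_ops;  /* empty ops for the sketch */

    static int mydrv_mmap(struct file *filp, struct vm_area_struct *vma)
    {
            vma->vm_ops = &mydrv_vm_ops;
            vma->vm_private_data = filp->private_data;
            /* old: vma->vm_flags |= VM_RESERVED; */
            vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
            return 0;
    }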
diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c index 867558c98334..2952ba576fb9 100644 --- a/security/tomoyo/util.c +++ b/security/tomoyo/util.c | |||
@@ -949,18 +949,13 @@ bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, | |||
949 | const char *tomoyo_get_exe(void) | 949 | const char *tomoyo_get_exe(void) |
950 | { | 950 | { |
951 | struct mm_struct *mm = current->mm; | 951 | struct mm_struct *mm = current->mm; |
952 | struct vm_area_struct *vma; | ||
953 | const char *cp = NULL; | 952 | const char *cp = NULL; |
954 | 953 | ||
955 | if (!mm) | 954 | if (!mm) |
956 | return NULL; | 955 | return NULL; |
957 | down_read(&mm->mmap_sem); | 956 | down_read(&mm->mmap_sem); |
958 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 957 | if (mm->exe_file) |
959 | if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { | 958 | cp = tomoyo_realpath_from_path(&mm->exe_file->f_path); |
960 | cp = tomoyo_realpath_from_path(&vma->vm_file->f_path); | ||
961 | break; | ||
962 | } | ||
963 | } | ||
964 | up_read(&mm->mmap_sem); | 959 | up_read(&mm->mmap_sem); |
965 | return cp; | 960 | return cp; |
966 | } | 961 | } |
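The TOMOYO change tracks the removal of VM_EXECUTABLE from the VMA flags elsewhere in this series: the executable is now read from mm->exe_file instead of being found by scanning VMAs. A hedged sketch of the same idea via get_mm_exe_file(), which takes its own reference to the file so the caller does not hold mmap_sem while using it (hypothetical helper, not TOMOYO code):

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Return the current task's executable, or NULL; caller must fput() it. */
    static struct file *current_exe_file_sketch(void)
    {
            struct mm_struct *mm = current->mm;

            if (!mm)
                    return NULL;
            return get_mm_exe_file(mm);
    }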
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 20554eff5a21..5e12e5bacbba 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c | |||
@@ -3039,7 +3039,7 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file | |||
3039 | return -EINVAL; | 3039 | return -EINVAL; |
3040 | area->vm_ops = &snd_pcm_vm_ops_status; | 3040 | area->vm_ops = &snd_pcm_vm_ops_status; |
3041 | area->vm_private_data = substream; | 3041 | area->vm_private_data = substream; |
3042 | area->vm_flags |= VM_RESERVED; | 3042 | area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
3043 | return 0; | 3043 | return 0; |
3044 | } | 3044 | } |
3045 | 3045 | ||
@@ -3076,7 +3076,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file | |||
3076 | return -EINVAL; | 3076 | return -EINVAL; |
3077 | area->vm_ops = &snd_pcm_vm_ops_control; | 3077 | area->vm_ops = &snd_pcm_vm_ops_control; |
3078 | area->vm_private_data = substream; | 3078 | area->vm_private_data = substream; |
3079 | area->vm_flags |= VM_RESERVED; | 3079 | area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
3080 | return 0; | 3080 | return 0; |
3081 | } | 3081 | } |
3082 | #else /* ! coherent mmap */ | 3082 | #else /* ! coherent mmap */ |
@@ -3170,7 +3170,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { | |||
3170 | int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, | 3170 | int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, |
3171 | struct vm_area_struct *area) | 3171 | struct vm_area_struct *area) |
3172 | { | 3172 | { |
3173 | area->vm_flags |= VM_RESERVED; | 3173 | area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
3174 | #ifdef ARCH_HAS_DMA_MMAP_COHERENT | 3174 | #ifdef ARCH_HAS_DMA_MMAP_COHERENT |
3175 | if (!substream->ops->page && | 3175 | if (!substream->ops->page && |
3176 | substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV) | 3176 | substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV) |
diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index c4fd3b1d9592..d0323a693ba2 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c | |||
@@ -262,7 +262,7 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw, | |||
262 | } | 262 | } |
263 | 263 | ||
264 | area->vm_ops = &usb_stream_hwdep_vm_ops; | 264 | area->vm_ops = &usb_stream_hwdep_vm_ops; |
265 | area->vm_flags |= VM_RESERVED; | 265 | area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
266 | area->vm_private_data = us122l; | 266 | area->vm_private_data = us122l; |
267 | atomic_inc(&us122l->mmap_count); | 267 | atomic_inc(&us122l->mmap_count); |
268 | out: | 268 | out: |
diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index 04aafb43a13c..0b34dbc8f302 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c | |||
@@ -82,7 +82,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep * hw, struct file *filp, struct v | |||
82 | us428->us428ctls_sharedmem->CtlSnapShotLast = -2; | 82 | us428->us428ctls_sharedmem->CtlSnapShotLast = -2; |
83 | } | 83 | } |
84 | area->vm_ops = &us428ctls_vm_ops; | 84 | area->vm_ops = &us428ctls_vm_ops; |
85 | area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; | 85 | area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
86 | area->vm_private_data = hw->private_data; | 86 | area->vm_private_data = hw->private_data; |
87 | return 0; | 87 | return 0; |
88 | } | 88 | } |
diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 8e40b6e67e9e..cc56007791e0 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c | |||
@@ -723,7 +723,7 @@ static int snd_usX2Y_hwdep_pcm_mmap(struct snd_hwdep * hw, struct file *filp, st | |||
723 | return -ENODEV; | 723 | return -ENODEV; |
724 | } | 724 | } |
725 | area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops; | 725 | area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops; |
726 | area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; | 726 | area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
727 | area->vm_private_data = hw->private_data; | 727 | area->vm_private_data = hw->private_data; |
728 | return 0; | 728 | return 0; |
729 | } | 729 | } |
diff --git a/tools/perf/util/include/linux/rbtree.h b/tools/perf/util/include/linux/rbtree.h index 2a030c5af3aa..9bcdc844b330 100644 --- a/tools/perf/util/include/linux/rbtree.h +++ b/tools/perf/util/include/linux/rbtree.h | |||
@@ -1,2 +1,3 @@ | |||
1 | #include <stdbool.h> | 1 | #include <stdbool.h> |
2 | #include <stdbool.h> | ||
2 | #include "../../../../include/linux/rbtree.h" | 3 | #include "../../../../include/linux/rbtree.h" |