author	Linus Torvalds <torvalds@linux-foundation.org>	2014-01-21 22:05:45 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-01-21 22:05:45 -0500
commit	df32e43a54d04eda35d2859beaf90e3864d53288 (patch)
tree	7a61cf658b2949bd426285eb9902be7758ced1ba
parent	fbd918a2026d0464ce9c23f57b7de4bcfccdc2e6 (diff)
parent	78d5506e82b21a1a1de68c24182db2c2fe521422 (diff)
Merge branch 'akpm' (incoming from Andrew)
Merge first patch-bomb from Andrew Morton:
- a couple of misc things
- inotify/fsnotify work from Jan
- ocfs2 updates (partial)
- about half of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (117 commits)
mm/migrate: remove unused function, fail_migrate_page()
mm/migrate: remove putback_lru_pages, fix comment on putback_movable_pages
mm/migrate: correct failure handling if !hugepage_migration_support()
mm/migrate: add comment about permanent failure path
mm, page_alloc: warn for non-blockable __GFP_NOFAIL allocation failure
mm: compaction: reset scanner positions immediately when they meet
mm: compaction: do not mark unmovable pageblocks as skipped in async compaction
mm: compaction: detect when scanners meet in isolate_freepages
mm: compaction: reset cached scanner pfn's before reading them
mm: compaction: encapsulate defer reset logic
mm: compaction: trace compaction begin and end
memcg, oom: lock mem_cgroup_print_oom_info
sched: add tracepoints related to NUMA task migration
mm: numa: do not automatically migrate KSM pages
mm: numa: trace tasks that fail migration due to rate limiting
mm: numa: limit scope of lock for NUMA migrate rate limiting
mm: numa: make NUMA-migrate related functions static
lib/show_mem.c: show num_poisoned_pages when oom
mm/hwpoison: add '#' to hwpoison_inject
mm/memblock: use WARN_ONCE when MAX_NUMNODES passed as input parameter
...
139 files changed, 2902 insertions, 2512 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 22d89aa37218..8533f5f9bb2d 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -767,6 +767,7 @@ The "Locked" indicates whether the mapping is locked in memory or not.
 
 MemTotal: 16344972 kB
 MemFree: 13634064 kB
+MemAvailable: 14836172 kB
 Buffers: 3656 kB
 Cached: 1195708 kB
 SwapCached: 0 kB
@@ -799,6 +800,14 @@ AnonHugePages: 49152 kB
 MemTotal: Total usable ram (i.e. physical ram minus a few reserved
 bits and the kernel binary code)
 MemFree: The sum of LowFree+HighFree
+MemAvailable: An estimate of how much memory is available for starting new
+ applications, without swapping. Calculated from MemFree,
+ SReclaimable, the size of the file LRU lists, and the low
+ watermarks in each zone.
+ The estimate takes into account that the system needs some
+ page cache to function well, and that not all reclaimable
+ slab will be reclaimable, due to items being in use. The
+ impact of those factors will vary from system to system.
 Buffers: Relatively temporary storage for raw disk blocks
 shouldn't get tremendously large (20MB or so)
 Cached: in-memory cache for files read from the disk (the
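[Editor's illustration, not part of the patch: MemAvailable is exposed through the normal /proc/meminfo text interface, so userspace can pick it up with a trivial parser. The helper below is a hypothetical sketch; it returns -1 on kernels that do not provide the field.]

/* Hypothetical userspace sketch: read MemAvailable (in kB) from
 * /proc/meminfo, or return -1 if the running kernel lacks the field. */
#include <stdio.h>
#include <string.h>

static long meminfo_available_kb(void)
{
	FILE *f = fopen("/proc/meminfo", "r");
	char line[256];
	long kb = -1;

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "MemAvailable:", 13) == 0) {
			sscanf(line + 13, "%ld", &kb);
			break;
		}
	}
	fclose(f);
	return kb;
}

int main(void)
{
	printf("MemAvailable: %ld kB\n", meminfo_available_kb());
	return 0;
}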
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 1fbd4eb7b64a..9f5481bdc5a4 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -47,6 +47,7 @@ Currently, these files are in /proc/sys/vm:
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
+- overcommit_kbytes
 - overcommit_memory
 - overcommit_ratio
 - page-cluster
@@ -574,6 +575,17 @@ The default value is 0.
 
 ==============================================================
 
+overcommit_kbytes:
+
+When overcommit_memory is set to 2, the committed address space is not
+permitted to exceed swap plus this amount of physical RAM. See below.
+
+Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one
+of them may be specified at a time. Setting one disables the other (which
+then appears as 0 when read).
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
index 8eaa2fc4b8fa..cbfaaa674118 100644
--- a/Documentation/vm/overcommit-accounting
+++ b/Documentation/vm/overcommit-accounting
@@ -14,8 +14,8 @@ The Linux kernel supports the following overcommit handling modes
 
 2 - Don't overcommit. The total address space commit
 for the system is not permitted to exceed swap + a
-configurable percentage (default is 50) of physical RAM.
-Depending on the percentage you use, in most situations
+configurable amount (default is 50%) of physical RAM.
+Depending on the amount you use, in most situations
 this means a process will not be killed while accessing
 pages but will receive errors on memory allocation as
 appropriate.
@@ -26,7 +26,8 @@ The Linux kernel supports the following overcommit handling modes
 
 The overcommit policy is set via the sysctl `vm.overcommit_memory'.
 
-The overcommit percentage is set via `vm.overcommit_ratio'.
+The overcommit amount can be set via `vm.overcommit_ratio' (percentage)
+or `vm.overcommit_kbytes' (absolute value).
 
 The current overcommit limit and amount committed are viewable in
 /proc/meminfo as CommitLimit and Committed_AS respectively.
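[Editor's illustration, not part of the patch: assuming the /proc/sys/vm paths documented above, a minimal sketch of switching strict overcommit from a percentage to an absolute limit could look like this. Per the text above, writing overcommit_kbytes resets overcommit_ratio to 0 (and vice versa); running it requires root.]

/* Hypothetical userspace sketch: enable strict overcommit (mode 2) and
 * cap the commit limit at swap + 4 GB of RAM via overcommit_kbytes. */
#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	write_sysctl("/proc/sys/vm/overcommit_memory", "2");
	/* 4 GB expressed in kB; swap is still added on top of this. */
	write_sysctl("/proc/sys/vm/overcommit_kbytes", "4194304");
	return 0;
}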
diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h
index 58b8c6a0ab1f..99084431d6ae 100644
--- a/arch/arm/include/asm/dma.h
+++ b/arch/arm/include/asm/dma.h
@@ -8,8 +8,8 @@ | |||
8 | #define MAX_DMA_ADDRESS 0xffffffffUL | 8 | #define MAX_DMA_ADDRESS 0xffffffffUL |
9 | #else | 9 | #else |
10 | #define MAX_DMA_ADDRESS ({ \ | 10 | #define MAX_DMA_ADDRESS ({ \ |
11 | extern unsigned long arm_dma_zone_size; \ | 11 | extern phys_addr_t arm_dma_zone_size; \ |
12 | arm_dma_zone_size ? \ | 12 | arm_dma_zone_size && arm_dma_zone_size < (0x10000000 - PAGE_OFFSET) ? \ |
13 | (PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; }) | 13 | (PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; }) |
14 | #endif | 14 | #endif |
15 | 15 | ||
diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c
index 34d5fd585bbb..f751714d52c1 100644
--- a/arch/arm/kernel/devtree.c
+++ b/arch/arm/kernel/devtree.c
@@ -33,7 +33,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) | |||
33 | 33 | ||
34 | void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) | 34 | void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) |
35 | { | 35 | { |
36 | return alloc_bootmem_align(size, align); | 36 | return memblock_virt_alloc(size, align); |
37 | } | 37 | } |
38 | 38 | ||
39 | void __init arm_dt_memblock_reserve(void) | 39 | void __init arm_dt_memblock_reserve(void) |
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 987a7f5bce5f..8ce1cbd08dba 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -717,7 +717,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) | |||
717 | kernel_data.end = virt_to_phys(_end - 1); | 717 | kernel_data.end = virt_to_phys(_end - 1); |
718 | 718 | ||
719 | for_each_memblock(memory, region) { | 719 | for_each_memblock(memory, region) { |
720 | res = alloc_bootmem_low(sizeof(*res)); | 720 | res = memblock_virt_alloc(sizeof(*res), 0); |
721 | res->name = "System RAM"; | 721 | res->name = "System RAM"; |
722 | res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); | 722 | res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); |
723 | res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; | 723 | res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; |
diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c
index 8a1b5e0bad40..f7a6fd35b1e4 100644
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -2791,9 +2791,7 @@ static int __init _alloc_links(struct omap_hwmod_link **ml, | |||
2791 | sz = sizeof(struct omap_hwmod_link) * LINKS_PER_OCP_IF; | 2791 | sz = sizeof(struct omap_hwmod_link) * LINKS_PER_OCP_IF; |
2792 | 2792 | ||
2793 | *sl = NULL; | 2793 | *sl = NULL; |
2794 | *ml = alloc_bootmem(sz); | 2794 | *ml = memblock_virt_alloc(sz, 0); |
2795 | |||
2796 | memset(*ml, 0, sz); | ||
2797 | 2795 | ||
2798 | *sl = (void *)(*ml) + sizeof(struct omap_hwmod_link); | 2796 | *sl = (void *)(*ml) + sizeof(struct omap_hwmod_link); |
2799 | 2797 | ||
@@ -2912,9 +2910,7 @@ static int __init _alloc_linkspace(struct omap_hwmod_ocp_if **ois) | |||
2912 | pr_debug("omap_hwmod: %s: allocating %d byte linkspace (%d links)\n", | 2910 | pr_debug("omap_hwmod: %s: allocating %d byte linkspace (%d links)\n", |
2913 | __func__, sz, max_ls); | 2911 | __func__, sz, max_ls); |
2914 | 2912 | ||
2915 | linkspace = alloc_bootmem(sz); | 2913 | linkspace = memblock_virt_alloc(sz, 0); |
2916 | |||
2917 | memset(linkspace, 0, sz); | ||
2918 | 2914 | ||
2919 | return 0; | 2915 | return 0; |
2920 | } | 2916 | } |
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 3e8f106ee5fe..11eb8add7820 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -92,9 +92,6 @@ void show_mem(unsigned int filter) | |||
92 | printk("Mem-info:\n"); | 92 | printk("Mem-info:\n"); |
93 | show_free_areas(filter); | 93 | show_free_areas(filter); |
94 | 94 | ||
95 | if (filter & SHOW_MEM_FILTER_PAGE_COUNT) | ||
96 | return; | ||
97 | |||
98 | for_each_bank (i, mi) { | 95 | for_each_bank (i, mi) { |
99 | struct membank *bank = &mi->bank[i]; | 96 | struct membank *bank = &mi->bank[i]; |
100 | unsigned int pfn1, pfn2; | 97 | unsigned int pfn1, pfn2; |
@@ -461,7 +458,7 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn) | |||
461 | * free the section of the memmap array. | 458 | * free the section of the memmap array. |
462 | */ | 459 | */ |
463 | if (pg < pgend) | 460 | if (pg < pgend) |
464 | free_bootmem(pg, pgend - pg); | 461 | memblock_free_early(pg, pgend - pg); |
465 | } | 462 | } |
466 | 463 | ||
467 | /* | 464 | /* |
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index da5237d636d6..52715a71aede 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -31,74 +31,6 @@ | |||
31 | static unsigned long max_gap; | 31 | static unsigned long max_gap; |
32 | #endif | 32 | #endif |
33 | 33 | ||
34 | /** | ||
35 | * show_mem - give short summary of memory stats | ||
36 | * | ||
37 | * Shows a simple page count of reserved and used pages in the system. | ||
38 | * For discontig machines, it does this on a per-pgdat basis. | ||
39 | */ | ||
40 | void show_mem(unsigned int filter) | ||
41 | { | ||
42 | int i, total_reserved = 0; | ||
43 | int total_shared = 0, total_cached = 0; | ||
44 | unsigned long total_present = 0; | ||
45 | pg_data_t *pgdat; | ||
46 | |||
47 | printk(KERN_INFO "Mem-info:\n"); | ||
48 | show_free_areas(filter); | ||
49 | printk(KERN_INFO "Node memory in pages:\n"); | ||
50 | if (filter & SHOW_MEM_FILTER_PAGE_COUNT) | ||
51 | return; | ||
52 | for_each_online_pgdat(pgdat) { | ||
53 | unsigned long present; | ||
54 | unsigned long flags; | ||
55 | int shared = 0, cached = 0, reserved = 0; | ||
56 | int nid = pgdat->node_id; | ||
57 | |||
58 | if (skip_free_areas_node(filter, nid)) | ||
59 | continue; | ||
60 | pgdat_resize_lock(pgdat, &flags); | ||
61 | present = pgdat->node_present_pages; | ||
62 | for(i = 0; i < pgdat->node_spanned_pages; i++) { | ||
63 | struct page *page; | ||
64 | if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) | ||
65 | touch_nmi_watchdog(); | ||
66 | if (pfn_valid(pgdat->node_start_pfn + i)) | ||
67 | page = pfn_to_page(pgdat->node_start_pfn + i); | ||
68 | else { | ||
69 | #ifdef CONFIG_VIRTUAL_MEM_MAP | ||
70 | if (max_gap < LARGE_GAP) | ||
71 | continue; | ||
72 | #endif | ||
73 | i = vmemmap_find_next_valid_pfn(nid, i) - 1; | ||
74 | continue; | ||
75 | } | ||
76 | if (PageReserved(page)) | ||
77 | reserved++; | ||
78 | else if (PageSwapCache(page)) | ||
79 | cached++; | ||
80 | else if (page_count(page)) | ||
81 | shared += page_count(page)-1; | ||
82 | } | ||
83 | pgdat_resize_unlock(pgdat, &flags); | ||
84 | total_present += present; | ||
85 | total_reserved += reserved; | ||
86 | total_cached += cached; | ||
87 | total_shared += shared; | ||
88 | printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, " | ||
89 | "shrd: %10d, swpd: %10d\n", nid, | ||
90 | present, reserved, shared, cached); | ||
91 | } | ||
92 | printk(KERN_INFO "%ld pages of RAM\n", total_present); | ||
93 | printk(KERN_INFO "%d reserved pages\n", total_reserved); | ||
94 | printk(KERN_INFO "%d pages shared\n", total_shared); | ||
95 | printk(KERN_INFO "%d pages swap cached\n", total_cached); | ||
96 | printk(KERN_INFO "Total of %ld pages in page table cache\n", | ||
97 | quicklist_total_size()); | ||
98 | printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); | ||
99 | } | ||
100 | |||
101 | |||
102 | /* physical address where the bootmem map is located */ | 34 | /* physical address where the bootmem map is located */ |
103 | unsigned long bootmap_start; | 35 | unsigned long bootmap_start; |
104 | 36 | ||
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 2de08f4d9930..878626805369 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -608,69 +608,6 @@ void *per_cpu_init(void) | |||
608 | #endif /* CONFIG_SMP */ | 608 | #endif /* CONFIG_SMP */ |
609 | 609 | ||
610 | /** | 610 | /** |
611 | * show_mem - give short summary of memory stats | ||
612 | * | ||
613 | * Shows a simple page count of reserved and used pages in the system. | ||
614 | * For discontig machines, it does this on a per-pgdat basis. | ||
615 | */ | ||
616 | void show_mem(unsigned int filter) | ||
617 | { | ||
618 | int i, total_reserved = 0; | ||
619 | int total_shared = 0, total_cached = 0; | ||
620 | unsigned long total_present = 0; | ||
621 | pg_data_t *pgdat; | ||
622 | |||
623 | printk(KERN_INFO "Mem-info:\n"); | ||
624 | show_free_areas(filter); | ||
625 | if (filter & SHOW_MEM_FILTER_PAGE_COUNT) | ||
626 | return; | ||
627 | printk(KERN_INFO "Node memory in pages:\n"); | ||
628 | for_each_online_pgdat(pgdat) { | ||
629 | unsigned long present; | ||
630 | unsigned long flags; | ||
631 | int shared = 0, cached = 0, reserved = 0; | ||
632 | int nid = pgdat->node_id; | ||
633 | |||
634 | if (skip_free_areas_node(filter, nid)) | ||
635 | continue; | ||
636 | pgdat_resize_lock(pgdat, &flags); | ||
637 | present = pgdat->node_present_pages; | ||
638 | for(i = 0; i < pgdat->node_spanned_pages; i++) { | ||
639 | struct page *page; | ||
640 | if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) | ||
641 | touch_nmi_watchdog(); | ||
642 | if (pfn_valid(pgdat->node_start_pfn + i)) | ||
643 | page = pfn_to_page(pgdat->node_start_pfn + i); | ||
644 | else { | ||
645 | i = vmemmap_find_next_valid_pfn(nid, i) - 1; | ||
646 | continue; | ||
647 | } | ||
648 | if (PageReserved(page)) | ||
649 | reserved++; | ||
650 | else if (PageSwapCache(page)) | ||
651 | cached++; | ||
652 | else if (page_count(page)) | ||
653 | shared += page_count(page)-1; | ||
654 | } | ||
655 | pgdat_resize_unlock(pgdat, &flags); | ||
656 | total_present += present; | ||
657 | total_reserved += reserved; | ||
658 | total_cached += cached; | ||
659 | total_shared += shared; | ||
660 | printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, " | ||
661 | "shrd: %10d, swpd: %10d\n", nid, | ||
662 | present, reserved, shared, cached); | ||
663 | } | ||
664 | printk(KERN_INFO "%ld pages of RAM\n", total_present); | ||
665 | printk(KERN_INFO "%d reserved pages\n", total_reserved); | ||
666 | printk(KERN_INFO "%d pages shared\n", total_shared); | ||
667 | printk(KERN_INFO "%d pages swap cached\n", total_cached); | ||
668 | printk(KERN_INFO "Total of %ld pages in page table cache\n", | ||
669 | quicklist_total_size()); | ||
670 | printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); | ||
671 | } | ||
672 | |||
673 | /** | ||
674 | * call_pernode_memory - use SRAT to call callback functions with node info | 611 | * call_pernode_memory - use SRAT to call callback functions with node info |
675 | * @start: physical start of range | 612 | * @start: physical start of range |
676 | * @len: length of range | 613 | * @len: length of range |
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 88504abf5704..25c350264a41 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -684,3 +684,51 @@ per_linux32_init(void) | |||
684 | } | 684 | } |
685 | 685 | ||
686 | __initcall(per_linux32_init); | 686 | __initcall(per_linux32_init); |
687 | |||
688 | /** | ||
689 | * show_mem - give short summary of memory stats | ||
690 | * | ||
691 | * Shows a simple page count of reserved and used pages in the system. | ||
692 | * For discontig machines, it does this on a per-pgdat basis. | ||
693 | */ | ||
694 | void show_mem(unsigned int filter) | ||
695 | { | ||
696 | int total_reserved = 0; | ||
697 | unsigned long total_present = 0; | ||
698 | pg_data_t *pgdat; | ||
699 | |||
700 | printk(KERN_INFO "Mem-info:\n"); | ||
701 | show_free_areas(filter); | ||
702 | printk(KERN_INFO "Node memory in pages:\n"); | ||
703 | for_each_online_pgdat(pgdat) { | ||
704 | unsigned long present; | ||
705 | unsigned long flags; | ||
706 | int reserved = 0; | ||
707 | int nid = pgdat->node_id; | ||
708 | int zoneid; | ||
709 | |||
710 | if (skip_free_areas_node(filter, nid)) | ||
711 | continue; | ||
712 | pgdat_resize_lock(pgdat, &flags); | ||
713 | |||
714 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | ||
715 | struct zone *zone = &pgdat->node_zones[zoneid]; | ||
716 | if (!populated_zone(zone)) | ||
717 | continue; | ||
718 | |||
719 | reserved += zone->present_pages - zone->managed_pages; | ||
720 | } | ||
721 | present = pgdat->node_present_pages; | ||
722 | |||
723 | pgdat_resize_unlock(pgdat, &flags); | ||
724 | total_present += present; | ||
725 | total_reserved += reserved; | ||
726 | printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, ", | ||
727 | nid, present, reserved); | ||
728 | } | ||
729 | printk(KERN_INFO "%ld pages of RAM\n", total_present); | ||
730 | printk(KERN_INFO "%d reserved pages\n", total_reserved); | ||
731 | printk(KERN_INFO "Total of %ld pages in page table cache\n", | ||
732 | quicklist_total_size()); | ||
733 | printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); | ||
734 | } | ||
diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index 3cd6288f65c2..11fa51c89617 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -204,7 +204,8 @@ static void __init do_init_bootmem(void) | |||
204 | start_pfn = memblock_region_memory_base_pfn(reg); | 204 | start_pfn = memblock_region_memory_base_pfn(reg); |
205 | end_pfn = memblock_region_memory_end_pfn(reg); | 205 | end_pfn = memblock_region_memory_end_pfn(reg); |
206 | memblock_set_node(PFN_PHYS(start_pfn), | 206 | memblock_set_node(PFN_PHYS(start_pfn), |
207 | PFN_PHYS(end_pfn - start_pfn), 0); | 207 | PFN_PHYS(end_pfn - start_pfn), |
208 | &memblock.memory, 0); | ||
208 | } | 209 | } |
209 | 210 | ||
210 | /* All of system RAM sits in node 0 for the non-NUMA case */ | 211 | /* All of system RAM sits in node 0 for the non-NUMA case */ |
diff --git a/arch/metag/mm/numa.c b/arch/metag/mm/numa.c
index b172aa45fcf8..67b46c295072 100644
--- a/arch/metag/mm/numa.c
+++ b/arch/metag/mm/numa.c
@@ -42,7 +42,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end) | |||
42 | memblock_add(start, end - start); | 42 | memblock_add(start, end - start); |
43 | 43 | ||
44 | memblock_set_node(PFN_PHYS(start_pfn), | 44 | memblock_set_node(PFN_PHYS(start_pfn), |
45 | PFN_PHYS(end_pfn - start_pfn), nid); | 45 | PFN_PHYS(end_pfn - start_pfn), |
46 | &memblock.memory, nid); | ||
46 | 47 | ||
47 | /* Node-local pgdat */ | 48 | /* Node-local pgdat */ |
48 | pgdat_paddr = memblock_alloc_base(sizeof(struct pglist_data), | 49 | pgdat_paddr = memblock_alloc_base(sizeof(struct pglist_data), |
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 74c7bcc1e82d..89077d346714 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -192,7 +192,8 @@ void __init setup_memory(void) | |||
192 | start_pfn = memblock_region_memory_base_pfn(reg); | 192 | start_pfn = memblock_region_memory_base_pfn(reg); |
193 | end_pfn = memblock_region_memory_end_pfn(reg); | 193 | end_pfn = memblock_region_memory_end_pfn(reg); |
194 | memblock_set_node(start_pfn << PAGE_SHIFT, | 194 | memblock_set_node(start_pfn << PAGE_SHIFT, |
195 | (end_pfn - start_pfn) << PAGE_SHIFT, 0); | 195 | (end_pfn - start_pfn) << PAGE_SHIFT, |
196 | &memblock.memory, 0); | ||
196 | } | 197 | } |
197 | 198 | ||
198 | /* free bootmem is whole main memory */ | 199 | /* free bootmem is whole main memory */ |
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 96f8168cf4ec..ae085ad0fba0 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -645,55 +645,30 @@ EXPORT_SYMBOL(empty_zero_page); | |||
645 | 645 | ||
646 | void show_mem(unsigned int filter) | 646 | void show_mem(unsigned int filter) |
647 | { | 647 | { |
648 | int i,free = 0,total = 0,reserved = 0; | 648 | int total = 0,reserved = 0; |
649 | int shared = 0, cached = 0; | 649 | pg_data_t *pgdat; |
650 | 650 | ||
651 | printk(KERN_INFO "Mem-info:\n"); | 651 | printk(KERN_INFO "Mem-info:\n"); |
652 | show_free_areas(filter); | 652 | show_free_areas(filter); |
653 | if (filter & SHOW_MEM_FILTER_PAGE_COUNT) | ||
654 | return; | ||
655 | #ifndef CONFIG_DISCONTIGMEM | ||
656 | i = max_mapnr; | ||
657 | while (i-- > 0) { | ||
658 | total++; | ||
659 | if (PageReserved(mem_map+i)) | ||
660 | reserved++; | ||
661 | else if (PageSwapCache(mem_map+i)) | ||
662 | cached++; | ||
663 | else if (!page_count(&mem_map[i])) | ||
664 | free++; | ||
665 | else | ||
666 | shared += page_count(&mem_map[i]) - 1; | ||
667 | } | ||
668 | #else | ||
669 | for (i = 0; i < npmem_ranges; i++) { | ||
670 | int j; | ||
671 | 653 | ||
672 | for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { | 654 | for_each_online_pgdat(pgdat) { |
673 | struct page *p; | 655 | unsigned long flags; |
674 | unsigned long flags; | 656 | int zoneid; |
675 | 657 | ||
676 | pgdat_resize_lock(NODE_DATA(i), &flags); | 658 | pgdat_resize_lock(pgdat, &flags); |
677 | p = nid_page_nr(i, j) - node_start_pfn(i); | 659 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { |
678 | 660 | struct zone *zone = &pgdat->node_zones[zoneid]; | |
679 | total++; | 661 | if (!populated_zone(zone)) |
680 | if (PageReserved(p)) | 662 | continue; |
681 | reserved++; | 663 | |
682 | else if (PageSwapCache(p)) | 664 | total += zone->present_pages; |
683 | cached++; | 665 | reserved = zone->present_pages - zone->managed_pages; |
684 | else if (!page_count(p)) | 666 | } |
685 | free++; | 667 | pgdat_resize_unlock(pgdat, &flags); |
686 | else | ||
687 | shared += page_count(p) - 1; | ||
688 | pgdat_resize_unlock(NODE_DATA(i), &flags); | ||
689 | } | ||
690 | } | 668 | } |
691 | #endif | 669 | |
692 | printk(KERN_INFO "%d pages of RAM\n", total); | 670 | printk(KERN_INFO "%d pages of RAM\n", total); |
693 | printk(KERN_INFO "%d reserved pages\n", reserved); | 671 | printk(KERN_INFO "%d reserved pages\n", reserved); |
694 | printk(KERN_INFO "%d pages shared\n", shared); | ||
695 | printk(KERN_INFO "%d pages swap cached\n", cached); | ||
696 | |||
697 | 672 | ||
698 | #ifdef CONFIG_DISCONTIGMEM | 673 | #ifdef CONFIG_DISCONTIGMEM |
699 | { | 674 | { |
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 3fa93dc7fe75..8c1dd23652a1 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -209,7 +209,7 @@ void __init do_init_bootmem(void) | |||
209 | /* Place all memblock_regions in the same node and merge contiguous | 209 | /* Place all memblock_regions in the same node and merge contiguous |
210 | * memblock_regions | 210 | * memblock_regions |
211 | */ | 211 | */ |
212 | memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); | 212 | memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); |
213 | 213 | ||
214 | /* Add all physical memory to the bootmem map, mark each area | 214 | /* Add all physical memory to the bootmem map, mark each area |
215 | * present. | 215 | * present. |
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 078d3e00a616..5a944f25e94f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -670,7 +670,8 @@ static void __init parse_drconf_memory(struct device_node *memory) | |||
670 | node_set_online(nid); | 670 | node_set_online(nid); |
671 | sz = numa_enforce_memory_limit(base, size); | 671 | sz = numa_enforce_memory_limit(base, size); |
672 | if (sz) | 672 | if (sz) |
673 | memblock_set_node(base, sz, nid); | 673 | memblock_set_node(base, sz, |
674 | &memblock.memory, nid); | ||
674 | } while (--ranges); | 675 | } while (--ranges); |
675 | } | 676 | } |
676 | } | 677 | } |
@@ -760,7 +761,7 @@ new_range: | |||
760 | continue; | 761 | continue; |
761 | } | 762 | } |
762 | 763 | ||
763 | memblock_set_node(start, size, nid); | 764 | memblock_set_node(start, size, &memblock.memory, nid); |
764 | 765 | ||
765 | if (--ranges) | 766 | if (--ranges) |
766 | goto new_range; | 767 | goto new_range; |
@@ -797,7 +798,8 @@ static void __init setup_nonnuma(void) | |||
797 | 798 | ||
798 | fake_numa_create_new_node(end_pfn, &nid); | 799 | fake_numa_create_new_node(end_pfn, &nid); |
799 | memblock_set_node(PFN_PHYS(start_pfn), | 800 | memblock_set_node(PFN_PHYS(start_pfn), |
800 | PFN_PHYS(end_pfn - start_pfn), nid); | 801 | PFN_PHYS(end_pfn - start_pfn), |
802 | &memblock.memory, nid); | ||
801 | node_set_online(nid); | 803 | node_set_online(nid); |
802 | } | 804 | } |
803 | } | 805 | } |
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 305f7ee1f382..c75d06aa27c3 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -2,7 +2,6 @@ menu "Machine selection" | |||
2 | 2 | ||
3 | config SCORE | 3 | config SCORE |
4 | def_bool y | 4 | def_bool y |
5 | select HAVE_GENERIC_HARDIRQS | ||
6 | select GENERIC_IRQ_SHOW | 5 | select GENERIC_IRQ_SHOW |
7 | select GENERIC_IOMAP | 6 | select GENERIC_IOMAP |
8 | select GENERIC_ATOMIC64 | 7 | select GENERIC_ATOMIC64 |
diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c
index 38b313909ac9..adad46e41a1d 100644
--- a/arch/sh/kernel/kgdb.c
+++ b/arch/sh/kernel/kgdb.c
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/kdebug.h> | 13 | #include <linux/kdebug.h> |
14 | #include <linux/irq.h> | 14 | #include <linux/irq.h> |
15 | #include <linux/io.h> | 15 | #include <linux/io.h> |
16 | #include <linux/sched.h> | ||
16 | #include <asm/cacheflush.h> | 17 | #include <asm/cacheflush.h> |
17 | #include <asm/traps.h> | 18 | #include <asm/traps.h> |
18 | 19 | ||
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 1cf90e947dbf..de19cfa768f2 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -230,8 +230,8 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, | |||
230 | pmb_bolt_mapping((unsigned long)__va(start), start, end - start, | 230 | pmb_bolt_mapping((unsigned long)__va(start), start, end - start, |
231 | PAGE_KERNEL); | 231 | PAGE_KERNEL); |
232 | 232 | ||
233 | memblock_set_node(PFN_PHYS(start_pfn), | 233 | memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), |
234 | PFN_PHYS(end_pfn - start_pfn), nid); | 234 | &memblock.memory, nid); |
235 | } | 235 | } |
236 | 236 | ||
237 | void __init __weak plat_early_device_setup(void) | 237 | void __init __weak plat_early_device_setup(void) |
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 5322e530d09c..eafbc65c9c47 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1021,7 +1021,8 @@ static void __init add_node_ranges(void) | |||
1021 | "start[%lx] end[%lx]\n", | 1021 | "start[%lx] end[%lx]\n", |
1022 | nid, start, this_end); | 1022 | nid, start, this_end); |
1023 | 1023 | ||
1024 | memblock_set_node(start, this_end - start, nid); | 1024 | memblock_set_node(start, this_end - start, |
1025 | &memblock.memory, nid); | ||
1025 | start = this_end; | 1026 | start = this_end; |
1026 | } | 1027 | } |
1027 | } | 1028 | } |
@@ -1325,7 +1326,7 @@ static void __init bootmem_init_nonnuma(void) | |||
1325 | (top_of_ram - total_ram) >> 20); | 1326 | (top_of_ram - total_ram) >> 20); |
1326 | 1327 | ||
1327 | init_node_masks_nonnuma(); | 1328 | init_node_masks_nonnuma(); |
1328 | memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); | 1329 | memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); |
1329 | allocate_node_data(0); | 1330 | allocate_node_data(0); |
1330 | node_set_online(0); | 1331 | node_set_online(0); |
1331 | } | 1332 | } |
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index ae6bc036db92..be2bde9b07cf 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -66,9 +66,6 @@ void show_mem(unsigned int filter) | |||
66 | printk(KERN_DEFAULT "Mem-info:\n"); | 66 | printk(KERN_DEFAULT "Mem-info:\n"); |
67 | show_free_areas(filter); | 67 | show_free_areas(filter); |
68 | 68 | ||
69 | if (filter & SHOW_MEM_FILTER_PAGE_COUNT) | ||
70 | return; | ||
71 | |||
72 | for_each_bank(i, mi) { | 69 | for_each_bank(i, mi) { |
73 | struct membank *bank = &mi->bank[i]; | 70 | struct membank *bank = &mi->bank[i]; |
74 | unsigned int pfn1, pfn2; | 71 | unsigned int pfn1, pfn2; |
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..2f59cce3b38a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,9 +51,9 @@ extern int devmem_is_allowed(unsigned long pagenr); | |||
51 | extern unsigned long max_low_pfn_mapped; | 51 | extern unsigned long max_low_pfn_mapped; |
52 | extern unsigned long max_pfn_mapped; | 52 | extern unsigned long max_pfn_mapped; |
53 | 53 | ||
54 | static inline phys_addr_t get_max_mapped(void) | 54 | static inline phys_addr_t get_max_low_mapped(void) |
55 | { | 55 | { |
56 | return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; | 56 | return (phys_addr_t)max_low_pfn_mapped << PAGE_SHIFT; |
57 | } | 57 | } |
58 | 58 | ||
59 | bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); | 59 | bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); |
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabdd..83a7995625a6 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void) | |||
91 | 91 | ||
92 | corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); | 92 | corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); |
93 | 93 | ||
94 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { | 94 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { |
95 | start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), | 95 | start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), |
96 | PAGE_SIZE, corruption_check_size); | 96 | PAGE_SIZE, corruption_check_size); |
97 | end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), | 97 | end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7b..988c00a1f60d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void) | |||
1120 | nr_pages += end_pfn - start_pfn; | 1120 | nr_pages += end_pfn - start_pfn; |
1121 | } | 1121 | } |
1122 | 1122 | ||
1123 | for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { | 1123 | for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) { |
1124 | start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); | 1124 | start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); |
1125 | end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); | 1125 | end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); |
1126 | if (start_pfn < end_pfn) | 1126 | if (start_pfn < end_pfn) |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 06853e670354..c9675594d7ca 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p) | |||
1119 | 1119 | ||
1120 | setup_real_mode(); | 1120 | setup_real_mode(); |
1121 | 1121 | ||
1122 | memblock_set_current_limit(get_max_mapped()); | 1122 | memblock_set_current_limit(get_max_low_mapped()); |
1123 | dma_contiguous_reserve(0); | 1123 | dma_contiguous_reserve(0); |
1124 | 1124 | ||
1125 | /* | 1125 | /* |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5bdc5430597c..e39504878aec 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -665,7 +665,7 @@ void __init initmem_init(void) | |||
665 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; | 665 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; |
666 | #endif | 666 | #endif |
667 | 667 | ||
668 | memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); | 668 | memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); |
669 | sparse_memory_present_with_active_regions(0); | 669 | sparse_memory_present_with_active_regions(0); |
670 | 670 | ||
671 | #ifdef CONFIG_FLATMEM | 671 | #ifdef CONFIG_FLATMEM |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a9245f..f35c66c5959a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start, | |||
643 | #ifndef CONFIG_NUMA | 643 | #ifndef CONFIG_NUMA |
644 | void __init initmem_init(void) | 644 | void __init initmem_init(void) |
645 | { | 645 | { |
646 | memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); | 646 | memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); |
647 | } | 647 | } |
648 | #endif | 648 | #endif |
649 | 649 | ||
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 8dabbed409ee..1e9da795767a 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end) | |||
74 | u64 i; | 74 | u64 i; |
75 | phys_addr_t this_start, this_end; | 75 | phys_addr_t this_start, this_end; |
76 | 76 | ||
77 | for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { | 77 | for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { |
78 | this_start = clamp_t(phys_addr_t, this_start, start, end); | 78 | this_start = clamp_t(phys_addr_t, this_start, start, end); |
79 | this_end = clamp_t(phys_addr_t, this_end, start, end); | 79 | this_end = clamp_t(phys_addr_t, this_end, start, end); |
80 | if (this_start < this_end) { | 80 | if (this_start < this_end) { |
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c85da7bb6b60..81b2750f3666 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -491,7 +491,16 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) | |||
491 | 491 | ||
492 | for (i = 0; i < mi->nr_blks; i++) { | 492 | for (i = 0; i < mi->nr_blks; i++) { |
493 | struct numa_memblk *mb = &mi->blk[i]; | 493 | struct numa_memblk *mb = &mi->blk[i]; |
494 | memblock_set_node(mb->start, mb->end - mb->start, mb->nid); | 494 | memblock_set_node(mb->start, mb->end - mb->start, |
495 | &memblock.memory, mb->nid); | ||
496 | |||
497 | /* | ||
498 | * At this time, all memory regions reserved by memblock are | ||
499 | * used by the kernel. Set the nid in memblock.reserved will | ||
500 | * mark out all the nodes the kernel resides in. | ||
501 | */ | ||
502 | memblock_set_node(mb->start, mb->end - mb->start, | ||
503 | &memblock.reserved, mb->nid); | ||
495 | } | 504 | } |
496 | 505 | ||
497 | /* | 506 | /* |
@@ -553,6 +562,30 @@ static void __init numa_init_array(void) | |||
553 | } | 562 | } |
554 | } | 563 | } |
555 | 564 | ||
565 | static void __init numa_clear_kernel_node_hotplug(void) | ||
566 | { | ||
567 | int i, nid; | ||
568 | nodemask_t numa_kernel_nodes; | ||
569 | unsigned long start, end; | ||
570 | struct memblock_type *type = &memblock.reserved; | ||
571 | |||
572 | /* Mark all kernel nodes. */ | ||
573 | for (i = 0; i < type->cnt; i++) | ||
574 | node_set(type->regions[i].nid, numa_kernel_nodes); | ||
575 | |||
576 | /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ | ||
577 | for (i = 0; i < numa_meminfo.nr_blks; i++) { | ||
578 | nid = numa_meminfo.blk[i].nid; | ||
579 | if (!node_isset(nid, numa_kernel_nodes)) | ||
580 | continue; | ||
581 | |||
582 | start = numa_meminfo.blk[i].start; | ||
583 | end = numa_meminfo.blk[i].end; | ||
584 | |||
585 | memblock_clear_hotplug(start, end - start); | ||
586 | } | ||
587 | } | ||
588 | |||
556 | static int __init numa_init(int (*init_func)(void)) | 589 | static int __init numa_init(int (*init_func)(void)) |
557 | { | 590 | { |
558 | int i; | 591 | int i; |
@@ -565,7 +598,12 @@ static int __init numa_init(int (*init_func)(void)) | |||
565 | nodes_clear(node_possible_map); | 598 | nodes_clear(node_possible_map); |
566 | nodes_clear(node_online_map); | 599 | nodes_clear(node_online_map); |
567 | memset(&numa_meminfo, 0, sizeof(numa_meminfo)); | 600 | memset(&numa_meminfo, 0, sizeof(numa_meminfo)); |
568 | WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); | 601 | WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, |
602 | MAX_NUMNODES)); | ||
603 | WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, | ||
604 | MAX_NUMNODES)); | ||
605 | /* In case that parsing SRAT failed. */ | ||
606 | WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); | ||
569 | numa_reset_distance(); | 607 | numa_reset_distance(); |
570 | 608 | ||
571 | ret = init_func(); | 609 | ret = init_func(); |
@@ -601,6 +639,16 @@ static int __init numa_init(int (*init_func)(void)) | |||
601 | numa_clear_node(i); | 639 | numa_clear_node(i); |
602 | } | 640 | } |
603 | numa_init_array(); | 641 | numa_init_array(); |
642 | |||
643 | /* | ||
644 | * At very early time, the kernel have to use some memory such as | ||
645 | * loading the kernel image. We cannot prevent this anyway. So any | ||
646 | * node the kernel resides in should be un-hotpluggable. | ||
647 | * | ||
648 | * And when we come here, numa_init() won't fail. | ||
649 | */ | ||
650 | numa_clear_kernel_node_hotplug(); | ||
651 | |||
604 | return 0; | 652 | return 0; |
605 | } | 653 | } |
606 | 654 | ||
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 266ca912f62e..1a25187e151e 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -181,6 +181,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
181 | (unsigned long long) start, (unsigned long long) end - 1, | 181 | (unsigned long long) start, (unsigned long long) end - 1, |
182 | hotpluggable ? " hotplug" : ""); | 182 | hotpluggable ? " hotplug" : ""); |
183 | 183 | ||
184 | /* Mark hotplug range in memblock. */ | ||
185 | if (hotpluggable && memblock_mark_hotplug(start, ma->length)) | ||
186 | pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", | ||
187 | (unsigned long long)start, (unsigned long long)end - 1); | ||
188 | |||
184 | return 0; | 189 | return 0; |
185 | out_err_bad_srat: | 190 | out_err_bad_srat: |
186 | bad_srat(); | 191 | bad_srat(); |
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index f895a8c8a244..92c5937f80c3 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -22,7 +22,6 @@ | |||
22 | #include <linux/device.h> | 22 | #include <linux/device.h> |
23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/backing-dev.h> | 24 | #include <linux/backing-dev.h> |
25 | #include <linux/bootmem.h> | ||
26 | #include <linux/splice.h> | 25 | #include <linux/splice.h> |
27 | #include <linux/pfn.h> | 26 | #include <linux/pfn.h> |
28 | #include <linux/export.h> | 27 | #include <linux/export.h> |
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index e2e04b007e15..17cf96c45f2b 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -324,7 +324,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) | |||
324 | { | 324 | { |
325 | struct firmware_map_entry *entry; | 325 | struct firmware_map_entry *entry; |
326 | 326 | ||
327 | entry = alloc_bootmem(sizeof(struct firmware_map_entry)); | 327 | entry = memblock_virt_alloc(sizeof(struct firmware_map_entry), 0); |
328 | if (WARN_ON(!entry)) | 328 | if (WARN_ON(!entry)) |
329 | return -ENOMEM; | 329 | return -ENOMEM; |
330 | 330 | ||
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 43b9bfea48fa..59779e19315e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -917,7 +917,7 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level, | |||
917 | 917 | ||
918 | /* If range covers entire pagetable, free it */ | 918 | /* If range covers entire pagetable, free it */ |
919 | if (!(start_pfn > level_pfn || | 919 | if (!(start_pfn > level_pfn || |
920 | last_pfn < level_pfn + level_size(level))) { | 920 | last_pfn < level_pfn + level_size(level) - 1)) { |
921 | dma_clear_pte(pte); | 921 | dma_clear_pte(pte); |
922 | domain_flush_cache(domain, pte, sizeof(*pte)); | 922 | domain_flush_cache(domain, pte, sizeof(*pte)); |
923 | free_pgtable_page(level_pte); | 923 | free_pgtable_page(level_pte); |
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index dc52e13d58e0..3881610b6438 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, | |||
680 | struct i2c_msg __user *tmsgs; | 680 | struct i2c_msg __user *tmsgs; |
681 | struct i2c_msg32 __user *umsgs; | 681 | struct i2c_msg32 __user *umsgs; |
682 | compat_caddr_t datap; | 682 | compat_caddr_t datap; |
683 | int nmsgs, i; | 683 | u32 nmsgs; |
684 | int i; | ||
684 | 685 | ||
685 | if (get_user(nmsgs, &udata->nmsgs)) | 686 | if (get_user(nmsgs, &udata->nmsgs)) |
686 | return -EFAULT; | 687 | return -EFAULT; |
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1fedd5f7ccc4..0b9ff4395e6a 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) | |||
82 | * events. | 82 | * events. |
83 | */ | 83 | */ |
84 | static int dnotify_handle_event(struct fsnotify_group *group, | 84 | static int dnotify_handle_event(struct fsnotify_group *group, |
85 | struct inode *inode, | ||
85 | struct fsnotify_mark *inode_mark, | 86 | struct fsnotify_mark *inode_mark, |
86 | struct fsnotify_mark *vfsmount_mark, | 87 | struct fsnotify_mark *vfsmount_mark, |
87 | struct fsnotify_event *event) | 88 | u32 mask, void *data, int data_type, |
89 | const unsigned char *file_name) | ||
88 | { | 90 | { |
89 | struct dnotify_mark *dn_mark; | 91 | struct dnotify_mark *dn_mark; |
90 | struct inode *to_tell; | ||
91 | struct dnotify_struct *dn; | 92 | struct dnotify_struct *dn; |
92 | struct dnotify_struct **prev; | 93 | struct dnotify_struct **prev; |
93 | struct fown_struct *fown; | 94 | struct fown_struct *fown; |
94 | __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; | 95 | __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; |
95 | 96 | ||
96 | BUG_ON(vfsmount_mark); | 97 | /* not a dir, dnotify doesn't care */ |
98 | if (!S_ISDIR(inode->i_mode)) | ||
99 | return 0; | ||
97 | 100 | ||
98 | to_tell = event->to_tell; | 101 | BUG_ON(vfsmount_mark); |
99 | 102 | ||
100 | dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); | 103 | dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); |
101 | 104 | ||
@@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group, | |||
122 | return 0; | 125 | return 0; |
123 | } | 126 | } |
124 | 127 | ||
125 | /* | ||
126 | * Given an inode and mask determine if dnotify would be interested in sending | ||
127 | * userspace notification for that pair. | ||
128 | */ | ||
129 | static bool dnotify_should_send_event(struct fsnotify_group *group, | ||
130 | struct inode *inode, | ||
131 | struct fsnotify_mark *inode_mark, | ||
132 | struct fsnotify_mark *vfsmount_mark, | ||
133 | __u32 mask, void *data, int data_type) | ||
134 | { | ||
135 | /* not a dir, dnotify doesn't care */ | ||
136 | if (!S_ISDIR(inode->i_mode)) | ||
137 | return false; | ||
138 | |||
139 | return true; | ||
140 | } | ||
141 | |||
142 | static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) | 128 | static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) |
143 | { | 129 | { |
144 | struct dnotify_mark *dn_mark = container_of(fsn_mark, | 130 | struct dnotify_mark *dn_mark = container_of(fsn_mark, |
@@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) | |||
152 | 138 | ||
153 | static struct fsnotify_ops dnotify_fsnotify_ops = { | 139 | static struct fsnotify_ops dnotify_fsnotify_ops = { |
154 | .handle_event = dnotify_handle_event, | 140 | .handle_event = dnotify_handle_event, |
155 | .should_send_event = dnotify_should_send_event, | ||
156 | .free_group_priv = NULL, | ||
157 | .freeing_mark = NULL, | ||
158 | .free_event_priv = NULL, | ||
159 | }; | 141 | }; |
160 | 142 | ||
161 | /* | 143 | /* |
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 0c2f9122b262..58772623f02a 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,31 +9,27 @@ | |||
9 | #include <linux/types.h> | 9 | #include <linux/types.h> |
10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
11 | 11 | ||
12 | static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) | 12 | #include "fanotify.h" |
13 | |||
14 | static bool should_merge(struct fsnotify_event *old_fsn, | ||
15 | struct fsnotify_event *new_fsn) | ||
13 | { | 16 | { |
14 | pr_debug("%s: old=%p new=%p\n", __func__, old, new); | 17 | struct fanotify_event_info *old, *new; |
15 | 18 | ||
16 | if (old->to_tell == new->to_tell && | ||
17 | old->data_type == new->data_type && | ||
18 | old->tgid == new->tgid) { | ||
19 | switch (old->data_type) { | ||
20 | case (FSNOTIFY_EVENT_PATH): | ||
21 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | 19 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS |
22 | /* dont merge two permission events */ | 20 | /* dont merge two permission events */ |
23 | if ((old->mask & FAN_ALL_PERM_EVENTS) && | 21 | if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) && |
24 | (new->mask & FAN_ALL_PERM_EVENTS)) | 22 | (new_fsn->mask & FAN_ALL_PERM_EVENTS)) |
25 | return false; | 23 | return false; |
26 | #endif | 24 | #endif |
27 | if ((old->path.mnt == new->path.mnt) && | 25 | pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); |
28 | (old->path.dentry == new->path.dentry)) | 26 | old = FANOTIFY_E(old_fsn); |
29 | return true; | 27 | new = FANOTIFY_E(new_fsn); |
30 | break; | 28 | |
31 | case (FSNOTIFY_EVENT_NONE): | 29 | if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && |
32 | return true; | 30 | old->path.mnt == new->path.mnt && |
33 | default: | 31 | old->path.dentry == new->path.dentry) |
34 | BUG(); | 32 | return true; |
35 | }; | ||
36 | } | ||
37 | return false; | 33 | return false; |
38 | } | 34 | } |
39 | 35 | ||
@@ -41,59 +37,28 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) | |||
41 | static struct fsnotify_event *fanotify_merge(struct list_head *list, | 37 | static struct fsnotify_event *fanotify_merge(struct list_head *list, |
42 | struct fsnotify_event *event) | 38 | struct fsnotify_event *event) |
43 | { | 39 | { |
44 | struct fsnotify_event_holder *test_holder; | 40 | struct fsnotify_event *test_event; |
45 | struct fsnotify_event *test_event = NULL; | 41 | bool do_merge = false; |
46 | struct fsnotify_event *new_event; | ||
47 | 42 | ||
48 | pr_debug("%s: list=%p event=%p\n", __func__, list, event); | 43 | pr_debug("%s: list=%p event=%p\n", __func__, list, event); |
49 | 44 | ||
50 | 45 | list_for_each_entry_reverse(test_event, list, list) { | |
51 | list_for_each_entry_reverse(test_holder, list, event_list) { | 46 | if (should_merge(test_event, event)) { |
52 | if (should_merge(test_holder->event, event)) { | 47 | do_merge = true; |
53 | test_event = test_holder->event; | ||
54 | break; | 48 | break; |
55 | } | 49 | } |
56 | } | 50 | } |
57 | 51 | ||
58 | if (!test_event) | 52 | if (!do_merge) |
59 | return NULL; | 53 | return NULL; |
60 | 54 | ||
61 | fsnotify_get_event(test_event); | 55 | test_event->mask |= event->mask; |
62 | 56 | return test_event; | |
63 | /* if they are exactly the same we are done */ | ||
64 | if (test_event->mask == event->mask) | ||
65 | return test_event; | ||
66 | |||
67 | /* | ||
68 | * if the refcnt == 2 this is the only queue | ||
69 | * for this event and so we can update the mask | ||
70 | * in place. | ||
71 | */ | ||
72 | if (atomic_read(&test_event->refcnt) == 2) { | ||
73 | test_event->mask |= event->mask; | ||
74 | return test_event; | ||
75 | } | ||
76 | |||
77 | new_event = fsnotify_clone_event(test_event); | ||
78 | |||
79 | /* done with test_event */ | ||
80 | fsnotify_put_event(test_event); | ||
81 | |||
82 | /* couldn't allocate memory, merge was not possible */ | ||
83 | if (unlikely(!new_event)) | ||
84 | return ERR_PTR(-ENOMEM); | ||
85 | |||
86 | /* build new event and replace it on the list */ | ||
87 | new_event->mask = (test_event->mask | event->mask); | ||
88 | fsnotify_replace_event(test_holder, new_event); | ||
89 | |||
90 | /* we hold a reference on new_event from clone_event */ | ||
91 | return new_event; | ||
92 | } | 57 | } |
93 | 58 | ||
94 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | 59 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS |
95 | static int fanotify_get_response_from_access(struct fsnotify_group *group, | 60 | static int fanotify_get_response_from_access(struct fsnotify_group *group, |
96 | struct fsnotify_event *event) | 61 | struct fanotify_event_info *event) |
97 | { | 62 | { |
98 | int ret; | 63 | int ret; |
99 | 64 | ||
@@ -106,7 +71,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, | |||
106 | return 0; | 71 | return 0; |
107 | 72 | ||
108 | /* userspace responded, convert to something usable */ | 73 | /* userspace responded, convert to something usable */ |
109 | spin_lock(&event->lock); | ||
110 | switch (event->response) { | 74 | switch (event->response) { |
111 | case FAN_ALLOW: | 75 | case FAN_ALLOW: |
112 | ret = 0; | 76 | ret = 0; |
@@ -116,7 +80,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, | |||
116 | ret = -EPERM; | 80 | ret = -EPERM; |
117 | } | 81 | } |
118 | event->response = 0; | 82 | event->response = 0; |
119 | spin_unlock(&event->lock); | ||
120 | 83 | ||
121 | pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, | 84 | pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, |
122 | group, event, ret); | 85 | group, event, ret); |
@@ -125,58 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, | |||
125 | } | 88 | } |
126 | #endif | 89 | #endif |
127 | 90 | ||
128 | static int fanotify_handle_event(struct fsnotify_group *group, | 91 | static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, |
129 | struct fsnotify_mark *inode_mark, | ||
130 | struct fsnotify_mark *fanotify_mark, | ||
131 | struct fsnotify_event *event) | ||
132 | { | ||
133 | int ret = 0; | ||
134 | struct fsnotify_event *notify_event = NULL; | ||
135 | |||
136 | BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); | ||
137 | BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); | ||
138 | BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); | ||
139 | BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); | ||
140 | BUILD_BUG_ON(FAN_OPEN != FS_OPEN); | ||
141 | BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); | ||
142 | BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); | ||
143 | BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); | ||
144 | BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); | ||
145 | BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); | ||
146 | |||
147 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); | ||
148 | |||
149 | notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); | ||
150 | if (IS_ERR(notify_event)) | ||
151 | return PTR_ERR(notify_event); | ||
152 | |||
153 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | ||
154 | if (event->mask & FAN_ALL_PERM_EVENTS) { | ||
155 | /* if we merged we need to wait on the new event */ | ||
156 | if (notify_event) | ||
157 | event = notify_event; | ||
158 | ret = fanotify_get_response_from_access(group, event); | ||
159 | } | ||
160 | #endif | ||
161 | |||
162 | if (notify_event) | ||
163 | fsnotify_put_event(notify_event); | ||
164 | |||
165 | return ret; | ||
166 | } | ||
167 | |||
168 | static bool fanotify_should_send_event(struct fsnotify_group *group, | ||
169 | struct inode *to_tell, | ||
170 | struct fsnotify_mark *inode_mark, | ||
171 | struct fsnotify_mark *vfsmnt_mark, | 92 | struct fsnotify_mark *vfsmnt_mark, |
172 | __u32 event_mask, void *data, int data_type) | 93 | u32 event_mask, |
94 | void *data, int data_type) | ||
173 | { | 95 | { |
174 | __u32 marks_mask, marks_ignored_mask; | 96 | __u32 marks_mask, marks_ignored_mask; |
175 | struct path *path = data; | 97 | struct path *path = data; |
176 | 98 | ||
177 | pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " | 99 | pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" |
178 | "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, | 100 | " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, |
179 | inode_mark, vfsmnt_mark, event_mask, data, data_type); | 101 | event_mask, data, data_type); |
180 | 102 | ||
181 | /* if we don't have enough info to send an event to userspace say no */ | 103 | /* if we don't have enough info to send an event to userspace say no */ |
182 | if (data_type != FSNOTIFY_EVENT_PATH) | 104 | if (data_type != FSNOTIFY_EVENT_PATH) |
@@ -217,6 +139,74 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, | |||
217 | return false; | 139 | return false; |
218 | } | 140 | } |
219 | 141 | ||
142 | static int fanotify_handle_event(struct fsnotify_group *group, | ||
143 | struct inode *inode, | ||
144 | struct fsnotify_mark *inode_mark, | ||
145 | struct fsnotify_mark *fanotify_mark, | ||
146 | u32 mask, void *data, int data_type, | ||
147 | const unsigned char *file_name) | ||
148 | { | ||
149 | int ret = 0; | ||
150 | struct fanotify_event_info *event; | ||
151 | struct fsnotify_event *fsn_event; | ||
152 | struct fsnotify_event *notify_fsn_event; | ||
153 | |||
154 | BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); | ||
155 | BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); | ||
156 | BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); | ||
157 | BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); | ||
158 | BUILD_BUG_ON(FAN_OPEN != FS_OPEN); | ||
159 | BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); | ||
160 | BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); | ||
161 | BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); | ||
162 | BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); | ||
163 | BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); | ||
164 | |||
165 | if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, | ||
166 | data_type)) | ||
167 | return 0; | ||
168 | |||
169 | pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, | ||
170 | mask); | ||
171 | |||
172 | event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); | ||
173 | if (unlikely(!event)) | ||
174 | return -ENOMEM; | ||
175 | |||
176 | fsn_event = &event->fse; | ||
177 | fsnotify_init_event(fsn_event, inode, mask); | ||
178 | event->tgid = get_pid(task_tgid(current)); | ||
179 | if (data_type == FSNOTIFY_EVENT_PATH) { | ||
180 | struct path *path = data; | ||
181 | event->path = *path; | ||
182 | path_get(&event->path); | ||
183 | } else { | ||
184 | event->path.mnt = NULL; | ||
185 | event->path.dentry = NULL; | ||
186 | } | ||
187 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | ||
188 | event->response = 0; | ||
189 | #endif | ||
190 | |||
191 | notify_fsn_event = fsnotify_add_notify_event(group, fsn_event, | ||
192 | fanotify_merge); | ||
193 | if (notify_fsn_event) { | ||
194 | /* Our event wasn't used in the end. Free it. */ | ||
195 | fsnotify_destroy_event(group, fsn_event); | ||
196 | if (IS_ERR(notify_fsn_event)) | ||
197 | return PTR_ERR(notify_fsn_event); | ||
198 | /* We need to ask about a different event after a merge... */ | ||
199 | event = FANOTIFY_E(notify_fsn_event); | ||
200 | fsn_event = notify_fsn_event; | ||
201 | } | ||
202 | |||
203 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | ||
204 | if (fsn_event->mask & FAN_ALL_PERM_EVENTS) | ||
205 | ret = fanotify_get_response_from_access(group, event); | ||
206 | #endif | ||
207 | return ret; | ||
208 | } | ||
209 | |||
220 | static void fanotify_free_group_priv(struct fsnotify_group *group) | 210 | static void fanotify_free_group_priv(struct fsnotify_group *group) |
221 | { | 211 | { |
222 | struct user_struct *user; | 212 | struct user_struct *user; |
@@ -226,10 +216,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) | |||
226 | free_uid(user); | 216 | free_uid(user); |
227 | } | 217 | } |
228 | 218 | ||
219 | static void fanotify_free_event(struct fsnotify_event *fsn_event) | ||
220 | { | ||
221 | struct fanotify_event_info *event; | ||
222 | |||
223 | event = FANOTIFY_E(fsn_event); | ||
224 | path_put(&event->path); | ||
225 | put_pid(event->tgid); | ||
226 | kmem_cache_free(fanotify_event_cachep, event); | ||
227 | } | ||
228 | |||
229 | const struct fsnotify_ops fanotify_fsnotify_ops = { | 229 | const struct fsnotify_ops fanotify_fsnotify_ops = { |
230 | .handle_event = fanotify_handle_event, | 230 | .handle_event = fanotify_handle_event, |
231 | .should_send_event = fanotify_should_send_event, | ||
232 | .free_group_priv = fanotify_free_group_priv, | 231 | .free_group_priv = fanotify_free_group_priv, |
233 | .free_event_priv = NULL, | 232 | .free_event = fanotify_free_event, |
234 | .freeing_mark = NULL, | ||
235 | }; | 233 | }; |
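The rewritten fanotify_handle_event() above allocates its event up front, hands it to fsnotify_add_notify_event() with a merge callback, and frees its own copy when an already-queued event comes back instead. Below is a rough standalone C sketch of that allocate / try-to-queue / free-if-merged flow; the names (ev, ev_queue, ev_enqueue) and the trivial "same mask as the tail" merge rule are invented for illustration and are not the kernel code.

#include <stdio.h>
#include <stdlib.h>

struct ev {
        unsigned mask;
        struct ev *next;
};

struct ev_queue {
        struct ev *head, *tail;
};

/* Returns NULL if 'ev' was queued, or the already-queued event it was
 * "merged" into; in the latter case the caller must free its own copy,
 * mirroring what fanotify_handle_event() does after
 * fsnotify_add_notify_event(). */
static struct ev *ev_enqueue(struct ev_queue *q, struct ev *ev)
{
        if (q->tail && q->tail->mask == ev->mask)
                return q->tail;         /* identical to the last queued event */
        ev->next = NULL;
        if (q->tail)
                q->tail->next = ev;
        else
                q->head = ev;
        q->tail = ev;
        return NULL;
}

int main(void)
{
        struct ev_queue q = { NULL, NULL };
        unsigned masks[] = { 0x1, 0x2, 0x2 };   /* the third one merges */

        for (int i = 0; i < 3; i++) {
                struct ev *ev = malloc(sizeof(*ev));
                struct ev *queued;

                if (!ev)
                        return 1;
                ev->mask = masks[i];
                queued = ev_enqueue(&q, ev);
                if (queued) {
                        free(ev);       /* our candidate was not used */
                        ev = queued;    /* keep working with the queued event */
                }
                printf("event %d: mask %#x (%s)\n", i, ev->mask,
                       queued ? "merged" : "queued");
        }

        while (q.head) {                /* drain and free what was queued */
                struct ev *ev = q.head;
                q.head = ev->next;
                free(ev);
        }
        return 0;
}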
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h new file mode 100644 index 000000000000..0e90174a116a --- /dev/null +++ b/fs/notify/fanotify/fanotify.h | |||
@@ -0,0 +1,23 @@ | |||
1 | #include <linux/fsnotify_backend.h> | ||
2 | #include <linux/path.h> | ||
3 | #include <linux/slab.h> | ||
4 | |||
5 | extern struct kmem_cache *fanotify_event_cachep; | ||
6 | |||
7 | struct fanotify_event_info { | ||
8 | struct fsnotify_event fse; | ||
9 | /* | ||
10 | * We hold ref to this path so it may be dereferenced at any point | ||
11 | * during this object's lifetime | ||
12 | */ | ||
13 | struct path path; | ||
14 | struct pid *tgid; | ||
15 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | ||
16 | u32 response; /* userspace answer to question */ | ||
17 | #endif | ||
18 | }; | ||
19 | |||
20 | static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) | ||
21 | { | ||
22 | return container_of(fse, struct fanotify_event_info, fse); | ||
23 | } | ||
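The new fanotify.h relies on embedding the generic fsnotify_event as the first member of fanotify_event_info and recovering the outer structure with container_of() via the FANOTIFY_E() helper. A minimal userspace illustration of the same trick follows; the struct names here are made up and merely stand in for the kernel types.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct generic_event {
        unsigned mask;
};

struct fan_event {
        struct generic_event base;      /* plays the role of 'fse' */
        int extra;                      /* backend-private data lives alongside */
};

static struct fan_event *FAN_E(struct generic_event *base)
{
        return container_of(base, struct fan_event, base);
}

int main(void)
{
        struct fan_event ev = { .base = { .mask = 0x20 }, .extra = 42 };
        struct generic_event *g = &ev.base;     /* what generic code passes around */

        printf("mask=%#x extra=%d\n", g->mask, FAN_E(g)->extra);
        return 0;
}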
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e44cb6427df3..57d7c083cb4b 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c | |||
@@ -19,6 +19,7 @@ | |||
19 | 19 | ||
20 | #include "../../mount.h" | 20 | #include "../../mount.h" |
21 | #include "../fdinfo.h" | 21 | #include "../fdinfo.h" |
22 | #include "fanotify.h" | ||
22 | 23 | ||
23 | #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 | 24 | #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 |
24 | #define FANOTIFY_DEFAULT_MAX_MARKS 8192 | 25 | #define FANOTIFY_DEFAULT_MAX_MARKS 8192 |
@@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; | |||
28 | 29 | ||
29 | static struct kmem_cache *fanotify_mark_cache __read_mostly; | 30 | static struct kmem_cache *fanotify_mark_cache __read_mostly; |
30 | static struct kmem_cache *fanotify_response_event_cache __read_mostly; | 31 | static struct kmem_cache *fanotify_response_event_cache __read_mostly; |
32 | struct kmem_cache *fanotify_event_cachep __read_mostly; | ||
31 | 33 | ||
32 | struct fanotify_response_event { | 34 | struct fanotify_response_event { |
33 | struct list_head list; | 35 | struct list_head list; |
34 | __s32 fd; | 36 | __s32 fd; |
35 | struct fsnotify_event *event; | 37 | struct fanotify_event_info *event; |
36 | }; | 38 | }; |
37 | 39 | ||
38 | /* | 40 | /* |
@@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, | |||
61 | } | 63 | } |
62 | 64 | ||
63 | static int create_fd(struct fsnotify_group *group, | 65 | static int create_fd(struct fsnotify_group *group, |
64 | struct fsnotify_event *event, | 66 | struct fanotify_event_info *event, |
65 | struct file **file) | 67 | struct file **file) |
66 | { | 68 | { |
67 | int client_fd; | 69 | int client_fd; |
68 | struct file *new_file; | 70 | struct file *new_file; |
@@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group, | |||
73 | if (client_fd < 0) | 75 | if (client_fd < 0) |
74 | return client_fd; | 76 | return client_fd; |
75 | 77 | ||
76 | if (event->data_type != FSNOTIFY_EVENT_PATH) { | ||
77 | WARN_ON(1); | ||
78 | put_unused_fd(client_fd); | ||
79 | return -EINVAL; | ||
80 | } | ||
81 | |||
82 | /* | 78 | /* |
83 | * we need a new file handle for the userspace program so it can read even if it was | 79 | * we need a new file handle for the userspace program so it can read even if it was |
84 | * originally opened O_WRONLY. | 80 | * originally opened O_WRONLY. |
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group, | |||
109 | } | 105 | } |
110 | 106 | ||
111 | static int fill_event_metadata(struct fsnotify_group *group, | 107 | static int fill_event_metadata(struct fsnotify_group *group, |
112 | struct fanotify_event_metadata *metadata, | 108 | struct fanotify_event_metadata *metadata, |
113 | struct fsnotify_event *event, | 109 | struct fsnotify_event *fsn_event, |
114 | struct file **file) | 110 | struct file **file) |
115 | { | 111 | { |
116 | int ret = 0; | 112 | int ret = 0; |
113 | struct fanotify_event_info *event; | ||
117 | 114 | ||
118 | pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, | 115 | pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, |
119 | group, metadata, event); | 116 | group, metadata, fsn_event); |
120 | 117 | ||
121 | *file = NULL; | 118 | *file = NULL; |
119 | event = container_of(fsn_event, struct fanotify_event_info, fse); | ||
122 | metadata->event_len = FAN_EVENT_METADATA_LEN; | 120 | metadata->event_len = FAN_EVENT_METADATA_LEN; |
123 | metadata->metadata_len = FAN_EVENT_METADATA_LEN; | 121 | metadata->metadata_len = FAN_EVENT_METADATA_LEN; |
124 | metadata->vers = FANOTIFY_METADATA_VERSION; | 122 | metadata->vers = FANOTIFY_METADATA_VERSION; |
125 | metadata->reserved = 0; | 123 | metadata->reserved = 0; |
126 | metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; | 124 | metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; |
127 | metadata->pid = pid_vnr(event->tgid); | 125 | metadata->pid = pid_vnr(event->tgid); |
128 | if (unlikely(event->mask & FAN_Q_OVERFLOW)) | 126 | if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) |
129 | metadata->fd = FAN_NOFD; | 127 | metadata->fd = FAN_NOFD; |
130 | else { | 128 | else { |
131 | metadata->fd = create_fd(group, event, file); | 129 | metadata->fd = create_fd(group, event, file); |
@@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, | |||
209 | if (!re) | 207 | if (!re) |
210 | return -ENOMEM; | 208 | return -ENOMEM; |
211 | 209 | ||
212 | re->event = event; | 210 | re->event = FANOTIFY_E(event); |
213 | re->fd = fd; | 211 | re->fd = fd; |
214 | 212 | ||
215 | mutex_lock(&group->fanotify_data.access_mutex); | 213 | mutex_lock(&group->fanotify_data.access_mutex); |
@@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, | |||
217 | if (atomic_read(&group->fanotify_data.bypass_perm)) { | 215 | if (atomic_read(&group->fanotify_data.bypass_perm)) { |
218 | mutex_unlock(&group->fanotify_data.access_mutex); | 216 | mutex_unlock(&group->fanotify_data.access_mutex); |
219 | kmem_cache_free(fanotify_response_event_cache, re); | 217 | kmem_cache_free(fanotify_response_event_cache, re); |
220 | event->response = FAN_ALLOW; | 218 | FANOTIFY_E(event)->response = FAN_ALLOW; |
221 | return 0; | 219 | return 0; |
222 | } | 220 | } |
223 | 221 | ||
@@ -273,7 +271,7 @@ out_close_fd: | |||
273 | out: | 271 | out: |
274 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | 272 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS |
275 | if (event->mask & FAN_ALL_PERM_EVENTS) { | 273 | if (event->mask & FAN_ALL_PERM_EVENTS) { |
276 | event->response = FAN_DENY; | 274 | FANOTIFY_E(event)->response = FAN_DENY; |
277 | wake_up(&group->fanotify_data.access_waitq); | 275 | wake_up(&group->fanotify_data.access_waitq); |
278 | } | 276 | } |
279 | #endif | 277 | #endif |
@@ -321,7 +319,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, | |||
321 | if (IS_ERR(kevent)) | 319 | if (IS_ERR(kevent)) |
322 | break; | 320 | break; |
323 | ret = copy_event_to_user(group, kevent, buf); | 321 | ret = copy_event_to_user(group, kevent, buf); |
324 | fsnotify_put_event(kevent); | 322 | fsnotify_destroy_event(group, kevent); |
325 | if (ret < 0) | 323 | if (ret < 0) |
326 | break; | 324 | break; |
327 | buf += ret; | 325 | buf += ret; |
@@ -409,7 +407,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) | |||
409 | static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | 407 | static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
410 | { | 408 | { |
411 | struct fsnotify_group *group; | 409 | struct fsnotify_group *group; |
412 | struct fsnotify_event_holder *holder; | 410 | struct fsnotify_event *fsn_event; |
413 | void __user *p; | 411 | void __user *p; |
414 | int ret = -ENOTTY; | 412 | int ret = -ENOTTY; |
415 | size_t send_len = 0; | 413 | size_t send_len = 0; |
@@ -421,7 +419,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
421 | switch (cmd) { | 419 | switch (cmd) { |
422 | case FIONREAD: | 420 | case FIONREAD: |
423 | mutex_lock(&group->notification_mutex); | 421 | mutex_lock(&group->notification_mutex); |
424 | list_for_each_entry(holder, &group->notification_list, event_list) | 422 | list_for_each_entry(fsn_event, &group->notification_list, list) |
425 | send_len += FAN_EVENT_METADATA_LEN; | 423 | send_len += FAN_EVENT_METADATA_LEN; |
426 | mutex_unlock(&group->notification_mutex); | 424 | mutex_unlock(&group->notification_mutex); |
427 | ret = put_user(send_len, (int __user *) p); | 425 | ret = put_user(send_len, (int __user *) p); |
@@ -906,6 +904,7 @@ static int __init fanotify_user_setup(void) | |||
906 | fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); | 904 | fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); |
907 | fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, | 905 | fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, |
908 | SLAB_PANIC); | 906 | SLAB_PANIC); |
907 | fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); | ||
909 | 908 | ||
910 | return 0; | 909 | return 0; |
911 | } | 910 | } |
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4bb21d67d9b1..1d4e1ea2f37c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c | |||
@@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell, | |||
128 | struct fsnotify_mark *vfsmount_mark, | 128 | struct fsnotify_mark *vfsmount_mark, |
129 | __u32 mask, void *data, | 129 | __u32 mask, void *data, |
130 | int data_is, u32 cookie, | 130 | int data_is, u32 cookie, |
131 | const unsigned char *file_name, | 131 | const unsigned char *file_name) |
132 | struct fsnotify_event **event) | ||
133 | { | 132 | { |
134 | struct fsnotify_group *group = NULL; | 133 | struct fsnotify_group *group = NULL; |
135 | __u32 inode_test_mask = 0; | 134 | __u32 inode_test_mask = 0; |
@@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell, | |||
170 | 169 | ||
171 | pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" | 170 | pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" |
172 | " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" | 171 | " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" |
173 | " data=%p data_is=%d cookie=%d event=%p\n", | 172 | " data=%p data_is=%d cookie=%d\n", |
174 | __func__, group, to_tell, mask, inode_mark, | 173 | __func__, group, to_tell, mask, inode_mark, |
175 | inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, | 174 | inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, |
176 | data_is, cookie, *event); | 175 | data_is, cookie); |
177 | 176 | ||
178 | if (!inode_test_mask && !vfsmount_test_mask) | 177 | if (!inode_test_mask && !vfsmount_test_mask) |
179 | return 0; | 178 | return 0; |
180 | 179 | ||
181 | if (group->ops->should_send_event(group, to_tell, inode_mark, | 180 | return group->ops->handle_event(group, to_tell, inode_mark, |
182 | vfsmount_mark, mask, data, | 181 | vfsmount_mark, mask, data, data_is, |
183 | data_is) == false) | 182 | file_name); |
184 | return 0; | ||
185 | |||
186 | if (!*event) { | ||
187 | *event = fsnotify_create_event(to_tell, mask, data, | ||
188 | data_is, file_name, | ||
189 | cookie, GFP_KERNEL); | ||
190 | if (!*event) | ||
191 | return -ENOMEM; | ||
192 | } | ||
193 | return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); | ||
194 | } | 183 | } |
195 | 184 | ||
196 | /* | 185 | /* |
@@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, | |||
205 | struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; | 194 | struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; |
206 | struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; | 195 | struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; |
207 | struct fsnotify_group *inode_group, *vfsmount_group; | 196 | struct fsnotify_group *inode_group, *vfsmount_group; |
208 | struct fsnotify_event *event = NULL; | ||
209 | struct mount *mnt; | 197 | struct mount *mnt; |
210 | int idx, ret = 0; | 198 | int idx, ret = 0; |
211 | /* global tests shouldn't care about events on child only the specific event */ | 199 | /* global tests shouldn't care about events on child only the specific event */ |
@@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, | |||
258 | 246 | ||
259 | if (inode_group > vfsmount_group) { | 247 | if (inode_group > vfsmount_group) { |
260 | /* handle inode */ | 248 | /* handle inode */ |
261 | ret = send_to_group(to_tell, inode_mark, NULL, mask, data, | 249 | ret = send_to_group(to_tell, inode_mark, NULL, mask, |
262 | data_is, cookie, file_name, &event); | 250 | data, data_is, cookie, file_name); |
263 | /* we didn't use the vfsmount_mark */ | 251 | /* we didn't use the vfsmount_mark */ |
264 | vfsmount_group = NULL; | 252 | vfsmount_group = NULL; |
265 | } else if (vfsmount_group > inode_group) { | 253 | } else if (vfsmount_group > inode_group) { |
266 | ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, | 254 | ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, |
267 | data_is, cookie, file_name, &event); | 255 | data, data_is, cookie, file_name); |
268 | inode_group = NULL; | 256 | inode_group = NULL; |
269 | } else { | 257 | } else { |
270 | ret = send_to_group(to_tell, inode_mark, vfsmount_mark, | 258 | ret = send_to_group(to_tell, inode_mark, vfsmount_mark, |
271 | mask, data, data_is, cookie, file_name, | 259 | mask, data, data_is, cookie, |
272 | &event); | 260 | file_name); |
273 | } | 261 | } |
274 | 262 | ||
275 | if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) | 263 | if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) |
@@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, | |||
285 | ret = 0; | 273 | ret = 0; |
286 | out: | 274 | out: |
287 | srcu_read_unlock(&fsnotify_mark_srcu, idx); | 275 | srcu_read_unlock(&fsnotify_mark_srcu, idx); |
288 | /* | ||
289 | * fsnotify_create_event() took a reference so the event can't be cleaned | ||
290 | * up while we are still trying to add it to lists, drop that one. | ||
291 | */ | ||
292 | if (event) | ||
293 | fsnotify_put_event(event); | ||
294 | 276 | ||
295 | return ret; | 277 | return ret; |
296 | } | 278 | } |
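With this change send_to_group() no longer pre-builds an fsnotify_event or consults a separate should_send_event() hook; it passes the raw parameters straight to the group's single handle_event() callback, which filters and allocates on its own. A hedged sketch of that slimmed-down dispatch shape follows; all types and names below are illustrative, not the kernel's.

#include <stdio.h>

struct group;

struct ops {
        /* single entry point; returns 0 or a -errno-style value */
        int (*handle_event)(struct group *g, unsigned mask, const char *name);
};

struct group {
        const struct ops *ops;
        unsigned interested_mask;
};

static int example_handle_event(struct group *g, unsigned mask, const char *name)
{
        if (!(mask & g->interested_mask))
                return 0;               /* the backend filters internally now */
        printf("queue event mask=%#x name=%s\n", mask, name ? name : "");
        return 0;
}

static const struct ops example_ops = { .handle_event = example_handle_event };

static int send_to_group(struct group *g, unsigned mask, const char *name)
{
        return g->ops->handle_event(g, mask, name);     /* no should_send_event step */
}

int main(void)
{
        struct group g = { .ops = &example_ops, .interested_mask = 0x2 };

        send_to_group(&g, 0x1, "ignored.txt");  /* filtered by the backend */
        send_to_group(&g, 0x2, "seen.txt");     /* handled */
        return 0;
}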
diff --git a/fs/notify/group.c b/fs/notify/group.c index bd2625bd88b4..ee674fe2cec7 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c | |||
@@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) | |||
99 | INIT_LIST_HEAD(&group->marks_list); | 99 | INIT_LIST_HEAD(&group->marks_list); |
100 | 100 | ||
101 | group->ops = ops; | 101 | group->ops = ops; |
102 | fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); | ||
102 | 103 | ||
103 | return group; | 104 | return group; |
104 | } | 105 | } |
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index b6642e4de4bf..485eef3f4407 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h | |||
@@ -2,11 +2,12 @@ | |||
2 | #include <linux/inotify.h> | 2 | #include <linux/inotify.h> |
3 | #include <linux/slab.h> /* struct kmem_cache */ | 3 | #include <linux/slab.h> /* struct kmem_cache */ |
4 | 4 | ||
5 | extern struct kmem_cache *event_priv_cachep; | 5 | struct inotify_event_info { |
6 | 6 | struct fsnotify_event fse; | |
7 | struct inotify_event_private_data { | ||
8 | struct fsnotify_event_private_data fsnotify_event_priv_data; | ||
9 | int wd; | 7 | int wd; |
8 | u32 sync_cookie; | ||
9 | int name_len; | ||
10 | char name[]; | ||
10 | }; | 11 | }; |
11 | 12 | ||
12 | struct inotify_inode_mark { | 13 | struct inotify_inode_mark { |
@@ -14,8 +15,18 @@ struct inotify_inode_mark { | |||
14 | int wd; | 15 | int wd; |
15 | }; | 16 | }; |
16 | 17 | ||
18 | static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) | ||
19 | { | ||
20 | return container_of(fse, struct inotify_event_info, fse); | ||
21 | } | ||
22 | |||
17 | extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, | 23 | extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, |
18 | struct fsnotify_group *group); | 24 | struct fsnotify_group *group); |
19 | extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); | 25 | extern int inotify_handle_event(struct fsnotify_group *group, |
26 | struct inode *inode, | ||
27 | struct fsnotify_mark *inode_mark, | ||
28 | struct fsnotify_mark *vfsmount_mark, | ||
29 | u32 mask, void *data, int data_type, | ||
30 | const unsigned char *file_name); | ||
20 | 31 | ||
21 | extern const struct fsnotify_ops inotify_fsnotify_ops; | 32 | extern const struct fsnotify_ops inotify_fsnotify_ops; |
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 4216308b81b4..aad1a35e9af1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c | |||
@@ -34,100 +34,87 @@ | |||
34 | #include "inotify.h" | 34 | #include "inotify.h" |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * Check if 2 events contain the same information. We do not compare private data | 37 | * Check if 2 events contain the same information. |
38 | * but at this moment that isn't a problem for any know fsnotify listeners. | ||
39 | */ | 38 | */ |
40 | static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) | 39 | static bool event_compare(struct fsnotify_event *old_fsn, |
40 | struct fsnotify_event *new_fsn) | ||
41 | { | 41 | { |
42 | if ((old->mask == new->mask) && | 42 | struct inotify_event_info *old, *new; |
43 | (old->to_tell == new->to_tell) && | 43 | |
44 | (old->data_type == new->data_type) && | 44 | if (old_fsn->mask & FS_IN_IGNORED) |
45 | (old->name_len == new->name_len)) { | 45 | return false; |
46 | switch (old->data_type) { | 46 | old = INOTIFY_E(old_fsn); |
47 | case (FSNOTIFY_EVENT_INODE): | 47 | new = INOTIFY_E(new_fsn); |
48 | /* remember, after old was put on the wait_q we aren't | 48 | if ((old_fsn->mask == new_fsn->mask) && |
49 | * allowed to look at the inode any more, only thing | 49 | (old_fsn->inode == new_fsn->inode) && |
50 | * left to check was if the file_name is the same */ | 50 | (old->name_len == new->name_len) && |
51 | if (!old->name_len || | 51 | (!old->name_len || !strcmp(old->name, new->name))) |
52 | !strcmp(old->file_name, new->file_name)) | 52 | return true; |
53 | return true; | ||
54 | break; | ||
55 | case (FSNOTIFY_EVENT_PATH): | ||
56 | if ((old->path.mnt == new->path.mnt) && | ||
57 | (old->path.dentry == new->path.dentry)) | ||
58 | return true; | ||
59 | break; | ||
60 | case (FSNOTIFY_EVENT_NONE): | ||
61 | if (old->mask & FS_Q_OVERFLOW) | ||
62 | return true; | ||
63 | else if (old->mask & FS_IN_IGNORED) | ||
64 | return false; | ||
65 | return true; | ||
66 | }; | ||
67 | } | ||
68 | return false; | 53 | return false; |
69 | } | 54 | } |
70 | 55 | ||
71 | static struct fsnotify_event *inotify_merge(struct list_head *list, | 56 | static struct fsnotify_event *inotify_merge(struct list_head *list, |
72 | struct fsnotify_event *event) | 57 | struct fsnotify_event *event) |
73 | { | 58 | { |
74 | struct fsnotify_event_holder *last_holder; | ||
75 | struct fsnotify_event *last_event; | 59 | struct fsnotify_event *last_event; |
76 | 60 | ||
77 | /* and the list better be locked by something too */ | 61 | last_event = list_entry(list->prev, struct fsnotify_event, list); |
78 | spin_lock(&event->lock); | 62 | if (!event_compare(last_event, event)) |
79 | 63 | return NULL; | |
80 | last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); | ||
81 | last_event = last_holder->event; | ||
82 | if (event_compare(last_event, event)) | ||
83 | fsnotify_get_event(last_event); | ||
84 | else | ||
85 | last_event = NULL; | ||
86 | |||
87 | spin_unlock(&event->lock); | ||
88 | |||
89 | return last_event; | 64 | return last_event; |
90 | } | 65 | } |
91 | 66 | ||
92 | static int inotify_handle_event(struct fsnotify_group *group, | 67 | int inotify_handle_event(struct fsnotify_group *group, |
93 | struct fsnotify_mark *inode_mark, | 68 | struct inode *inode, |
94 | struct fsnotify_mark *vfsmount_mark, | 69 | struct fsnotify_mark *inode_mark, |
95 | struct fsnotify_event *event) | 70 | struct fsnotify_mark *vfsmount_mark, |
71 | u32 mask, void *data, int data_type, | ||
72 | const unsigned char *file_name) | ||
96 | { | 73 | { |
97 | struct inotify_inode_mark *i_mark; | 74 | struct inotify_inode_mark *i_mark; |
98 | struct inode *to_tell; | 75 | struct inotify_event_info *event; |
99 | struct inotify_event_private_data *event_priv; | ||
100 | struct fsnotify_event_private_data *fsn_event_priv; | ||
101 | struct fsnotify_event *added_event; | 76 | struct fsnotify_event *added_event; |
102 | int wd, ret = 0; | 77 | struct fsnotify_event *fsn_event; |
78 | int ret = 0; | ||
79 | int len = 0; | ||
80 | int alloc_len = sizeof(struct inotify_event_info); | ||
103 | 81 | ||
104 | BUG_ON(vfsmount_mark); | 82 | BUG_ON(vfsmount_mark); |
105 | 83 | ||
106 | pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, | 84 | if ((inode_mark->mask & FS_EXCL_UNLINK) && |
107 | event, event->to_tell, event->mask); | 85 | (data_type == FSNOTIFY_EVENT_PATH)) { |
86 | struct path *path = data; | ||
108 | 87 | ||
109 | to_tell = event->to_tell; | 88 | if (d_unlinked(path->dentry)) |
89 | return 0; | ||
90 | } | ||
91 | if (file_name) { | ||
92 | len = strlen(file_name); | ||
93 | alloc_len += len + 1; | ||
94 | } | ||
95 | |||
96 | pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, | ||
97 | mask); | ||
110 | 98 | ||
111 | i_mark = container_of(inode_mark, struct inotify_inode_mark, | 99 | i_mark = container_of(inode_mark, struct inotify_inode_mark, |
112 | fsn_mark); | 100 | fsn_mark); |
113 | wd = i_mark->wd; | ||
114 | 101 | ||
115 | event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); | 102 | event = kmalloc(alloc_len, GFP_KERNEL); |
116 | if (unlikely(!event_priv)) | 103 | if (unlikely(!event)) |
117 | return -ENOMEM; | 104 | return -ENOMEM; |
118 | 105 | ||
119 | fsn_event_priv = &event_priv->fsnotify_event_priv_data; | 106 | fsn_event = &event->fse; |
120 | 107 | fsnotify_init_event(fsn_event, inode, mask); | |
121 | fsnotify_get_group(group); | 108 | event->wd = i_mark->wd; |
122 | fsn_event_priv->group = group; | 109 | event->name_len = len; |
123 | event_priv->wd = wd; | 110 | if (len) |
111 | strcpy(event->name, file_name); | ||
124 | 112 | ||
125 | added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); | 113 | added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge); |
126 | if (added_event) { | 114 | if (added_event) { |
127 | inotify_free_event_priv(fsn_event_priv); | 115 | /* Our event wasn't used in the end. Free it. */ |
128 | if (!IS_ERR(added_event)) | 116 | fsnotify_destroy_event(group, fsn_event); |
129 | fsnotify_put_event(added_event); | 117 | if (IS_ERR(added_event)) |
130 | else | ||
131 | ret = PTR_ERR(added_event); | 118 | ret = PTR_ERR(added_event); |
132 | } | 119 | } |
133 | 120 | ||
@@ -142,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify | |||
142 | inotify_ignored_and_remove_idr(fsn_mark, group); | 129 | inotify_ignored_and_remove_idr(fsn_mark, group); |
143 | } | 130 | } |
144 | 131 | ||
145 | static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, | ||
146 | struct fsnotify_mark *inode_mark, | ||
147 | struct fsnotify_mark *vfsmount_mark, | ||
148 | __u32 mask, void *data, int data_type) | ||
149 | { | ||
150 | if ((inode_mark->mask & FS_EXCL_UNLINK) && | ||
151 | (data_type == FSNOTIFY_EVENT_PATH)) { | ||
152 | struct path *path = data; | ||
153 | |||
154 | if (d_unlinked(path->dentry)) | ||
155 | return false; | ||
156 | } | ||
157 | |||
158 | return true; | ||
159 | } | ||
160 | |||
161 | /* | 132 | /* |
162 | * This is NEVER supposed to be called. Inotify marks should either have been | 133 | * This is NEVER supposed to be called. Inotify marks should either have been |
163 | * removed from the idr when the watch was removed or in the | 134 | * removed from the idr when the watch was removed or in the |
@@ -202,22 +173,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group) | |||
202 | free_uid(group->inotify_data.user); | 173 | free_uid(group->inotify_data.user); |
203 | } | 174 | } |
204 | 175 | ||
205 | void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) | 176 | static void inotify_free_event(struct fsnotify_event *fsn_event) |
206 | { | 177 | { |
207 | struct inotify_event_private_data *event_priv; | 178 | kfree(INOTIFY_E(fsn_event)); |
208 | |||
209 | |||
210 | event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, | ||
211 | fsnotify_event_priv_data); | ||
212 | |||
213 | fsnotify_put_group(fsn_event_priv->group); | ||
214 | kmem_cache_free(event_priv_cachep, event_priv); | ||
215 | } | 179 | } |
216 | 180 | ||
217 | const struct fsnotify_ops inotify_fsnotify_ops = { | 181 | const struct fsnotify_ops inotify_fsnotify_ops = { |
218 | .handle_event = inotify_handle_event, | 182 | .handle_event = inotify_handle_event, |
219 | .should_send_event = inotify_should_send_event, | ||
220 | .free_group_priv = inotify_free_group_priv, | 183 | .free_group_priv = inotify_free_group_priv, |
221 | .free_event_priv = inotify_free_event_priv, | 184 | .free_event = inotify_free_event, |
222 | .freeing_mark = inotify_freeing_mark, | 185 | .freeing_mark = inotify_freeing_mark, |
223 | }; | 186 | }; |
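inotify_handle_event() now sizes its allocation as sizeof(struct inotify_event_info) plus the file name and its terminating NUL, storing the name in a flexible array member at the end of the event so one allocation (and one free) covers both. A small userspace sketch of that allocation pattern, with invented struct and function names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct name_event {
        unsigned mask;
        int name_len;
        char name[];            /* flexible array member, like inotify_event_info */
};

static struct name_event *name_event_alloc(unsigned mask, const char *file_name)
{
        size_t len = file_name ? strlen(file_name) : 0;
        struct name_event *ev = malloc(sizeof(*ev) + (len ? len + 1 : 0));

        if (!ev)
                return NULL;
        ev->mask = mask;
        ev->name_len = len;
        if (len)
                strcpy(ev->name, file_name);    /* copied into the same allocation */
        return ev;
}

int main(void)
{
        struct name_event *ev = name_event_alloc(0x100, "newfile.txt");

        if (!ev)
                return 1;
        printf("mask=%#x name_len=%d name=%s\n", ev->mask, ev->name_len,
               ev->name_len ? ev->name : "");
        free(ev);       /* one free releases the event and its name together */
        return 0;
}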
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 60f954a891ab..497395c8274b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
@@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly; | |||
50 | static int inotify_max_user_watches __read_mostly; | 50 | static int inotify_max_user_watches __read_mostly; |
51 | 51 | ||
52 | static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; | 52 | static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; |
53 | struct kmem_cache *event_priv_cachep __read_mostly; | ||
54 | 53 | ||
55 | #ifdef CONFIG_SYSCTL | 54 | #ifdef CONFIG_SYSCTL |
56 | 55 | ||
@@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) | |||
124 | return ret; | 123 | return ret; |
125 | } | 124 | } |
126 | 125 | ||
126 | static int round_event_name_len(struct fsnotify_event *fsn_event) | ||
127 | { | ||
128 | struct inotify_event_info *event; | ||
129 | |||
130 | event = INOTIFY_E(fsn_event); | ||
131 | if (!event->name_len) | ||
132 | return 0; | ||
133 | return roundup(event->name_len + 1, sizeof(struct inotify_event)); | ||
134 | } | ||
135 | |||
127 | /* | 136 | /* |
128 | * Get an inotify_kernel_event if one exists and is small | 137 | * Get an inotify_kernel_event if one exists and is small |
129 | * enough to fit in "count". Return an error pointer if | 138 | * enough to fit in "count". Return an error pointer if |
@@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, | |||
144 | 153 | ||
145 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); | 154 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); |
146 | 155 | ||
147 | if (event->name_len) | 156 | event_size += round_event_name_len(event); |
148 | event_size += roundup(event->name_len + 1, event_size); | ||
149 | |||
150 | if (event_size > count) | 157 | if (event_size > count) |
151 | return ERR_PTR(-EINVAL); | 158 | return ERR_PTR(-EINVAL); |
152 | 159 | ||
@@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, | |||
164 | * buffer we had in "get_one_event()" above. | 171 | * buffer we had in "get_one_event()" above. |
165 | */ | 172 | */ |
166 | static ssize_t copy_event_to_user(struct fsnotify_group *group, | 173 | static ssize_t copy_event_to_user(struct fsnotify_group *group, |
167 | struct fsnotify_event *event, | 174 | struct fsnotify_event *fsn_event, |
168 | char __user *buf) | 175 | char __user *buf) |
169 | { | 176 | { |
170 | struct inotify_event inotify_event; | 177 | struct inotify_event inotify_event; |
171 | struct fsnotify_event_private_data *fsn_priv; | 178 | struct inotify_event_info *event; |
172 | struct inotify_event_private_data *priv; | ||
173 | size_t event_size = sizeof(struct inotify_event); | 179 | size_t event_size = sizeof(struct inotify_event); |
174 | size_t name_len = 0; | 180 | size_t name_len; |
175 | 181 | size_t pad_name_len; | |
176 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); | ||
177 | 182 | ||
178 | /* we get the inotify watch descriptor from the event private data */ | 183 | pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); |
179 | spin_lock(&event->lock); | ||
180 | fsn_priv = fsnotify_remove_priv_from_event(group, event); | ||
181 | spin_unlock(&event->lock); | ||
182 | |||
183 | if (!fsn_priv) | ||
184 | inotify_event.wd = -1; | ||
185 | else { | ||
186 | priv = container_of(fsn_priv, struct inotify_event_private_data, | ||
187 | fsnotify_event_priv_data); | ||
188 | inotify_event.wd = priv->wd; | ||
189 | inotify_free_event_priv(fsn_priv); | ||
190 | } | ||
191 | 184 | ||
185 | event = INOTIFY_E(fsn_event); | ||
186 | name_len = event->name_len; | ||
192 | /* | 187 | /* |
193 | * round up event->name_len so it is a multiple of event_size | 188 | * round up name length so it is a multiple of event_size |
194 | * plus an extra byte for the terminating '\0'. | 189 | * plus an extra byte for the terminating '\0'. |
195 | */ | 190 | */ |
196 | if (event->name_len) | 191 | pad_name_len = round_event_name_len(fsn_event); |
197 | name_len = roundup(event->name_len + 1, event_size); | 192 | inotify_event.len = pad_name_len; |
198 | inotify_event.len = name_len; | 193 | inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); |
199 | 194 | inotify_event.wd = event->wd; | |
200 | inotify_event.mask = inotify_mask_to_arg(event->mask); | ||
201 | inotify_event.cookie = event->sync_cookie; | 195 | inotify_event.cookie = event->sync_cookie; |
202 | 196 | ||
203 | /* send the main event */ | 197 | /* send the main event */ |
@@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, | |||
209 | /* | 203 | /* |
210 | * fsnotify only stores the pathname, so here we have to send the pathname | 204 | * fsnotify only stores the pathname, so here we have to send the pathname |
211 | * and then pad that pathname out to a multiple of sizeof(inotify_event) | 205 | * and then pad that pathname out to a multiple of sizeof(inotify_event) |
212 | * with zeros. I get my zeros from the nul_inotify_event. | 206 | * with zeros. |
213 | */ | 207 | */ |
214 | if (name_len) { | 208 | if (pad_name_len) { |
215 | unsigned int len_to_zero = name_len - event->name_len; | ||
216 | /* copy the path name */ | 209 | /* copy the path name */ |
217 | if (copy_to_user(buf, event->file_name, event->name_len)) | 210 | if (copy_to_user(buf, event->name, name_len)) |
218 | return -EFAULT; | 211 | return -EFAULT; |
219 | buf += event->name_len; | 212 | buf += name_len; |
220 | 213 | ||
221 | /* fill userspace with 0's */ | 214 | /* fill userspace with 0's */ |
222 | if (clear_user(buf, len_to_zero)) | 215 | if (clear_user(buf, pad_name_len - name_len)) |
223 | return -EFAULT; | 216 | return -EFAULT; |
224 | buf += len_to_zero; | 217 | event_size += pad_name_len; |
225 | event_size += name_len; | ||
226 | } | 218 | } |
227 | 219 | ||
228 | return event_size; | 220 | return event_size; |
@@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, | |||
254 | if (IS_ERR(kevent)) | 246 | if (IS_ERR(kevent)) |
255 | break; | 247 | break; |
256 | ret = copy_event_to_user(group, kevent, buf); | 248 | ret = copy_event_to_user(group, kevent, buf); |
257 | fsnotify_put_event(kevent); | 249 | fsnotify_destroy_event(group, kevent); |
258 | if (ret < 0) | 250 | if (ret < 0) |
259 | break; | 251 | break; |
260 | buf += ret; | 252 | buf += ret; |
@@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, | |||
297 | unsigned long arg) | 289 | unsigned long arg) |
298 | { | 290 | { |
299 | struct fsnotify_group *group; | 291 | struct fsnotify_group *group; |
300 | struct fsnotify_event_holder *holder; | 292 | struct fsnotify_event *fsn_event; |
301 | struct fsnotify_event *event; | ||
302 | void __user *p; | 293 | void __user *p; |
303 | int ret = -ENOTTY; | 294 | int ret = -ENOTTY; |
304 | size_t send_len = 0; | 295 | size_t send_len = 0; |
@@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, | |||
311 | switch (cmd) { | 302 | switch (cmd) { |
312 | case FIONREAD: | 303 | case FIONREAD: |
313 | mutex_lock(&group->notification_mutex); | 304 | mutex_lock(&group->notification_mutex); |
314 | list_for_each_entry(holder, &group->notification_list, event_list) { | 305 | list_for_each_entry(fsn_event, &group->notification_list, |
315 | event = holder->event; | 306 | list) { |
316 | send_len += sizeof(struct inotify_event); | 307 | send_len += sizeof(struct inotify_event); |
317 | if (event->name_len) | 308 | send_len += round_event_name_len(fsn_event); |
318 | send_len += roundup(event->name_len + 1, | ||
319 | sizeof(struct inotify_event)); | ||
320 | } | 309 | } |
321 | mutex_unlock(&group->notification_mutex); | 310 | mutex_unlock(&group->notification_mutex); |
322 | ret = put_user(send_len, (int __user *) p); | 311 | ret = put_user(send_len, (int __user *) p); |
@@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, | |||
503 | struct fsnotify_group *group) | 492 | struct fsnotify_group *group) |
504 | { | 493 | { |
505 | struct inotify_inode_mark *i_mark; | 494 | struct inotify_inode_mark *i_mark; |
506 | struct fsnotify_event *ignored_event, *notify_event; | ||
507 | struct inotify_event_private_data *event_priv; | ||
508 | struct fsnotify_event_private_data *fsn_event_priv; | ||
509 | int ret; | ||
510 | |||
511 | i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); | ||
512 | |||
513 | ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, | ||
514 | FSNOTIFY_EVENT_NONE, NULL, 0, | ||
515 | GFP_NOFS); | ||
516 | if (!ignored_event) | ||
517 | goto skip_send_ignore; | ||
518 | |||
519 | event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); | ||
520 | if (unlikely(!event_priv)) | ||
521 | goto skip_send_ignore; | ||
522 | |||
523 | fsn_event_priv = &event_priv->fsnotify_event_priv_data; | ||
524 | |||
525 | fsnotify_get_group(group); | ||
526 | fsn_event_priv->group = group; | ||
527 | event_priv->wd = i_mark->wd; | ||
528 | |||
529 | notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); | ||
530 | if (notify_event) { | ||
531 | if (IS_ERR(notify_event)) | ||
532 | ret = PTR_ERR(notify_event); | ||
533 | else | ||
534 | fsnotify_put_event(notify_event); | ||
535 | inotify_free_event_priv(fsn_event_priv); | ||
536 | } | ||
537 | 495 | ||
538 | skip_send_ignore: | 496 | /* Queue ignore event for the watch */ |
539 | /* matches the reference taken when the event was created */ | 497 | inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, |
540 | if (ignored_event) | 498 | NULL, FSNOTIFY_EVENT_NONE, NULL); |
541 | fsnotify_put_event(ignored_event); | ||
542 | 499 | ||
500 | i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); | ||
543 | /* remove this mark from the idr */ | 501 | /* remove this mark from the idr */ |
544 | inotify_remove_from_idr(group, i_mark); | 502 | inotify_remove_from_idr(group, i_mark); |
545 | 503 | ||
@@ -836,7 +794,6 @@ static int __init inotify_user_setup(void) | |||
836 | BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); | 794 | BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); |
837 | 795 | ||
838 | inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); | 796 | inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); |
839 | event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); | ||
840 | 797 | ||
841 | inotify_max_queued_events = 16384; | 798 | inotify_max_queued_events = 16384; |
842 | inotify_max_user_instances = 128; | 799 | inotify_max_user_instances = 128; |
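round_event_name_len() centralizes the padding rule used by both the FIONREAD accounting and copy_event_to_user(): a non-empty name plus its terminating NUL is rounded up to a multiple of sizeof(struct inotify_event). A quick standalone check of that arithmetic; the 16-byte record size below is an assumption for illustration, not taken from the patch.

#include <stdio.h>

#define EVENT_SIZE 16u          /* stand-in for sizeof(struct inotify_event) */
#define roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

static unsigned padded_name_len(unsigned name_len)
{
        if (!name_len)
                return 0;       /* no name, nothing appended after the header */
        return roundup(name_len + 1, EVENT_SIZE);
}

int main(void)
{
        const unsigned lens[] = { 0, 3, 15, 16, 17 };

        for (unsigned i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
                printf("name_len=%2u -> padded=%2u, record=%2u bytes\n",
                       lens[i], padded_name_len(lens[i]),
                       EVENT_SIZE + padded_name_len(lens[i]));
        return 0;
}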
diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 7b51b05f160c..952237b8e2d2 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c | |||
@@ -48,15 +48,6 @@ | |||
48 | #include <linux/fsnotify_backend.h> | 48 | #include <linux/fsnotify_backend.h> |
49 | #include "fsnotify.h" | 49 | #include "fsnotify.h" |
50 | 50 | ||
51 | static struct kmem_cache *fsnotify_event_cachep; | ||
52 | static struct kmem_cache *fsnotify_event_holder_cachep; | ||
53 | /* | ||
54 | * This is a magic event we send when the q is too full. Since it doesn't | ||
55 | * hold real event information we just keep one system wide and use it any time | ||
56 | * it is needed. It's refcnt is set 1 at kernel init time and will never | ||
57 | * get set to 0 so it will never get 'freed' | ||
58 | */ | ||
59 | static struct fsnotify_event *q_overflow_event; | ||
60 | static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); | 51 | static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); |
61 | 52 | ||
62 | /** | 53 | /** |
@@ -76,60 +67,14 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) | |||
76 | return list_empty(&group->notification_list) ? true : false; | 67 | return list_empty(&group->notification_list) ? true : false; |
77 | } | 68 | } |
78 | 69 | ||
79 | void fsnotify_get_event(struct fsnotify_event *event) | 70 | void fsnotify_destroy_event(struct fsnotify_group *group, |
71 | struct fsnotify_event *event) | ||
80 | { | 72 | { |
81 | atomic_inc(&event->refcnt); | 73 | /* Overflow events are per-group and we don't want to free them */ |
82 | } | 74 | if (!event || event->mask == FS_Q_OVERFLOW) |
83 | |||
84 | void fsnotify_put_event(struct fsnotify_event *event) | ||
85 | { | ||
86 | if (!event) | ||
87 | return; | 75 | return; |
88 | 76 | ||
89 | if (atomic_dec_and_test(&event->refcnt)) { | 77 | group->ops->free_event(event); |
90 | pr_debug("%s: event=%p\n", __func__, event); | ||
91 | |||
92 | if (event->data_type == FSNOTIFY_EVENT_PATH) | ||
93 | path_put(&event->path); | ||
94 | |||
95 | BUG_ON(!list_empty(&event->private_data_list)); | ||
96 | |||
97 | kfree(event->file_name); | ||
98 | put_pid(event->tgid); | ||
99 | kmem_cache_free(fsnotify_event_cachep, event); | ||
100 | } | ||
101 | } | ||
102 | |||
103 | struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) | ||
104 | { | ||
105 | return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); | ||
106 | } | ||
107 | |||
108 | void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) | ||
109 | { | ||
110 | if (holder) | ||
111 | kmem_cache_free(fsnotify_event_holder_cachep, holder); | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * Find the private data that the group previously attached to this event when | ||
116 | * the group added the event to the notification queue (fsnotify_add_notify_event) | ||
117 | */ | ||
118 | struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) | ||
119 | { | ||
120 | struct fsnotify_event_private_data *lpriv; | ||
121 | struct fsnotify_event_private_data *priv = NULL; | ||
122 | |||
123 | assert_spin_locked(&event->lock); | ||
124 | |||
125 | list_for_each_entry(lpriv, &event->private_data_list, event_list) { | ||
126 | if (lpriv->group == group) { | ||
127 | priv = lpriv; | ||
128 | list_del(&priv->event_list); | ||
129 | break; | ||
130 | } | ||
131 | } | ||
132 | return priv; | ||
133 | } | 78 | } |
134 | 79 | ||
135 | /* | 80 | /* |
@@ -137,91 +82,35 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot | |||
137 | * event off the queue to deal with. If the event is successfully added to the | 82 | * event off the queue to deal with. If the event is successfully added to the |
138 | * group's notification queue, a reference is taken on event. | 83 | * group's notification queue, a reference is taken on event. |
139 | */ | 84 | */ |
140 | struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, | 85 | struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, |
141 | struct fsnotify_event_private_data *priv, | 86 | struct fsnotify_event *event, |
142 | struct fsnotify_event *(*merge)(struct list_head *, | 87 | struct fsnotify_event *(*merge)(struct list_head *, |
143 | struct fsnotify_event *)) | 88 | struct fsnotify_event *)) |
144 | { | 89 | { |
145 | struct fsnotify_event *return_event = NULL; | 90 | struct fsnotify_event *return_event = NULL; |
146 | struct fsnotify_event_holder *holder = NULL; | ||
147 | struct list_head *list = &group->notification_list; | 91 | struct list_head *list = &group->notification_list; |
148 | 92 | ||
149 | pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); | 93 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); |
150 | |||
151 | /* | ||
152 | * There is one fsnotify_event_holder embedded inside each fsnotify_event. | ||
153 | * Check if we expect to be able to use that holder. If not alloc a new | ||
154 | * holder. | ||
155 | * For the overflow event it's possible that something will use the in | ||
156 | * event holder before we get the lock so we may need to jump back and | ||
157 | * alloc a new holder, this can't happen for most events... | ||
158 | */ | ||
159 | if (!list_empty(&event->holder.event_list)) { | ||
160 | alloc_holder: | ||
161 | holder = fsnotify_alloc_event_holder(); | ||
162 | if (!holder) | ||
163 | return ERR_PTR(-ENOMEM); | ||
164 | } | ||
165 | 94 | ||
166 | mutex_lock(&group->notification_mutex); | 95 | mutex_lock(&group->notification_mutex); |
167 | 96 | ||
168 | if (group->q_len >= group->max_events) { | 97 | if (group->q_len >= group->max_events) { |
169 | event = q_overflow_event; | 98 | /* Queue overflow event only if it isn't already queued */ |
170 | 99 | if (list_empty(&group->overflow_event.list)) | |
171 | /* | 100 | event = &group->overflow_event; |
172 | * we need to return the overflow event | ||
173 | * which means we need a ref | ||
174 | */ | ||
175 | fsnotify_get_event(event); | ||
176 | return_event = event; | 101 | return_event = event; |
177 | |||
178 | /* sorry, no private data on the overflow event */ | ||
179 | priv = NULL; | ||
180 | } | 102 | } |
181 | 103 | ||
182 | if (!list_empty(list) && merge) { | 104 | if (!list_empty(list) && merge) { |
183 | struct fsnotify_event *tmp; | 105 | return_event = merge(list, event); |
184 | |||
185 | tmp = merge(list, event); | ||
186 | if (tmp) { | ||
187 | mutex_unlock(&group->notification_mutex); | ||
188 | |||
189 | if (return_event) | ||
190 | fsnotify_put_event(return_event); | ||
191 | if (holder != &event->holder) | ||
192 | fsnotify_destroy_event_holder(holder); | ||
193 | return tmp; | ||
194 | } | ||
195 | } | ||
196 | |||
197 | spin_lock(&event->lock); | ||
198 | |||
199 | if (list_empty(&event->holder.event_list)) { | ||
200 | if (unlikely(holder)) | ||
201 | fsnotify_destroy_event_holder(holder); | ||
202 | holder = &event->holder; | ||
203 | } else if (unlikely(!holder)) { | ||
204 | /* between the time we checked above and got the lock the in | ||
205 | * event holder was used, go back and get a new one */ | ||
206 | spin_unlock(&event->lock); | ||
207 | mutex_unlock(&group->notification_mutex); | ||
208 | |||
209 | if (return_event) { | 106 | if (return_event) { |
210 | fsnotify_put_event(return_event); | 107 | mutex_unlock(&group->notification_mutex); |
211 | return_event = NULL; | 108 | return return_event; |
212 | } | 109 | } |
213 | |||
214 | goto alloc_holder; | ||
215 | } | 110 | } |
216 | 111 | ||
217 | group->q_len++; | 112 | group->q_len++; |
218 | holder->event = event; | 113 | list_add_tail(&event->list, list); |
219 | |||
220 | fsnotify_get_event(event); | ||
221 | list_add_tail(&holder->event_list, list); | ||
222 | if (priv) | ||
223 | list_add_tail(&priv->event_list, &event->private_data_list); | ||
224 | spin_unlock(&event->lock); | ||
225 | mutex_unlock(&group->notification_mutex); | 114 | mutex_unlock(&group->notification_mutex); |
226 | 115 | ||
227 | wake_up(&group->notification_waitq); | 116 | wake_up(&group->notification_waitq); |
@@ -230,32 +119,20 @@ alloc_holder: | |||
230 | } | 119 | } |
231 | 120 | ||
232 | /* | 121 | /* |
233 | * Remove and return the first event from the notification list. There is a | 122 | * Remove and return the first event from the notification list. It is the |
234 | * reference held on this event since it was on the list. It is the responsibility | 123 | * responsibility of the caller to destroy the obtained event |
235 | * of the caller to drop this reference. | ||
236 | */ | 124 | */ |
237 | struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) | 125 | struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) |
238 | { | 126 | { |
239 | struct fsnotify_event *event; | 127 | struct fsnotify_event *event; |
240 | struct fsnotify_event_holder *holder; | ||
241 | 128 | ||
242 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); | 129 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); |
243 | 130 | ||
244 | pr_debug("%s: group=%p\n", __func__, group); | 131 | pr_debug("%s: group=%p\n", __func__, group); |
245 | 132 | ||
246 | holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); | 133 | event = list_first_entry(&group->notification_list, |
247 | 134 | struct fsnotify_event, list); | |
248 | event = holder->event; | 135 | list_del(&event->list); |
249 | |||
250 | spin_lock(&event->lock); | ||
251 | holder->event = NULL; | ||
252 | list_del_init(&holder->event_list); | ||
253 | spin_unlock(&event->lock); | ||
254 | |||
255 | /* event == holder means we are referenced through the in event holder */ | ||
256 | if (holder != &event->holder) | ||
257 | fsnotify_destroy_event_holder(holder); | ||
258 | |||
259 | group->q_len--; | 136 | group->q_len--; |
260 | 137 | ||
261 | return event; | 138 | return event; |
@@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group | |||
266 | */ | 143 | */ |
267 | struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) | 144 | struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) |
268 | { | 145 | { |
269 | struct fsnotify_event *event; | ||
270 | struct fsnotify_event_holder *holder; | ||
271 | |||
272 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); | 146 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); |
273 | 147 | ||
274 | holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); | 148 | return list_first_entry(&group->notification_list, |
275 | event = holder->event; | 149 | struct fsnotify_event, list); |
276 | |||
277 | return event; | ||
278 | } | 150 | } |
279 | 151 | ||
280 | /* | 152 | /* |
@@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) | |||
284 | void fsnotify_flush_notify(struct fsnotify_group *group) | 156 | void fsnotify_flush_notify(struct fsnotify_group *group) |
285 | { | 157 | { |
286 | struct fsnotify_event *event; | 158 | struct fsnotify_event *event; |
287 | struct fsnotify_event_private_data *priv; | ||
288 | 159 | ||
289 | mutex_lock(&group->notification_mutex); | 160 | mutex_lock(&group->notification_mutex); |
290 | while (!fsnotify_notify_queue_is_empty(group)) { | 161 | while (!fsnotify_notify_queue_is_empty(group)) { |
291 | event = fsnotify_remove_notify_event(group); | 162 | event = fsnotify_remove_notify_event(group); |
292 | /* if they don't implement free_event_priv they better not have attached any */ | 163 | fsnotify_destroy_event(group, event); |
293 | if (group->ops->free_event_priv) { | ||
294 | spin_lock(&event->lock); | ||
295 | priv = fsnotify_remove_priv_from_event(group, event); | ||
296 | spin_unlock(&event->lock); | ||
297 | if (priv) | ||
298 | group->ops->free_event_priv(priv); | ||
299 | } | ||
300 | fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ | ||
301 | } | 164 | } |
302 | mutex_unlock(&group->notification_mutex); | 165 | mutex_unlock(&group->notification_mutex); |
303 | } | 166 | } |
304 | 167 | ||
305 | static void initialize_event(struct fsnotify_event *event) | ||
306 | { | ||
307 | INIT_LIST_HEAD(&event->holder.event_list); | ||
308 | atomic_set(&event->refcnt, 1); | ||
309 | |||
310 | spin_lock_init(&event->lock); | ||
311 | |||
312 | INIT_LIST_HEAD(&event->private_data_list); | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * Caller damn well better be holding whatever mutex is protecting the | ||
317 | * old_holder->event_list and the new_event must be a clean event which | ||
318 | * cannot be found anywhere else in the kernel. | ||
319 | */ | ||
320 | int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, | ||
321 | struct fsnotify_event *new_event) | ||
322 | { | ||
323 | struct fsnotify_event *old_event = old_holder->event; | ||
324 | struct fsnotify_event_holder *new_holder = &new_event->holder; | ||
325 | |||
326 | enum event_spinlock_class { | ||
327 | SPINLOCK_OLD, | ||
328 | SPINLOCK_NEW, | ||
329 | }; | ||
330 | |||
331 | pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); | ||
332 | |||
333 | /* | ||
334 | * if the new_event's embedded holder is in use someone | ||
335 | * screwed up and didn't give us a clean new event. | ||
336 | */ | ||
337 | BUG_ON(!list_empty(&new_holder->event_list)); | ||
338 | |||
339 | spin_lock_nested(&old_event->lock, SPINLOCK_OLD); | ||
340 | spin_lock_nested(&new_event->lock, SPINLOCK_NEW); | ||
341 | |||
342 | new_holder->event = new_event; | ||
343 | list_replace_init(&old_holder->event_list, &new_holder->event_list); | ||
344 | |||
345 | spin_unlock(&new_event->lock); | ||
346 | spin_unlock(&old_event->lock); | ||
347 | |||
348 | /* event == holder means we are referenced through the in event holder */ | ||
349 | if (old_holder != &old_event->holder) | ||
350 | fsnotify_destroy_event_holder(old_holder); | ||
351 | |||
352 | fsnotify_get_event(new_event); /* on the list take reference */ | ||
353 | fsnotify_put_event(old_event); /* off the list, drop reference */ | ||
354 | |||
355 | return 0; | ||
356 | } | ||
357 | |||
358 | struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) | ||
359 | { | ||
360 | struct fsnotify_event *event; | ||
361 | |||
362 | event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); | ||
363 | if (!event) | ||
364 | return NULL; | ||
365 | |||
366 | pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); | ||
367 | |||
368 | memcpy(event, old_event, sizeof(*event)); | ||
369 | initialize_event(event); | ||
370 | |||
371 | if (event->name_len) { | ||
372 | event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); | ||
373 | if (!event->file_name) { | ||
374 | kmem_cache_free(fsnotify_event_cachep, event); | ||
375 | return NULL; | ||
376 | } | ||
377 | } | ||
378 | event->tgid = get_pid(old_event->tgid); | ||
379 | if (event->data_type == FSNOTIFY_EVENT_PATH) | ||
380 | path_get(&event->path); | ||
381 | |||
382 | return event; | ||
383 | } | ||
384 | |||
385 | /* | 168 | /* |
386 | * fsnotify_create_event - Allocate a new event which will be sent to each | 169 | * fsnotify_create_event - Allocate a new event which will be sent to each |
387 | * group's handle_event function if the group was interested in this | 170 | * group's handle_event function if the group was interested in this |
388 | * particular event. | 171 | * particular event. |
389 | * | 172 | * |
390 | * @to_tell the inode which is supposed to receive the event (sometimes a | 173 | * @inode the inode which is supposed to receive the event (sometimes a |
391 | * parent of the inode to which the event happened. | 174 | * parent of the inode to which the event happened. |
392 | * @mask what actually happened. | 175 | * @mask what actually happened. |
393 | * @data pointer to the object which was actually affected | 176 | * @data pointer to the object which was actually affected |
394 | * @data_type flag indication if the data is a file, path, inode, nothing... | 177 | * @data_type flag indication if the data is a file, path, inode, nothing... |
395 | * @name the filename, if available | 178 | * @name the filename, if available |
396 | */ | 179 | */ |
397 | struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, | 180 | void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, |
398 | int data_type, const unsigned char *name, | 181 | u32 mask) |
399 | u32 cookie, gfp_t gfp) | ||
400 | { | 182 | { |
401 | struct fsnotify_event *event; | 183 | INIT_LIST_HEAD(&event->list); |
402 | 184 | event->inode = inode; | |
403 | event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); | ||
404 | if (!event) | ||
405 | return NULL; | ||
406 | |||
407 | pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", | ||
408 | __func__, event, to_tell, mask, data, data_type); | ||
409 | |||
410 | initialize_event(event); | ||
411 | |||
412 | if (name) { | ||
413 | event->file_name = kstrdup(name, gfp); | ||
414 | if (!event->file_name) { | ||
415 | kmem_cache_free(fsnotify_event_cachep, event); | ||
416 | return NULL; | ||
417 | } | ||
418 | event->name_len = strlen(event->file_name); | ||
419 | } | ||
420 | |||
421 | event->tgid = get_pid(task_tgid(current)); | ||
422 | event->sync_cookie = cookie; | ||
423 | event->to_tell = to_tell; | ||
424 | event->data_type = data_type; | ||
425 | |||
426 | switch (data_type) { | ||
427 | case FSNOTIFY_EVENT_PATH: { | ||
428 | struct path *path = data; | ||
429 | event->path.dentry = path->dentry; | ||
430 | event->path.mnt = path->mnt; | ||
431 | path_get(&event->path); | ||
432 | break; | ||
433 | } | ||
434 | case FSNOTIFY_EVENT_INODE: | ||
435 | event->inode = data; | ||
436 | break; | ||
437 | case FSNOTIFY_EVENT_NONE: | ||
438 | event->inode = NULL; | ||
439 | event->path.dentry = NULL; | ||
440 | event->path.mnt = NULL; | ||
441 | break; | ||
442 | default: | ||
443 | BUG(); | ||
444 | } | ||
445 | |||
446 | event->mask = mask; | 185 | event->mask = mask; |
447 | |||
448 | return event; | ||
449 | } | ||
450 | |||
451 | static __init int fsnotify_notification_init(void) | ||
452 | { | ||
453 | fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); | ||
454 | fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); | ||
455 | |||
456 | q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, | ||
457 | FSNOTIFY_EVENT_NONE, NULL, 0, | ||
458 | GFP_KERNEL); | ||
459 | if (!q_overflow_event) | ||
460 | panic("unable to allocate fsnotify q_overflow_event\n"); | ||
461 | |||
462 | return 0; | ||
463 | } | 186 | } |
464 | subsys_initcall(fsnotify_notification_init); | ||
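The notification.c hunk above replaces the old heavyweight event machinery — slab-allocated events, clone-on-replace, and separate holder structures guarded by nested spinlocks — with a minimal fsnotify_init_event() that only sets up the embedded list head, the inode, and the mask. Backends are now expected to embed struct fsnotify_event in their own event structures and initialize it in place; the sketch below illustrates that shape (my_event and my_event_alloc are illustrative names, not part of the patch):

	#include <linux/fsnotify_backend.h>
	#include <linux/slab.h>

	struct my_event {
		struct fsnotify_event fse;	/* embedded; list/inode/mask live here */
		u32 extra;			/* backend-private payload */
	};

	static struct my_event *my_event_alloc(struct inode *inode, u32 mask)
	{
		struct my_event *ev = kmalloc(sizeof(*ev), GFP_KERNEL);

		if (!ev)
			return NULL;
		/* only initializes the list head, inode and mask, as shown above */
		fsnotify_init_event(&ev->fse, inode, mask);
		ev->extra = 0;
		return ev;
	}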
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f17e58b32989..ce210d4951a1 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile | |||
@@ -38,7 +38,6 @@ ocfs2-objs := \ | |||
38 | symlink.o \ | 38 | symlink.o \ |
39 | sysfile.o \ | 39 | sysfile.o \ |
40 | uptodate.o \ | 40 | uptodate.o \ |
41 | ver.o \ | ||
42 | quota_local.o \ | 41 | quota_local.o \ |
43 | quota_global.o \ | 42 | quota_global.o \ |
44 | xattr.o \ | 43 | xattr.o \ |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index dc7411fe185d..8750ae1b8636 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -7260,14 +7260,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
7260 | start = range->start >> osb->s_clustersize_bits; | 7260 | start = range->start >> osb->s_clustersize_bits; |
7261 | len = range->len >> osb->s_clustersize_bits; | 7261 | len = range->len >> osb->s_clustersize_bits; |
7262 | minlen = range->minlen >> osb->s_clustersize_bits; | 7262 | minlen = range->minlen >> osb->s_clustersize_bits; |
7263 | trimmed = 0; | ||
7264 | |||
7265 | if (!len) { | ||
7266 | range->len = 0; | ||
7267 | return 0; | ||
7268 | } | ||
7269 | 7263 | ||
7270 | if (minlen >= osb->bitmap_cpg) | 7264 | if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) |
7271 | return -EINVAL; | 7265 | return -EINVAL; |
7272 | 7266 | ||
7273 | main_bm_inode = ocfs2_get_system_file_inode(osb, | 7267 | main_bm_inode = ocfs2_get_system_file_inode(osb, |
@@ -7293,6 +7287,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
7293 | goto out_unlock; | 7287 | goto out_unlock; |
7294 | } | 7288 | } |
7295 | 7289 | ||
7290 | len = range->len >> osb->s_clustersize_bits; | ||
7296 | if (start + len > le32_to_cpu(main_bm->i_clusters)) | 7291 | if (start + len > le32_to_cpu(main_bm->i_clusters)) |
7297 | len = le32_to_cpu(main_bm->i_clusters) - start; | 7292 | len = le32_to_cpu(main_bm->i_clusters) - start; |
7298 | 7293 | ||
@@ -7307,6 +7302,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
7307 | last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); | 7302 | last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); |
7308 | last_bit = osb->bitmap_cpg; | 7303 | last_bit = osb->bitmap_cpg; |
7309 | 7304 | ||
7305 | trimmed = 0; | ||
7310 | for (group = first_group; group <= last_group;) { | 7306 | for (group = first_group; group <= last_group;) { |
7311 | if (first_bit + len >= osb->bitmap_cpg) | 7307 | if (first_bit + len >= osb->bitmap_cpg) |
7312 | last_bit = osb->bitmap_cpg; | 7308 | last_bit = osb->bitmap_cpg; |
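In the reworked ocfs2_trim_fs() validation above, a zero-length request is no longer special-cased as an early success; instead any range shorter than one filesystem block is rejected together with an oversized minlen, and the cluster count is recomputed only after the global bitmap is locked so it is clamped against a stable i_clusters. Condensed from the hunk:

	start = range->start >> osb->s_clustersize_bits;
	minlen = range->minlen >> osb->s_clustersize_bits;

	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
		return -EINVAL;

	/* ... after locking the global bitmap inode ... */
	len = range->len >> osb->s_clustersize_bits;
	if (start + len > le32_to_cpu(main_bm->i_clusters))
		len = le32_to_cpu(main_bm->i_clusters) - start;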
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7d8608..1aefc0350ec3 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile | |||
@@ -1,4 +1,4 @@ | |||
1 | obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o | 1 | obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o |
2 | 2 | ||
3 | ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ | 3 | ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ |
4 | quorum.o tcp.o netdebug.o ver.o | 4 | quorum.o tcp.o netdebug.o |
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb240647ca5f..441c84e169e6 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include "heartbeat.h" | 29 | #include "heartbeat.h" |
30 | #include "masklog.h" | 30 | #include "masklog.h" |
31 | #include "sys.h" | 31 | #include "sys.h" |
32 | #include "ver.h" | ||
33 | 32 | ||
34 | /* for now we operate under the assertion that there can be only one | 33 | /* for now we operate under the assertion that there can be only one |
35 | * cluster active at a time. Changing this will require trickling | 34 | * cluster active at a time. Changing this will require trickling |
@@ -945,8 +944,6 @@ static int __init init_o2nm(void) | |||
945 | { | 944 | { |
946 | int ret = -1; | 945 | int ret = -1; |
947 | 946 | ||
948 | cluster_print_version(); | ||
949 | |||
950 | ret = o2hb_init(); | 947 | ret = o2hb_init(); |
951 | if (ret) | 948 | if (ret) |
952 | goto out; | 949 | goto out; |
@@ -984,6 +981,7 @@ out: | |||
984 | 981 | ||
985 | MODULE_AUTHOR("Oracle"); | 982 | MODULE_AUTHOR("Oracle"); |
986 | MODULE_LICENSE("GPL"); | 983 | MODULE_LICENSE("GPL"); |
984 | MODULE_DESCRIPTION("OCFS2 cluster management"); | ||
987 | 985 | ||
988 | module_init(init_o2nm) | 986 | module_init(init_o2nm) |
989 | module_exit(exit_o2nm) | 987 | module_exit(exit_o2nm) |
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6abad3..000000000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null | |||
@@ -1,42 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/kernel.h> | ||
28 | |||
29 | #include "ver.h" | ||
30 | |||
31 | #define CLUSTER_BUILD_VERSION "1.5.0" | ||
32 | |||
33 | #define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION | ||
34 | |||
35 | void cluster_print_version(void) | ||
36 | { | ||
37 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
38 | } | ||
39 | |||
40 | MODULE_DESCRIPTION(VERSION_STR); | ||
41 | |||
42 | MODULE_VERSION(CLUSTER_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3382c2..000000000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null | |||
@@ -1,31 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef O2CLUSTER_VER_H | ||
27 | #define O2CLUSTER_VER_H | ||
28 | |||
29 | void cluster_print_version(void); | ||
30 | |||
31 | #endif /* O2CLUSTER_VER_H */ | ||
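The same cleanup is repeated below for o2dlm, dlmfs, and the core ocfs2 module: delete the per-module ver.{c,h} boilerplate, drop the version banner printed at init time, and describe the module with MODULE_DESCRIPTION() instead. For the node manager shown above, the surviving module metadata reduces to:

	MODULE_AUTHOR("Oracle");
	MODULE_LICENSE("GPL");
	MODULE_DESCRIPTION("OCFS2 cluster management");

	module_init(init_o2nm)
	module_exit(exit_o2nm)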
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index c8a044efbb15..bd1aab1f49a4 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile | |||
@@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2 | |||
3 | obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o | 3 | obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o |
4 | 4 | ||
5 | ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ | 5 | ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ |
6 | dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o | 6 | dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o |
7 | 7 | ||
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8b3382abf840..33660a4a52fa 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -43,8 +43,6 @@ | |||
43 | #include "dlmdomain.h" | 43 | #include "dlmdomain.h" |
44 | #include "dlmdebug.h" | 44 | #include "dlmdebug.h" |
45 | 45 | ||
46 | #include "dlmver.h" | ||
47 | |||
48 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) | 46 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) |
49 | #include "cluster/masklog.h" | 47 | #include "cluster/masklog.h" |
50 | 48 | ||
@@ -2328,8 +2326,6 @@ static int __init dlm_init(void) | |||
2328 | { | 2326 | { |
2329 | int status; | 2327 | int status; |
2330 | 2328 | ||
2331 | dlm_print_version(); | ||
2332 | |||
2333 | status = dlm_init_mle_cache(); | 2329 | status = dlm_init_mle_cache(); |
2334 | if (status) { | 2330 | if (status) { |
2335 | mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); | 2331 | mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); |
@@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void) | |||
2379 | 2375 | ||
2380 | MODULE_AUTHOR("Oracle"); | 2376 | MODULE_AUTHOR("Oracle"); |
2381 | MODULE_LICENSE("GPL"); | 2377 | MODULE_LICENSE("GPL"); |
2378 | MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); | ||
2382 | 2379 | ||
2383 | module_init(dlm_init); | 2380 | module_init(dlm_init); |
2384 | module_exit(dlm_exit); | 2381 | module_exit(dlm_exit); |
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c deleted file mode 100644 index dfc0da4d158d..000000000000 --- a/fs/ocfs2/dlm/dlmver.c +++ /dev/null | |||
@@ -1,42 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/kernel.h> | ||
28 | |||
29 | #include "dlmver.h" | ||
30 | |||
31 | #define DLM_BUILD_VERSION "1.5.0" | ||
32 | |||
33 | #define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION | ||
34 | |||
35 | void dlm_print_version(void) | ||
36 | { | ||
37 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
38 | } | ||
39 | |||
40 | MODULE_DESCRIPTION(VERSION_STR); | ||
41 | |||
42 | MODULE_VERSION(DLM_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h deleted file mode 100644 index f674aee77a16..000000000000 --- a/fs/ocfs2/dlm/dlmver.h +++ /dev/null | |||
@@ -1,31 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmfsver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef DLM_VER_H | ||
27 | #define DLM_VER_H | ||
28 | |||
29 | void dlm_print_version(void); | ||
30 | |||
31 | #endif /* DLM_VER_H */ | ||
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index f14be89a6701..eed3db8c5b49 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile | |||
@@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2 | |||
2 | 2 | ||
3 | obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o | 3 | obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o |
4 | 4 | ||
5 | ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o | 5 | ocfs2_dlmfs-objs := userdlm.o dlmfs.o |
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index efa2b3d339e3..09b7d9dac71d 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c | |||
@@ -49,7 +49,6 @@ | |||
49 | 49 | ||
50 | #include "stackglue.h" | 50 | #include "stackglue.h" |
51 | #include "userdlm.h" | 51 | #include "userdlm.h" |
52 | #include "dlmfsver.h" | ||
53 | 52 | ||
54 | #define MLOG_MASK_PREFIX ML_DLMFS | 53 | #define MLOG_MASK_PREFIX ML_DLMFS |
55 | #include "cluster/masklog.h" | 54 | #include "cluster/masklog.h" |
@@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void) | |||
644 | int status; | 643 | int status; |
645 | int cleanup_inode = 0, cleanup_worker = 0; | 644 | int cleanup_inode = 0, cleanup_worker = 0; |
646 | 645 | ||
647 | dlmfs_print_version(); | ||
648 | |||
649 | status = bdi_init(&dlmfs_backing_dev_info); | 646 | status = bdi_init(&dlmfs_backing_dev_info); |
650 | if (status) | 647 | if (status) |
651 | return status; | 648 | return status; |
@@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void) | |||
701 | 698 | ||
702 | MODULE_AUTHOR("Oracle"); | 699 | MODULE_AUTHOR("Oracle"); |
703 | MODULE_LICENSE("GPL"); | 700 | MODULE_LICENSE("GPL"); |
701 | MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); | ||
704 | 702 | ||
705 | module_init(init_dlmfs_fs) | 703 | module_init(init_dlmfs_fs) |
706 | module_exit(exit_dlmfs_fs) | 704 | module_exit(exit_dlmfs_fs) |
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c deleted file mode 100644 index a733b3321f83..000000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.c +++ /dev/null | |||
@@ -1,42 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmfsver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/kernel.h> | ||
28 | |||
29 | #include "dlmfsver.h" | ||
30 | |||
31 | #define DLM_BUILD_VERSION "1.5.0" | ||
32 | |||
33 | #define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION | ||
34 | |||
35 | void dlmfs_print_version(void) | ||
36 | { | ||
37 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
38 | } | ||
39 | |||
40 | MODULE_DESCRIPTION(VERSION_STR); | ||
41 | |||
42 | MODULE_VERSION(DLM_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h deleted file mode 100644 index f35eadbed25c..000000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.h +++ /dev/null | |||
@@ -1,31 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef DLMFS_VER_H | ||
27 | #define DLMFS_VER_H | ||
28 | |||
29 | void dlmfs_print_version(void); | ||
30 | |||
31 | #endif /* DLMFS_VER_H */ | ||
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3407b2c62b21..19986959d149 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) | |||
2996 | 2996 | ||
2997 | /* for now, uuid == domain */ | 2997 | /* for now, uuid == domain */ |
2998 | status = ocfs2_cluster_connect(osb->osb_cluster_stack, | 2998 | status = ocfs2_cluster_connect(osb->osb_cluster_stack, |
2999 | osb->osb_cluster_name, | ||
3000 | strlen(osb->osb_cluster_name), | ||
2999 | osb->uuid_str, | 3001 | osb->uuid_str, |
3000 | strlen(osb->uuid_str), | 3002 | strlen(osb->uuid_str), |
3001 | &lproto, ocfs2_do_node_down, osb, | 3003 | &lproto, ocfs2_do_node_down, osb, |
@@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) | |||
3005 | goto bail; | 3007 | goto bail; |
3006 | } | 3008 | } |
3007 | 3009 | ||
3008 | status = ocfs2_cluster_this_node(&osb->node_num); | 3010 | status = ocfs2_cluster_this_node(conn, &osb->node_num); |
3009 | if (status < 0) { | 3011 | if (status < 0) { |
3010 | mlog_errno(status); | 3012 | mlog_errno(status); |
3011 | mlog(ML_ERROR, | 3013 | mlog(ML_ERROR, |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6fff128cad16..f42eecef6478 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1869,7 +1869,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, | |||
1869 | } | 1869 | } |
1870 | size = sr->l_start + sr->l_len; | 1870 | size = sr->l_start + sr->l_len; |
1871 | 1871 | ||
1872 | if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { | 1872 | if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || |
1873 | cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { | ||
1873 | if (sr->l_len <= 0) { | 1874 | if (sr->l_len <= 0) { |
1874 | ret = -EINVAL; | 1875 | ret = -EINVAL; |
1875 | goto out_inode_unlock; | 1876 | goto out_inode_unlock; |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index fa32ce9b455d..8ca3c29accbf 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
@@ -7,6 +7,7 @@ | |||
7 | 7 | ||
8 | #include <linux/fs.h> | 8 | #include <linux/fs.h> |
9 | #include <linux/mount.h> | 9 | #include <linux/mount.h> |
10 | #include <linux/blkdev.h> | ||
10 | #include <linux/compat.h> | 11 | #include <linux/compat.h> |
11 | 12 | ||
12 | #include <cluster/masklog.h> | 13 | #include <cluster/masklog.h> |
@@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
966 | case FITRIM: | 967 | case FITRIM: |
967 | { | 968 | { |
968 | struct super_block *sb = inode->i_sb; | 969 | struct super_block *sb = inode->i_sb; |
970 | struct request_queue *q = bdev_get_queue(sb->s_bdev); | ||
969 | struct fstrim_range range; | 971 | struct fstrim_range range; |
970 | int ret = 0; | 972 | int ret = 0; |
971 | 973 | ||
972 | if (!capable(CAP_SYS_ADMIN)) | 974 | if (!capable(CAP_SYS_ADMIN)) |
973 | return -EPERM; | 975 | return -EPERM; |
974 | 976 | ||
977 | if (!blk_queue_discard(q)) | ||
978 | return -EOPNOTSUPP; | ||
979 | |||
975 | if (copy_from_user(&range, argp, sizeof(range))) | 980 | if (copy_from_user(&range, argp, sizeof(range))) |
976 | return -EFAULT; | 981 | return -EFAULT; |
977 | 982 | ||
983 | range.minlen = max_t(u64, q->limits.discard_granularity, | ||
984 | range.minlen); | ||
978 | ret = ocfs2_trim_fs(sb, &range); | 985 | ret = ocfs2_trim_fs(sb, &range); |
979 | if (ret < 0) | 986 | if (ret < 0) |
980 | return ret; | 987 | return ret; |
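The FITRIM path above now refuses to run on devices that cannot discard and raises the caller's minimum extent length to the device's discard granularity before calling ocfs2_trim_fs(). An illustrative userspace caller of the same interface is sketched below (the fd, the helper name, and the whole-device range are assumptions, not part of the patch):

	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FITRIM, struct fstrim_range */
	#include <limits.h>

	static int trim_whole_fs(int fd)
	{
		struct fstrim_range r = {
			.start  = 0,
			.len    = ULLONG_MAX,
			.minlen = 0,	/* kernel clamps this up to the discard granularity */
		};

		/* fails with EOPNOTSUPP when the underlying device lacks discard support */
		return ioctl(fd, FITRIM, &r);
	}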
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 631a98213474..64c304d668f0 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c | |||
@@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, | |||
561 | mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); | 561 | mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); |
562 | } | 562 | } |
563 | 563 | ||
564 | static int ocfs2_alloc_dinode_update_counts(struct inode *inode, | ||
565 | handle_t *handle, | ||
566 | struct buffer_head *di_bh, | ||
567 | u32 num_bits, | ||
568 | u16 chain) | ||
569 | { | ||
570 | int ret; | ||
571 | u32 tmp_used; | ||
572 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; | ||
573 | struct ocfs2_chain_list *cl = | ||
574 | (struct ocfs2_chain_list *) &di->id2.i_chain; | ||
575 | |||
576 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | ||
577 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
578 | if (ret < 0) { | ||
579 | mlog_errno(ret); | ||
580 | goto out; | ||
581 | } | ||
582 | |||
583 | tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); | ||
584 | di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); | ||
585 | le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); | ||
586 | ocfs2_journal_dirty(handle, di_bh); | ||
587 | |||
588 | out: | ||
589 | return ret; | ||
590 | } | ||
591 | |||
592 | static inline int ocfs2_block_group_set_bits(handle_t *handle, | ||
593 | struct inode *alloc_inode, | ||
594 | struct ocfs2_group_desc *bg, | ||
595 | struct buffer_head *group_bh, | ||
596 | unsigned int bit_off, | ||
597 | unsigned int num_bits) | ||
598 | { | ||
599 | int status; | ||
600 | void *bitmap = bg->bg_bitmap; | ||
601 | int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; | ||
602 | |||
603 | /* All callers get the descriptor via | ||
604 | * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ | ||
605 | BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); | ||
606 | BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); | ||
607 | |||
608 | mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, | ||
609 | num_bits); | ||
610 | |||
611 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
612 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | ||
613 | |||
614 | status = ocfs2_journal_access_gd(handle, | ||
615 | INODE_CACHE(alloc_inode), | ||
616 | group_bh, | ||
617 | journal_type); | ||
618 | if (status < 0) { | ||
619 | mlog_errno(status); | ||
620 | goto bail; | ||
621 | } | ||
622 | |||
623 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); | ||
624 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { | ||
625 | ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" | ||
626 | " count %u but claims %u are freed. num_bits %d", | ||
627 | (unsigned long long)le64_to_cpu(bg->bg_blkno), | ||
628 | le16_to_cpu(bg->bg_bits), | ||
629 | le16_to_cpu(bg->bg_free_bits_count), num_bits); | ||
630 | return -EROFS; | ||
631 | } | ||
632 | while (num_bits--) | ||
633 | ocfs2_set_bit(bit_off++, bitmap); | ||
634 | |||
635 | ocfs2_journal_dirty(handle, group_bh); | ||
636 | |||
637 | bail: | ||
638 | return status; | ||
639 | } | ||
640 | |||
641 | static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, | 564 | static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, |
642 | u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, | 565 | u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, |
643 | u32 len, int ext_flags) | 566 | u32 len, int ext_flags) |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3a903470c794..553f53cc73ae 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -387,6 +387,7 @@ struct ocfs2_super | |||
387 | u8 osb_stackflags; | 387 | u8 osb_stackflags; |
388 | 388 | ||
389 | char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; | 389 | char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; |
390 | char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; | ||
390 | struct ocfs2_cluster_connection *cconn; | 391 | struct ocfs2_cluster_connection *cconn; |
391 | struct ocfs2_lock_res osb_super_lockres; | 392 | struct ocfs2_lock_res osb_super_lockres; |
392 | struct ocfs2_lock_res osb_rename_lockres; | 393 | struct ocfs2_lock_res osb_rename_lockres; |
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bf1f8930456f..1724d43d3da1 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c | |||
@@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) | |||
398 | return 0; | 398 | return 0; |
399 | } | 399 | } |
400 | 400 | ||
401 | static int o2cb_cluster_this_node(unsigned int *node) | 401 | static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, |
402 | unsigned int *node) | ||
402 | { | 403 | { |
403 | int node_num; | 404 | int node_num; |
404 | 405 | ||
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1e231f..13a8537d8e8b 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/reboot.h> | 25 | #include <linux/reboot.h> |
26 | #include <linux/sched.h> | ||
26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
27 | 28 | ||
28 | #include "stackglue.h" | 29 | #include "stackglue.h" |
@@ -102,6 +103,12 @@ | |||
102 | #define OCFS2_TEXT_UUID_LEN 32 | 103 | #define OCFS2_TEXT_UUID_LEN 32 |
103 | #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 | 104 | #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 |
104 | #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 | 105 | #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 |
106 | #define VERSION_LOCK "version_lock" | ||
107 | |||
108 | enum ocfs2_connection_type { | ||
109 | WITH_CONTROLD, | ||
110 | NO_CONTROLD | ||
111 | }; | ||
105 | 112 | ||
106 | /* | 113 | /* |
107 | * ocfs2_live_connection is refcounted because the filesystem and | 114 | * ocfs2_live_connection is refcounted because the filesystem and |
@@ -110,6 +117,13 @@ | |||
110 | struct ocfs2_live_connection { | 117 | struct ocfs2_live_connection { |
111 | struct list_head oc_list; | 118 | struct list_head oc_list; |
112 | struct ocfs2_cluster_connection *oc_conn; | 119 | struct ocfs2_cluster_connection *oc_conn; |
120 | enum ocfs2_connection_type oc_type; | ||
121 | atomic_t oc_this_node; | ||
122 | int oc_our_slot; | ||
123 | struct dlm_lksb oc_version_lksb; | ||
124 | char oc_lvb[DLM_LVB_LEN]; | ||
125 | struct completion oc_sync_wait; | ||
126 | wait_queue_head_t oc_wait; | ||
113 | }; | 127 | }; |
114 | 128 | ||
115 | struct ocfs2_control_private { | 129 | struct ocfs2_control_private { |
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) | |||
198 | * mount path. Since the VFS prevents multiple calls to | 212 | * mount path. Since the VFS prevents multiple calls to |
199 | * fill_super(), we can't get dupes here. | 213 | * fill_super(), we can't get dupes here. |
200 | */ | 214 | */ |
201 | static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, | 215 | static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, |
202 | struct ocfs2_live_connection **c_ret) | 216 | struct ocfs2_live_connection *c) |
203 | { | 217 | { |
204 | int rc = 0; | 218 | int rc = 0; |
205 | struct ocfs2_live_connection *c; | ||
206 | |||
207 | c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); | ||
208 | if (!c) | ||
209 | return -ENOMEM; | ||
210 | 219 | ||
211 | mutex_lock(&ocfs2_control_lock); | 220 | mutex_lock(&ocfs2_control_lock); |
212 | c->oc_conn = conn; | 221 | c->oc_conn = conn; |
213 | 222 | ||
214 | if (atomic_read(&ocfs2_control_opened)) | 223 | if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) |
215 | list_add(&c->oc_list, &ocfs2_live_connection_list); | 224 | list_add(&c->oc_list, &ocfs2_live_connection_list); |
216 | else { | 225 | else { |
217 | printk(KERN_ERR | 226 | printk(KERN_ERR |
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, | |||
220 | } | 229 | } |
221 | 230 | ||
222 | mutex_unlock(&ocfs2_control_lock); | 231 | mutex_unlock(&ocfs2_control_lock); |
223 | |||
224 | if (!rc) | ||
225 | *c_ret = c; | ||
226 | else | ||
227 | kfree(c); | ||
228 | |||
229 | return rc; | 232 | return rc; |
230 | } | 233 | } |
231 | 234 | ||
@@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, | |||
799 | return 0; | 802 | return 0; |
800 | } | 803 | } |
801 | 804 | ||
805 | static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) | ||
806 | { | ||
807 | struct ocfs2_protocol_version *pv = | ||
808 | (struct ocfs2_protocol_version *)lvb; | ||
809 | /* | ||
810 | * ocfs2_protocol_version has two u8 variables, so we don't | ||
811 | * need any endian conversion. | ||
812 | */ | ||
813 | ver->pv_major = pv->pv_major; | ||
814 | ver->pv_minor = pv->pv_minor; | ||
815 | } | ||
816 | |||
817 | static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) | ||
818 | { | ||
819 | struct ocfs2_protocol_version *pv = | ||
820 | (struct ocfs2_protocol_version *)lvb; | ||
821 | /* | ||
822 | * ocfs2_protocol_version has two u8 variables, so we don't | ||
823 | * need any endian conversion. | ||
824 | */ | ||
825 | pv->pv_major = ver->pv_major; | ||
826 | pv->pv_minor = ver->pv_minor; | ||
827 | } | ||
828 | |||
829 | static void sync_wait_cb(void *arg) | ||
830 | { | ||
831 | struct ocfs2_cluster_connection *conn = arg; | ||
832 | struct ocfs2_live_connection *lc = conn->cc_private; | ||
833 | complete(&lc->oc_sync_wait); | ||
834 | } | ||
835 | |||
836 | static int sync_unlock(struct ocfs2_cluster_connection *conn, | ||
837 | struct dlm_lksb *lksb, char *name) | ||
838 | { | ||
839 | int error; | ||
840 | struct ocfs2_live_connection *lc = conn->cc_private; | ||
841 | |||
842 | error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); | ||
843 | if (error) { | ||
844 | printk(KERN_ERR "%s lkid %x error %d\n", | ||
845 | name, lksb->sb_lkid, error); | ||
846 | return error; | ||
847 | } | ||
848 | |||
849 | wait_for_completion(&lc->oc_sync_wait); | ||
850 | |||
851 | if (lksb->sb_status != -DLM_EUNLOCK) { | ||
852 | printk(KERN_ERR "%s lkid %x status %d\n", | ||
853 | name, lksb->sb_lkid, lksb->sb_status); | ||
854 | return -1; | ||
855 | } | ||
856 | return 0; | ||
857 | } | ||
858 | |||
859 | static int sync_lock(struct ocfs2_cluster_connection *conn, | ||
860 | int mode, uint32_t flags, | ||
861 | struct dlm_lksb *lksb, char *name) | ||
862 | { | ||
863 | int error, status; | ||
864 | struct ocfs2_live_connection *lc = conn->cc_private; | ||
865 | |||
866 | error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, | ||
867 | name, strlen(name), | ||
868 | 0, sync_wait_cb, conn, NULL); | ||
869 | if (error) { | ||
870 | printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", | ||
871 | name, lksb->sb_lkid, flags, mode, error); | ||
872 | return error; | ||
873 | } | ||
874 | |||
875 | wait_for_completion(&lc->oc_sync_wait); | ||
876 | |||
877 | status = lksb->sb_status; | ||
878 | |||
879 | if (status && status != -EAGAIN) { | ||
880 | printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", | ||
881 | name, lksb->sb_lkid, flags, mode, status); | ||
882 | } | ||
883 | |||
884 | return status; | ||
885 | } | ||
886 | |||
887 | |||
888 | static int version_lock(struct ocfs2_cluster_connection *conn, int mode, | ||
889 | int flags) | ||
890 | { | ||
891 | struct ocfs2_live_connection *lc = conn->cc_private; | ||
892 | return sync_lock(conn, mode, flags, | ||
893 | &lc->oc_version_lksb, VERSION_LOCK); | ||
894 | } | ||
895 | |||
896 | static int version_unlock(struct ocfs2_cluster_connection *conn) | ||
897 | { | ||
898 | struct ocfs2_live_connection *lc = conn->cc_private; | ||
899 | return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); | ||
900 | } | ||
901 | |||
902 | /* get_protocol_version() | ||
903 | * | ||
904 | * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. | ||
905 | * The algorithm is: | ||
906 | * 1. Attempt to take the lock in EX mode (non-blocking). | ||
907 | * 2. If successful (which means it is the first mount), write the | ||
908 | * version number and downconvert to PR lock. | ||
909 | * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after | ||
910 | * taking the PR lock. | ||
911 | */ | ||
912 | |||
913 | static int get_protocol_version(struct ocfs2_cluster_connection *conn) | ||
914 | { | ||
915 | int ret; | ||
916 | struct ocfs2_live_connection *lc = conn->cc_private; | ||
917 | struct ocfs2_protocol_version pv; | ||
918 | |||
919 | running_proto.pv_major = | ||
920 | ocfs2_user_plugin.sp_max_proto.pv_major; | ||
921 | running_proto.pv_minor = | ||
922 | ocfs2_user_plugin.sp_max_proto.pv_minor; | ||
923 | |||
924 | lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; | ||
925 | ret = version_lock(conn, DLM_LOCK_EX, | ||
926 | DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); | ||
927 | if (!ret) { | ||
928 | conn->cc_version.pv_major = running_proto.pv_major; | ||
929 | conn->cc_version.pv_minor = running_proto.pv_minor; | ||
930 | version_to_lvb(&running_proto, lc->oc_lvb); | ||
931 | version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); | ||
932 | } else if (ret == -EAGAIN) { | ||
933 | ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); | ||
934 | if (ret) | ||
935 | goto out; | ||
936 | lvb_to_version(lc->oc_lvb, &pv); | ||
937 | |||
938 | if ((pv.pv_major != running_proto.pv_major) || | ||
939 | (pv.pv_minor > running_proto.pv_minor)) { | ||
940 | ret = -EINVAL; | ||
941 | goto out; | ||
942 | } | ||
943 | |||
944 | conn->cc_version.pv_major = pv.pv_major; | ||
945 | conn->cc_version.pv_minor = pv.pv_minor; | ||
946 | } | ||
947 | out: | ||
948 | return ret; | ||
949 | } | ||
950 | |||
951 | static void user_recover_prep(void *arg) | ||
952 | { | ||
953 | } | ||
954 | |||
955 | static void user_recover_slot(void *arg, struct dlm_slot *slot) | ||
956 | { | ||
957 | struct ocfs2_cluster_connection *conn = arg; | ||
958 | printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", | ||
959 | slot->nodeid, slot->slot); | ||
960 | conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); | ||
961 | |||
962 | } | ||
963 | |||
964 | static void user_recover_done(void *arg, struct dlm_slot *slots, | ||
965 | int num_slots, int our_slot, | ||
966 | uint32_t generation) | ||
967 | { | ||
968 | struct ocfs2_cluster_connection *conn = arg; | ||
969 | struct ocfs2_live_connection *lc = conn->cc_private; | ||
970 | int i; | ||
971 | |||
972 | for (i = 0; i < num_slots; i++) | ||
973 | if (slots[i].slot == our_slot) { | ||
974 | atomic_set(&lc->oc_this_node, slots[i].nodeid); | ||
975 | break; | ||
976 | } | ||
977 | |||
978 | lc->oc_our_slot = our_slot; | ||
979 | wake_up(&lc->oc_wait); | ||
980 | } | ||
981 | |||
982 | static const struct dlm_lockspace_ops ocfs2_ls_ops = { | ||
983 | .recover_prep = user_recover_prep, | ||
984 | .recover_slot = user_recover_slot, | ||
985 | .recover_done = user_recover_done, | ||
986 | }; | ||
987 | |||
988 | static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) | ||
989 | { | ||
990 | version_unlock(conn); | ||
991 | dlm_release_lockspace(conn->cc_lockspace, 2); | ||
992 | conn->cc_lockspace = NULL; | ||
993 | ocfs2_live_connection_drop(conn->cc_private); | ||
994 | conn->cc_private = NULL; | ||
995 | return 0; | ||
996 | } | ||
997 | |||
802 | static int user_cluster_connect(struct ocfs2_cluster_connection *conn) | 998 | static int user_cluster_connect(struct ocfs2_cluster_connection *conn) |
803 | { | 999 | { |
804 | dlm_lockspace_t *fsdlm; | 1000 | dlm_lockspace_t *fsdlm; |
805 | struct ocfs2_live_connection *uninitialized_var(control); | 1001 | struct ocfs2_live_connection *lc; |
806 | int rc = 0; | 1002 | int rc, ops_rv; |
807 | 1003 | ||
808 | BUG_ON(conn == NULL); | 1004 | BUG_ON(conn == NULL); |
809 | 1005 | ||
810 | rc = ocfs2_live_connection_new(conn, &control); | 1006 | lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); |
1007 | if (!lc) { | ||
1008 | rc = -ENOMEM; | ||
1009 | goto out; | ||
1010 | } | ||
1011 | |||
1012 | init_waitqueue_head(&lc->oc_wait); | ||
1013 | init_completion(&lc->oc_sync_wait); | ||
1014 | atomic_set(&lc->oc_this_node, 0); | ||
1015 | conn->cc_private = lc; | ||
1016 | lc->oc_type = NO_CONTROLD; | ||
1017 | |||
1018 | rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, | ||
1019 | DLM_LSFL_FS, DLM_LVB_LEN, | ||
1020 | &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); | ||
1021 | if (rc) | ||
1022 | goto out; | ||
1023 | |||
1024 | if (ops_rv == -EOPNOTSUPP) { | ||
1025 | lc->oc_type = WITH_CONTROLD; | ||
1026 | printk(KERN_NOTICE "ocfs2: You seem to be using an older " | ||
1027 | "version of dlm_controld and/or ocfs2-tools." | ||
1028 | " Please consider upgrading.\n"); | ||
1029 | } else if (ops_rv) { | ||
1030 | rc = ops_rv; | ||
1031 | goto out; | ||
1032 | } | ||
1033 | conn->cc_lockspace = fsdlm; | ||
1034 | |||
1035 | rc = ocfs2_live_connection_attach(conn, lc); | ||
811 | if (rc) | 1036 | if (rc) |
812 | goto out; | 1037 | goto out; |
813 | 1038 | ||
1039 | if (lc->oc_type == NO_CONTROLD) { | ||
1040 | rc = get_protocol_version(conn); | ||
1041 | if (rc) { | ||
1042 | printk(KERN_ERR "ocfs2: Could not determine" | ||
1043 | " locking version\n"); | ||
1044 | user_cluster_disconnect(conn); | ||
1045 | goto out; | ||
1046 | } | ||
1047 | wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); | ||
1048 | } | ||
1049 | |||
814 | /* | 1050 | /* |
815 | * running_proto must have been set before we allowed any mounts | 1051 | * running_proto must have been set before we allowed any mounts |
816 | * to proceed. | 1052 | * to proceed. |
@@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) | |||
818 | if (fs_protocol_compare(&running_proto, &conn->cc_version)) { | 1054 | if (fs_protocol_compare(&running_proto, &conn->cc_version)) { |
819 | printk(KERN_ERR | 1055 | printk(KERN_ERR |
820 | "Unable to mount with fs locking protocol version " | 1056 | "Unable to mount with fs locking protocol version " |
821 | "%u.%u because the userspace control daemon has " | 1057 | "%u.%u because negotiated protocol is %u.%u\n", |
822 | "negotiated %u.%u\n", | ||
823 | conn->cc_version.pv_major, conn->cc_version.pv_minor, | 1058 | conn->cc_version.pv_major, conn->cc_version.pv_minor, |
824 | running_proto.pv_major, running_proto.pv_minor); | 1059 | running_proto.pv_major, running_proto.pv_minor); |
825 | rc = -EPROTO; | 1060 | rc = -EPROTO; |
826 | ocfs2_live_connection_drop(control); | 1061 | ocfs2_live_connection_drop(lc); |
827 | goto out; | 1062 | lc = NULL; |
828 | } | ||
829 | |||
830 | rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, | ||
831 | NULL, NULL, NULL, &fsdlm); | ||
832 | if (rc) { | ||
833 | ocfs2_live_connection_drop(control); | ||
834 | goto out; | ||
835 | } | 1063 | } |
836 | 1064 | ||
837 | conn->cc_private = control; | ||
838 | conn->cc_lockspace = fsdlm; | ||
839 | out: | 1065 | out: |
1066 | if (rc && lc) | ||
1067 | kfree(lc); | ||
840 | return rc; | 1068 | return rc; |
841 | } | 1069 | } |
842 | 1070 | ||
843 | static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) | ||
844 | { | ||
845 | dlm_release_lockspace(conn->cc_lockspace, 2); | ||
846 | conn->cc_lockspace = NULL; | ||
847 | ocfs2_live_connection_drop(conn->cc_private); | ||
848 | conn->cc_private = NULL; | ||
849 | return 0; | ||
850 | } | ||
851 | 1071 | ||
852 | static int user_cluster_this_node(unsigned int *this_node) | 1072 | static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, |
1073 | unsigned int *this_node) | ||
853 | { | 1074 | { |
854 | int rc; | 1075 | int rc; |
1076 | struct ocfs2_live_connection *lc = conn->cc_private; | ||
1077 | |||
1078 | if (lc->oc_type == WITH_CONTROLD) | ||
1079 | rc = ocfs2_control_get_this_node(); | ||
1080 | else if (lc->oc_type == NO_CONTROLD) | ||
1081 | rc = atomic_read(&lc->oc_this_node); | ||
1082 | else | ||
1083 | rc = -EINVAL; | ||
855 | 1084 | ||
856 | rc = ocfs2_control_get_this_node(); | ||
857 | if (rc < 0) | 1085 | if (rc < 0) |
858 | return rc; | 1086 | return rc; |
859 | 1087 | ||
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index cb7ec0b63ddc..1324e6600e57 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c | |||
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, | |||
309 | EXPORT_SYMBOL_GPL(ocfs2_plock); | 309 | EXPORT_SYMBOL_GPL(ocfs2_plock); |
310 | 310 | ||
311 | int ocfs2_cluster_connect(const char *stack_name, | 311 | int ocfs2_cluster_connect(const char *stack_name, |
312 | const char *cluster_name, | ||
313 | int cluster_name_len, | ||
312 | const char *group, | 314 | const char *group, |
313 | int grouplen, | 315 | int grouplen, |
314 | struct ocfs2_locking_protocol *lproto, | 316 | struct ocfs2_locking_protocol *lproto, |
@@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name, | |||
342 | goto out; | 344 | goto out; |
343 | } | 345 | } |
344 | 346 | ||
345 | memcpy(new_conn->cc_name, group, grouplen); | 347 | strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); |
346 | new_conn->cc_namelen = grouplen; | 348 | new_conn->cc_namelen = grouplen; |
349 | strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); | ||
350 | new_conn->cc_cluster_name_len = cluster_name_len; | ||
347 | new_conn->cc_recovery_handler = recovery_handler; | 351 | new_conn->cc_recovery_handler = recovery_handler; |
348 | new_conn->cc_recovery_data = recovery_data; | 352 | new_conn->cc_recovery_data = recovery_data; |
349 | 353 | ||
@@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group, | |||
386 | 390 | ||
387 | if (cluster_stack_name[0]) | 391 | if (cluster_stack_name[0]) |
388 | stack_name = cluster_stack_name; | 392 | stack_name = cluster_stack_name; |
389 | return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, | 393 | return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, |
390 | recovery_handler, recovery_data, conn); | 394 | lproto, recovery_handler, recovery_data, |
395 | conn); | ||
391 | } | 396 | } |
392 | EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); | 397 | EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); |
393 | 398 | ||
@@ -460,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) | |||
460 | } | 465 | } |
461 | EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); | 466 | EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); |
462 | 467 | ||
463 | int ocfs2_cluster_this_node(unsigned int *node) | 468 | int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, |
469 | unsigned int *node) | ||
464 | { | 470 | { |
465 | return active_stack->sp_ops->this_node(node); | 471 | return active_stack->sp_ops->this_node(conn, node); |
466 | } | 472 | } |
467 | EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); | 473 | EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); |
468 | 474 | ||
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 1ec56fdb8d0d..66334a30cea8 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h | |||
@@ -45,6 +45,9 @@ struct file_lock; | |||
45 | */ | 45 | */ |
46 | #define GROUP_NAME_MAX 64 | 46 | #define GROUP_NAME_MAX 64 |
47 | 47 | ||
48 | /* This shadows OCFS2_CLUSTER_NAME_LEN */ | ||
49 | #define CLUSTER_NAME_MAX 16 | ||
50 | |||
48 | 51 | ||
49 | /* | 52 | /* |
50 | * ocfs2_protocol_version changes when ocfs2 does something different in | 53 | * ocfs2_protocol_version changes when ocfs2 does something different in |
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol { | |||
97 | * locking compatibility. | 100 | * locking compatibility. |
98 | */ | 101 | */ |
99 | struct ocfs2_cluster_connection { | 102 | struct ocfs2_cluster_connection { |
100 | char cc_name[GROUP_NAME_MAX]; | 103 | char cc_name[GROUP_NAME_MAX + 1]; |
101 | int cc_namelen; | 104 | int cc_namelen; |
105 | char cc_cluster_name[CLUSTER_NAME_MAX + 1]; | ||
106 | int cc_cluster_name_len; | ||
102 | struct ocfs2_protocol_version cc_version; | 107 | struct ocfs2_protocol_version cc_version; |
103 | struct ocfs2_locking_protocol *cc_proto; | 108 | struct ocfs2_locking_protocol *cc_proto; |
104 | void (*cc_recovery_handler)(int node_num, void *recovery_data); | 109 | void (*cc_recovery_handler)(int node_num, void *recovery_data); |
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations { | |||
152 | * ->this_node() returns the cluster's unique identifier for the | 157 | * ->this_node() returns the cluster's unique identifier for the |
153 | * local node. | 158 | * local node. |
154 | */ | 159 | */ |
155 | int (*this_node)(unsigned int *node); | 160 | int (*this_node)(struct ocfs2_cluster_connection *conn, |
161 | unsigned int *node); | ||
156 | 162 | ||
157 | /* | 163 | /* |
158 | * Call the underlying dlm lock function. The ->dlm_lock() | 164 | * Call the underlying dlm lock function. The ->dlm_lock() |
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin { | |||
239 | 245 | ||
240 | /* Used by the filesystem */ | 246 | /* Used by the filesystem */ |
241 | int ocfs2_cluster_connect(const char *stack_name, | 247 | int ocfs2_cluster_connect(const char *stack_name, |
248 | const char *cluster_name, | ||
249 | int cluster_name_len, | ||
242 | const char *group, | 250 | const char *group, |
243 | int grouplen, | 251 | int grouplen, |
244 | struct ocfs2_locking_protocol *lproto, | 252 | struct ocfs2_locking_protocol *lproto, |
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group, | |||
260 | int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, | 268 | int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, |
261 | int hangup_pending); | 269 | int hangup_pending); |
262 | void ocfs2_cluster_hangup(const char *group, int grouplen); | 270 | void ocfs2_cluster_hangup(const char *group, int grouplen); |
263 | int ocfs2_cluster_this_node(unsigned int *node); | 271 | int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, |
272 | unsigned int *node); | ||
264 | 273 | ||
265 | struct ocfs2_lock_res; | 274 | struct ocfs2_lock_res; |
266 | int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, | 275 | int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, |
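With the stackglue changes above, the cluster name and the connection object both travel through the glue layer: ocfs2_cluster_connect() gains cluster_name/cluster_name_len parameters, and ->this_node() receives the connection so the userspace stack can answer from per-connection state. Callers without a cluster name, such as the stack-agnostic wrapper, simply pass NULL/0, and node lookup dispatches through the connection:

	/* stack-agnostic connect, as updated in stackglue.c above */
	return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
				     lproto, recovery_handler, recovery_data,
				     conn);

	/* node lookup now hands the connection down to the active stack */
	int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
				    unsigned int *node)
	{
		return active_stack->sp_ops->this_node(conn, node);
	}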
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 2c91452c4047..47ae2663a6f5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, | |||
113 | struct ocfs2_suballoc_result *res); | 113 | struct ocfs2_suballoc_result *res); |
114 | static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, | 114 | static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, |
115 | int nr); | 115 | int nr); |
116 | static inline int ocfs2_block_group_set_bits(handle_t *handle, | ||
117 | struct inode *alloc_inode, | ||
118 | struct ocfs2_group_desc *bg, | ||
119 | struct buffer_head *group_bh, | ||
120 | unsigned int bit_off, | ||
121 | unsigned int num_bits); | ||
122 | static int ocfs2_relink_block_group(handle_t *handle, | 116 | static int ocfs2_relink_block_group(handle_t *handle, |
123 | struct inode *alloc_inode, | 117 | struct inode *alloc_inode, |
124 | struct buffer_head *fe_bh, | 118 | struct buffer_head *fe_bh, |
@@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, | |||
1343 | return status; | 1337 | return status; |
1344 | } | 1338 | } |
1345 | 1339 | ||
1346 | static inline int ocfs2_block_group_set_bits(handle_t *handle, | 1340 | int ocfs2_block_group_set_bits(handle_t *handle, |
1347 | struct inode *alloc_inode, | 1341 | struct inode *alloc_inode, |
1348 | struct ocfs2_group_desc *bg, | 1342 | struct ocfs2_group_desc *bg, |
1349 | struct buffer_head *group_bh, | 1343 | struct buffer_head *group_bh, |
@@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, | |||
1388 | ocfs2_journal_dirty(handle, group_bh); | 1382 | ocfs2_journal_dirty(handle, group_bh); |
1389 | 1383 | ||
1390 | bail: | 1384 | bail: |
1391 | if (status) | ||
1392 | mlog_errno(status); | ||
1393 | return status; | 1385 | return status; |
1394 | } | 1386 | } |
1395 | 1387 | ||
@@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode, | |||
1588 | return ret; | 1580 | return ret; |
1589 | } | 1581 | } |
1590 | 1582 | ||
1591 | static int ocfs2_alloc_dinode_update_counts(struct inode *inode, | 1583 | int ocfs2_alloc_dinode_update_counts(struct inode *inode, |
1592 | handle_t *handle, | 1584 | handle_t *handle, |
1593 | struct buffer_head *di_bh, | 1585 | struct buffer_head *di_bh, |
1594 | u32 num_bits, | 1586 | u32 num_bits, |
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a36d0aa50911..218d8036b3e7 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h | |||
@@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, | |||
86 | u32 bits_wanted, | 86 | u32 bits_wanted, |
87 | struct ocfs2_alloc_context **ac); | 87 | struct ocfs2_alloc_context **ac); |
88 | 88 | ||
89 | int ocfs2_alloc_dinode_update_counts(struct inode *inode, | ||
90 | handle_t *handle, | ||
91 | struct buffer_head *di_bh, | ||
92 | u32 num_bits, | ||
93 | u16 chain); | ||
94 | int ocfs2_block_group_set_bits(handle_t *handle, | ||
95 | struct inode *alloc_inode, | ||
96 | struct ocfs2_group_desc *bg, | ||
97 | struct buffer_head *group_bh, | ||
98 | unsigned int bit_off, | ||
99 | unsigned int num_bits); | ||
100 | |||
89 | int ocfs2_claim_metadata(handle_t *handle, | 101 | int ocfs2_claim_metadata(handle_t *handle, |
90 | struct ocfs2_alloc_context *ac, | 102 | struct ocfs2_alloc_context *ac, |
91 | u32 bits_wanted, | 103 | u32 bits_wanted, |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c41492957aa5..49d84f80f36c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -68,7 +68,6 @@ | |||
68 | #include "super.h" | 68 | #include "super.h" |
69 | #include "sysfile.h" | 69 | #include "sysfile.h" |
70 | #include "uptodate.h" | 70 | #include "uptodate.h" |
71 | #include "ver.h" | ||
72 | #include "xattr.h" | 71 | #include "xattr.h" |
73 | #include "quota.h" | 72 | #include "quota.h" |
74 | #include "refcounttree.h" | 73 | #include "refcounttree.h" |
@@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL; | |||
90 | 89 | ||
91 | MODULE_AUTHOR("Oracle"); | 90 | MODULE_AUTHOR("Oracle"); |
92 | MODULE_LICENSE("GPL"); | 91 | MODULE_LICENSE("GPL"); |
92 | MODULE_DESCRIPTION("OCFS2 cluster file system"); | ||
93 | 93 | ||
94 | struct mount_options | 94 | struct mount_options |
95 | { | 95 | { |
@@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void) | |||
1618 | { | 1618 | { |
1619 | int status, i; | 1619 | int status, i; |
1620 | 1620 | ||
1621 | ocfs2_print_version(); | ||
1622 | |||
1623 | for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) | 1621 | for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) |
1624 | init_waitqueue_head(&ocfs2__ioend_wq[i]); | 1622 | init_waitqueue_head(&ocfs2__ioend_wq[i]); |
1625 | 1623 | ||
@@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1947 | 1945 | ||
1948 | ocfs2_shutdown_local_alloc(osb); | 1946 | ocfs2_shutdown_local_alloc(osb); |
1949 | 1947 | ||
1950 | ocfs2_truncate_log_shutdown(osb); | ||
1951 | |||
1952 | /* This will disable recovery and flush any recovery work. */ | 1948 | /* This will disable recovery and flush any recovery work. */ |
1953 | ocfs2_recovery_exit(osb); | 1949 | ocfs2_recovery_exit(osb); |
1954 | 1950 | ||
1951 | /* | ||
1952 | * During dismount, when it recovers another node it will call | ||
1953 | * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. | ||
1954 | */ | ||
1955 | ocfs2_truncate_log_shutdown(osb); | ||
1956 | |||
1955 | ocfs2_journal_shutdown(osb); | 1957 | ocfs2_journal_shutdown(osb); |
1956 | 1958 | ||
1957 | ocfs2_sync_blockdev(sb); | 1959 | ocfs2_sync_blockdev(sb); |
@@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2225 | if (ocfs2_clusterinfo_valid(osb)) { | 2227 | if (ocfs2_clusterinfo_valid(osb)) { |
2226 | osb->osb_stackflags = | 2228 | osb->osb_stackflags = |
2227 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; | 2229 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; |
2228 | memcpy(osb->osb_cluster_stack, | 2230 | strlcpy(osb->osb_cluster_stack, |
2229 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, | 2231 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, |
2230 | OCFS2_STACK_LABEL_LEN); | 2232 | OCFS2_STACK_LABEL_LEN + 1); |
2231 | osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; | ||
2232 | if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { | 2233 | if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { |
2233 | mlog(ML_ERROR, | 2234 | mlog(ML_ERROR, |
2234 | "couldn't mount because of an invalid " | 2235 | "couldn't mount because of an invalid " |
@@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2237 | status = -EINVAL; | 2238 | status = -EINVAL; |
2238 | goto bail; | 2239 | goto bail; |
2239 | } | 2240 | } |
2241 | strlcpy(osb->osb_cluster_name, | ||
2242 | OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, | ||
2243 | OCFS2_CLUSTER_NAME_LEN + 1); | ||
2240 | } else { | 2244 | } else { |
2241 | /* The empty string is identical with classic tools that | 2245 | /* The empty string is identical with classic tools that |
2242 | * don't know about s_cluster_info. */ | 2246 | * don't know about s_cluster_info. */ |
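Note on the super.c hunk above: the memcpy plus manual NUL termination becomes a single strlcpy() sized to OCFS2_STACK_LABEL_LEN + 1. Below is a minimal userspace sketch of the same idiom, not the OCFS2 code: strlcpy() is a kernel/BSD helper that glibc does not guarantee, so a small local equivalent is included, and the 4-byte label length is assumed for the demo. (The kernel copies from a fixed-size on-disk field; the sketch glosses over that and uses an ordinary string source.)

#include <stdio.h>
#include <string.h>

#define LABEL_LEN 4     /* stand-in for OCFS2_STACK_LABEL_LEN, assumed for the demo */

/* minimal strlcpy(): bounded copy, always NUL-terminates, returns strlen(src) */
static size_t local_strlcpy(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = len < size - 1 ? len : size - 1;

                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;
}

int main(void)
{
        const char *raw = "o2cb";
        char old_way[LABEL_LEN + 1];
        char new_way[LABEL_LEN + 1];

        /* old shape: copy the bytes, then terminate by hand */
        memcpy(old_way, raw, LABEL_LEN);
        old_way[LABEL_LEN] = '\0';

        /* new shape: one bounded call, sized to include the terminator */
        local_strlcpy(new_way, raw, LABEL_LEN + 1);

        printf("old=\"%s\" new=\"%s\"\n", old_way, new_way);
        return 0;
}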
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c deleted file mode 100644 index e2488f4128a2..000000000000 --- a/fs/ocfs2/ver.c +++ /dev/null | |||
@@ -1,43 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/kernel.h> | ||
29 | |||
30 | #include "ver.h" | ||
31 | |||
32 | #define OCFS2_BUILD_VERSION "1.5.0" | ||
33 | |||
34 | #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION | ||
35 | |||
36 | void ocfs2_print_version(void) | ||
37 | { | ||
38 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
39 | } | ||
40 | |||
41 | MODULE_DESCRIPTION(VERSION_STR); | ||
42 | |||
43 | MODULE_VERSION(OCFS2_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h deleted file mode 100644 index d7395cb91d2f..000000000000 --- a/fs/ocfs2/ver.h +++ /dev/null | |||
@@ -1,31 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_VER_H | ||
27 | #define OCFS2_VER_H | ||
28 | |||
29 | void ocfs2_print_version(void); | ||
30 | |||
31 | #endif /* OCFS2_VER_H */ | ||
diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bd2135b7f82..021e7c069b86 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c | |||
@@ -22,11 +22,80 @@ | |||
22 | 22 | ||
23 | #include <linux/errno.h> | 23 | #include <linux/errno.h> |
24 | 24 | ||
25 | EXPORT_SYMBOL(posix_acl_init); | 25 | struct posix_acl **acl_by_type(struct inode *inode, int type) |
26 | EXPORT_SYMBOL(posix_acl_alloc); | 26 | { |
27 | EXPORT_SYMBOL(posix_acl_valid); | 27 | switch (type) { |
28 | EXPORT_SYMBOL(posix_acl_equiv_mode); | 28 | case ACL_TYPE_ACCESS: |
29 | EXPORT_SYMBOL(posix_acl_from_mode); | 29 | return &inode->i_acl; |
30 | case ACL_TYPE_DEFAULT: | ||
31 | return &inode->i_default_acl; | ||
32 | default: | ||
33 | BUG(); | ||
34 | } | ||
35 | } | ||
36 | EXPORT_SYMBOL(acl_by_type); | ||
37 | |||
38 | struct posix_acl *get_cached_acl(struct inode *inode, int type) | ||
39 | { | ||
40 | struct posix_acl **p = acl_by_type(inode, type); | ||
41 | struct posix_acl *acl = ACCESS_ONCE(*p); | ||
42 | if (acl) { | ||
43 | spin_lock(&inode->i_lock); | ||
44 | acl = *p; | ||
45 | if (acl != ACL_NOT_CACHED) | ||
46 | acl = posix_acl_dup(acl); | ||
47 | spin_unlock(&inode->i_lock); | ||
48 | } | ||
49 | return acl; | ||
50 | } | ||
51 | EXPORT_SYMBOL(get_cached_acl); | ||
52 | |||
53 | struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) | ||
54 | { | ||
55 | return rcu_dereference(*acl_by_type(inode, type)); | ||
56 | } | ||
57 | EXPORT_SYMBOL(get_cached_acl_rcu); | ||
58 | |||
59 | void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
60 | { | ||
61 | struct posix_acl **p = acl_by_type(inode, type); | ||
62 | struct posix_acl *old; | ||
63 | spin_lock(&inode->i_lock); | ||
64 | old = *p; | ||
65 | rcu_assign_pointer(*p, posix_acl_dup(acl)); | ||
66 | spin_unlock(&inode->i_lock); | ||
67 | if (old != ACL_NOT_CACHED) | ||
68 | posix_acl_release(old); | ||
69 | } | ||
70 | EXPORT_SYMBOL(set_cached_acl); | ||
71 | |||
72 | void forget_cached_acl(struct inode *inode, int type) | ||
73 | { | ||
74 | struct posix_acl **p = acl_by_type(inode, type); | ||
75 | struct posix_acl *old; | ||
76 | spin_lock(&inode->i_lock); | ||
77 | old = *p; | ||
78 | *p = ACL_NOT_CACHED; | ||
79 | spin_unlock(&inode->i_lock); | ||
80 | if (old != ACL_NOT_CACHED) | ||
81 | posix_acl_release(old); | ||
82 | } | ||
83 | EXPORT_SYMBOL(forget_cached_acl); | ||
84 | |||
85 | void forget_all_cached_acls(struct inode *inode) | ||
86 | { | ||
87 | struct posix_acl *old_access, *old_default; | ||
88 | spin_lock(&inode->i_lock); | ||
89 | old_access = inode->i_acl; | ||
90 | old_default = inode->i_default_acl; | ||
91 | inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; | ||
92 | spin_unlock(&inode->i_lock); | ||
93 | if (old_access != ACL_NOT_CACHED) | ||
94 | posix_acl_release(old_access); | ||
95 | if (old_default != ACL_NOT_CACHED) | ||
96 | posix_acl_release(old_default); | ||
97 | } | ||
98 | EXPORT_SYMBOL(forget_all_cached_acls); | ||
30 | 99 | ||
31 | /* | 100 | /* |
32 | * Init a fresh posix_acl | 101 | * Init a fresh posix_acl |
@@ -37,6 +106,7 @@ posix_acl_init(struct posix_acl *acl, int count) | |||
37 | atomic_set(&acl->a_refcount, 1); | 106 | atomic_set(&acl->a_refcount, 1); |
38 | acl->a_count = count; | 107 | acl->a_count = count; |
39 | } | 108 | } |
109 | EXPORT_SYMBOL(posix_acl_init); | ||
40 | 110 | ||
41 | /* | 111 | /* |
42 | * Allocate a new ACL with the specified number of entries. | 112 | * Allocate a new ACL with the specified number of entries. |
@@ -51,6 +121,7 @@ posix_acl_alloc(int count, gfp_t flags) | |||
51 | posix_acl_init(acl, count); | 121 | posix_acl_init(acl, count); |
52 | return acl; | 122 | return acl; |
53 | } | 123 | } |
124 | EXPORT_SYMBOL(posix_acl_alloc); | ||
54 | 125 | ||
55 | /* | 126 | /* |
56 | * Clone an ACL. | 127 | * Clone an ACL. |
@@ -146,6 +217,7 @@ posix_acl_valid(const struct posix_acl *acl) | |||
146 | return 0; | 217 | return 0; |
147 | return -EINVAL; | 218 | return -EINVAL; |
148 | } | 219 | } |
220 | EXPORT_SYMBOL(posix_acl_valid); | ||
149 | 221 | ||
150 | /* | 222 | /* |
151 | * Returns 0 if the acl can be exactly represented in the traditional | 223 | * Returns 0 if the acl can be exactly represented in the traditional |
@@ -186,6 +258,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) | |||
186 | *mode_p = (*mode_p & ~S_IRWXUGO) | mode; | 258 | *mode_p = (*mode_p & ~S_IRWXUGO) | mode; |
187 | return not_equiv; | 259 | return not_equiv; |
188 | } | 260 | } |
261 | EXPORT_SYMBOL(posix_acl_equiv_mode); | ||
189 | 262 | ||
190 | /* | 263 | /* |
191 | * Create an ACL representing the file mode permission bits of an inode. | 264 | * Create an ACL representing the file mode permission bits of an inode. |
@@ -207,6 +280,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) | |||
207 | acl->a_entries[2].e_perm = (mode & S_IRWXO); | 280 | acl->a_entries[2].e_perm = (mode & S_IRWXO); |
208 | return acl; | 281 | return acl; |
209 | } | 282 | } |
283 | EXPORT_SYMBOL(posix_acl_from_mode); | ||
210 | 284 | ||
211 | /* | 285 | /* |
212 | * Return 0 if current is granted want access to the inode | 286 | * Return 0 if current is granted want access to the inode |
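The hunks above move the generic ACL caching helpers (acl_by_type, get_cached_acl, set_cached_acl, forget_cached_acl) into fs/posix_acl.c and export them. The pattern is a per-inode pointer slot guarded by i_lock, with a sentinel value meaning "not cached" and a reference count on the cached object. The following is a simplified, single-threaded userspace sketch of that pattern with invented names: a pthread mutex stands in for inode->i_lock, a plain int for the atomic refcount, and the kernel's RCU lookup path (get_cached_acl_rcu) is omitted.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct acl {
        int refcount;
        int mode;
};

#define ACL_NOT_CACHED ((struct acl *)-1L)

struct fake_inode {
        pthread_mutex_t lock;   /* stands in for inode->i_lock */
        struct acl *acl;        /* cached pointer, or ACL_NOT_CACHED */
};

static struct acl *acl_get(struct acl *acl)
{
        acl->refcount++;        /* the kernel uses an atomic refcount */
        return acl;
}

static void acl_put(struct acl *acl)
{
        if (--acl->refcount == 0)
                free(acl);
}

static struct acl *get_cached(struct fake_inode *inode)
{
        struct acl *acl = NULL;

        pthread_mutex_lock(&inode->lock);
        if (inode->acl != ACL_NOT_CACHED)
                acl = acl_get(inode->acl);
        pthread_mutex_unlock(&inode->lock);
        return acl;             /* caller owns a reference, or gets NULL */
}

static void set_cached(struct fake_inode *inode, struct acl *acl)
{
        struct acl *old;

        pthread_mutex_lock(&inode->lock);
        old = inode->acl;
        inode->acl = acl_get(acl);
        pthread_mutex_unlock(&inode->lock);
        if (old != ACL_NOT_CACHED)
                acl_put(old);   /* release the cache's reference to the old ACL */
}

int main(void)
{
        struct fake_inode inode = { PTHREAD_MUTEX_INITIALIZER, ACL_NOT_CACHED };
        struct acl *acl = malloc(sizeof(*acl));
        struct acl *hit;

        if (!acl)
                return 1;
        acl->refcount = 1;
        acl->mode = 0644;
        set_cached(&inode, acl);
        acl_put(acl);                   /* drop the creator's reference */

        hit = get_cached(&inode);
        printf("cached mode: %o\n", (unsigned)hit->mode);
        acl_put(hit);
        return 0;
}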
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a77d2b299199..24270eceddbf 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
26 | unsigned long committed; | 26 | unsigned long committed; |
27 | struct vmalloc_info vmi; | 27 | struct vmalloc_info vmi; |
28 | long cached; | 28 | long cached; |
29 | long available; | ||
30 | unsigned long pagecache; | ||
31 | unsigned long wmark_low = 0; | ||
29 | unsigned long pages[NR_LRU_LISTS]; | 32 | unsigned long pages[NR_LRU_LISTS]; |
33 | struct zone *zone; | ||
30 | int lru; | 34 | int lru; |
31 | 35 | ||
32 | /* | 36 | /* |
@@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
47 | for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) | 51 | for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) |
48 | pages[lru] = global_page_state(NR_LRU_BASE + lru); | 52 | pages[lru] = global_page_state(NR_LRU_BASE + lru); |
49 | 53 | ||
54 | for_each_zone(zone) | ||
55 | wmark_low += zone->watermark[WMARK_LOW]; | ||
56 | |||
57 | /* | ||
58 | * Estimate the amount of memory available for userspace allocations, | ||
59 | * without causing swapping. | ||
60 | * | ||
61 | * Free memory cannot be taken below the low watermark, before the | ||
62 | * system starts swapping. | ||
63 | */ | ||
64 | available = i.freeram - wmark_low; | ||
65 | |||
66 | /* | ||
67 | * Not all the page cache can be freed, otherwise the system will | ||
68 | * start swapping. Assume at least half of the page cache, or the | ||
69 | * low watermark worth of cache, needs to stay. | ||
70 | */ | ||
71 | pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; | ||
72 | pagecache -= min(pagecache / 2, wmark_low); | ||
73 | available += pagecache; | ||
74 | |||
75 | /* | ||
76 | * Part of the reclaimable slab consists of items that are in use, | ||
77 | * and cannot be freed. Cap this estimate at the low watermark. | ||
78 | */ | ||
79 | available += global_page_state(NR_SLAB_RECLAIMABLE) - | ||
80 | min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); | ||
81 | |||
82 | if (available < 0) | ||
83 | available = 0; | ||
84 | |||
50 | /* | 85 | /* |
51 | * Tagged format, for easy grepping and expansion. | 86 | * Tagged format, for easy grepping and expansion. |
52 | */ | 87 | */ |
53 | seq_printf(m, | 88 | seq_printf(m, |
54 | "MemTotal: %8lu kB\n" | 89 | "MemTotal: %8lu kB\n" |
55 | "MemFree: %8lu kB\n" | 90 | "MemFree: %8lu kB\n" |
91 | "MemAvailable: %8lu kB\n" | ||
56 | "Buffers: %8lu kB\n" | 92 | "Buffers: %8lu kB\n" |
57 | "Cached: %8lu kB\n" | 93 | "Cached: %8lu kB\n" |
58 | "SwapCached: %8lu kB\n" | 94 | "SwapCached: %8lu kB\n" |
@@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
105 | , | 141 | , |
106 | K(i.totalram), | 142 | K(i.totalram), |
107 | K(i.freeram), | 143 | K(i.freeram), |
144 | K(available), | ||
108 | K(i.bufferram), | 145 | K(i.bufferram), |
109 | K(cached), | 146 | K(cached), |
110 | K(total_swapcache_pages()), | 147 | K(total_swapcache_pages()), |
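The new MemAvailable field is exactly the arithmetic spelled out in the comments above: free memory above the low watermark, plus the portion of the file LRU and of reclaimable slab that can plausibly be dropped without pushing the system into swap. The userspace sketch below re-derives the same estimate from /proc/meminfo. The low watermark is a hard-coded assumption here (the kernel sums zone->watermark[WMARK_LOW] across zones), so treat the result as an approximation; on a kernel carrying this patch you can compare it against the MemAvailable line itself.

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/meminfo", "r");
        char line[128], key[64];
        unsigned long val;
        long memfree = 0, active_file = 0, inactive_file = 0, sreclaimable = 0;
        long wmark_low = 65536;   /* assumed low watermark total, in kB (~64 MB) */
        long available, pagecache, slab_keep;

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                if (sscanf(line, "%63[^:]: %lu", key, &val) != 2)
                        continue;
                if (!strcmp(key, "MemFree"))
                        memfree = (long)val;
                else if (!strcmp(key, "Active(file)"))
                        active_file = (long)val;
                else if (!strcmp(key, "Inactive(file)"))
                        inactive_file = (long)val;
                else if (!strcmp(key, "SReclaimable"))
                        sreclaimable = (long)val;
        }
        fclose(f);

        /* free pages above the low watermark */
        available = memfree - wmark_low;

        /* at least half the file page cache, or wmark_low of it, must stay */
        pagecache = active_file + inactive_file;
        pagecache -= (pagecache / 2 < wmark_low) ? pagecache / 2 : wmark_low;
        available += pagecache;

        /* same rule for reclaimable slab */
        slab_keep = (sreclaimable / 2 < wmark_low) ? sreclaimable / 2 : wmark_low;
        available += sreclaimable - slab_keep;

        if (available < 0)
                available = 0;
        printf("estimated MemAvailable: %ld kB\n", available);
        return 0;
}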
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 39d14659a8d3..6a3e2c420180 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c | |||
@@ -275,4 +275,4 @@ int __init init_ramfs_fs(void) | |||
275 | 275 | ||
276 | return err; | 276 | return err; |
277 | } | 277 | } |
278 | module_init(init_ramfs_fs) | 278 | fs_initcall(init_ramfs_fs); |
diff --git a/fs/read_write.c b/fs/read_write.c index 58e440df1bc6..1193ffd03565 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
@@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, | |||
901 | io_fn_t fn; | 901 | io_fn_t fn; |
902 | iov_fn_t fnv; | 902 | iov_fn_t fnv; |
903 | 903 | ||
904 | ret = -EFAULT; | ||
905 | if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) | ||
906 | goto out; | ||
907 | |||
908 | ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, | 904 | ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, |
909 | UIO_FASTIOV, iovstack, &iov); | 905 | UIO_FASTIOV, iovstack, &iov); |
910 | if (ret <= 0) | 906 | if (ret <= 0) |
diff --git a/fs/super.c b/fs/super.c index e5f6c2cfac38..cecd780e0f44 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
166 | if (!s) | 166 | if (!s) |
167 | return NULL; | 167 | return NULL; |
168 | 168 | ||
169 | INIT_LIST_HEAD(&s->s_mounts); | ||
170 | |||
169 | if (security_sb_alloc(s)) | 171 | if (security_sb_alloc(s)) |
170 | goto fail; | 172 | goto fail; |
171 | 173 | ||
@@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
188 | if (list_lru_init(&s->s_inode_lru)) | 190 | if (list_lru_init(&s->s_inode_lru)) |
189 | goto fail; | 191 | goto fail; |
190 | 192 | ||
191 | INIT_LIST_HEAD(&s->s_mounts); | ||
192 | init_rwsem(&s->s_umount); | 193 | init_rwsem(&s->s_umount); |
193 | lockdep_set_class(&s->s_umount, &type->s_umount_key); | 194 | lockdep_set_class(&s->s_umount, &type->s_umount_key); |
194 | /* | 195 | /* |
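The fs/super.c hunk only moves INIT_LIST_HEAD(&s->s_mounts) above the first failure goto: the error path hands the half-constructed superblock to teardown code that inspects s_mounts, so the list head must already be valid when setup bails out early. The generic userspace sketch below (invented names, not the super.c code) shows the shape of that rule: initialize anything the destructor looks at before the first exit that invokes the destructor.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct list_node {
        struct list_node *next, *prev;
};

static void list_init(struct list_node *h)        { h->next = h->prev = h; }
static int  list_empty(const struct list_node *h) { return h->next == h; }

struct toy_super {
        struct list_node mounts;        /* stands in for sb->s_mounts */
        char *label;
};

/* teardown used on both the success and the failure path */
static void toy_destroy(struct toy_super *s)
{
        /*
         * A zeroed-but-never-initialized head (next == NULL) would look
         * "populated" here and trip the sanity check, which is why the
         * initialization has to happen before the first failure exit.
         */
        if (!list_empty(&s->mounts))
                fprintf(stderr, "warning: mounts list not empty\n");
        free(s->label);
        free(s);
}

static struct toy_super *toy_alloc(int fail_early)
{
        struct toy_super *s = calloc(1, sizeof(*s));

        if (!s)
                return NULL;
        list_init(&s->mounts);          /* before any exit that calls toy_destroy() */
        s->label = strdup("example");
        if (!s->label || fail_early) {
                toy_destroy(s);
                return NULL;
        }
        return s;
}

int main(void)
{
        struct toy_super *s = toy_alloc(0);

        toy_alloc(1);                   /* exercise the failure path */
        if (s)
                toy_destroy(s);
        return 0;
}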
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index f1f07d31a3af..2fae55def608 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #define _LINUX_BOOTMEM_H | 5 | #define _LINUX_BOOTMEM_H |
6 | 6 | ||
7 | #include <linux/mmzone.h> | 7 | #include <linux/mmzone.h> |
8 | #include <linux/mm_types.h> | ||
8 | #include <asm/dma.h> | 9 | #include <asm/dma.h> |
9 | 10 | ||
10 | /* | 11 | /* |
@@ -52,7 +53,6 @@ extern void free_bootmem_node(pg_data_t *pgdat, | |||
52 | unsigned long size); | 53 | unsigned long size); |
53 | extern void free_bootmem(unsigned long physaddr, unsigned long size); | 54 | extern void free_bootmem(unsigned long physaddr, unsigned long size); |
54 | extern void free_bootmem_late(unsigned long physaddr, unsigned long size); | 55 | extern void free_bootmem_late(unsigned long physaddr, unsigned long size); |
55 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | ||
56 | 56 | ||
57 | /* | 57 | /* |
58 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, | 58 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, |
@@ -142,6 +142,157 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, | |||
142 | #define alloc_bootmem_low_pages_node(pgdat, x) \ | 142 | #define alloc_bootmem_low_pages_node(pgdat, x) \ |
143 | __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) | 143 | __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) |
144 | 144 | ||
145 | |||
146 | #if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) | ||
147 | |||
148 | /* FIXME: use MEMBLOCK_ALLOC_* variants here */ | ||
149 | #define BOOTMEM_ALLOC_ACCESSIBLE 0 | ||
150 | #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) | ||
151 | |||
152 | /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ | ||
153 | void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, | ||
154 | phys_addr_t align, phys_addr_t min_addr, | ||
155 | phys_addr_t max_addr, int nid); | ||
156 | void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align, | ||
157 | phys_addr_t min_addr, phys_addr_t max_addr, int nid); | ||
158 | void __memblock_free_early(phys_addr_t base, phys_addr_t size); | ||
159 | void __memblock_free_late(phys_addr_t base, phys_addr_t size); | ||
160 | |||
161 | static inline void * __init memblock_virt_alloc( | ||
162 | phys_addr_t size, phys_addr_t align) | ||
163 | { | ||
164 | return memblock_virt_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT, | ||
165 | BOOTMEM_ALLOC_ACCESSIBLE, | ||
166 | NUMA_NO_NODE); | ||
167 | } | ||
168 | |||
169 | static inline void * __init memblock_virt_alloc_nopanic( | ||
170 | phys_addr_t size, phys_addr_t align) | ||
171 | { | ||
172 | return memblock_virt_alloc_try_nid_nopanic(size, align, | ||
173 | BOOTMEM_LOW_LIMIT, | ||
174 | BOOTMEM_ALLOC_ACCESSIBLE, | ||
175 | NUMA_NO_NODE); | ||
176 | } | ||
177 | |||
178 | static inline void * __init memblock_virt_alloc_from_nopanic( | ||
179 | phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) | ||
180 | { | ||
181 | return memblock_virt_alloc_try_nid_nopanic(size, align, min_addr, | ||
182 | BOOTMEM_ALLOC_ACCESSIBLE, | ||
183 | NUMA_NO_NODE); | ||
184 | } | ||
185 | |||
186 | static inline void * __init memblock_virt_alloc_node( | ||
187 | phys_addr_t size, int nid) | ||
188 | { | ||
189 | return memblock_virt_alloc_try_nid(size, 0, BOOTMEM_LOW_LIMIT, | ||
190 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
191 | } | ||
192 | |||
193 | static inline void * __init memblock_virt_alloc_node_nopanic( | ||
194 | phys_addr_t size, int nid) | ||
195 | { | ||
196 | return memblock_virt_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT, | ||
197 | BOOTMEM_ALLOC_ACCESSIBLE, | ||
198 | nid); | ||
199 | } | ||
200 | |||
201 | static inline void __init memblock_free_early( | ||
202 | phys_addr_t base, phys_addr_t size) | ||
203 | { | ||
204 | __memblock_free_early(base, size); | ||
205 | } | ||
206 | |||
207 | static inline void __init memblock_free_early_nid( | ||
208 | phys_addr_t base, phys_addr_t size, int nid) | ||
209 | { | ||
210 | __memblock_free_early(base, size); | ||
211 | } | ||
212 | |||
213 | static inline void __init memblock_free_late( | ||
214 | phys_addr_t base, phys_addr_t size) | ||
215 | { | ||
216 | __memblock_free_late(base, size); | ||
217 | } | ||
218 | |||
219 | #else | ||
220 | |||
221 | #define BOOTMEM_ALLOC_ACCESSIBLE 0 | ||
222 | |||
223 | |||
224 | /* Fall back to all the existing bootmem APIs */ | ||
225 | static inline void * __init memblock_virt_alloc( | ||
226 | phys_addr_t size, phys_addr_t align) | ||
227 | { | ||
228 | if (!align) | ||
229 | align = SMP_CACHE_BYTES; | ||
230 | return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); | ||
231 | } | ||
232 | |||
233 | static inline void * __init memblock_virt_alloc_nopanic( | ||
234 | phys_addr_t size, phys_addr_t align) | ||
235 | { | ||
236 | if (!align) | ||
237 | align = SMP_CACHE_BYTES; | ||
238 | return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT); | ||
239 | } | ||
240 | |||
241 | static inline void * __init memblock_virt_alloc_from_nopanic( | ||
242 | phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) | ||
243 | { | ||
244 | return __alloc_bootmem_nopanic(size, align, min_addr); | ||
245 | } | ||
246 | |||
247 | static inline void * __init memblock_virt_alloc_node( | ||
248 | phys_addr_t size, int nid) | ||
249 | { | ||
250 | return __alloc_bootmem_node(NODE_DATA(nid), size, SMP_CACHE_BYTES, | ||
251 | BOOTMEM_LOW_LIMIT); | ||
252 | } | ||
253 | |||
254 | static inline void * __init memblock_virt_alloc_node_nopanic( | ||
255 | phys_addr_t size, int nid) | ||
256 | { | ||
257 | return __alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | ||
258 | SMP_CACHE_BYTES, | ||
259 | BOOTMEM_LOW_LIMIT); | ||
260 | } | ||
261 | |||
262 | static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size, | ||
263 | phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) | ||
264 | { | ||
265 | return __alloc_bootmem_node_high(NODE_DATA(nid), size, align, | ||
266 | min_addr); | ||
267 | } | ||
268 | |||
269 | static inline void * __init memblock_virt_alloc_try_nid_nopanic( | ||
270 | phys_addr_t size, phys_addr_t align, | ||
271 | phys_addr_t min_addr, phys_addr_t max_addr, int nid) | ||
272 | { | ||
273 | return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, | ||
274 | min_addr, max_addr); | ||
275 | } | ||
276 | |||
277 | static inline void __init memblock_free_early( | ||
278 | phys_addr_t base, phys_addr_t size) | ||
279 | { | ||
280 | free_bootmem(base, size); | ||
281 | } | ||
282 | |||
283 | static inline void __init memblock_free_early_nid( | ||
284 | phys_addr_t base, phys_addr_t size, int nid) | ||
285 | { | ||
286 | free_bootmem_node(NODE_DATA(nid), base, size); | ||
287 | } | ||
288 | |||
289 | static inline void __init memblock_free_late( | ||
290 | phys_addr_t base, phys_addr_t size) | ||
291 | { | ||
292 | free_bootmem_late(base, size); | ||
293 | } | ||
294 | #endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */ | ||
295 | |||
145 | #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP | 296 | #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP |
146 | extern void *alloc_remap(int nid, unsigned long size); | 297 | extern void *alloc_remap(int nid, unsigned long size); |
147 | #else | 298 | #else |
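The bootmem.h block above gives one memblock_virt_alloc_*() naming scheme two backends: real memblock-based allocators when CONFIG_HAVE_MEMBLOCK and CONFIG_NO_BOOTMEM are set, and thin static-inline wrappers around the legacy bootmem calls otherwise, with a default alignment substituted when callers pass 0. The standalone sketch below mimics only that selection-and-default pattern, using an invented feature macro and malloc-backed stubs; it is not the kernel allocator.

#include <stdio.h>
#include <stdlib.h>

#define HAVE_NEW_BACKEND 1      /* invented stand-in for the CONFIG_* test above */

#if HAVE_NEW_BACKEND
static void *backend_alloc(size_t size, size_t align)
{
        printf("memblock-style backend: %zu bytes, align %zu\n", size, align);
        return aligned_alloc(align, size);
}
#else
static void *backend_alloc(size_t size, size_t align)
{
        /* legacy fallback: same name, different implementation */
        printf("legacy backend: %zu bytes (align %zu requested)\n", size, align);
        return malloc(size);
}
#endif

/* one public helper, usable by callers regardless of which backend is built */
static void *virt_alloc(size_t size, size_t align)
{
        if (!align)
                align = sizeof(void *); /* stand-in for the SMP_CACHE_BYTES default */
        return backend_alloc(size, align);
}

int main(void)
{
        void *p = virt_alloc(4096, 0);

        free(p);
        return 0;
}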
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 091d72e70d8a..7e1c76e3cd68 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order) | |||
62 | return zone->compact_considered < defer_limit; | 62 | return zone->compact_considered < defer_limit; |
63 | } | 63 | } |
64 | 64 | ||
65 | /* | ||
66 | * Update defer tracking counters after successful compaction of given order, | ||
67 | * which means an allocation either succeeded (alloc_success == true) or is | ||
68 | * expected to succeed. | ||
69 | */ | ||
70 | static inline void compaction_defer_reset(struct zone *zone, int order, | ||
71 | bool alloc_success) | ||
72 | { | ||
73 | if (alloc_success) { | ||
74 | zone->compact_considered = 0; | ||
75 | zone->compact_defer_shift = 0; | ||
76 | } | ||
77 | if (order >= zone->compact_order_failed) | ||
78 | zone->compact_order_failed = order + 1; | ||
79 | } | ||
80 | |||
65 | /* Returns true if restarting compaction after many failures */ | 81 | /* Returns true if restarting compaction after many failures */ |
66 | static inline bool compaction_restarting(struct zone *zone, int order) | 82 | static inline bool compaction_restarting(struct zone *zone, int order) |
67 | { | 83 | { |
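compaction_defer_reset() encapsulates the "compaction worked, stop deferring" bookkeeping that callers previously open-coded. The toy program below pairs it with the existing deferral check so the effect is visible: a zone that had been deferring order-4 requests stops doing so once a success at that order resets the counters. The struct carries only the three fields the helper touches, and the initial values are made up for the demo.

#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
        int compact_order_failed;
};

/* same logic as the new compaction_defer_reset() above */
static void defer_reset(struct toy_zone *zone, int order, bool alloc_success)
{
        if (alloc_success) {
                zone->compact_considered = 0;
                zone->compact_defer_shift = 0;
        }
        if (order >= zone->compact_order_failed)
                zone->compact_order_failed = order + 1;
}

/* same logic as the existing compaction_deferred() check */
static bool deferred(struct toy_zone *zone, int order)
{
        unsigned long limit = 1UL << zone->compact_defer_shift;

        if (order < zone->compact_order_failed)
                return false;
        if (++zone->compact_considered > limit)
                zone->compact_considered = limit;
        return zone->compact_considered < limit;
}

int main(void)
{
        struct toy_zone zone = { 0, 3, 2 };     /* previously failing at order 2 */

        printf("order-4 deferred before success: %d\n", deferred(&zone, 4));
        defer_reset(&zone, 4, true);
        printf("order-4 deferred after success:  %d\n", deferred(&zone, 4));
        return 0;
}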
diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index fc0e34ce038f..fe8cb610deac 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h | |||
@@ -85,6 +85,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev, | |||
85 | 85 | ||
86 | extern void debug_dma_dump_mappings(struct device *dev); | 86 | extern void debug_dma_dump_mappings(struct device *dev); |
87 | 87 | ||
88 | extern void debug_dma_assert_idle(struct page *page); | ||
89 | |||
88 | #else /* CONFIG_DMA_API_DEBUG */ | 90 | #else /* CONFIG_DMA_API_DEBUG */ |
89 | 91 | ||
90 | static inline void dma_debug_add_bus(struct bus_type *bus) | 92 | static inline void dma_debug_add_bus(struct bus_type *bus) |
@@ -183,6 +185,10 @@ static inline void debug_dma_dump_mappings(struct device *dev) | |||
183 | { | 185 | { |
184 | } | 186 | } |
185 | 187 | ||
188 | static inline void debug_dma_assert_idle(struct page *page) | ||
189 | { | ||
190 | } | ||
191 | |||
186 | #endif /* CONFIG_DMA_API_DEBUG */ | 192 | #endif /* CONFIG_DMA_API_DEBUG */ |
187 | 193 | ||
188 | #endif /* __DMA_DEBUG_H */ | 194 | #endif /* __DMA_DEBUG_H */ |
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 4b2ee8d12f5e..7d8d5e608594 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/path.h> /* struct path */ | 15 | #include <linux/path.h> /* struct path */ |
16 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
17 | #include <linux/types.h> | 17 | #include <linux/types.h> |
18 | |||
19 | #include <linux/atomic.h> | 18 | #include <linux/atomic.h> |
20 | 19 | ||
21 | /* | 20 | /* |
@@ -79,6 +78,7 @@ struct fsnotify_group; | |||
79 | struct fsnotify_event; | 78 | struct fsnotify_event; |
80 | struct fsnotify_mark; | 79 | struct fsnotify_mark; |
81 | struct fsnotify_event_private_data; | 80 | struct fsnotify_event_private_data; |
81 | struct fsnotify_fname; | ||
82 | 82 | ||
83 | /* | 83 | /* |
84 | * Each group must define these ops. The fsnotify infrastructure will call | 84 | * Each group must define these ops. The fsnotify infrastructure will call |
@@ -94,17 +94,27 @@ struct fsnotify_event_private_data; | |||
94 | * userspace messages that marks have been removed. | 94 | * userspace messages that marks have been removed. |
95 | */ | 95 | */ |
96 | struct fsnotify_ops { | 96 | struct fsnotify_ops { |
97 | bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, | ||
98 | struct fsnotify_mark *inode_mark, | ||
99 | struct fsnotify_mark *vfsmount_mark, | ||
100 | __u32 mask, void *data, int data_type); | ||
101 | int (*handle_event)(struct fsnotify_group *group, | 97 | int (*handle_event)(struct fsnotify_group *group, |
98 | struct inode *inode, | ||
102 | struct fsnotify_mark *inode_mark, | 99 | struct fsnotify_mark *inode_mark, |
103 | struct fsnotify_mark *vfsmount_mark, | 100 | struct fsnotify_mark *vfsmount_mark, |
104 | struct fsnotify_event *event); | 101 | u32 mask, void *data, int data_type, |
102 | const unsigned char *file_name); | ||
105 | void (*free_group_priv)(struct fsnotify_group *group); | 103 | void (*free_group_priv)(struct fsnotify_group *group); |
106 | void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); | 104 | void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); |
107 | void (*free_event_priv)(struct fsnotify_event_private_data *priv); | 105 | void (*free_event)(struct fsnotify_event *event); |
106 | }; | ||
107 | |||
108 | /* | ||
109 | * all of the information about the original object we want to now send to | ||
110 | * a group. If you want to carry more info from the accessing task to the | ||
111 | * listener this structure is where you need to be adding fields. | ||
112 | */ | ||
113 | struct fsnotify_event { | ||
114 | struct list_head list; | ||
115 | /* inode may ONLY be dereferenced during handle_event(). */ | ||
116 | struct inode *inode; /* either the inode the event happened to or its parent */ | ||
117 | u32 mask; /* the type of access, bitwise OR for FS_* event types */ | ||
108 | }; | 118 | }; |
109 | 119 | ||
110 | /* | 120 | /* |
@@ -148,7 +158,11 @@ struct fsnotify_group { | |||
148 | * a group */ | 158 | * a group */ |
149 | struct list_head marks_list; /* all inode marks for this group */ | 159 | struct list_head marks_list; /* all inode marks for this group */ |
150 | 160 | ||
151 | struct fasync_struct *fsn_fa; /* async notification */ | 161 | struct fasync_struct *fsn_fa; /* async notification */ |
162 | |||
163 | struct fsnotify_event overflow_event; /* Event we queue when the | ||
164 | * notification list is too | ||
165 | * full */ | ||
152 | 166 | ||
153 | /* groups can define private fields here or use the void *private */ | 167 | /* groups can define private fields here or use the void *private */ |
154 | union { | 168 | union { |
@@ -177,76 +191,10 @@ struct fsnotify_group { | |||
177 | }; | 191 | }; |
178 | }; | 192 | }; |
179 | 193 | ||
180 | /* | ||
181 | * A single event can be queued in multiple group->notification_lists. | ||
182 | * | ||
183 | * each group->notification_list will point to an event_holder which in turns points | ||
184 | * to the actual event that needs to be sent to userspace. | ||
185 | * | ||
186 | * Seemed cheaper to create a refcnt'd event and a small holder for every group | ||
187 | * than create a different event for every group | ||
188 | * | ||
189 | */ | ||
190 | struct fsnotify_event_holder { | ||
191 | struct fsnotify_event *event; | ||
192 | struct list_head event_list; | ||
193 | }; | ||
194 | |||
195 | /* | ||
196 | * Inotify needs to tack data onto an event. This struct lets us later find the | ||
197 | * correct private data of the correct group. | ||
198 | */ | ||
199 | struct fsnotify_event_private_data { | ||
200 | struct fsnotify_group *group; | ||
201 | struct list_head event_list; | ||
202 | }; | ||
203 | |||
204 | /* | ||
205 | * all of the information about the original object we want to now send to | ||
206 | * a group. If you want to carry more info from the accessing task to the | ||
207 | * listener this structure is where you need to be adding fields. | ||
208 | */ | ||
209 | struct fsnotify_event { | ||
210 | /* | ||
211 | * If we create an event we are also likely going to need a holder | ||
212 | * to link to a group. So embed one holder in the event. Means only | ||
213 | * one allocation for the common case where we only have one group | ||
214 | */ | ||
215 | struct fsnotify_event_holder holder; | ||
216 | spinlock_t lock; /* protection for the associated event_holder and private_list */ | ||
217 | /* to_tell may ONLY be dereferenced during handle_event(). */ | ||
218 | struct inode *to_tell; /* either the inode the event happened to or its parent */ | ||
219 | /* | ||
220 | * depending on the event type we should have either a path or inode | ||
221 | * We hold a reference on path, but NOT on inode. Since we have the ref on | ||
222 | * the path, it may be dereferenced at any point during this object's | ||
223 | * lifetime. That reference is dropped when this object's refcnt hits | ||
224 | * 0. If this event contains an inode instead of a path, the inode may | ||
225 | * ONLY be used during handle_event(). | ||
226 | */ | ||
227 | union { | ||
228 | struct path path; | ||
229 | struct inode *inode; | ||
230 | }; | ||
231 | /* when calling fsnotify tell it if the data is a path or inode */ | 194 | /* when calling fsnotify tell it if the data is a path or inode */ |
232 | #define FSNOTIFY_EVENT_NONE 0 | 195 | #define FSNOTIFY_EVENT_NONE 0 |
233 | #define FSNOTIFY_EVENT_PATH 1 | 196 | #define FSNOTIFY_EVENT_PATH 1 |
234 | #define FSNOTIFY_EVENT_INODE 2 | 197 | #define FSNOTIFY_EVENT_INODE 2 |
235 | int data_type; /* which of the above union we have */ | ||
236 | atomic_t refcnt; /* how many groups still are using/need to send this event */ | ||
237 | __u32 mask; /* the type of access, bitwise OR for FS_* event types */ | ||
238 | |||
239 | u32 sync_cookie; /* used to corrolate events, namely inotify mv events */ | ||
240 | const unsigned char *file_name; | ||
241 | size_t name_len; | ||
242 | struct pid *tgid; | ||
243 | |||
244 | #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS | ||
245 | __u32 response; /* userspace answer to question */ | ||
246 | #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ | ||
247 | |||
248 | struct list_head private_data_list; /* groups can store private data here */ | ||
249 | }; | ||
250 | 198 | ||
251 | /* | 199 | /* |
252 | * Inode specific fields in an fsnotify_mark | 200 | * Inode specific fields in an fsnotify_mark |
@@ -370,17 +318,12 @@ extern void fsnotify_put_group(struct fsnotify_group *group); | |||
370 | extern void fsnotify_destroy_group(struct fsnotify_group *group); | 318 | extern void fsnotify_destroy_group(struct fsnotify_group *group); |
371 | /* fasync handler function */ | 319 | /* fasync handler function */ |
372 | extern int fsnotify_fasync(int fd, struct file *file, int on); | 320 | extern int fsnotify_fasync(int fd, struct file *file, int on); |
373 | /* take a reference to an event */ | 321 | /* Free event from memory */ |
374 | extern void fsnotify_get_event(struct fsnotify_event *event); | 322 | extern void fsnotify_destroy_event(struct fsnotify_group *group, |
375 | extern void fsnotify_put_event(struct fsnotify_event *event); | 323 | struct fsnotify_event *event); |
376 | /* find private data previously attached to an event and unlink it */ | ||
377 | extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, | ||
378 | struct fsnotify_event *event); | ||
379 | |||
380 | /* attach the event to the group notification queue */ | 324 | /* attach the event to the group notification queue */ |
381 | extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, | 325 | extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, |
382 | struct fsnotify_event *event, | 326 | struct fsnotify_event *event, |
383 | struct fsnotify_event_private_data *priv, | ||
384 | struct fsnotify_event *(*merge)(struct list_head *, | 327 | struct fsnotify_event *(*merge)(struct list_head *, |
385 | struct fsnotify_event *)); | 328 | struct fsnotify_event *)); |
386 | /* true if the group notification queue is empty */ | 329 | /* true if the group notification queue is empty */ |
@@ -430,15 +373,8 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark); | |||
430 | extern void fsnotify_unmount_inodes(struct list_head *list); | 373 | extern void fsnotify_unmount_inodes(struct list_head *list); |
431 | 374 | ||
432 | /* put here because inotify does some weird stuff when destroying watches */ | 375 | /* put here because inotify does some weird stuff when destroying watches */ |
433 | extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, | 376 | extern void fsnotify_init_event(struct fsnotify_event *event, |
434 | void *data, int data_is, | 377 | struct inode *to_tell, u32 mask); |
435 | const unsigned char *name, | ||
436 | u32 cookie, gfp_t gfp); | ||
437 | |||
438 | /* fanotify likes to change events after they are on lists... */ | ||
439 | extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event); | ||
440 | extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, | ||
441 | struct fsnotify_event *new_event); | ||
442 | 378 | ||
443 | #else | 379 | #else |
444 | 380 | ||
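The fsnotify_ops change above folds the old should_send_event()/handle_event() pair into a single handle_event() that receives the mask, data and file name directly, while struct fsnotify_event shrinks to list/inode/mask and each backend builds whatever richer event it needs. The sketch below shows only the shape of that interface change, with invented types and a made-up mask bit; it is not the fsnotify API.

#include <stdio.h>

#define TOY_EVENT_INTERESTING 0x1       /* made-up mask bit for the demo */

/* slimmed-down event: just what every backend needs, like the new struct above */
struct toy_event {
        unsigned int mask;
        const char *name;
};

struct toy_ops {
        /* single callback: gets raw mask + name, decides and handles in one place */
        int (*handle_event)(unsigned int mask, const char *name);
};

static int print_handler(unsigned int mask, const char *name)
{
        struct toy_event ev = { mask, name };

        if (!(ev.mask & TOY_EVENT_INTERESTING))
                return 0;       /* the old should_send_event() check, now inlined */
        printf("event 0x%x on %s\n", ev.mask, ev.name);
        return 1;
}

int main(void)
{
        struct toy_ops ops = { print_handler };

        ops.handle_event(TOY_EVENT_INTERESTING, "file.txt");
        ops.handle_event(0x2, "ignored.txt");
        return 0;
}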
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 91672e2deec3..db512014e061 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -157,6 +157,26 @@ static inline int hpage_nr_pages(struct page *page) | |||
157 | return HPAGE_PMD_NR; | 157 | return HPAGE_PMD_NR; |
158 | return 1; | 158 | return 1; |
159 | } | 159 | } |
160 | /* | ||
161 | * compound_trans_head() should be used instead of compound_head(), | ||
162 | * whenever the "page" passed as parameter could be the tail of a | ||
163 | * transparent hugepage that could be undergoing a | ||
164 | * __split_huge_page_refcount(). The page structure layout often | ||
165 | * changes across releases and it makes extensive use of unions. So if | ||
166 | * the page structure layout will change in a way that | ||
167 | * page->first_page gets clobbered by __split_huge_page_refcount, the | ||
168 | * implementation making use of smp_rmb() will be required. | ||
169 | * | ||
170 | * Currently we define compound_trans_head as compound_head, because | ||
171 | * page->private is in the same union with page->first_page, and | ||
172 | * page->private isn't clobbered. However this also means we're | ||
173 | * currently leaving dirt into the page->private field of anonymous | ||
174 | * pages resulting from a THP split, instead of setting page->private | ||
175 | * to zero like for every other page that has PG_private not set. But | ||
176 | * anonymous pages don't use page->private so this is not a problem. | ||
177 | */ | ||
178 | #if 0 | ||
179 | /* This will be needed if page->private will be clobbered in split_huge_page */ | ||
160 | static inline struct page *compound_trans_head(struct page *page) | 180 | static inline struct page *compound_trans_head(struct page *page) |
161 | { | 181 | { |
162 | if (PageTail(page)) { | 182 | if (PageTail(page)) { |
@@ -174,6 +194,9 @@ static inline struct page *compound_trans_head(struct page *page) | |||
174 | } | 194 | } |
175 | return page; | 195 | return page; |
176 | } | 196 | } |
197 | #else | ||
198 | #define compound_trans_head(page) compound_head(page) | ||
199 | #endif | ||
177 | 200 | ||
178 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 201 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, |
179 | unsigned long addr, pmd_t pmd, pmd_t *pmdp); | 202 | unsigned long addr, pmd_t pmd, pmd_t *pmdp); |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bd7e98752222..d01cc972a1d9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -31,7 +31,6 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); | |||
31 | void hugepage_put_subpool(struct hugepage_subpool *spool); | 31 | void hugepage_put_subpool(struct hugepage_subpool *spool); |
32 | 32 | ||
33 | int PageHuge(struct page *page); | 33 | int PageHuge(struct page *page); |
34 | int PageHeadHuge(struct page *page_head); | ||
35 | 34 | ||
36 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma); | 35 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma); |
37 | int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); | 36 | int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); |
@@ -104,11 +103,6 @@ static inline int PageHuge(struct page *page) | |||
104 | return 0; | 103 | return 0; |
105 | } | 104 | } |
106 | 105 | ||
107 | static inline int PageHeadHuge(struct page *page_head) | ||
108 | { | ||
109 | return 0; | ||
110 | } | ||
111 | |||
112 | static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | 106 | static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) |
113 | { | 107 | { |
114 | } | 108 | } |
@@ -360,6 +354,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, | |||
360 | 354 | ||
361 | static inline struct hstate *page_hstate(struct page *page) | 355 | static inline struct hstate *page_hstate(struct page *page) |
362 | { | 356 | { |
357 | VM_BUG_ON(!PageHuge(page)); | ||
363 | return size_to_hstate(PAGE_SIZE << compound_order(page)); | 358 | return size_to_hstate(PAGE_SIZE << compound_order(page)); |
364 | } | 359 | } |
365 | 360 | ||
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f0e52383a001..1516a8ff8f92 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
@@ -41,6 +41,7 @@ extern struct fs_struct init_fs; | |||
41 | 41 | ||
42 | #define INIT_SIGNALS(sig) { \ | 42 | #define INIT_SIGNALS(sig) { \ |
43 | .nr_threads = 1, \ | 43 | .nr_threads = 1, \ |
44 | .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ | ||
44 | .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ | 45 | .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ |
45 | .shared_pending = { \ | 46 | .shared_pending = { \ |
46 | .list = LIST_HEAD_INIT(sig.shared_pending.list), \ | 47 | .list = LIST_HEAD_INIT(sig.shared_pending.list), \ |
@@ -222,6 +223,7 @@ extern struct task_group root_task_group; | |||
222 | [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ | 223 | [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ |
223 | }, \ | 224 | }, \ |
224 | .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ | 225 | .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ |
226 | .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ | ||
225 | INIT_IDS \ | 227 | INIT_IDS \ |
226 | INIT_PERF_EVENTS(tsk) \ | 228 | INIT_PERF_EVENTS(tsk) \ |
227 | INIT_TRACE_IRQFLAGS \ | 229 | INIT_TRACE_IRQFLAGS \ |
diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 45c9b6a17bcb..3be6bb18562d 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h | |||
@@ -73,11 +73,7 @@ static inline void set_page_stable_node(struct page *page, | |||
73 | struct page *ksm_might_need_to_copy(struct page *page, | 73 | struct page *ksm_might_need_to_copy(struct page *page, |
74 | struct vm_area_struct *vma, unsigned long address); | 74 | struct vm_area_struct *vma, unsigned long address); |
75 | 75 | ||
76 | int page_referenced_ksm(struct page *page, | 76 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); |
77 | struct mem_cgroup *memcg, unsigned long *vm_flags); | ||
78 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags); | ||
79 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
80 | struct vm_area_struct *, unsigned long, void *), void *arg); | ||
81 | void ksm_migrate_page(struct page *newpage, struct page *oldpage); | 77 | void ksm_migrate_page(struct page *newpage, struct page *oldpage); |
82 | 78 | ||
83 | #else /* !CONFIG_KSM */ | 79 | #else /* !CONFIG_KSM */ |
@@ -115,13 +111,8 @@ static inline int page_referenced_ksm(struct page *page, | |||
115 | return 0; | 111 | return 0; |
116 | } | 112 | } |
117 | 113 | ||
118 | static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | 114 | static inline int rmap_walk_ksm(struct page *page, |
119 | { | 115 | struct rmap_walk_control *rwc) |
120 | return 0; | ||
121 | } | ||
122 | |||
123 | static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*, | ||
124 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
125 | { | 116 | { |
126 | return 0; | 117 | return 0; |
127 | } | 118 | } |
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 77c60e52939d..cd0274bebd4c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -19,9 +19,13 @@ | |||
19 | 19 | ||
20 | #define INIT_MEMBLOCK_REGIONS 128 | 20 | #define INIT_MEMBLOCK_REGIONS 128 |
21 | 21 | ||
22 | /* Definition of memblock flags. */ | ||
23 | #define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */ | ||
24 | |||
22 | struct memblock_region { | 25 | struct memblock_region { |
23 | phys_addr_t base; | 26 | phys_addr_t base; |
24 | phys_addr_t size; | 27 | phys_addr_t size; |
28 | unsigned long flags; | ||
25 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 29 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
26 | int nid; | 30 | int nid; |
27 | #endif | 31 | #endif |
@@ -43,12 +47,17 @@ struct memblock { | |||
43 | 47 | ||
44 | extern struct memblock memblock; | 48 | extern struct memblock memblock; |
45 | extern int memblock_debug; | 49 | extern int memblock_debug; |
50 | #ifdef CONFIG_MOVABLE_NODE | ||
51 | /* If movable_node boot option specified */ | ||
52 | extern bool movable_node_enabled; | ||
53 | #endif /* CONFIG_MOVABLE_NODE */ | ||
46 | 54 | ||
47 | #define memblock_dbg(fmt, ...) \ | 55 | #define memblock_dbg(fmt, ...) \ |
48 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) | 56 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) |
49 | 57 | ||
50 | phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, | 58 | phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, |
51 | phys_addr_t size, phys_addr_t align, int nid); | 59 | phys_addr_t start, phys_addr_t end, |
60 | int nid); | ||
52 | phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, | 61 | phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, |
53 | phys_addr_t size, phys_addr_t align); | 62 | phys_addr_t size, phys_addr_t align); |
54 | phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); | 63 | phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); |
@@ -59,6 +68,28 @@ int memblock_remove(phys_addr_t base, phys_addr_t size); | |||
59 | int memblock_free(phys_addr_t base, phys_addr_t size); | 68 | int memblock_free(phys_addr_t base, phys_addr_t size); |
60 | int memblock_reserve(phys_addr_t base, phys_addr_t size); | 69 | int memblock_reserve(phys_addr_t base, phys_addr_t size); |
61 | void memblock_trim_memory(phys_addr_t align); | 70 | void memblock_trim_memory(phys_addr_t align); |
71 | int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); | ||
72 | int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); | ||
73 | #ifdef CONFIG_MOVABLE_NODE | ||
74 | static inline bool memblock_is_hotpluggable(struct memblock_region *m) | ||
75 | { | ||
76 | return m->flags & MEMBLOCK_HOTPLUG; | ||
77 | } | ||
78 | |||
79 | static inline bool movable_node_is_enabled(void) | ||
80 | { | ||
81 | return movable_node_enabled; | ||
82 | } | ||
83 | #else | ||
84 | static inline bool memblock_is_hotpluggable(struct memblock_region *m) | ||
85 | { | ||
86 | return false; | ||
87 | } | ||
88 | static inline bool movable_node_is_enabled(void) | ||
89 | { | ||
90 | return false; | ||
91 | } | ||
92 | #endif | ||
62 | 93 | ||
63 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 94 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
64 | int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, | 95 | int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, |
@@ -87,7 +118,7 @@ void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start, | |||
87 | /** | 118 | /** |
88 | * for_each_free_mem_range - iterate through free memblock areas | 119 | * for_each_free_mem_range - iterate through free memblock areas |
89 | * @i: u64 used as loop variable | 120 | * @i: u64 used as loop variable |
90 | * @nid: node selector, %MAX_NUMNODES for all nodes | 121 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
91 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 122 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL |
92 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 123 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL |
93 | * @p_nid: ptr to int for nid of the range, can be %NULL | 124 | * @p_nid: ptr to int for nid of the range, can be %NULL |
@@ -107,7 +138,7 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, | |||
107 | /** | 138 | /** |
108 | * for_each_free_mem_range_reverse - rev-iterate through free memblock areas | 139 | * for_each_free_mem_range_reverse - rev-iterate through free memblock areas |
109 | * @i: u64 used as loop variable | 140 | * @i: u64 used as loop variable |
110 | * @nid: node selector, %MAX_NUMNODES for all nodes | 141 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
111 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 142 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL |
112 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 143 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL |
113 | * @p_nid: ptr to int for nid of the range, can be %NULL | 144 | * @p_nid: ptr to int for nid of the range, can be %NULL |
@@ -121,8 +152,21 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, | |||
121 | i != (u64)ULLONG_MAX; \ | 152 | i != (u64)ULLONG_MAX; \ |
122 | __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) | 153 | __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) |
123 | 154 | ||
155 | static inline void memblock_set_region_flags(struct memblock_region *r, | ||
156 | unsigned long flags) | ||
157 | { | ||
158 | r->flags |= flags; | ||
159 | } | ||
160 | |||
161 | static inline void memblock_clear_region_flags(struct memblock_region *r, | ||
162 | unsigned long flags) | ||
163 | { | ||
164 | r->flags &= ~flags; | ||
165 | } | ||
166 | |||
124 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 167 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
125 | int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); | 168 | int memblock_set_node(phys_addr_t base, phys_addr_t size, |
169 | struct memblock_type *type, int nid); | ||
126 | 170 | ||
127 | static inline void memblock_set_region_node(struct memblock_region *r, int nid) | 171 | static inline void memblock_set_region_node(struct memblock_region *r, int nid) |
128 | { | 172 | { |
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 9fe426b30a41..5f1ea756aace 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h | |||
@@ -211,20 +211,8 @@ static inline void mpol_get(struct mempolicy *pol) | |||
211 | { | 211 | { |
212 | } | 212 | } |
213 | 213 | ||
214 | static inline struct mempolicy *mpol_dup(struct mempolicy *old) | ||
215 | { | ||
216 | return NULL; | ||
217 | } | ||
218 | |||
219 | struct shared_policy {}; | 214 | struct shared_policy {}; |
220 | 215 | ||
221 | static inline int mpol_set_shared_policy(struct shared_policy *info, | ||
222 | struct vm_area_struct *vma, | ||
223 | struct mempolicy *new) | ||
224 | { | ||
225 | return -EINVAL; | ||
226 | } | ||
227 | |||
228 | static inline void mpol_shared_policy_init(struct shared_policy *sp, | 216 | static inline void mpol_shared_policy_init(struct shared_policy *sp, |
229 | struct mempolicy *mpol) | 217 | struct mempolicy *mpol) |
230 | { | 218 | { |
@@ -234,12 +222,6 @@ static inline void mpol_free_shared_policy(struct shared_policy *p) | |||
234 | { | 222 | { |
235 | } | 223 | } |
236 | 224 | ||
237 | static inline struct mempolicy * | ||
238 | mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | ||
239 | { | ||
240 | return NULL; | ||
241 | } | ||
242 | |||
243 | #define vma_policy(vma) NULL | 225 | #define vma_policy(vma) NULL |
244 | 226 | ||
245 | static inline int | 227 | static inline int |
@@ -266,10 +248,6 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | |||
266 | { | 248 | { |
267 | } | 249 | } |
268 | 250 | ||
269 | static inline void mpol_fix_fork_child_flag(struct task_struct *p) | ||
270 | { | ||
271 | } | ||
272 | |||
273 | static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, | 251 | static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, |
274 | unsigned long addr, gfp_t gfp_flags, | 252 | unsigned long addr, gfp_t gfp_flags, |
275 | struct mempolicy **mpol, nodemask_t **nodemask) | 253 | struct mempolicy **mpol, nodemask_t **nodemask) |
@@ -284,12 +262,6 @@ static inline bool init_nodemask_of_mempolicy(nodemask_t *m) | |||
284 | return false; | 262 | return false; |
285 | } | 263 | } |
286 | 264 | ||
287 | static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, | ||
288 | const nodemask_t *mask) | ||
289 | { | ||
290 | return false; | ||
291 | } | ||
292 | |||
293 | static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, | 265 | static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
294 | const nodemask_t *to, int flags) | 266 | const nodemask_t *to, int flags) |
295 | { | 267 | { |
@@ -307,10 +279,6 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol) | |||
307 | } | 279 | } |
308 | #endif | 280 | #endif |
309 | 281 | ||
310 | static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | ||
311 | { | ||
312 | } | ||
313 | |||
314 | static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, | 282 | static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, |
315 | unsigned long address) | 283 | unsigned long address) |
316 | { | 284 | { |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index f015c059e159..84a31ad0b791 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -35,16 +35,12 @@ enum migrate_reason { | |||
35 | 35 | ||
36 | #ifdef CONFIG_MIGRATION | 36 | #ifdef CONFIG_MIGRATION |
37 | 37 | ||
38 | extern void putback_lru_pages(struct list_head *l); | ||
39 | extern void putback_movable_pages(struct list_head *l); | 38 | extern void putback_movable_pages(struct list_head *l); |
40 | extern int migrate_page(struct address_space *, | 39 | extern int migrate_page(struct address_space *, |
41 | struct page *, struct page *, enum migrate_mode); | 40 | struct page *, struct page *, enum migrate_mode); |
42 | extern int migrate_pages(struct list_head *l, new_page_t x, | 41 | extern int migrate_pages(struct list_head *l, new_page_t x, |
43 | unsigned long private, enum migrate_mode mode, int reason); | 42 | unsigned long private, enum migrate_mode mode, int reason); |
44 | 43 | ||
45 | extern int fail_migrate_page(struct address_space *, | ||
46 | struct page *, struct page *); | ||
47 | |||
48 | extern int migrate_prep(void); | 44 | extern int migrate_prep(void); |
49 | extern int migrate_prep_local(void); | 45 | extern int migrate_prep_local(void); |
50 | extern int migrate_vmas(struct mm_struct *mm, | 46 | extern int migrate_vmas(struct mm_struct *mm, |
@@ -59,7 +55,6 @@ extern int migrate_page_move_mapping(struct address_space *mapping, | |||
59 | int extra_count); | 55 | int extra_count); |
60 | #else | 56 | #else |
61 | 57 | ||
62 | static inline void putback_lru_pages(struct list_head *l) {} | ||
63 | static inline void putback_movable_pages(struct list_head *l) {} | 58 | static inline void putback_movable_pages(struct list_head *l) {} |
64 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 59 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
65 | unsigned long private, enum migrate_mode mode, int reason) | 60 | unsigned long private, enum migrate_mode mode, int reason) |
@@ -86,7 +81,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
86 | 81 | ||
87 | /* Possible settings for the migrate_page() method in address_operations */ | 82 | /* Possible settings for the migrate_page() method in address_operations */ |
88 | #define migrate_page NULL | 83 | #define migrate_page NULL |
89 | #define fail_migrate_page NULL | ||
90 | 84 | ||
91 | #endif /* CONFIG_MIGRATION */ | 85 | #endif /* CONFIG_MIGRATION */ |
92 | 86 | ||
diff --git a/include/linux/mm.h b/include/linux/mm.h index 35527173cf50..a512dd836931 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -57,6 +57,15 @@ extern int sysctl_legacy_va_layout; | |||
57 | extern unsigned long sysctl_user_reserve_kbytes; | 57 | extern unsigned long sysctl_user_reserve_kbytes; |
58 | extern unsigned long sysctl_admin_reserve_kbytes; | 58 | extern unsigned long sysctl_admin_reserve_kbytes; |
59 | 59 | ||
60 | extern int sysctl_overcommit_memory; | ||
61 | extern int sysctl_overcommit_ratio; | ||
62 | extern unsigned long sysctl_overcommit_kbytes; | ||
63 | |||
64 | extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, | ||
65 | size_t *, loff_t *); | ||
66 | extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, | ||
67 | size_t *, loff_t *); | ||
68 | |||
60 | #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) | 69 | #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) |
61 | 70 | ||
62 | /* to align the pointer to the (next) page boundary */ | 71 | /* to align the pointer to the (next) page boundary */ |
@@ -414,15 +423,44 @@ static inline int page_count(struct page *page) | |||
414 | return atomic_read(&compound_head(page)->_count); | 423 | return atomic_read(&compound_head(page)->_count); |
415 | } | 424 | } |
416 | 425 | ||
426 | #ifdef CONFIG_HUGETLB_PAGE | ||
427 | extern int PageHeadHuge(struct page *page_head); | ||
428 | #else /* CONFIG_HUGETLB_PAGE */ | ||
429 | static inline int PageHeadHuge(struct page *page_head) | ||
430 | { | ||
431 | return 0; | ||
432 | } | ||
433 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
434 | |||
435 | static inline bool __compound_tail_refcounted(struct page *page) | ||
436 | { | ||
437 | return !PageSlab(page) && !PageHeadHuge(page); | ||
438 | } | ||
439 | |||
440 | /* | ||
441 | * This takes a head page as parameter and tells if the | ||
442 | * tail page reference counting can be skipped. | ||
443 | * | ||
444 | * For this to be safe, PageSlab and PageHeadHuge must remain true on | ||
445 | * any given page where they return true here, until all tail pins | ||
446 | * have been released. | ||
447 | */ | ||
448 | static inline bool compound_tail_refcounted(struct page *page) | ||
449 | { | ||
450 | VM_BUG_ON(!PageHead(page)); | ||
451 | return __compound_tail_refcounted(page); | ||
452 | } | ||
453 | |||
417 | static inline void get_huge_page_tail(struct page *page) | 454 | static inline void get_huge_page_tail(struct page *page) |
418 | { | 455 | { |
419 | /* | 456 | /* |
420 | * __split_huge_page_refcount() cannot run | 457 | * __split_huge_page_refcount() cannot run from under us. |
421 | * from under us. | ||
422 | */ | 458 | */ |
459 | VM_BUG_ON(!PageTail(page)); | ||
423 | VM_BUG_ON(page_mapcount(page) < 0); | 460 | VM_BUG_ON(page_mapcount(page) < 0); |
424 | VM_BUG_ON(atomic_read(&page->_count) != 0); | 461 | VM_BUG_ON(atomic_read(&page->_count) != 0); |
425 | atomic_inc(&page->_mapcount); | 462 | if (compound_tail_refcounted(page->first_page)) |
463 | atomic_inc(&page->_mapcount); | ||
426 | } | 464 | } |
427 | 465 | ||
428 | extern bool __get_page_tail(struct page *page); | 466 | extern bool __get_page_tail(struct page *page); |
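The hunk above adds compound_tail_refcounted() so that get_huge_page_tail() only bumps the tail page's _mapcount when the head page actually uses tail refcounting, i.e. when it is neither a slab page nor a hugetlbfs head. Below is a minimal userspace sketch of just that decision logic; the flag bits and the fake_page struct are illustrative stand-ins, not kernel types.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's page flags. */
enum { PG_HEAD = 1 << 0, PG_TAIL = 1 << 1, PG_SLAB = 1 << 2, PG_HUGE_HEAD = 1 << 3 };

struct fake_page {
	unsigned flags;
	int mapcount;                  /* models page->_mapcount for tail pins */
	struct fake_page *first_page;  /* tail -> head link */
};

/* Tail pins are only tracked in _mapcount when the head is neither a slab
 * page nor a hugetlbfs head -- mirroring __compound_tail_refcounted(). */
static bool compound_tail_refcounted(const struct fake_page *head)
{
	assert(head->flags & PG_HEAD);
	return !(head->flags & PG_SLAB) && !(head->flags & PG_HUGE_HEAD);
}

static void get_huge_page_tail(struct fake_page *tail)
{
	assert(tail->flags & PG_TAIL);
	if (compound_tail_refcounted(tail->first_page))
		tail->mapcount++;      /* pin recorded on the tail page */
}

int main(void)
{
	struct fake_page thp_head  = { .flags = PG_HEAD };
	struct fake_page thp_tail  = { .flags = PG_TAIL, .first_page = &thp_head };
	struct fake_page huge_head = { .flags = PG_HEAD | PG_HUGE_HEAD };
	struct fake_page huge_tail = { .flags = PG_TAIL, .first_page = &huge_head };

	get_huge_page_tail(&thp_tail);   /* counted: ordinary compound page */
	get_huge_page_tail(&huge_tail);  /* skipped: hugetlbfs head */
	printf("thp tail pins=%d, hugetlb tail pins=%d\n",
	       thp_tail.mapcount, huge_tail.mapcount);
	return 0;
}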
@@ -846,11 +884,14 @@ static __always_inline void *lowmem_page_address(const struct page *page) | |||
846 | #endif | 884 | #endif |
847 | 885 | ||
848 | #if defined(WANT_PAGE_VIRTUAL) | 886 | #if defined(WANT_PAGE_VIRTUAL) |
849 | #define page_address(page) ((page)->virtual) | 887 | static inline void *page_address(const struct page *page) |
850 | #define set_page_address(page, address) \ | 888 | { |
851 | do { \ | 889 | return page->virtual; |
852 | (page)->virtual = (address); \ | 890 | } |
853 | } while(0) | 891 | static inline void set_page_address(struct page *page, void *address) |
892 | { | ||
893 | page->virtual = address; | ||
894 | } | ||
854 | #define page_address_init() do { } while(0) | 895 | #define page_address_init() do { } while(0) |
855 | #endif | 896 | #endif |
856 | 897 | ||
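The WANT_PAGE_VIRTUAL hunk above converts the page_address()/set_page_address() macros into static inline functions. The practical gain is type checking and self-documenting const-ness of the argument; this standalone sketch (not the kernel definitions) shows the shape of the conversion.

#include <stdio.h>

struct page { void *virtual; };

/* Old style: the macro accepts anything with a ->virtual member and gives
 * poor diagnostics on misuse. */
#define page_address_macro(page) ((page)->virtual)

/* New style: a static inline gets full type checking and can take a
 * const-qualified pointer, documenting that it does not modify the page. */
static inline void *page_address_inline(const struct page *page)
{
	return page->virtual;
}

static inline void set_page_address(struct page *page, void *address)
{
	page->virtual = address;
}

int main(void)
{
	struct page p;
	int x = 42;

	set_page_address(&p, &x);
	printf("%p %p\n", page_address_macro(&p), page_address_inline(&p));
	return 0;
}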
@@ -984,7 +1025,6 @@ extern void pagefault_out_of_memory(void); | |||
984 | * various contexts. | 1025 | * various contexts. |
985 | */ | 1026 | */ |
986 | #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ | 1027 | #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ |
987 | #define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ | ||
988 | 1028 | ||
989 | extern void show_free_areas(unsigned int flags); | 1029 | extern void show_free_areas(unsigned int flags); |
990 | extern bool skip_free_areas_node(unsigned int flags, int nid); | 1030 | extern bool skip_free_areas_node(unsigned int flags, int nid); |
@@ -1318,6 +1358,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a | |||
1318 | 1358 | ||
1319 | #if USE_SPLIT_PTE_PTLOCKS | 1359 | #if USE_SPLIT_PTE_PTLOCKS |
1320 | #if ALLOC_SPLIT_PTLOCKS | 1360 | #if ALLOC_SPLIT_PTLOCKS |
1361 | void __init ptlock_cache_init(void); | ||
1321 | extern bool ptlock_alloc(struct page *page); | 1362 | extern bool ptlock_alloc(struct page *page); |
1322 | extern void ptlock_free(struct page *page); | 1363 | extern void ptlock_free(struct page *page); |
1323 | 1364 | ||
@@ -1326,6 +1367,10 @@ static inline spinlock_t *ptlock_ptr(struct page *page) | |||
1326 | return page->ptl; | 1367 | return page->ptl; |
1327 | } | 1368 | } |
1328 | #else /* ALLOC_SPLIT_PTLOCKS */ | 1369 | #else /* ALLOC_SPLIT_PTLOCKS */ |
1370 | static inline void ptlock_cache_init(void) | ||
1371 | { | ||
1372 | } | ||
1373 | |||
1329 | static inline bool ptlock_alloc(struct page *page) | 1374 | static inline bool ptlock_alloc(struct page *page) |
1330 | { | 1375 | { |
1331 | return true; | 1376 | return true; |
@@ -1378,10 +1423,17 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) | |||
1378 | { | 1423 | { |
1379 | return &mm->page_table_lock; | 1424 | return &mm->page_table_lock; |
1380 | } | 1425 | } |
1426 | static inline void ptlock_cache_init(void) {} | ||
1381 | static inline bool ptlock_init(struct page *page) { return true; } | 1427 | static inline bool ptlock_init(struct page *page) { return true; } |
1382 | static inline void pte_lock_deinit(struct page *page) {} | 1428 | static inline void pte_lock_deinit(struct page *page) {} |
1383 | #endif /* USE_SPLIT_PTE_PTLOCKS */ | 1429 | #endif /* USE_SPLIT_PTE_PTLOCKS */ |
1384 | 1430 | ||
1431 | static inline void pgtable_init(void) | ||
1432 | { | ||
1433 | ptlock_cache_init(); | ||
1434 | pgtable_cache_init(); | ||
1435 | } | ||
1436 | |||
1385 | static inline bool pgtable_page_ctor(struct page *page) | 1437 | static inline bool pgtable_page_ctor(struct page *page) |
1386 | { | 1438 | { |
1387 | inc_zone_page_state(page, NR_PAGETABLE); | 1439 | inc_zone_page_state(page, NR_PAGETABLE); |
diff --git a/include/linux/mman.h b/include/linux/mman.h index 7f7f8dae4b1d..16373c8f5f57 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h | |||
@@ -9,6 +9,7 @@ | |||
9 | 9 | ||
10 | extern int sysctl_overcommit_memory; | 10 | extern int sysctl_overcommit_memory; |
11 | extern int sysctl_overcommit_ratio; | 11 | extern int sysctl_overcommit_ratio; |
12 | extern unsigned long sysctl_overcommit_kbytes; | ||
12 | extern struct percpu_counter vm_committed_as; | 13 | extern struct percpu_counter vm_committed_as; |
13 | 14 | ||
14 | #ifdef CONFIG_SMP | 15 | #ifdef CONFIG_SMP |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bd791e452ad7..5f2052c83154 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -490,6 +490,12 @@ struct zone { | |||
490 | unsigned long managed_pages; | 490 | unsigned long managed_pages; |
491 | 491 | ||
492 | /* | 492 | /* |
493 | * Number of MIGRATE_RESERVE page blocks. Maintained only as an | ||
494 | * optimization. Protected by zone->lock. | ||
495 | */ | ||
496 | int nr_migrate_reserve_block; | ||
497 | |||
498 | /* | ||
493 | * rarely used fields: | 499 | * rarely used fields: |
494 | */ | 500 | */ |
495 | const char *name; | 501 | const char *name; |
@@ -758,10 +764,7 @@ typedef struct pglist_data { | |||
758 | int kswapd_max_order; | 764 | int kswapd_max_order; |
759 | enum zone_type classzone_idx; | 765 | enum zone_type classzone_idx; |
760 | #ifdef CONFIG_NUMA_BALANCING | 766 | #ifdef CONFIG_NUMA_BALANCING |
761 | /* | 767 | /* Lock serializing the migrate rate limiting window */ |
762 | * Lock serializing the per destination node AutoNUMA memory | ||
763 | * migration rate limiting data. | ||
764 | */ | ||
765 | spinlock_t numabalancing_migrate_lock; | 768 | spinlock_t numabalancing_migrate_lock; |
766 | 769 | ||
767 | /* Rate limiting time interval */ | 770 | /* Rate limiting time interval */ |
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 7931efe71175..fb616942e4c7 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h | |||
@@ -94,78 +94,12 @@ extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); | |||
94 | extern struct posix_acl *get_posix_acl(struct inode *, int); | 94 | extern struct posix_acl *get_posix_acl(struct inode *, int); |
95 | extern int set_posix_acl(struct inode *, int, struct posix_acl *); | 95 | extern int set_posix_acl(struct inode *, int, struct posix_acl *); |
96 | 96 | ||
97 | #ifdef CONFIG_FS_POSIX_ACL | 97 | struct posix_acl **acl_by_type(struct inode *inode, int type); |
98 | static inline struct posix_acl **acl_by_type(struct inode *inode, int type) | 98 | struct posix_acl *get_cached_acl(struct inode *inode, int type); |
99 | { | 99 | struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); |
100 | switch (type) { | 100 | void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); |
101 | case ACL_TYPE_ACCESS: | 101 | void forget_cached_acl(struct inode *inode, int type); |
102 | return &inode->i_acl; | 102 | void forget_all_cached_acls(struct inode *inode); |
103 | case ACL_TYPE_DEFAULT: | ||
104 | return &inode->i_default_acl; | ||
105 | default: | ||
106 | BUG(); | ||
107 | } | ||
108 | } | ||
109 | |||
110 | static inline struct posix_acl *get_cached_acl(struct inode *inode, int type) | ||
111 | { | ||
112 | struct posix_acl **p = acl_by_type(inode, type); | ||
113 | struct posix_acl *acl = ACCESS_ONCE(*p); | ||
114 | if (acl) { | ||
115 | spin_lock(&inode->i_lock); | ||
116 | acl = *p; | ||
117 | if (acl != ACL_NOT_CACHED) | ||
118 | acl = posix_acl_dup(acl); | ||
119 | spin_unlock(&inode->i_lock); | ||
120 | } | ||
121 | return acl; | ||
122 | } | ||
123 | |||
124 | static inline struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) | ||
125 | { | ||
126 | return rcu_dereference(*acl_by_type(inode, type)); | ||
127 | } | ||
128 | |||
129 | static inline void set_cached_acl(struct inode *inode, | ||
130 | int type, | ||
131 | struct posix_acl *acl) | ||
132 | { | ||
133 | struct posix_acl **p = acl_by_type(inode, type); | ||
134 | struct posix_acl *old; | ||
135 | spin_lock(&inode->i_lock); | ||
136 | old = *p; | ||
137 | rcu_assign_pointer(*p, posix_acl_dup(acl)); | ||
138 | spin_unlock(&inode->i_lock); | ||
139 | if (old != ACL_NOT_CACHED) | ||
140 | posix_acl_release(old); | ||
141 | } | ||
142 | |||
143 | static inline void forget_cached_acl(struct inode *inode, int type) | ||
144 | { | ||
145 | struct posix_acl **p = acl_by_type(inode, type); | ||
146 | struct posix_acl *old; | ||
147 | spin_lock(&inode->i_lock); | ||
148 | old = *p; | ||
149 | *p = ACL_NOT_CACHED; | ||
150 | spin_unlock(&inode->i_lock); | ||
151 | if (old != ACL_NOT_CACHED) | ||
152 | posix_acl_release(old); | ||
153 | } | ||
154 | |||
155 | static inline void forget_all_cached_acls(struct inode *inode) | ||
156 | { | ||
157 | struct posix_acl *old_access, *old_default; | ||
158 | spin_lock(&inode->i_lock); | ||
159 | old_access = inode->i_acl; | ||
160 | old_default = inode->i_default_acl; | ||
161 | inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; | ||
162 | spin_unlock(&inode->i_lock); | ||
163 | if (old_access != ACL_NOT_CACHED) | ||
164 | posix_acl_release(old_access); | ||
165 | if (old_default != ACL_NOT_CACHED) | ||
166 | posix_acl_release(old_default); | ||
167 | } | ||
168 | #endif | ||
169 | 103 | ||
170 | static inline void cache_no_acl(struct inode *inode) | 104 | static inline void cache_no_acl(struct inode *inode) |
171 | { | 105 | { |
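The posix_acl.h change above demotes the ACL cache helpers from static inlines to plain declarations (their bodies move out of the header). The caching pattern they implement is an unlocked peek at the cached pointer followed by a locked re-read before taking a reference. A simplified userspace sketch with pthreads follows; acl_dup()/acl_release() and struct fake_inode are illustrative stand-ins, not kernel API.

#include <pthread.h>
#include <stdlib.h>

struct acl { int refcount; };

#define ACL_NOT_CACHED ((struct acl *)(-1L))

struct fake_inode {
	pthread_mutex_t lock;       /* models inode->i_lock */
	struct acl *cached_acl;     /* models inode->i_acl */
};

static struct acl *acl_dup(struct acl *acl) { acl->refcount++; return acl; }
static void acl_release(struct acl *acl) { if (--acl->refcount == 0) free(acl); }

/* Mirrors get_cached_acl(): cheap unlocked check first, then take the lock
 * and re-read before duplicating, so a concurrent forget/set cannot hand us
 * a stale pointer. */
static struct acl *get_cached_acl(struct fake_inode *inode)
{
	struct acl *acl = *(struct acl *volatile *)&inode->cached_acl; /* ACCESS_ONCE-style peek */

	if (acl) {
		pthread_mutex_lock(&inode->lock);
		acl = inode->cached_acl;
		if (acl != ACL_NOT_CACHED)
			acl = acl_dup(acl);
		pthread_mutex_unlock(&inode->lock);
	}
	return acl;
}

/* Mirrors forget_cached_acl(): swap in the sentinel under the lock and drop
 * the old reference afterwards. */
static void forget_cached_acl(struct fake_inode *inode)
{
	struct acl *old;

	pthread_mutex_lock(&inode->lock);
	old = inode->cached_acl;
	inode->cached_acl = ACL_NOT_CACHED;
	pthread_mutex_unlock(&inode->lock);
	if (old != ACL_NOT_CACHED)
		acl_release(old);
}

int main(void)
{
	struct fake_inode inode = { .lock = PTHREAD_MUTEX_INITIALIZER,
				    .cached_acl = ACL_NOT_CACHED };
	struct acl *acl = get_cached_acl(&inode);   /* sentinel is returned as-is */

	(void)acl;
	forget_cached_acl(&inode);
	return 0;
}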
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6dacb93a6d94..1da693d51255 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -184,13 +184,13 @@ static inline void page_dup_rmap(struct page *page) | |||
184 | int page_referenced(struct page *, int is_locked, | 184 | int page_referenced(struct page *, int is_locked, |
185 | struct mem_cgroup *memcg, unsigned long *vm_flags); | 185 | struct mem_cgroup *memcg, unsigned long *vm_flags); |
186 | int page_referenced_one(struct page *, struct vm_area_struct *, | 186 | int page_referenced_one(struct page *, struct vm_area_struct *, |
187 | unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); | 187 | unsigned long address, void *arg); |
188 | 188 | ||
189 | #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) | 189 | #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) |
190 | 190 | ||
191 | int try_to_unmap(struct page *, enum ttu_flags flags); | 191 | int try_to_unmap(struct page *, enum ttu_flags flags); |
192 | int try_to_unmap_one(struct page *, struct vm_area_struct *, | 192 | int try_to_unmap_one(struct page *, struct vm_area_struct *, |
193 | unsigned long address, enum ttu_flags flags); | 193 | unsigned long address, void *arg); |
194 | 194 | ||
195 | /* | 195 | /* |
196 | * Called from mm/filemap_xip.c to unmap empty zero page | 196 | * Called from mm/filemap_xip.c to unmap empty zero page |
@@ -236,10 +236,27 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma); | |||
236 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); | 236 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); |
237 | 237 | ||
238 | /* | 238 | /* |
239 | * Called by migrate.c to remove migration ptes, but might be used more later. | 239 | * rmap_walk_control: controls the rmap traversal for specific needs |
240 | * | ||
241 | * arg: passed to rmap_one() and invalid_vma() | ||
242 | * rmap_one: executed on each vma where the page is mapped | ||
243 | * done: checks whether the traversal should terminate | ||
244 | * file_nonlinear: handles nonlinear file mappings | ||
245 | * anon_lock: takes the anon_vma lock in an optimized way instead of the default | ||
246 | * invalid_vma: skips vmas that are not of interest | ||
240 | */ | 247 | */ |
241 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | 248 | struct rmap_walk_control { |
242 | struct vm_area_struct *, unsigned long, void *), void *arg); | 249 | void *arg; |
250 | int (*rmap_one)(struct page *page, struct vm_area_struct *vma, | ||
251 | unsigned long addr, void *arg); | ||
252 | int (*done)(struct page *page); | ||
253 | int (*file_nonlinear)(struct page *, struct address_space *, | ||
254 | struct vm_area_struct *vma); | ||
255 | struct anon_vma *(*anon_lock)(struct page *page); | ||
256 | bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); | ||
257 | }; | ||
258 | |||
259 | int rmap_walk(struct page *page, struct rmap_walk_control *rwc); | ||
243 | 260 | ||
244 | #else /* !CONFIG_MMU */ | 261 | #else /* !CONFIG_MMU */ |
245 | 262 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index ffccdad050b5..485234d2fd42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -549,6 +549,7 @@ struct signal_struct { | |||
549 | atomic_t sigcnt; | 549 | atomic_t sigcnt; |
550 | atomic_t live; | 550 | atomic_t live; |
551 | int nr_threads; | 551 | int nr_threads; |
552 | struct list_head thread_head; | ||
552 | 553 | ||
553 | wait_queue_head_t wait_chldexit; /* for wait4() */ | 554 | wait_queue_head_t wait_chldexit; /* for wait4() */ |
554 | 555 | ||
@@ -1271,6 +1272,7 @@ struct task_struct { | |||
1271 | /* PID/PID hash table linkage. */ | 1272 | /* PID/PID hash table linkage. */ |
1272 | struct pid_link pids[PIDTYPE_MAX]; | 1273 | struct pid_link pids[PIDTYPE_MAX]; |
1273 | struct list_head thread_group; | 1274 | struct list_head thread_group; |
1275 | struct list_head thread_node; | ||
1274 | 1276 | ||
1275 | struct completion *vfork_done; /* for vfork() */ | 1277 | struct completion *vfork_done; /* for vfork() */ |
1276 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ | 1278 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ |
@@ -2341,6 +2343,16 @@ extern bool current_is_single_threaded(void); | |||
2341 | #define while_each_thread(g, t) \ | 2343 | #define while_each_thread(g, t) \ |
2342 | while ((t = next_thread(t)) != g) | 2344 | while ((t = next_thread(t)) != g) |
2343 | 2345 | ||
2346 | #define __for_each_thread(signal, t) \ | ||
2347 | list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) | ||
2348 | |||
2349 | #define for_each_thread(p, t) \ | ||
2350 | __for_each_thread((p)->signal, t) | ||
2351 | |||
2352 | /* Careful: this is a double loop, 'break' won't work as expected. */ | ||
2353 | #define for_each_process_thread(p, t) \ | ||
2354 | for_each_process(p) for_each_thread(p, t) | ||
2355 | |||
2344 | static inline int get_nr_threads(struct task_struct *tsk) | 2356 | static inline int get_nr_threads(struct task_struct *tsk) |
2345 | { | 2357 | { |
2346 | return tsk->signal->nr_threads; | 2358 | return tsk->signal->nr_threads; |
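The for_each_process_thread() macro added above carries the warning that 'break' will not work as expected: the macro expands to two nested loops, and break only leaves the inner one. The pitfall can be shown with ordinary nested loops and no kernel types at all.

#include <stdio.h>

/* Expands to two nested loops, like for_each_process_thread(). */
#define for_each_outer_inner(i, j)            \
	for ((i) = 0; (i) < 3; (i)++)         \
		for ((j) = 0; (j) < 3; (j)++)

int main(void)
{
	int i, j, visited = 0;

	for_each_outer_inner(i, j) {
		visited++;
		if (i == 0 && j == 1)
			break;        /* only leaves the inner loop ... */
	}
	/* ... so the outer loop keeps running: visited is 8, not 2. */
	printf("visited=%d\n", visited);

	visited = 0;
	for_each_outer_inner(i, j) {
		visited++;
		if (i == 0 && j == 1)
			goto out;     /* a goto (or a flag) exits both loops */
	}
out:
	printf("visited=%d\n", visited);   /* 2 */
	return 0;
}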
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index fde1b3e94c7d..06f544ef2f6f 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h | |||
@@ -67,6 +67,48 @@ TRACE_EVENT(mm_compaction_migratepages, | |||
67 | __entry->nr_failed) | 67 | __entry->nr_failed) |
68 | ); | 68 | ); |
69 | 69 | ||
70 | TRACE_EVENT(mm_compaction_begin, | ||
71 | TP_PROTO(unsigned long zone_start, unsigned long migrate_start, | ||
72 | unsigned long free_start, unsigned long zone_end), | ||
73 | |||
74 | TP_ARGS(zone_start, migrate_start, free_start, zone_end), | ||
75 | |||
76 | TP_STRUCT__entry( | ||
77 | __field(unsigned long, zone_start) | ||
78 | __field(unsigned long, migrate_start) | ||
79 | __field(unsigned long, free_start) | ||
80 | __field(unsigned long, zone_end) | ||
81 | ), | ||
82 | |||
83 | TP_fast_assign( | ||
84 | __entry->zone_start = zone_start; | ||
85 | __entry->migrate_start = migrate_start; | ||
86 | __entry->free_start = free_start; | ||
87 | __entry->zone_end = zone_end; | ||
88 | ), | ||
89 | |||
90 | TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", | ||
91 | __entry->zone_start, | ||
92 | __entry->migrate_start, | ||
93 | __entry->free_start, | ||
94 | __entry->zone_end) | ||
95 | ); | ||
96 | |||
97 | TRACE_EVENT(mm_compaction_end, | ||
98 | TP_PROTO(int status), | ||
99 | |||
100 | TP_ARGS(status), | ||
101 | |||
102 | TP_STRUCT__entry( | ||
103 | __field(int, status) | ||
104 | ), | ||
105 | |||
106 | TP_fast_assign( | ||
107 | __entry->status = status; | ||
108 | ), | ||
109 | |||
110 | TP_printk("status=%d", __entry->status) | ||
111 | ); | ||
70 | 112 | ||
71 | #endif /* _TRACE_COMPACTION_H */ | 113 | #endif /* _TRACE_COMPACTION_H */ |
72 | 114 | ||
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index ec2a6ccfd7e5..3075ffbb9a83 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h | |||
@@ -45,6 +45,32 @@ TRACE_EVENT(mm_migrate_pages, | |||
45 | __print_symbolic(__entry->reason, MIGRATE_REASON)) | 45 | __print_symbolic(__entry->reason, MIGRATE_REASON)) |
46 | ); | 46 | ); |
47 | 47 | ||
48 | TRACE_EVENT(mm_numa_migrate_ratelimit, | ||
49 | |||
50 | TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages), | ||
51 | |||
52 | TP_ARGS(p, dst_nid, nr_pages), | ||
53 | |||
54 | TP_STRUCT__entry( | ||
55 | __array( char, comm, TASK_COMM_LEN) | ||
56 | __field( pid_t, pid) | ||
57 | __field( int, dst_nid) | ||
58 | __field( unsigned long, nr_pages) | ||
59 | ), | ||
60 | |||
61 | TP_fast_assign( | ||
62 | memcpy(__entry->comm, p->comm, TASK_COMM_LEN); | ||
63 | __entry->pid = p->pid; | ||
64 | __entry->dst_nid = dst_nid; | ||
65 | __entry->nr_pages = nr_pages; | ||
66 | ), | ||
67 | |||
68 | TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu", | ||
69 | __entry->comm, | ||
70 | __entry->pid, | ||
71 | __entry->dst_nid, | ||
72 | __entry->nr_pages) | ||
73 | ); | ||
48 | #endif /* _TRACE_MIGRATE_H */ | 74 | #endif /* _TRACE_MIGRATE_H */ |
49 | 75 | ||
50 | /* This part must be outside protection */ | 76 | /* This part must be outside protection */ |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 04c308413a5d..67e1bbf83695 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
@@ -443,6 +443,93 @@ TRACE_EVENT(sched_process_hang, | |||
443 | ); | 443 | ); |
444 | #endif /* CONFIG_DETECT_HUNG_TASK */ | 444 | #endif /* CONFIG_DETECT_HUNG_TASK */ |
445 | 445 | ||
446 | DECLARE_EVENT_CLASS(sched_move_task_template, | ||
447 | |||
448 | TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), | ||
449 | |||
450 | TP_ARGS(tsk, src_cpu, dst_cpu), | ||
451 | |||
452 | TP_STRUCT__entry( | ||
453 | __field( pid_t, pid ) | ||
454 | __field( pid_t, tgid ) | ||
455 | __field( pid_t, ngid ) | ||
456 | __field( int, src_cpu ) | ||
457 | __field( int, src_nid ) | ||
458 | __field( int, dst_cpu ) | ||
459 | __field( int, dst_nid ) | ||
460 | ), | ||
461 | |||
462 | TP_fast_assign( | ||
463 | __entry->pid = task_pid_nr(tsk); | ||
464 | __entry->tgid = task_tgid_nr(tsk); | ||
465 | __entry->ngid = task_numa_group_id(tsk); | ||
466 | __entry->src_cpu = src_cpu; | ||
467 | __entry->src_nid = cpu_to_node(src_cpu); | ||
468 | __entry->dst_cpu = dst_cpu; | ||
469 | __entry->dst_nid = cpu_to_node(dst_cpu); | ||
470 | ), | ||
471 | |||
472 | TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", | ||
473 | __entry->pid, __entry->tgid, __entry->ngid, | ||
474 | __entry->src_cpu, __entry->src_nid, | ||
475 | __entry->dst_cpu, __entry->dst_nid) | ||
476 | ); | ||
477 | |||
478 | /* | ||
479 | * Tracks migration of tasks from one runqueue to another. Can be used to | ||
480 | * detect if automatic NUMA balancing is bouncing between nodes | ||
481 | */ | ||
482 | DEFINE_EVENT(sched_move_task_template, sched_move_numa, | ||
483 | TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), | ||
484 | |||
485 | TP_ARGS(tsk, src_cpu, dst_cpu) | ||
486 | ); | ||
487 | |||
488 | DEFINE_EVENT(sched_move_task_template, sched_stick_numa, | ||
489 | TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), | ||
490 | |||
491 | TP_ARGS(tsk, src_cpu, dst_cpu) | ||
492 | ); | ||
493 | |||
494 | TRACE_EVENT(sched_swap_numa, | ||
495 | |||
496 | TP_PROTO(struct task_struct *src_tsk, int src_cpu, | ||
497 | struct task_struct *dst_tsk, int dst_cpu), | ||
498 | |||
499 | TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), | ||
500 | |||
501 | TP_STRUCT__entry( | ||
502 | __field( pid_t, src_pid ) | ||
503 | __field( pid_t, src_tgid ) | ||
504 | __field( pid_t, src_ngid ) | ||
505 | __field( int, src_cpu ) | ||
506 | __field( int, src_nid ) | ||
507 | __field( pid_t, dst_pid ) | ||
508 | __field( pid_t, dst_tgid ) | ||
509 | __field( pid_t, dst_ngid ) | ||
510 | __field( int, dst_cpu ) | ||
511 | __field( int, dst_nid ) | ||
512 | ), | ||
513 | |||
514 | TP_fast_assign( | ||
515 | __entry->src_pid = task_pid_nr(src_tsk); | ||
516 | __entry->src_tgid = task_tgid_nr(src_tsk); | ||
517 | __entry->src_ngid = task_numa_group_id(src_tsk); | ||
518 | __entry->src_cpu = src_cpu; | ||
519 | __entry->src_nid = cpu_to_node(src_cpu); | ||
520 | __entry->dst_pid = task_pid_nr(dst_tsk); | ||
521 | __entry->dst_tgid = task_tgid_nr(dst_tsk); | ||
522 | __entry->dst_ngid = task_numa_group_id(dst_tsk); | ||
523 | __entry->dst_cpu = dst_cpu; | ||
524 | __entry->dst_nid = cpu_to_node(dst_cpu); | ||
525 | ), | ||
526 | |||
527 | TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", | ||
528 | __entry->src_pid, __entry->src_tgid, __entry->src_ngid, | ||
529 | __entry->src_cpu, __entry->src_nid, | ||
530 | __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, | ||
531 | __entry->dst_cpu, __entry->dst_nid) | ||
532 | ); | ||
446 | #endif /* _TRACE_SCHED_H */ | 533 | #endif /* _TRACE_SCHED_H */ |
447 | 534 | ||
448 | /* This part must be outside protection */ | 535 | /* This part must be outside protection */ |
diff --git a/init/main.c b/init/main.c index febc511e078a..f865261fb096 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -355,9 +355,11 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } | |||
355 | */ | 355 | */ |
356 | static void __init setup_command_line(char *command_line) | 356 | static void __init setup_command_line(char *command_line) |
357 | { | 357 | { |
358 | saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); | 358 | saved_command_line = |
359 | initcall_command_line = alloc_bootmem(strlen (boot_command_line)+1); | 359 | memblock_virt_alloc(strlen(boot_command_line) + 1, 0); |
360 | static_command_line = alloc_bootmem(strlen (command_line)+1); | 360 | initcall_command_line = |
361 | memblock_virt_alloc(strlen(boot_command_line) + 1, 0); | ||
362 | static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); | ||
361 | strcpy (saved_command_line, boot_command_line); | 363 | strcpy (saved_command_line, boot_command_line); |
362 | strcpy (static_command_line, command_line); | 364 | strcpy (static_command_line, command_line); |
363 | } | 365 | } |
@@ -476,7 +478,7 @@ static void __init mm_init(void) | |||
476 | mem_init(); | 478 | mem_init(); |
477 | kmem_cache_init(); | 479 | kmem_cache_init(); |
478 | percpu_init_late(); | 480 | percpu_init_late(); |
479 | pgtable_cache_init(); | 481 | pgtable_init(); |
480 | vmalloc_init(); | 482 | vmalloc_init(); |
481 | } | 483 | } |
482 | 484 | ||
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 43c307dc9453..67ccf0e7cca9 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
@@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk) | |||
912 | } | 912 | } |
913 | 913 | ||
914 | static int audit_tree_handle_event(struct fsnotify_group *group, | 914 | static int audit_tree_handle_event(struct fsnotify_group *group, |
915 | struct inode *to_tell, | ||
915 | struct fsnotify_mark *inode_mark, | 916 | struct fsnotify_mark *inode_mark, |
916 | struct fsnotify_mark *vfsmonut_mark, | 917 | struct fsnotify_mark *vfsmount_mark, |
917 | struct fsnotify_event *event) | 918 | u32 mask, void *data, int data_type, |
919 | const unsigned char *file_name) | ||
918 | { | 920 | { |
919 | BUG(); | 921 | return 0; |
920 | return -EOPNOTSUPP; | ||
921 | } | 922 | } |
922 | 923 | ||
923 | static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) | 924 | static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) |
@@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify | |||
933 | BUG_ON(atomic_read(&entry->refcnt) < 1); | 934 | BUG_ON(atomic_read(&entry->refcnt) < 1); |
934 | } | 935 | } |
935 | 936 | ||
936 | static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, | ||
937 | struct fsnotify_mark *inode_mark, | ||
938 | struct fsnotify_mark *vfsmount_mark, | ||
939 | __u32 mask, void *data, int data_type) | ||
940 | { | ||
941 | return false; | ||
942 | } | ||
943 | |||
944 | static const struct fsnotify_ops audit_tree_ops = { | 937 | static const struct fsnotify_ops audit_tree_ops = { |
945 | .handle_event = audit_tree_handle_event, | 938 | .handle_event = audit_tree_handle_event, |
946 | .should_send_event = audit_tree_send_event, | ||
947 | .free_group_priv = NULL, | ||
948 | .free_event_priv = NULL, | ||
949 | .freeing_mark = audit_tree_freeing_mark, | 939 | .freeing_mark = audit_tree_freeing_mark, |
950 | }; | 940 | }; |
951 | 941 | ||
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 22831c4d369c..2596fac5dcb4 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c | |||
@@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule) | |||
465 | } | 465 | } |
466 | } | 466 | } |
467 | 467 | ||
468 | static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, | ||
469 | struct fsnotify_mark *inode_mark, | ||
470 | struct fsnotify_mark *vfsmount_mark, | ||
471 | __u32 mask, void *data, int data_type) | ||
472 | { | ||
473 | return true; | ||
474 | } | ||
475 | |||
476 | /* Update watch data in audit rules based on fsnotify events. */ | 468 | /* Update watch data in audit rules based on fsnotify events. */ |
477 | static int audit_watch_handle_event(struct fsnotify_group *group, | 469 | static int audit_watch_handle_event(struct fsnotify_group *group, |
470 | struct inode *to_tell, | ||
478 | struct fsnotify_mark *inode_mark, | 471 | struct fsnotify_mark *inode_mark, |
479 | struct fsnotify_mark *vfsmount_mark, | 472 | struct fsnotify_mark *vfsmount_mark, |
480 | struct fsnotify_event *event) | 473 | u32 mask, void *data, int data_type, |
474 | const unsigned char *dname) | ||
481 | { | 475 | { |
482 | struct inode *inode; | 476 | struct inode *inode; |
483 | __u32 mask = event->mask; | ||
484 | const char *dname = event->file_name; | ||
485 | struct audit_parent *parent; | 477 | struct audit_parent *parent; |
486 | 478 | ||
487 | parent = container_of(inode_mark, struct audit_parent, mark); | 479 | parent = container_of(inode_mark, struct audit_parent, mark); |
488 | 480 | ||
489 | BUG_ON(group != audit_watch_group); | 481 | BUG_ON(group != audit_watch_group); |
490 | 482 | ||
491 | switch (event->data_type) { | 483 | switch (data_type) { |
492 | case (FSNOTIFY_EVENT_PATH): | 484 | case (FSNOTIFY_EVENT_PATH): |
493 | inode = event->path.dentry->d_inode; | 485 | inode = ((struct path *)data)->dentry->d_inode; |
494 | break; | 486 | break; |
495 | case (FSNOTIFY_EVENT_INODE): | 487 | case (FSNOTIFY_EVENT_INODE): |
496 | inode = event->inode; | 488 | inode = (struct inode *)data; |
497 | break; | 489 | break; |
498 | default: | 490 | default: |
499 | BUG(); | 491 | BUG(); |
@@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, | |||
512 | } | 504 | } |
513 | 505 | ||
514 | static const struct fsnotify_ops audit_watch_fsnotify_ops = { | 506 | static const struct fsnotify_ops audit_watch_fsnotify_ops = { |
515 | .should_send_event = audit_watch_should_send_event, | ||
516 | .handle_event = audit_watch_handle_event, | 507 | .handle_event = audit_watch_handle_event, |
517 | .free_group_priv = NULL, | ||
518 | .freeing_mark = NULL, | ||
519 | .free_event_priv = NULL, | ||
520 | }; | 508 | }; |
521 | 509 | ||
522 | static int __init audit_watch_init(void) | 510 | static int __init audit_watch_init(void) |
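In the new fsnotify ->handle_event() signature used above, the event payload arrives as a void *data plus a data_type discriminator instead of a struct fsnotify_event, and the handler casts data according to FSNOTIFY_EVENT_PATH vs FSNOTIFY_EVENT_INODE. A reduced sketch of that tagged-pointer dispatch, with made-up types standing in for struct path and struct inode:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the kernel's FSNOTIFY_EVENT_* discriminators. */
enum data_type { EVENT_PATH, EVENT_INODE };

struct fake_inode { unsigned long ino; };
struct fake_path  { struct fake_inode *inode; };

/* The callee recovers the concrete type from the (void *, data_type) pair,
 * as audit_watch_handle_event() does with path/inode payloads. */
static struct fake_inode *payload_to_inode(void *data, enum data_type type)
{
	switch (type) {
	case EVENT_PATH:
		return ((struct fake_path *)data)->inode;
	case EVENT_INODE:
		return (struct fake_inode *)data;
	default:
		abort();   /* the kernel BUG()s on an unknown type */
	}
}

int main(void)
{
	struct fake_inode inode = { .ino = 42 };
	struct fake_path path = { .inode = &inode };

	printf("%lu %lu\n",
	       payload_to_inode(&path, EVENT_PATH)->ino,
	       payload_to_inode(&inode, EVENT_INODE)->ino);
	return 0;
}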
diff --git a/kernel/exit.c b/kernel/exit.c index a949819055d5..1e77fc645317 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | } | 75 | } |
76 | list_del_rcu(&p->thread_group); | 76 | list_del_rcu(&p->thread_group); |
77 | list_del_rcu(&p->thread_node); | ||
77 | } | 78 | } |
78 | 79 | ||
79 | /* | 80 | /* |
diff --git a/kernel/fork.c b/kernel/fork.c index 294189fc7ac8..2f11bbe376b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1035,6 +1035,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1035 | sig->nr_threads = 1; | 1035 | sig->nr_threads = 1; |
1036 | atomic_set(&sig->live, 1); | 1036 | atomic_set(&sig->live, 1); |
1037 | atomic_set(&sig->sigcnt, 1); | 1037 | atomic_set(&sig->sigcnt, 1); |
1038 | |||
1039 | /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ | ||
1040 | sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); | ||
1041 | tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); | ||
1042 | |||
1038 | init_waitqueue_head(&sig->wait_chldexit); | 1043 | init_waitqueue_head(&sig->wait_chldexit); |
1039 | sig->curr_target = tsk; | 1044 | sig->curr_target = tsk; |
1040 | init_sigpending(&sig->shared_pending); | 1045 | init_sigpending(&sig->shared_pending); |
@@ -1474,6 +1479,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1474 | atomic_inc(¤t->signal->sigcnt); | 1479 | atomic_inc(¤t->signal->sigcnt); |
1475 | list_add_tail_rcu(&p->thread_group, | 1480 | list_add_tail_rcu(&p->thread_group, |
1476 | &p->group_leader->thread_group); | 1481 | &p->group_leader->thread_group); |
1482 | list_add_tail_rcu(&p->thread_node, | ||
1483 | &p->signal->thread_head); | ||
1477 | } | 1484 | } |
1478 | attach_pid(p, PIDTYPE_PID); | 1485 | attach_pid(p, PIDTYPE_PID); |
1479 | nr_threads++; | 1486 | nr_threads++; |
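The copy_signal() hunk above seeds the new per-signal thread list by cross-initializing the two list heads: each is initialized to point at the other, which leaves the pair in exactly the state a list_add(&tsk->thread_node, &sig->thread_head) on an empty list would produce, with no separate INIT_LIST_HEAD() step. A userspace sketch with a minimal doubly-linked list (list_head is re-declared here, not taken from kernel headers):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

int main(void)
{
	struct list_head thread_head;   /* models signal->thread_head */
	struct list_head thread_node;   /* models task->thread_node   */

	/* Cross-initialize: head points at node and node points at head,
	 * i.e. a two-element circular list in one step. */
	thread_head = (struct list_head)LIST_HEAD_INIT(thread_node);
	thread_node = (struct list_head)LIST_HEAD_INIT(thread_head);

	/* Same end state as INIT_LIST_HEAD(&thread_head) followed by
	 * list_add(&thread_node, &thread_head). */
	printf("head.next==&node: %d, node.next==&head: %d\n",
	       thread_head.next == &thread_node,
	       thread_node.next == &thread_head);
	printf("head.prev==&node: %d, node.prev==&head: %d\n",
	       thread_head.prev == &thread_node,
	       thread_node.prev == &thread_head);
	return 0;
}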
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b38109e204af..d9f61a145802 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, | |||
637 | BUG_ON(!region); | 637 | BUG_ON(!region); |
638 | } else | 638 | } else |
639 | /* This allocation cannot fail */ | 639 | /* This allocation cannot fail */ |
640 | region = alloc_bootmem(sizeof(struct nosave_region)); | 640 | region = memblock_virt_alloc(sizeof(struct nosave_region), 0); |
641 | region->start_pfn = start_pfn; | 641 | region->start_pfn = start_pfn; |
642 | region->end_pfn = end_pfn; | 642 | region->end_pfn = end_pfn; |
643 | list_add_tail(®ion->list, &nosave_regions); | 643 | list_add_tail(®ion->list, &nosave_regions); |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index be7c86bae576..f8b41bddc6dc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -757,14 +757,10 @@ void __init setup_log_buf(int early) | |||
757 | return; | 757 | return; |
758 | 758 | ||
759 | if (early) { | 759 | if (early) { |
760 | unsigned long mem; | 760 | new_log_buf = |
761 | 761 | memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); | |
762 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); | ||
763 | if (!mem) | ||
764 | return; | ||
765 | new_log_buf = __va(mem); | ||
766 | } else { | 762 | } else { |
767 | new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); | 763 | new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); |
768 | } | 764 | } |
769 | 765 | ||
770 | if (unlikely(!new_log_buf)) { | 766 | if (unlikely(!new_log_buf)) { |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3897e09e86a2..4d6964e49711 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) | |||
1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) | 1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) |
1109 | goto out; | 1109 | goto out; |
1110 | 1110 | ||
1111 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); | ||
1111 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); | 1112 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); |
1112 | 1113 | ||
1113 | out: | 1114 | out: |
@@ -4603,6 +4604,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) | |||
4603 | 4604 | ||
4604 | /* TODO: This is not properly updating schedstats */ | 4605 | /* TODO: This is not properly updating schedstats */ |
4605 | 4606 | ||
4607 | trace_sched_move_numa(p, curr_cpu, target_cpu); | ||
4606 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); | 4608 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); |
4607 | } | 4609 | } |
4608 | 4610 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b24b6cfde9aa..867b0a4b0893 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -1250,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p) | |||
1250 | p->numa_scan_period = task_scan_min(p); | 1250 | p->numa_scan_period = task_scan_min(p); |
1251 | 1251 | ||
1252 | if (env.best_task == NULL) { | 1252 | if (env.best_task == NULL) { |
1253 | int ret = migrate_task_to(p, env.best_cpu); | 1253 | ret = migrate_task_to(p, env.best_cpu); |
1254 | if (ret != 0) | ||
1255 | trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); | ||
1254 | return ret; | 1256 | return ret; |
1255 | } | 1257 | } |
1256 | 1258 | ||
1257 | ret = migrate_swap(p, env.best_task); | 1259 | ret = migrate_swap(p, env.best_task); |
1260 | if (ret != 0) | ||
1261 | trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); | ||
1258 | put_task_struct(env.best_task); | 1262 | put_task_struct(env.best_task); |
1259 | return ret; | 1263 | return ret; |
1260 | } | 1264 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f905cf..332cefcdb04b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -95,8 +95,6 @@ | |||
95 | #if defined(CONFIG_SYSCTL) | 95 | #if defined(CONFIG_SYSCTL) |
96 | 96 | ||
97 | /* External variables not in a header file. */ | 97 | /* External variables not in a header file. */ |
98 | extern int sysctl_overcommit_memory; | ||
99 | extern int sysctl_overcommit_ratio; | ||
100 | extern int max_threads; | 98 | extern int max_threads; |
101 | extern int suid_dumpable; | 99 | extern int suid_dumpable; |
102 | #ifdef CONFIG_COREDUMP | 100 | #ifdef CONFIG_COREDUMP |
@@ -1121,7 +1119,14 @@ static struct ctl_table vm_table[] = { | |||
1121 | .data = &sysctl_overcommit_ratio, | 1119 | .data = &sysctl_overcommit_ratio, |
1122 | .maxlen = sizeof(sysctl_overcommit_ratio), | 1120 | .maxlen = sizeof(sysctl_overcommit_ratio), |
1123 | .mode = 0644, | 1121 | .mode = 0644, |
1124 | .proc_handler = proc_dointvec, | 1122 | .proc_handler = overcommit_ratio_handler, |
1123 | }, | ||
1124 | { | ||
1125 | .procname = "overcommit_kbytes", | ||
1126 | .data = &sysctl_overcommit_kbytes, | ||
1127 | .maxlen = sizeof(sysctl_overcommit_kbytes), | ||
1128 | .mode = 0644, | ||
1129 | .proc_handler = overcommit_kbytes_handler, | ||
1125 | }, | 1130 | }, |
1126 | { | 1131 | { |
1127 | .procname = "page-cluster", | 1132 | .procname = "page-cluster", |
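The sysctl table above wires vm.overcommit_ratio and the new vm.overcommit_kbytes to dedicated handlers instead of plain proc_dointvec. The apparent intent (the policy lives outside this hunk, so treat it as an assumption) is that the two knobs are mutually exclusive: writing one resets the other, so only the percentage or the absolute limit is in force at a time. A hedged userspace sketch of that "last writer wins, counterpart reset" logic:

#include <stdio.h>

/* Globals modeling the two sysctl knobs. */
static int overcommit_ratio = 50;          /* percent of RAM */
static unsigned long overcommit_kbytes;    /* absolute limit, 0 = unused */

/* Assumed policy (not shown in this hunk): writing one knob zeroes the
 * other, so at most one of them drives the overcommit limit. */
static void write_overcommit_ratio(int val)
{
	overcommit_ratio = val;
	overcommit_kbytes = 0;
}

static void write_overcommit_kbytes(unsigned long val)
{
	overcommit_kbytes = val;
	overcommit_ratio = 0;
}

int main(void)
{
	write_overcommit_kbytes(1048576);   /* switch to an absolute 1 GiB limit */
	printf("ratio=%d kbytes=%lu\n", overcommit_ratio, overcommit_kbytes);

	write_overcommit_ratio(80);         /* back to a percentage-based limit */
	printf("ratio=%d kbytes=%lu\n", overcommit_ratio, overcommit_kbytes);
	return 0;
}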
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6982094a7e74..900b63c1e899 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -1584,8 +1584,16 @@ config DMA_API_DEBUG | |||
1584 | With this option you will be able to detect common bugs in device | 1584 | With this option you will be able to detect common bugs in device |
1585 | drivers like double-freeing of DMA mappings or freeing mappings that | 1585 | drivers like double-freeing of DMA mappings or freeing mappings that |
1586 | were never allocated. | 1586 | were never allocated. |
1587 | This option causes a performance degredation. Use only if you want | 1587 | |
1588 | to debug device drivers. If unsure, say N. | 1588 | This also attempts to catch cases where a page owned by DMA is |
1589 | accessed by the cpu in a way that could cause data corruption. For | ||
1590 | example, this enables cow_user_page() to check that the source page is | ||
1591 | not undergoing DMA. | ||
1592 | |||
1593 | This option causes a performance degradation. Use only if you want to | ||
1594 | debug device drivers and dma interactions. | ||
1595 | |||
1596 | If unsure, say N. | ||
1589 | 1597 | ||
1590 | source "samples/Kconfig" | 1598 | source "samples/Kconfig" |
1591 | 1599 | ||
diff --git a/lib/cpumask.c b/lib/cpumask.c index d327b87c99b7..b810b753c607 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c | |||
@@ -140,7 +140,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var); | |||
140 | */ | 140 | */ |
141 | void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) | 141 | void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) |
142 | { | 142 | { |
143 | *mask = alloc_bootmem(cpumask_size()); | 143 | *mask = memblock_virt_alloc(cpumask_size(), 0); |
144 | } | 144 | } |
145 | 145 | ||
146 | /** | 146 | /** |
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var); | |||
161 | */ | 161 | */ |
162 | void __init free_bootmem_cpumask_var(cpumask_var_t mask) | 162 | void __init free_bootmem_cpumask_var(cpumask_var_t mask) |
163 | { | 163 | { |
164 | free_bootmem(__pa(mask), cpumask_size()); | 164 | memblock_free_early(__pa(mask), cpumask_size()); |
165 | } | 165 | } |
166 | #endif | 166 | #endif |
diff --git a/lib/dma-debug.c b/lib/dma-debug.c index d87a17a819d0..c38083871f11 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c | |||
@@ -53,11 +53,26 @@ enum map_err_types { | |||
53 | 53 | ||
54 | #define DMA_DEBUG_STACKTRACE_ENTRIES 5 | 54 | #define DMA_DEBUG_STACKTRACE_ENTRIES 5 |
55 | 55 | ||
56 | /** | ||
57 | * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping | ||
58 | * @list: node on pre-allocated free_entries list | ||
59 | * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent | ||
60 | * @type: single, page, sg, coherent | ||
61 | * @pfn: page frame of the start address | ||
62 | * @offset: offset of mapping relative to pfn | ||
63 | * @size: length of the mapping | ||
64 | * @direction: enum dma_data_direction | ||
65 | * @sg_call_ents: 'nents' from dma_map_sg | ||
66 | * @sg_mapped_ents: 'mapped_ents' from dma_map_sg | ||
67 | * @map_err_type: track whether dma_mapping_error() was checked | ||
68 | * @stacktrace: support backtraces when a violation is detected | ||
69 | */ | ||
56 | struct dma_debug_entry { | 70 | struct dma_debug_entry { |
57 | struct list_head list; | 71 | struct list_head list; |
58 | struct device *dev; | 72 | struct device *dev; |
59 | int type; | 73 | int type; |
60 | phys_addr_t paddr; | 74 | unsigned long pfn; |
75 | size_t offset; | ||
61 | u64 dev_addr; | 76 | u64 dev_addr; |
62 | u64 size; | 77 | u64 size; |
63 | int direction; | 78 | int direction; |
@@ -372,6 +387,11 @@ static void hash_bucket_del(struct dma_debug_entry *entry) | |||
372 | list_del(&entry->list); | 387 | list_del(&entry->list); |
373 | } | 388 | } |
374 | 389 | ||
390 | static unsigned long long phys_addr(struct dma_debug_entry *entry) | ||
391 | { | ||
392 | return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; | ||
393 | } | ||
394 | |||
375 | /* | 395 | /* |
376 | * Dump mapping entries for debugging purposes | 396 | * Dump mapping entries for debugging purposes |
377 | */ | 397 | */ |
@@ -389,9 +409,9 @@ void debug_dma_dump_mappings(struct device *dev) | |||
389 | list_for_each_entry(entry, &bucket->list, list) { | 409 | list_for_each_entry(entry, &bucket->list, list) { |
390 | if (!dev || dev == entry->dev) { | 410 | if (!dev || dev == entry->dev) { |
391 | dev_info(entry->dev, | 411 | dev_info(entry->dev, |
392 | "%s idx %d P=%Lx D=%Lx L=%Lx %s %s\n", | 412 | "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", |
393 | type2name[entry->type], idx, | 413 | type2name[entry->type], idx, |
394 | (unsigned long long)entry->paddr, | 414 | phys_addr(entry), entry->pfn, |
395 | entry->dev_addr, entry->size, | 415 | entry->dev_addr, entry->size, |
396 | dir2name[entry->direction], | 416 | dir2name[entry->direction], |
397 | maperr2str[entry->map_err_type]); | 417 | maperr2str[entry->map_err_type]); |
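With struct dma_debug_entry now recording a pfn plus an offset instead of a raw physical address, the new phys_addr() helper reconstructs the CPU physical address on demand. A tiny sketch of that arithmetic, assuming 4 KiB pages and a flat physical address space (real page_to_phys() may involve memory-model specifics):

#include <stddef.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages for the example */

struct entry { unsigned long pfn; size_t offset; };

/* Equivalent of the new phys_addr() helper: page_to_phys(pfn_to_page(pfn))
 * reduces to pfn << PAGE_SHIFT on a flat physical address space. */
static unsigned long long phys_addr(const struct entry *e)
{
	return ((unsigned long long)e->pfn << PAGE_SHIFT) + e->offset;
}

int main(void)
{
	/* A mapping that starts 0x80 bytes into page frame 0x1234. */
	struct entry e = { .pfn = 0x1234, .offset = 0x80 };

	printf("P=%llx\n", phys_addr(&e));   /* 0x1234080 */
	return 0;
}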
@@ -404,6 +424,133 @@ void debug_dma_dump_mappings(struct device *dev) | |||
404 | EXPORT_SYMBOL(debug_dma_dump_mappings); | 424 | EXPORT_SYMBOL(debug_dma_dump_mappings); |
405 | 425 | ||
406 | /* | 426 | /* |
427 | * For each page mapped (initial page in the case of | ||
428 | * dma_alloc_coherent/dma_map_{single|page}, or each page in a | ||
429 | * scatterlist) insert into this tree using the pfn as the key. At | ||
430 | * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If | ||
431 | * the pfn already exists at insertion time add a tag as a reference | ||
432 | * count for the overlapping mappings. For now, the overlap tracking | ||
433 | * just ensures that 'unmaps' balance 'maps' before marking the pfn | ||
434 | * idle, but we should also be flagging overlaps as an API violation. | ||
435 | * | ||
436 | * Memory usage is mostly constrained by the maximum number of available | ||
437 | * dma-debug entries in that we need a free dma_debug_entry before | ||
438 | * inserting into the tree. In the case of dma_map_{single|page} and | ||
439 | * dma_alloc_coherent there is only one dma_debug_entry and one pfn to | ||
440 | * track per event. dma_map_sg(), on the other hand, | ||
441 | * consumes a single dma_debug_entry, but inserts 'nents' entries into | ||
442 | * the tree. | ||
443 | * | ||
444 | * At any time debug_dma_assert_idle() can be called to trigger a | ||
445 | * warning if the given page is in the active set. | ||
446 | */ | ||
447 | static RADIX_TREE(dma_active_pfn, GFP_NOWAIT); | ||
448 | static DEFINE_SPINLOCK(radix_lock); | ||
449 | #define ACTIVE_PFN_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) | ||
450 | |||
451 | static int active_pfn_read_overlap(unsigned long pfn) | ||
452 | { | ||
453 | int overlap = 0, i; | ||
454 | |||
455 | for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) | ||
456 | if (radix_tree_tag_get(&dma_active_pfn, pfn, i)) | ||
457 | overlap |= 1 << i; | ||
458 | return overlap; | ||
459 | } | ||
460 | |||
461 | static int active_pfn_set_overlap(unsigned long pfn, int overlap) | ||
462 | { | ||
463 | int i; | ||
464 | |||
465 | if (overlap > ACTIVE_PFN_MAX_OVERLAP || overlap < 0) | ||
466 | return 0; | ||
467 | |||
468 | for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) | ||
469 | if (overlap & 1 << i) | ||
470 | radix_tree_tag_set(&dma_active_pfn, pfn, i); | ||
471 | else | ||
472 | radix_tree_tag_clear(&dma_active_pfn, pfn, i); | ||
473 | |||
474 | return overlap; | ||
475 | } | ||
476 | |||
477 | static void active_pfn_inc_overlap(unsigned long pfn) | ||
478 | { | ||
479 | int overlap = active_pfn_read_overlap(pfn); | ||
480 | |||
481 | overlap = active_pfn_set_overlap(pfn, ++overlap); | ||
482 | |||
483 | /* If we overflowed the overlap counter then we're potentially | ||
484 | * leaking dma-mappings. Otherwise, if maps and unmaps are | ||
485 | * balanced then this overflow may cause false negatives in | ||
486 | * debug_dma_assert_idle() as the pfn may be marked idle | ||
487 | * prematurely. | ||
488 | */ | ||
489 | WARN_ONCE(overlap == 0, | ||
490 | "DMA-API: exceeded %d overlapping mappings of pfn %lx\n", | ||
491 | ACTIVE_PFN_MAX_OVERLAP, pfn); | ||
492 | } | ||
493 | |||
494 | static int active_pfn_dec_overlap(unsigned long pfn) | ||
495 | { | ||
496 | int overlap = active_pfn_read_overlap(pfn); | ||
497 | |||
498 | return active_pfn_set_overlap(pfn, --overlap); | ||
499 | } | ||
500 | |||
501 | static int active_pfn_insert(struct dma_debug_entry *entry) | ||
502 | { | ||
503 | unsigned long flags; | ||
504 | int rc; | ||
505 | |||
506 | spin_lock_irqsave(&radix_lock, flags); | ||
507 | rc = radix_tree_insert(&dma_active_pfn, entry->pfn, entry); | ||
508 | if (rc == -EEXIST) | ||
509 | active_pfn_inc_overlap(entry->pfn); | ||
510 | spin_unlock_irqrestore(&radix_lock, flags); | ||
511 | |||
512 | return rc; | ||
513 | } | ||
514 | |||
515 | static void active_pfn_remove(struct dma_debug_entry *entry) | ||
516 | { | ||
517 | unsigned long flags; | ||
518 | |||
519 | spin_lock_irqsave(&radix_lock, flags); | ||
520 | if (active_pfn_dec_overlap(entry->pfn) == 0) | ||
521 | radix_tree_delete(&dma_active_pfn, entry->pfn); | ||
522 | spin_unlock_irqrestore(&radix_lock, flags); | ||
523 | } | ||
524 | |||
525 | /** | ||
526 | * debug_dma_assert_idle() - assert that a page is not undergoing dma | ||
527 | * @page: page to lookup in the dma_active_pfn tree | ||
528 | * | ||
529 | * Place a call to this routine in cases where the cpu touching the page | ||
530 | * before the dma completes (page is dma_unmapped) will lead to data | ||
531 | * corruption. | ||
532 | */ | ||
533 | void debug_dma_assert_idle(struct page *page) | ||
534 | { | ||
535 | unsigned long flags; | ||
536 | struct dma_debug_entry *entry; | ||
537 | |||
538 | if (!page) | ||
539 | return; | ||
540 | |||
541 | spin_lock_irqsave(&radix_lock, flags); | ||
542 | entry = radix_tree_lookup(&dma_active_pfn, page_to_pfn(page)); | ||
543 | spin_unlock_irqrestore(&radix_lock, flags); | ||
544 | |||
545 | if (!entry) | ||
546 | return; | ||
547 | |||
548 | err_printk(entry->dev, entry, | ||
549 | "DMA-API: cpu touching an active dma mapped page " | ||
550 | "[pfn=0x%lx]\n", entry->pfn); | ||
551 | } | ||
552 | |||
553 | /* | ||
407 | * Wrapper function for adding an entry to the hash. | 554 | * Wrapper function for adding an entry to the hash. |
408 | * This function takes care of locking itself. | 555 | * This function takes care of locking itself. |
409 | */ | 556 | */ |
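The comment block above describes how dma-debug now keeps a per-pfn "active" set in a radix tree, reusing the tree's tag bits as a small saturating counter of overlapping mappings. The sketch below models only the counter encoding in userspace, with a flat array in place of the radix tree; the names and the array size are illustrative.

#include <stdio.h>

#define MAX_TAGS 3                              /* RADIX_TREE_MAX_TAGS */
#define MAX_OVERLAP ((1 << MAX_TAGS) - 1)       /* counter saturates at 7 */
#define NPFNS 16

/* One MAX_TAGS-bit counter per pfn; the kernel spreads these bits across
 * the radix tree's per-entry tags instead of a flat array. */
static unsigned char overlap[NPFNS];

static int read_overlap(unsigned long pfn)  { return overlap[pfn]; }

static int set_overlap(unsigned long pfn, int val)
{
	if (val < 0 || val > MAX_OVERLAP)
		return 0;                 /* out of range: leave the bits alone */
	overlap[pfn] = (unsigned char)val;
	return val;
}

static void inc_overlap(unsigned long pfn)
{
	int val = set_overlap(pfn, read_overlap(pfn) + 1);

	if (val == 0)
		fprintf(stderr, "exceeded %d overlapping mappings of pfn %lx\n",
			MAX_OVERLAP, pfn);
}

static int dec_overlap(unsigned long pfn)
{
	return set_overlap(pfn, read_overlap(pfn) - 1);
}

int main(void)
{
	unsigned long pfn = 5;

	inc_overlap(pfn);                 /* a second mapping of the same pfn */
	inc_overlap(pfn);                 /* and a third */
	if (dec_overlap(pfn) == 0)
		printf("pfn %lx idle\n", pfn);
	else
		printf("pfn %lx still has %d extra mapping(s)\n",
		       pfn, read_overlap(pfn));
	return 0;
}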
@@ -411,10 +558,21 @@ static void add_dma_entry(struct dma_debug_entry *entry) | |||
411 | { | 558 | { |
412 | struct hash_bucket *bucket; | 559 | struct hash_bucket *bucket; |
413 | unsigned long flags; | 560 | unsigned long flags; |
561 | int rc; | ||
414 | 562 | ||
415 | bucket = get_hash_bucket(entry, &flags); | 563 | bucket = get_hash_bucket(entry, &flags); |
416 | hash_bucket_add(bucket, entry); | 564 | hash_bucket_add(bucket, entry); |
417 | put_hash_bucket(bucket, &flags); | 565 | put_hash_bucket(bucket, &flags); |
566 | |||
567 | rc = active_pfn_insert(entry); | ||
568 | if (rc == -ENOMEM) { | ||
569 | pr_err("DMA-API: pfn tracking ENOMEM, dma-debug disabled\n"); | ||
570 | global_disable = true; | ||
571 | } | ||
572 | |||
573 | /* TODO: report -EEXIST errors here as overlapping mappings are | ||
574 | * not supported by the DMA API | ||
575 | */ | ||
418 | } | 576 | } |
419 | 577 | ||
420 | static struct dma_debug_entry *__dma_entry_alloc(void) | 578 | static struct dma_debug_entry *__dma_entry_alloc(void) |
@@ -469,6 +627,8 @@ static void dma_entry_free(struct dma_debug_entry *entry) | |||
469 | { | 627 | { |
470 | unsigned long flags; | 628 | unsigned long flags; |
471 | 629 | ||
630 | active_pfn_remove(entry); | ||
631 | |||
472 | /* | 632 | /* |
473 | * add to beginning of the list - this way the entries are | 633 | * add to beginning of the list - this way the entries are |
474 | * more likely cache hot when they are reallocated. | 634 | * more likely cache hot when they are reallocated. |
@@ -895,15 +1055,15 @@ static void check_unmap(struct dma_debug_entry *ref) | |||
895 | ref->dev_addr, ref->size, | 1055 | ref->dev_addr, ref->size, |
896 | type2name[entry->type], type2name[ref->type]); | 1056 | type2name[entry->type], type2name[ref->type]); |
897 | } else if ((entry->type == dma_debug_coherent) && | 1057 | } else if ((entry->type == dma_debug_coherent) && |
898 | (ref->paddr != entry->paddr)) { | 1058 | (phys_addr(ref) != phys_addr(entry))) { |
899 | err_printk(ref->dev, entry, "DMA-API: device driver frees " | 1059 | err_printk(ref->dev, entry, "DMA-API: device driver frees " |
900 | "DMA memory with different CPU address " | 1060 | "DMA memory with different CPU address " |
901 | "[device address=0x%016llx] [size=%llu bytes] " | 1061 | "[device address=0x%016llx] [size=%llu bytes] " |
902 | "[cpu alloc address=0x%016llx] " | 1062 | "[cpu alloc address=0x%016llx] " |
903 | "[cpu free address=0x%016llx]", | 1063 | "[cpu free address=0x%016llx]", |
904 | ref->dev_addr, ref->size, | 1064 | ref->dev_addr, ref->size, |
905 | (unsigned long long)entry->paddr, | 1065 | phys_addr(entry), |
906 | (unsigned long long)ref->paddr); | 1066 | phys_addr(ref)); |
907 | } | 1067 | } |
908 | 1068 | ||
909 | if (ref->sg_call_ents && ref->type == dma_debug_sg && | 1069 | if (ref->sg_call_ents && ref->type == dma_debug_sg && |
@@ -1052,7 +1212,8 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, | |||
1052 | 1212 | ||
1053 | entry->dev = dev; | 1213 | entry->dev = dev; |
1054 | entry->type = dma_debug_page; | 1214 | entry->type = dma_debug_page; |
1055 | entry->paddr = page_to_phys(page) + offset; | 1215 | entry->pfn = page_to_pfn(page); |
1216 | entry->offset = offset, | ||
1056 | entry->dev_addr = dma_addr; | 1217 | entry->dev_addr = dma_addr; |
1057 | entry->size = size; | 1218 | entry->size = size; |
1058 | entry->direction = direction; | 1219 | entry->direction = direction; |
@@ -1148,7 +1309,8 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, | |||
1148 | 1309 | ||
1149 | entry->type = dma_debug_sg; | 1310 | entry->type = dma_debug_sg; |
1150 | entry->dev = dev; | 1311 | entry->dev = dev; |
1151 | entry->paddr = sg_phys(s); | 1312 | entry->pfn = page_to_pfn(sg_page(s)); |
1313 | entry->offset = s->offset, | ||
1152 | entry->size = sg_dma_len(s); | 1314 | entry->size = sg_dma_len(s); |
1153 | entry->dev_addr = sg_dma_address(s); | 1315 | entry->dev_addr = sg_dma_address(s); |
1154 | entry->direction = direction; | 1316 | entry->direction = direction; |
@@ -1198,7 +1360,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
1198 | struct dma_debug_entry ref = { | 1360 | struct dma_debug_entry ref = { |
1199 | .type = dma_debug_sg, | 1361 | .type = dma_debug_sg, |
1200 | .dev = dev, | 1362 | .dev = dev, |
1201 | .paddr = sg_phys(s), | 1363 | .pfn = page_to_pfn(sg_page(s)), |
1364 | .offset = s->offset, | ||
1202 | .dev_addr = sg_dma_address(s), | 1365 | .dev_addr = sg_dma_address(s), |
1203 | .size = sg_dma_len(s), | 1366 | .size = sg_dma_len(s), |
1204 | .direction = dir, | 1367 | .direction = dir, |
@@ -1233,7 +1396,8 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, | |||
1233 | 1396 | ||
1234 | entry->type = dma_debug_coherent; | 1397 | entry->type = dma_debug_coherent; |
1235 | entry->dev = dev; | 1398 | entry->dev = dev; |
1236 | entry->paddr = virt_to_phys(virt); | 1399 | entry->pfn = page_to_pfn(virt_to_page(virt)); |
1400 | entry->offset = (size_t) virt & PAGE_MASK; | ||
1237 | entry->size = size; | 1401 | entry->size = size; |
1238 | entry->dev_addr = dma_addr; | 1402 | entry->dev_addr = dma_addr; |
1239 | entry->direction = DMA_BIDIRECTIONAL; | 1403 | entry->direction = DMA_BIDIRECTIONAL; |
@@ -1248,7 +1412,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size, | |||
1248 | struct dma_debug_entry ref = { | 1412 | struct dma_debug_entry ref = { |
1249 | .type = dma_debug_coherent, | 1413 | .type = dma_debug_coherent, |
1250 | .dev = dev, | 1414 | .dev = dev, |
1251 | .paddr = virt_to_phys(virt), | 1415 | .pfn = page_to_pfn(virt_to_page(virt)), |
1416 | .offset = (size_t) virt & PAGE_MASK, | ||
1252 | .dev_addr = addr, | 1417 | .dev_addr = addr, |
1253 | .size = size, | 1418 | .size = size, |
1254 | .direction = DMA_BIDIRECTIONAL, | 1419 | .direction = DMA_BIDIRECTIONAL, |
@@ -1356,7 +1521,8 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, | |||
1356 | struct dma_debug_entry ref = { | 1521 | struct dma_debug_entry ref = { |
1357 | .type = dma_debug_sg, | 1522 | .type = dma_debug_sg, |
1358 | .dev = dev, | 1523 | .dev = dev, |
1359 | .paddr = sg_phys(s), | 1524 | .pfn = page_to_pfn(sg_page(s)), |
1525 | .offset = s->offset, | ||
1360 | .dev_addr = sg_dma_address(s), | 1526 | .dev_addr = sg_dma_address(s), |
1361 | .size = sg_dma_len(s), | 1527 | .size = sg_dma_len(s), |
1362 | .direction = direction, | 1528 | .direction = direction, |
@@ -1388,7 +1554,8 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, | |||
1388 | struct dma_debug_entry ref = { | 1554 | struct dma_debug_entry ref = { |
1389 | .type = dma_debug_sg, | 1555 | .type = dma_debug_sg, |
1390 | .dev = dev, | 1556 | .dev = dev, |
1391 | .paddr = sg_phys(s), | 1557 | .pfn = page_to_pfn(sg_page(s)), |
1558 | .offset = s->offset, | ||
1392 | .dev_addr = sg_dma_address(s), | 1559 | .dev_addr = sg_dma_address(s), |
1393 | .size = sg_dma_len(s), | 1560 | .size = sg_dma_len(s), |
1394 | .direction = direction, | 1561 | .direction = direction, |
diff --git a/lib/show_mem.c b/lib/show_mem.c index 5847a4921b8e..09225796991a 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c | |||
@@ -17,9 +17,6 @@ void show_mem(unsigned int filter) | |||
17 | printk("Mem-Info:\n"); | 17 | printk("Mem-Info:\n"); |
18 | show_free_areas(filter); | 18 | show_free_areas(filter); |
19 | 19 | ||
20 | if (filter & SHOW_MEM_FILTER_PAGE_COUNT) | ||
21 | return; | ||
22 | |||
23 | for_each_online_pgdat(pgdat) { | 20 | for_each_online_pgdat(pgdat) { |
24 | unsigned long flags; | 21 | unsigned long flags; |
25 | int zoneid; | 22 | int zoneid; |
@@ -46,4 +43,7 @@ void show_mem(unsigned int filter) | |||
46 | printk("%lu pages in pagetable cache\n", | 43 | printk("%lu pages in pagetable cache\n", |
47 | quicklist_total_size()); | 44 | quicklist_total_size()); |
48 | #endif | 45 | #endif |
46 | #ifdef CONFIG_MEMORY_FAILURE | ||
47 | printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); | ||
48 | #endif | ||
49 | } | 49 | } |
diff --git a/lib/swiotlb.c b/lib/swiotlb.c index e4399fa65ad6..615f3de4b5ce 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c | |||
@@ -172,8 +172,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) | |||
172 | /* | 172 | /* |
173 | * Get the overflow emergency buffer | 173 | * Get the overflow emergency buffer |
174 | */ | 174 | */ |
175 | v_overflow_buffer = alloc_bootmem_low_pages_nopanic( | 175 | v_overflow_buffer = memblock_virt_alloc_nopanic( |
176 | PAGE_ALIGN(io_tlb_overflow)); | 176 | PAGE_ALIGN(io_tlb_overflow), |
177 | PAGE_SIZE); | ||
177 | if (!v_overflow_buffer) | 178 | if (!v_overflow_buffer) |
178 | return -ENOMEM; | 179 | return -ENOMEM; |
179 | 180 | ||
@@ -184,11 +185,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) | |||
184 | * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE | 185 | * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE |
185 | * between io_tlb_start and io_tlb_end. | 186 | * between io_tlb_start and io_tlb_end. |
186 | */ | 187 | */ |
187 | io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); | 188 | io_tlb_list = memblock_virt_alloc( |
189 | PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), | ||
190 | PAGE_SIZE); | ||
188 | for (i = 0; i < io_tlb_nslabs; i++) | 191 | for (i = 0; i < io_tlb_nslabs; i++) |
189 | io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); | 192 | io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); |
190 | io_tlb_index = 0; | 193 | io_tlb_index = 0; |
191 | io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); | 194 | io_tlb_orig_addr = memblock_virt_alloc( |
195 | PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), | ||
196 | PAGE_SIZE); | ||
192 | 197 | ||
193 | if (verbose) | 198 | if (verbose) |
194 | swiotlb_print_info(); | 199 | swiotlb_print_info(); |
@@ -215,13 +220,13 @@ swiotlb_init(int verbose) | |||
215 | bytes = io_tlb_nslabs << IO_TLB_SHIFT; | 220 | bytes = io_tlb_nslabs << IO_TLB_SHIFT; |
216 | 221 | ||
217 | /* Get IO TLB memory from the low pages */ | 222 | /* Get IO TLB memory from the low pages */ |
218 | vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes)); | 223 | vstart = memblock_virt_alloc_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); |
219 | if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) | 224 | if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) |
220 | return; | 225 | return; |
221 | 226 | ||
222 | if (io_tlb_start) | 227 | if (io_tlb_start) |
223 | free_bootmem(io_tlb_start, | 228 | memblock_free_early(io_tlb_start, |
224 | PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); | 229 | PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); |
225 | pr_warn("Cannot allocate SWIOTLB buffer"); | 230 | pr_warn("Cannot allocate SWIOTLB buffer"); |
226 | no_iotlb_memory = true; | 231 | no_iotlb_memory = true; |
227 | } | 232 | } |
@@ -357,14 +362,14 @@ void __init swiotlb_free(void) | |||
357 | free_pages((unsigned long)phys_to_virt(io_tlb_start), | 362 | free_pages((unsigned long)phys_to_virt(io_tlb_start), |
358 | get_order(io_tlb_nslabs << IO_TLB_SHIFT)); | 363 | get_order(io_tlb_nslabs << IO_TLB_SHIFT)); |
359 | } else { | 364 | } else { |
360 | free_bootmem_late(io_tlb_overflow_buffer, | 365 | memblock_free_late(io_tlb_overflow_buffer, |
361 | PAGE_ALIGN(io_tlb_overflow)); | 366 | PAGE_ALIGN(io_tlb_overflow)); |
362 | free_bootmem_late(__pa(io_tlb_orig_addr), | 367 | memblock_free_late(__pa(io_tlb_orig_addr), |
363 | PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); | 368 | PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); |
364 | free_bootmem_late(__pa(io_tlb_list), | 369 | memblock_free_late(__pa(io_tlb_list), |
365 | PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); | 370 | PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); |
366 | free_bootmem_late(io_tlb_start, | 371 | memblock_free_late(io_tlb_start, |
367 | PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); | 372 | PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); |
368 | } | 373 | } |
369 | io_tlb_nslabs = 0; | 374 | io_tlb_nslabs = 0; |
370 | } | 375 | } |
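
The swiotlb changes are a straight allocator conversion; the logic around the calls is untouched. A rough mapping for reading the hunks above (illustrative summary, not part of the patch; the new helpers themselves are introduced in the mm/memblock.c hunks further down):

    /* old bootmem call                        new memblock-backed call
     *
     * alloc_bootmem_low_pages_nopanic(size)   memblock_virt_alloc_nopanic(size, PAGE_SIZE)
     * alloc_bootmem_pages(size)               memblock_virt_alloc(size, PAGE_SIZE)
     * free_bootmem(pa, size)                  memblock_free_early(pa, size)
     * free_bootmem_late(pa, size)             memblock_free_late(pa, size)
     *
     * The added PAGE_SIZE argument is an explicit alignment; the bootmem
     * "*_pages" variants implied page alignment, so behaviour is preserved.
     */
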
diff --git a/mm/compaction.c b/mm/compaction.c index f58bcd016f43..3a91a2ea3d34 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
459 | unsigned long flags; | 459 | unsigned long flags; |
460 | bool locked = false; | 460 | bool locked = false; |
461 | struct page *page = NULL, *valid_page = NULL; | 461 | struct page *page = NULL, *valid_page = NULL; |
462 | bool skipped_async_unsuitable = false; | ||
462 | 463 | ||
463 | /* | 464 | /* |
464 | * Ensure that there are not too many pages isolated from the LRU | 465 | * Ensure that there are not too many pages isolated from the LRU |
@@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
534 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 535 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
535 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 536 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
536 | cc->finished_update_migrate = true; | 537 | cc->finished_update_migrate = true; |
538 | skipped_async_unsuitable = true; | ||
537 | goto next_pageblock; | 539 | goto next_pageblock; |
538 | } | 540 | } |
539 | 541 | ||
@@ -627,8 +629,13 @@ next_pageblock: | |||
627 | if (locked) | 629 | if (locked) |
628 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 630 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
629 | 631 | ||
630 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 632 | /* |
631 | if (low_pfn == end_pfn) | 633 | * Update the pageblock-skip information and cached scanner pfn, |
634 | * if the whole pageblock was scanned without isolating any page. | ||
635 | * This is not done when the pageblock was skipped due to being unsuitable | ||
636 | * for async compaction, so that eventual sync compaction can try. | ||
637 | */ | ||
638 | if (low_pfn == end_pfn && !skipped_async_unsuitable) | ||
632 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 639 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
633 | 640 | ||
634 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 641 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
@@ -660,7 +667,7 @@ static void isolate_freepages(struct zone *zone, | |||
660 | * is the end of the pageblock the migration scanner is using. | 667 | * is the end of the pageblock the migration scanner is using. |
661 | */ | 668 | */ |
662 | pfn = cc->free_pfn; | 669 | pfn = cc->free_pfn; |
663 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | 670 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); |
664 | 671 | ||
665 | /* | 672 | /* |
666 | * Take care that if the migration scanner is at the end of the zone | 673 | * Take care that if the migration scanner is at the end of the zone |
@@ -676,7 +683,7 @@ static void isolate_freepages(struct zone *zone, | |||
676 | * pages on cc->migratepages. We stop searching if the migrate | 683 | * pages on cc->migratepages. We stop searching if the migrate |
677 | * and free page scanners meet or enough free pages are isolated. | 684 | * and free page scanners meet or enough free pages are isolated. |
678 | */ | 685 | */ |
679 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | 686 | for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
680 | pfn -= pageblock_nr_pages) { | 687 | pfn -= pageblock_nr_pages) { |
681 | unsigned long isolated; | 688 | unsigned long isolated; |
682 | 689 | ||
@@ -738,7 +745,14 @@ static void isolate_freepages(struct zone *zone, | |||
738 | /* split_free_page does not map the pages */ | 745 | /* split_free_page does not map the pages */ |
739 | map_pages(freelist); | 746 | map_pages(freelist); |
740 | 747 | ||
741 | cc->free_pfn = high_pfn; | 748 | /* |
749 | * If we crossed the migrate scanner, we want to keep it that way | ||
750 | * so that compact_finished() may detect this | ||
751 | */ | ||
752 | if (pfn < low_pfn) | ||
753 | cc->free_pfn = max(pfn, zone->zone_start_pfn); | ||
754 | else | ||
755 | cc->free_pfn = high_pfn; | ||
742 | cc->nr_freepages = nr_freepages; | 756 | cc->nr_freepages = nr_freepages; |
743 | } | 757 | } |
744 | 758 | ||
@@ -837,6 +851,10 @@ static int compact_finished(struct zone *zone, | |||
837 | 851 | ||
838 | /* Compaction run completes if the migrate and free scanner meet */ | 852 | /* Compaction run completes if the migrate and free scanner meet */ |
839 | if (cc->free_pfn <= cc->migrate_pfn) { | 853 | if (cc->free_pfn <= cc->migrate_pfn) { |
854 | /* Let the next compaction start anew. */ | ||
855 | zone->compact_cached_migrate_pfn = zone->zone_start_pfn; | ||
856 | zone->compact_cached_free_pfn = zone_end_pfn(zone); | ||
857 | |||
840 | /* | 858 | /* |
841 | * Mark that the PG_migrate_skip information should be cleared | 859 | * Mark that the PG_migrate_skip information should be cleared |
842 | * by kswapd when it goes to sleep. kswapd does not set the | 860 | * by kswapd when it goes to sleep. kswapd does not set the |
@@ -947,6 +965,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
947 | } | 965 | } |
948 | 966 | ||
949 | /* | 967 | /* |
968 | * Clear pageblock skip if there were failures recently and compaction | ||
969 | * is about to be retried after being deferred. kswapd does not do | ||
970 | * this reset as it'll reset the cached information when going to sleep. | ||
971 | */ | ||
972 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
973 | __reset_isolation_suitable(zone); | ||
974 | |||
975 | /* | ||
950 | * Setup to move all movable pages to the end of the zone. Used cached | 976 | * Setup to move all movable pages to the end of the zone. Used cached |
951 | * information on where the scanners should start but check that it | 977 | * information on where the scanners should start but check that it |
952 | * is initialised by ensuring the values are within zone boundaries. | 978 | * is initialised by ensuring the values are within zone boundaries. |
@@ -962,13 +988,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
962 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | 988 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; |
963 | } | 989 | } |
964 | 990 | ||
965 | /* | 991 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); |
966 | * Clear pageblock skip if there were failures recently and compaction | ||
967 | * is about to be retried after being deferred. kswapd does not do | ||
968 | * this reset as it'll reset the cached information when going to sleep. | ||
969 | */ | ||
970 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
971 | __reset_isolation_suitable(zone); | ||
972 | 992 | ||
973 | migrate_prep_local(); | 993 | migrate_prep_local(); |
974 | 994 | ||
@@ -1003,7 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1003 | if (err) { | 1023 | if (err) { |
1004 | putback_movable_pages(&cc->migratepages); | 1024 | putback_movable_pages(&cc->migratepages); |
1005 | cc->nr_migratepages = 0; | 1025 | cc->nr_migratepages = 0; |
1006 | if (err == -ENOMEM) { | 1026 | /* |
1027 | * migrate_pages() may return -ENOMEM when scanners meet | ||
1028 | * and we want compact_finished() to detect it | ||
1029 | */ | ||
1030 | if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { | ||
1007 | ret = COMPACT_PARTIAL; | 1031 | ret = COMPACT_PARTIAL; |
1008 | goto out; | 1032 | goto out; |
1009 | } | 1033 | } |
@@ -1015,6 +1039,8 @@ out: | |||
1015 | cc->nr_freepages -= release_freepages(&cc->freepages); | 1039 | cc->nr_freepages -= release_freepages(&cc->freepages); |
1016 | VM_BUG_ON(cc->nr_freepages != 0); | 1040 | VM_BUG_ON(cc->nr_freepages != 0); |
1017 | 1041 | ||
1042 | trace_mm_compaction_end(ret); | ||
1043 | |||
1018 | return ret; | 1044 | return ret; |
1019 | } | 1045 | } |
1020 | 1046 | ||
@@ -1120,12 +1146,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1120 | compact_zone(zone, cc); | 1146 | compact_zone(zone, cc); |
1121 | 1147 | ||
1122 | if (cc->order > 0) { | 1148 | if (cc->order > 0) { |
1123 | int ok = zone_watermark_ok(zone, cc->order, | 1149 | if (zone_watermark_ok(zone, cc->order, |
1124 | low_wmark_pages(zone), 0, 0); | 1150 | low_wmark_pages(zone), 0, 0)) |
1125 | if (ok && cc->order >= zone->compact_order_failed) | 1151 | compaction_defer_reset(zone, cc->order, false); |
1126 | zone->compact_order_failed = cc->order + 1; | ||
1127 | /* Currently async compaction is never deferred. */ | 1152 | /* Currently async compaction is never deferred. */ |
1128 | else if (!ok && cc->sync) | 1153 | else if (cc->sync) |
1129 | defer_compaction(zone, cc->order); | 1154 | defer_compaction(zone, cc->order); |
1130 | } | 1155 | } |
1131 | 1156 | ||
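
Two of the compaction changes above work together: the free scanner's lower bound is now pageblock aligned and the loop condition uses >=, which is what allows the two scanners to actually meet so that compact_finished() can reset the cached positions and the new tracepoints bracket the run. A worked example with illustrative numbers:

    /* Assume pageblock_nr_pages == 512 and cc->migrate_pfn == 1100 (mid-pageblock).
     *
     *   old:  low_pfn = 1100 + 512           = 1612   (not a pageblock boundary)
     *   new:  low_pfn = ALIGN(1100 + 1, 512) = 1536   (start of the next pageblock)
     *
     * With "pfn >= low_pfn" the free scanner may now step down to pfn == 1536,
     * the first whole pageblock above the migration scanner, instead of stopping
     * one unaligned pageblock short.  If it steps below low_pfn it has crossed
     * the migration scanner, and the new code keeps cc->free_pfn at that crossed
     * position (clamped to zone_start_pfn) so compact_finished() sees
     * free_pfn <= migrate_pfn.
     */
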
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dee6cf4e6d34..04306b9de90d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
690 | */ | 690 | */ |
691 | int PageHuge(struct page *page) | 691 | int PageHuge(struct page *page) |
692 | { | 692 | { |
693 | compound_page_dtor *dtor; | ||
694 | |||
695 | if (!PageCompound(page)) | 693 | if (!PageCompound(page)) |
696 | return 0; | 694 | return 0; |
697 | 695 | ||
698 | page = compound_head(page); | 696 | page = compound_head(page); |
699 | dtor = get_compound_page_dtor(page); | 697 | return get_compound_page_dtor(page) == free_huge_page; |
700 | |||
701 | return dtor == free_huge_page; | ||
702 | } | 698 | } |
703 | EXPORT_SYMBOL_GPL(PageHuge); | 699 | EXPORT_SYMBOL_GPL(PageHuge); |
704 | 700 | ||
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge); | |||
708 | */ | 704 | */ |
709 | int PageHeadHuge(struct page *page_head) | 705 | int PageHeadHuge(struct page *page_head) |
710 | { | 706 | { |
711 | compound_page_dtor *dtor; | ||
712 | |||
713 | if (!PageHead(page_head)) | 707 | if (!PageHead(page_head)) |
714 | return 0; | 708 | return 0; |
715 | 709 | ||
716 | dtor = get_compound_page_dtor(page_head); | 710 | return get_compound_page_dtor(page_head) == free_huge_page; |
717 | |||
718 | return dtor == free_huge_page; | ||
719 | } | 711 | } |
720 | EXPORT_SYMBOL_GPL(PageHeadHuge); | ||
721 | 712 | ||
722 | pgoff_t __basepage_index(struct page *page) | 713 | pgoff_t __basepage_index(struct page *page) |
723 | { | 714 | { |
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1280 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { | 1271 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { |
1281 | void *addr; | 1272 | void *addr; |
1282 | 1273 | ||
1283 | addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), | 1274 | addr = memblock_virt_alloc_try_nid_nopanic( |
1284 | huge_page_size(h), huge_page_size(h), 0); | 1275 | huge_page_size(h), huge_page_size(h), |
1285 | 1276 | 0, BOOTMEM_ALLOC_ACCESSIBLE, node); | |
1286 | if (addr) { | 1277 | if (addr) { |
1287 | /* | 1278 | /* |
1288 | * Use the beginning of the huge page to store the | 1279 | * Use the beginning of the huge page to store the |
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void) | |||
1322 | 1313 | ||
1323 | #ifdef CONFIG_HIGHMEM | 1314 | #ifdef CONFIG_HIGHMEM |
1324 | page = pfn_to_page(m->phys >> PAGE_SHIFT); | 1315 | page = pfn_to_page(m->phys >> PAGE_SHIFT); |
1325 | free_bootmem_late((unsigned long)m, | 1316 | memblock_free_late(__pa(m), |
1326 | sizeof(struct huge_bootmem_page)); | 1317 | sizeof(struct huge_bootmem_page)); |
1327 | #else | 1318 | #else |
1328 | page = virt_to_page(m); | 1319 | page = virt_to_page(m); |
1329 | #endif | 1320 | #endif |
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2355 | int cow; | 2346 | int cow; |
2356 | struct hstate *h = hstate_vma(vma); | 2347 | struct hstate *h = hstate_vma(vma); |
2357 | unsigned long sz = huge_page_size(h); | 2348 | unsigned long sz = huge_page_size(h); |
2349 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2350 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2351 | int ret = 0; | ||
2358 | 2352 | ||
2359 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 2353 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
2360 | 2354 | ||
2355 | mmun_start = vma->vm_start; | ||
2356 | mmun_end = vma->vm_end; | ||
2357 | if (cow) | ||
2358 | mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); | ||
2359 | |||
2361 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { | 2360 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
2362 | spinlock_t *src_ptl, *dst_ptl; | 2361 | spinlock_t *src_ptl, *dst_ptl; |
2363 | src_pte = huge_pte_offset(src, addr); | 2362 | src_pte = huge_pte_offset(src, addr); |
2364 | if (!src_pte) | 2363 | if (!src_pte) |
2365 | continue; | 2364 | continue; |
2366 | dst_pte = huge_pte_alloc(dst, addr, sz); | 2365 | dst_pte = huge_pte_alloc(dst, addr, sz); |
2367 | if (!dst_pte) | 2366 | if (!dst_pte) { |
2368 | goto nomem; | 2367 | ret = -ENOMEM; |
2368 | break; | ||
2369 | } | ||
2369 | 2370 | ||
2370 | /* If the pagetables are shared don't copy or take references */ | 2371 | /* If the pagetables are shared don't copy or take references */ |
2371 | if (dst_pte == src_pte) | 2372 | if (dst_pte == src_pte) |
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2386 | spin_unlock(src_ptl); | 2387 | spin_unlock(src_ptl); |
2387 | spin_unlock(dst_ptl); | 2388 | spin_unlock(dst_ptl); |
2388 | } | 2389 | } |
2389 | return 0; | ||
2390 | 2390 | ||
2391 | nomem: | 2391 | if (cow) |
2392 | return -ENOMEM; | 2392 | mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); |
2393 | |||
2394 | return ret; | ||
2393 | } | 2395 | } |
2394 | 2396 | ||
2395 | static int is_hugetlb_entry_migration(pte_t pte) | 2397 | static int is_hugetlb_entry_migration(pte_t pte) |
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3079 | same_page: | 3081 | same_page: |
3080 | if (pages) { | 3082 | if (pages) { |
3081 | pages[i] = mem_map_offset(page, pfn_offset); | 3083 | pages[i] = mem_map_offset(page, pfn_offset); |
3082 | get_page(pages[i]); | 3084 | get_page_foll(pages[i]); |
3083 | } | 3085 | } |
3084 | 3086 | ||
3085 | if (vmas) | 3087 | if (vmas) |
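
The largest hugetlb change converts copy_hugetlb_page_range() from an early "nomem" return to a single exit path, so that the new mmu_notifier calls always pair up even on allocation failure. The shape of the pattern, as a simplified sketch (copy_one_huge_pte() is a hypothetical stand-in for the loop body shown in the hunk):

    mmun_start = vma->vm_start;
    mmun_end   = vma->vm_end;
    if (cow)
            mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);

    ret = 0;
    for (addr = mmun_start; addr < mmun_end; addr += sz) {
            if (copy_one_huge_pte(dst, src, vma, addr) < 0) {
                    ret = -ENOMEM;
                    break;          /* still falls through to the _end call */
            }
    }

    if (cow)
            mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
    return ret;
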
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 4c84678371eb..95487c71cad5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) | |||
55 | return 0; | 55 | return 0; |
56 | 56 | ||
57 | inject: | 57 | inject: |
58 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | 58 | pr_info("Injecting memory failure at pfn %#lx\n", pfn); |
59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); | 59 | return memory_failure(pfn, 18, MF_COUNT_INCREASED); |
60 | } | 60 | } |
61 | 61 | ||
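
The hwpoison-inject change is cosmetic: printk(KERN_INFO ...) becomes the pr_info() shorthand, and the '#' flag in the format string adds a 0x prefix so the pfn is unambiguously hexadecimal. For example, with pfn == 0x12345:

    pr_info("Injecting memory failure at pfn %lx\n", pfn);   /* "... pfn 12345"   */
    pr_info("Injecting memory failure at pfn %#lx\n", pfn);  /* "... pfn 0x12345" */
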
diff --git a/mm/internal.h b/mm/internal.h index 684f7aa9692a..a346ba120e42 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -47,11 +47,9 @@ static inline void __get_page_tail_foll(struct page *page, | |||
47 | * page_cache_get_speculative()) on tail pages. | 47 | * page_cache_get_speculative()) on tail pages. |
48 | */ | 48 | */ |
49 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | 49 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); |
50 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
51 | VM_BUG_ON(page_mapcount(page) < 0); | ||
52 | if (get_page_head) | 50 | if (get_page_head) |
53 | atomic_inc(&page->first_page->_count); | 51 | atomic_inc(&page->first_page->_count); |
54 | atomic_inc(&page->_mapcount); | 52 | get_huge_page_tail(page); |
55 | } | 53 | } |
56 | 54 | ||
57 | /* | 55 | /* |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page, | |||
1891 | return new_page; | 1891 | return new_page; |
1892 | } | 1892 | } |
1893 | 1893 | ||
1894 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | 1894 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) |
1895 | unsigned long *vm_flags) | ||
1896 | { | 1895 | { |
1897 | struct stable_node *stable_node; | 1896 | struct stable_node *stable_node; |
1898 | struct rmap_item *rmap_item; | 1897 | struct rmap_item *rmap_item; |
1899 | unsigned int mapcount = page_mapcount(page); | 1898 | int ret = SWAP_AGAIN; |
1900 | int referenced = 0; | ||
1901 | int search_new_forks = 0; | 1899 | int search_new_forks = 0; |
1902 | 1900 | ||
1903 | VM_BUG_ON(!PageKsm(page)); | 1901 | VM_BUG_ON(!PageKsm(page)); |
1902 | |||
1903 | /* | ||
1904 | * Rely on the page lock to protect against concurrent modifications | ||
1905 | * to that page's node of the stable tree. | ||
1906 | */ | ||
1904 | VM_BUG_ON(!PageLocked(page)); | 1907 | VM_BUG_ON(!PageLocked(page)); |
1905 | 1908 | ||
1906 | stable_node = page_stable_node(page); | 1909 | stable_node = page_stable_node(page); |
1907 | if (!stable_node) | 1910 | if (!stable_node) |
1908 | return 0; | 1911 | return ret; |
1909 | again: | 1912 | again: |
1910 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | 1913 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
1911 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1914 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
@@ -1928,113 +1931,16 @@ again: | |||
1928 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | 1931 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) |
1929 | continue; | 1932 | continue; |
1930 | 1933 | ||
1931 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | 1934 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
1932 | continue; | ||
1933 | |||
1934 | referenced += page_referenced_one(page, vma, | ||
1935 | rmap_item->address, &mapcount, vm_flags); | ||
1936 | if (!search_new_forks || !mapcount) | ||
1937 | break; | ||
1938 | } | ||
1939 | anon_vma_unlock_read(anon_vma); | ||
1940 | if (!mapcount) | ||
1941 | goto out; | ||
1942 | } | ||
1943 | if (!search_new_forks++) | ||
1944 | goto again; | ||
1945 | out: | ||
1946 | return referenced; | ||
1947 | } | ||
1948 | |||
1949 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
1950 | { | ||
1951 | struct stable_node *stable_node; | ||
1952 | struct rmap_item *rmap_item; | ||
1953 | int ret = SWAP_AGAIN; | ||
1954 | int search_new_forks = 0; | ||
1955 | |||
1956 | VM_BUG_ON(!PageKsm(page)); | ||
1957 | VM_BUG_ON(!PageLocked(page)); | ||
1958 | |||
1959 | stable_node = page_stable_node(page); | ||
1960 | if (!stable_node) | ||
1961 | return SWAP_FAIL; | ||
1962 | again: | ||
1963 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | ||
1964 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
1965 | struct anon_vma_chain *vmac; | ||
1966 | struct vm_area_struct *vma; | ||
1967 | |||
1968 | anon_vma_lock_read(anon_vma); | ||
1969 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | ||
1970 | 0, ULONG_MAX) { | ||
1971 | vma = vmac->vma; | ||
1972 | if (rmap_item->address < vma->vm_start || | ||
1973 | rmap_item->address >= vma->vm_end) | ||
1974 | continue; | ||
1975 | /* | ||
1976 | * Initially we examine only the vma which covers this | ||
1977 | * rmap_item; but later, if there is still work to do, | ||
1978 | * we examine covering vmas in other mms: in case they | ||
1979 | * were forked from the original since ksmd passed. | ||
1980 | */ | ||
1981 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
1982 | continue; | 1935 | continue; |
1983 | 1936 | ||
1984 | ret = try_to_unmap_one(page, vma, | 1937 | ret = rwc->rmap_one(page, vma, |
1985 | rmap_item->address, flags); | 1938 | rmap_item->address, rwc->arg); |
1986 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | 1939 | if (ret != SWAP_AGAIN) { |
1987 | anon_vma_unlock_read(anon_vma); | 1940 | anon_vma_unlock_read(anon_vma); |
1988 | goto out; | 1941 | goto out; |
1989 | } | 1942 | } |
1990 | } | 1943 | if (rwc->done && rwc->done(page)) { |
1991 | anon_vma_unlock_read(anon_vma); | ||
1992 | } | ||
1993 | if (!search_new_forks++) | ||
1994 | goto again; | ||
1995 | out: | ||
1996 | return ret; | ||
1997 | } | ||
1998 | |||
1999 | #ifdef CONFIG_MIGRATION | ||
2000 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
2001 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
2002 | { | ||
2003 | struct stable_node *stable_node; | ||
2004 | struct rmap_item *rmap_item; | ||
2005 | int ret = SWAP_AGAIN; | ||
2006 | int search_new_forks = 0; | ||
2007 | |||
2008 | VM_BUG_ON(!PageKsm(page)); | ||
2009 | VM_BUG_ON(!PageLocked(page)); | ||
2010 | |||
2011 | stable_node = page_stable_node(page); | ||
2012 | if (!stable_node) | ||
2013 | return ret; | ||
2014 | again: | ||
2015 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | ||
2016 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
2017 | struct anon_vma_chain *vmac; | ||
2018 | struct vm_area_struct *vma; | ||
2019 | |||
2020 | anon_vma_lock_read(anon_vma); | ||
2021 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | ||
2022 | 0, ULONG_MAX) { | ||
2023 | vma = vmac->vma; | ||
2024 | if (rmap_item->address < vma->vm_start || | ||
2025 | rmap_item->address >= vma->vm_end) | ||
2026 | continue; | ||
2027 | /* | ||
2028 | * Initially we examine only the vma which covers this | ||
2029 | * rmap_item; but later, if there is still work to do, | ||
2030 | * we examine covering vmas in other mms: in case they | ||
2031 | * were forked from the original since ksmd passed. | ||
2032 | */ | ||
2033 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
2034 | continue; | ||
2035 | |||
2036 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
2037 | if (ret != SWAP_AGAIN) { | ||
2038 | anon_vma_unlock_read(anon_vma); | 1944 | anon_vma_unlock_read(anon_vma); |
2039 | goto out; | 1945 | goto out; |
2040 | } | 1946 | } |
@@ -2047,6 +1953,7 @@ out: | |||
2047 | return ret; | 1953 | return ret; |
2048 | } | 1954 | } |
2049 | 1955 | ||
1956 | #ifdef CONFIG_MIGRATION | ||
2050 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | 1957 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) |
2051 | { | 1958 | { |
2052 | struct stable_node *stable_node; | 1959 | struct stable_node *stable_node; |
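
The ksm.c hunk above folds the old page_referenced_ksm() and try_to_unmap_ksm() walkers into a single rmap_walk_ksm() driven by callbacks. A sketch of how a caller supplies them, using only the rmap_walk_control members visible in this hunk (rmap_one, done, invalid_vma, arg); the struct definition and the generic rmap_walk() entry point come from the rmap patches elsewhere in this series, so the exact field types here are assumptions:

    struct walk_state {                     /* hypothetical per-walk bookkeeping */
            int referenced;
    };

    static int count_one(struct page *page, struct vm_area_struct *vma,
                         unsigned long address, void *arg)
    {
            struct walk_state *ws = arg;

            ws->referenced++;               /* act on one mapping of @page */
            return SWAP_AGAIN;              /* any other value stops the walk */
    }

    static int all_done(struct page *page)
    {
            return !page_mapped(page);      /* lets the walker bail out early */
    }

    static int how_referenced(struct page *page)
    {
            struct walk_state ws = { .referenced = 0 };
            struct rmap_walk_control rwc = {
                    .rmap_one = count_one,
                    .done     = all_done,
                    .arg      = &ws,
            };

            rmap_walk_ksm(page, &rwc);      /* normally reached via rmap_walk() */
            return ws.referenced;
    }
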
diff --git a/mm/memblock.c b/mm/memblock.c index 53e477bb5558..1c2ef2c7edab 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -21,6 +21,9 @@ | |||
21 | #include <linux/memblock.h> | 21 | #include <linux/memblock.h> |
22 | 22 | ||
23 | #include <asm-generic/sections.h> | 23 | #include <asm-generic/sections.h> |
24 | #include <linux/io.h> | ||
25 | |||
26 | #include "internal.h" | ||
24 | 27 | ||
25 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 28 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
26 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 29 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = { | |||
39 | }; | 42 | }; |
40 | 43 | ||
41 | int memblock_debug __initdata_memblock; | 44 | int memblock_debug __initdata_memblock; |
45 | #ifdef CONFIG_MOVABLE_NODE | ||
46 | bool movable_node_enabled __initdata_memblock = false; | ||
47 | #endif | ||
42 | static int memblock_can_resize __initdata_memblock; | 48 | static int memblock_can_resize __initdata_memblock; |
43 | static int memblock_memory_in_slab __initdata_memblock = 0; | 49 | static int memblock_memory_in_slab __initdata_memblock = 0; |
44 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 50 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
91 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 97 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
92 | * @size: size of free area to find | 98 | * @size: size of free area to find |
93 | * @align: alignment of free area to find | 99 | * @align: alignment of free area to find |
94 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 100 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
95 | * | 101 | * |
96 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. | 102 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. |
97 | * | 103 | * |
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | |||
123 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | 129 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} |
124 | * @size: size of free area to find | 130 | * @size: size of free area to find |
125 | * @align: alignment of free area to find | 131 | * @align: alignment of free area to find |
126 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 132 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node |
127 | * | 133 | * |
128 | * Utility called from memblock_find_in_range_node(), find free area top-down. | 134 | * Utility called from memblock_find_in_range_node(), find free area top-down. |
129 | * | 135 | * |
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
154 | 160 | ||
155 | /** | 161 | /** |
156 | * memblock_find_in_range_node - find free area in given range and node | 162 | * memblock_find_in_range_node - find free area in given range and node |
157 | * @start: start of candidate range | ||
158 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
159 | * @size: size of free area to find | 163 | * @size: size of free area to find |
160 | * @align: alignment of free area to find | 164 | * @align: alignment of free area to find |
161 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | 165 | * @start: start of candidate range |
166 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
167 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
162 | * | 168 | * |
163 | * Find @size free area aligned to @align in the specified range and node. | 169 | * Find @size free area aligned to @align in the specified range and node. |
164 | * | 170 | * |
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | |||
173 | * RETURNS: | 179 | * RETURNS: |
174 | * Found address on success, 0 on failure. | 180 | * Found address on success, 0 on failure. |
175 | */ | 181 | */ |
176 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 182 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, |
177 | phys_addr_t end, phys_addr_t size, | 183 | phys_addr_t align, phys_addr_t start, |
178 | phys_addr_t align, int nid) | 184 | phys_addr_t end, int nid) |
179 | { | 185 | { |
180 | int ret; | 186 | int ret; |
181 | phys_addr_t kernel_end; | 187 | phys_addr_t kernel_end; |
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
238 | phys_addr_t end, phys_addr_t size, | 244 | phys_addr_t end, phys_addr_t size, |
239 | phys_addr_t align) | 245 | phys_addr_t align) |
240 | { | 246 | { |
241 | return memblock_find_in_range_node(start, end, size, align, | 247 | return memblock_find_in_range_node(size, align, start, end, |
242 | MAX_NUMNODES); | 248 | NUMA_NO_NODE); |
243 | } | 249 | } |
244 | 250 | ||
245 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 251 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
@@ -255,6 +261,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
255 | type->cnt = 1; | 261 | type->cnt = 1; |
256 | type->regions[0].base = 0; | 262 | type->regions[0].base = 0; |
257 | type->regions[0].size = 0; | 263 | type->regions[0].size = 0; |
264 | type->regions[0].flags = 0; | ||
258 | memblock_set_region_node(&type->regions[0], MAX_NUMNODES); | 265 | memblock_set_region_node(&type->regions[0], MAX_NUMNODES); |
259 | } | 266 | } |
260 | } | 267 | } |
@@ -265,6 +272,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( | |||
265 | if (memblock.reserved.regions == memblock_reserved_init_regions) | 272 | if (memblock.reserved.regions == memblock_reserved_init_regions) |
266 | return 0; | 273 | return 0; |
267 | 274 | ||
275 | /* | ||
276 | * Don't allow nobootmem allocator to free reserved memory regions | ||
277 | * array if | ||
278 | * - CONFIG_DEBUG_FS is enabled; | ||
279 | * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled; | ||
280 | * - reserved memory regions array have been resized during boot. | ||
281 | * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved" | ||
282 | * will show garbage instead of state of memory reservations. | ||
283 | */ | ||
284 | if (IS_ENABLED(CONFIG_DEBUG_FS) && | ||
285 | !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) | ||
286 | return 0; | ||
287 | |||
268 | *addr = __pa(memblock.reserved.regions); | 288 | *addr = __pa(memblock.reserved.regions); |
269 | 289 | ||
270 | return PAGE_ALIGN(sizeof(struct memblock_region) * | 290 | return PAGE_ALIGN(sizeof(struct memblock_region) * |
@@ -405,7 +425,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
405 | 425 | ||
406 | if (this->base + this->size != next->base || | 426 | if (this->base + this->size != next->base || |
407 | memblock_get_region_node(this) != | 427 | memblock_get_region_node(this) != |
408 | memblock_get_region_node(next)) { | 428 | memblock_get_region_node(next) || |
429 | this->flags != next->flags) { | ||
409 | BUG_ON(this->base + this->size > next->base); | 430 | BUG_ON(this->base + this->size > next->base); |
410 | i++; | 431 | i++; |
411 | continue; | 432 | continue; |
@@ -425,13 +446,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
425 | * @base: base address of the new region | 446 | * @base: base address of the new region |
426 | * @size: size of the new region | 447 | * @size: size of the new region |
427 | * @nid: node id of the new region | 448 | * @nid: node id of the new region |
449 | * @flags: flags of the new region | ||
428 | * | 450 | * |
429 | * Insert new memblock region [@base,@base+@size) into @type at @idx. | 451 | * Insert new memblock region [@base,@base+@size) into @type at @idx. |
430 | * @type must already have extra room to accommodate the new region. | 452 | * @type must already have extra room to accommodate the new region. |
431 | */ | 453 | */ |
432 | static void __init_memblock memblock_insert_region(struct memblock_type *type, | 454 | static void __init_memblock memblock_insert_region(struct memblock_type *type, |
433 | int idx, phys_addr_t base, | 455 | int idx, phys_addr_t base, |
434 | phys_addr_t size, int nid) | 456 | phys_addr_t size, |
457 | int nid, unsigned long flags) | ||
435 | { | 458 | { |
436 | struct memblock_region *rgn = &type->regions[idx]; | 459 | struct memblock_region *rgn = &type->regions[idx]; |
437 | 460 | ||
@@ -439,6 +462,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
439 | memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); | 462 | memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); |
440 | rgn->base = base; | 463 | rgn->base = base; |
441 | rgn->size = size; | 464 | rgn->size = size; |
465 | rgn->flags = flags; | ||
442 | memblock_set_region_node(rgn, nid); | 466 | memblock_set_region_node(rgn, nid); |
443 | type->cnt++; | 467 | type->cnt++; |
444 | type->total_size += size; | 468 | type->total_size += size; |
@@ -450,6 +474,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
450 | * @base: base address of the new region | 474 | * @base: base address of the new region |
451 | * @size: size of the new region | 475 | * @size: size of the new region |
452 | * @nid: nid of the new region | 476 | * @nid: nid of the new region |
477 | * @flags: flags of the new region | ||
453 | * | 478 | * |
454 | * Add new memblock region [@base,@base+@size) into @type. The new region | 479 | * Add new memblock region [@base,@base+@size) into @type. The new region |
455 | * is allowed to overlap with existing ones - overlaps don't affect already | 480 | * is allowed to overlap with existing ones - overlaps don't affect already |
@@ -460,7 +485,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
460 | * 0 on success, -errno on failure. | 485 | * 0 on success, -errno on failure. |
461 | */ | 486 | */ |
462 | static int __init_memblock memblock_add_region(struct memblock_type *type, | 487 | static int __init_memblock memblock_add_region(struct memblock_type *type, |
463 | phys_addr_t base, phys_addr_t size, int nid) | 488 | phys_addr_t base, phys_addr_t size, |
489 | int nid, unsigned long flags) | ||
464 | { | 490 | { |
465 | bool insert = false; | 491 | bool insert = false; |
466 | phys_addr_t obase = base; | 492 | phys_addr_t obase = base; |
@@ -475,6 +501,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, | |||
475 | WARN_ON(type->cnt != 1 || type->total_size); | 501 | WARN_ON(type->cnt != 1 || type->total_size); |
476 | type->regions[0].base = base; | 502 | type->regions[0].base = base; |
477 | type->regions[0].size = size; | 503 | type->regions[0].size = size; |
504 | type->regions[0].flags = flags; | ||
478 | memblock_set_region_node(&type->regions[0], nid); | 505 | memblock_set_region_node(&type->regions[0], nid); |
479 | type->total_size = size; | 506 | type->total_size = size; |
480 | return 0; | 507 | return 0; |
@@ -505,7 +532,8 @@ repeat: | |||
505 | nr_new++; | 532 | nr_new++; |
506 | if (insert) | 533 | if (insert) |
507 | memblock_insert_region(type, i++, base, | 534 | memblock_insert_region(type, i++, base, |
508 | rbase - base, nid); | 535 | rbase - base, nid, |
536 | flags); | ||
509 | } | 537 | } |
510 | /* area below @rend is dealt with, forget about it */ | 538 | /* area below @rend is dealt with, forget about it */ |
511 | base = min(rend, end); | 539 | base = min(rend, end); |
@@ -515,7 +543,8 @@ repeat: | |||
515 | if (base < end) { | 543 | if (base < end) { |
516 | nr_new++; | 544 | nr_new++; |
517 | if (insert) | 545 | if (insert) |
518 | memblock_insert_region(type, i, base, end - base, nid); | 546 | memblock_insert_region(type, i, base, end - base, |
547 | nid, flags); | ||
519 | } | 548 | } |
520 | 549 | ||
521 | /* | 550 | /* |
@@ -537,12 +566,13 @@ repeat: | |||
537 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, | 566 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, |
538 | int nid) | 567 | int nid) |
539 | { | 568 | { |
540 | return memblock_add_region(&memblock.memory, base, size, nid); | 569 | return memblock_add_region(&memblock.memory, base, size, nid, 0); |
541 | } | 570 | } |
542 | 571 | ||
543 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 572 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
544 | { | 573 | { |
545 | return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); | 574 | return memblock_add_region(&memblock.memory, base, size, |
575 | MAX_NUMNODES, 0); | ||
546 | } | 576 | } |
547 | 577 | ||
548 | /** | 578 | /** |
@@ -597,7 +627,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
597 | rgn->size -= base - rbase; | 627 | rgn->size -= base - rbase; |
598 | type->total_size -= base - rbase; | 628 | type->total_size -= base - rbase; |
599 | memblock_insert_region(type, i, rbase, base - rbase, | 629 | memblock_insert_region(type, i, rbase, base - rbase, |
600 | memblock_get_region_node(rgn)); | 630 | memblock_get_region_node(rgn), |
631 | rgn->flags); | ||
601 | } else if (rend > end) { | 632 | } else if (rend > end) { |
602 | /* | 633 | /* |
603 | * @rgn intersects from above. Split and redo the | 634 | * @rgn intersects from above. Split and redo the |
@@ -607,7 +638,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
607 | rgn->size -= end - rbase; | 638 | rgn->size -= end - rbase; |
608 | type->total_size -= end - rbase; | 639 | type->total_size -= end - rbase; |
609 | memblock_insert_region(type, i--, rbase, end - rbase, | 640 | memblock_insert_region(type, i--, rbase, end - rbase, |
610 | memblock_get_region_node(rgn)); | 641 | memblock_get_region_node(rgn), |
642 | rgn->flags); | ||
611 | } else { | 643 | } else { |
612 | /* @rgn is fully contained, record it */ | 644 | /* @rgn is fully contained, record it */ |
613 | if (!*end_rgn) | 645 | if (!*end_rgn) |
@@ -643,28 +675,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | |||
643 | { | 675 | { |
644 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", | 676 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", |
645 | (unsigned long long)base, | 677 | (unsigned long long)base, |
646 | (unsigned long long)base + size, | 678 | (unsigned long long)base + size - 1, |
647 | (void *)_RET_IP_); | 679 | (void *)_RET_IP_); |
648 | 680 | ||
649 | return __memblock_remove(&memblock.reserved, base, size); | 681 | return __memblock_remove(&memblock.reserved, base, size); |
650 | } | 682 | } |
651 | 683 | ||
652 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 684 | static int __init_memblock memblock_reserve_region(phys_addr_t base, |
685 | phys_addr_t size, | ||
686 | int nid, | ||
687 | unsigned long flags) | ||
653 | { | 688 | { |
654 | struct memblock_type *_rgn = &memblock.reserved; | 689 | struct memblock_type *_rgn = &memblock.reserved; |
655 | 690 | ||
656 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", | 691 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", |
657 | (unsigned long long)base, | 692 | (unsigned long long)base, |
658 | (unsigned long long)base + size, | 693 | (unsigned long long)base + size - 1, |
659 | (void *)_RET_IP_); | 694 | flags, (void *)_RET_IP_); |
660 | 695 | ||
661 | return memblock_add_region(_rgn, base, size, MAX_NUMNODES); | 696 | return memblock_add_region(_rgn, base, size, nid, flags); |
697 | } | ||
698 | |||
699 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | ||
700 | { | ||
701 | return memblock_reserve_region(base, size, MAX_NUMNODES, 0); | ||
702 | } | ||
703 | |||
704 | /** | ||
705 | * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. | ||
706 | * @base: the base phys addr of the region | ||
707 | * @size: the size of the region | ||
708 | * | ||
709 | * This function isolates region [@base, @base + @size), and marks it with flag | ||
710 | * MEMBLOCK_HOTPLUG. | ||
711 | * | ||
712 | * Return 0 on success, -errno on failure. | ||
713 | */ | ||
714 | int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) | ||
715 | { | ||
716 | struct memblock_type *type = &memblock.memory; | ||
717 | int i, ret, start_rgn, end_rgn; | ||
718 | |||
719 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
720 | if (ret) | ||
721 | return ret; | ||
722 | |||
723 | for (i = start_rgn; i < end_rgn; i++) | ||
724 | memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); | ||
725 | |||
726 | memblock_merge_regions(type); | ||
727 | return 0; | ||
728 | } | ||
729 | |||
730 | /** | ||
731 | * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. | ||
732 | * @base: the base phys addr of the region | ||
733 | * @size: the size of the region | ||
734 | * | ||
735 | * This function isolates region [@base, @base + @size), and clears flag | ||
736 | * MEMBLOCK_HOTPLUG for the isolated regions. | ||
737 | * | ||
738 | * Return 0 on success, -errno on failure. | ||
739 | */ | ||
740 | int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | ||
741 | { | ||
742 | struct memblock_type *type = &memblock.memory; | ||
743 | int i, ret, start_rgn, end_rgn; | ||
744 | |||
745 | ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); | ||
746 | if (ret) | ||
747 | return ret; | ||
748 | |||
749 | for (i = start_rgn; i < end_rgn; i++) | ||
750 | memblock_clear_region_flags(&type->regions[i], | ||
751 | MEMBLOCK_HOTPLUG); | ||
752 | |||
753 | memblock_merge_regions(type); | ||
754 | return 0; | ||
662 | } | 755 | } |
663 | 756 | ||
664 | /** | 757 | /** |
665 | * __next_free_mem_range - next function for for_each_free_mem_range() | 758 | * __next_free_mem_range - next function for for_each_free_mem_range() |
666 | * @idx: pointer to u64 loop variable | 759 | * @idx: pointer to u64 loop variable |
667 | * @nid: node selector, %MAX_NUMNODES for all nodes | 760 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
668 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 761 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
669 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 762 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
670 | * @out_nid: ptr to int for nid of the range, can be %NULL | 763 | * @out_nid: ptr to int for nid of the range, can be %NULL |
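
The flags plumbing and the mark/clear helpers above give early boot code a way to tag memory that should remain hot-removable; the reverse iterator hunk just below then skips such regions while movable_node is in effect. A sketch of the intended call site (illustrative; the real callers, for example in the ACPI SRAT parsing, are added by other patches in this series):

    static void __init mark_possible_hotplug(u64 start, u64 end)
    {
            /* Only relevant when the user asked for hot-removable nodes. */
            if (!movable_node_is_enabled())
                    return;

            /* Tags every memblock.memory region overlapping [start, end). */
            memblock_mark_hotplug(start, end - start);
    }
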
@@ -693,13 +786,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
693 | int mi = *idx & 0xffffffff; | 786 | int mi = *idx & 0xffffffff; |
694 | int ri = *idx >> 32; | 787 | int ri = *idx >> 32; |
695 | 788 | ||
789 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
790 | nid = NUMA_NO_NODE; | ||
791 | |||
696 | for ( ; mi < mem->cnt; mi++) { | 792 | for ( ; mi < mem->cnt; mi++) { |
697 | struct memblock_region *m = &mem->regions[mi]; | 793 | struct memblock_region *m = &mem->regions[mi]; |
698 | phys_addr_t m_start = m->base; | 794 | phys_addr_t m_start = m->base; |
699 | phys_addr_t m_end = m->base + m->size; | 795 | phys_addr_t m_end = m->base + m->size; |
700 | 796 | ||
701 | /* only memory regions are associated with nodes, check it */ | 797 | /* only memory regions are associated with nodes, check it */ |
702 | if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) | 798 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) |
703 | continue; | 799 | continue; |
704 | 800 | ||
705 | /* scan areas before each reservation for intersection */ | 801 | /* scan areas before each reservation for intersection */ |
@@ -740,12 +836,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
740 | /** | 836 | /** |
741 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 837 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
742 | * @idx: pointer to u64 loop variable | 838 | * @idx: pointer to u64 loop variable |
743 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 839 | * @nid: nid: node selector, %NUMA_NO_NODE for all nodes |
744 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 840 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
745 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 841 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
746 | * @out_nid: ptr to int for nid of the range, can be %NULL | 842 | * @out_nid: ptr to int for nid of the range, can be %NULL |
747 | * | 843 | * |
748 | * Reverse of __next_free_mem_range(). | 844 | * Reverse of __next_free_mem_range(). |
845 | * | ||
846 | * The Linux kernel cannot migrate pages used by itself. Memory hotplug users won't | ||
847 | * be able to hot-remove hotpluggable memory used by the kernel. So this | ||
848 | * function skips hotpluggable regions if needed when allocating memory for the | ||
849 | * kernel. | ||
749 | */ | 850 | */ |
750 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | 851 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, |
751 | phys_addr_t *out_start, | 852 | phys_addr_t *out_start, |
@@ -756,6 +857,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
756 | int mi = *idx & 0xffffffff; | 857 | int mi = *idx & 0xffffffff; |
757 | int ri = *idx >> 32; | 858 | int ri = *idx >> 32; |
758 | 859 | ||
860 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
861 | nid = NUMA_NO_NODE; | ||
862 | |||
759 | if (*idx == (u64)ULLONG_MAX) { | 863 | if (*idx == (u64)ULLONG_MAX) { |
760 | mi = mem->cnt - 1; | 864 | mi = mem->cnt - 1; |
761 | ri = rsv->cnt; | 865 | ri = rsv->cnt; |
@@ -767,7 +871,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
767 | phys_addr_t m_end = m->base + m->size; | 871 | phys_addr_t m_end = m->base + m->size; |
768 | 872 | ||
769 | /* only memory regions are associated with nodes, check it */ | 873 | /* only memory regions are associated with nodes, check it */ |
770 | if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) | 874 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) |
875 | continue; | ||
876 | |||
877 | /* skip hotpluggable memory regions if needed */ | ||
878 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | ||
771 | continue; | 879 | continue; |
772 | 880 | ||
773 | /* scan areas before each reservation for intersection */ | 881 | /* scan areas before each reservation for intersection */ |
@@ -837,18 +945,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, | |||
837 | * memblock_set_node - set node ID on memblock regions | 945 | * memblock_set_node - set node ID on memblock regions |
838 | * @base: base of area to set node ID for | 946 | * @base: base of area to set node ID for |
839 | * @size: size of area to set node ID for | 947 | * @size: size of area to set node ID for |
948 | * @type: memblock type to set node ID for | ||
840 | * @nid: node ID to set | 949 | * @nid: node ID to set |
841 | * | 950 | * |
842 | * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. | 951 | * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. |
843 | * Regions which cross the area boundaries are split as necessary. | 952 | * Regions which cross the area boundaries are split as necessary. |
844 | * | 953 | * |
845 | * RETURNS: | 954 | * RETURNS: |
846 | * 0 on success, -errno on failure. | 955 | * 0 on success, -errno on failure. |
847 | */ | 956 | */ |
848 | int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | 957 | int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, |
849 | int nid) | 958 | struct memblock_type *type, int nid) |
850 | { | 959 | { |
851 | struct memblock_type *type = &memblock.memory; | ||
852 | int start_rgn, end_rgn; | 960 | int start_rgn, end_rgn; |
853 | int i, ret; | 961 | int i, ret; |
854 | 962 | ||
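
memblock_set_node() now names the region table it operates on, so node IDs can be attached to reserved regions as well as to memory regions. Call sites elsewhere in the series change along these lines (illustrative):

    /* old */  memblock_set_node(start, size, nid);
    /* new */  memblock_set_node(start, size, &memblock.memory, nid);
               memblock_set_node(start, size, &memblock.reserved, nid);
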
@@ -870,13 +978,13 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | |||
870 | { | 978 | { |
871 | phys_addr_t found; | 979 | phys_addr_t found; |
872 | 980 | ||
873 | if (WARN_ON(!align)) | 981 | if (!align) |
874 | align = __alignof__(long long); | 982 | align = SMP_CACHE_BYTES; |
875 | 983 | ||
876 | /* align @size to avoid excessive fragmentation on reserved array */ | 984 | /* align @size to avoid excessive fragmentation on reserved array */ |
877 | size = round_up(size, align); | 985 | size = round_up(size, align); |
878 | 986 | ||
879 | found = memblock_find_in_range_node(0, max_addr, size, align, nid); | 987 | found = memblock_find_in_range_node(size, align, 0, max_addr, nid); |
880 | if (found && !memblock_reserve(found, size)) | 988 | if (found && !memblock_reserve(found, size)) |
881 | return found; | 989 | return found; |
882 | 990 | ||
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n | |||
890 | 998 | ||
891 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 999 | phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
892 | { | 1000 | { |
893 | return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); | 1001 | return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); |
894 | } | 1002 | } |
895 | 1003 | ||
896 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) | 1004 | phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) |
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i | |||
920 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); | 1028 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); |
921 | } | 1029 | } |
922 | 1030 | ||
1031 | /** | ||
1032 | * memblock_virt_alloc_internal - allocate boot memory block | ||
1033 | * @size: size of memory block to be allocated in bytes | ||
1034 | * @align: alignment of the region and block's size | ||
1035 | * @min_addr: the lower bound of the memory region to allocate (phys address) | ||
1036 | * @max_addr: the upper bound of the memory region to allocate (phys address) | ||
1037 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1038 | * | ||
1039 | * The @min_addr limit is dropped if it can not be satisfied and the allocation | ||
1040 | * will fall back to memory below @min_addr. Also, allocation may fall back | ||
1041 | * to any node in the system if the specified node can not | ||
1042 | * hold the requested memory. | ||
1043 | * | ||
1044 | * The allocation is performed from memory region limited by | ||
1045 | * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. | ||
1046 | * | ||
1047 | * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. | ||
1048 | * | ||
1049 | * The phys address of allocated boot memory block is converted to virtual and | ||
1050 | * allocated memory is reset to 0. | ||
1051 | * | ||
1052 | * In addition, the function sets the min_count to 0 using kmemleak_alloc for | ||
1053 | * allocated boot memory block, so that it is never reported as a leak. | ||
1054 | * | ||
1055 | * RETURNS: | ||
1056 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1057 | */ | ||
1058 | static void * __init memblock_virt_alloc_internal( | ||
1059 | phys_addr_t size, phys_addr_t align, | ||
1060 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1061 | int nid) | ||
1062 | { | ||
1063 | phys_addr_t alloc; | ||
1064 | void *ptr; | ||
1065 | |||
1066 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
1067 | nid = NUMA_NO_NODE; | ||
1068 | |||
1069 | /* | ||
1070 | * Detect any accidental use of these APIs after slab is ready, as at | ||
1071 | * this moment memblock may be deinitialized already and its | ||
1072 | * internal data may be destroyed (after execution of free_all_bootmem) | ||
1073 | */ | ||
1074 | if (WARN_ON_ONCE(slab_is_available())) | ||
1075 | return kzalloc_node(size, GFP_NOWAIT, nid); | ||
1076 | |||
1077 | if (!align) | ||
1078 | align = SMP_CACHE_BYTES; | ||
1079 | |||
1080 | /* align @size to avoid excessive fragmentation on reserved array */ | ||
1081 | size = round_up(size, align); | ||
1082 | |||
1083 | again: | ||
1084 | alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, | ||
1085 | nid); | ||
1086 | if (alloc) | ||
1087 | goto done; | ||
1088 | |||
1089 | if (nid != NUMA_NO_NODE) { | ||
1090 | alloc = memblock_find_in_range_node(size, align, min_addr, | ||
1091 | max_addr, NUMA_NO_NODE); | ||
1092 | if (alloc) | ||
1093 | goto done; | ||
1094 | } | ||
1095 | |||
1096 | if (min_addr) { | ||
1097 | min_addr = 0; | ||
1098 | goto again; | ||
1099 | } else { | ||
1100 | goto error; | ||
1101 | } | ||
1102 | |||
1103 | done: | ||
1104 | memblock_reserve(alloc, size); | ||
1105 | ptr = phys_to_virt(alloc); | ||
1106 | memset(ptr, 0, size); | ||
1107 | |||
1108 | /* | ||
1109 | * The min_count is set to 0 so that bootmem allocated blocks | ||
1110 | * are never reported as leaks. This is because many of these blocks | ||
1111 | * are only referred via the physical address which is not | ||
1112 | * looked up by kmemleak. | ||
1113 | */ | ||
1114 | kmemleak_alloc(ptr, size, 0, 0); | ||
1115 | |||
1116 | return ptr; | ||
1117 | |||
1118 | error: | ||
1119 | return NULL; | ||
1120 | } | ||
1121 | |||
1122 | /** | ||
1123 | * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block | ||
1124 | * @size: size of memory block to be allocated in bytes | ||
1125 | * @align: alignment of the region and block's size | ||
1126 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1127 | * is preferred (phys address) | ||
1128 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1129 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1130 | * allocate only from memory limited by memblock.current_limit value | ||
1131 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1132 | * | ||
1133 | * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides | ||
1134 | * additional debug information (including caller info), if enabled. | ||
1135 | * | ||
1136 | * RETURNS: | ||
1137 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1138 | */ | ||
1139 | void * __init memblock_virt_alloc_try_nid_nopanic( | ||
1140 | phys_addr_t size, phys_addr_t align, | ||
1141 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1142 | int nid) | ||
1143 | { | ||
1144 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1145 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1146 | (u64)max_addr, (void *)_RET_IP_); | ||
1147 | return memblock_virt_alloc_internal(size, align, min_addr, | ||
1148 | max_addr, nid); | ||
1149 | } | ||
1150 | |||
1151 | /** | ||
1152 | * memblock_virt_alloc_try_nid - allocate boot memory block with panicking | ||
1153 | * @size: size of memory block to be allocated in bytes | ||
1154 | * @align: alignment of the region and block's size | ||
1155 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1156 | * is preferred (phys address) | ||
1157 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1158 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1159 | * allocate only from memory limited by memblock.current_limit value | ||
1160 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1161 | * | ||
1162 | * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() | ||
1163 | * which provides debug information (including caller info), if enabled, | ||
1164 | * and panics if the request can not be satisfied. | ||
1165 | * | ||
1166 | * RETURNS: | ||
1167 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1168 | */ | ||
1169 | void * __init memblock_virt_alloc_try_nid( | ||
1170 | phys_addr_t size, phys_addr_t align, | ||
1171 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1172 | int nid) | ||
1173 | { | ||
1174 | void *ptr; | ||
1175 | |||
1176 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1177 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1178 | (u64)max_addr, (void *)_RET_IP_); | ||
1179 | ptr = memblock_virt_alloc_internal(size, align, | ||
1180 | min_addr, max_addr, nid); | ||
1181 | if (ptr) | ||
1182 | return ptr; | ||
1183 | |||
1184 | panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", | ||
1185 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1186 | (u64)max_addr); | ||
1187 | return NULL; | ||
1188 | } | ||
1189 | |||
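The two public wrappers differ only in failure policy: the _nopanic variant hands NULL back so the caller can degrade gracefully, while memblock_virt_alloc_try_nid() treats an early-boot allocation failure as fatal. A hedged userspace analogy of that split (the helper names and sizes below are invented for the demo):

```c
#include <stdio.h>
#include <stdlib.h>

static void *try_alloc(size_t size)	/* ~ the *_nopanic() variant */
{
	return calloc(1, size);
}

static void *must_alloc(size_t size)	/* ~ memblock_virt_alloc_try_nid() */
{
	void *p = try_alloc(size);

	if (!p) {
		fprintf(stderr, "must_alloc: failed to allocate %zu bytes\n",
			size);
		abort();		/* stand-in for panic() */
	}
	return p;
}

int main(void)
{
	void *optional_cache = try_alloc(1 << 20);	/* nice to have */
	void *core_table     = must_alloc(4096);	/* boot cannot continue without it */

	if (!optional_cache)
		puts("cache disabled, continuing without it");
	free(optional_cache);
	free(core_table);
	return 0;
}
```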
1190 | /** | ||
1191 | * __memblock_free_early - free boot memory block | ||
1192 | * @base: phys starting address of the boot memory block | ||
1193 | * @size: size of the boot memory block in bytes | ||
1194 | * | ||
1195 | * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. | ||
1196 | * The freed memory will not be released to the buddy allocator. | ||
1197 | */ | ||
1198 | void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) | ||
1199 | { | ||
1200 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
1201 | __func__, (u64)base, (u64)base + size - 1, | ||
1202 | (void *)_RET_IP_); | ||
1203 | kmemleak_free_part(__va(base), size); | ||
1204 | __memblock_remove(&memblock.reserved, base, size); | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * __memblock_free_late - free bootmem block pages directly to buddy allocator | ||
1209 | * @addr: phys starting address of the boot memory block | ||
1210 | * @size: size of the boot memory block in bytes | ||
1211 | * | ||
1212 | * This is only useful when the bootmem allocator has already been torn | ||
1213 | * down, but we are still initializing the system. Pages are released directly | ||
1214 | * to the buddy allocator, no bootmem metadata is updated because it is gone. | ||
1215 | */ | ||
1216 | void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) | ||
1217 | { | ||
1218 | u64 cursor, end; | ||
1219 | |||
1220 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
1221 | __func__, (u64)base, (u64)base + size - 1, | ||
1222 | (void *)_RET_IP_); | ||
1223 | kmemleak_free_part(__va(base), size); | ||
1224 | cursor = PFN_UP(base); | ||
1225 | end = PFN_DOWN(base + size); | ||
1226 | |||
1227 | for (; cursor < end; cursor++) { | ||
1228 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
1229 | totalram_pages++; | ||
1230 | } | ||
1231 | } | ||
923 | 1232 | ||
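__memblock_free_late() above only hands back pages that lie entirely inside [base, base + size): the start is rounded up with PFN_UP() and the end rounded down with PFN_DOWN(), so partial pages at either edge stay reserved. A small sketch of that arithmetic (PAGE_SHIFT and the example range are arbitrary):

```c
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
/* Same rounding helpers as the kernel's PFN_UP()/PFN_DOWN(). */
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long base = 0x1800;	/* starts mid-page */
	unsigned long size = 0x3000;	/* 12 KiB, ends mid-page at 0x4800 */

	unsigned long first = PFN_UP(base);		/* 2: round start up */
	unsigned long end   = PFN_DOWN(base + size);	/* 4: round end down */

	/* Only the fully covered pages (pfn 2 and 3 here) would go to the
	 * buddy allocator; the partial pages at the edges are skipped. */
	printf("freeing pfns [%lu, %lu): %lu whole page(s)\n",
	       first, end, end > first ? end - first : 0);
	return 0;
}
```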
924 | /* | 1233 | /* |
925 | * Remaining API functions | 1234 | * Remaining API functions |
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) | |||
1101 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) | 1410 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) |
1102 | { | 1411 | { |
1103 | unsigned long long base, size; | 1412 | unsigned long long base, size; |
1413 | unsigned long flags; | ||
1104 | int i; | 1414 | int i; |
1105 | 1415 | ||
1106 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); | 1416 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); |
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name | |||
1111 | 1421 | ||
1112 | base = rgn->base; | 1422 | base = rgn->base; |
1113 | size = rgn->size; | 1423 | size = rgn->size; |
1424 | flags = rgn->flags; | ||
1114 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 1425 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
1115 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) | 1426 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) |
1116 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", | 1427 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", |
1117 | memblock_get_region_node(rgn)); | 1428 | memblock_get_region_node(rgn)); |
1118 | #endif | 1429 | #endif |
1119 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", | 1430 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", |
1120 | name, i, base, base + size - 1, size, nid_buf); | 1431 | name, i, base, base + size - 1, size, nid_buf, flags); |
1121 | } | 1432 | } |
1122 | } | 1433 | } |
1123 | 1434 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7caff36180cd..67dd2a881433 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1688,13 +1688,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | |||
1688 | */ | 1688 | */ |
1689 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 1689 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) |
1690 | { | 1690 | { |
1691 | struct cgroup *task_cgrp; | ||
1692 | struct cgroup *mem_cgrp; | ||
1693 | /* | 1691 | /* |
1694 | * Need a buffer in BSS, can't rely on allocations. The code relies | 1692 | * protects memcg_name and makes sure that parallel ooms do not |
1695 | * on the assumption that OOM is serialized for memory controller. | 1693 | * interleave |
1696 | * If this assumption is broken, revisit this code. | ||
1697 | */ | 1694 | */ |
1695 | static DEFINE_SPINLOCK(oom_info_lock); | ||
1696 | struct cgroup *task_cgrp; | ||
1697 | struct cgroup *mem_cgrp; | ||
1698 | static char memcg_name[PATH_MAX]; | 1698 | static char memcg_name[PATH_MAX]; |
1699 | int ret; | 1699 | int ret; |
1700 | struct mem_cgroup *iter; | 1700 | struct mem_cgroup *iter; |
@@ -1703,6 +1703,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1703 | if (!p) | 1703 | if (!p) |
1704 | return; | 1704 | return; |
1705 | 1705 | ||
1706 | spin_lock(&oom_info_lock); | ||
1706 | rcu_read_lock(); | 1707 | rcu_read_lock(); |
1707 | 1708 | ||
1708 | mem_cgrp = memcg->css.cgroup; | 1709 | mem_cgrp = memcg->css.cgroup; |
@@ -1771,6 +1772,7 @@ done: | |||
1771 | 1772 | ||
1772 | pr_cont("\n"); | 1773 | pr_cont("\n"); |
1773 | } | 1774 | } |
1775 | spin_unlock(&oom_info_lock); | ||
1774 | } | 1776 | } |
1775 | 1777 | ||
1776 | /* | 1778 | /* |
@@ -3000,7 +3002,8 @@ static DEFINE_MUTEX(set_limit_mutex); | |||
3000 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | 3002 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) |
3001 | { | 3003 | { |
3002 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | 3004 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && |
3003 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); | 3005 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) == |
3006 | KMEM_ACCOUNTED_MASK; | ||
3004 | } | 3007 | } |
3005 | 3008 | ||
3006 | /* | 3009 | /* |
@@ -3126,7 +3129,7 @@ int memcg_cache_id(struct mem_cgroup *memcg) | |||
3126 | * But when we create a new cache, we can call this as well if its parent | 3129 | * But when we create a new cache, we can call this as well if its parent |
3127 | * is kmem-limited. That will have to hold set_limit_mutex as well. | 3130 | * is kmem-limited. That will have to hold set_limit_mutex as well. |
3128 | */ | 3131 | */ |
3129 | int memcg_update_cache_sizes(struct mem_cgroup *memcg) | 3132 | static int memcg_update_cache_sizes(struct mem_cgroup *memcg) |
3130 | { | 3133 | { |
3131 | int num, ret; | 3134 | int num, ret; |
3132 | 3135 | ||
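Besides serializing the OOM report with oom_info_lock, the memcontrol change tightens memcg_can_account_kmem(): it now requires every bit in KMEM_ACCOUNTED_MASK to be set, not merely any of them. A tiny sketch of why `(flags & MASK)` and `(flags & MASK) == MASK` answer different questions (the flag names below are illustrative, not the real KMEM_ACCOUNTED_* bits):

```c
#include <stdio.h>
#include <stdbool.h>

#define FLAG_ACTIVE    (1u << 0)
#define FLAG_ACTIVATED (1u << 1)
#define MASK           (FLAG_ACTIVE | FLAG_ACTIVATED)

static bool any_bit(unsigned int flags)  { return (flags & MASK) != 0;   }
static bool all_bits(unsigned int flags) { return (flags & MASK) == MASK; }

int main(void)
{
	unsigned int halfway = FLAG_ACTIVATED;	/* only one of the two bits */

	/* The old test accepted this intermediate state, the new one does not. */
	printf("any_bit:  %d\n", any_bit(halfway));	/* 1 */
	printf("all_bits: %d\n", all_bits(halfway));	/* 0 */
	return 0;
}
```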
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fabe55046c1d..b25ed321e667 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
611 | } | 611 | } |
612 | 612 | ||
613 | /* | 613 | /* |
614 | * Dirty cache page page | 614 | * Dirty pagecache page |
615 | * Issues: when the error hit a hole page the error is not properly | 615 | * Issues: when the error hit a hole page the error is not properly |
616 | * propagated. | 616 | * propagated. |
617 | */ | 617 | */ |
@@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1585 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1585 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1587 | if (ret) { | 1587 | if (ret) { |
1588 | putback_lru_pages(&pagelist); | 1588 | if (!list_empty(&pagelist)) { |
1589 | list_del(&page->lru); | ||
1590 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
1591 | page_is_file_cache(page)); | ||
1592 | putback_lru_page(page); | ||
1593 | } | ||
1594 | |||
1589 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1595 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1590 | pfn, ret, page->flags); | 1596 | pfn, ret, page->flags); |
1591 | if (ret > 0) | 1597 | if (ret > 0) |
diff --git a/mm/memory.c b/mm/memory.c index 6768ce9e57d2..86487dfa5e59 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
62 | #include <linux/dma-debug.h> | ||
62 | 63 | ||
63 | #include <asm/io.h> | 64 | #include <asm/io.h> |
64 | #include <asm/pgalloc.h> | 65 | #include <asm/pgalloc.h> |
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
2559 | 2560 | ||
2560 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2561 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
2561 | { | 2562 | { |
2563 | debug_dma_assert_idle(src); | ||
2564 | |||
2562 | /* | 2565 | /* |
2563 | * If the source page was a PFN mapping, we don't have | 2566 | * If the source page was a PFN mapping, we don't have |
2564 | * a "struct page" for it. We do a best-effort copy by | 2567 | * a "struct page" for it. We do a best-effort copy by |
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, | |||
4272 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 4275 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
4273 | 4276 | ||
4274 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS | 4277 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS |
4278 | |||
4279 | static struct kmem_cache *page_ptl_cachep; | ||
4280 | |||
4281 | void __init ptlock_cache_init(void) | ||
4282 | { | ||
4283 | page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, | ||
4284 | SLAB_PANIC, NULL); | ||
4285 | } | ||
4286 | |||
4275 | bool ptlock_alloc(struct page *page) | 4287 | bool ptlock_alloc(struct page *page) |
4276 | { | 4288 | { |
4277 | spinlock_t *ptl; | 4289 | spinlock_t *ptl; |
4278 | 4290 | ||
4279 | ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); | 4291 | ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); |
4280 | if (!ptl) | 4292 | if (!ptl) |
4281 | return false; | 4293 | return false; |
4282 | page->ptl = ptl; | 4294 | page->ptl = ptl; |
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page) | |||
4285 | 4297 | ||
4286 | void ptlock_free(struct page *page) | 4298 | void ptlock_free(struct page *page) |
4287 | { | 4299 | { |
4288 | kfree(page->ptl); | 4300 | kmem_cache_free(page_ptl_cachep, page->ptl); |
4289 | } | 4301 | } |
4290 | #endif | 4302 | #endif |
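Moving page->ptl from kmalloc() to a dedicated "page->ptl" kmem_cache gives the split-PTE locks a slab sized exactly for spinlock_t, which can avoid kmalloc size-class rounding and gives them separate accounting. As a rough userspace analogy only (not the slab allocator), a trivial fixed-size free-list pool shows the idea of reusing same-sized slots:

```c
#include <stdio.h>
#include <stdlib.h>

/* Toy fixed-size object pool: every slot is obj_size bytes, freed objects
 * are chained through their own storage and reused on the next alloc. */
struct pool {
	size_t obj_size;
	void  *free_list;
};

static void *pool_alloc(struct pool *p)
{
	if (p->free_list) {
		void *obj = p->free_list;
		p->free_list = *(void **)obj;	/* pop */
		return obj;
	}
	return malloc(p->obj_size < sizeof(void *) ? sizeof(void *)
						   : p->obj_size);
}

static void pool_free(struct pool *p, void *obj)
{
	*(void **)obj = p->free_list;		/* push */
	p->free_list = obj;
}

int main(void)
{
	struct pool ptl_pool = { .obj_size = 4 /* pretend sizeof(spinlock_t) */ };

	void *a = pool_alloc(&ptl_pool);
	pool_free(&ptl_pool, a);
	void *b = pool_alloc(&ptl_pool);	/* reuses the same slot */

	printf("reused: %s\n", a == b ? "yes" : "no");
	free(b);
	return 0;
}
```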
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235502db..cc2ab37220b7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
10 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/bootmem.h> | ||
13 | #include <linux/compiler.h> | 12 | #include <linux/compiler.h> |
14 | #include <linux/export.h> | 13 | #include <linux/export.h> |
15 | #include <linux/pagevec.h> | 14 | #include <linux/pagevec.h> |
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |||
269 | } | 268 | } |
270 | 269 | ||
271 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or | 270 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or |
272 | * alloc_bootmem_node_nopanic() */ | 271 | * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ |
273 | static int __ref ensure_zone_is_initialized(struct zone *zone, | 272 | static int __ref ensure_zone_is_initialized(struct zone *zone, |
274 | unsigned long start_pfn, unsigned long num_pages) | 273 | unsigned long start_pfn, unsigned long num_pages) |
275 | { | 274 | { |
@@ -1446,6 +1445,7 @@ static int __init cmdline_parse_movable_node(char *p) | |||
1446 | * the kernel away from hotpluggable memory. | 1445 | * the kernel away from hotpluggable memory. |
1447 | */ | 1446 | */ |
1448 | memblock_set_bottom_up(true); | 1447 | memblock_set_bottom_up(true); |
1448 | movable_node_enabled = true; | ||
1449 | #else | 1449 | #else |
1450 | pr_warn("movable_node option not supported\n"); | 1450 | pr_warn("movable_node option not supported\n"); |
1451 | #endif | 1451 | #endif |
diff --git a/mm/migrate.c b/mm/migrate.c index 9194375b2307..a8025befc323 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -72,28 +72,12 @@ int migrate_prep_local(void) | |||
72 | } | 72 | } |
73 | 73 | ||
74 | /* | 74 | /* |
75 | * Add isolated pages on the list back to the LRU under page lock | ||
76 | * to avoid leaking evictable pages back onto unevictable list. | ||
77 | */ | ||
78 | void putback_lru_pages(struct list_head *l) | ||
79 | { | ||
80 | struct page *page; | ||
81 | struct page *page2; | ||
82 | |||
83 | list_for_each_entry_safe(page, page2, l, lru) { | ||
84 | list_del(&page->lru); | ||
85 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
86 | page_is_file_cache(page)); | ||
87 | putback_lru_page(page); | ||
88 | } | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * Put previously isolated pages back onto the appropriate lists | 75 | * Put previously isolated pages back onto the appropriate lists |
93 | * from where they were once taken off for compaction/migration. | 76 | * from where they were once taken off for compaction/migration. |
94 | * | 77 | * |
95 | * This function shall be used instead of putback_lru_pages(), | 78 | * This function shall be used whenever the isolated pageset has been |
96 | * whenever the isolated pageset has been built by isolate_migratepages_range() | 79 | * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() |
80 | * and isolate_huge_page(). | ||
97 | */ | 81 | */ |
98 | void putback_movable_pages(struct list_head *l) | 82 | void putback_movable_pages(struct list_head *l) |
99 | { | 83 | { |
@@ -199,7 +183,12 @@ out: | |||
199 | */ | 183 | */ |
200 | static void remove_migration_ptes(struct page *old, struct page *new) | 184 | static void remove_migration_ptes(struct page *old, struct page *new) |
201 | { | 185 | { |
202 | rmap_walk(new, remove_migration_pte, old); | 186 | struct rmap_walk_control rwc = { |
187 | .rmap_one = remove_migration_pte, | ||
188 | .arg = old, | ||
189 | }; | ||
190 | |||
191 | rmap_walk(new, &rwc); | ||
203 | } | 192 | } |
204 | 193 | ||
205 | /* | 194 | /* |
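remove_migration_ptes() now packages its callback and private argument into a struct rmap_walk_control instead of passing them positionally, so rmap_walk() can grow optional hooks without touching every caller. A generic sketch of that control-struct idiom (the struct and walker here are illustrative, not the kernel's rmap code):

```c
#include <stdio.h>

/* Control structure: one required callback plus room for optional knobs
 * that can be added later without changing the walker's signature. */
struct walk_control {
	int  (*visit)(int item, void *arg);	/* return nonzero to stop early */
	void *arg;
};

static void walk(const int *items, int n, struct walk_control *wc)
{
	for (int i = 0; i < n; i++)
		if (wc->visit(items[i], wc->arg))
			break;
}

static int find_match(int item, void *arg)
{
	int wanted = *(int *)arg;

	if (item == wanted) {
		printf("found %d\n", item);
		return 1;
	}
	return 0;
}

int main(void)
{
	int items[] = { 3, 7, 42, 9 };
	int wanted = 42;
	struct walk_control wc = { .visit = find_match, .arg = &wanted };

	walk(items, 4, &wc);
	return 0;
}
```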
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
563 | * Migration functions | 552 | * Migration functions |
564 | ***********************************************************/ | 553 | ***********************************************************/ |
565 | 554 | ||
566 | /* Always fail migration. Used for mappings that are not movable */ | ||
567 | int fail_migrate_page(struct address_space *mapping, | ||
568 | struct page *newpage, struct page *page) | ||
569 | { | ||
570 | return -EIO; | ||
571 | } | ||
572 | EXPORT_SYMBOL(fail_migrate_page); | ||
573 | |||
574 | /* | 555 | /* |
575 | * Common logic to directly migrate a single page suitable for | 556 | * Common logic to directly migrate a single page suitable for |
576 | * pages that do not use PagePrivate/PagePrivate2. | 557 | * pages that do not use PagePrivate/PagePrivate2. |
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1008 | { | 989 | { |
1009 | int rc = 0; | 990 | int rc = 0; |
1010 | int *result = NULL; | 991 | int *result = NULL; |
1011 | struct page *new_hpage = get_new_page(hpage, private, &result); | 992 | struct page *new_hpage; |
1012 | struct anon_vma *anon_vma = NULL; | 993 | struct anon_vma *anon_vma = NULL; |
1013 | 994 | ||
1014 | /* | 995 | /* |
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1018 | * tables or check whether the hugepage is pmd-based or not before | 999 | * tables or check whether the hugepage is pmd-based or not before |
1019 | * kicking migration. | 1000 | * kicking migration. |
1020 | */ | 1001 | */ |
1021 | if (!hugepage_migration_support(page_hstate(hpage))) | 1002 | if (!hugepage_migration_support(page_hstate(hpage))) { |
1003 | putback_active_hugepage(hpage); | ||
1022 | return -ENOSYS; | 1004 | return -ENOSYS; |
1005 | } | ||
1023 | 1006 | ||
1007 | new_hpage = get_new_page(hpage, private, &result); | ||
1024 | if (!new_hpage) | 1008 | if (!new_hpage) |
1025 | return -ENOMEM; | 1009 | return -ENOMEM; |
1026 | 1010 | ||
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1120 | nr_succeeded++; | 1104 | nr_succeeded++; |
1121 | break; | 1105 | break; |
1122 | default: | 1106 | default: |
1123 | /* Permanent failure */ | 1107 | /* |
1108 | * Permanent failure (-EBUSY, -ENOSYS, etc.): | ||
1109 | * unlike -EAGAIN case, the failed page is | ||
1110 | * removed from migration page list and not | ||
1111 | * retried in the next outer loop. | ||
1112 | */ | ||
1124 | nr_failed++; | 1113 | nr_failed++; |
1125 | break; | 1114 | break; |
1126 | } | 1115 | } |
@@ -1594,31 +1583,38 @@ bool migrate_ratelimited(int node) | |||
1594 | } | 1583 | } |
1595 | 1584 | ||
1596 | /* Returns true if the node is migrate rate-limited after the update */ | 1585 | /* Returns true if the node is migrate rate-limited after the update */ |
1597 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | 1586 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, |
1587 | unsigned long nr_pages) | ||
1598 | { | 1588 | { |
1599 | bool rate_limited = false; | ||
1600 | |||
1601 | /* | 1589 | /* |
1602 | * Rate-limit the amount of data that is being migrated to a node. | 1590 | * Rate-limit the amount of data that is being migrated to a node. |
1603 | * Optimal placement is no good if the memory bus is saturated and | 1591 | * Optimal placement is no good if the memory bus is saturated and |
1604 | * all the time is being spent migrating! | 1592 | * all the time is being spent migrating! |
1605 | */ | 1593 | */ |
1606 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1607 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | 1594 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { |
1595 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1608 | pgdat->numabalancing_migrate_nr_pages = 0; | 1596 | pgdat->numabalancing_migrate_nr_pages = 0; |
1609 | pgdat->numabalancing_migrate_next_window = jiffies + | 1597 | pgdat->numabalancing_migrate_next_window = jiffies + |
1610 | msecs_to_jiffies(migrate_interval_millisecs); | 1598 | msecs_to_jiffies(migrate_interval_millisecs); |
1599 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1611 | } | 1600 | } |
1612 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | 1601 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { |
1613 | rate_limited = true; | 1602 | trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, |
1614 | else | 1603 | nr_pages); |
1615 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | 1604 | return true; |
1616 | spin_unlock(&pgdat->numabalancing_migrate_lock); | 1605 | } |
1617 | 1606 | ||
1618 | return rate_limited; | 1607 | /* |
1608 | * This is an unlocked non-atomic update so errors are possible. | ||
1609 | * The consequences are failing to migrate when we potentially should | ||
1610 | * have, which is not severe enough to warrant locking. If it is ever | ||
1611 | * a problem, it can be converted to a per-cpu counter. | ||
1612 | */ | ||
1613 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1614 | return false; | ||
1619 | } | 1615 | } |
1620 | 1616 | ||
1621 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | 1617 | static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) |
1622 | { | 1618 | { |
1623 | int page_lru; | 1619 | int page_lru; |
1624 | 1620 | ||
@@ -1705,7 +1701,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
1705 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1701 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1706 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1702 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); |
1707 | if (nr_remaining) { | 1703 | if (nr_remaining) { |
1708 | putback_lru_pages(&migratepages); | 1704 | if (!list_empty(&migratepages)) { |
1705 | list_del(&page->lru); | ||
1706 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
1707 | page_is_file_cache(page)); | ||
1708 | putback_lru_page(page); | ||
1709 | } | ||
1709 | isolated = 0; | 1710 | isolated = 0; |
1710 | } else | 1711 | } else |
1711 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | 1712 | count_vm_numa_event(NUMA_PAGE_MIGRATE); |
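The reworked numamigrate_update_ratelimit() resets a per-node counter once per time window, takes the spinlock only for that reset, and tolerates unlocked counter increments. A simplified single-threaded model of the window logic (the millisecond clock, window length, and page budget below are invented for the demo):

```c
#include <stdio.h>
#include <stdbool.h>

struct ratelimit {
	unsigned long next_window_ms;	/* when the current window expires */
	unsigned long window_ms;	/* window length */
	unsigned long nr_pages;		/* pages migrated in this window */
	unsigned long limit_pages;	/* budget per window */
};

/* Returns true if the migration should be skipped (rate limited). */
static bool update_ratelimit(struct ratelimit *rl, unsigned long now_ms,
			     unsigned long nr_pages)
{
	if (now_ms >= rl->next_window_ms) {	/* window expired: reset */
		rl->nr_pages = 0;
		rl->next_window_ms = now_ms + rl->window_ms;
	}
	if (rl->nr_pages > rl->limit_pages)	/* over budget: refuse */
		return true;

	rl->nr_pages += nr_pages;		/* unlocked in the kernel:
						   small races are tolerated */
	return false;
}

int main(void)
{
	struct ratelimit rl = { .window_ms = 100, .limit_pages = 32 };
	unsigned long t;

	for (t = 0; t <= 250; t += 25) {
		bool limited = update_ratelimit(&rl, t, 16);

		printf("t=%3lums limited=%d counted=%lu\n",
		       t, limited, rl.nr_pages);
	}
	return 0;
}
```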
diff --git a/mm/mlock.c b/mm/mlock.c index 192e6eebe4f2..10819ed4df3e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
709 | 709 | ||
710 | lru_add_drain_all(); /* flush pagevec */ | 710 | lru_add_drain_all(); /* flush pagevec */ |
711 | 711 | ||
712 | down_write(¤t->mm->mmap_sem); | ||
713 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 712 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
714 | start &= PAGE_MASK; | 713 | start &= PAGE_MASK; |
715 | 714 | ||
716 | locked = len >> PAGE_SHIFT; | ||
717 | locked += current->mm->locked_vm; | ||
718 | |||
719 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 715 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
720 | lock_limit >>= PAGE_SHIFT; | 716 | lock_limit >>= PAGE_SHIFT; |
717 | locked = len >> PAGE_SHIFT; | ||
718 | |||
719 | down_write(¤t->mm->mmap_sem); | ||
720 | |||
721 | locked += current->mm->locked_vm; | ||
721 | 722 | ||
722 | /* check against resource limits */ | 723 | /* check against resource limits */ |
723 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 724 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
724 | error = do_mlock(start, len, 1); | 725 | error = do_mlock(start, len, 1); |
726 | |||
725 | up_write(¤t->mm->mmap_sem); | 727 | up_write(¤t->mm->mmap_sem); |
726 | if (!error) | 728 | if (!error) |
727 | error = __mm_populate(start, len, 0); | 729 | error = __mm_populate(start, len, 0); |
@@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | |||
732 | { | 734 | { |
733 | int ret; | 735 | int ret; |
734 | 736 | ||
735 | down_write(¤t->mm->mmap_sem); | ||
736 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 737 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
737 | start &= PAGE_MASK; | 738 | start &= PAGE_MASK; |
739 | |||
740 | down_write(¤t->mm->mmap_sem); | ||
738 | ret = do_mlock(start, len, 0); | 741 | ret = do_mlock(start, len, 0); |
739 | up_write(¤t->mm->mmap_sem); | 742 | up_write(¤t->mm->mmap_sem); |
743 | |||
740 | return ret; | 744 | return ret; |
741 | } | 745 | } |
742 | 746 | ||
@@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
781 | if (flags & MCL_CURRENT) | 785 | if (flags & MCL_CURRENT) |
782 | lru_add_drain_all(); /* flush pagevec */ | 786 | lru_add_drain_all(); /* flush pagevec */ |
783 | 787 | ||
784 | down_write(¤t->mm->mmap_sem); | ||
785 | |||
786 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 788 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
787 | lock_limit >>= PAGE_SHIFT; | 789 | lock_limit >>= PAGE_SHIFT; |
788 | 790 | ||
789 | ret = -ENOMEM; | 791 | ret = -ENOMEM; |
792 | down_write(¤t->mm->mmap_sem); | ||
793 | |||
790 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || | 794 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || |
791 | capable(CAP_IPC_LOCK)) | 795 | capable(CAP_IPC_LOCK)) |
792 | ret = do_mlockall(flags); | 796 | ret = do_mlockall(flags); |
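The mlock/munlock/mlockall changes above move the page alignment and rlimit arithmetic out from under mmap_sem, so the semaphore covers only the code that actually reads or updates mm state. A small pthread sketch of the same "prepare outside, commit inside" shape (the shared counter stands in for mm->locked_vm; all numbers are made up):

```c
#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

static pthread_mutex_t mm_lock = PTHREAD_MUTEX_INITIALIZER;	/* ~ mmap_sem */
static unsigned long locked_vm;					/* pages */

static bool model_mlock(unsigned long start, unsigned long len,
			unsigned long limit_pages)
{
	bool ok;
	unsigned long want;

	/* Depends only on the arguments and the limit, so it can be
	 * computed before the lock is taken. */
	len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
	want = len >> PAGE_SHIFT;

	pthread_mutex_lock(&mm_lock);	/* only the shared state is covered */
	ok = locked_vm + want <= limit_pages;
	if (ok)
		locked_vm += want;
	pthread_mutex_unlock(&mm_lock);
	return ok;
}

int main(void)
{
	printf("first:  %d\n", model_mlock(0x1234, 0x5000, 16));  /* 6 pages: ok */
	printf("second: %d\n", model_mlock(0x0, 0x20000, 16));    /* 32 pages: refused */
	printf("locked_vm = %lu pages\n", locked_vm);
	return 0;
}
```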
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); | |||
86 | 86 | ||
87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
89 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 90 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
90 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 91 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
91 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | 92 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ |
@@ -1190,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
1190 | return hint; | 1191 | return hint; |
1191 | } | 1192 | } |
1192 | 1193 | ||
1194 | static inline int mlock_future_check(struct mm_struct *mm, | ||
1195 | unsigned long flags, | ||
1196 | unsigned long len) | ||
1197 | { | ||
1198 | unsigned long locked, lock_limit; | ||
1199 | |||
1200 | /* mlock MCL_FUTURE? */ | ||
1201 | if (flags & VM_LOCKED) { | ||
1202 | locked = len >> PAGE_SHIFT; | ||
1203 | locked += mm->locked_vm; | ||
1204 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
1205 | lock_limit >>= PAGE_SHIFT; | ||
1206 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
1207 | return -EAGAIN; | ||
1208 | } | ||
1209 | return 0; | ||
1210 | } | ||
1211 | |||
1193 | /* | 1212 | /* |
1194 | * The caller must hold down_write(¤t->mm->mmap_sem). | 1213 | * The caller must hold down_write(¤t->mm->mmap_sem). |
1195 | */ | 1214 | */ |
@@ -1251,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1251 | if (!can_do_mlock()) | 1270 | if (!can_do_mlock()) |
1252 | return -EPERM; | 1271 | return -EPERM; |
1253 | 1272 | ||
1254 | /* mlock MCL_FUTURE? */ | 1273 | if (mlock_future_check(mm, vm_flags, len)) |
1255 | if (vm_flags & VM_LOCKED) { | 1274 | return -EAGAIN; |
1256 | unsigned long locked, lock_limit; | ||
1257 | locked = len >> PAGE_SHIFT; | ||
1258 | locked += mm->locked_vm; | ||
1259 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
1260 | lock_limit >>= PAGE_SHIFT; | ||
1261 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
1262 | return -EAGAIN; | ||
1263 | } | ||
1264 | 1275 | ||
1265 | if (file) { | 1276 | if (file) { |
1266 | struct inode *inode = file_inode(file); | 1277 | struct inode *inode = file_inode(file); |
@@ -2591,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2591 | if (error & ~PAGE_MASK) | 2602 | if (error & ~PAGE_MASK) |
2592 | return error; | 2603 | return error; |
2593 | 2604 | ||
2594 | /* | 2605 | error = mlock_future_check(mm, mm->def_flags, len); |
2595 | * mlock MCL_FUTURE? | 2606 | if (error) |
2596 | */ | 2607 | return error; |
2597 | if (mm->def_flags & VM_LOCKED) { | ||
2598 | unsigned long locked, lock_limit; | ||
2599 | locked = len >> PAGE_SHIFT; | ||
2600 | locked += mm->locked_vm; | ||
2601 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
2602 | lock_limit >>= PAGE_SHIFT; | ||
2603 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
2604 | return -EAGAIN; | ||
2605 | } | ||
2606 | 2608 | ||
2607 | /* | 2609 | /* |
2608 | * mm->mmap_sem is required to protect against another thread | 2610 | * mm->mmap_sem is required to protect against another thread |
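mlock_future_check() centralizes the VM_LOCKED admission test used by do_mmap_pgoff() and do_brk(): new locked pages plus what the mm already has must stay within RLIMIT_MEMLOCK unless the task has CAP_IPC_LOCK. A userspace approximation using getrlimit(); the VM_LOCKED gate is omitted and the capability check is reduced to a boolean parameter:

```c
#include <stdio.h>
#include <stdbool.h>
#include <sys/resource.h>

#define PAGE_SHIFT 12

/* Mirrors the shape of mlock_future_check(): returns 0 if the request fits,
 * -1 (standing in for -EAGAIN) if it would exceed RLIMIT_MEMLOCK. */
static int future_check(unsigned long already_locked_pages, unsigned long len,
			bool cap_ipc_lock)
{
	struct rlimit rlim;
	unsigned long locked, limit;

	if (getrlimit(RLIMIT_MEMLOCK, &rlim) != 0)
		return -1;

	locked = (len >> PAGE_SHIFT) + already_locked_pages;
	limit  = rlim.rlim_cur >> PAGE_SHIFT;	/* bytes -> pages */

	if (locked > limit && !cap_ipc_lock)
		return -1;
	return 0;
}

int main(void)
{
	printf("small request:       %d\n", future_check(0, 4096, false));
	printf("huge request:        %d\n", future_check(0, 1UL << 40, false));
	printf("huge + CAP_IPC_LOCK: %d\n", future_check(0, 1UL << 40, true));
	return 0;
}
```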
diff --git a/mm/mprotect.c b/mm/mprotect.c index bb53a6591aea..7332c1785744 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/mmu_notifier.h> | 23 | #include <linux/mmu_notifier.h> |
24 | #include <linux/migrate.h> | 24 | #include <linux/migrate.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | #include <linux/ksm.h> | ||
26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
28 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
63 | 64 | ||
64 | ptent = *pte; | 65 | ptent = *pte; |
65 | page = vm_normal_page(vma, addr, oldpte); | 66 | page = vm_normal_page(vma, addr, oldpte); |
66 | if (page) { | 67 | if (page && !PageKsm(page)) { |
67 | if (!pte_numa(oldpte)) { | 68 | if (!pte_numa(oldpte)) { |
68 | ptent = pte_mknuma(ptent); | 69 | ptent = pte_mknuma(ptent); |
69 | set_pte_at(mm, addr, pte, ptent); | 70 | set_pte_at(mm, addr, pte, ptent); |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d374655..19121ceb8874 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | |||
41 | if (limit > memblock.current_limit) | 41 | if (limit > memblock.current_limit) |
42 | limit = memblock.current_limit; | 42 | limit = memblock.current_limit; |
43 | 43 | ||
44 | addr = memblock_find_in_range_node(goal, limit, size, align, nid); | 44 | addr = memblock_find_in_range_node(size, align, goal, limit, nid); |
45 | if (!addr) | 45 | if (!addr) |
46 | return NULL; | 46 | return NULL; |
47 | 47 | ||
@@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void) | |||
117 | phys_addr_t start, end, size; | 117 | phys_addr_t start, end, size; |
118 | u64 i; | 118 | u64 i; |
119 | 119 | ||
120 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 120 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) |
121 | count += __free_memory_core(start, end); | 121 | count += __free_memory_core(start, end); |
122 | 122 | ||
123 | /* free range that is used for reserved array if we allocate it */ | 123 | /* free range that is used for reserved array if we allocate it */ |
@@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void) | |||
161 | reset_all_zones_managed_pages(); | 161 | reset_all_zones_managed_pages(); |
162 | 162 | ||
163 | /* | 163 | /* |
164 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 164 | * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id |
165 | * because in some cases, like when Node0 doesn't have RAM installed, | 165 | * because in some cases, like when Node0 doesn't have RAM installed, |
166 | * low RAM will be on Node1 | 166 | * low RAM will be on Node1 |
167 | */ | 167 | */ |
@@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, | |||
215 | 215 | ||
216 | restart: | 216 | restart: |
217 | 217 | ||
218 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | 218 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); |
219 | 219 | ||
220 | if (ptr) | 220 | if (ptr) |
221 | return ptr; | 221 | return ptr; |
@@ -299,7 +299,7 @@ again: | |||
299 | if (ptr) | 299 | if (ptr) |
300 | return ptr; | 300 | return ptr; |
301 | 301 | ||
302 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | 302 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, |
303 | goal, limit); | 303 | goal, limit); |
304 | if (ptr) | 304 | if (ptr) |
305 | return ptr; | 305 | return ptr; |
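The nobootmem.c fixes track the new memblock_find_in_range_node() argument order (size, align, start, end, nid) and the switch to NUMA_NO_NODE; with five integer-typed parameters, a transposed call still compiles. Purely as an illustration of one defensive alternative (not a proposal for the kernel API), a named-field request struct makes such call sites self-documenting:

```c
#include <stdio.h>

typedef unsigned long long phys_addr_t;

struct find_request {
	phys_addr_t size;
	phys_addr_t align;
	phys_addr_t start;
	phys_addr_t end;
	int         nid;
};

/* Toy "finder": just echoes the request so the field meanings are visible. */
static phys_addr_t find_in_range(const struct find_request *req)
{
	printf("size=%#llx align=%#llx range=[%#llx, %#llx) nid=%d\n",
	       req->size, req->align, req->start, req->end, req->nid);
	return req->start;	/* pretend the first address fits */
}

int main(void)
{
	/* Designated initializers tie each value to its meaning, so no
	 * silent swap of same-typed arguments is possible. */
	find_in_range(&(struct find_request){
		.size  = 0x1000,
		.align = 0x1000,
		.start = 0x100000,
		.end   = 0x200000,
		.nid   = -1,
	});
	return 0;
}
```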
diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; | |||
60 | struct percpu_counter vm_committed_as; | 60 | struct percpu_counter vm_committed_as; |
61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
63 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
63 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
64 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
65 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 66 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600a6163..054ff47c4478 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock); | |||
47 | #ifdef CONFIG_NUMA | 47 | #ifdef CONFIG_NUMA |
48 | /** | 48 | /** |
49 | * has_intersects_mems_allowed() - check task eligibility for kill | 49 | * has_intersects_mems_allowed() - check task eligibility for kill |
50 | * @tsk: task struct of which task to consider | 50 | * @start: task struct of which task to consider |
51 | * @mask: nodemask passed to page allocator for mempolicy ooms | 51 | * @mask: nodemask passed to page allocator for mempolicy ooms |
52 | * | 52 | * |
53 | * Task eligibility is determined by whether or not a candidate task, @tsk, | 53 | * Task eligibility is determined by whether or not a candidate task, @tsk, |
54 | * shares the same mempolicy nodes as current if it is bound by such a policy | 54 | * shares the same mempolicy nodes as current if it is bound by such a policy |
55 | * and whether or not it has the same set of allowed cpuset nodes. | 55 | * and whether or not it has the same set of allowed cpuset nodes. |
56 | */ | 56 | */ |
57 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 57 | static bool has_intersects_mems_allowed(struct task_struct *start, |
58 | const nodemask_t *mask) | 58 | const nodemask_t *mask) |
59 | { | 59 | { |
60 | struct task_struct *start = tsk; | 60 | struct task_struct *tsk; |
61 | bool ret = false; | ||
61 | 62 | ||
62 | do { | 63 | rcu_read_lock(); |
64 | for_each_thread(start, tsk) { | ||
63 | if (mask) { | 65 | if (mask) { |
64 | /* | 66 | /* |
65 | * If this is a mempolicy constrained oom, tsk's | 67 | * If this is a mempolicy constrained oom, tsk's |
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
67 | * mempolicy intersects current, otherwise it may be | 69 | * mempolicy intersects current, otherwise it may be |
68 | * needlessly killed. | 70 | * needlessly killed. |
69 | */ | 71 | */ |
70 | if (mempolicy_nodemask_intersects(tsk, mask)) | 72 | ret = mempolicy_nodemask_intersects(tsk, mask); |
71 | return true; | ||
72 | } else { | 73 | } else { |
73 | /* | 74 | /* |
74 | * This is not a mempolicy constrained oom, so only | 75 | * This is not a mempolicy constrained oom, so only |
75 | * check the mems of tsk's cpuset. | 76 | * check the mems of tsk's cpuset. |
76 | */ | 77 | */ |
77 | if (cpuset_mems_allowed_intersects(current, tsk)) | 78 | ret = cpuset_mems_allowed_intersects(current, tsk); |
78 | return true; | ||
79 | } | 79 | } |
80 | } while_each_thread(start, tsk); | 80 | if (ret) |
81 | break; | ||
82 | } | ||
83 | rcu_read_unlock(); | ||
81 | 84 | ||
82 | return false; | 85 | return ret; |
83 | } | 86 | } |
84 | #else | 87 | #else |
85 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 88 | static bool has_intersects_mems_allowed(struct task_struct *tsk, |
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
97 | */ | 100 | */ |
98 | struct task_struct *find_lock_task_mm(struct task_struct *p) | 101 | struct task_struct *find_lock_task_mm(struct task_struct *p) |
99 | { | 102 | { |
100 | struct task_struct *t = p; | 103 | struct task_struct *t; |
101 | 104 | ||
102 | do { | 105 | rcu_read_lock(); |
106 | |||
107 | for_each_thread(p, t) { | ||
103 | task_lock(t); | 108 | task_lock(t); |
104 | if (likely(t->mm)) | 109 | if (likely(t->mm)) |
105 | return t; | 110 | goto found; |
106 | task_unlock(t); | 111 | task_unlock(t); |
107 | } while_each_thread(p, t); | 112 | } |
113 | t = NULL; | ||
114 | found: | ||
115 | rcu_read_unlock(); | ||
108 | 116 | ||
109 | return NULL; | 117 | return t; |
110 | } | 118 | } |
111 | 119 | ||
112 | /* return true if the task is not adequate as candidate victim task. */ | 120 | /* return true if the task is not adequate as candidate victim task. */ |
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
301 | unsigned long chosen_points = 0; | 309 | unsigned long chosen_points = 0; |
302 | 310 | ||
303 | rcu_read_lock(); | 311 | rcu_read_lock(); |
304 | do_each_thread(g, p) { | 312 | for_each_process_thread(g, p) { |
305 | unsigned int points; | 313 | unsigned int points; |
306 | 314 | ||
307 | switch (oom_scan_process_thread(p, totalpages, nodemask, | 315 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
@@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
323 | chosen = p; | 331 | chosen = p; |
324 | chosen_points = points; | 332 | chosen_points = points; |
325 | } | 333 | } |
326 | } while_each_thread(g, p); | 334 | } |
327 | if (chosen) | 335 | if (chosen) |
328 | get_task_struct(chosen); | 336 | get_task_struct(chosen); |
329 | rcu_read_unlock(); | 337 | rcu_read_unlock(); |
@@ -406,7 +414,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
406 | { | 414 | { |
407 | struct task_struct *victim = p; | 415 | struct task_struct *victim = p; |
408 | struct task_struct *child; | 416 | struct task_struct *child; |
409 | struct task_struct *t = p; | 417 | struct task_struct *t; |
410 | struct mm_struct *mm; | 418 | struct mm_struct *mm; |
411 | unsigned int victim_points = 0; | 419 | unsigned int victim_points = 0; |
412 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 420 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
@@ -437,7 +445,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
437 | * still freeing memory. | 445 | * still freeing memory. |
438 | */ | 446 | */ |
439 | read_lock(&tasklist_lock); | 447 | read_lock(&tasklist_lock); |
440 | do { | 448 | for_each_thread(p, t) { |
441 | list_for_each_entry(child, &t->children, sibling) { | 449 | list_for_each_entry(child, &t->children, sibling) { |
442 | unsigned int child_points; | 450 | unsigned int child_points; |
443 | 451 | ||
@@ -455,13 +463,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
455 | get_task_struct(victim); | 463 | get_task_struct(victim); |
456 | } | 464 | } |
457 | } | 465 | } |
458 | } while_each_thread(p, t); | 466 | } |
459 | read_unlock(&tasklist_lock); | 467 | read_unlock(&tasklist_lock); |
460 | 468 | ||
461 | rcu_read_lock(); | ||
462 | p = find_lock_task_mm(victim); | 469 | p = find_lock_task_mm(victim); |
463 | if (!p) { | 470 | if (!p) { |
464 | rcu_read_unlock(); | ||
465 | put_task_struct(victim); | 471 | put_task_struct(victim); |
466 | return; | 472 | return; |
467 | } else if (victim != p) { | 473 | } else if (victim != p) { |
@@ -487,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
487 | * That thread will now get access to memory reserves since it has a | 493 | * That thread will now get access to memory reserves since it has a |
488 | * pending fatal signal. | 494 | * pending fatal signal. |
489 | */ | 495 | */ |
496 | rcu_read_lock(); | ||
490 | for_each_process(p) | 497 | for_each_process(p) |
491 | if (p->mm == mm && !same_thread_group(p, victim) && | 498 | if (p->mm == mm && !same_thread_group(p, victim) && |
492 | !(p->flags & PF_KTHREAD)) { | 499 | !(p->flags & PF_KTHREAD)) { |
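The oom_kill.c conversions replace the open-coded do { } while_each_thread() loops with for_each_thread()/for_each_process_thread() under rcu_read_lock(), and find_lock_task_mm() still returns the first thread that has an mm with that thread's task lock held. A simplified userspace model of the "return with the element's lock held" pattern (the task array, per-task mutex, and mm pointer are all mocked up):

```c
#include <pthread.h>
#include <stdio.h>
#include <stddef.h>

struct task {
	pthread_mutex_t lock;
	void *mm;		/* NULL once the thread has released its mm */
};

/* Return the first task that still owns an mm, with its lock held;
 * the caller must unlock it.  NULL if every thread already lost its mm. */
static struct task *find_lock_task_mm(struct task *threads, int n)
{
	for (int i = 0; i < n; i++) {
		pthread_mutex_lock(&threads[i].lock);
		if (threads[i].mm)
			return &threads[i];	/* lock intentionally kept */
		pthread_mutex_unlock(&threads[i].lock);
	}
	return NULL;
}

int main(void)
{
	int fake_mm;
	struct task threads[3] = {
		{ PTHREAD_MUTEX_INITIALIZER, NULL },
		{ PTHREAD_MUTEX_INITIALIZER, &fake_mm },
		{ PTHREAD_MUTEX_INITIALIZER, &fake_mm },
	};
	struct task *t = find_lock_task_mm(threads, 3);

	if (t) {
		printf("picked thread %ld\n", (long)(t - threads));	/* 1 */
		pthread_mutex_unlock(&t->lock);
	}
	return 0;
}
```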
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe070aa4..533e2147d14f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -2072,13 +2072,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
2072 | return; | 2072 | return; |
2073 | 2073 | ||
2074 | /* | 2074 | /* |
2075 | * Walking all memory to count page types is very expensive and should | ||
2076 | * be inhibited in non-blockable contexts. | ||
2077 | */ | ||
2078 | if (!(gfp_mask & __GFP_WAIT)) | ||
2079 | filter |= SHOW_MEM_FILTER_PAGE_COUNT; | ||
2080 | |||
2081 | /* | ||
2082 | * This documents exceptions given to allocations in certain | 2075 | * This documents exceptions given to allocations in certain |
2083 | * contexts that are allowed to allocate outside current's set | 2076 | * contexts that are allowed to allocate outside current's set |
2084 | * of allowed nodes. | 2077 | * of allowed nodes. |
@@ -2242,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2242 | preferred_zone, migratetype); | 2235 | preferred_zone, migratetype); |
2243 | if (page) { | 2236 | if (page) { |
2244 | preferred_zone->compact_blockskip_flush = false; | 2237 | preferred_zone->compact_blockskip_flush = false; |
2245 | preferred_zone->compact_considered = 0; | 2238 | compaction_defer_reset(preferred_zone, order, true); |
2246 | preferred_zone->compact_defer_shift = 0; | ||
2247 | if (order >= preferred_zone->compact_order_failed) | ||
2248 | preferred_zone->compact_order_failed = order + 1; | ||
2249 | count_vm_event(COMPACTSUCCESS); | 2239 | count_vm_event(COMPACTSUCCESS); |
2250 | return page; | 2240 | return page; |
2251 | } | 2241 | } |
@@ -2535,8 +2525,15 @@ rebalance: | |||
2535 | } | 2525 | } |
2536 | 2526 | ||
2537 | /* Atomic allocations - we can't balance anything */ | 2527 | /* Atomic allocations - we can't balance anything */ |
2538 | if (!wait) | 2528 | if (!wait) { |
2529 | /* | ||
2530 | * All existing users of the deprecated __GFP_NOFAIL are | ||
2531 | * blockable, so warn of any new users that actually allow this | ||
2532 | * type of allocation to fail. | ||
2533 | */ | ||
2534 | WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); | ||
2539 | goto nopage; | 2535 | goto nopage; |
2536 | } | ||
2540 | 2537 | ||
2541 | /* Avoid recursion of direct reclaim */ | 2538 | /* Avoid recursion of direct reclaim */ |
2542 | if (current->flags & PF_MEMALLOC) | 2539 | if (current->flags & PF_MEMALLOC) |
@@ -3901,6 +3898,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3901 | struct page *page; | 3898 | struct page *page; |
3902 | unsigned long block_migratetype; | 3899 | unsigned long block_migratetype; |
3903 | int reserve; | 3900 | int reserve; |
3901 | int old_reserve; | ||
3904 | 3902 | ||
3905 | /* | 3903 | /* |
3906 | * Get the start pfn, end pfn and the number of blocks to reserve | 3904 | * Get the start pfn, end pfn and the number of blocks to reserve |
@@ -3922,6 +3920,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3922 | * future allocation of hugepages at runtime. | 3920 | * future allocation of hugepages at runtime. |
3923 | */ | 3921 | */ |
3924 | reserve = min(2, reserve); | 3922 | reserve = min(2, reserve); |
3923 | old_reserve = zone->nr_migrate_reserve_block; | ||
3924 | |||
3925 | /* When memory hot-add, we almost always need to do nothing */ | ||
3926 | if (reserve == old_reserve) | ||
3927 | return; | ||
3928 | zone->nr_migrate_reserve_block = reserve; | ||
3925 | 3929 | ||
3926 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 3930 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
3927 | if (!pfn_valid(pfn)) | 3931 | if (!pfn_valid(pfn)) |
@@ -3959,6 +3963,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3959 | reserve--; | 3963 | reserve--; |
3960 | continue; | 3964 | continue; |
3961 | } | 3965 | } |
3966 | } else if (!old_reserve) { | ||
3967 | /* | ||
3968 | * At boot time we don't need to scan the whole zone | ||
3969 | * for turning off MIGRATE_RESERVE. | ||
3970 | */ | ||
3971 | break; | ||
3962 | } | 3972 | } |
3963 | 3973 | ||
3964 | /* | 3974 | /* |
@@ -4209,7 +4219,6 @@ static noinline __init_refok | |||
4209 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 4219 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
4210 | { | 4220 | { |
4211 | int i; | 4221 | int i; |
4212 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
4213 | size_t alloc_size; | 4222 | size_t alloc_size; |
4214 | 4223 | ||
4215 | /* | 4224 | /* |
@@ -4225,7 +4234,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
4225 | 4234 | ||
4226 | if (!slab_is_available()) { | 4235 | if (!slab_is_available()) { |
4227 | zone->wait_table = (wait_queue_head_t *) | 4236 | zone->wait_table = (wait_queue_head_t *) |
4228 | alloc_bootmem_node_nopanic(pgdat, alloc_size); | 4237 | memblock_virt_alloc_node_nopanic( |
4238 | alloc_size, zone->zone_pgdat->node_id); | ||
4229 | } else { | 4239 | } else { |
4230 | /* | 4240 | /* |
4231 | * This case means that a zone whose size was 0 gets new memory | 4241 | * This case means that a zone whose size was 0 gets new memory |
@@ -4345,13 +4355,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | |||
4345 | #endif | 4355 | #endif |
4346 | 4356 | ||
4347 | /** | 4357 | /** |
4348 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 4358 | * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range |
4349 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4359 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4350 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 4360 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
4351 | * | 4361 | * |
4352 | * If an architecture guarantees that all ranges registered with | 4362 | * If an architecture guarantees that all ranges registered with |
4353 | * add_active_ranges() contain no holes and may be freed, this | 4363 | * add_active_ranges() contain no holes and may be freed, this |
4354 | * this function may be used instead of calling free_bootmem() manually. | 4364 | * this function may be used instead of calling memblock_free_early_nid() |
4365 | * manually. | ||
4355 | */ | 4366 | */ |
4356 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4367 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4357 | { | 4368 | { |
@@ -4363,9 +4374,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
4363 | end_pfn = min(end_pfn, max_low_pfn); | 4374 | end_pfn = min(end_pfn, max_low_pfn); |
4364 | 4375 | ||
4365 | if (start_pfn < end_pfn) | 4376 | if (start_pfn < end_pfn) |
4366 | free_bootmem_node(NODE_DATA(this_nid), | 4377 | memblock_free_early_nid(PFN_PHYS(start_pfn), |
4367 | PFN_PHYS(start_pfn), | 4378 | (end_pfn - start_pfn) << PAGE_SHIFT, |
4368 | (end_pfn - start_pfn) << PAGE_SHIFT); | 4379 | this_nid); |
4369 | } | 4380 | } |
4370 | } | 4381 | } |
4371 | 4382 | ||
@@ -4636,8 +4647,9 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
4636 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); | 4647 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
4637 | zone->pageblock_flags = NULL; | 4648 | zone->pageblock_flags = NULL; |
4638 | if (usemapsize) | 4649 | if (usemapsize) |
4639 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, | 4650 | zone->pageblock_flags = |
4640 | usemapsize); | 4651 | memblock_virt_alloc_node_nopanic(usemapsize, |
4652 | pgdat->node_id); | ||
4641 | } | 4653 | } |
4642 | #else | 4654 | #else |
4643 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, | 4655 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
@@ -4831,7 +4843,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
4831 | size = (end - start) * sizeof(struct page); | 4843 | size = (end - start) * sizeof(struct page); |
4832 | map = alloc_remap(pgdat->node_id, size); | 4844 | map = alloc_remap(pgdat->node_id, size); |
4833 | if (!map) | 4845 | if (!map) |
4834 | map = alloc_bootmem_node_nopanic(pgdat, size); | 4846 | map = memblock_virt_alloc_node_nopanic(size, |
4847 | pgdat->node_id); | ||
4835 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4848 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
4836 | } | 4849 | } |
4837 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4850 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -5012,9 +5025,33 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5012 | nodemask_t saved_node_state = node_states[N_MEMORY]; | 5025 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
5013 | unsigned long totalpages = early_calculate_totalpages(); | 5026 | unsigned long totalpages = early_calculate_totalpages(); |
5014 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); | 5027 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
5028 | struct memblock_type *type = &memblock.memory; | ||
5029 | |||
5030 | /* Need to find movable_zone earlier when movable_node is specified. */ | ||
5031 | find_usable_zone_for_movable(); | ||
5032 | |||
5033 | /* | ||
5034 | * If movable_node is specified, ignore kernelcore and movablecore | ||
5035 | * options. | ||
5036 | */ | ||
5037 | if (movable_node_is_enabled()) { | ||
5038 | for (i = 0; i < type->cnt; i++) { | ||
5039 | if (!memblock_is_hotpluggable(&type->regions[i])) | ||
5040 | continue; | ||
5041 | |||
5042 | nid = type->regions[i].nid; | ||
5043 | |||
5044 | usable_startpfn = PFN_DOWN(type->regions[i].base); | ||
5045 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | ||
5046 | min(usable_startpfn, zone_movable_pfn[nid]) : | ||
5047 | usable_startpfn; | ||
5048 | } | ||
5049 | |||
5050 | goto out2; | ||
5051 | } | ||
5015 | 5052 | ||
5016 | /* | 5053 | /* |
5017 | * If movablecore was specified, calculate what size of | 5054 | * If movablecore=nn[KMG] was specified, calculate what size of |
5018 | * kernelcore that corresponds so that memory usable for | 5055 | * kernelcore that corresponds so that memory usable for |
5019 | * any allocation type is evenly spread. If both kernelcore | 5056 | * any allocation type is evenly spread. If both kernelcore |
5020 | * and movablecore are specified, then the value of kernelcore | 5057 | * and movablecore are specified, then the value of kernelcore |
@@ -5040,7 +5077,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5040 | goto out; | 5077 | goto out; |
5041 | 5078 | ||
5042 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 5079 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
5043 | find_usable_zone_for_movable(); | ||
5044 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 5080 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
5045 | 5081 | ||
5046 | restart: | 5082 | restart: |
@@ -5131,6 +5167,7 @@ restart: | |||
5131 | if (usable_nodes && required_kernelcore > usable_nodes) | 5167 | if (usable_nodes && required_kernelcore > usable_nodes) |
5132 | goto restart; | 5168 | goto restart; |
5133 | 5169 | ||
5170 | out2: | ||
5134 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 5171 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
5135 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 5172 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
5136 | zone_movable_pfn[nid] = | 5173 | zone_movable_pfn[nid] = |
@@ -5857,7 +5894,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5857 | do { | 5894 | do { |
5858 | size = bucketsize << log2qty; | 5895 | size = bucketsize << log2qty; |
5859 | if (flags & HASH_EARLY) | 5896 | if (flags & HASH_EARLY) |
5860 | table = alloc_bootmem_nopanic(size); | 5897 | table = memblock_virt_alloc_nopanic(size, 0); |
5861 | else if (hashdist) | 5898 | else if (hashdist) |
5862 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 5899 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
5863 | else { | 5900 | else { |
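Among the page_alloc.c changes, find_zone_movable_pfns_for_nodes() now honors movable_node by walking memblock.memory and recording, per node, the lowest starting PFN of any hotpluggable region; everything from there up becomes ZONE_MOVABLE. A small sketch of that per-node minimum computation (the region table and node count are invented):

```c
#include <stdio.h>

#define MAX_NODES  4
#define PAGE_SHIFT 12

struct region {
	unsigned long long base;
	int nid;
	int hotpluggable;
};

int main(void)
{
	/* Imaginary memblock.memory contents. */
	struct region regions[] = {
		{ 0x000000000ULL, 0, 0 },	/* boot memory, never movable */
		{ 0x100000000ULL, 1, 1 },	/* hotpluggable on node 1 */
		{ 0x180000000ULL, 1, 1 },	/* second hotpluggable range, node 1 */
		{ 0x200000000ULL, 2, 1 },	/* hotpluggable on node 2 */
	};
	unsigned long zone_movable_pfn[MAX_NODES] = { 0 };	/* 0 = unset */

	for (unsigned int i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
		unsigned long start_pfn;
		int nid;

		if (!regions[i].hotpluggable)
			continue;

		nid = regions[i].nid;
		start_pfn = regions[i].base >> PAGE_SHIFT;	/* ~ PFN_DOWN() */

		/* Keep the lowest hotpluggable start seen for this node. */
		if (!zone_movable_pfn[nid] || start_pfn < zone_movable_pfn[nid])
			zone_movable_pfn[nid] = start_pfn;
	}

	for (int nid = 0; nid < MAX_NODES; nid++)
		if (zone_movable_pfn[nid])
			printf("node %d: ZONE_MOVABLE starts at pfn %#lx\n",
			       nid, zone_movable_pfn[nid]);
	return 0;
}
```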
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3bd0b8e6ab12..cfd162882c00 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) | |||
54 | 54 | ||
55 | table_size = sizeof(struct page_cgroup) * nr_pages; | 55 | table_size = sizeof(struct page_cgroup) * nr_pages; |
56 | 56 | ||
57 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | 57 | base = memblock_virt_alloc_try_nid_nopanic( |
58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
59 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
59 | if (!base) | 60 | if (!base) |
60 | return -ENOMEM; | 61 | return -ENOMEM; |
61 | NODE_DATA(nid)->node_page_cgroup = base; | 62 | NODE_DATA(nid)->node_page_cgroup = base; |
diff --git a/mm/percpu.c b/mm/percpu.c index afbf352ae580..036cfe07050f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
1063 | __alignof__(ai->groups[0].cpu_map[0])); | 1063 | __alignof__(ai->groups[0].cpu_map[0])); |
1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); | 1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); |
1065 | 1065 | ||
1066 | ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); | 1066 | ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); |
1067 | if (!ptr) | 1067 | if (!ptr) |
1068 | return NULL; | 1068 | return NULL; |
1069 | ai = ptr; | 1069 | ai = ptr; |
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
1088 | */ | 1088 | */ |
1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) | 1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) |
1090 | { | 1090 | { |
1091 | free_bootmem(__pa(ai), ai->__ai_size); | 1091 | memblock_free_early(__pa(ai), ai->__ai_size); |
1092 | } | 1092 | } |
1093 | 1093 | ||
1094 | /** | 1094 | /** |
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); | 1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); |
1247 | 1247 | ||
1248 | /* process group information and build config tables accordingly */ | 1248 | /* process group information and build config tables accordingly */ |
1249 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); | 1249 | group_offsets = memblock_virt_alloc(ai->nr_groups * |
1250 | group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); | 1250 | sizeof(group_offsets[0]), 0); |
1251 | unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); | 1251 | group_sizes = memblock_virt_alloc(ai->nr_groups * |
1252 | unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); | 1252 | sizeof(group_sizes[0]), 0); |
1253 | unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); | ||
1254 | unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); | ||
1253 | 1255 | ||
1254 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) | 1256 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
1255 | unit_map[cpu] = UINT_MAX; | 1257 | unit_map[cpu] = UINT_MAX; |
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1311 | * empty chunks. | 1313 | * empty chunks. |
1312 | */ | 1314 | */ |
1313 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; | 1315 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; |
1314 | pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); | 1316 | pcpu_slot = memblock_virt_alloc( |
1317 | pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); | ||
1315 | for (i = 0; i < pcpu_nr_slots; i++) | 1318 | for (i = 0; i < pcpu_nr_slots; i++) |
1316 | INIT_LIST_HEAD(&pcpu_slot[i]); | 1319 | INIT_LIST_HEAD(&pcpu_slot[i]); |
1317 | 1320 | ||
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1322 | * covers static area + reserved area (mostly used for module | 1325 | * covers static area + reserved area (mostly used for module |
1323 | * static percpu allocation). | 1326 | * static percpu allocation). |
1324 | */ | 1327 | */ |
1325 | schunk = alloc_bootmem(pcpu_chunk_struct_size); | 1328 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1326 | INIT_LIST_HEAD(&schunk->list); | 1329 | INIT_LIST_HEAD(&schunk->list); |
1327 | schunk->base_addr = base_addr; | 1330 | schunk->base_addr = base_addr; |
1328 | schunk->map = smap; | 1331 | schunk->map = smap; |
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1346 | 1349 | ||
1347 | /* init dynamic chunk if necessary */ | 1350 | /* init dynamic chunk if necessary */ |
1348 | if (dyn_size) { | 1351 | if (dyn_size) { |
1349 | dchunk = alloc_bootmem(pcpu_chunk_struct_size); | 1352 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1350 | INIT_LIST_HEAD(&dchunk->list); | 1353 | INIT_LIST_HEAD(&dchunk->list); |
1351 | dchunk->base_addr = base_addr; | 1354 | dchunk->base_addr = base_addr; |
1352 | dchunk->map = dmap; | 1355 | dchunk->map = dmap; |
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1626 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; | 1629 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; |
1627 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); | 1630 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); |
1628 | 1631 | ||
1629 | areas = alloc_bootmem_nopanic(areas_size); | 1632 | areas = memblock_virt_alloc_nopanic(areas_size, 0); |
1630 | if (!areas) { | 1633 | if (!areas) { |
1631 | rc = -ENOMEM; | 1634 | rc = -ENOMEM; |
1632 | goto out_free; | 1635 | goto out_free; |
@@ -1712,7 +1715,7 @@ out_free_areas: | |||
1712 | out_free: | 1715 | out_free: |
1713 | pcpu_free_alloc_info(ai); | 1716 | pcpu_free_alloc_info(ai); |
1714 | if (areas) | 1717 | if (areas) |
1715 | free_bootmem(__pa(areas), areas_size); | 1718 | memblock_free_early(__pa(areas), areas_size); |
1716 | return rc; | 1719 | return rc; |
1717 | } | 1720 | } |
1718 | #endif /* BUILD_EMBED_FIRST_CHUNK */ | 1721 | #endif /* BUILD_EMBED_FIRST_CHUNK */ |
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, | |||
1760 | /* unaligned allocations can't be freed, round up to page size */ | 1763 | /* unaligned allocations can't be freed, round up to page size */ |
1761 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * | 1764 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * |
1762 | sizeof(pages[0])); | 1765 | sizeof(pages[0])); |
1763 | pages = alloc_bootmem(pages_size); | 1766 | pages = memblock_virt_alloc(pages_size, 0); |
1764 | 1767 | ||
1765 | /* allocate pages */ | 1768 | /* allocate pages */ |
1766 | j = 0; | 1769 | j = 0; |
@@ -1823,7 +1826,7 @@ enomem: | |||
1823 | free_fn(page_address(pages[j]), PAGE_SIZE); | 1826 | free_fn(page_address(pages[j]), PAGE_SIZE); |
1824 | rc = -ENOMEM; | 1827 | rc = -ENOMEM; |
1825 | out_free_ar: | 1828 | out_free_ar: |
1826 | free_bootmem(__pa(pages), pages_size); | 1829 | memblock_free_early(__pa(pages), pages_size); |
1827 | pcpu_free_alloc_info(ai); | 1830 | pcpu_free_alloc_info(ai); |
1828 | return rc; | 1831 | return rc; |
1829 | } | 1832 | } |
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset); | |||
1848 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, | 1851 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, |
1849 | size_t align) | 1852 | size_t align) |
1850 | { | 1853 | { |
1851 | return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); | 1854 | return memblock_virt_alloc_from_nopanic( |
1855 | size, align, __pa(MAX_DMA_ADDRESS)); | ||
1852 | } | 1856 | } |
1853 | 1857 | ||
1854 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) | 1858 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) |
1855 | { | 1859 | { |
1856 | free_bootmem(__pa(ptr), size); | 1860 | memblock_free_early(__pa(ptr), size); |
1857 | } | 1861 | } |
1858 | 1862 | ||
1859 | void __init setup_per_cpu_areas(void) | 1863 | void __init setup_per_cpu_areas(void) |
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void) | |||
1896 | void *fc; | 1900 | void *fc; |
1897 | 1901 | ||
1898 | ai = pcpu_alloc_alloc_info(1, 1); | 1902 | ai = pcpu_alloc_alloc_info(1, 1); |
1899 | fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 1903 | fc = memblock_virt_alloc_from_nopanic(unit_size, |
1904 | PAGE_SIZE, | ||
1905 | __pa(MAX_DMA_ADDRESS)); | ||
1900 | if (!ai || !fc) | 1906 | if (!ai || !fc) |
1901 | panic("Failed to allocate memory for percpu areas."); | 1907 | panic("Failed to allocate memory for percpu areas."); |
1902 | /* kmemleak tracks the percpu allocations separately */ | 1908 | /* kmemleak tracks the percpu allocations separately */ |
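Taken together, the mm/percpu.c hunks are a one-for-one substitution rather than a behavioural change. A rough mapping, followed by a sketch of the allocate/free pair the default first-chunk code now uses (sketch only; helper names are illustrative):

/*
 * alloc_bootmem(sz)                      ->  memblock_virt_alloc(sz, 0)
 * alloc_bootmem_nopanic(sz)              ->  memblock_virt_alloc_nopanic(sz, 0)
 * __alloc_bootmem_nopanic(sz, al, goal)  ->  memblock_virt_alloc_from_nopanic(sz, al, goal)
 * free_bootmem(__pa(p), sz)              ->  memblock_free_early(__pa(p), sz)
 *
 * An alignment of 0 lets memblock fall back to its default
 * (SMP_CACHE_BYTES), which is what the bootmem calls used implicitly.
 */
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <asm/dma.h>

static void * __init example_pcpu_fc_alloc(unsigned int cpu, size_t size,
					   size_t align)
{
	/* NULL on failure; prefer addresses above the DMA region, as before */
	return memblock_virt_alloc_from_nopanic(size, align,
						__pa(MAX_DMA_ADDRESS));
}

static void __init example_pcpu_fc_free(void *ptr, size_t size)
{
	memblock_free_early(__pa(ptr), size);
}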
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
660 | return 1; | 660 | return 1; |
661 | } | 661 | } |
662 | 662 | ||
663 | struct page_referenced_arg { | ||
664 | int mapcount; | ||
665 | int referenced; | ||
666 | unsigned long vm_flags; | ||
667 | struct mem_cgroup *memcg; | ||
668 | }; | ||
663 | /* | 669 | /* |
664 | * Subfunctions of page_referenced: page_referenced_one called | 670 | * arg: page_referenced_arg will be passed |
665 | * repeatedly from either page_referenced_anon or page_referenced_file. | ||
666 | */ | 671 | */ |
667 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 672 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
668 | unsigned long address, unsigned int *mapcount, | 673 | unsigned long address, void *arg) |
669 | unsigned long *vm_flags) | ||
670 | { | 674 | { |
671 | struct mm_struct *mm = vma->vm_mm; | 675 | struct mm_struct *mm = vma->vm_mm; |
672 | spinlock_t *ptl; | 676 | spinlock_t *ptl; |
673 | int referenced = 0; | 677 | int referenced = 0; |
678 | struct page_referenced_arg *pra = arg; | ||
674 | 679 | ||
675 | if (unlikely(PageTransHuge(page))) { | 680 | if (unlikely(PageTransHuge(page))) { |
676 | pmd_t *pmd; | 681 | pmd_t *pmd; |
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
682 | pmd = page_check_address_pmd(page, mm, address, | 687 | pmd = page_check_address_pmd(page, mm, address, |
683 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); | 688 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); |
684 | if (!pmd) | 689 | if (!pmd) |
685 | goto out; | 690 | return SWAP_AGAIN; |
686 | 691 | ||
687 | if (vma->vm_flags & VM_LOCKED) { | 692 | if (vma->vm_flags & VM_LOCKED) { |
688 | spin_unlock(ptl); | 693 | spin_unlock(ptl); |
689 | *mapcount = 0; /* break early from loop */ | 694 | pra->vm_flags |= VM_LOCKED; |
690 | *vm_flags |= VM_LOCKED; | 695 | return SWAP_FAIL; /* To break the loop */ |
691 | goto out; | ||
692 | } | 696 | } |
693 | 697 | ||
694 | /* go ahead even if the pmd is pmd_trans_splitting() */ | 698 | /* go ahead even if the pmd is pmd_trans_splitting() */ |
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
704 | */ | 708 | */ |
705 | pte = page_check_address(page, mm, address, &ptl, 0); | 709 | pte = page_check_address(page, mm, address, &ptl, 0); |
706 | if (!pte) | 710 | if (!pte) |
707 | goto out; | 711 | return SWAP_AGAIN; |
708 | 712 | ||
709 | if (vma->vm_flags & VM_LOCKED) { | 713 | if (vma->vm_flags & VM_LOCKED) { |
710 | pte_unmap_unlock(pte, ptl); | 714 | pte_unmap_unlock(pte, ptl); |
711 | *mapcount = 0; /* break early from loop */ | 715 | pra->vm_flags |= VM_LOCKED; |
712 | *vm_flags |= VM_LOCKED; | 716 | return SWAP_FAIL; /* To break the loop */ |
713 | goto out; | ||
714 | } | 717 | } |
715 | 718 | ||
716 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 719 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
727 | pte_unmap_unlock(pte, ptl); | 730 | pte_unmap_unlock(pte, ptl); |
728 | } | 731 | } |
729 | 732 | ||
730 | (*mapcount)--; | 733 | if (referenced) { |
731 | 734 | pra->referenced++; | |
732 | if (referenced) | 735 | pra->vm_flags |= vma->vm_flags; |
733 | *vm_flags |= vma->vm_flags; | ||
734 | out: | ||
735 | return referenced; | ||
736 | } | ||
737 | |||
738 | static int page_referenced_anon(struct page *page, | ||
739 | struct mem_cgroup *memcg, | ||
740 | unsigned long *vm_flags) | ||
741 | { | ||
742 | unsigned int mapcount; | ||
743 | struct anon_vma *anon_vma; | ||
744 | pgoff_t pgoff; | ||
745 | struct anon_vma_chain *avc; | ||
746 | int referenced = 0; | ||
747 | |||
748 | anon_vma = page_lock_anon_vma_read(page); | ||
749 | if (!anon_vma) | ||
750 | return referenced; | ||
751 | |||
752 | mapcount = page_mapcount(page); | ||
753 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
754 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
755 | struct vm_area_struct *vma = avc->vma; | ||
756 | unsigned long address = vma_address(page, vma); | ||
757 | /* | ||
758 | * If we are reclaiming on behalf of a cgroup, skip | ||
759 | * counting on behalf of references from different | ||
760 | * cgroups | ||
761 | */ | ||
762 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
763 | continue; | ||
764 | referenced += page_referenced_one(page, vma, address, | ||
765 | &mapcount, vm_flags); | ||
766 | if (!mapcount) | ||
767 | break; | ||
768 | } | 736 | } |
769 | 737 | ||
770 | page_unlock_anon_vma_read(anon_vma); | 738 | pra->mapcount--; |
771 | return referenced; | 739 | if (!pra->mapcount) |
740 | return SWAP_SUCCESS; /* To break the loop */ | ||
741 | |||
742 | return SWAP_AGAIN; | ||
772 | } | 743 | } |
773 | 744 | ||
774 | /** | 745 | static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) |
775 | * page_referenced_file - referenced check for object-based rmap | ||
776 | * @page: the page we're checking references on. | ||
777 | * @memcg: target memory control group | ||
778 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
779 | * | ||
780 | * For an object-based mapped page, find all the places it is mapped and | ||
781 | * check/clear the referenced flag. This is done by following the page->mapping | ||
782 | * pointer, then walking the chain of vmas it holds. It returns the number | ||
783 | * of references it found. | ||
784 | * | ||
785 | * This function is only called from page_referenced for object-based pages. | ||
786 | */ | ||
787 | static int page_referenced_file(struct page *page, | ||
788 | struct mem_cgroup *memcg, | ||
789 | unsigned long *vm_flags) | ||
790 | { | 746 | { |
791 | unsigned int mapcount; | 747 | struct page_referenced_arg *pra = arg; |
792 | struct address_space *mapping = page->mapping; | 748 | struct mem_cgroup *memcg = pra->memcg; |
793 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
794 | struct vm_area_struct *vma; | ||
795 | int referenced = 0; | ||
796 | |||
797 | /* | ||
798 | * The caller's checks on page->mapping and !PageAnon have made | ||
799 | * sure that this is a file page: the check for page->mapping | ||
800 | * excludes the case just before it gets set on an anon page. | ||
801 | */ | ||
802 | BUG_ON(PageAnon(page)); | ||
803 | |||
804 | /* | ||
805 | * The page lock not only makes sure that page->mapping cannot | ||
806 | * suddenly be NULLified by truncation, it makes sure that the | ||
807 | * structure at mapping cannot be freed and reused yet, | ||
808 | * so we can safely take mapping->i_mmap_mutex. | ||
809 | */ | ||
810 | BUG_ON(!PageLocked(page)); | ||
811 | |||
812 | mutex_lock(&mapping->i_mmap_mutex); | ||
813 | 749 | ||
814 | /* | 750 | if (!mm_match_cgroup(vma->vm_mm, memcg)) |
815 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount | 751 | return true; |
816 | * is more likely to be accurate if we note it after spinning. | ||
817 | */ | ||
818 | mapcount = page_mapcount(page); | ||
819 | |||
820 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
821 | unsigned long address = vma_address(page, vma); | ||
822 | /* | ||
823 | * If we are reclaiming on behalf of a cgroup, skip | ||
824 | * counting on behalf of references from different | ||
825 | * cgroups | ||
826 | */ | ||
827 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
828 | continue; | ||
829 | referenced += page_referenced_one(page, vma, address, | ||
830 | &mapcount, vm_flags); | ||
831 | if (!mapcount) | ||
832 | break; | ||
833 | } | ||
834 | 752 | ||
835 | mutex_unlock(&mapping->i_mmap_mutex); | 753 | return false; |
836 | return referenced; | ||
837 | } | 754 | } |
838 | 755 | ||
839 | /** | 756 | /** |
@@ -851,41 +768,57 @@ int page_referenced(struct page *page, | |||
851 | struct mem_cgroup *memcg, | 768 | struct mem_cgroup *memcg, |
852 | unsigned long *vm_flags) | 769 | unsigned long *vm_flags) |
853 | { | 770 | { |
854 | int referenced = 0; | 771 | int ret; |
855 | int we_locked = 0; | 772 | int we_locked = 0; |
773 | struct page_referenced_arg pra = { | ||
774 | .mapcount = page_mapcount(page), | ||
775 | .memcg = memcg, | ||
776 | }; | ||
777 | struct rmap_walk_control rwc = { | ||
778 | .rmap_one = page_referenced_one, | ||
779 | .arg = (void *)&pra, | ||
780 | .anon_lock = page_lock_anon_vma_read, | ||
781 | }; | ||
856 | 782 | ||
857 | *vm_flags = 0; | 783 | *vm_flags = 0; |
858 | if (page_mapped(page) && page_rmapping(page)) { | 784 | if (!page_mapped(page)) |
859 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 785 | return 0; |
860 | we_locked = trylock_page(page); | 786 | |
861 | if (!we_locked) { | 787 | if (!page_rmapping(page)) |
862 | referenced++; | 788 | return 0; |
863 | goto out; | 789 | |
864 | } | 790 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
865 | } | 791 | we_locked = trylock_page(page); |
866 | if (unlikely(PageKsm(page))) | 792 | if (!we_locked) |
867 | referenced += page_referenced_ksm(page, memcg, | 793 | return 1; |
868 | vm_flags); | ||
869 | else if (PageAnon(page)) | ||
870 | referenced += page_referenced_anon(page, memcg, | ||
871 | vm_flags); | ||
872 | else if (page->mapping) | ||
873 | referenced += page_referenced_file(page, memcg, | ||
874 | vm_flags); | ||
875 | if (we_locked) | ||
876 | unlock_page(page); | ||
877 | } | 794 | } |
878 | out: | 795 | |
879 | return referenced; | 796 | /* |
797 | * If we are reclaiming on behalf of a cgroup, skip | ||
798 | * counting on behalf of references from different | ||
799 | * cgroups | ||
800 | */ | ||
801 | if (memcg) { | ||
802 | rwc.invalid_vma = invalid_page_referenced_vma; | ||
803 | } | ||
804 | |||
805 | ret = rmap_walk(page, &rwc); | ||
806 | *vm_flags = pra.vm_flags; | ||
807 | |||
808 | if (we_locked) | ||
809 | unlock_page(page); | ||
810 | |||
811 | return pra.referenced; | ||
880 | } | 812 | } |
881 | 813 | ||
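After this rewrite, page_referenced() is only a thin driver around rmap_walk(): the per-VMA work lives in the ->rmap_one callback, shared state travels through ->arg, and returning anything other than SWAP_AGAIN (or having ->done() report completion) stops the walk. A minimal caller-side sketch of the same pattern; the callback and its state struct are illustrative and not part of the patch:

#include <linux/mm.h>
#include <linux/rmap.h>		/* rmap_walk_control, SWAP_AGAIN, ... */

struct example_walk_state {
	int hits;			/* private state handed around via ->arg */
};

static int example_rmap_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	struct example_walk_state *state = arg;

	state->hits++;
	return SWAP_AGAIN;		/* keep walking the remaining VMAs */
}

/* For file pages the page must be locked (rmap_walk_file() checks this). */
static int example_count_mappings(struct page *page)
{
	struct example_walk_state state = { .hits = 0 };
	struct rmap_walk_control rwc = {
		.rmap_one = example_rmap_one,
		.arg = &state,
		.anon_lock = page_lock_anon_vma_read,	/* optional, as above */
	};

	rmap_walk(page, &rwc);
	return state.hits;
}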
882 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | 814 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
883 | unsigned long address) | 815 | unsigned long address, void *arg) |
884 | { | 816 | { |
885 | struct mm_struct *mm = vma->vm_mm; | 817 | struct mm_struct *mm = vma->vm_mm; |
886 | pte_t *pte; | 818 | pte_t *pte; |
887 | spinlock_t *ptl; | 819 | spinlock_t *ptl; |
888 | int ret = 0; | 820 | int ret = 0; |
821 | int *cleaned = arg; | ||
889 | 822 | ||
890 | pte = page_check_address(page, mm, address, &ptl, 1); | 823 | pte = page_check_address(page, mm, address, &ptl, 1); |
891 | if (!pte) | 824 | if (!pte) |
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
904 | 837 | ||
905 | pte_unmap_unlock(pte, ptl); | 838 | pte_unmap_unlock(pte, ptl); |
906 | 839 | ||
907 | if (ret) | 840 | if (ret) { |
908 | mmu_notifier_invalidate_page(mm, address); | 841 | mmu_notifier_invalidate_page(mm, address); |
842 | (*cleaned)++; | ||
843 | } | ||
909 | out: | 844 | out: |
910 | return ret; | 845 | return SWAP_AGAIN; |
911 | } | 846 | } |
912 | 847 | ||
913 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | 848 | static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) |
914 | { | 849 | { |
915 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 850 | if (vma->vm_flags & VM_SHARED) |
916 | struct vm_area_struct *vma; | 851 | return 0; |
917 | int ret = 0; | ||
918 | |||
919 | BUG_ON(PageAnon(page)); | ||
920 | 852 | ||
921 | mutex_lock(&mapping->i_mmap_mutex); | 853 | return 1; |
922 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
923 | if (vma->vm_flags & VM_SHARED) { | ||
924 | unsigned long address = vma_address(page, vma); | ||
925 | ret += page_mkclean_one(page, vma, address); | ||
926 | } | ||
927 | } | ||
928 | mutex_unlock(&mapping->i_mmap_mutex); | ||
929 | return ret; | ||
930 | } | 854 | } |
931 | 855 | ||
932 | int page_mkclean(struct page *page) | 856 | int page_mkclean(struct page *page) |
933 | { | 857 | { |
934 | int ret = 0; | 858 | int cleaned = 0; |
859 | struct address_space *mapping; | ||
860 | struct rmap_walk_control rwc = { | ||
861 | .arg = (void *)&cleaned, | ||
862 | .rmap_one = page_mkclean_one, | ||
863 | .invalid_vma = invalid_mkclean_vma, | ||
864 | }; | ||
935 | 865 | ||
936 | BUG_ON(!PageLocked(page)); | 866 | BUG_ON(!PageLocked(page)); |
937 | 867 | ||
938 | if (page_mapped(page)) { | 868 | if (!page_mapped(page)) |
939 | struct address_space *mapping = page_mapping(page); | 869 | return 0; |
940 | if (mapping) | ||
941 | ret = page_mkclean_file(mapping, page); | ||
942 | } | ||
943 | 870 | ||
944 | return ret; | 871 | mapping = page_mapping(page); |
872 | if (!mapping) | ||
873 | return 0; | ||
874 | |||
875 | rmap_walk(page, &rwc); | ||
876 | |||
877 | return cleaned; | ||
945 | } | 878 | } |
946 | EXPORT_SYMBOL_GPL(page_mkclean); | 879 | EXPORT_SYMBOL_GPL(page_mkclean); |
947 | 880 | ||
@@ -1177,17 +1110,17 @@ out: | |||
1177 | } | 1110 | } |
1178 | 1111 | ||
1179 | /* | 1112 | /* |
1180 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 1113 | * @arg: enum ttu_flags will be passed to this argument |
1181 | * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. | ||
1182 | */ | 1114 | */ |
1183 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1115 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1184 | unsigned long address, enum ttu_flags flags) | 1116 | unsigned long address, void *arg) |
1185 | { | 1117 | { |
1186 | struct mm_struct *mm = vma->vm_mm; | 1118 | struct mm_struct *mm = vma->vm_mm; |
1187 | pte_t *pte; | 1119 | pte_t *pte; |
1188 | pte_t pteval; | 1120 | pte_t pteval; |
1189 | spinlock_t *ptl; | 1121 | spinlock_t *ptl; |
1190 | int ret = SWAP_AGAIN; | 1122 | int ret = SWAP_AGAIN; |
1123 | enum ttu_flags flags = (enum ttu_flags)arg; | ||
1191 | 1124 | ||
1192 | pte = page_check_address(page, mm, address, &ptl, 0); | 1125 | pte = page_check_address(page, mm, address, &ptl, 0); |
1193 | if (!pte) | 1126 | if (!pte) |
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1426 | return ret; | 1359 | return ret; |
1427 | } | 1360 | } |
1428 | 1361 | ||
1429 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1362 | static int try_to_unmap_nonlinear(struct page *page, |
1430 | { | 1363 | struct address_space *mapping, struct vm_area_struct *vma) |
1431 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
1432 | |||
1433 | if (!maybe_stack) | ||
1434 | return false; | ||
1435 | |||
1436 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
1437 | VM_STACK_INCOMPLETE_SETUP) | ||
1438 | return true; | ||
1439 | |||
1440 | return false; | ||
1441 | } | ||
1442 | |||
1443 | /** | ||
1444 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | ||
1445 | * rmap method | ||
1446 | * @page: the page to unmap/unlock | ||
1447 | * @flags: action and flags | ||
1448 | * | ||
1449 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1450 | * contained in the anon_vma struct it points to. | ||
1451 | * | ||
1452 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1453 | * anonymous pages. | ||
1454 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1455 | * where the page was found will be held for write. So, we won't recheck | ||
1456 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1457 | * 'LOCKED. | ||
1458 | */ | ||
1459 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | ||
1460 | { | ||
1461 | struct anon_vma *anon_vma; | ||
1462 | pgoff_t pgoff; | ||
1463 | struct anon_vma_chain *avc; | ||
1464 | int ret = SWAP_AGAIN; | ||
1465 | |||
1466 | anon_vma = page_lock_anon_vma_read(page); | ||
1467 | if (!anon_vma) | ||
1468 | return ret; | ||
1469 | |||
1470 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1471 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1472 | struct vm_area_struct *vma = avc->vma; | ||
1473 | unsigned long address; | ||
1474 | |||
1475 | /* | ||
1476 | * During exec, a temporary VMA is setup and later moved. | ||
1477 | * The VMA is moved under the anon_vma lock but not the | ||
1478 | * page tables leading to a race where migration cannot | ||
1479 | * find the migration ptes. Rather than increasing the | ||
1480 | * locking requirements of exec(), migration skips | ||
1481 | * temporary VMAs until after exec() completes. | ||
1482 | */ | ||
1483 | if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && | ||
1484 | is_vma_temporary_stack(vma)) | ||
1485 | continue; | ||
1486 | |||
1487 | address = vma_address(page, vma); | ||
1488 | ret = try_to_unmap_one(page, vma, address, flags); | ||
1489 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
1490 | break; | ||
1491 | } | ||
1492 | |||
1493 | page_unlock_anon_vma_read(anon_vma); | ||
1494 | return ret; | ||
1495 | } | ||
1496 | |||
1497 | /** | ||
1498 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method | ||
1499 | * @page: the page to unmap/unlock | ||
1500 | * @flags: action and flags | ||
1501 | * | ||
1502 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1503 | * contained in the address_space struct it points to. | ||
1504 | * | ||
1505 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1506 | * object-based pages. | ||
1507 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1508 | * where the page was found will be held for write. So, we won't recheck | ||
1509 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1510 | * 'LOCKED. | ||
1511 | */ | ||
1512 | static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | ||
1513 | { | 1364 | { |
1514 | struct address_space *mapping = page->mapping; | ||
1515 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1516 | struct vm_area_struct *vma; | ||
1517 | int ret = SWAP_AGAIN; | 1365 | int ret = SWAP_AGAIN; |
1518 | unsigned long cursor; | 1366 | unsigned long cursor; |
1519 | unsigned long max_nl_cursor = 0; | 1367 | unsigned long max_nl_cursor = 0; |
1520 | unsigned long max_nl_size = 0; | 1368 | unsigned long max_nl_size = 0; |
1521 | unsigned int mapcount; | 1369 | unsigned int mapcount; |
1522 | 1370 | ||
1523 | if (PageHuge(page)) | 1371 | list_for_each_entry(vma, |
1524 | pgoff = page->index << compound_order(page); | 1372 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
1525 | 1373 | ||
1526 | mutex_lock(&mapping->i_mmap_mutex); | ||
1527 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
1528 | unsigned long address = vma_address(page, vma); | ||
1529 | ret = try_to_unmap_one(page, vma, address, flags); | ||
1530 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
1531 | goto out; | ||
1532 | } | ||
1533 | |||
1534 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
1535 | goto out; | ||
1536 | |||
1537 | /* | ||
1538 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1539 | * It's costly. Instead, later, page reclaim logic may call | ||
1540 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1541 | */ | ||
1542 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1543 | goto out; | ||
1544 | |||
1545 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | ||
1546 | shared.nonlinear) { | ||
1547 | cursor = (unsigned long) vma->vm_private_data; | 1374 | cursor = (unsigned long) vma->vm_private_data; |
1548 | if (cursor > max_nl_cursor) | 1375 | if (cursor > max_nl_cursor) |
1549 | max_nl_cursor = cursor; | 1376 | max_nl_cursor = cursor; |
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1553 | } | 1380 | } |
1554 | 1381 | ||
1555 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | 1382 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ |
1556 | ret = SWAP_FAIL; | 1383 | return SWAP_FAIL; |
1557 | goto out; | ||
1558 | } | 1384 | } |
1559 | 1385 | ||
1560 | /* | 1386 | /* |
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1566 | */ | 1392 | */ |
1567 | mapcount = page_mapcount(page); | 1393 | mapcount = page_mapcount(page); |
1568 | if (!mapcount) | 1394 | if (!mapcount) |
1569 | goto out; | 1395 | return ret; |
1396 | |||
1570 | cond_resched(); | 1397 | cond_resched(); |
1571 | 1398 | ||
1572 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 1399 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1574 | max_nl_cursor = CLUSTER_SIZE; | 1401 | max_nl_cursor = CLUSTER_SIZE; |
1575 | 1402 | ||
1576 | do { | 1403 | do { |
1577 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1404 | list_for_each_entry(vma, |
1578 | shared.nonlinear) { | 1405 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
1406 | |||
1579 | cursor = (unsigned long) vma->vm_private_data; | 1407 | cursor = (unsigned long) vma->vm_private_data; |
1580 | while ( cursor < max_nl_cursor && | 1408 | while (cursor < max_nl_cursor && |
1581 | cursor < vma->vm_end - vma->vm_start) { | 1409 | cursor < vma->vm_end - vma->vm_start) { |
1582 | if (try_to_unmap_cluster(cursor, &mapcount, | 1410 | if (try_to_unmap_cluster(cursor, &mapcount, |
1583 | vma, page) == SWAP_MLOCK) | 1411 | vma, page) == SWAP_MLOCK) |
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1585 | cursor += CLUSTER_SIZE; | 1413 | cursor += CLUSTER_SIZE; |
1586 | vma->vm_private_data = (void *) cursor; | 1414 | vma->vm_private_data = (void *) cursor; |
1587 | if ((int)mapcount <= 0) | 1415 | if ((int)mapcount <= 0) |
1588 | goto out; | 1416 | return ret; |
1589 | } | 1417 | } |
1590 | vma->vm_private_data = (void *) max_nl_cursor; | 1418 | vma->vm_private_data = (void *) max_nl_cursor; |
1591 | } | 1419 | } |
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1600 | */ | 1428 | */ |
1601 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | 1429 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
1602 | vma->vm_private_data = NULL; | 1430 | vma->vm_private_data = NULL; |
1603 | out: | 1431 | |
1604 | mutex_unlock(&mapping->i_mmap_mutex); | ||
1605 | return ret; | 1432 | return ret; |
1606 | } | 1433 | } |
1607 | 1434 | ||
1435 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | ||
1436 | { | ||
1437 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
1438 | |||
1439 | if (!maybe_stack) | ||
1440 | return false; | ||
1441 | |||
1442 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
1443 | VM_STACK_INCOMPLETE_SETUP) | ||
1444 | return true; | ||
1445 | |||
1446 | return false; | ||
1447 | } | ||
1448 | |||
1449 | static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) | ||
1450 | { | ||
1451 | return is_vma_temporary_stack(vma); | ||
1452 | } | ||
1453 | |||
1454 | static int page_not_mapped(struct page *page) | ||
1455 | { | ||
1456 | return !page_mapped(page); | ||
1457 | }; | ||
1458 | |||
1608 | /** | 1459 | /** |
1609 | * try_to_unmap - try to remove all page table mappings to a page | 1460 | * try_to_unmap - try to remove all page table mappings to a page |
1610 | * @page: the page to get unmapped | 1461 | * @page: the page to get unmapped |
@@ -1622,16 +1473,29 @@ out: | |||
1622 | int try_to_unmap(struct page *page, enum ttu_flags flags) | 1473 | int try_to_unmap(struct page *page, enum ttu_flags flags) |
1623 | { | 1474 | { |
1624 | int ret; | 1475 | int ret; |
1476 | struct rmap_walk_control rwc = { | ||
1477 | .rmap_one = try_to_unmap_one, | ||
1478 | .arg = (void *)flags, | ||
1479 | .done = page_not_mapped, | ||
1480 | .file_nonlinear = try_to_unmap_nonlinear, | ||
1481 | .anon_lock = page_lock_anon_vma_read, | ||
1482 | }; | ||
1625 | 1483 | ||
1626 | BUG_ON(!PageLocked(page)); | ||
1627 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | 1484 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); |
1628 | 1485 | ||
1629 | if (unlikely(PageKsm(page))) | 1486 | /* |
1630 | ret = try_to_unmap_ksm(page, flags); | 1487 | * During exec, a temporary VMA is setup and later moved. |
1631 | else if (PageAnon(page)) | 1488 | * The VMA is moved under the anon_vma lock but not the |
1632 | ret = try_to_unmap_anon(page, flags); | 1489 | * page tables leading to a race where migration cannot |
1633 | else | 1490 | * find the migration ptes. Rather than increasing the |
1634 | ret = try_to_unmap_file(page, flags); | 1491 | * locking requirements of exec(), migration skips |
1492 | * temporary VMAs until after exec() completes. | ||
1493 | */ | ||
1494 | if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) | ||
1495 | rwc.invalid_vma = invalid_migration_vma; | ||
1496 | |||
1497 | ret = rmap_walk(page, &rwc); | ||
1498 | |||
1635 | if (ret != SWAP_MLOCK && !page_mapped(page)) | 1499 | if (ret != SWAP_MLOCK && !page_mapped(page)) |
1636 | ret = SWAP_SUCCESS; | 1500 | ret = SWAP_SUCCESS; |
1637 | return ret; | 1501 | return ret; |
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1654 | */ | 1518 | */ |
1655 | int try_to_munlock(struct page *page) | 1519 | int try_to_munlock(struct page *page) |
1656 | { | 1520 | { |
1521 | int ret; | ||
1522 | struct rmap_walk_control rwc = { | ||
1523 | .rmap_one = try_to_unmap_one, | ||
1524 | .arg = (void *)TTU_MUNLOCK, | ||
1525 | .done = page_not_mapped, | ||
1526 | /* | ||
1527 | * We don't bother to try to find the munlocked page in | ||
1528 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
1529 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
1530 | */ | ||
1531 | .file_nonlinear = NULL, | ||
1532 | .anon_lock = page_lock_anon_vma_read, | ||
1533 | |||
1534 | }; | ||
1535 | |||
1657 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1536 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1658 | 1537 | ||
1659 | if (unlikely(PageKsm(page))) | 1538 | ret = rmap_walk(page, &rwc); |
1660 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | 1539 | return ret; |
1661 | else if (PageAnon(page)) | ||
1662 | return try_to_unmap_anon(page, TTU_MUNLOCK); | ||
1663 | else | ||
1664 | return try_to_unmap_file(page, TTU_MUNLOCK); | ||
1665 | } | 1540 | } |
1666 | 1541 | ||
1667 | void __put_anon_vma(struct anon_vma *anon_vma) | 1542 | void __put_anon_vma(struct anon_vma *anon_vma) |
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma) | |||
1674 | anon_vma_free(anon_vma); | 1549 | anon_vma_free(anon_vma); |
1675 | } | 1550 | } |
1676 | 1551 | ||
1677 | #ifdef CONFIG_MIGRATION | 1552 | static struct anon_vma *rmap_walk_anon_lock(struct page *page, |
1678 | /* | 1553 | struct rmap_walk_control *rwc) |
1679 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1680 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1681 | */ | ||
1682 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1683 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1684 | { | 1554 | { |
1685 | struct anon_vma *anon_vma; | 1555 | struct anon_vma *anon_vma; |
1686 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1556 | |
1687 | struct anon_vma_chain *avc; | 1557 | if (rwc->anon_lock) |
1688 | int ret = SWAP_AGAIN; | 1558 | return rwc->anon_lock(page); |
1689 | 1559 | ||
1690 | /* | 1560 | /* |
1691 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() | 1561 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1695 | */ | 1565 | */ |
1696 | anon_vma = page_anon_vma(page); | 1566 | anon_vma = page_anon_vma(page); |
1697 | if (!anon_vma) | 1567 | if (!anon_vma) |
1698 | return ret; | 1568 | return NULL; |
1569 | |||
1699 | anon_vma_lock_read(anon_vma); | 1570 | anon_vma_lock_read(anon_vma); |
1571 | return anon_vma; | ||
1572 | } | ||
1573 | |||
1574 | /* | ||
1575 | * rmap_walk_anon - do something to anonymous page using the object-based | ||
1576 | * rmap method | ||
1577 | * @page: the page to be handled | ||
1578 | * @rwc: control variable according to each walk type | ||
1579 | * | ||
1580 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1581 | * contained in the anon_vma struct it points to. | ||
1582 | * | ||
1583 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1584 | * where the page was found will be held for write. So, we won't recheck | ||
1585 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1586 | * LOCKED. | ||
1587 | */ | ||
1588 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | ||
1589 | { | ||
1590 | struct anon_vma *anon_vma; | ||
1591 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1592 | struct anon_vma_chain *avc; | ||
1593 | int ret = SWAP_AGAIN; | ||
1594 | |||
1595 | anon_vma = rmap_walk_anon_lock(page, rwc); | ||
1596 | if (!anon_vma) | ||
1597 | return ret; | ||
1598 | |||
1700 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1599 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1701 | struct vm_area_struct *vma = avc->vma; | 1600 | struct vm_area_struct *vma = avc->vma; |
1702 | unsigned long address = vma_address(page, vma); | 1601 | unsigned long address = vma_address(page, vma); |
1703 | ret = rmap_one(page, vma, address, arg); | 1602 | |
1603 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
1604 | continue; | ||
1605 | |||
1606 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
1704 | if (ret != SWAP_AGAIN) | 1607 | if (ret != SWAP_AGAIN) |
1705 | break; | 1608 | break; |
1609 | if (rwc->done && rwc->done(page)) | ||
1610 | break; | ||
1706 | } | 1611 | } |
1707 | anon_vma_unlock_read(anon_vma); | 1612 | anon_vma_unlock_read(anon_vma); |
1708 | return ret; | 1613 | return ret; |
1709 | } | 1614 | } |
1710 | 1615 | ||
1711 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | 1616 | /* |
1712 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1617 | * rmap_walk_file - do something to file page using the object-based rmap method |
1618 | * @page: the page to be handled | ||
1619 | * @rwc: control variable according to each walk type | ||
1620 | * | ||
1621 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1622 | * contained in the address_space struct it points to. | ||
1623 | * | ||
1624 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1625 | * where the page was found will be held for write. So, we won't recheck | ||
1626 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1627 | * LOCKED. | ||
1628 | */ | ||
1629 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | ||
1713 | { | 1630 | { |
1714 | struct address_space *mapping = page->mapping; | 1631 | struct address_space *mapping = page->mapping; |
1715 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1632 | pgoff_t pgoff = page->index << compound_order(page); |
1716 | struct vm_area_struct *vma; | 1633 | struct vm_area_struct *vma; |
1717 | int ret = SWAP_AGAIN; | 1634 | int ret = SWAP_AGAIN; |
1718 | 1635 | ||
1636 | /* | ||
1637 | * The page lock not only makes sure that page->mapping cannot | ||
1638 | * suddenly be NULLified by truncation, it makes sure that the | ||
1639 | * structure at mapping cannot be freed and reused yet, | ||
1640 | * so we can safely take mapping->i_mmap_mutex. | ||
1641 | */ | ||
1642 | VM_BUG_ON(!PageLocked(page)); | ||
1643 | |||
1719 | if (!mapping) | 1644 | if (!mapping) |
1720 | return ret; | 1645 | return ret; |
1721 | mutex_lock(&mapping->i_mmap_mutex); | 1646 | mutex_lock(&mapping->i_mmap_mutex); |
1722 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1647 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1723 | unsigned long address = vma_address(page, vma); | 1648 | unsigned long address = vma_address(page, vma); |
1724 | ret = rmap_one(page, vma, address, arg); | 1649 | |
1650 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
1651 | continue; | ||
1652 | |||
1653 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
1725 | if (ret != SWAP_AGAIN) | 1654 | if (ret != SWAP_AGAIN) |
1726 | break; | 1655 | goto done; |
1656 | if (rwc->done && rwc->done(page)) | ||
1657 | goto done; | ||
1727 | } | 1658 | } |
1728 | /* | 1659 | |
1729 | * No nonlinear handling: being always shared, nonlinear vmas | 1660 | if (!rwc->file_nonlinear) |
1730 | * never contain migration ptes. Decide what to do about this | 1661 | goto done; |
1731 | * limitation to linear when we need rmap_walk() on nonlinear. | 1662 | |
1732 | */ | 1663 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1664 | goto done; | ||
1665 | |||
1666 | ret = rwc->file_nonlinear(page, mapping, vma); | ||
1667 | |||
1668 | done: | ||
1733 | mutex_unlock(&mapping->i_mmap_mutex); | 1669 | mutex_unlock(&mapping->i_mmap_mutex); |
1734 | return ret; | 1670 | return ret; |
1735 | } | 1671 | } |
1736 | 1672 | ||
1737 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | 1673 | int rmap_walk(struct page *page, struct rmap_walk_control *rwc) |
1738 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1739 | { | 1674 | { |
1740 | VM_BUG_ON(!PageLocked(page)); | ||
1741 | |||
1742 | if (unlikely(PageKsm(page))) | 1675 | if (unlikely(PageKsm(page))) |
1743 | return rmap_walk_ksm(page, rmap_one, arg); | 1676 | return rmap_walk_ksm(page, rwc); |
1744 | else if (PageAnon(page)) | 1677 | else if (PageAnon(page)) |
1745 | return rmap_walk_anon(page, rmap_one, arg); | 1678 | return rmap_walk_anon(page, rwc); |
1746 | else | 1679 | else |
1747 | return rmap_walk_file(page, rmap_one, arg); | 1680 | return rmap_walk_file(page, rwc); |
1748 | } | 1681 | } |
1749 | #endif /* CONFIG_MIGRATION */ | ||
1750 | 1682 | ||
1751 | #ifdef CONFIG_HUGETLB_PAGE | 1683 | #ifdef CONFIG_HUGETLB_PAGE |
1752 | /* | 1684 | /* |
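All three walkers (ksm, anon, file) now consume the same control structure, and rmap_walk() is no longer confined to CONFIG_MIGRATION. The declaration below is reconstructed from the call sites in this diff (the authoritative version lives in include/linux/rmap.h, so field order may differ); the comments note when each hook is consulted:

struct rmap_walk_control {
	void *arg;			/* opaque state for the callbacks */
	/* called for every mapping VMA; != SWAP_AGAIN aborts the walk */
	int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
			unsigned long addr, void *arg);
	/* checked after each rmap_one(); non-zero ends the walk early */
	int (*done)(struct page *page);
	/* handles VMAs on mapping->i_mmap_nonlinear, if any */
	int (*file_nonlinear)(struct page *page, struct address_space *mapping,
			      struct vm_area_struct *vma);
	/* optional anon_vma locking hook (e.g. page_lock_anon_vma_read) */
	struct anon_vma *(*anon_lock)(struct page *page);
	/* returning true skips the VMA without calling rmap_one() */
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};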
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 27eeab3be757..4cba9c2783a1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
40 | unsigned long align, | 40 | unsigned long align, |
41 | unsigned long goal) | 41 | unsigned long goal) |
42 | { | 42 | { |
43 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); | 43 | return memblock_virt_alloc_try_nid(size, align, goal, |
44 | BOOTMEM_ALLOC_ACCESSIBLE, node); | ||
44 | } | 45 | } |
45 | 46 | ||
46 | static void *vmemmap_buf; | 47 | static void *vmemmap_buf; |
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
226 | 227 | ||
227 | if (vmemmap_buf_start) { | 228 | if (vmemmap_buf_start) { |
228 | /* need to free left buf */ | 229 | /* need to free left buf */ |
229 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | 230 | memblock_free_early(__pa(vmemmap_buf), |
231 | vmemmap_buf_end - vmemmap_buf); | ||
230 | vmemmap_buf = NULL; | 232 | vmemmap_buf = NULL; |
231 | vmemmap_buf_end = NULL; | 233 | vmemmap_buf_end = NULL; |
232 | } | 234 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0e9590..63c3ea5c119c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
69 | else | 69 | else |
70 | section = kzalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else { | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = memblock_virt_alloc_node(array_size, nid); |
73 | } | 73 | } |
74 | 74 | ||
75 | return section; | 75 | return section; |
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
279 | limit = goal + (1UL << PA_SECTION_SHIFT); | 279 | limit = goal + (1UL << PA_SECTION_SHIFT); |
280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); | 280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); |
281 | again: | 281 | again: |
282 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | 282 | p = memblock_virt_alloc_try_nid_nopanic(size, |
283 | SMP_CACHE_BYTES, goal, limit); | 283 | SMP_CACHE_BYTES, goal, limit, |
284 | nid); | ||
284 | if (!p && limit) { | 285 | if (!p && limit) { |
285 | limit = 0; | 286 | limit = 0; |
286 | goto again; | 287 | goto again; |
@@ -331,7 +332,7 @@ static unsigned long * __init | |||
331 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 332 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
332 | unsigned long size) | 333 | unsigned long size) |
333 | { | 334 | { |
334 | return alloc_bootmem_node_nopanic(pgdat, size); | 335 | return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); |
335 | } | 336 | } |
336 | 337 | ||
337 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 338 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
376 | return map; | 377 | return map; |
377 | 378 | ||
378 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); | 379 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); |
379 | map = __alloc_bootmem_node_high(NODE_DATA(nid), size, | 380 | map = memblock_virt_alloc_try_nid(size, |
380 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 381 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
382 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
381 | return map; | 383 | return map; |
382 | } | 384 | } |
383 | void __init sparse_mem_maps_populate_node(struct page **map_map, | 385 | void __init sparse_mem_maps_populate_node(struct page **map_map, |
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
401 | } | 403 | } |
402 | 404 | ||
403 | size = PAGE_ALIGN(size); | 405 | size = PAGE_ALIGN(size); |
404 | map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, | 406 | map = memblock_virt_alloc_try_nid(size * map_count, |
405 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 407 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
408 | BOOTMEM_ALLOC_ACCESSIBLE, nodeid); | ||
406 | if (map) { | 409 | if (map) { |
407 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 410 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
408 | if (!present_section_nr(pnum)) | 411 | if (!present_section_nr(pnum)) |
@@ -545,7 +548,7 @@ void __init sparse_init(void) | |||
545 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. | 548 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. |
546 | */ | 549 | */ |
547 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; | 550 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; |
548 | usemap_map = alloc_bootmem(size); | 551 | usemap_map = memblock_virt_alloc(size, 0); |
549 | if (!usemap_map) | 552 | if (!usemap_map) |
550 | panic("can not allocate usemap_map\n"); | 553 | panic("can not allocate usemap_map\n"); |
551 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, | 554 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, |
@@ -553,7 +556,7 @@ void __init sparse_init(void) | |||
553 | 556 | ||
554 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 557 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
555 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | 558 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; |
556 | map_map = alloc_bootmem(size2); | 559 | map_map = memblock_virt_alloc(size2, 0); |
557 | if (!map_map) | 560 | if (!map_map) |
558 | panic("can not allocate map_map\n"); | 561 | panic("can not allocate map_map\n"); |
559 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, | 562 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, |
@@ -583,9 +586,9 @@ void __init sparse_init(void) | |||
583 | vmemmap_populate_print_last(); | 586 | vmemmap_populate_print_last(); |
584 | 587 | ||
585 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 588 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
586 | free_bootmem(__pa(map_map), size2); | 589 | memblock_free_early(__pa(map_map), size2); |
587 | #endif | 590 | #endif |
588 | free_bootmem(__pa(usemap_map), size); | 591 | memblock_free_early(__pa(usemap_map), size); |
589 | } | 592 | } |
590 | 593 | ||
591 | #ifdef CONFIG_MEMORY_HOTPLUG | 594 | #ifdef CONFIG_MEMORY_HOTPLUG |
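The usemap allocation in sparse.c keeps its old placement strategy under the new API: try to put the usemap close to the pgdat's own section first (bounded by 'limit'), and if that bounded attempt fails, retry with the bound removed. Condensed into a sketch (names illustrative):

#include <linux/bootmem.h>
#include <linux/cache.h>	/* SMP_CACHE_BYTES */
#include <linux/memblock.h>

static void * __init example_alloc_near(unsigned long size, phys_addr_t goal,
					phys_addr_t limit, int nid)
{
	void *p;

again:
	/* bounded attempt first; returns NULL instead of panicking */
	p = memblock_virt_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES,
						goal, limit, nid);
	if (!p && limit) {
		limit = 0;		/* drop the upper bound and retry */
		goto again;
	}
	return p;
}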
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
34 | #include <linux/hugetlb.h> | ||
35 | 34 | ||
36 | #include "internal.h" | 35 | #include "internal.h" |
37 | 36 | ||
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page) | |||
82 | 81 | ||
83 | static void put_compound_page(struct page *page) | 82 | static void put_compound_page(struct page *page) |
84 | { | 83 | { |
85 | if (unlikely(PageTail(page))) { | 84 | struct page *page_head; |
86 | /* __split_huge_page_refcount can run under us */ | ||
87 | struct page *page_head = compound_trans_head(page); | ||
88 | |||
89 | if (likely(page != page_head && | ||
90 | get_page_unless_zero(page_head))) { | ||
91 | unsigned long flags; | ||
92 | 85 | ||
86 | if (likely(!PageTail(page))) { | ||
87 | if (put_page_testzero(page)) { | ||
93 | /* | 88 | /* |
94 | * THP can not break up slab pages so avoid taking | 89 | * By the time all refcounts have been released |
95 | * compound_lock(). Slab performs non-atomic bit ops | 90 | * split_huge_page cannot run anymore from under us. |
96 | * on page->flags for better performance. In particular | ||
97 | * slab_unlock() in slub used to be a hot path. It is | ||
98 | * still hot on arches that do not support | ||
99 | * this_cpu_cmpxchg_double(). | ||
100 | */ | 91 | */ |
101 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 92 | if (PageHead(page)) |
102 | if (likely(PageTail(page))) { | 93 | __put_compound_page(page); |
103 | /* | 94 | else |
104 | * __split_huge_page_refcount | 95 | __put_single_page(page); |
105 | * cannot race here. | 96 | } |
106 | */ | 97 | return; |
107 | VM_BUG_ON(!PageHead(page_head)); | 98 | } |
108 | atomic_dec(&page->_mapcount); | 99 | |
109 | if (put_page_testzero(page_head)) | 100 | /* __split_huge_page_refcount can run under us */ |
110 | VM_BUG_ON(1); | 101 | page_head = compound_trans_head(page); |
111 | if (put_page_testzero(page_head)) | 102 | |
112 | __put_compound_page(page_head); | 103 | /* |
113 | return; | 104 | * THP can not break up slab pages so avoid taking |
114 | } else | 105 | * compound_lock() and skip the tail page refcounting (in |
115 | /* | 106 | * _mapcount) too. Slab performs non-atomic bit ops on |
116 | * __split_huge_page_refcount | 107 | * page->flags for better performance. In particular |
117 | * run before us, "page" was a | 108 | * slab_unlock() in slub used to be a hot path. It is still |
118 | * THP tail. The split | 109 | * hot on arches that do not support |
119 | * page_head has been freed | 110 | * this_cpu_cmpxchg_double(). |
120 | * and reallocated as slab or | 111 | * |
121 | * hugetlbfs page of smaller | 112 | * If "page" is part of a slab or hugetlbfs page it cannot be |
122 | * order (only possible if | 113 | * splitted and the head page cannot change from under us. And |
123 | * reallocated as slab on | 114 | * if "page" is part of a THP page under splitting, if the |
124 | * x86). | 115 | * head page pointed by the THP tail isn't a THP head anymore, |
125 | */ | 116 | * we'll find PageTail clear after smp_rmb() and we'll treat |
126 | goto skip_lock; | 117 | * it as a single page. |
127 | } | 118 | */ |
119 | if (!__compound_tail_refcounted(page_head)) { | ||
120 | /* | ||
121 | * If "page" is a THP tail, we must read the tail page | ||
122 | * flags after the head page flags. The | ||
123 | * split_huge_page side enforces write memory barriers | ||
124 | * between clearing PageTail and before the head page | ||
125 | * can be freed and reallocated. | ||
126 | */ | ||
127 | smp_rmb(); | ||
128 | if (likely(PageTail(page))) { | ||
128 | /* | 129 | /* |
129 | * page_head wasn't a dangling pointer but it | 130 | * __split_huge_page_refcount cannot race |
130 | * may not be a head page anymore by the time | 131 | * here. |
131 | * we obtain the lock. That is ok as long as it | ||
132 | * can't be freed from under us. | ||
133 | */ | 132 | */ |
134 | flags = compound_lock_irqsave(page_head); | 133 | VM_BUG_ON(!PageHead(page_head)); |
135 | if (unlikely(!PageTail(page))) { | 134 | VM_BUG_ON(page_mapcount(page) != 0); |
136 | /* __split_huge_page_refcount run before us */ | 135 | if (put_page_testzero(page_head)) { |
137 | compound_unlock_irqrestore(page_head, flags); | 136 | /* |
138 | skip_lock: | 137 | * If this is the tail of a slab |
139 | if (put_page_testzero(page_head)) { | 138 | * compound page, the tail pin must |
140 | /* | 139 | * not be the last reference held on |
141 | * The head page may have been | 140 | * the page, because the PG_slab |
142 | * freed and reallocated as a | 141 | * cannot be cleared before all tail |
143 | * compound page of smaller | 142 | * pins (which skips the _mapcount |
144 | * order and then freed again. | 143 | * tail refcounting) have been |
145 | * All we know is that it | 144 | * released. For hugetlbfs the tail |
146 | * cannot have become: a THP | 145 | * pin may be the last reference on |
147 | * page, a compound page of | 146 | * the page instead, because |
148 | * higher order, a tail page. | 147 | * PageHeadHuge will not go away until |
149 | * That is because we still | 148 | * the compound page enters the buddy |
150 | * hold the refcount of the | 149 | * allocator. |
151 | * split THP tail and | 150 | */ |
152 | * page_head was the THP head | 151 | VM_BUG_ON(PageSlab(page_head)); |
153 | * before the split. | 152 | __put_compound_page(page_head); |
154 | */ | ||
155 | if (PageHead(page_head)) | ||
156 | __put_compound_page(page_head); | ||
157 | else | ||
158 | __put_single_page(page_head); | ||
159 | } | ||
160 | out_put_single: | ||
161 | if (put_page_testzero(page)) | ||
162 | __put_single_page(page); | ||
163 | return; | ||
164 | } | 153 | } |
165 | VM_BUG_ON(page_head != page->first_page); | 154 | return; |
155 | } else | ||
166 | /* | 156 | /* |
167 | * We can release the refcount taken by | 157 | * __split_huge_page_refcount run before us, |
168 | * get_page_unless_zero() now that | 158 | * "page" was a THP tail. The split page_head |
169 | * __split_huge_page_refcount() is blocked on | 159 | * has been freed and reallocated as slab or |
170 | * the compound_lock. | 160 | * hugetlbfs page of smaller order (only |
161 | * possible if reallocated as slab on x86). | ||
171 | */ | 162 | */ |
172 | if (put_page_testzero(page_head)) | 163 | goto out_put_single; |
173 | VM_BUG_ON(1); | 164 | } |
174 | /* __split_huge_page_refcount will wait now */ | ||
175 | VM_BUG_ON(page_mapcount(page) <= 0); | ||
176 | atomic_dec(&page->_mapcount); | ||
177 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
178 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
179 | compound_unlock_irqrestore(page_head, flags); | ||
180 | 165 | ||
166 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
167 | unsigned long flags; | ||
168 | |||
169 | /* | ||
170 | * page_head wasn't a dangling pointer but it may not | ||
171 | * be a head page anymore by the time we obtain the | ||
172 | * lock. That is ok as long as it can't be freed from | ||
173 | * under us. | ||
174 | */ | ||
175 | flags = compound_lock_irqsave(page_head); | ||
176 | if (unlikely(!PageTail(page))) { | ||
177 | /* __split_huge_page_refcount run before us */ | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
181 | if (put_page_testzero(page_head)) { | 179 | if (put_page_testzero(page_head)) { |
180 | /* | ||
181 | * The head page may have been freed | ||
182 | * and reallocated as a compound page | ||
183 | * of smaller order and then freed | ||
184 | * again. All we know is that it | ||
185 | * cannot have become: a THP page, a | ||
186 | * compound page of higher order, a | ||
187 | * tail page. That is because we | ||
188 | * still hold the refcount of the | ||
189 | * split THP tail and page_head was | ||
190 | * the THP head before the split. | ||
191 | */ | ||
182 | if (PageHead(page_head)) | 192 | if (PageHead(page_head)) |
183 | __put_compound_page(page_head); | 193 | __put_compound_page(page_head); |
184 | else | 194 | else |
185 | __put_single_page(page_head); | 195 | __put_single_page(page_head); |
186 | } | 196 | } |
187 | } else { | 197 | out_put_single: |
188 | /* page_head is a dangling pointer */ | 198 | if (put_page_testzero(page)) |
189 | VM_BUG_ON(PageTail(page)); | 199 | __put_single_page(page); |
190 | goto out_put_single; | 200 | return; |
191 | } | 201 | } |
192 | } else if (put_page_testzero(page)) { | 202 | VM_BUG_ON(page_head != page->first_page); |
193 | if (PageHead(page)) | 203 | /* |
194 | __put_compound_page(page); | 204 | * We can release the refcount taken by |
195 | else | 205 | * get_page_unless_zero() now that |
196 | __put_single_page(page); | 206 | * __split_huge_page_refcount() is blocked on the |
207 | * compound_lock. | ||
208 | */ | ||
209 | if (put_page_testzero(page_head)) | ||
210 | VM_BUG_ON(1); | ||
211 | /* __split_huge_page_refcount will wait now */ | ||
212 | VM_BUG_ON(page_mapcount(page) <= 0); | ||
213 | atomic_dec(&page->_mapcount); | ||
214 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
215 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
216 | compound_unlock_irqrestore(page_head, flags); | ||
217 | |||
218 | if (put_page_testzero(page_head)) { | ||
219 | if (PageHead(page_head)) | ||
220 | __put_compound_page(page_head); | ||
221 | else | ||
222 | __put_single_page(page_head); | ||
223 | } | ||
224 | } else { | ||
225 | /* page_head is a dangling pointer */ | ||
226 | VM_BUG_ON(PageTail(page)); | ||
227 | goto out_put_single; | ||
197 | } | 228 | } |
198 | } | 229 | } |
199 | 230 | ||
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page) | |||
221 | * split_huge_page(). | 252 | * split_huge_page(). |
222 | */ | 253 | */ |
223 | unsigned long flags; | 254 | unsigned long flags; |
224 | bool got = false; | 255 | bool got; |
225 | struct page *page_head = compound_trans_head(page); | 256 | struct page *page_head = compound_trans_head(page); |
226 | 257 | ||
227 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 258 | /* Ref to put_compound_page() comment. */ |
228 | /* Ref to put_compound_page() comment. */ | 259 | if (!__compound_tail_refcounted(page_head)) { |
229 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 260 | smp_rmb(); |
230 | if (likely(PageTail(page))) { | 261 | if (likely(PageTail(page))) { |
231 | /* | 262 | /* |
232 | * This is a hugetlbfs page or a slab | 263 | * This is a hugetlbfs page or a slab |
233 | * page. __split_huge_page_refcount | 264 | * page. __split_huge_page_refcount |
234 | * cannot race here. | 265 | * cannot race here. |
235 | */ | 266 | */ |
236 | VM_BUG_ON(!PageHead(page_head)); | 267 | VM_BUG_ON(!PageHead(page_head)); |
237 | __get_page_tail_foll(page, false); | 268 | __get_page_tail_foll(page, true); |
238 | return true; | 269 | return true; |
239 | } else { | 270 | } else { |
240 | /* | 271 | /* |
241 | * __split_huge_page_refcount run | 272 | * __split_huge_page_refcount run |
242 | * before us, "page" was a THP | 273 | * before us, "page" was a THP |
243 | * tail. The split page_head has been | 274 | * tail. The split page_head has been |
244 | * freed and reallocated as slab or | 275 | * freed and reallocated as slab or |
245 | * hugetlbfs page of smaller order | 276 | * hugetlbfs page of smaller order |
246 | * (only possible if reallocated as | 277 | * (only possible if reallocated as |
247 | * slab on x86). | 278 | * slab on x86). |
248 | */ | 279 | */ |
249 | put_page(page_head); | 280 | return false; |
250 | return false; | ||
251 | } | ||
252 | } | 281 | } |
282 | } | ||
253 | 283 | ||
284 | got = false; | ||
285 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
254 | /* | 286 | /* |
255 | * page_head wasn't a dangling pointer but it | 287 | * page_head wasn't a dangling pointer but it |
256 | * may not be a head page anymore by the time | 288 | * may not be a head page anymore by the time |
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) | |||
404 | return mapping; | 404 | return mapping; |
405 | } | 405 | } |
406 | 406 | ||
407 | int overcommit_ratio_handler(struct ctl_table *table, int write, | ||
408 | void __user *buffer, size_t *lenp, | ||
409 | loff_t *ppos) | ||
410 | { | ||
411 | int ret; | ||
412 | |||
413 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
414 | if (ret == 0 && write) | ||
415 | sysctl_overcommit_kbytes = 0; | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | int overcommit_kbytes_handler(struct ctl_table *table, int write, | ||
420 | void __user *buffer, size_t *lenp, | ||
421 | loff_t *ppos) | ||
422 | { | ||
423 | int ret; | ||
424 | |||
425 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | ||
426 | if (ret == 0 && write) | ||
427 | sysctl_overcommit_ratio = 0; | ||
428 | return ret; | ||
429 | } | ||
430 | |||
407 | /* | 431 | /* |
408 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used | 432 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used |
409 | */ | 433 | */ |
410 | unsigned long vm_commit_limit(void) | 434 | unsigned long vm_commit_limit(void) |
411 | { | 435 | { |
412 | return ((totalram_pages - hugetlb_total_pages()) | 436 | unsigned long allowed; |
413 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 437 | |
438 | if (sysctl_overcommit_kbytes) | ||
439 | allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); | ||
440 | else | ||
441 | allowed = ((totalram_pages - hugetlb_total_pages()) | ||
442 | * sysctl_overcommit_ratio / 100); | ||
443 | allowed += total_swap_pages; | ||
444 | |||
445 | return allowed; | ||
414 | } | 446 | } |
415 | 447 | ||
416 | 448 | ||
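In mm/util.c, the new overcommit_ratio_handler()/overcommit_kbytes_handler() pair keeps the two sysctls mutually exclusive (writing one zeroes the other), and vm_commit_limit() prefers the absolute sysctl_overcommit_kbytes when it is set, converting kilobytes to pages with >> (PAGE_SHIFT - 10). The stand-alone userspace model below mirrors only that arithmetic; PAGE_SHIFT and the sample numbers are assumptions, not values from this patch.

/*
 * Userspace model of the vm_commit_limit() arithmetic above.
 * PAGE_SHIFT and all inputs are made up for illustration.
 */
#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

static unsigned long commit_limit_pages(unsigned long totalram_pages,
                                        unsigned long hugetlb_pages,
                                        unsigned long total_swap_pages,
                                        unsigned long overcommit_kbytes,
                                        unsigned long overcommit_ratio)
{
        unsigned long allowed;

        if (overcommit_kbytes)
                /* absolute limit: kilobytes -> pages */
                allowed = overcommit_kbytes >> (PAGE_SHIFT - 10);
        else
                /* percentage of non-hugetlb RAM */
                allowed = (totalram_pages - hugetlb_pages)
                          * overcommit_ratio / 100;

        return allowed + total_swap_pages;
}

int main(void)
{
        /* 16 GiB RAM (4194304 pages), 4 GiB swap, ratio 50%, kbytes unset */
        printf("ratio policy:  %lu pages\n",
               commit_limit_pages(4194304, 0, 1048576, 0, 50));
        /* same system, but overcommit_kbytes set to 8 GiB */
        printf("kbytes policy: %lu pages\n",
               commit_limit_pages(4194304, 0, 1048576, 8388608, 50));
        return 0;
}

With 4 KiB pages the shift divides kilobytes by 4, so 8388608 kB becomes 2097152 pages; the two sample calls land on the same limit only because the numbers were chosen that way.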
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b..e4f0db2a3eae 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x) | |||
220 | } | 220 | } |
221 | 221 | ||
222 | /* | 222 | /* |
223 | * Walk a vmap address to the struct page it maps. | 223 | * Walk a vmap address to the physical pfn it maps to. |
224 | */ | 224 | */ |
225 | struct page *vmalloc_to_page(const void *vmalloc_addr) | 225 | unsigned long vmalloc_to_pfn(const void *vmalloc_addr) |
226 | { | 226 | { |
227 | unsigned long addr = (unsigned long) vmalloc_addr; | 227 | unsigned long addr = (unsigned long) vmalloc_addr; |
228 | struct page *page = NULL; | 228 | unsigned long pfn = 0; |
229 | pgd_t *pgd = pgd_offset_k(addr); | 229 | pgd_t *pgd = pgd_offset_k(addr); |
230 | 230 | ||
231 | /* | 231 | /* |
@@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) | |||
244 | ptep = pte_offset_map(pmd, addr); | 244 | ptep = pte_offset_map(pmd, addr); |
245 | pte = *ptep; | 245 | pte = *ptep; |
246 | if (pte_present(pte)) | 246 | if (pte_present(pte)) |
247 | page = pte_page(pte); | 247 | pfn = pte_pfn(pte); |
248 | pte_unmap(ptep); | 248 | pte_unmap(ptep); |
249 | } | 249 | } |
250 | } | 250 | } |
251 | } | 251 | } |
252 | return page; | 252 | return pfn; |
253 | } | 253 | } |
254 | EXPORT_SYMBOL(vmalloc_to_page); | 254 | EXPORT_SYMBOL(vmalloc_to_pfn); |
255 | 255 | ||
256 | /* | 256 | /* |
257 | * Map a vmalloc()-space virtual address to the physical page frame number. | 257 | * Map a vmalloc()-space virtual address to the struct page. |
258 | */ | 258 | */ |
259 | unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | 259 | struct page *vmalloc_to_page(const void *vmalloc_addr) |
260 | { | 260 | { |
261 | return page_to_pfn(vmalloc_to_page(vmalloc_addr)); | 261 | return pfn_to_page(vmalloc_to_pfn(vmalloc_addr)); |
262 | } | 262 | } |
263 | EXPORT_SYMBOL(vmalloc_to_pfn); | 263 | EXPORT_SYMBOL(vmalloc_to_page); |
264 | 264 | ||
265 | 265 | ||
266 | /*** Global kva allocator ***/ | 266 | /*** Global kva allocator ***/ |
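The mm/vmalloc.c hunk swaps the roles of the two helpers: vmalloc_to_pfn() now performs the page-table walk and vmalloc_to_page() becomes a pfn_to_page() wrapper around it, rather than the other way round, so callers see no behavioural change. Below is a hedged, driver-style sketch (the function and buffer names are invented) that walks a vmalloc()ed buffer page by page and checks that the two lookups still agree.

/*
 * Illustrative only: verify that for every page of a vmalloc()ed buffer
 * vmalloc_to_page(p) == pfn_to_page(vmalloc_to_pfn(p)).
 */
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>

static void check_vmalloc_backing(void *buf, size_t size)
{
        size_t off;

        for (off = 0; off < size; off += PAGE_SIZE) {
                const void *p = buf + off;

                WARN_ON(vmalloc_to_page(p) != pfn_to_page(vmalloc_to_pfn(p)));
        }
}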