author    Linus Torvalds <torvalds@linux-foundation.org>    2014-01-21 22:05:45 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-01-21 22:05:45 -0500
commit    df32e43a54d04eda35d2859beaf90e3864d53288 (patch)
tree      7a61cf658b2949bd426285eb9902be7758ced1ba
parent    fbd918a2026d0464ce9c23f57b7de4bcfccdc2e6 (diff)
parent    78d5506e82b21a1a1de68c24182db2c2fe521422 (diff)
Merge branch 'akpm' (incoming from Andrew)
Merge first patch-bomb from Andrew Morton:
 - a couple of misc things
 - inotify/fsnotify work from Jan
 - ocfs2 updates (partial)
 - about half of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (117 commits)
  mm/migrate: remove unused function, fail_migrate_page()
  mm/migrate: remove putback_lru_pages, fix comment on putback_movable_pages
  mm/migrate: correct failure handling if !hugepage_migration_support()
  mm/migrate: add comment about permanent failure path
  mm, page_alloc: warn for non-blockable __GFP_NOFAIL allocation failure
  mm: compaction: reset scanner positions immediately when they meet
  mm: compaction: do not mark unmovable pageblocks as skipped in async compaction
  mm: compaction: detect when scanners meet in isolate_freepages
  mm: compaction: reset cached scanner pfn's before reading them
  mm: compaction: encapsulate defer reset logic
  mm: compaction: trace compaction begin and end
  memcg, oom: lock mem_cgroup_print_oom_info
  sched: add tracepoints related to NUMA task migration
  mm: numa: do not automatically migrate KSM pages
  mm: numa: trace tasks that fail migration due to rate limiting
  mm: numa: limit scope of lock for NUMA migrate rate limiting
  mm: numa: make NUMA-migrate related functions static
  lib/show_mem.c: show num_poisoned_pages when oom
  mm/hwpoison: add '#' to hwpoison_inject
  mm/memblock: use WARN_ONCE when MAX_NUMNODES passed as input parameter
  ...
 Documentation/filesystems/proc.txt     |   9
 Documentation/sysctl/vm.txt            |  12
 Documentation/vm/overcommit-accounting |   7
 arch/arm/include/asm/dma.h             |   4
 arch/arm/kernel/devtree.c              |   2
 arch/arm/kernel/setup.c                |   2
 arch/arm/mach-omap2/omap_hwmod.c       |   8
 arch/arm/mm/init.c                     |   5
 arch/ia64/mm/contig.c                  |  68
 arch/ia64/mm/discontig.c               |  63
 arch/ia64/mm/init.c                    |  48
 arch/metag/mm/init.c                   |   3
 arch/metag/mm/numa.c                   |   3
 arch/microblaze/mm/init.c              |   3
 arch/parisc/mm/init.c                  |  59
 arch/powerpc/mm/mem.c                  |   2
 arch/powerpc/mm/numa.c                 |   8
 arch/score/Kconfig                     |   1
 arch/sh/kernel/kgdb.c                  |   1
 arch/sh/kernel/setup.c                 |   4
 arch/sparc/mm/init_64.c                |   5
 arch/unicore32/mm/init.c               |   3
 arch/x86/include/asm/page_types.h      |   4
 arch/x86/kernel/check.c                |   2
 arch/x86/kernel/e820.c                 |   2
 arch/x86/kernel/setup.c                |   2
 arch/x86/mm/init_32.c                  |   2
 arch/x86/mm/init_64.c                  |   2
 arch/x86/mm/memtest.c                  |   2
 arch/x86/mm/numa.c                     |  52
 arch/x86/mm/srat.c                     |   5
 drivers/char/mem.c                     |   1
 drivers/firmware/memmap.c              |   2
 drivers/iommu/intel-iommu.c            |   2
 fs/compat_ioctl.c                      |   3
 fs/notify/dnotify/dnotify.c            |  34
 fs/notify/fanotify/fanotify.c          | 224
 fs/notify/fanotify/fanotify.h          |  23
 fs/notify/fanotify/fanotify_user.c     |  41
 fs/notify/fsnotify.c                   |  42
 fs/notify/group.c                      |   1
 fs/notify/inotify/inotify.h            |  21
 fs/notify/inotify/inotify_fsnotify.c   | 149
 fs/notify/inotify/inotify_user.c       | 119
 fs/notify/notification.c               | 334
 fs/ocfs2/Makefile                      |   1
 fs/ocfs2/alloc.c                       |  10
 fs/ocfs2/cluster/Makefile              |   2
 fs/ocfs2/cluster/nodemanager.c         |   4
 fs/ocfs2/cluster/ver.c                 |  42
 fs/ocfs2/cluster/ver.h                 |  31
 fs/ocfs2/dlm/Makefile                  |   2
 fs/ocfs2/dlm/dlmdomain.c               |   5
 fs/ocfs2/dlm/dlmver.c                  |  42
 fs/ocfs2/dlm/dlmver.h                  |  31
 fs/ocfs2/dlmfs/Makefile                |   2
 fs/ocfs2/dlmfs/dlmfs.c                 |   4
 fs/ocfs2/dlmfs/dlmfsver.c              |  42
 fs/ocfs2/dlmfs/dlmfsver.h              |  31
 fs/ocfs2/dlmglue.c                     |   4
 fs/ocfs2/file.c                        |   3
 fs/ocfs2/ioctl.c                       |   7
 fs/ocfs2/move_extents.c                |  77
 fs/ocfs2/ocfs2.h                       |   1
 fs/ocfs2/stack_o2cb.c                  |   3
 fs/ocfs2/stack_user.c                  | 308
 fs/ocfs2/stackglue.c                   |  16
 fs/ocfs2/stackglue.h                   |  15
 fs/ocfs2/suballoc.c                    |  12
 fs/ocfs2/suballoc.h                    |  12
 fs/ocfs2/super.c                       |  20
 fs/ocfs2/ver.c                         |  43
 fs/ocfs2/ver.h                         |  31
 fs/posix_acl.c                         |  84
 fs/proc/meminfo.c                      |  37
 fs/ramfs/inode.c                       |   2
 fs/read_write.c                        |   4
 fs/super.c                             |   3
 include/linux/bootmem.h                | 153
 include/linux/compaction.h             |  16
 include/linux/dma-debug.h              |   6
 include/linux/fsnotify_backend.h       | 118
 include/linux/huge_mm.h                |  23
 include/linux/hugetlb.h                |   7
 include/linux/init_task.h              |   2
 include/linux/ksm.h                    |  15
 include/linux/memblock.h               |  54
 include/linux/mempolicy.h              |  32
 include/linux/migrate.h                |   6
 include/linux/mm.h                     |  70
 include/linux/mman.h                   |   1
 include/linux/mmzone.h                 |  11
 include/linux/posix_acl.h              |  78
 include/linux/rmap.h                   |  27
 include/linux/sched.h                  |  12
 include/trace/events/compaction.h      |  42
 include/trace/events/migrate.h         |  26
 include/trace/events/sched.h           |  87
 init/main.c                            |  10
 kernel/audit_tree.c                    |  20
 kernel/audit_watch.c                   |  24
 kernel/exit.c                          |   1
 kernel/fork.c                          |   7
 kernel/power/snapshot.c                |   2
 kernel/printk/printk.c                 |  10
 kernel/sched/core.c                    |   2
 kernel/sched/fair.c                    |   6
 kernel/sysctl.c                        |  11
 lib/Kconfig.debug                      |  12
 lib/cpumask.c                          |   4
 lib/dma-debug.c                        | 193
 lib/show_mem.c                         |   6
 lib/swiotlb.c                          |  35
 mm/compaction.c                        |  61
 mm/hugetlb.c                           |  46
 mm/hwpoison-inject.c                   |   2
 mm/internal.h                          |   4
 mm/ksm.c                               | 121
 mm/memblock.c                          | 387
 mm/memcontrol.c                        |  17
 mm/memory-failure.c                    |  10
 mm/memory.c                            |  16
 mm/memory_hotplug.c                    |   4
 mm/migrate.c                           |  89
 mm/mlock.c                             |  18
 mm/mmap.c                              |  46
 mm/mprotect.c                          |   3
 mm/nobootmem.c                         |  10
 mm/nommu.c                             |   1
 mm/oom_kill.c                          |  51
 mm/page_alloc.c                        |  89
 mm/page_cgroup.c                       |   5
 mm/percpu.c                            |  38
 mm/rmap.c                              | 580
 mm/sparse-vmemmap.c                    |   6
 mm/sparse.c                            |  27
 mm/swap.c                              | 278
 mm/util.c                              |  36
 mm/vmalloc.c                           |  20
 139 files changed, 2902 insertions(+), 2512 deletions(-)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 22d89aa37218..8533f5f9bb2d 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -767,6 +767,7 @@ The "Locked" indicates whether the mapping is locked in memory or not.
 
 MemTotal:       16344972 kB
 MemFree:        13634064 kB
+MemAvailable:   14836172 kB
 Buffers:            3656 kB
 Cached:          1195708 kB
 SwapCached:            0 kB
@@ -799,6 +800,14 @@ AnonHugePages:   49152 kB
     MemTotal: Total usable ram (i.e. physical ram minus a few reserved
               bits and the kernel binary code)
      MemFree: The sum of LowFree+HighFree
+MemAvailable: An estimate of how much memory is available for starting new
+              applications, without swapping. Calculated from MemFree,
+              SReclaimable, the size of the file LRU lists, and the low
+              watermarks in each zone.
+              The estimate takes into account that the system needs some
+              page cache to function well, and that not all reclaimable
+              slab will be reclaimable, due to items being in use. The
+              impact of those factors will vary from system to system.
      Buffers: Relatively temporary storage for raw disk blocks
               shouldn't get tremendously large (20MB or so)
       Cached: in-memory cache for files read from the disk (the
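The MemAvailable counter documented above is exported through /proc/meminfo like the other fields; a minimal user-space sketch for reading it (purely illustrative, not part of this patch) could look like this:

/* Illustrative only: read the MemAvailable field added above from /proc/meminfo. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/meminfo", "r");
	char line[256];
	unsigned long kb;

	if (!f) {
		perror("fopen /proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Matches e.g. "MemAvailable:   14836172 kB" */
		if (sscanf(line, "MemAvailable: %lu kB", &kb) == 1) {
			printf("MemAvailable: %lu kB\n", kb);
			break;
		}
	}
	fclose(f);
	return 0;
}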
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 1fbd4eb7b64a..9f5481bdc5a4 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -47,6 +47,7 @@ Currently, these files are in /proc/sys/vm:
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
+- overcommit_kbytes
 - overcommit_memory
 - overcommit_ratio
 - page-cluster
@@ -574,6 +575,17 @@ The default value is 0.
 
 ==============================================================
 
+overcommit_kbytes:
+
+When overcommit_memory is set to 2, the committed address space is not
+permitted to exceed swap plus this amount of physical RAM. See below.
+
+Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one
+of them may be specified at a time. Setting one disables the other (which
+then appears as 0 when read).
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
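For reference, the mode-2 limit described by the two sysctls above reduces to simple arithmetic; the sketch below is a standalone illustration (not kernel code, and commit_limit_kb() is a hypothetical helper) of how overcommit_kbytes and overcommit_ratio would combine:

#include <stdio.h>

/* Hypothetical helper: mode-2 commit limit as described in the text above. */
static unsigned long commit_limit_kb(unsigned long ram_kb, unsigned long swap_kb,
				     unsigned long overcommit_kbytes,
				     unsigned long overcommit_ratio)
{
	/* A non-zero overcommit_kbytes disables the ratio, per the note above. */
	if (overcommit_kbytes)
		return swap_kb + overcommit_kbytes;
	return swap_kb + ram_kb * overcommit_ratio / 100;
}

int main(void)
{
	/* Example: 16 GiB of RAM, 8 GiB of swap, default overcommit_ratio of 50. */
	printf("CommitLimit: %lu kB\n",
	       commit_limit_kb(16UL * 1024 * 1024, 8UL * 1024 * 1024, 0, 50));
	return 0;
}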
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
index 8eaa2fc4b8fa..cbfaaa674118 100644
--- a/Documentation/vm/overcommit-accounting
+++ b/Documentation/vm/overcommit-accounting
@@ -14,8 +14,8 @@ The Linux kernel supports the following overcommit handling modes
 
 2	-	Don't overcommit. The total address space commit
 	for the system is not permitted to exceed swap + a
-	configurable percentage (default is 50) of physical RAM.
-	Depending on the percentage you use, in most situations
+	configurable amount (default is 50%) of physical RAM.
+	Depending on the amount you use, in most situations
 	this means a process will not be killed while accessing
 	pages but will receive errors on memory allocation as
 	appropriate.
@@ -26,7 +26,8 @@ The Linux kernel supports the following overcommit handling modes
 
 The overcommit policy is set via the sysctl `vm.overcommit_memory'.
 
-The overcommit percentage is set via `vm.overcommit_ratio'.
+The overcommit amount can be set via `vm.overcommit_ratio' (percentage)
+or `vm.overcommit_kbytes' (absolute value).
 
 The current overcommit limit and amount committed are viewable in
 /proc/meminfo as CommitLimit and Committed_AS respectively.
diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h
index 58b8c6a0ab1f..99084431d6ae 100644
--- a/arch/arm/include/asm/dma.h
+++ b/arch/arm/include/asm/dma.h
@@ -8,8 +8,8 @@
 #define MAX_DMA_ADDRESS	0xffffffffUL
 #else
 #define MAX_DMA_ADDRESS	({ \
-	extern unsigned long arm_dma_zone_size; \
-	arm_dma_zone_size ? \
+	extern phys_addr_t arm_dma_zone_size; \
+	arm_dma_zone_size && arm_dma_zone_size < (0x10000000 - PAGE_OFFSET) ? \
 	(PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; })
 #endif
 
diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c
index 34d5fd585bbb..f751714d52c1 100644
--- a/arch/arm/kernel/devtree.c
+++ b/arch/arm/kernel/devtree.c
@@ -33,7 +33,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
33 33
34void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) 34void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
35{ 35{
36 return alloc_bootmem_align(size, align); 36 return memblock_virt_alloc(size, align);
37} 37}
38 38
39void __init arm_dt_memblock_reserve(void) 39void __init arm_dt_memblock_reserve(void)
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 987a7f5bce5f..8ce1cbd08dba 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -717,7 +717,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
717 kernel_data.end = virt_to_phys(_end - 1); 717 kernel_data.end = virt_to_phys(_end - 1);
718 718
719 for_each_memblock(memory, region) { 719 for_each_memblock(memory, region) {
720 res = alloc_bootmem_low(sizeof(*res)); 720 res = memblock_virt_alloc(sizeof(*res), 0);
721 res->name = "System RAM"; 721 res->name = "System RAM";
722 res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); 722 res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
723 res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; 723 res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c
index 8a1b5e0bad40..f7a6fd35b1e4 100644
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -2791,9 +2791,7 @@ static int __init _alloc_links(struct omap_hwmod_link **ml,
2791 sz = sizeof(struct omap_hwmod_link) * LINKS_PER_OCP_IF; 2791 sz = sizeof(struct omap_hwmod_link) * LINKS_PER_OCP_IF;
2792 2792
2793 *sl = NULL; 2793 *sl = NULL;
2794 *ml = alloc_bootmem(sz); 2794 *ml = memblock_virt_alloc(sz, 0);
2795
2796 memset(*ml, 0, sz);
2797 2795
2798 *sl = (void *)(*ml) + sizeof(struct omap_hwmod_link); 2796 *sl = (void *)(*ml) + sizeof(struct omap_hwmod_link);
2799 2797
@@ -2912,9 +2910,7 @@ static int __init _alloc_linkspace(struct omap_hwmod_ocp_if **ois)
2912 pr_debug("omap_hwmod: %s: allocating %d byte linkspace (%d links)\n", 2910 pr_debug("omap_hwmod: %s: allocating %d byte linkspace (%d links)\n",
2913 __func__, sz, max_ls); 2911 __func__, sz, max_ls);
2914 2912
2915 linkspace = alloc_bootmem(sz); 2913 linkspace = memblock_virt_alloc(sz, 0);
2916
2917 memset(linkspace, 0, sz);
2918 2914
2919 return 0; 2915 return 0;
2920} 2916}
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 3e8f106ee5fe..11eb8add7820 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -92,9 +92,6 @@ void show_mem(unsigned int filter)
92 printk("Mem-info:\n"); 92 printk("Mem-info:\n");
93 show_free_areas(filter); 93 show_free_areas(filter);
94 94
95 if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
96 return;
97
98 for_each_bank (i, mi) { 95 for_each_bank (i, mi) {
99 struct membank *bank = &mi->bank[i]; 96 struct membank *bank = &mi->bank[i];
100 unsigned int pfn1, pfn2; 97 unsigned int pfn1, pfn2;
@@ -461,7 +458,7 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
461 * free the section of the memmap array. 458 * free the section of the memmap array.
462 */ 459 */
463 if (pg < pgend) 460 if (pg < pgend)
464 free_bootmem(pg, pgend - pg); 461 memblock_free_early(pg, pgend - pg);
465} 462}
466 463
467/* 464/*
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index da5237d636d6..52715a71aede 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -31,74 +31,6 @@
31static unsigned long max_gap; 31static unsigned long max_gap;
32#endif 32#endif
33 33
34/**
35 * show_mem - give short summary of memory stats
36 *
37 * Shows a simple page count of reserved and used pages in the system.
38 * For discontig machines, it does this on a per-pgdat basis.
39 */
40void show_mem(unsigned int filter)
41{
42 int i, total_reserved = 0;
43 int total_shared = 0, total_cached = 0;
44 unsigned long total_present = 0;
45 pg_data_t *pgdat;
46
47 printk(KERN_INFO "Mem-info:\n");
48 show_free_areas(filter);
49 printk(KERN_INFO "Node memory in pages:\n");
50 if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
51 return;
52 for_each_online_pgdat(pgdat) {
53 unsigned long present;
54 unsigned long flags;
55 int shared = 0, cached = 0, reserved = 0;
56 int nid = pgdat->node_id;
57
58 if (skip_free_areas_node(filter, nid))
59 continue;
60 pgdat_resize_lock(pgdat, &flags);
61 present = pgdat->node_present_pages;
62 for(i = 0; i < pgdat->node_spanned_pages; i++) {
63 struct page *page;
64 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
65 touch_nmi_watchdog();
66 if (pfn_valid(pgdat->node_start_pfn + i))
67 page = pfn_to_page(pgdat->node_start_pfn + i);
68 else {
69#ifdef CONFIG_VIRTUAL_MEM_MAP
70 if (max_gap < LARGE_GAP)
71 continue;
72#endif
73 i = vmemmap_find_next_valid_pfn(nid, i) - 1;
74 continue;
75 }
76 if (PageReserved(page))
77 reserved++;
78 else if (PageSwapCache(page))
79 cached++;
80 else if (page_count(page))
81 shared += page_count(page)-1;
82 }
83 pgdat_resize_unlock(pgdat, &flags);
84 total_present += present;
85 total_reserved += reserved;
86 total_cached += cached;
87 total_shared += shared;
88 printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, "
89 "shrd: %10d, swpd: %10d\n", nid,
90 present, reserved, shared, cached);
91 }
92 printk(KERN_INFO "%ld pages of RAM\n", total_present);
93 printk(KERN_INFO "%d reserved pages\n", total_reserved);
94 printk(KERN_INFO "%d pages shared\n", total_shared);
95 printk(KERN_INFO "%d pages swap cached\n", total_cached);
96 printk(KERN_INFO "Total of %ld pages in page table cache\n",
97 quicklist_total_size());
98 printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
99}
100
101
102/* physical address where the bootmem map is located */ 34/* physical address where the bootmem map is located */
103unsigned long bootmap_start; 35unsigned long bootmap_start;
104 36
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 2de08f4d9930..878626805369 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -608,69 +608,6 @@ void *per_cpu_init(void)
608#endif /* CONFIG_SMP */ 608#endif /* CONFIG_SMP */
609 609
610/** 610/**
611 * show_mem - give short summary of memory stats
612 *
613 * Shows a simple page count of reserved and used pages in the system.
614 * For discontig machines, it does this on a per-pgdat basis.
615 */
616void show_mem(unsigned int filter)
617{
618 int i, total_reserved = 0;
619 int total_shared = 0, total_cached = 0;
620 unsigned long total_present = 0;
621 pg_data_t *pgdat;
622
623 printk(KERN_INFO "Mem-info:\n");
624 show_free_areas(filter);
625 if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
626 return;
627 printk(KERN_INFO "Node memory in pages:\n");
628 for_each_online_pgdat(pgdat) {
629 unsigned long present;
630 unsigned long flags;
631 int shared = 0, cached = 0, reserved = 0;
632 int nid = pgdat->node_id;
633
634 if (skip_free_areas_node(filter, nid))
635 continue;
636 pgdat_resize_lock(pgdat, &flags);
637 present = pgdat->node_present_pages;
638 for(i = 0; i < pgdat->node_spanned_pages; i++) {
639 struct page *page;
640 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
641 touch_nmi_watchdog();
642 if (pfn_valid(pgdat->node_start_pfn + i))
643 page = pfn_to_page(pgdat->node_start_pfn + i);
644 else {
645 i = vmemmap_find_next_valid_pfn(nid, i) - 1;
646 continue;
647 }
648 if (PageReserved(page))
649 reserved++;
650 else if (PageSwapCache(page))
651 cached++;
652 else if (page_count(page))
653 shared += page_count(page)-1;
654 }
655 pgdat_resize_unlock(pgdat, &flags);
656 total_present += present;
657 total_reserved += reserved;
658 total_cached += cached;
659 total_shared += shared;
660 printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, "
661 "shrd: %10d, swpd: %10d\n", nid,
662 present, reserved, shared, cached);
663 }
664 printk(KERN_INFO "%ld pages of RAM\n", total_present);
665 printk(KERN_INFO "%d reserved pages\n", total_reserved);
666 printk(KERN_INFO "%d pages shared\n", total_shared);
667 printk(KERN_INFO "%d pages swap cached\n", total_cached);
668 printk(KERN_INFO "Total of %ld pages in page table cache\n",
669 quicklist_total_size());
670 printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
671}
672
673/**
674 * call_pernode_memory - use SRAT to call callback functions with node info 611 * call_pernode_memory - use SRAT to call callback functions with node info
675 * @start: physical start of range 612 * @start: physical start of range
676 * @len: length of range 613 * @len: length of range
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 88504abf5704..25c350264a41 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -684,3 +684,51 @@ per_linux32_init(void)
684} 684}
685 685
686__initcall(per_linux32_init); 686__initcall(per_linux32_init);
687
688/**
689 * show_mem - give short summary of memory stats
690 *
691 * Shows a simple page count of reserved and used pages in the system.
692 * For discontig machines, it does this on a per-pgdat basis.
693 */
694void show_mem(unsigned int filter)
695{
696 int total_reserved = 0;
697 unsigned long total_present = 0;
698 pg_data_t *pgdat;
699
700 printk(KERN_INFO "Mem-info:\n");
701 show_free_areas(filter);
702 printk(KERN_INFO "Node memory in pages:\n");
703 for_each_online_pgdat(pgdat) {
704 unsigned long present;
705 unsigned long flags;
706 int reserved = 0;
707 int nid = pgdat->node_id;
708 int zoneid;
709
710 if (skip_free_areas_node(filter, nid))
711 continue;
712 pgdat_resize_lock(pgdat, &flags);
713
714 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
715 struct zone *zone = &pgdat->node_zones[zoneid];
716 if (!populated_zone(zone))
717 continue;
718
719 reserved += zone->present_pages - zone->managed_pages;
720 }
721 present = pgdat->node_present_pages;
722
723 pgdat_resize_unlock(pgdat, &flags);
724 total_present += present;
725 total_reserved += reserved;
726 printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, ",
727 nid, present, reserved);
728 }
729 printk(KERN_INFO "%ld pages of RAM\n", total_present);
730 printk(KERN_INFO "%d reserved pages\n", total_reserved);
731 printk(KERN_INFO "Total of %ld pages in page table cache\n",
732 quicklist_total_size());
733 printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
734}
diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index 3cd6288f65c2..11fa51c89617 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -204,7 +204,8 @@ static void __init do_init_bootmem(void)
204 start_pfn = memblock_region_memory_base_pfn(reg); 204 start_pfn = memblock_region_memory_base_pfn(reg);
205 end_pfn = memblock_region_memory_end_pfn(reg); 205 end_pfn = memblock_region_memory_end_pfn(reg);
206 memblock_set_node(PFN_PHYS(start_pfn), 206 memblock_set_node(PFN_PHYS(start_pfn),
207 PFN_PHYS(end_pfn - start_pfn), 0); 207 PFN_PHYS(end_pfn - start_pfn),
208 &memblock.memory, 0);
208 } 209 }
209 210
210 /* All of system RAM sits in node 0 for the non-NUMA case */ 211 /* All of system RAM sits in node 0 for the non-NUMA case */
diff --git a/arch/metag/mm/numa.c b/arch/metag/mm/numa.c
index b172aa45fcf8..67b46c295072 100644
--- a/arch/metag/mm/numa.c
+++ b/arch/metag/mm/numa.c
@@ -42,7 +42,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
42 memblock_add(start, end - start); 42 memblock_add(start, end - start);
43 43
44 memblock_set_node(PFN_PHYS(start_pfn), 44 memblock_set_node(PFN_PHYS(start_pfn),
45 PFN_PHYS(end_pfn - start_pfn), nid); 45 PFN_PHYS(end_pfn - start_pfn),
46 &memblock.memory, nid);
46 47
47 /* Node-local pgdat */ 48 /* Node-local pgdat */
48 pgdat_paddr = memblock_alloc_base(sizeof(struct pglist_data), 49 pgdat_paddr = memblock_alloc_base(sizeof(struct pglist_data),
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 74c7bcc1e82d..89077d346714 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -192,7 +192,8 @@ void __init setup_memory(void)
192 start_pfn = memblock_region_memory_base_pfn(reg); 192 start_pfn = memblock_region_memory_base_pfn(reg);
193 end_pfn = memblock_region_memory_end_pfn(reg); 193 end_pfn = memblock_region_memory_end_pfn(reg);
194 memblock_set_node(start_pfn << PAGE_SHIFT, 194 memblock_set_node(start_pfn << PAGE_SHIFT,
195 (end_pfn - start_pfn) << PAGE_SHIFT, 0); 195 (end_pfn - start_pfn) << PAGE_SHIFT,
196 &memblock.memory, 0);
196 } 197 }
197 198
198 /* free bootmem is whole main memory */ 199 /* free bootmem is whole main memory */
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 96f8168cf4ec..ae085ad0fba0 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -645,55 +645,30 @@ EXPORT_SYMBOL(empty_zero_page);
645 645
646void show_mem(unsigned int filter) 646void show_mem(unsigned int filter)
647{ 647{
648 int i,free = 0,total = 0,reserved = 0; 648 int total = 0,reserved = 0;
649 int shared = 0, cached = 0; 649 pg_data_t *pgdat;
650 650
651 printk(KERN_INFO "Mem-info:\n"); 651 printk(KERN_INFO "Mem-info:\n");
652 show_free_areas(filter); 652 show_free_areas(filter);
653 if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
654 return;
655#ifndef CONFIG_DISCONTIGMEM
656 i = max_mapnr;
657 while (i-- > 0) {
658 total++;
659 if (PageReserved(mem_map+i))
660 reserved++;
661 else if (PageSwapCache(mem_map+i))
662 cached++;
663 else if (!page_count(&mem_map[i]))
664 free++;
665 else
666 shared += page_count(&mem_map[i]) - 1;
667 }
668#else
669 for (i = 0; i < npmem_ranges; i++) {
670 int j;
671 653
672 for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { 654 for_each_online_pgdat(pgdat) {
673 struct page *p; 655 unsigned long flags;
674 unsigned long flags; 656 int zoneid;
675 657
676 pgdat_resize_lock(NODE_DATA(i), &flags); 658 pgdat_resize_lock(pgdat, &flags);
677 p = nid_page_nr(i, j) - node_start_pfn(i); 659 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
678 660 struct zone *zone = &pgdat->node_zones[zoneid];
679 total++; 661 if (!populated_zone(zone))
680 if (PageReserved(p)) 662 continue;
681 reserved++; 663
682 else if (PageSwapCache(p)) 664 total += zone->present_pages;
683 cached++; 665 reserved = zone->present_pages - zone->managed_pages;
684 else if (!page_count(p)) 666 }
685 free++; 667 pgdat_resize_unlock(pgdat, &flags);
686 else
687 shared += page_count(p) - 1;
688 pgdat_resize_unlock(NODE_DATA(i), &flags);
689 }
690 } 668 }
691#endif 669
692 printk(KERN_INFO "%d pages of RAM\n", total); 670 printk(KERN_INFO "%d pages of RAM\n", total);
693 printk(KERN_INFO "%d reserved pages\n", reserved); 671 printk(KERN_INFO "%d reserved pages\n", reserved);
694 printk(KERN_INFO "%d pages shared\n", shared);
695 printk(KERN_INFO "%d pages swap cached\n", cached);
696
697 672
698#ifdef CONFIG_DISCONTIGMEM 673#ifdef CONFIG_DISCONTIGMEM
699 { 674 {
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 3fa93dc7fe75..8c1dd23652a1 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -209,7 +209,7 @@ void __init do_init_bootmem(void)
209 /* Place all memblock_regions in the same node and merge contiguous 209 /* Place all memblock_regions in the same node and merge contiguous
210 * memblock_regions 210 * memblock_regions
211 */ 211 */
212 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 212 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
213 213
214 /* Add all physical memory to the bootmem map, mark each area 214 /* Add all physical memory to the bootmem map, mark each area
215 * present. 215 * present.
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 078d3e00a616..5a944f25e94f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -670,7 +670,8 @@ static void __init parse_drconf_memory(struct device_node *memory)
670 node_set_online(nid); 670 node_set_online(nid);
671 sz = numa_enforce_memory_limit(base, size); 671 sz = numa_enforce_memory_limit(base, size);
672 if (sz) 672 if (sz)
673 memblock_set_node(base, sz, nid); 673 memblock_set_node(base, sz,
674 &memblock.memory, nid);
674 } while (--ranges); 675 } while (--ranges);
675 } 676 }
676} 677}
@@ -760,7 +761,7 @@ new_range:
760 continue; 761 continue;
761 } 762 }
762 763
763 memblock_set_node(start, size, nid); 764 memblock_set_node(start, size, &memblock.memory, nid);
764 765
765 if (--ranges) 766 if (--ranges)
766 goto new_range; 767 goto new_range;
@@ -797,7 +798,8 @@ static void __init setup_nonnuma(void)
797 798
798 fake_numa_create_new_node(end_pfn, &nid); 799 fake_numa_create_new_node(end_pfn, &nid);
799 memblock_set_node(PFN_PHYS(start_pfn), 800 memblock_set_node(PFN_PHYS(start_pfn),
800 PFN_PHYS(end_pfn - start_pfn), nid); 801 PFN_PHYS(end_pfn - start_pfn),
802 &memblock.memory, nid);
801 node_set_online(nid); 803 node_set_online(nid);
802 } 804 }
803} 805}
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 305f7ee1f382..c75d06aa27c3 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -2,7 +2,6 @@ menu "Machine selection"
2 2
3config SCORE 3config SCORE
4 def_bool y 4 def_bool y
5 select HAVE_GENERIC_HARDIRQS
6 select GENERIC_IRQ_SHOW 5 select GENERIC_IRQ_SHOW
7 select GENERIC_IOMAP 6 select GENERIC_IOMAP
8 select GENERIC_ATOMIC64 7 select GENERIC_ATOMIC64
diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c
index 38b313909ac9..adad46e41a1d 100644
--- a/arch/sh/kernel/kgdb.c
+++ b/arch/sh/kernel/kgdb.c
@@ -13,6 +13,7 @@
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/irq.h> 14#include <linux/irq.h>
15#include <linux/io.h> 15#include <linux/io.h>
16#include <linux/sched.h>
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/traps.h> 18#include <asm/traps.h>
18 19
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 1cf90e947dbf..de19cfa768f2 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -230,8 +230,8 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
230 pmb_bolt_mapping((unsigned long)__va(start), start, end - start, 230 pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
231 PAGE_KERNEL); 231 PAGE_KERNEL);
232 232
233 memblock_set_node(PFN_PHYS(start_pfn), 233 memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn),
234 PFN_PHYS(end_pfn - start_pfn), nid); 234 &memblock.memory, nid);
235} 235}
236 236
237void __init __weak plat_early_device_setup(void) 237void __init __weak plat_early_device_setup(void)
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 5322e530d09c..eafbc65c9c47 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1021,7 +1021,8 @@ static void __init add_node_ranges(void)
1021 "start[%lx] end[%lx]\n", 1021 "start[%lx] end[%lx]\n",
1022 nid, start, this_end); 1022 nid, start, this_end);
1023 1023
1024 memblock_set_node(start, this_end - start, nid); 1024 memblock_set_node(start, this_end - start,
1025 &memblock.memory, nid);
1025 start = this_end; 1026 start = this_end;
1026 } 1027 }
1027 } 1028 }
@@ -1325,7 +1326,7 @@ static void __init bootmem_init_nonnuma(void)
1325 (top_of_ram - total_ram) >> 20); 1326 (top_of_ram - total_ram) >> 20);
1326 1327
1327 init_node_masks_nonnuma(); 1328 init_node_masks_nonnuma();
1328 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 1329 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
1329 allocate_node_data(0); 1330 allocate_node_data(0);
1330 node_set_online(0); 1331 node_set_online(0);
1331} 1332}
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index ae6bc036db92..be2bde9b07cf 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -66,9 +66,6 @@ void show_mem(unsigned int filter)
66 printk(KERN_DEFAULT "Mem-info:\n"); 66 printk(KERN_DEFAULT "Mem-info:\n");
67 show_free_areas(filter); 67 show_free_areas(filter);
68 68
69 if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
70 return;
71
72 for_each_bank(i, mi) { 69 for_each_bank(i, mi) {
73 struct membank *bank = &mi->bank[i]; 70 struct membank *bank = &mi->bank[i];
74 unsigned int pfn1, pfn2; 71 unsigned int pfn1, pfn2;
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..2f59cce3b38a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,9 +51,9 @@ extern int devmem_is_allowed(unsigned long pagenr);
51extern unsigned long max_low_pfn_mapped; 51extern unsigned long max_low_pfn_mapped;
52extern unsigned long max_pfn_mapped; 52extern unsigned long max_pfn_mapped;
53 53
54static inline phys_addr_t get_max_mapped(void) 54static inline phys_addr_t get_max_low_mapped(void)
55{ 55{
56 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; 56 return (phys_addr_t)max_low_pfn_mapped << PAGE_SHIFT;
57} 57}
58 58
59bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); 59bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabdd..83a7995625a6 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)
91 91
92 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); 92 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
93 93
94 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { 94 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
95 start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), 95 start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
96 PAGE_SIZE, corruption_check_size); 96 PAGE_SIZE, corruption_check_size);
97 end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), 97 end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7b..988c00a1f60d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
1120 nr_pages += end_pfn - start_pfn; 1120 nr_pages += end_pfn - start_pfn;
1121 } 1121 }
1122 1122
1123 for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { 1123 for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); 1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); 1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
1126 if (start_pfn < end_pfn) 1126 if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 06853e670354..c9675594d7ca 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
1119 1119
1120 setup_real_mode(); 1120 setup_real_mode();
1121 1121
1122 memblock_set_current_limit(get_max_mapped()); 1122 memblock_set_current_limit(get_max_low_mapped());
1123 dma_contiguous_reserve(0); 1123 dma_contiguous_reserve(0);
1124 1124
1125 /* 1125 /*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5bdc5430597c..e39504878aec 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -665,7 +665,7 @@ void __init initmem_init(void)
665 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 665 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
666#endif 666#endif
667 667
668 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 668 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
669 sparse_memory_present_with_active_regions(0); 669 sparse_memory_present_with_active_regions(0);
670 670
671#ifdef CONFIG_FLATMEM 671#ifdef CONFIG_FLATMEM
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a9245f..f35c66c5959a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start,
643#ifndef CONFIG_NUMA 643#ifndef CONFIG_NUMA
644void __init initmem_init(void) 644void __init initmem_init(void)
645{ 645{
646 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); 646 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
647} 647}
648#endif 648#endif
649 649
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 8dabbed409ee..1e9da795767a 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74 u64 i; 74 u64 i;
75 phys_addr_t this_start, this_end; 75 phys_addr_t this_start, this_end;
76 76
77 for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { 77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
78 this_start = clamp_t(phys_addr_t, this_start, start, end); 78 this_start = clamp_t(phys_addr_t, this_start, start, end);
79 this_end = clamp_t(phys_addr_t, this_end, start, end); 79 this_end = clamp_t(phys_addr_t, this_end, start, end);
80 if (this_start < this_end) { 80 if (this_start < this_end) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c85da7bb6b60..81b2750f3666 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -491,7 +491,16 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
491 491
492 for (i = 0; i < mi->nr_blks; i++) { 492 for (i = 0; i < mi->nr_blks; i++) {
493 struct numa_memblk *mb = &mi->blk[i]; 493 struct numa_memblk *mb = &mi->blk[i];
494 memblock_set_node(mb->start, mb->end - mb->start, mb->nid); 494 memblock_set_node(mb->start, mb->end - mb->start,
495 &memblock.memory, mb->nid);
496
497 /*
498 * At this time, all memory regions reserved by memblock are
499 * used by the kernel. Set the nid in memblock.reserved will
500 * mark out all the nodes the kernel resides in.
501 */
502 memblock_set_node(mb->start, mb->end - mb->start,
503 &memblock.reserved, mb->nid);
495 } 504 }
496 505
497 /* 506 /*
@@ -553,6 +562,30 @@ static void __init numa_init_array(void)
553 } 562 }
554} 563}
555 564
565static void __init numa_clear_kernel_node_hotplug(void)
566{
567 int i, nid;
568 nodemask_t numa_kernel_nodes;
569 unsigned long start, end;
570 struct memblock_type *type = &memblock.reserved;
571
572 /* Mark all kernel nodes. */
573 for (i = 0; i < type->cnt; i++)
574 node_set(type->regions[i].nid, numa_kernel_nodes);
575
576 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
577 for (i = 0; i < numa_meminfo.nr_blks; i++) {
578 nid = numa_meminfo.blk[i].nid;
579 if (!node_isset(nid, numa_kernel_nodes))
580 continue;
581
582 start = numa_meminfo.blk[i].start;
583 end = numa_meminfo.blk[i].end;
584
585 memblock_clear_hotplug(start, end - start);
586 }
587}
588
556static int __init numa_init(int (*init_func)(void)) 589static int __init numa_init(int (*init_func)(void))
557{ 590{
558 int i; 591 int i;
@@ -565,7 +598,12 @@ static int __init numa_init(int (*init_func)(void))
565 nodes_clear(node_possible_map); 598 nodes_clear(node_possible_map);
566 nodes_clear(node_online_map); 599 nodes_clear(node_online_map);
567 memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 600 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
568 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); 601 WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
602 MAX_NUMNODES));
603 WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
604 MAX_NUMNODES));
605 /* In case that parsing SRAT failed. */
606 WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
569 numa_reset_distance(); 607 numa_reset_distance();
570 608
571 ret = init_func(); 609 ret = init_func();
@@ -601,6 +639,16 @@ static int __init numa_init(int (*init_func)(void))
601 numa_clear_node(i); 639 numa_clear_node(i);
602 } 640 }
603 numa_init_array(); 641 numa_init_array();
642
643 /*
644 * At very early time, the kernel have to use some memory such as
645 * loading the kernel image. We cannot prevent this anyway. So any
646 * node the kernel resides in should be un-hotpluggable.
647 *
648 * And when we come here, numa_init() won't fail.
649 */
650 numa_clear_kernel_node_hotplug();
651
604 return 0; 652 return 0;
605} 653}
606 654
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 266ca912f62e..1a25187e151e 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -181,6 +181,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
181 (unsigned long long) start, (unsigned long long) end - 1, 181 (unsigned long long) start, (unsigned long long) end - 1,
182 hotpluggable ? " hotplug" : ""); 182 hotpluggable ? " hotplug" : "");
183 183
184 /* Mark hotplug range in memblock. */
185 if (hotpluggable && memblock_mark_hotplug(start, ma->length))
186 pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
187 (unsigned long long)start, (unsigned long long)end - 1);
188
184 return 0; 189 return 0;
185out_err_bad_srat: 190out_err_bad_srat:
186 bad_srat(); 191 bad_srat();
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index f895a8c8a244..92c5937f80c3 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -22,7 +22,6 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/backing-dev.h> 24#include <linux/backing-dev.h>
25#include <linux/bootmem.h>
26#include <linux/splice.h> 25#include <linux/splice.h>
27#include <linux/pfn.h> 26#include <linux/pfn.h>
28#include <linux/export.h> 27#include <linux/export.h>
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index e2e04b007e15..17cf96c45f2b 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -324,7 +324,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
324{ 324{
325 struct firmware_map_entry *entry; 325 struct firmware_map_entry *entry;
326 326
327 entry = alloc_bootmem(sizeof(struct firmware_map_entry)); 327 entry = memblock_virt_alloc(sizeof(struct firmware_map_entry), 0);
328 if (WARN_ON(!entry)) 328 if (WARN_ON(!entry))
329 return -ENOMEM; 329 return -ENOMEM;
330 330
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 43b9bfea48fa..59779e19315e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -917,7 +917,7 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level,
917 917
918 /* If range covers entire pagetable, free it */ 918 /* If range covers entire pagetable, free it */
919 if (!(start_pfn > level_pfn || 919 if (!(start_pfn > level_pfn ||
920 last_pfn < level_pfn + level_size(level))) { 920 last_pfn < level_pfn + level_size(level) - 1)) {
921 dma_clear_pte(pte); 921 dma_clear_pte(pte);
922 domain_flush_cache(domain, pte, sizeof(*pte)); 922 domain_flush_cache(domain, pte, sizeof(*pte));
923 free_pgtable_page(level_pte); 923 free_pgtable_page(level_pte);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index dc52e13d58e0..3881610b6438 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
680 struct i2c_msg __user *tmsgs; 680 struct i2c_msg __user *tmsgs;
681 struct i2c_msg32 __user *umsgs; 681 struct i2c_msg32 __user *umsgs;
682 compat_caddr_t datap; 682 compat_caddr_t datap;
683 int nmsgs, i; 683 u32 nmsgs;
684 int i;
684 685
685 if (get_user(nmsgs, &udata->nmsgs)) 686 if (get_user(nmsgs, &udata->nmsgs))
686 return -EFAULT; 687 return -EFAULT;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1fedd5f7ccc4..0b9ff4395e6a 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
82 * events. 82 * events.
83 */ 83 */
84static int dnotify_handle_event(struct fsnotify_group *group, 84static int dnotify_handle_event(struct fsnotify_group *group,
85 struct inode *inode,
85 struct fsnotify_mark *inode_mark, 86 struct fsnotify_mark *inode_mark,
86 struct fsnotify_mark *vfsmount_mark, 87 struct fsnotify_mark *vfsmount_mark,
87 struct fsnotify_event *event) 88 u32 mask, void *data, int data_type,
89 const unsigned char *file_name)
88{ 90{
89 struct dnotify_mark *dn_mark; 91 struct dnotify_mark *dn_mark;
90 struct inode *to_tell;
91 struct dnotify_struct *dn; 92 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 93 struct dnotify_struct **prev;
93 struct fown_struct *fown; 94 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
95 96
96 BUG_ON(vfsmount_mark); 97 /* not a dir, dnotify doesn't care */
98 if (!S_ISDIR(inode->i_mode))
99 return 0;
97 100
98 to_tell = event->to_tell; 101 BUG_ON(vfsmount_mark);
99 102
100 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); 103 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
101 104
@@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group,
122 return 0; 125 return 0;
123} 126}
124 127
125/*
126 * Given an inode and mask determine if dnotify would be interested in sending
127 * userspace notification for that pair.
128 */
129static bool dnotify_should_send_event(struct fsnotify_group *group,
130 struct inode *inode,
131 struct fsnotify_mark *inode_mark,
132 struct fsnotify_mark *vfsmount_mark,
133 __u32 mask, void *data, int data_type)
134{
135 /* not a dir, dnotify doesn't care */
136 if (!S_ISDIR(inode->i_mode))
137 return false;
138
139 return true;
140}
141
142static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) 128static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
143{ 129{
144 struct dnotify_mark *dn_mark = container_of(fsn_mark, 130 struct dnotify_mark *dn_mark = container_of(fsn_mark,
@@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
152 138
153static struct fsnotify_ops dnotify_fsnotify_ops = { 139static struct fsnotify_ops dnotify_fsnotify_ops = {
154 .handle_event = dnotify_handle_event, 140 .handle_event = dnotify_handle_event,
155 .should_send_event = dnotify_should_send_event,
156 .free_group_priv = NULL,
157 .freeing_mark = NULL,
158 .free_event_priv = NULL,
159}; 141};
160 142
161/* 143/*
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 0c2f9122b262..58772623f02a 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,31 +9,27 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11 11
12static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) 12#include "fanotify.h"
13
14static bool should_merge(struct fsnotify_event *old_fsn,
15 struct fsnotify_event *new_fsn)
13{ 16{
14 pr_debug("%s: old=%p new=%p\n", __func__, old, new); 17 struct fanotify_event_info *old, *new;
15 18
16 if (old->to_tell == new->to_tell &&
17 old->data_type == new->data_type &&
18 old->tgid == new->tgid) {
19 switch (old->data_type) {
20 case (FSNOTIFY_EVENT_PATH):
21#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 19#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
22 /* dont merge two permission events */ 20 /* dont merge two permission events */
23 if ((old->mask & FAN_ALL_PERM_EVENTS) && 21 if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) &&
24 (new->mask & FAN_ALL_PERM_EVENTS)) 22 (new_fsn->mask & FAN_ALL_PERM_EVENTS))
25 return false; 23 return false;
26#endif 24#endif
27 if ((old->path.mnt == new->path.mnt) && 25 pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
28 (old->path.dentry == new->path.dentry)) 26 old = FANOTIFY_E(old_fsn);
29 return true; 27 new = FANOTIFY_E(new_fsn);
30 break; 28
31 case (FSNOTIFY_EVENT_NONE): 29 if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
32 return true; 30 old->path.mnt == new->path.mnt &&
33 default: 31 old->path.dentry == new->path.dentry)
34 BUG(); 32 return true;
35 };
36 }
37 return false; 33 return false;
38} 34}
39 35
@@ -41,59 +37,28 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
41static struct fsnotify_event *fanotify_merge(struct list_head *list, 37static struct fsnotify_event *fanotify_merge(struct list_head *list,
42 struct fsnotify_event *event) 38 struct fsnotify_event *event)
43{ 39{
44 struct fsnotify_event_holder *test_holder; 40 struct fsnotify_event *test_event;
45 struct fsnotify_event *test_event = NULL; 41 bool do_merge = false;
46 struct fsnotify_event *new_event;
47 42
48 pr_debug("%s: list=%p event=%p\n", __func__, list, event); 43 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
49 44
50 45 list_for_each_entry_reverse(test_event, list, list) {
51 list_for_each_entry_reverse(test_holder, list, event_list) { 46 if (should_merge(test_event, event)) {
52 if (should_merge(test_holder->event, event)) { 47 do_merge = true;
53 test_event = test_holder->event;
54 break; 48 break;
55 } 49 }
56 } 50 }
57 51
58 if (!test_event) 52 if (!do_merge)
59 return NULL; 53 return NULL;
60 54
61 fsnotify_get_event(test_event); 55 test_event->mask |= event->mask;
62 56 return test_event;
63 /* if they are exactly the same we are done */
64 if (test_event->mask == event->mask)
65 return test_event;
66
67 /*
68 * if the refcnt == 2 this is the only queue
69 * for this event and so we can update the mask
70 * in place.
71 */
72 if (atomic_read(&test_event->refcnt) == 2) {
73 test_event->mask |= event->mask;
74 return test_event;
75 }
76
77 new_event = fsnotify_clone_event(test_event);
78
79 /* done with test_event */
80 fsnotify_put_event(test_event);
81
82 /* couldn't allocate memory, merge was not possible */
83 if (unlikely(!new_event))
84 return ERR_PTR(-ENOMEM);
85
86 /* build new event and replace it on the list */
87 new_event->mask = (test_event->mask | event->mask);
88 fsnotify_replace_event(test_holder, new_event);
89
90 /* we hold a reference on new_event from clone_event */
91 return new_event;
92} 57}
93 58
94#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 59#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
95static int fanotify_get_response_from_access(struct fsnotify_group *group, 60static int fanotify_get_response_from_access(struct fsnotify_group *group,
96 struct fsnotify_event *event) 61 struct fanotify_event_info *event)
97{ 62{
98 int ret; 63 int ret;
99 64
@@ -106,7 +71,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
106 return 0; 71 return 0;
107 72
108 /* userspace responded, convert to something usable */ 73 /* userspace responded, convert to something usable */
109 spin_lock(&event->lock);
110 switch (event->response) { 74 switch (event->response) {
111 case FAN_ALLOW: 75 case FAN_ALLOW:
112 ret = 0; 76 ret = 0;
@@ -116,7 +80,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
116 ret = -EPERM; 80 ret = -EPERM;
117 } 81 }
118 event->response = 0; 82 event->response = 0;
119 spin_unlock(&event->lock);
120 83
121 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, 84 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
122 group, event, ret); 85 group, event, ret);
@@ -125,58 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
125} 88}
126#endif 89#endif
127 90
128static int fanotify_handle_event(struct fsnotify_group *group, 91static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
129 struct fsnotify_mark *inode_mark,
130 struct fsnotify_mark *fanotify_mark,
131 struct fsnotify_event *event)
132{
133 int ret = 0;
134 struct fsnotify_event *notify_event = NULL;
135
136 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
137 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
138 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
139 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
140 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
141 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
142 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
143 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
144 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
145 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
146
147 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
148
149 notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
150 if (IS_ERR(notify_event))
151 return PTR_ERR(notify_event);
152
153#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
154 if (event->mask & FAN_ALL_PERM_EVENTS) {
155 /* if we merged we need to wait on the new event */
156 if (notify_event)
157 event = notify_event;
158 ret = fanotify_get_response_from_access(group, event);
159 }
160#endif
161
162 if (notify_event)
163 fsnotify_put_event(notify_event);
164
165 return ret;
166}
167
168static bool fanotify_should_send_event(struct fsnotify_group *group,
169 struct inode *to_tell,
170 struct fsnotify_mark *inode_mark,
171 struct fsnotify_mark *vfsmnt_mark, 92 struct fsnotify_mark *vfsmnt_mark,
172 __u32 event_mask, void *data, int data_type) 93 u32 event_mask,
94 void *data, int data_type)
173{ 95{
174 __u32 marks_mask, marks_ignored_mask; 96 __u32 marks_mask, marks_ignored_mask;
175 struct path *path = data; 97 struct path *path = data;
176 98
177 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 99 pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
178 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 100 " data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
179 inode_mark, vfsmnt_mark, event_mask, data, data_type); 101 event_mask, data, data_type);
180 102
181 /* if we don't have enough info to send an event to userspace say no */ 103 /* if we don't have enough info to send an event to userspace say no */
182 if (data_type != FSNOTIFY_EVENT_PATH) 104 if (data_type != FSNOTIFY_EVENT_PATH)
@@ -217,6 +139,74 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
217 return false; 139 return false;
218} 140}
219 141
142static int fanotify_handle_event(struct fsnotify_group *group,
143 struct inode *inode,
144 struct fsnotify_mark *inode_mark,
145 struct fsnotify_mark *fanotify_mark,
146 u32 mask, void *data, int data_type,
147 const unsigned char *file_name)
148{
149 int ret = 0;
150 struct fanotify_event_info *event;
151 struct fsnotify_event *fsn_event;
152 struct fsnotify_event *notify_fsn_event;
153
154 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
155 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
156 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
157 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
158 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
159 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
160 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
161 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
162 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
163 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
164
165 if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
166 data_type))
167 return 0;
168
169 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
170 mask);
171
172 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
173 if (unlikely(!event))
174 return -ENOMEM;
175
176 fsn_event = &event->fse;
177 fsnotify_init_event(fsn_event, inode, mask);
178 event->tgid = get_pid(task_tgid(current));
179 if (data_type == FSNOTIFY_EVENT_PATH) {
180 struct path *path = data;
181 event->path = *path;
182 path_get(&event->path);
183 } else {
184 event->path.mnt = NULL;
185 event->path.dentry = NULL;
186 }
187#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
188 event->response = 0;
189#endif
190
191 notify_fsn_event = fsnotify_add_notify_event(group, fsn_event,
192 fanotify_merge);
193 if (notify_fsn_event) {
194 /* Our event wasn't used in the end. Free it. */
195 fsnotify_destroy_event(group, fsn_event);
196 if (IS_ERR(notify_fsn_event))
197 return PTR_ERR(notify_fsn_event);
198 /* We need to ask about a different events after a merge... */
199 event = FANOTIFY_E(notify_fsn_event);
200 fsn_event = notify_fsn_event;
201 }
202
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (fsn_event->mask & FAN_ALL_PERM_EVENTS)
205 ret = fanotify_get_response_from_access(group, event);
206#endif
207 return ret;
208}
209
220static void fanotify_free_group_priv(struct fsnotify_group *group) 210static void fanotify_free_group_priv(struct fsnotify_group *group)
221{ 211{
222 struct user_struct *user; 212 struct user_struct *user;
@@ -226,10 +216,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
226 free_uid(user); 216 free_uid(user);
227} 217}
228 218
219static void fanotify_free_event(struct fsnotify_event *fsn_event)
220{
221 struct fanotify_event_info *event;
222
223 event = FANOTIFY_E(fsn_event);
224 path_put(&event->path);
225 put_pid(event->tgid);
226 kmem_cache_free(fanotify_event_cachep, event);
227}
228
229const struct fsnotify_ops fanotify_fsnotify_ops = { 229const struct fsnotify_ops fanotify_fsnotify_ops = {
230 .handle_event = fanotify_handle_event, 230 .handle_event = fanotify_handle_event,
231 .should_send_event = fanotify_should_send_event,
232 .free_group_priv = fanotify_free_group_priv, 231 .free_group_priv = fanotify_free_group_priv,
233 .free_event_priv = NULL, 232 .free_event = fanotify_free_event,
234 .freeing_mark = NULL,
235}; 233};
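
The block of BUILD_BUG_ON() checks added to fanotify_handle_event() above is a set of compile-time assertions: fanotify can hand fsn_event->mask straight back to userspace only because every FAN_* constant is defined to the same value as its FS_* counterpart, and these checks turn any future divergence into a build failure. Below is a minimal, self-contained userspace sketch of the same idea using C11 static_assert; the bit values are illustrative stand-ins, not values copied from the kernel headers.

#include <assert.h>	/* static_assert (C11) */

/* Illustrative values only: stand-ins for the FS_ and FAN_ bit definitions. */
#define FS_ACCESS	0x00000001
#define FAN_ACCESS	0x00000001
#define FS_MODIFY	0x00000002
#define FAN_MODIFY	0x00000002

/* Compilation fails if the user-visible bits ever drift from the internal ones. */
static_assert(FAN_ACCESS == FS_ACCESS, "FAN_ACCESS must equal FS_ACCESS");
static_assert(FAN_MODIFY == FS_MODIFY, "FAN_MODIFY must equal FS_MODIFY");

int main(void)
{
	return 0;
}

Doing the check at compile time means a mismatch can never reach a running system; the mask translation stays a no-op by construction.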
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
new file mode 100644
index 000000000000..0e90174a116a
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,23 @@
1#include <linux/fsnotify_backend.h>
2#include <linux/path.h>
3#include <linux/slab.h>
4
5extern struct kmem_cache *fanotify_event_cachep;
6
7struct fanotify_event_info {
8 struct fsnotify_event fse;
9 /*
10 * We hold ref to this path so it may be dereferenced at any point
11 * during this object's lifetime
12 */
13 struct path path;
14 struct pid *tgid;
15#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
16 u32 response; /* userspace answer to question */
17#endif
18};
19
20static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
21{
22 return container_of(fse, struct fanotify_event_info, fse);
23}
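
The new fanotify.h shows the pattern the whole series is built on: each backend embeds a struct fsnotify_event as a member of its own event structure and recovers the outer structure with container_of(), as FANOTIFY_E() does above. The following is a standalone userspace sketch of that embedding pattern (plain C with stand-in type names, not kernel code), just to make the pointer arithmetic behind container_of() concrete.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace stand-in for the kernel's container_of() helper. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Stand-in for struct fsnotify_event: the generic part every backend shares. */
struct base_event {
	unsigned int mask;
};

/* Stand-in for struct fanotify_event_info: backend data wrapped around the base. */
struct fan_event {
	struct base_event fse;	/* embedded generic event */
	int tgid;
};

/* Mirrors FANOTIFY_E(): recover the wrapper from a pointer to the embedded base. */
static struct fan_event *FAN_E(struct base_event *fse)
{
	return container_of(fse, struct fan_event, fse);
}

int main(void)
{
	struct fan_event *ev = malloc(sizeof(*ev));

	if (!ev)
		return 1;
	ev->fse.mask = 0x1;
	ev->tgid = 1234;

	/* Generic code only ever sees &ev->fse; the backend gets its data back. */
	struct base_event *generic = &ev->fse;
	printf("tgid=%d mask=%x\n", FAN_E(generic)->tgid, generic->mask);
	free(ev);
	return 0;
}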
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e44cb6427df3..57d7c083cb4b 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -19,6 +19,7 @@
19 19
20#include "../../mount.h" 20#include "../../mount.h"
21#include "../fdinfo.h" 21#include "../fdinfo.h"
22#include "fanotify.h"
22 23
23#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 24#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
24#define FANOTIFY_DEFAULT_MAX_MARKS 8192 25#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops;
28 29
29static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
30static struct kmem_cache *fanotify_response_event_cache __read_mostly; 31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly;
31 33
32struct fanotify_response_event { 34struct fanotify_response_event {
33 struct list_head list; 35 struct list_head list;
34 __s32 fd; 36 __s32 fd;
35 struct fsnotify_event *event; 37 struct fanotify_event_info *event;
36}; 38};
37 39
38/* 40/*
@@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
61} 63}
62 64
63static int create_fd(struct fsnotify_group *group, 65static int create_fd(struct fsnotify_group *group,
64 struct fsnotify_event *event, 66 struct fanotify_event_info *event,
65 struct file **file) 67 struct file **file)
66{ 68{
67 int client_fd; 69 int client_fd;
68 struct file *new_file; 70 struct file *new_file;
@@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group,
73 if (client_fd < 0) 75 if (client_fd < 0)
74 return client_fd; 76 return client_fd;
75 77
76 if (event->data_type != FSNOTIFY_EVENT_PATH) {
77 WARN_ON(1);
78 put_unused_fd(client_fd);
79 return -EINVAL;
80 }
81
82 /* 78 /*
83 * we need a new file handle for the userspace program so it can read even if it was 79 * we need a new file handle for the userspace program so it can read even if it was
84 * originally opened O_WRONLY. 80 * originally opened O_WRONLY.
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group,
109} 105}
110 106
111static int fill_event_metadata(struct fsnotify_group *group, 107static int fill_event_metadata(struct fsnotify_group *group,
112 struct fanotify_event_metadata *metadata, 108 struct fanotify_event_metadata *metadata,
113 struct fsnotify_event *event, 109 struct fsnotify_event *fsn_event,
114 struct file **file) 110 struct file **file)
115{ 111{
116 int ret = 0; 112 int ret = 0;
113 struct fanotify_event_info *event;
117 114
118 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
119 group, metadata, event); 116 group, metadata, fsn_event);
120 117
121 *file = NULL; 118 *file = NULL;
119 event = container_of(fsn_event, struct fanotify_event_info, fse);
122 metadata->event_len = FAN_EVENT_METADATA_LEN; 120 metadata->event_len = FAN_EVENT_METADATA_LEN;
123 metadata->metadata_len = FAN_EVENT_METADATA_LEN; 121 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
124 metadata->vers = FANOTIFY_METADATA_VERSION; 122 metadata->vers = FANOTIFY_METADATA_VERSION;
125 metadata->reserved = 0; 123 metadata->reserved = 0;
126 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 124 metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
127 metadata->pid = pid_vnr(event->tgid); 125 metadata->pid = pid_vnr(event->tgid);
128 if (unlikely(event->mask & FAN_Q_OVERFLOW)) 126 if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
129 metadata->fd = FAN_NOFD; 127 metadata->fd = FAN_NOFD;
130 else { 128 else {
131 metadata->fd = create_fd(group, event, file); 129 metadata->fd = create_fd(group, event, file);
@@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
209 if (!re) 207 if (!re)
210 return -ENOMEM; 208 return -ENOMEM;
211 209
212 re->event = event; 210 re->event = FANOTIFY_E(event);
213 re->fd = fd; 211 re->fd = fd;
214 212
215 mutex_lock(&group->fanotify_data.access_mutex); 213 mutex_lock(&group->fanotify_data.access_mutex);
@@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
217 if (atomic_read(&group->fanotify_data.bypass_perm)) { 215 if (atomic_read(&group->fanotify_data.bypass_perm)) {
218 mutex_unlock(&group->fanotify_data.access_mutex); 216 mutex_unlock(&group->fanotify_data.access_mutex);
219 kmem_cache_free(fanotify_response_event_cache, re); 217 kmem_cache_free(fanotify_response_event_cache, re);
220 event->response = FAN_ALLOW; 218 FANOTIFY_E(event)->response = FAN_ALLOW;
221 return 0; 219 return 0;
222 } 220 }
223 221
@@ -273,7 +271,7 @@ out_close_fd:
273out: 271out:
274#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 272#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
275 if (event->mask & FAN_ALL_PERM_EVENTS) { 273 if (event->mask & FAN_ALL_PERM_EVENTS) {
276 event->response = FAN_DENY; 274 FANOTIFY_E(event)->response = FAN_DENY;
277 wake_up(&group->fanotify_data.access_waitq); 275 wake_up(&group->fanotify_data.access_waitq);
278 } 276 }
279#endif 277#endif
@@ -321,7 +319,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
321 if (IS_ERR(kevent)) 319 if (IS_ERR(kevent))
322 break; 320 break;
323 ret = copy_event_to_user(group, kevent, buf); 321 ret = copy_event_to_user(group, kevent, buf);
324 fsnotify_put_event(kevent); 322 fsnotify_destroy_event(group, kevent);
325 if (ret < 0) 323 if (ret < 0)
326 break; 324 break;
327 buf += ret; 325 buf += ret;
@@ -409,7 +407,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
409static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 407static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
410{ 408{
411 struct fsnotify_group *group; 409 struct fsnotify_group *group;
412 struct fsnotify_event_holder *holder; 410 struct fsnotify_event *fsn_event;
413 void __user *p; 411 void __user *p;
414 int ret = -ENOTTY; 412 int ret = -ENOTTY;
415 size_t send_len = 0; 413 size_t send_len = 0;
@@ -421,7 +419,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
421 switch (cmd) { 419 switch (cmd) {
422 case FIONREAD: 420 case FIONREAD:
423 mutex_lock(&group->notification_mutex); 421 mutex_lock(&group->notification_mutex);
424 list_for_each_entry(holder, &group->notification_list, event_list) 422 list_for_each_entry(fsn_event, &group->notification_list, list)
425 send_len += FAN_EVENT_METADATA_LEN; 423 send_len += FAN_EVENT_METADATA_LEN;
426 mutex_unlock(&group->notification_mutex); 424 mutex_unlock(&group->notification_mutex);
427 ret = put_user(send_len, (int __user *) p); 425 ret = put_user(send_len, (int __user *) p);
@@ -906,6 +904,7 @@ static int __init fanotify_user_setup(void)
906 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); 904 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
907 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, 905 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
908 SLAB_PANIC); 906 SLAB_PANIC);
907 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
909 908
910 return 0; 909 return 0;
911} 910}
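
With events now linked directly onto group->notification_list, the FIONREAD ioctl above becomes a plain walk of that list, adding a fixed FAN_EVENT_METADATA_LEN per queued event (the inotify variant later in this series does the same walk but also adds each event's padded name length). A tiny standalone sketch of that accounting over a singly linked list, with an illustrative metadata size:

#include <stdio.h>

#define METADATA_LEN 24u	/* illustrative stand-in for FAN_EVENT_METADATA_LEN */

struct event {
	struct event *next;
	unsigned int mask;
};

/* Mirrors the FIONREAD branch: bytes a reader would get without blocking. */
static unsigned int pending_bytes(const struct event *head)
{
	unsigned int send_len = 0;

	for (const struct event *e = head; e; e = e->next)
		send_len += METADATA_LEN;
	return send_len;
}

int main(void)
{
	struct event c = { NULL, 4 }, b = { &c, 2 }, a = { &b, 1 };

	printf("FIONREAD would report %u bytes\n", pending_bytes(&a));	/* 72 */
	return 0;
}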
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4bb21d67d9b1..1d4e1ea2f37c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell,
128 struct fsnotify_mark *vfsmount_mark, 128 struct fsnotify_mark *vfsmount_mark,
129 __u32 mask, void *data, 129 __u32 mask, void *data,
130 int data_is, u32 cookie, 130 int data_is, u32 cookie,
131 const unsigned char *file_name, 131 const unsigned char *file_name)
132 struct fsnotify_event **event)
133{ 132{
134 struct fsnotify_group *group = NULL; 133 struct fsnotify_group *group = NULL;
135 __u32 inode_test_mask = 0; 134 __u32 inode_test_mask = 0;
@@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell,
170 169
171 pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" 170 pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
172 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" 171 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
173 " data=%p data_is=%d cookie=%d event=%p\n", 172 " data=%p data_is=%d cookie=%d\n",
174 __func__, group, to_tell, mask, inode_mark, 173 __func__, group, to_tell, mask, inode_mark,
175 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, 174 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
176 data_is, cookie, *event); 175 data_is, cookie);
177 176
178 if (!inode_test_mask && !vfsmount_test_mask) 177 if (!inode_test_mask && !vfsmount_test_mask)
179 return 0; 178 return 0;
180 179
181 if (group->ops->should_send_event(group, to_tell, inode_mark, 180 return group->ops->handle_event(group, to_tell, inode_mark,
182 vfsmount_mark, mask, data, 181 vfsmount_mark, mask, data, data_is,
183 data_is) == false) 182 file_name);
184 return 0;
185
186 if (!*event) {
187 *event = fsnotify_create_event(to_tell, mask, data,
188 data_is, file_name,
189 cookie, GFP_KERNEL);
190 if (!*event)
191 return -ENOMEM;
192 }
193 return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
194} 183}
195 184
196/* 185/*
@@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
205 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; 194 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
206 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; 195 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
207 struct fsnotify_group *inode_group, *vfsmount_group; 196 struct fsnotify_group *inode_group, *vfsmount_group;
208 struct fsnotify_event *event = NULL;
209 struct mount *mnt; 197 struct mount *mnt;
210 int idx, ret = 0; 198 int idx, ret = 0;
211 /* global tests shouldn't care about events on child only the specific event */ 199 /* global tests shouldn't care about events on child only the specific event */
@@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
258 246
259 if (inode_group > vfsmount_group) { 247 if (inode_group > vfsmount_group) {
260 /* handle inode */ 248 /* handle inode */
261 ret = send_to_group(to_tell, inode_mark, NULL, mask, data, 249 ret = send_to_group(to_tell, inode_mark, NULL, mask,
262 data_is, cookie, file_name, &event); 250 data, data_is, cookie, file_name);
263 /* we didn't use the vfsmount_mark */ 251 /* we didn't use the vfsmount_mark */
264 vfsmount_group = NULL; 252 vfsmount_group = NULL;
265 } else if (vfsmount_group > inode_group) { 253 } else if (vfsmount_group > inode_group) {
266 ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, 254 ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
267 data_is, cookie, file_name, &event); 255 data, data_is, cookie, file_name);
268 inode_group = NULL; 256 inode_group = NULL;
269 } else { 257 } else {
270 ret = send_to_group(to_tell, inode_mark, vfsmount_mark, 258 ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
271 mask, data, data_is, cookie, file_name, 259 mask, data, data_is, cookie,
272 &event); 260 file_name);
273 } 261 }
274 262
275 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) 263 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
@@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
285 ret = 0; 273 ret = 0;
286out: 274out:
287 srcu_read_unlock(&fsnotify_mark_srcu, idx); 275 srcu_read_unlock(&fsnotify_mark_srcu, idx);
288 /*
289 * fsnotify_create_event() took a reference so the event can't be cleaned
290 * up while we are still trying to add it to lists, drop that one.
291 */
292 if (event)
293 fsnotify_put_event(event);
294 276
295 return ret; 277 return ret;
296} 278}
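
The fsnotify.c hunks show the core of the rework: send_to_group() no longer pre-allocates one shared event and no longer calls a separate should_send_event() hook; it filters on the marks' masks and then hands the raw notification data (mask, data, data_type, file_name) to the group's handle_event() callback, which allocates only if it actually wants the event. A bare-bones sketch of that ops-table dispatch, with hypothetical stand-in types rather than the real fsnotify structures:

#include <stdio.h>

struct group;

/* Stand-in for struct fsnotify_ops after the patch: one combined callback. */
struct notify_ops {
	int (*handle_event)(struct group *group, unsigned int mask,
			    const void *data, int data_type,
			    const char *file_name);
};

struct group {
	const struct notify_ops *ops;
	unsigned int mask;		/* events this group cares about */
};

/* Mirrors send_to_group(): filter on the mask, then hand off the raw data. */
static int send_to_group(struct group *g, unsigned int mask,
			 const void *data, int data_type, const char *name)
{
	if (!(mask & g->mask))
		return 0;		/* group not interested, nothing allocated */
	return g->ops->handle_event(g, mask, data, data_type, name);
}

static int log_handle_event(struct group *g, unsigned int mask,
			    const void *data, int data_type, const char *name)
{
	(void)g; (void)data; (void)data_type;
	printf("event mask=%x name=%s\n", mask, name ? name : "(none)");
	return 0;
}

static const struct notify_ops log_ops = { .handle_event = log_handle_event };

int main(void)
{
	struct group g = { .ops = &log_ops, .mask = 0x2 };

	send_to_group(&g, 0x2, NULL, 0, "notes.txt");	/* delivered */
	send_to_group(&g, 0x1, NULL, 0, "notes.txt");	/* filtered out */
	return 0;
}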
diff --git a/fs/notify/group.c b/fs/notify/group.c
index bd2625bd88b4..ee674fe2cec7 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
99 INIT_LIST_HEAD(&group->marks_list); 99 INIT_LIST_HEAD(&group->marks_list);
100 100
101 group->ops = ops; 101 group->ops = ops;
102 fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW);
102 103
103 return group; 104 return group;
104} 105}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index b6642e4de4bf..485eef3f4407 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -2,11 +2,12 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/slab.h> /* struct kmem_cache */ 3#include <linux/slab.h> /* struct kmem_cache */
4 4
5extern struct kmem_cache *event_priv_cachep; 5struct inotify_event_info {
6 6 struct fsnotify_event fse;
7struct inotify_event_private_data {
8 struct fsnotify_event_private_data fsnotify_event_priv_data;
9 int wd; 7 int wd;
8 u32 sync_cookie;
9 int name_len;
10 char name[];
10}; 11};
11 12
12struct inotify_inode_mark { 13struct inotify_inode_mark {
@@ -14,8 +15,18 @@ struct inotify_inode_mark {
14 int wd; 15 int wd;
15}; 16};
16 17
18static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
19{
20 return container_of(fse, struct inotify_event_info, fse);
21}
22
17extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, 23extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
18 struct fsnotify_group *group); 24 struct fsnotify_group *group);
19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 25extern int inotify_handle_event(struct fsnotify_group *group,
26 struct inode *inode,
27 struct fsnotify_mark *inode_mark,
28 struct fsnotify_mark *vfsmount_mark,
29 u32 mask, void *data, int data_type,
30 const unsigned char *file_name);
20 31
21extern const struct fsnotify_ops inotify_fsnotify_ops; 32extern const struct fsnotify_ops inotify_fsnotify_ops;
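
struct inotify_event_info now stores the file name inline via a C99 flexible array member (char name[]), so inotify_handle_event() can cover the structure and the string with a single kmalloc() of sizeof(struct inotify_event_info) + len + 1. A self-contained userspace sketch of that allocation pattern, with malloc() standing in for kmalloc() and made-up type names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for struct inotify_event_info: fixed part plus an inline name. */
struct name_event {
	int wd;
	int name_len;
	char name[];	/* flexible array member, storage allocated with the struct */
};

static struct name_event *alloc_name_event(int wd, const char *file_name)
{
	size_t len = file_name ? strlen(file_name) : 0;
	size_t alloc_len = sizeof(struct name_event);
	struct name_event *ev;

	if (len)
		alloc_len += len + 1;	/* room for the name and its NUL */

	ev = malloc(alloc_len);
	if (!ev)
		return NULL;
	ev->wd = wd;
	ev->name_len = (int)len;
	if (len)
		strcpy(ev->name, file_name);
	return ev;
}

int main(void)
{
	struct name_event *ev = alloc_name_event(3, "notes.txt");

	if (!ev)
		return 1;
	printf("wd=%d name=%s len=%d\n", ev->wd, ev->name, ev->name_len);
	free(ev);
	return 0;
}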
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 4216308b81b4..aad1a35e9af1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -34,100 +34,87 @@
34#include "inotify.h" 34#include "inotify.h"
35 35
36/* 36/*
37 * Check if 2 events contain the same information. We do not compare private data 37 * Check if 2 events contain the same information.
38 * but at this moment that isn't a problem for any know fsnotify listeners.
39 */ 38 */
40static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) 39static bool event_compare(struct fsnotify_event *old_fsn,
40 struct fsnotify_event *new_fsn)
41{ 41{
42 if ((old->mask == new->mask) && 42 struct inotify_event_info *old, *new;
43 (old->to_tell == new->to_tell) && 43
44 (old->data_type == new->data_type) && 44 if (old_fsn->mask & FS_IN_IGNORED)
45 (old->name_len == new->name_len)) { 45 return false;
46 switch (old->data_type) { 46 old = INOTIFY_E(old_fsn);
47 case (FSNOTIFY_EVENT_INODE): 47 new = INOTIFY_E(new_fsn);
48 /* remember, after old was put on the wait_q we aren't 48 if ((old_fsn->mask == new_fsn->mask) &&
49 * allowed to look at the inode any more, only thing 49 (old_fsn->inode == new_fsn->inode) &&
50 * left to check was if the file_name is the same */ 50 (old->name_len == new->name_len) &&
51 if (!old->name_len || 51 (!old->name_len || !strcmp(old->name, new->name)))
52 !strcmp(old->file_name, new->file_name)) 52 return true;
53 return true;
54 break;
55 case (FSNOTIFY_EVENT_PATH):
56 if ((old->path.mnt == new->path.mnt) &&
57 (old->path.dentry == new->path.dentry))
58 return true;
59 break;
60 case (FSNOTIFY_EVENT_NONE):
61 if (old->mask & FS_Q_OVERFLOW)
62 return true;
63 else if (old->mask & FS_IN_IGNORED)
64 return false;
65 return true;
66 };
67 }
68 return false; 53 return false;
69} 54}
70 55
71static struct fsnotify_event *inotify_merge(struct list_head *list, 56static struct fsnotify_event *inotify_merge(struct list_head *list,
72 struct fsnotify_event *event) 57 struct fsnotify_event *event)
73{ 58{
74 struct fsnotify_event_holder *last_holder;
75 struct fsnotify_event *last_event; 59 struct fsnotify_event *last_event;
76 60
77 /* and the list better be locked by something too */ 61 last_event = list_entry(list->prev, struct fsnotify_event, list);
78 spin_lock(&event->lock); 62 if (!event_compare(last_event, event))
79 63 return NULL;
80 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
81 last_event = last_holder->event;
82 if (event_compare(last_event, event))
83 fsnotify_get_event(last_event);
84 else
85 last_event = NULL;
86
87 spin_unlock(&event->lock);
88
89 return last_event; 64 return last_event;
90} 65}
91 66
92static int inotify_handle_event(struct fsnotify_group *group, 67int inotify_handle_event(struct fsnotify_group *group,
93 struct fsnotify_mark *inode_mark, 68 struct inode *inode,
94 struct fsnotify_mark *vfsmount_mark, 69 struct fsnotify_mark *inode_mark,
95 struct fsnotify_event *event) 70 struct fsnotify_mark *vfsmount_mark,
71 u32 mask, void *data, int data_type,
72 const unsigned char *file_name)
96{ 73{
97 struct inotify_inode_mark *i_mark; 74 struct inotify_inode_mark *i_mark;
98 struct inode *to_tell; 75 struct inotify_event_info *event;
99 struct inotify_event_private_data *event_priv;
100 struct fsnotify_event_private_data *fsn_event_priv;
101 struct fsnotify_event *added_event; 76 struct fsnotify_event *added_event;
102 int wd, ret = 0; 77 struct fsnotify_event *fsn_event;
78 int ret = 0;
79 int len = 0;
80 int alloc_len = sizeof(struct inotify_event_info);
103 81
104 BUG_ON(vfsmount_mark); 82 BUG_ON(vfsmount_mark);
105 83
106 pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, 84 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
107 event, event->to_tell, event->mask); 85 (data_type == FSNOTIFY_EVENT_PATH)) {
86 struct path *path = data;
108 87
109 to_tell = event->to_tell; 88 if (d_unlinked(path->dentry))
89 return 0;
90 }
91 if (file_name) {
92 len = strlen(file_name);
93 alloc_len += len + 1;
94 }
95
96 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
97 mask);
110 98
111 i_mark = container_of(inode_mark, struct inotify_inode_mark, 99 i_mark = container_of(inode_mark, struct inotify_inode_mark,
112 fsn_mark); 100 fsn_mark);
113 wd = i_mark->wd;
114 101
115 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 102 event = kmalloc(alloc_len, GFP_KERNEL);
116 if (unlikely(!event_priv)) 103 if (unlikely(!event))
117 return -ENOMEM; 104 return -ENOMEM;
118 105
119 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 106 fsn_event = &event->fse;
120 107 fsnotify_init_event(fsn_event, inode, mask);
121 fsnotify_get_group(group); 108 event->wd = i_mark->wd;
122 fsn_event_priv->group = group; 109 event->name_len = len;
123 event_priv->wd = wd; 110 if (len)
111 strcpy(event->name, file_name);
124 112
125 added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); 113 added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge);
126 if (added_event) { 114 if (added_event) {
127 inotify_free_event_priv(fsn_event_priv); 115 /* Our event wasn't used in the end. Free it. */
128 if (!IS_ERR(added_event)) 116 fsnotify_destroy_event(group, fsn_event);
129 fsnotify_put_event(added_event); 117 if (IS_ERR(added_event))
130 else
131 ret = PTR_ERR(added_event); 118 ret = PTR_ERR(added_event);
132 } 119 }
133 120
@@ -142,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
142 inotify_ignored_and_remove_idr(fsn_mark, group); 129 inotify_ignored_and_remove_idr(fsn_mark, group);
143} 130}
144 131
145static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
146 struct fsnotify_mark *inode_mark,
147 struct fsnotify_mark *vfsmount_mark,
148 __u32 mask, void *data, int data_type)
149{
150 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
151 (data_type == FSNOTIFY_EVENT_PATH)) {
152 struct path *path = data;
153
154 if (d_unlinked(path->dentry))
155 return false;
156 }
157
158 return true;
159}
160
161/* 132/*
162 * This is NEVER supposed to be called. Inotify marks should either have been 133 * This is NEVER supposed to be called. Inotify marks should either have been
163 * removed from the idr when the watch was removed or in the 134 * removed from the idr when the watch was removed or in the
@@ -202,22 +173,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
202 free_uid(group->inotify_data.user); 173 free_uid(group->inotify_data.user);
203} 174}
204 175
205void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 176static void inotify_free_event(struct fsnotify_event *fsn_event)
206{ 177{
207 struct inotify_event_private_data *event_priv; 178 kfree(INOTIFY_E(fsn_event));
208
209
210 event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
211 fsnotify_event_priv_data);
212
213 fsnotify_put_group(fsn_event_priv->group);
214 kmem_cache_free(event_priv_cachep, event_priv);
215} 179}
216 180
217const struct fsnotify_ops inotify_fsnotify_ops = { 181const struct fsnotify_ops inotify_fsnotify_ops = {
218 .handle_event = inotify_handle_event, 182 .handle_event = inotify_handle_event,
219 .should_send_event = inotify_should_send_event,
220 .free_group_priv = inotify_free_group_priv, 183 .free_group_priv = inotify_free_group_priv,
221 .free_event_priv = inotify_free_event_priv, 184 .free_event = inotify_free_event,
222 .freeing_mark = inotify_freeing_mark, 185 .freeing_mark = inotify_freeing_mark,
223}; 186};
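
inotify_merge() above only ever compares the incoming event with the tail of the notification list (the most recently queued event), and event_compare() refuses to merge FS_IN_IGNORED events so a watch-removal notification is never silently dropped. A compact userspace sketch of that tail-only coalescing, using a plain array as the queue and a simplified notion of "same event":

#include <stdio.h>
#include <string.h>

struct ev {
	unsigned int mask;
	char name[32];
};

#define QLEN 16
static struct ev queue[QLEN];
static int q_count;

/* Mirrors event_compare(): same mask and same name means "same event". */
static int same_event(const struct ev *a, const struct ev *b)
{
	return a->mask == b->mask && strcmp(a->name, b->name) == 0;
}

/* Tail-only merge: only the most recently queued event is a merge candidate. */
static int enqueue(const struct ev *e)
{
	if (q_count && same_event(&queue[q_count - 1], e))
		return 1;	/* coalesced with the tail, nothing new queued */
	if (q_count == QLEN)
		return -1;	/* queue full (the kernel queues an overflow event) */
	queue[q_count++] = *e;
	return 0;
}

int main(void)
{
	struct ev a = { 0x2, "notes.txt" };	/* e.g. two back-to-back modify events */

	enqueue(&a);
	enqueue(&a);			/* merged with the tail */
	printf("queued=%d\n", q_count);	/* prints 1 */
	return 0;
}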
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 60f954a891ab..497395c8274b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly;
50static int inotify_max_user_watches __read_mostly; 50static int inotify_max_user_watches __read_mostly;
51 51
52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
53struct kmem_cache *event_priv_cachep __read_mostly;
54 53
55#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
56 55
@@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
124 return ret; 123 return ret;
125} 124}
126 125
126static int round_event_name_len(struct fsnotify_event *fsn_event)
127{
128 struct inotify_event_info *event;
129
130 event = INOTIFY_E(fsn_event);
131 if (!event->name_len)
132 return 0;
133 return roundup(event->name_len + 1, sizeof(struct inotify_event));
134}
135
127/* 136/*
128 * Get an inotify_kernel_event if one exists and is small 137 * Get an inotify_kernel_event if one exists and is small
129 * enough to fit in "count". Return an error pointer if 138 * enough to fit in "count". Return an error pointer if
@@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
144 153
145 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 154 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
146 155
147 if (event->name_len) 156 event_size += round_event_name_len(event);
148 event_size += roundup(event->name_len + 1, event_size);
149
150 if (event_size > count) 157 if (event_size > count)
151 return ERR_PTR(-EINVAL); 158 return ERR_PTR(-EINVAL);
152 159
@@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
164 * buffer we had in "get_one_event()" above. 171 * buffer we had in "get_one_event()" above.
165 */ 172 */
166static ssize_t copy_event_to_user(struct fsnotify_group *group, 173static ssize_t copy_event_to_user(struct fsnotify_group *group,
167 struct fsnotify_event *event, 174 struct fsnotify_event *fsn_event,
168 char __user *buf) 175 char __user *buf)
169{ 176{
170 struct inotify_event inotify_event; 177 struct inotify_event inotify_event;
171 struct fsnotify_event_private_data *fsn_priv; 178 struct inotify_event_info *event;
172 struct inotify_event_private_data *priv;
173 size_t event_size = sizeof(struct inotify_event); 179 size_t event_size = sizeof(struct inotify_event);
174 size_t name_len = 0; 180 size_t name_len;
175 181 size_t pad_name_len;
176 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
177 182
178 /* we get the inotify watch descriptor from the event private data */ 183 pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
179 spin_lock(&event->lock);
180 fsn_priv = fsnotify_remove_priv_from_event(group, event);
181 spin_unlock(&event->lock);
182
183 if (!fsn_priv)
184 inotify_event.wd = -1;
185 else {
186 priv = container_of(fsn_priv, struct inotify_event_private_data,
187 fsnotify_event_priv_data);
188 inotify_event.wd = priv->wd;
189 inotify_free_event_priv(fsn_priv);
190 }
191 184
185 event = INOTIFY_E(fsn_event);
186 name_len = event->name_len;
192 /* 187 /*
193 * round up event->name_len so it is a multiple of event_size 188 * round up name length so it is a multiple of event_size
194 * plus an extra byte for the terminating '\0'. 189 * plus an extra byte for the terminating '\0'.
195 */ 190 */
196 if (event->name_len) 191 pad_name_len = round_event_name_len(fsn_event);
197 name_len = roundup(event->name_len + 1, event_size); 192 inotify_event.len = pad_name_len;
198 inotify_event.len = name_len; 193 inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
199 194 inotify_event.wd = event->wd;
200 inotify_event.mask = inotify_mask_to_arg(event->mask);
201 inotify_event.cookie = event->sync_cookie; 195 inotify_event.cookie = event->sync_cookie;
202 196
203 /* send the main event */ 197 /* send the main event */
@@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
209 /* 203 /*
210 * fsnotify only stores the pathname, so here we have to send the pathname 204 * fsnotify only stores the pathname, so here we have to send the pathname
211 * and then pad that pathname out to a multiple of sizeof(inotify_event) 205 * and then pad that pathname out to a multiple of sizeof(inotify_event)
212 * with zeros. I get my zeros from the nul_inotify_event. 206 * with zeros.
213 */ 207 */
214 if (name_len) { 208 if (pad_name_len) {
215 unsigned int len_to_zero = name_len - event->name_len;
216 /* copy the path name */ 209 /* copy the path name */
217 if (copy_to_user(buf, event->file_name, event->name_len)) 210 if (copy_to_user(buf, event->name, name_len))
218 return -EFAULT; 211 return -EFAULT;
219 buf += event->name_len; 212 buf += name_len;
220 213
221 /* fill userspace with 0's */ 214 /* fill userspace with 0's */
222 if (clear_user(buf, len_to_zero)) 215 if (clear_user(buf, pad_name_len - name_len))
223 return -EFAULT; 216 return -EFAULT;
224 buf += len_to_zero; 217 event_size += pad_name_len;
225 event_size += name_len;
226 } 218 }
227 219
228 return event_size; 220 return event_size;
@@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
254 if (IS_ERR(kevent)) 246 if (IS_ERR(kevent))
255 break; 247 break;
256 ret = copy_event_to_user(group, kevent, buf); 248 ret = copy_event_to_user(group, kevent, buf);
257 fsnotify_put_event(kevent); 249 fsnotify_destroy_event(group, kevent);
258 if (ret < 0) 250 if (ret < 0)
259 break; 251 break;
260 buf += ret; 252 buf += ret;
@@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
297 unsigned long arg) 289 unsigned long arg)
298{ 290{
299 struct fsnotify_group *group; 291 struct fsnotify_group *group;
300 struct fsnotify_event_holder *holder; 292 struct fsnotify_event *fsn_event;
301 struct fsnotify_event *event;
302 void __user *p; 293 void __user *p;
303 int ret = -ENOTTY; 294 int ret = -ENOTTY;
304 size_t send_len = 0; 295 size_t send_len = 0;
@@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
311 switch (cmd) { 302 switch (cmd) {
312 case FIONREAD: 303 case FIONREAD:
313 mutex_lock(&group->notification_mutex); 304 mutex_lock(&group->notification_mutex);
314 list_for_each_entry(holder, &group->notification_list, event_list) { 305 list_for_each_entry(fsn_event, &group->notification_list,
315 event = holder->event; 306 list) {
316 send_len += sizeof(struct inotify_event); 307 send_len += sizeof(struct inotify_event);
317 if (event->name_len) 308 send_len += round_event_name_len(fsn_event);
318 send_len += roundup(event->name_len + 1,
319 sizeof(struct inotify_event));
320 } 309 }
321 mutex_unlock(&group->notification_mutex); 310 mutex_unlock(&group->notification_mutex);
322 ret = put_user(send_len, (int __user *) p); 311 ret = put_user(send_len, (int __user *) p);
@@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
503 struct fsnotify_group *group) 492 struct fsnotify_group *group)
504{ 493{
505 struct inotify_inode_mark *i_mark; 494 struct inotify_inode_mark *i_mark;
506 struct fsnotify_event *ignored_event, *notify_event;
507 struct inotify_event_private_data *event_priv;
508 struct fsnotify_event_private_data *fsn_event_priv;
509 int ret;
510
511 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
512
513 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
514 FSNOTIFY_EVENT_NONE, NULL, 0,
515 GFP_NOFS);
516 if (!ignored_event)
517 goto skip_send_ignore;
518
519 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
520 if (unlikely(!event_priv))
521 goto skip_send_ignore;
522
523 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
524
525 fsnotify_get_group(group);
526 fsn_event_priv->group = group;
527 event_priv->wd = i_mark->wd;
528
529 notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
530 if (notify_event) {
531 if (IS_ERR(notify_event))
532 ret = PTR_ERR(notify_event);
533 else
534 fsnotify_put_event(notify_event);
535 inotify_free_event_priv(fsn_event_priv);
536 }
537 495
538skip_send_ignore: 496 /* Queue ignore event for the watch */
539 /* matches the reference taken when the event was created */ 497 inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
540 if (ignored_event) 498 NULL, FSNOTIFY_EVENT_NONE, NULL);
541 fsnotify_put_event(ignored_event);
542 499
500 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
543 /* remove this mark from the idr */ 501 /* remove this mark from the idr */
544 inotify_remove_from_idr(group, i_mark); 502 inotify_remove_from_idr(group, i_mark);
545 503
@@ -836,7 +794,6 @@ static int __init inotify_user_setup(void)
836 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); 794 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
837 795
838 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); 796 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
839 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
840 797
841 inotify_max_queued_events = 16384; 798 inotify_max_queued_events = 16384;
842 inotify_max_user_instances = 128; 799 inotify_max_user_instances = 128;
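
The new round_event_name_len() helper centralizes the padding rule used in three places above (get_one_event(), copy_event_to_user() and FIONREAD): a non-empty name is reported as name_len + 1 (for the trailing '\0') rounded up to a multiple of sizeof(struct inotify_event), and that padded length is both what inotify_event.len advertises and what copy_event_to_user() zero-fills. A small standalone sketch of the arithmetic; the 16-byte event size is an illustrative stand-in for sizeof(struct inotify_event):

#include <stdio.h>

/* Round x up to the next multiple of y; mirrors the kernel's roundup(). */
#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

/* Illustrative stand-in for sizeof(struct inotify_event). */
#define EVENT_SIZE	16u

/* Mirrors round_event_name_len(): 0 for nameless events, else padded len + 1. */
static unsigned int padded_name_len(unsigned int name_len)
{
	if (!name_len)
		return 0;
	return ROUNDUP(name_len + 1, EVENT_SIZE);
}

int main(void)
{
	/* "notes.txt" is 9 bytes: 9 + 1 rounds up to 16, so len reported is 16. */
	printf("%u -> %u\n", 9u, padded_name_len(9u));
	/* a 16-byte name needs 17 bytes with the NUL, rounding up to 32. */
	printf("%u -> %u\n", 16u, padded_name_len(16u));
	return 0;
}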
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7b51b05f160c..952237b8e2d2 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -48,15 +48,6 @@
48#include <linux/fsnotify_backend.h> 48#include <linux/fsnotify_backend.h>
49#include "fsnotify.h" 49#include "fsnotify.h"
50 50
51static struct kmem_cache *fsnotify_event_cachep;
52static struct kmem_cache *fsnotify_event_holder_cachep;
53/*
54 * This is a magic event we send when the q is too full. Since it doesn't
55 * hold real event information we just keep one system wide and use it any time
56 * it is needed. It's refcnt is set 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed'
58 */
59static struct fsnotify_event *q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); 51static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61 52
62/** 53/**
@@ -76,60 +67,14 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
76 return list_empty(&group->notification_list) ? true : false; 67 return list_empty(&group->notification_list) ? true : false;
77} 68}
78 69
79void fsnotify_get_event(struct fsnotify_event *event) 70void fsnotify_destroy_event(struct fsnotify_group *group,
71 struct fsnotify_event *event)
80{ 72{
81 atomic_inc(&event->refcnt); 73 /* Overflow events are per-group and we don't want to free them */
82} 74 if (!event || event->mask == FS_Q_OVERFLOW)
83
84void fsnotify_put_event(struct fsnotify_event *event)
85{
86 if (!event)
87 return; 75 return;
88 76
89 if (atomic_dec_and_test(&event->refcnt)) { 77 group->ops->free_event(event);
90 pr_debug("%s: event=%p\n", __func__, event);
91
92 if (event->data_type == FSNOTIFY_EVENT_PATH)
93 path_put(&event->path);
94
95 BUG_ON(!list_empty(&event->private_data_list));
96
97 kfree(event->file_name);
98 put_pid(event->tgid);
99 kmem_cache_free(fsnotify_event_cachep, event);
100 }
101}
102
103struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
104{
105 return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
106}
107
108void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
109{
110 if (holder)
111 kmem_cache_free(fsnotify_event_holder_cachep, holder);
112}
113
114/*
115 * Find the private data that the group previously attached to this event when
116 * the group added the event to the notification queue (fsnotify_add_notify_event)
117 */
118struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
119{
120 struct fsnotify_event_private_data *lpriv;
121 struct fsnotify_event_private_data *priv = NULL;
122
123 assert_spin_locked(&event->lock);
124
125 list_for_each_entry(lpriv, &event->private_data_list, event_list) {
126 if (lpriv->group == group) {
127 priv = lpriv;
128 list_del(&priv->event_list);
129 break;
130 }
131 }
132 return priv;
133} 78}
134 79
135/* 80/*
@@ -137,91 +82,35 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
137 * event off the queue to deal with. If the event is successfully added to the 82 * event off the queue to deal with. If the event is successfully added to the
138 * group's notification queue, a reference is taken on event. 83 * group's notification queue, a reference is taken on event.
139 */ 84 */
140struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 85struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group,
141 struct fsnotify_event_private_data *priv, 86 struct fsnotify_event *event,
142 struct fsnotify_event *(*merge)(struct list_head *, 87 struct fsnotify_event *(*merge)(struct list_head *,
143 struct fsnotify_event *)) 88 struct fsnotify_event *))
144{ 89{
145 struct fsnotify_event *return_event = NULL; 90 struct fsnotify_event *return_event = NULL;
146 struct fsnotify_event_holder *holder = NULL;
147 struct list_head *list = &group->notification_list; 91 struct list_head *list = &group->notification_list;
148 92
149 pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
150
151 /*
152 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
153 * Check if we expect to be able to use that holder. If not alloc a new
154 * holder.
155 * For the overflow event it's possible that something will use the in
156 * event holder before we get the lock so we may need to jump back and
157 * alloc a new holder, this can't happen for most events...
158 */
159 if (!list_empty(&event->holder.event_list)) {
160alloc_holder:
161 holder = fsnotify_alloc_event_holder();
162 if (!holder)
163 return ERR_PTR(-ENOMEM);
164 }
165 94
166 mutex_lock(&group->notification_mutex); 95 mutex_lock(&group->notification_mutex);
167 96
168 if (group->q_len >= group->max_events) { 97 if (group->q_len >= group->max_events) {
169 event = q_overflow_event; 98 /* Queue overflow event only if it isn't already queued */
170 99 if (list_empty(&group->overflow_event.list))
171 /* 100 event = &group->overflow_event;
172 * we need to return the overflow event
173 * which means we need a ref
174 */
175 fsnotify_get_event(event);
176 return_event = event; 101 return_event = event;
177
178 /* sorry, no private data on the overflow event */
179 priv = NULL;
180 } 102 }
181 103
182 if (!list_empty(list) && merge) { 104 if (!list_empty(list) && merge) {
183 struct fsnotify_event *tmp; 105 return_event = merge(list, event);
184
185 tmp = merge(list, event);
186 if (tmp) {
187 mutex_unlock(&group->notification_mutex);
188
189 if (return_event)
190 fsnotify_put_event(return_event);
191 if (holder != &event->holder)
192 fsnotify_destroy_event_holder(holder);
193 return tmp;
194 }
195 }
196
197 spin_lock(&event->lock);
198
199 if (list_empty(&event->holder.event_list)) {
200 if (unlikely(holder))
201 fsnotify_destroy_event_holder(holder);
202 holder = &event->holder;
203 } else if (unlikely(!holder)) {
204 /* between the time we checked above and got the lock the in
205 * event holder was used, go back and get a new one */
206 spin_unlock(&event->lock);
207 mutex_unlock(&group->notification_mutex);
208
209 if (return_event) { 106 if (return_event) {
210 fsnotify_put_event(return_event); 107 mutex_unlock(&group->notification_mutex);
211 return_event = NULL; 108 return return_event;
212 } 109 }
213
214 goto alloc_holder;
215 } 110 }
216 111
217 group->q_len++; 112 group->q_len++;
218 holder->event = event; 113 list_add_tail(&event->list, list);
219
220 fsnotify_get_event(event);
221 list_add_tail(&holder->event_list, list);
222 if (priv)
223 list_add_tail(&priv->event_list, &event->private_data_list);
224 spin_unlock(&event->lock);
225 mutex_unlock(&group->notification_mutex); 114 mutex_unlock(&group->notification_mutex);
226 115
227 wake_up(&group->notification_waitq); 116 wake_up(&group->notification_waitq);
@@ -230,32 +119,20 @@ alloc_holder:
230} 119}
231 120
232/* 121/*
233 * Remove and return the first event from the notification list. There is a 122 * Remove and return the first event from the notification list. It is the
234 * reference held on this event since it was on the list. It is the responsibility 123 * responsibility of the caller to destroy the obtained event
235 * of the caller to drop this reference.
236 */ 124 */
237struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) 125struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
238{ 126{
239 struct fsnotify_event *event; 127 struct fsnotify_event *event;
240 struct fsnotify_event_holder *holder;
241 128
242 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 129 BUG_ON(!mutex_is_locked(&group->notification_mutex));
243 130
244 pr_debug("%s: group=%p\n", __func__, group); 131 pr_debug("%s: group=%p\n", __func__, group);
245 132
246 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 133 event = list_first_entry(&group->notification_list,
247 134 struct fsnotify_event, list);
248 event = holder->event; 135 list_del(&event->list);
249
250 spin_lock(&event->lock);
251 holder->event = NULL;
252 list_del_init(&holder->event_list);
253 spin_unlock(&event->lock);
254
255 /* event == holder means we are referenced through the in event holder */
256 if (holder != &event->holder)
257 fsnotify_destroy_event_holder(holder);
258
259 group->q_len--; 136 group->q_len--;
260 137
261 return event; 138 return event;
@@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
266 */ 143 */
267struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) 144struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
268{ 145{
269 struct fsnotify_event *event;
270 struct fsnotify_event_holder *holder;
271
272 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 146 BUG_ON(!mutex_is_locked(&group->notification_mutex));
273 147
274 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 148 return list_first_entry(&group->notification_list,
275 event = holder->event; 149 struct fsnotify_event, list);
276
277 return event;
278} 150}
279 151
280/* 152/*
@@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
284void fsnotify_flush_notify(struct fsnotify_group *group) 156void fsnotify_flush_notify(struct fsnotify_group *group)
285{ 157{
286 struct fsnotify_event *event; 158 struct fsnotify_event *event;
287 struct fsnotify_event_private_data *priv;
288 159
289 mutex_lock(&group->notification_mutex); 160 mutex_lock(&group->notification_mutex);
290 while (!fsnotify_notify_queue_is_empty(group)) { 161 while (!fsnotify_notify_queue_is_empty(group)) {
291 event = fsnotify_remove_notify_event(group); 162 event = fsnotify_remove_notify_event(group);
292 /* if they don't implement free_event_priv they better not have attached any */ 163 fsnotify_destroy_event(group, event);
293 if (group->ops->free_event_priv) {
294 spin_lock(&event->lock);
295 priv = fsnotify_remove_priv_from_event(group, event);
296 spin_unlock(&event->lock);
297 if (priv)
298 group->ops->free_event_priv(priv);
299 }
300 fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
301 } 164 }
302 mutex_unlock(&group->notification_mutex); 165 mutex_unlock(&group->notification_mutex);
303} 166}
304 167
305static void initialize_event(struct fsnotify_event *event)
306{
307 INIT_LIST_HEAD(&event->holder.event_list);
308 atomic_set(&event->refcnt, 1);
309
310 spin_lock_init(&event->lock);
311
312 INIT_LIST_HEAD(&event->private_data_list);
313}
314
315/*
316 * Caller damn well better be holding whatever mutex is protecting the
317 * old_holder->event_list and the new_event must be a clean event which
318 * cannot be found anywhere else in the kernel.
319 */
320int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
321 struct fsnotify_event *new_event)
322{
323 struct fsnotify_event *old_event = old_holder->event;
324 struct fsnotify_event_holder *new_holder = &new_event->holder;
325
326 enum event_spinlock_class {
327 SPINLOCK_OLD,
328 SPINLOCK_NEW,
329 };
330
331 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
332
333 /*
334 * if the new_event's embedded holder is in use someone
335 * screwed up and didn't give us a clean new event.
336 */
337 BUG_ON(!list_empty(&new_holder->event_list));
338
339 spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
340 spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
341
342 new_holder->event = new_event;
343 list_replace_init(&old_holder->event_list, &new_holder->event_list);
344
345 spin_unlock(&new_event->lock);
346 spin_unlock(&old_event->lock);
347
348 /* event == holder means we are referenced through the in event holder */
349 if (old_holder != &old_event->holder)
350 fsnotify_destroy_event_holder(old_holder);
351
352 fsnotify_get_event(new_event); /* on the list take reference */
353 fsnotify_put_event(old_event); /* off the list, drop reference */
354
355 return 0;
356}
357
358struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
359{
360 struct fsnotify_event *event;
361
362 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
363 if (!event)
364 return NULL;
365
366 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
367
368 memcpy(event, old_event, sizeof(*event));
369 initialize_event(event);
370
371 if (event->name_len) {
372 event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
373 if (!event->file_name) {
374 kmem_cache_free(fsnotify_event_cachep, event);
375 return NULL;
376 }
377 }
378 event->tgid = get_pid(old_event->tgid);
379 if (event->data_type == FSNOTIFY_EVENT_PATH)
380 path_get(&event->path);
381
382 return event;
383}
384
385/* 168/*
386 * fsnotify_create_event - Allocate a new event which will be sent to each 169 * fsnotify_create_event - Allocate a new event which will be sent to each
387 * group's handle_event function if the group was interested in this 170 * group's handle_event function if the group was interested in this
388 * particular event. 171 * particular event.
389 * 172 *
390 * @to_tell the inode which is supposed to receive the event (sometimes a 173 * @inode the inode which is supposed to receive the event (sometimes a
391 * parent of the inode to which the event happened. 174 * parent of the inode to which the event happened.
392 * @mask what actually happened. 175 * @mask what actually happened.
393 * @data pointer to the object which was actually affected 176 * @data pointer to the object which was actually affected
394 * @data_type flag indicating if the data is a file, path, inode, nothing... 177 * @data_type flag indicating if the data is a file, path, inode, nothing...
395 * @name the filename, if available 178 * @name the filename, if available
396 */ 179 */
397struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 180void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
398 int data_type, const unsigned char *name, 181 u32 mask)
399 u32 cookie, gfp_t gfp)
400{ 182{
401 struct fsnotify_event *event; 183 INIT_LIST_HEAD(&event->list);
402 184 event->inode = inode;
403 event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
404 if (!event)
405 return NULL;
406
407 pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
408 __func__, event, to_tell, mask, data, data_type);
409
410 initialize_event(event);
411
412 if (name) {
413 event->file_name = kstrdup(name, gfp);
414 if (!event->file_name) {
415 kmem_cache_free(fsnotify_event_cachep, event);
416 return NULL;
417 }
418 event->name_len = strlen(event->file_name);
419 }
420
421 event->tgid = get_pid(task_tgid(current));
422 event->sync_cookie = cookie;
423 event->to_tell = to_tell;
424 event->data_type = data_type;
425
426 switch (data_type) {
427 case FSNOTIFY_EVENT_PATH: {
428 struct path *path = data;
429 event->path.dentry = path->dentry;
430 event->path.mnt = path->mnt;
431 path_get(&event->path);
432 break;
433 }
434 case FSNOTIFY_EVENT_INODE:
435 event->inode = data;
436 break;
437 case FSNOTIFY_EVENT_NONE:
438 event->inode = NULL;
439 event->path.dentry = NULL;
440 event->path.mnt = NULL;
441 break;
442 default:
443 BUG();
444 }
445
446 event->mask = mask; 185 event->mask = mask;
447
448 return event;
449}
450
451static __init int fsnotify_notification_init(void)
452{
453 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
454 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
455
456 q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
457 FSNOTIFY_EVENT_NONE, NULL, 0,
458 GFP_KERNEL);
459 if (!q_overflow_event)
460 panic("unable to allocate fsnotify q_overflow_event\n");
461
462 return 0;
463} 186}
464subsys_initcall(fsnotify_notification_init);
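
fsnotify_add_notify_event() now allocates nothing itself: the caller's event is linked straight onto group->notification_list, and when the queue is full the group's single, embedded overflow_event is queued instead, and only if it is not already on the list. A standalone sketch of that "one pre-allocated overflow sentinel per group" pattern, with simplified stand-in structures and an illustrative FS_Q_OVERFLOW value:

#include <stdio.h>

struct event {
	unsigned int mask;
	int queued;		/* stand-in for list_empty(&event->list) being false */
};

struct group {
	struct event *queue[8];
	int q_len;
	int max_events;
	struct event overflow_event;	/* embedded, allocated once with the group */
};

#define FS_Q_OVERFLOW 0x08000000u	/* illustrative value */

/* Queue ev, or the group's overflow sentinel if the queue is already full. */
static struct event *add_event(struct group *g, struct event *ev)
{
	if (g->q_len >= g->max_events) {
		if (g->overflow_event.queued)
			return &g->overflow_event;	/* already queued, drop ev */
		ev = &g->overflow_event;		/* queue the sentinel once */
	}
	ev->queued = 1;
	g->queue[g->q_len++] = ev;
	return ev == &g->overflow_event ? ev : NULL;	/* NULL: caller's event was used */
}

int main(void)
{
	struct group g = { .max_events = 2 };
	struct event e1 = { .mask = 1 }, e2 = { .mask = 2 }, e3 = { .mask = 4 };

	g.overflow_event.mask = FS_Q_OVERFLOW;
	add_event(&g, &e1);
	add_event(&g, &e2);
	add_event(&g, &e3);	/* queue full: the overflow sentinel is queued instead */
	printf("q_len=%d last_mask=%x\n", g.q_len, g.queue[g.q_len - 1]->mask);
	return 0;
}

A non-NULL return tells the caller its allocation was not consumed, which is why fanotify and inotify above destroy their freshly built event whenever the add function hands something back.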
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f17e58b32989..ce210d4951a1 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -38,7 +38,6 @@ ocfs2-objs := \
38 symlink.o \ 38 symlink.o \
39 sysfile.o \ 39 sysfile.o \
40 uptodate.o \ 40 uptodate.o \
41 ver.o \
42 quota_local.o \ 41 quota_local.o \
43 quota_global.o \ 42 quota_global.o \
44 xattr.o \ 43 xattr.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index dc7411fe185d..8750ae1b8636 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7260,14 +7260,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7260 start = range->start >> osb->s_clustersize_bits; 7260 start = range->start >> osb->s_clustersize_bits;
7261 len = range->len >> osb->s_clustersize_bits; 7261 len = range->len >> osb->s_clustersize_bits;
7262 minlen = range->minlen >> osb->s_clustersize_bits; 7262 minlen = range->minlen >> osb->s_clustersize_bits;
7263 trimmed = 0;
7264
7265 if (!len) {
7266 range->len = 0;
7267 return 0;
7268 }
7269 7263
7270 if (minlen >= osb->bitmap_cpg) 7264 if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
7271 return -EINVAL; 7265 return -EINVAL;
7272 7266
7273 main_bm_inode = ocfs2_get_system_file_inode(osb, 7267 main_bm_inode = ocfs2_get_system_file_inode(osb,
@@ -7293,6 +7287,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7293 goto out_unlock; 7287 goto out_unlock;
7294 } 7288 }
7295 7289
7290 len = range->len >> osb->s_clustersize_bits;
7296 if (start + len > le32_to_cpu(main_bm->i_clusters)) 7291 if (start + len > le32_to_cpu(main_bm->i_clusters))
7297 len = le32_to_cpu(main_bm->i_clusters) - start; 7292 len = le32_to_cpu(main_bm->i_clusters) - start;
7298 7293
@@ -7307,6 +7302,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7307 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); 7302 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7308 last_bit = osb->bitmap_cpg; 7303 last_bit = osb->bitmap_cpg;
7309 7304
7305 trimmed = 0;
7310 for (group = first_group; group <= last_group;) { 7306 for (group = first_group; group <= last_group;) {
7311 if (first_bit + len >= osb->bitmap_cpg) 7307 if (first_bit + len >= osb->bitmap_cpg)
7312 last_bit = osb->bitmap_cpg; 7308 last_bit = osb->bitmap_cpg;
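
The ocfs2_trim_fs() hunks make three small corrections visible above: requests shorter than one block are rejected up front together with the minlen check, len is re-derived from range->len only after the global bitmap inode has been locked (and then clamped to the bitmap size), and the trimmed counter is initialized right before the loop that accumulates it. A short sketch of the validate-and-clamp arithmetic, with hypothetical geometry constants in place of the superblock fields:

#include <stdio.h>

/* Hypothetical stand-ins for the superblock and bitmap geometry used above. */
#define BLOCKSIZE		4096ull
#define CLUSTERSIZE_BITS	15	/* 32 KiB clusters */
#define BITMAP_CLUSTERS		1024ull	/* i_clusters of the global bitmap */

/* Returns 0 and writes the clamped cluster range, or -1 for an invalid request. */
static int clamp_trim_range(unsigned long long start_byte,
			    unsigned long long len_byte,
			    unsigned long long *start_c,
			    unsigned long long *len_c)
{
	if (len_byte < BLOCKSIZE)
		return -1;			/* mirrors the -EINVAL check */

	*start_c = start_byte >> CLUSTERSIZE_BITS;
	*len_c = len_byte >> CLUSTERSIZE_BITS;
	if (*start_c + *len_c > BITMAP_CLUSTERS)
		*len_c = BITMAP_CLUSTERS - *start_c;	/* clamp to the bitmap */
	return 0;
}

int main(void)
{
	unsigned long long start, len;

	if (!clamp_trim_range(1000ull << CLUSTERSIZE_BITS,
			      100ull << CLUSTERSIZE_BITS, &start, &len))
		printf("trim clusters [%llu, +%llu)\n", start, len);	/* 1000, +24 */
	return 0;
}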
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d8608..1aefc0350ec3 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o 1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2 2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ 3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o netdebug.o ver.o 4 quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb240647ca5f..441c84e169e6 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -29,7 +29,6 @@
29#include "heartbeat.h" 29#include "heartbeat.h"
30#include "masklog.h" 30#include "masklog.h"
31#include "sys.h" 31#include "sys.h"
32#include "ver.h"
33 32
34/* for now we operate under the assertion that there can be only one 33/* for now we operate under the assertion that there can be only one
35 * cluster active at a time. Changing this will require trickling 34 * cluster active at a time. Changing this will require trickling
@@ -945,8 +944,6 @@ static int __init init_o2nm(void)
945{ 944{
946 int ret = -1; 945 int ret = -1;
947 946
948 cluster_print_version();
949
950 ret = o2hb_init(); 947 ret = o2hb_init();
951 if (ret) 948 if (ret)
952 goto out; 949 goto out;
@@ -984,6 +981,7 @@ out:
984 981
985MODULE_AUTHOR("Oracle"); 982MODULE_AUTHOR("Oracle");
986MODULE_LICENSE("GPL"); 983MODULE_LICENSE("GPL");
984MODULE_DESCRIPTION("OCFS2 cluster management");
987 985
988module_init(init_o2nm) 986module_init(init_o2nm)
989module_exit(exit_o2nm) 987module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad3..000000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "ver.h"
30
31#define CLUSTER_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34
35void cluster_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c2..000000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef O2CLUSTER_VER_H
27#define O2CLUSTER_VER_H
28
29void cluster_print_version(void);
30
31#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index c8a044efbb15..bd1aab1f49a4 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
7 7
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8b3382abf840..33660a4a52fa 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -43,8 +43,6 @@
43#include "dlmdomain.h" 43#include "dlmdomain.h"
44#include "dlmdebug.h" 44#include "dlmdebug.h"
45 45
46#include "dlmver.h"
47
48#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) 46#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
49#include "cluster/masklog.h" 47#include "cluster/masklog.h"
50 48
@@ -2328,8 +2326,6 @@ static int __init dlm_init(void)
2328{ 2326{
2329 int status; 2327 int status;
2330 2328
2331 dlm_print_version();
2332
2333 status = dlm_init_mle_cache(); 2329 status = dlm_init_mle_cache();
2334 if (status) { 2330 if (status) {
2335 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); 2331 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void)
2379 2375
2380MODULE_AUTHOR("Oracle"); 2376MODULE_AUTHOR("Oracle");
2381MODULE_LICENSE("GPL"); 2377MODULE_LICENSE("GPL");
2378MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
2382 2379
2383module_init(dlm_init); 2380module_init(dlm_init);
2384module_exit(dlm_exit); 2381module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158d..000000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmver.h"
30
31#define DLM_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34
35void dlm_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a16..000000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef DLM_VER_H
27#define DLM_VER_H
28
29void dlm_print_version(void);
30
31#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index f14be89a6701..eed3db8c5b49 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4 4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o 5ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index efa2b3d339e3..09b7d9dac71d 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -49,7 +49,6 @@
49 49
50#include "stackglue.h" 50#include "stackglue.h"
51#include "userdlm.h" 51#include "userdlm.h"
52#include "dlmfsver.h"
53 52
54#define MLOG_MASK_PREFIX ML_DLMFS 53#define MLOG_MASK_PREFIX ML_DLMFS
55#include "cluster/masklog.h" 54#include "cluster/masklog.h"
@@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void)
644 int status; 643 int status;
645 int cleanup_inode = 0, cleanup_worker = 0; 644 int cleanup_inode = 0, cleanup_worker = 0;
646 645
647 dlmfs_print_version();
648
649 status = bdi_init(&dlmfs_backing_dev_info); 646 status = bdi_init(&dlmfs_backing_dev_info);
650 if (status) 647 if (status)
651 return status; 648 return status;
@@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
701 698
702MODULE_AUTHOR("Oracle"); 699MODULE_AUTHOR("Oracle");
703MODULE_LICENSE("GPL"); 700MODULE_LICENSE("GPL");
701MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
704 702
705module_init(init_dlmfs_fs) 703module_init(init_dlmfs_fs)
706module_exit(exit_dlmfs_fs) 704module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f83..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmfsver.h"
30
31#define DLM_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34
35void dlmfs_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25c..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef DLMFS_VER_H
27#define DLMFS_VER_H
28
29void dlmfs_print_version(void);
30
31#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3407b2c62b21..19986959d149 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2996 2996
2997 /* for now, uuid == domain */ 2997 /* for now, uuid == domain */
2998 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 2998 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2999 osb->osb_cluster_name,
3000 strlen(osb->osb_cluster_name),
2999 osb->uuid_str, 3001 osb->uuid_str,
3000 strlen(osb->uuid_str), 3002 strlen(osb->uuid_str),
3001 &lproto, ocfs2_do_node_down, osb, 3003 &lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
3005 goto bail; 3007 goto bail;
3006 } 3008 }
3007 3009
3008 status = ocfs2_cluster_this_node(&osb->node_num); 3010 status = ocfs2_cluster_this_node(conn, &osb->node_num);
3009 if (status < 0) { 3011 if (status < 0) {
3010 mlog_errno(status); 3012 mlog_errno(status);
3011 mlog(ML_ERROR, 3013 mlog(ML_ERROR,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6fff128cad16..f42eecef6478 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1869,7 +1869,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1869 } 1869 }
1870 size = sr->l_start + sr->l_len; 1870 size = sr->l_start + sr->l_len;
1871 1871
1872 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1872 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
1873 cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
1873 if (sr->l_len <= 0) { 1874 if (sr->l_len <= 0) {
1874 ret = -EINVAL; 1875 ret = -EINVAL;
1875 goto out_inode_unlock; 1876 goto out_inode_unlock;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index fa32ce9b455d..8ca3c29accbf 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/blkdev.h>
10#include <linux/compat.h> 11#include <linux/compat.h>
11 12
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
966 case FITRIM: 967 case FITRIM:
967 { 968 {
968 struct super_block *sb = inode->i_sb; 969 struct super_block *sb = inode->i_sb;
970 struct request_queue *q = bdev_get_queue(sb->s_bdev);
969 struct fstrim_range range; 971 struct fstrim_range range;
970 int ret = 0; 972 int ret = 0;
971 973
972 if (!capable(CAP_SYS_ADMIN)) 974 if (!capable(CAP_SYS_ADMIN))
973 return -EPERM; 975 return -EPERM;
974 976
977 if (!blk_queue_discard(q))
978 return -EOPNOTSUPP;
979
975 if (copy_from_user(&range, argp, sizeof(range))) 980 if (copy_from_user(&range, argp, sizeof(range)))
976 return -EFAULT; 981 return -EFAULT;
977 982
983 range.minlen = max_t(u64, q->limits.discard_granularity,
984 range.minlen);
978 ret = ocfs2_trim_fs(sb, &range); 985 ret = ocfs2_trim_fs(sb, &range);
979 if (ret < 0) 986 if (ret < 0)
980 return ret; 987 return ret;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 631a98213474..64c304d668f0 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
561 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); 561 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
562} 562}
563 563
564static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
565 handle_t *handle,
566 struct buffer_head *di_bh,
567 u32 num_bits,
568 u16 chain)
569{
570 int ret;
571 u32 tmp_used;
572 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
573 struct ocfs2_chain_list *cl =
574 (struct ocfs2_chain_list *) &di->id2.i_chain;
575
576 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
577 OCFS2_JOURNAL_ACCESS_WRITE);
578 if (ret < 0) {
579 mlog_errno(ret);
580 goto out;
581 }
582
583 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
584 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
585 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
586 ocfs2_journal_dirty(handle, di_bh);
587
588out:
589 return ret;
590}
591
592static inline int ocfs2_block_group_set_bits(handle_t *handle,
593 struct inode *alloc_inode,
594 struct ocfs2_group_desc *bg,
595 struct buffer_head *group_bh,
596 unsigned int bit_off,
597 unsigned int num_bits)
598{
599 int status;
600 void *bitmap = bg->bg_bitmap;
601 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
602
603 /* All callers get the descriptor via
604 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
605 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
606 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
607
608 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
609 num_bits);
610
611 if (ocfs2_is_cluster_bitmap(alloc_inode))
612 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
613
614 status = ocfs2_journal_access_gd(handle,
615 INODE_CACHE(alloc_inode),
616 group_bh,
617 journal_type);
618 if (status < 0) {
619 mlog_errno(status);
620 goto bail;
621 }
622
623 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
624 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
625 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
626 " count %u but claims %u are freed. num_bits %d",
627 (unsigned long long)le64_to_cpu(bg->bg_blkno),
628 le16_to_cpu(bg->bg_bits),
629 le16_to_cpu(bg->bg_free_bits_count), num_bits);
630 return -EROFS;
631 }
632 while (num_bits--)
633 ocfs2_set_bit(bit_off++, bitmap);
634
635 ocfs2_journal_dirty(handle, group_bh);
636
637bail:
638 return status;
639}
640
641static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, 564static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
642 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, 565 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
643 u32 len, int ext_flags) 566 u32 len, int ext_flags)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3a903470c794..553f53cc73ae 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -387,6 +387,7 @@ struct ocfs2_super
387 u8 osb_stackflags; 387 u8 osb_stackflags;
388 388
389 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 389 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
390 char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
390 struct ocfs2_cluster_connection *cconn; 391 struct ocfs2_cluster_connection *cconn;
391 struct ocfs2_lock_res osb_super_lockres; 392 struct ocfs2_lock_res osb_super_lockres;
392 struct ocfs2_lock_res osb_rename_lockres; 393 struct ocfs2_lock_res osb_rename_lockres;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bf1f8930456f..1724d43d3da1 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
398 return 0; 398 return 0;
399} 399}
400 400
401static int o2cb_cluster_this_node(unsigned int *node) 401static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
402 unsigned int *node)
402{ 403{
403 int node_num; 404 int node_num;
404 405
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231f..13a8537d8e8b 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -23,6 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/sched.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
28#include "stackglue.h" 29#include "stackglue.h"
@@ -102,6 +103,12 @@
102#define OCFS2_TEXT_UUID_LEN 32 103#define OCFS2_TEXT_UUID_LEN 32
103#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 104#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
104#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 105#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
106#define VERSION_LOCK "version_lock"
107
108enum ocfs2_connection_type {
109 WITH_CONTROLD,
110 NO_CONTROLD
111};
105 112
106/* 113/*
107 * ocfs2_live_connection is refcounted because the filesystem and 114 * ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +117,13 @@
110struct ocfs2_live_connection { 117struct ocfs2_live_connection {
111 struct list_head oc_list; 118 struct list_head oc_list;
112 struct ocfs2_cluster_connection *oc_conn; 119 struct ocfs2_cluster_connection *oc_conn;
120 enum ocfs2_connection_type oc_type;
121 atomic_t oc_this_node;
122 int oc_our_slot;
123 struct dlm_lksb oc_version_lksb;
124 char oc_lvb[DLM_LVB_LEN];
125 struct completion oc_sync_wait;
126 wait_queue_head_t oc_wait;
113}; 127};
114 128
115struct ocfs2_control_private { 129struct ocfs2_control_private {
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
198 * mount path. Since the VFS prevents multiple calls to 212 * mount path. Since the VFS prevents multiple calls to
199 * fill_super(), we can't get dupes here. 213 * fill_super(), we can't get dupes here.
200 */ 214 */
201static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, 215static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
202 struct ocfs2_live_connection **c_ret) 216 struct ocfs2_live_connection *c)
203{ 217{
204 int rc = 0; 218 int rc = 0;
205 struct ocfs2_live_connection *c;
206
207 c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
208 if (!c)
209 return -ENOMEM;
210 219
211 mutex_lock(&ocfs2_control_lock); 220 mutex_lock(&ocfs2_control_lock);
212 c->oc_conn = conn; 221 c->oc_conn = conn;
213 222
214 if (atomic_read(&ocfs2_control_opened)) 223 if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
215 list_add(&c->oc_list, &ocfs2_live_connection_list); 224 list_add(&c->oc_list, &ocfs2_live_connection_list);
216 else { 225 else {
217 printk(KERN_ERR 226 printk(KERN_ERR
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
220 } 229 }
221 230
222 mutex_unlock(&ocfs2_control_lock); 231 mutex_unlock(&ocfs2_control_lock);
223
224 if (!rc)
225 *c_ret = c;
226 else
227 kfree(c);
228
229 return rc; 232 return rc;
230} 233}
231 234
@@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
799 return 0; 802 return 0;
800} 803}
801 804
805static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
806{
807 struct ocfs2_protocol_version *pv =
808 (struct ocfs2_protocol_version *)lvb;
809 /*
810 * ocfs2_protocol_version has two u8 variables, so we don't
811 * need any endian conversion.
812 */
813 ver->pv_major = pv->pv_major;
814 ver->pv_minor = pv->pv_minor;
815}
816
817static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
818{
819 struct ocfs2_protocol_version *pv =
820 (struct ocfs2_protocol_version *)lvb;
821 /*
822 * ocfs2_protocol_version has two u8 variables, so we don't
823 * need any endian conversion.
824 */
825 pv->pv_major = ver->pv_major;
826 pv->pv_minor = ver->pv_minor;
827}
828
829static void sync_wait_cb(void *arg)
830{
831 struct ocfs2_cluster_connection *conn = arg;
832 struct ocfs2_live_connection *lc = conn->cc_private;
833 complete(&lc->oc_sync_wait);
834}
835
836static int sync_unlock(struct ocfs2_cluster_connection *conn,
837 struct dlm_lksb *lksb, char *name)
838{
839 int error;
840 struct ocfs2_live_connection *lc = conn->cc_private;
841
842 error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
843 if (error) {
844 printk(KERN_ERR "%s lkid %x error %d\n",
845 name, lksb->sb_lkid, error);
846 return error;
847 }
848
849 wait_for_completion(&lc->oc_sync_wait);
850
851 if (lksb->sb_status != -DLM_EUNLOCK) {
852 printk(KERN_ERR "%s lkid %x status %d\n",
853 name, lksb->sb_lkid, lksb->sb_status);
854 return -1;
855 }
856 return 0;
857}
858
859static int sync_lock(struct ocfs2_cluster_connection *conn,
860 int mode, uint32_t flags,
861 struct dlm_lksb *lksb, char *name)
862{
863 int error, status;
864 struct ocfs2_live_connection *lc = conn->cc_private;
865
866 error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
867 name, strlen(name),
868 0, sync_wait_cb, conn, NULL);
869 if (error) {
870 printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
871 name, lksb->sb_lkid, flags, mode, error);
872 return error;
873 }
874
875 wait_for_completion(&lc->oc_sync_wait);
876
877 status = lksb->sb_status;
878
879 if (status && status != -EAGAIN) {
880 printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
881 name, lksb->sb_lkid, flags, mode, status);
882 }
883
884 return status;
885}
886
887
888static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
889 int flags)
890{
891 struct ocfs2_live_connection *lc = conn->cc_private;
892 return sync_lock(conn, mode, flags,
893 &lc->oc_version_lksb, VERSION_LOCK);
894}
895
896static int version_unlock(struct ocfs2_cluster_connection *conn)
897{
898 struct ocfs2_live_connection *lc = conn->cc_private;
899 return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
900}
901
902/* get_protocol_version()
903 *
904 * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
905 * The algorithm is:
906 * 1. Attempt to take the lock in EX mode (non-blocking).
907 * 2. If successful (which means it is the first mount), write the
908 * version number and downconvert to PR lock.
909 * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
910 * taking the PR lock.
911 */
912
913static int get_protocol_version(struct ocfs2_cluster_connection *conn)
914{
915 int ret;
916 struct ocfs2_live_connection *lc = conn->cc_private;
917 struct ocfs2_protocol_version pv;
918
919 running_proto.pv_major =
920 ocfs2_user_plugin.sp_max_proto.pv_major;
921 running_proto.pv_minor =
922 ocfs2_user_plugin.sp_max_proto.pv_minor;
923
924 lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
925 ret = version_lock(conn, DLM_LOCK_EX,
926 DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
927 if (!ret) {
928 conn->cc_version.pv_major = running_proto.pv_major;
929 conn->cc_version.pv_minor = running_proto.pv_minor;
930 version_to_lvb(&running_proto, lc->oc_lvb);
931 version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
932 } else if (ret == -EAGAIN) {
933 ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
934 if (ret)
935 goto out;
936 lvb_to_version(lc->oc_lvb, &pv);
937
938 if ((pv.pv_major != running_proto.pv_major) ||
939 (pv.pv_minor > running_proto.pv_minor)) {
940 ret = -EINVAL;
941 goto out;
942 }
943
944 conn->cc_version.pv_major = pv.pv_major;
945 conn->cc_version.pv_minor = pv.pv_minor;
946 }
947out:
948 return ret;
949}
950
951static void user_recover_prep(void *arg)
952{
953}
954
955static void user_recover_slot(void *arg, struct dlm_slot *slot)
956{
957 struct ocfs2_cluster_connection *conn = arg;
958 printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
959 slot->nodeid, slot->slot);
960 conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
961
962}
963
964static void user_recover_done(void *arg, struct dlm_slot *slots,
965 int num_slots, int our_slot,
966 uint32_t generation)
967{
968 struct ocfs2_cluster_connection *conn = arg;
969 struct ocfs2_live_connection *lc = conn->cc_private;
970 int i;
971
972 for (i = 0; i < num_slots; i++)
973 if (slots[i].slot == our_slot) {
974 atomic_set(&lc->oc_this_node, slots[i].nodeid);
975 break;
976 }
977
978 lc->oc_our_slot = our_slot;
979 wake_up(&lc->oc_wait);
980}
981
982static const struct dlm_lockspace_ops ocfs2_ls_ops = {
983 .recover_prep = user_recover_prep,
984 .recover_slot = user_recover_slot,
985 .recover_done = user_recover_done,
986};
987
988static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
989{
990 version_unlock(conn);
991 dlm_release_lockspace(conn->cc_lockspace, 2);
992 conn->cc_lockspace = NULL;
993 ocfs2_live_connection_drop(conn->cc_private);
994 conn->cc_private = NULL;
995 return 0;
996}
997
802static int user_cluster_connect(struct ocfs2_cluster_connection *conn) 998static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
803{ 999{
804 dlm_lockspace_t *fsdlm; 1000 dlm_lockspace_t *fsdlm;
805 struct ocfs2_live_connection *uninitialized_var(control); 1001 struct ocfs2_live_connection *lc;
806 int rc = 0; 1002 int rc, ops_rv;
807 1003
808 BUG_ON(conn == NULL); 1004 BUG_ON(conn == NULL);
809 1005
810 rc = ocfs2_live_connection_new(conn, &control); 1006 lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
1007 if (!lc) {
1008 rc = -ENOMEM;
1009 goto out;
1010 }
1011
1012 init_waitqueue_head(&lc->oc_wait);
1013 init_completion(&lc->oc_sync_wait);
1014 atomic_set(&lc->oc_this_node, 0);
1015 conn->cc_private = lc;
1016 lc->oc_type = NO_CONTROLD;
1017
1018 rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
1019 DLM_LSFL_FS, DLM_LVB_LEN,
1020 &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
1021 if (rc)
1022 goto out;
1023
1024 if (ops_rv == -EOPNOTSUPP) {
1025 lc->oc_type = WITH_CONTROLD;
1026 printk(KERN_NOTICE "ocfs2: You seem to be using an older "
1027 "version of dlm_controld and/or ocfs2-tools."
1028 " Please consider upgrading.\n");
1029 } else if (ops_rv) {
1030 rc = ops_rv;
1031 goto out;
1032 }
1033 conn->cc_lockspace = fsdlm;
1034
1035 rc = ocfs2_live_connection_attach(conn, lc);
811 if (rc) 1036 if (rc)
812 goto out; 1037 goto out;
813 1038
1039 if (lc->oc_type == NO_CONTROLD) {
1040 rc = get_protocol_version(conn);
1041 if (rc) {
1042 printk(KERN_ERR "ocfs2: Could not determine"
1043 " locking version\n");
1044 user_cluster_disconnect(conn);
1045 goto out;
1046 }
1047 wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
1048 }
1049
814 /* 1050 /*
815 * running_proto must have been set before we allowed any mounts 1051 * running_proto must have been set before we allowed any mounts
816 * to proceed. 1052 * to proceed.
@@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
818 if (fs_protocol_compare(&running_proto, &conn->cc_version)) { 1054 if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
819 printk(KERN_ERR 1055 printk(KERN_ERR
820 "Unable to mount with fs locking protocol version " 1056 "Unable to mount with fs locking protocol version "
821 "%u.%u because the userspace control daemon has " 1057 "%u.%u because negotiated protocol is %u.%u\n",
822 "negotiated %u.%u\n",
823 conn->cc_version.pv_major, conn->cc_version.pv_minor, 1058 conn->cc_version.pv_major, conn->cc_version.pv_minor,
824 running_proto.pv_major, running_proto.pv_minor); 1059 running_proto.pv_major, running_proto.pv_minor);
825 rc = -EPROTO; 1060 rc = -EPROTO;
826 ocfs2_live_connection_drop(control); 1061 ocfs2_live_connection_drop(lc);
827 goto out; 1062 lc = NULL;
828 }
829
830 rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
831 NULL, NULL, NULL, &fsdlm);
832 if (rc) {
833 ocfs2_live_connection_drop(control);
834 goto out;
835 } 1063 }
836 1064
837 conn->cc_private = control;
838 conn->cc_lockspace = fsdlm;
839out: 1065out:
1066 if (rc && lc)
1067 kfree(lc);
840 return rc; 1068 return rc;
841} 1069}
842 1070
843static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
844{
845 dlm_release_lockspace(conn->cc_lockspace, 2);
846 conn->cc_lockspace = NULL;
847 ocfs2_live_connection_drop(conn->cc_private);
848 conn->cc_private = NULL;
849 return 0;
850}
851 1071
852static int user_cluster_this_node(unsigned int *this_node) 1072static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
1073 unsigned int *this_node)
853{ 1074{
854 int rc; 1075 int rc;
1076 struct ocfs2_live_connection *lc = conn->cc_private;
1077
1078 if (lc->oc_type == WITH_CONTROLD)
1079 rc = ocfs2_control_get_this_node();
1080 else if (lc->oc_type == NO_CONTROLD)
1081 rc = atomic_read(&lc->oc_this_node);
1082 else
1083 rc = -EINVAL;
855 1084
856 rc = ocfs2_control_get_this_node();
857 if (rc < 0) 1085 if (rc < 0)
858 return rc; 1086 return rc;
859 1087
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index cb7ec0b63ddc..1324e6600e57 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
309EXPORT_SYMBOL_GPL(ocfs2_plock); 309EXPORT_SYMBOL_GPL(ocfs2_plock);
310 310
311int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
312 const char *cluster_name,
313 int cluster_name_len,
312 const char *group, 314 const char *group,
313 int grouplen, 315 int grouplen,
314 struct ocfs2_locking_protocol *lproto, 316 struct ocfs2_locking_protocol *lproto,
@@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name,
342 goto out; 344 goto out;
343 } 345 }
344 346
345 memcpy(new_conn->cc_name, group, grouplen); 347 strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
346 new_conn->cc_namelen = grouplen; 348 new_conn->cc_namelen = grouplen;
349 strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1);
350 new_conn->cc_cluster_name_len = cluster_name_len;
347 new_conn->cc_recovery_handler = recovery_handler; 351 new_conn->cc_recovery_handler = recovery_handler;
348 new_conn->cc_recovery_data = recovery_data; 352 new_conn->cc_recovery_data = recovery_data;
349 353
@@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
386 390
387 if (cluster_stack_name[0]) 391 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name; 392 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, 393 return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
390 recovery_handler, recovery_data, conn); 394 lproto, recovery_handler, recovery_data,
395 conn);
391} 396}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); 397EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393 398
@@ -460,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
460} 465}
461EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); 466EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
462 467
463int ocfs2_cluster_this_node(unsigned int *node) 468int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
469 unsigned int *node)
464{ 470{
465 return active_stack->sp_ops->this_node(node); 471 return active_stack->sp_ops->this_node(conn, node);
466} 472}
467EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); 473EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
468 474
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 1ec56fdb8d0d..66334a30cea8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
45 */ 45 */
46#define GROUP_NAME_MAX 64 46#define GROUP_NAME_MAX 64
47 47
48/* This shadows OCFS2_CLUSTER_NAME_LEN */
49#define CLUSTER_NAME_MAX 16
50
48 51
49/* 52/*
50 * ocfs2_protocol_version changes when ocfs2 does something different in 53 * ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {
97 * locking compatibility. 100 * locking compatibility.
98 */ 101 */
99struct ocfs2_cluster_connection { 102struct ocfs2_cluster_connection {
100 char cc_name[GROUP_NAME_MAX]; 103 char cc_name[GROUP_NAME_MAX + 1];
101 int cc_namelen; 104 int cc_namelen;
105 char cc_cluster_name[CLUSTER_NAME_MAX + 1];
106 int cc_cluster_name_len;
102 struct ocfs2_protocol_version cc_version; 107 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto; 108 struct ocfs2_locking_protocol *cc_proto;
104 void (*cc_recovery_handler)(int node_num, void *recovery_data); 109 void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations {
152 * ->this_node() returns the cluster's unique identifier for the 157 * ->this_node() returns the cluster's unique identifier for the
153 * local node. 158 * local node.
154 */ 159 */
155 int (*this_node)(unsigned int *node); 160 int (*this_node)(struct ocfs2_cluster_connection *conn,
161 unsigned int *node);
156 162
157 /* 163 /*
158 * Call the underlying dlm lock function. The ->dlm_lock() 164 * Call the underlying dlm lock function. The ->dlm_lock()
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {
239 245
240/* Used by the filesystem */ 246/* Used by the filesystem */
241int ocfs2_cluster_connect(const char *stack_name, 247int ocfs2_cluster_connect(const char *stack_name,
248 const char *cluster_name,
249 int cluster_name_len,
242 const char *group, 250 const char *group,
243 int grouplen, 251 int grouplen,
244 struct ocfs2_locking_protocol *lproto, 252 struct ocfs2_locking_protocol *lproto,
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 268int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
261 int hangup_pending); 269 int hangup_pending);
262void ocfs2_cluster_hangup(const char *group, int grouplen); 270void ocfs2_cluster_hangup(const char *group, int grouplen);
263int ocfs2_cluster_this_node(unsigned int *node); 271int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
272 unsigned int *node);
264 273
265struct ocfs2_lock_res; 274struct ocfs2_lock_res;
266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 275int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2c91452c4047..47ae2663a6f5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
113 struct ocfs2_suballoc_result *res); 113 struct ocfs2_suballoc_result *res);
114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115 int nr); 115 int nr);
116static inline int ocfs2_block_group_set_bits(handle_t *handle,
117 struct inode *alloc_inode,
118 struct ocfs2_group_desc *bg,
119 struct buffer_head *group_bh,
120 unsigned int bit_off,
121 unsigned int num_bits);
122static int ocfs2_relink_block_group(handle_t *handle, 116static int ocfs2_relink_block_group(handle_t *handle,
123 struct inode *alloc_inode, 117 struct inode *alloc_inode,
124 struct buffer_head *fe_bh, 118 struct buffer_head *fe_bh,
@@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1343 return status; 1337 return status;
1344} 1338}
1345 1339
1346static inline int ocfs2_block_group_set_bits(handle_t *handle, 1340int ocfs2_block_group_set_bits(handle_t *handle,
1347 struct inode *alloc_inode, 1341 struct inode *alloc_inode,
1348 struct ocfs2_group_desc *bg, 1342 struct ocfs2_group_desc *bg,
1349 struct buffer_head *group_bh, 1343 struct buffer_head *group_bh,
@@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1388 ocfs2_journal_dirty(handle, group_bh); 1382 ocfs2_journal_dirty(handle, group_bh);
1389 1383
1390bail: 1384bail:
1391 if (status)
1392 mlog_errno(status);
1393 return status; 1385 return status;
1394} 1386}
1395 1387
@@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1588 return ret; 1580 return ret;
1589} 1581}
1590 1582
1591static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1583int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1592 handle_t *handle, 1584 handle_t *handle,
1593 struct buffer_head *di_bh, 1585 struct buffer_head *di_bh,
1594 u32 num_bits, 1586 u32 num_bits,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a36d0aa50911..218d8036b3e7 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
86 u32 bits_wanted, 86 u32 bits_wanted,
87 struct ocfs2_alloc_context **ac); 87 struct ocfs2_alloc_context **ac);
88 88
89int ocfs2_alloc_dinode_update_counts(struct inode *inode,
90 handle_t *handle,
91 struct buffer_head *di_bh,
92 u32 num_bits,
93 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg,
97 struct buffer_head *group_bh,
98 unsigned int bit_off,
99 unsigned int num_bits);
100
89int ocfs2_claim_metadata(handle_t *handle, 101int ocfs2_claim_metadata(handle_t *handle,
90 struct ocfs2_alloc_context *ac, 102 struct ocfs2_alloc_context *ac,
91 u32 bits_wanted, 103 u32 bits_wanted,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c41492957aa5..49d84f80f36c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,7 +68,6 @@
68#include "super.h" 68#include "super.h"
69#include "sysfile.h" 69#include "sysfile.h"
70#include "uptodate.h" 70#include "uptodate.h"
71#include "ver.h"
72#include "xattr.h" 71#include "xattr.h"
73#include "quota.h" 72#include "quota.h"
74#include "refcounttree.h" 73#include "refcounttree.h"
@@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL;
90 89
91MODULE_AUTHOR("Oracle"); 90MODULE_AUTHOR("Oracle");
92MODULE_LICENSE("GPL"); 91MODULE_LICENSE("GPL");
92MODULE_DESCRIPTION("OCFS2 cluster file system");
93 93
94struct mount_options 94struct mount_options
95{ 95{
@@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void)
1618{ 1618{
1619 int status, i; 1619 int status, i;
1620 1620
1621 ocfs2_print_version();
1622
1623 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) 1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1624 init_waitqueue_head(&ocfs2__ioend_wq[i]); 1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1625 1623
@@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1947 1945
1948 ocfs2_shutdown_local_alloc(osb); 1946 ocfs2_shutdown_local_alloc(osb);
1949 1947
1950 ocfs2_truncate_log_shutdown(osb);
1951
1952 /* This will disable recovery and flush any recovery work. */ 1948 /* This will disable recovery and flush any recovery work. */
1953 ocfs2_recovery_exit(osb); 1949 ocfs2_recovery_exit(osb);
1954 1950
1951 /*
1952 * During dismount, when it recovers another node it will call
1953 * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq.
1954 */
1955 ocfs2_truncate_log_shutdown(osb);
1956
1955 ocfs2_journal_shutdown(osb); 1957 ocfs2_journal_shutdown(osb);
1956 1958
1957 ocfs2_sync_blockdev(sb); 1959 ocfs2_sync_blockdev(sb);
@@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2225 if (ocfs2_clusterinfo_valid(osb)) { 2227 if (ocfs2_clusterinfo_valid(osb)) {
2226 osb->osb_stackflags = 2228 osb->osb_stackflags =
2227 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; 2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2228 memcpy(osb->osb_cluster_stack, 2230 strlcpy(osb->osb_cluster_stack,
2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2231 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2230 OCFS2_STACK_LABEL_LEN); 2232 OCFS2_STACK_LABEL_LEN + 1);
2231 osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
2232 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { 2233 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
2233 mlog(ML_ERROR, 2234 mlog(ML_ERROR,
2234 "couldn't mount because of an invalid " 2235 "couldn't mount because of an invalid "
@@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2237 status = -EINVAL; 2238 status = -EINVAL;
2238 goto bail; 2239 goto bail;
2239 } 2240 }
2241 strlcpy(osb->osb_cluster_name,
2242 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
2243 OCFS2_CLUSTER_NAME_LEN + 1);
2240 } else { 2244 } else {
2241 /* The empty string is identical with classic tools that 2245 /* The empty string is identical with classic tools that
2242 * don't know about s_cluster_info. */ 2246 * don't know about s_cluster_info. */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a2..000000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
32#define OCFS2_BUILD_VERSION "1.5.0"
33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35
36void ocfs2_print_version(void)
37{
38 printk(KERN_INFO "%s\n", VERSION_STR);
39}
40
41MODULE_DESCRIPTION(VERSION_STR);
42
43MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2f..000000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_VER_H
27#define OCFS2_VER_H
28
29void ocfs2_print_version(void);
30
31#endif /* OCFS2_VER_H */
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8bd2135b7f82..021e7c069b86 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,11 +22,80 @@
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
25EXPORT_SYMBOL(posix_acl_init); 25struct posix_acl **acl_by_type(struct inode *inode, int type)
26EXPORT_SYMBOL(posix_acl_alloc); 26{
27EXPORT_SYMBOL(posix_acl_valid); 27 switch (type) {
28EXPORT_SYMBOL(posix_acl_equiv_mode); 28 case ACL_TYPE_ACCESS:
29EXPORT_SYMBOL(posix_acl_from_mode); 29 return &inode->i_acl;
30 case ACL_TYPE_DEFAULT:
31 return &inode->i_default_acl;
32 default:
33 BUG();
34 }
35}
36EXPORT_SYMBOL(acl_by_type);
37
38struct posix_acl *get_cached_acl(struct inode *inode, int type)
39{
40 struct posix_acl **p = acl_by_type(inode, type);
41 struct posix_acl *acl = ACCESS_ONCE(*p);
42 if (acl) {
43 spin_lock(&inode->i_lock);
44 acl = *p;
45 if (acl != ACL_NOT_CACHED)
46 acl = posix_acl_dup(acl);
47 spin_unlock(&inode->i_lock);
48 }
49 return acl;
50}
51EXPORT_SYMBOL(get_cached_acl);
52
53struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
54{
55 return rcu_dereference(*acl_by_type(inode, type));
56}
57EXPORT_SYMBOL(get_cached_acl_rcu);
58
59void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
60{
61 struct posix_acl **p = acl_by_type(inode, type);
62 struct posix_acl *old;
63 spin_lock(&inode->i_lock);
64 old = *p;
65 rcu_assign_pointer(*p, posix_acl_dup(acl));
66 spin_unlock(&inode->i_lock);
67 if (old != ACL_NOT_CACHED)
68 posix_acl_release(old);
69}
70EXPORT_SYMBOL(set_cached_acl);
71
72void forget_cached_acl(struct inode *inode, int type)
73{
74 struct posix_acl **p = acl_by_type(inode, type);
75 struct posix_acl *old;
76 spin_lock(&inode->i_lock);
77 old = *p;
78 *p = ACL_NOT_CACHED;
79 spin_unlock(&inode->i_lock);
80 if (old != ACL_NOT_CACHED)
81 posix_acl_release(old);
82}
83EXPORT_SYMBOL(forget_cached_acl);
84
85void forget_all_cached_acls(struct inode *inode)
86{
87 struct posix_acl *old_access, *old_default;
88 spin_lock(&inode->i_lock);
89 old_access = inode->i_acl;
90 old_default = inode->i_default_acl;
91 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
92 spin_unlock(&inode->i_lock);
93 if (old_access != ACL_NOT_CACHED)
94 posix_acl_release(old_access);
95 if (old_default != ACL_NOT_CACHED)
96 posix_acl_release(old_default);
97}
98EXPORT_SYMBOL(forget_all_cached_acls);
30 99
31/* 100/*
32 * Init a fresh posix_acl 101 * Init a fresh posix_acl
@@ -37,6 +106,7 @@ posix_acl_init(struct posix_acl *acl, int count)
37 atomic_set(&acl->a_refcount, 1); 106 atomic_set(&acl->a_refcount, 1);
38 acl->a_count = count; 107 acl->a_count = count;
39} 108}
109EXPORT_SYMBOL(posix_acl_init);
40 110
41/* 111/*
42 * Allocate a new ACL with the specified number of entries. 112 * Allocate a new ACL with the specified number of entries.
@@ -51,6 +121,7 @@ posix_acl_alloc(int count, gfp_t flags)
51 posix_acl_init(acl, count); 121 posix_acl_init(acl, count);
52 return acl; 122 return acl;
53} 123}
124EXPORT_SYMBOL(posix_acl_alloc);
54 125
55/* 126/*
56 * Clone an ACL. 127 * Clone an ACL.
@@ -146,6 +217,7 @@ posix_acl_valid(const struct posix_acl *acl)
146 return 0; 217 return 0;
147 return -EINVAL; 218 return -EINVAL;
148} 219}
220EXPORT_SYMBOL(posix_acl_valid);
149 221
150/* 222/*
151 * Returns 0 if the acl can be exactly represented in the traditional 223 * Returns 0 if the acl can be exactly represented in the traditional
@@ -186,6 +258,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
186 *mode_p = (*mode_p & ~S_IRWXUGO) | mode; 258 *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
187 return not_equiv; 259 return not_equiv;
188} 260}
261EXPORT_SYMBOL(posix_acl_equiv_mode);
189 262
190/* 263/*
191 * Create an ACL representing the file mode permission bits of an inode. 264 * Create an ACL representing the file mode permission bits of an inode.
@@ -207,6 +280,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags)
207 acl->a_entries[2].e_perm = (mode & S_IRWXO); 280 acl->a_entries[2].e_perm = (mode & S_IRWXO);
208 return acl; 281 return acl;
209} 282}
283EXPORT_SYMBOL(posix_acl_from_mode);
210 284
211/* 285/*
212 * Return 0 if current is granted want access to the inode 286 * Return 0 if current is granted want access to the inode
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a77d2b299199..24270eceddbf 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
26 unsigned long committed; 26 unsigned long committed;
27 struct vmalloc_info vmi; 27 struct vmalloc_info vmi;
28 long cached; 28 long cached;
29 long available;
30 unsigned long pagecache;
31 unsigned long wmark_low = 0;
29 unsigned long pages[NR_LRU_LISTS]; 32 unsigned long pages[NR_LRU_LISTS];
33 struct zone *zone;
30 int lru; 34 int lru;
31 35
32/* 36/*
@@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
47 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 51 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
48 pages[lru] = global_page_state(NR_LRU_BASE + lru); 52 pages[lru] = global_page_state(NR_LRU_BASE + lru);
49 53
54 for_each_zone(zone)
55 wmark_low += zone->watermark[WMARK_LOW];
56
57 /*
58 * Estimate the amount of memory available for userspace allocations,
59 * without causing swapping.
60 *
61 * Free memory cannot be taken below the low watermark, before the
62 * system starts swapping.
63 */
64 available = i.freeram - wmark_low;
65
66 /*
67 * Not all the page cache can be freed, otherwise the system will
68 * start swapping. Assume at least half of the page cache, or the
69 * low watermark worth of cache, needs to stay.
70 */
71 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
72 pagecache -= min(pagecache / 2, wmark_low);
73 available += pagecache;
74
75 /*
76 * Part of the reclaimable swap consists of items that are in use,
77 * and cannot be freed. Cap this estimate at the low watermark.
78 */
79 available += global_page_state(NR_SLAB_RECLAIMABLE) -
80 min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
81
82 if (available < 0)
83 available = 0;
84
50 /* 85 /*
51 * Tagged format, for easy grepping and expansion. 86 * Tagged format, for easy grepping and expansion.
52 */ 87 */
53 seq_printf(m, 88 seq_printf(m,
54 "MemTotal: %8lu kB\n" 89 "MemTotal: %8lu kB\n"
55 "MemFree: %8lu kB\n" 90 "MemFree: %8lu kB\n"
91 "MemAvailable: %8lu kB\n"
56 "Buffers: %8lu kB\n" 92 "Buffers: %8lu kB\n"
57 "Cached: %8lu kB\n" 93 "Cached: %8lu kB\n"
58 "SwapCached: %8lu kB\n" 94 "SwapCached: %8lu kB\n"
@@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
105 , 141 ,
106 K(i.totalram), 142 K(i.totalram),
107 K(i.freeram), 143 K(i.freeram),
144 K(available),
108 K(i.bufferram), 145 K(i.bufferram),
109 K(cached), 146 K(cached),
110 K(total_swapcache_pages()), 147 K(total_swapcache_pages()),
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 39d14659a8d3..6a3e2c420180 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -275,4 +275,4 @@ int __init init_ramfs_fs(void)
275 275
276 return err; 276 return err;
277} 277}
278module_init(init_ramfs_fs) 278fs_initcall(init_ramfs_fs);
diff --git a/fs/read_write.c b/fs/read_write.c
index 58e440df1bc6..1193ffd03565 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
901 io_fn_t fn; 901 io_fn_t fn;
902 iov_fn_t fnv; 902 iov_fn_t fnv;
903 903
904 ret = -EFAULT;
905 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
906 goto out;
907
908 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, 904 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
909 UIO_FASTIOV, iovstack, &iov); 905 UIO_FASTIOV, iovstack, &iov);
910 if (ret <= 0) 906 if (ret <= 0)
diff --git a/fs/super.c b/fs/super.c
index e5f6c2cfac38..cecd780e0f44 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
166 if (!s) 166 if (!s)
167 return NULL; 167 return NULL;
168 168
169 INIT_LIST_HEAD(&s->s_mounts);
170
169 if (security_sb_alloc(s)) 171 if (security_sb_alloc(s))
170 goto fail; 172 goto fail;
171 173
@@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
188 if (list_lru_init(&s->s_inode_lru)) 190 if (list_lru_init(&s->s_inode_lru))
189 goto fail; 191 goto fail;
190 192
191 INIT_LIST_HEAD(&s->s_mounts);
192 init_rwsem(&s->s_umount); 193 init_rwsem(&s->s_umount);
193 lockdep_set_class(&s->s_umount, &type->s_umount_key); 194 lockdep_set_class(&s->s_umount, &type->s_umount_key);
194 /* 195 /*
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index f1f07d31a3af..2fae55def608 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -5,6 +5,7 @@
5#define _LINUX_BOOTMEM_H 5#define _LINUX_BOOTMEM_H
6 6
7#include <linux/mmzone.h> 7#include <linux/mmzone.h>
8#include <linux/mm_types.h>
8#include <asm/dma.h> 9#include <asm/dma.h>
9 10
10/* 11/*
@@ -52,7 +53,6 @@ extern void free_bootmem_node(pg_data_t *pgdat,
52 unsigned long size); 53 unsigned long size);
53extern void free_bootmem(unsigned long physaddr, unsigned long size); 54extern void free_bootmem(unsigned long physaddr, unsigned long size);
54extern void free_bootmem_late(unsigned long physaddr, unsigned long size); 55extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
55extern void __free_pages_bootmem(struct page *page, unsigned int order);
56 56
57/* 57/*
58 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, 58 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
@@ -142,6 +142,157 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
142#define alloc_bootmem_low_pages_node(pgdat, x) \ 142#define alloc_bootmem_low_pages_node(pgdat, x) \
143 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) 143 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
144 144
145
146#if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM)
147
148/* FIXME: use MEMBLOCK_ALLOC_* variants here */
149#define BOOTMEM_ALLOC_ACCESSIBLE 0
150#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
151
152/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
153void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
154 phys_addr_t align, phys_addr_t min_addr,
155 phys_addr_t max_addr, int nid);
156void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align,
157 phys_addr_t min_addr, phys_addr_t max_addr, int nid);
158void __memblock_free_early(phys_addr_t base, phys_addr_t size);
159void __memblock_free_late(phys_addr_t base, phys_addr_t size);
160
161static inline void * __init memblock_virt_alloc(
162 phys_addr_t size, phys_addr_t align)
163{
164 return memblock_virt_alloc_try_nid(size, align, BOOTMEM_LOW_LIMIT,
165 BOOTMEM_ALLOC_ACCESSIBLE,
166 NUMA_NO_NODE);
167}
168
169static inline void * __init memblock_virt_alloc_nopanic(
170 phys_addr_t size, phys_addr_t align)
171{
172 return memblock_virt_alloc_try_nid_nopanic(size, align,
173 BOOTMEM_LOW_LIMIT,
174 BOOTMEM_ALLOC_ACCESSIBLE,
175 NUMA_NO_NODE);
176}
177
178static inline void * __init memblock_virt_alloc_from_nopanic(
179 phys_addr_t size, phys_addr_t align, phys_addr_t min_addr)
180{
181 return memblock_virt_alloc_try_nid_nopanic(size, align, min_addr,
182 BOOTMEM_ALLOC_ACCESSIBLE,
183 NUMA_NO_NODE);
184}
185
186static inline void * __init memblock_virt_alloc_node(
187 phys_addr_t size, int nid)
188{
189 return memblock_virt_alloc_try_nid(size, 0, BOOTMEM_LOW_LIMIT,
190 BOOTMEM_ALLOC_ACCESSIBLE, nid);
191}
192
193static inline void * __init memblock_virt_alloc_node_nopanic(
194 phys_addr_t size, int nid)
195{
196 return memblock_virt_alloc_try_nid_nopanic(size, 0, BOOTMEM_LOW_LIMIT,
197 BOOTMEM_ALLOC_ACCESSIBLE,
198 nid);
199}
200
201static inline void __init memblock_free_early(
202 phys_addr_t base, phys_addr_t size)
203{
204 __memblock_free_early(base, size);
205}
206
207static inline void __init memblock_free_early_nid(
208 phys_addr_t base, phys_addr_t size, int nid)
209{
210 __memblock_free_early(base, size);
211}
212
213static inline void __init memblock_free_late(
214 phys_addr_t base, phys_addr_t size)
215{
216 __memblock_free_late(base, size);
217}
218
219#else
220
221#define BOOTMEM_ALLOC_ACCESSIBLE 0
222
223
224/* Fall back to all the existing bootmem APIs */
225static inline void * __init memblock_virt_alloc(
226 phys_addr_t size, phys_addr_t align)
227{
228 if (!align)
229 align = SMP_CACHE_BYTES;
230 return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT);
231}
232
233static inline void * __init memblock_virt_alloc_nopanic(
234 phys_addr_t size, phys_addr_t align)
235{
236 if (!align)
237 align = SMP_CACHE_BYTES;
238 return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT);
239}
240
241static inline void * __init memblock_virt_alloc_from_nopanic(
242 phys_addr_t size, phys_addr_t align, phys_addr_t min_addr)
243{
244 return __alloc_bootmem_nopanic(size, align, min_addr);
245}
246
247static inline void * __init memblock_virt_alloc_node(
248 phys_addr_t size, int nid)
249{
250 return __alloc_bootmem_node(NODE_DATA(nid), size, SMP_CACHE_BYTES,
251 BOOTMEM_LOW_LIMIT);
252}
253
254static inline void * __init memblock_virt_alloc_node_nopanic(
255 phys_addr_t size, int nid)
256{
257 return __alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
258 SMP_CACHE_BYTES,
259 BOOTMEM_LOW_LIMIT);
260}
261
262static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size,
263 phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid)
264{
265 return __alloc_bootmem_node_high(NODE_DATA(nid), size, align,
266 min_addr);
267}
268
269static inline void * __init memblock_virt_alloc_try_nid_nopanic(
270 phys_addr_t size, phys_addr_t align,
271 phys_addr_t min_addr, phys_addr_t max_addr, int nid)
272{
273 return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align,
274 min_addr, max_addr);
275}
276
277static inline void __init memblock_free_early(
278 phys_addr_t base, phys_addr_t size)
279{
280 free_bootmem(base, size);
281}
282
283static inline void __init memblock_free_early_nid(
284 phys_addr_t base, phys_addr_t size, int nid)
285{
286 free_bootmem_node(NODE_DATA(nid), base, size);
287}
288
289static inline void __init memblock_free_late(
290 phys_addr_t base, phys_addr_t size)
291{
292 free_bootmem_late(base, size);
293}
294#endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */
295
145#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP 296#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
146extern void *alloc_remap(int nid, unsigned long size); 297extern void *alloc_remap(int nid, unsigned long size);
147#else 298#else
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 091d72e70d8a..7e1c76e3cd68 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order)
62 return zone->compact_considered < defer_limit; 62 return zone->compact_considered < defer_limit;
63} 63}
64 64
65/*
66 * Update defer tracking counters after successful compaction of given order,
67 * which means an allocation either succeeded (alloc_success == true) or is
68 * expected to succeed.
69 */
70static inline void compaction_defer_reset(struct zone *zone, int order,
71 bool alloc_success)
72{
73 if (alloc_success) {
74 zone->compact_considered = 0;
75 zone->compact_defer_shift = 0;
76 }
77 if (order >= zone->compact_order_failed)
78 zone->compact_order_failed = order + 1;
79}
80
65/* Returns true if restarting compaction after many failures */ 81/* Returns true if restarting compaction after many failures */
66static inline bool compaction_restarting(struct zone *zone, int order) 82static inline bool compaction_restarting(struct zone *zone, int order)
67{ 83{
diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index fc0e34ce038f..fe8cb610deac 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -85,6 +85,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev,
85 85
86extern void debug_dma_dump_mappings(struct device *dev); 86extern void debug_dma_dump_mappings(struct device *dev);
87 87
88extern void debug_dma_assert_idle(struct page *page);
89
88#else /* CONFIG_DMA_API_DEBUG */ 90#else /* CONFIG_DMA_API_DEBUG */
89 91
90static inline void dma_debug_add_bus(struct bus_type *bus) 92static inline void dma_debug_add_bus(struct bus_type *bus)
@@ -183,6 +185,10 @@ static inline void debug_dma_dump_mappings(struct device *dev)
183{ 185{
184} 186}
185 187
188static inline void debug_dma_assert_idle(struct page *page)
189{
190}
191
186#endif /* CONFIG_DMA_API_DEBUG */ 192#endif /* CONFIG_DMA_API_DEBUG */
187 193
188#endif /* __DMA_DEBUG_H */ 194#endif /* __DMA_DEBUG_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 4b2ee8d12f5e..7d8d5e608594 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -15,7 +15,6 @@
15#include <linux/path.h> /* struct path */ 15#include <linux/path.h> /* struct path */
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/types.h> 17#include <linux/types.h>
18
19#include <linux/atomic.h> 18#include <linux/atomic.h>
20 19
21/* 20/*
@@ -79,6 +78,7 @@ struct fsnotify_group;
79struct fsnotify_event; 78struct fsnotify_event;
80struct fsnotify_mark; 79struct fsnotify_mark;
81struct fsnotify_event_private_data; 80struct fsnotify_event_private_data;
81struct fsnotify_fname;
82 82
83/* 83/*
84 * Each group much define these ops. The fsnotify infrastructure will call 84 * Each group much define these ops. The fsnotify infrastructure will call
@@ -94,17 +94,27 @@ struct fsnotify_event_private_data;
94 * userspace messages that marks have been removed. 94 * userspace messages that marks have been removed.
95 */ 95 */
96struct fsnotify_ops { 96struct fsnotify_ops {
97 bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
98 struct fsnotify_mark *inode_mark,
99 struct fsnotify_mark *vfsmount_mark,
100 __u32 mask, void *data, int data_type);
101 int (*handle_event)(struct fsnotify_group *group, 97 int (*handle_event)(struct fsnotify_group *group,
98 struct inode *inode,
102 struct fsnotify_mark *inode_mark, 99 struct fsnotify_mark *inode_mark,
103 struct fsnotify_mark *vfsmount_mark, 100 struct fsnotify_mark *vfsmount_mark,
104 struct fsnotify_event *event); 101 u32 mask, void *data, int data_type,
102 const unsigned char *file_name);
105 void (*free_group_priv)(struct fsnotify_group *group); 103 void (*free_group_priv)(struct fsnotify_group *group);
106 void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); 104 void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
107 void (*free_event_priv)(struct fsnotify_event_private_data *priv); 105 void (*free_event)(struct fsnotify_event *event);
106};
107
108/*
109 * all of the information about the original object we want to now send to
110 * a group. If you want to carry more info from the accessing task to the
111 * listener this structure is where you need to be adding fields.
112 */
113struct fsnotify_event {
114 struct list_head list;
115 /* inode may ONLY be dereferenced during handle_event(). */
116 struct inode *inode; /* either the inode the event happened to or its parent */
117 u32 mask; /* the type of access, bitwise OR for FS_* event types */
108}; 118};
109 119
110/* 120/*
@@ -148,7 +158,11 @@ struct fsnotify_group {
148 * a group */ 158 * a group */
149 struct list_head marks_list; /* all inode marks for this group */ 159 struct list_head marks_list; /* all inode marks for this group */
150 160
151 struct fasync_struct *fsn_fa; /* async notification */ 161 struct fasync_struct *fsn_fa; /* async notification */
162
163 struct fsnotify_event overflow_event; /* Event we queue when the
164 * notification list is too
165 * full */
152 166
153 /* groups can define private fields here or use the void *private */ 167 /* groups can define private fields here or use the void *private */
154 union { 168 union {
@@ -177,76 +191,10 @@ struct fsnotify_group {
177 }; 191 };
178}; 192};
179 193
180/*
181 * A single event can be queued in multiple group->notification_lists.
182 *
183 * each group->notification_list will point to an event_holder which in turns points
184 * to the actual event that needs to be sent to userspace.
185 *
186 * Seemed cheaper to create a refcnt'd event and a small holder for every group
187 * than create a different event for every group
188 *
189 */
190struct fsnotify_event_holder {
191 struct fsnotify_event *event;
192 struct list_head event_list;
193};
194
195/*
196 * Inotify needs to tack data onto an event. This struct lets us later find the
197 * correct private data of the correct group.
198 */
199struct fsnotify_event_private_data {
200 struct fsnotify_group *group;
201 struct list_head event_list;
202};
203
204/*
205 * all of the information about the original object we want to now send to
206 * a group. If you want to carry more info from the accessing task to the
207 * listener this structure is where you need to be adding fields.
208 */
209struct fsnotify_event {
210 /*
211 * If we create an event we are also likely going to need a holder
212 * to link to a group. So embed one holder in the event. Means only
213 * one allocation for the common case where we only have one group
214 */
215 struct fsnotify_event_holder holder;
216 spinlock_t lock; /* protection for the associated event_holder and private_list */
217 /* to_tell may ONLY be dereferenced during handle_event(). */
218 struct inode *to_tell; /* either the inode the event happened to or its parent */
219 /*
220 * depending on the event type we should have either a path or inode
221 * We hold a reference on path, but NOT on inode. Since we have the ref on
222 * the path, it may be dereferenced at any point during this object's
223 * lifetime. That reference is dropped when this object's refcnt hits
224 * 0. If this event contains an inode instead of a path, the inode may
225 * ONLY be used during handle_event().
226 */
227 union {
228 struct path path;
229 struct inode *inode;
230 };
231/* when calling fsnotify tell it if the data is a path or inode */ 194/* when calling fsnotify tell it if the data is a path or inode */
232#define FSNOTIFY_EVENT_NONE 0 195#define FSNOTIFY_EVENT_NONE 0
233#define FSNOTIFY_EVENT_PATH 1 196#define FSNOTIFY_EVENT_PATH 1
234#define FSNOTIFY_EVENT_INODE 2 197#define FSNOTIFY_EVENT_INODE 2
235 int data_type; /* which of the above union we have */
236 atomic_t refcnt; /* how many groups still are using/need to send this event */
237 __u32 mask; /* the type of access, bitwise OR for FS_* event types */
238
239 u32 sync_cookie; /* used to corrolate events, namely inotify mv events */
240 const unsigned char *file_name;
241 size_t name_len;
242 struct pid *tgid;
243
244#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
245 __u32 response; /* userspace answer to question */
246#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
247
248 struct list_head private_data_list; /* groups can store private data here */
249};
250 198
251/* 199/*
252 * Inode specific fields in an fsnotify_mark 200 * Inode specific fields in an fsnotify_mark
@@ -370,17 +318,12 @@ extern void fsnotify_put_group(struct fsnotify_group *group);
370extern void fsnotify_destroy_group(struct fsnotify_group *group); 318extern void fsnotify_destroy_group(struct fsnotify_group *group);
371/* fasync handler function */ 319/* fasync handler function */
372extern int fsnotify_fasync(int fd, struct file *file, int on); 320extern int fsnotify_fasync(int fd, struct file *file, int on);
373/* take a reference to an event */ 321/* Free event from memory */
374extern void fsnotify_get_event(struct fsnotify_event *event); 322extern void fsnotify_destroy_event(struct fsnotify_group *group,
375extern void fsnotify_put_event(struct fsnotify_event *event); 323 struct fsnotify_event *event);
376/* find private data previously attached to an event and unlink it */
377extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group,
378 struct fsnotify_event *event);
379
380/* attach the event to the group notification queue */ 324/* attach the event to the group notification queue */
381extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, 325extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group,
382 struct fsnotify_event *event, 326 struct fsnotify_event *event,
383 struct fsnotify_event_private_data *priv,
384 struct fsnotify_event *(*merge)(struct list_head *, 327 struct fsnotify_event *(*merge)(struct list_head *,
385 struct fsnotify_event *)); 328 struct fsnotify_event *));
386/* true if the group notification queue is empty */ 329/* true if the group notification queue is empty */
@@ -430,15 +373,8 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark);
430extern void fsnotify_unmount_inodes(struct list_head *list); 373extern void fsnotify_unmount_inodes(struct list_head *list);
431 374
432/* put here because inotify does some weird stuff when destroying watches */ 375/* put here because inotify does some weird stuff when destroying watches */
433extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, 376extern void fsnotify_init_event(struct fsnotify_event *event,
434 void *data, int data_is, 377 struct inode *to_tell, u32 mask);
435 const unsigned char *name,
436 u32 cookie, gfp_t gfp);
437
438/* fanotify likes to change events after they are on lists... */
439extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event);
440extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
441 struct fsnotify_event *new_event);
442 378
443#else 379#else
444 380
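
The reworked fsnotify_ops above drops should_send_event() and hands handle_event() the raw inode, mask, data, data_type and file name, so each backend builds whatever event record it needs instead of receiving a prebuilt, refcounted fsnotify_event. A hedged, userspace-only model of that callback shape; all types here are simplified stand-ins, not the kernel API:

/*
 * Standalone model of the new callback style: the "core" passes raw
 * event information and the backend allocates its own private event.
 */
#include <stdio.h>
#include <stdlib.h>

struct inode_model { unsigned long ino; };

struct event_model {		/* backend-private event record */
	struct inode_model *inode;
	unsigned int mask;
	char name[64];
};

struct ops_model {
	int (*handle_event)(struct inode_model *inode, unsigned int mask,
			    const char *file_name);
};

static int toy_handle_event(struct inode_model *inode, unsigned int mask,
			    const char *file_name)
{
	struct event_model *ev = calloc(1, sizeof(*ev));

	if (!ev)
		return -1;
	ev->inode = inode;
	ev->mask = mask;
	snprintf(ev->name, sizeof(ev->name), "%s", file_name ? file_name : "");
	printf("queued event: ino=%lu mask=%#x name=%s\n",
	       ev->inode->ino, ev->mask, ev->name);
	free(ev);		/* a real backend would queue it instead */
	return 0;
}

int main(void)
{
	struct inode_model dir = { .ino = 42 };
	struct ops_model ops = { .handle_event = toy_handle_event };

	return ops.handle_event(&dir, 0x100 /* e.g. a "create" bit */, "newfile");
}
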
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 91672e2deec3..db512014e061 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -157,6 +157,26 @@ static inline int hpage_nr_pages(struct page *page)
157 return HPAGE_PMD_NR; 157 return HPAGE_PMD_NR;
158 return 1; 158 return 1;
159} 159}
160/*
161 * compound_trans_head() should be used instead of compound_head(),
162 * whenever the "page" passed as parameter could be the tail of a
163 * transparent hugepage that could be undergoing a
164 * __split_huge_page_refcount(). The page structure layout often
165 * changes across releases and it makes extensive use of unions. So if
166 * the page structure layout will change in a way that
167 * page->first_page gets clobbered by __split_huge_page_refcount, the
168 * implementation making use of smp_rmb() will be required.
169 *
170 * Currently we define compound_trans_head as compound_head, because
171 * page->private is in the same union with page->first_page, and
172 * page->private isn't clobbered. However this also means we're
173 * currently leaving dirt into the page->private field of anonymous
174 * pages resulting from a THP split, instead of setting page->private
175 * to zero like for every other page that has PG_private not set. But
176 * anonymous pages don't use page->private so this is not a problem.
177 */
178#if 0
179/* This will be needed if page->private will be clobbered in split_huge_page */
160static inline struct page *compound_trans_head(struct page *page) 180static inline struct page *compound_trans_head(struct page *page)
161{ 181{
162 if (PageTail(page)) { 182 if (PageTail(page)) {
@@ -174,6 +194,9 @@ static inline struct page *compound_trans_head(struct page *page)
174 } 194 }
175 return page; 195 return page;
176} 196}
197#else
198#define compound_trans_head(page) compound_head(page)
199#endif
177 200
178extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 201extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
179 unsigned long addr, pmd_t pmd, pmd_t *pmdp); 202 unsigned long addr, pmd_t pmd, pmd_t *pmdp);
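
The comment above keeps the guarded compound_trans_head() under #if 0 because page->private is not currently clobbered by a THP split. A standalone sketch of the guarded pattern it describes (read the tail flag, read the head pointer, then recheck the flag); the structures are illustrative and no real concurrency is exercised:

/*
 * Standalone sketch (not kernel code) of the guarded head lookup:
 * only trust the head pointer if the page is still a tail afterwards.
 */
#include <stdbool.h>
#include <stdio.h>

struct page_model {
	bool tail;
	struct page_model *first_page;	/* head pointer, valid only while tail */
};

static struct page_model *trans_head_model(struct page_model *page)
{
	if (page->tail) {
		struct page_model *head = page->first_page;
		/* the real code would have an smp_rmb() here */
		if (page->tail)		/* still a tail? then head is usable */
			return head;
	}
	return page;
}

int main(void)
{
	struct page_model head = { false, NULL };
	struct page_model tail = { true, &head };

	printf("head of tail: %p (expect %p)\n",
	       (void *)trans_head_model(&tail), (void *)&head);
	printf("head of head: %p (expect %p)\n",
	       (void *)trans_head_model(&head), (void *)&head);
	return 0;
}
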
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index bd7e98752222..d01cc972a1d9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -31,7 +31,6 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
31void hugepage_put_subpool(struct hugepage_subpool *spool); 31void hugepage_put_subpool(struct hugepage_subpool *spool);
32 32
33int PageHuge(struct page *page); 33int PageHuge(struct page *page);
34int PageHeadHuge(struct page *page_head);
35 34
36void reset_vma_resv_huge_pages(struct vm_area_struct *vma); 35void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
37int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 36int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -104,11 +103,6 @@ static inline int PageHuge(struct page *page)
104 return 0; 103 return 0;
105} 104}
106 105
107static inline int PageHeadHuge(struct page *page_head)
108{
109 return 0;
110}
111
112static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 106static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
113{ 107{
114} 108}
@@ -360,6 +354,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
360 354
361static inline struct hstate *page_hstate(struct page *page) 355static inline struct hstate *page_hstate(struct page *page)
362{ 356{
357 VM_BUG_ON(!PageHuge(page));
363 return size_to_hstate(PAGE_SIZE << compound_order(page)); 358 return size_to_hstate(PAGE_SIZE << compound_order(page));
364} 359}
365 360
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f0e52383a001..1516a8ff8f92 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -41,6 +41,7 @@ extern struct fs_struct init_fs;
41 41
42#define INIT_SIGNALS(sig) { \ 42#define INIT_SIGNALS(sig) { \
43 .nr_threads = 1, \ 43 .nr_threads = 1, \
44 .thread_head = LIST_HEAD_INIT(init_task.thread_node), \
44 .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ 45 .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
45 .shared_pending = { \ 46 .shared_pending = { \
46 .list = LIST_HEAD_INIT(sig.shared_pending.list), \ 47 .list = LIST_HEAD_INIT(sig.shared_pending.list), \
@@ -222,6 +223,7 @@ extern struct task_group root_task_group;
222 [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ 223 [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
223 }, \ 224 }, \
224 .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ 225 .thread_group = LIST_HEAD_INIT(tsk.thread_group), \
226 .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \
225 INIT_IDS \ 227 INIT_IDS \
226 INIT_PERF_EVENTS(tsk) \ 228 INIT_PERF_EVENTS(tsk) \
227 INIT_TRACE_IRQFLAGS \ 229 INIT_TRACE_IRQFLAGS \
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 45c9b6a17bcb..3be6bb18562d 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -73,11 +73,7 @@ static inline void set_page_stable_node(struct page *page,
73struct page *ksm_might_need_to_copy(struct page *page, 73struct page *ksm_might_need_to_copy(struct page *page,
74 struct vm_area_struct *vma, unsigned long address); 74 struct vm_area_struct *vma, unsigned long address);
75 75
76int page_referenced_ksm(struct page *page, 76int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
77 struct mem_cgroup *memcg, unsigned long *vm_flags);
78int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
79int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
80 struct vm_area_struct *, unsigned long, void *), void *arg);
81void ksm_migrate_page(struct page *newpage, struct page *oldpage); 77void ksm_migrate_page(struct page *newpage, struct page *oldpage);
82 78
83#else /* !CONFIG_KSM */ 79#else /* !CONFIG_KSM */
@@ -115,13 +111,8 @@ static inline int page_referenced_ksm(struct page *page,
115 return 0; 111 return 0;
116} 112}
117 113
118static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) 114static inline int rmap_walk_ksm(struct page *page,
119{ 115 struct rmap_walk_control *rwc)
120 return 0;
121}
122
123static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*,
124 struct vm_area_struct *, unsigned long, void *), void *arg)
125{ 116{
126 return 0; 117 return 0;
127} 118}
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77c60e52939d..cd0274bebd4c 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -19,9 +19,13 @@
19 19
20#define INIT_MEMBLOCK_REGIONS 128 20#define INIT_MEMBLOCK_REGIONS 128
21 21
22/* Definition of memblock flags. */
23#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */
24
22struct memblock_region { 25struct memblock_region {
23 phys_addr_t base; 26 phys_addr_t base;
24 phys_addr_t size; 27 phys_addr_t size;
28 unsigned long flags;
25#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 29#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
26 int nid; 30 int nid;
27#endif 31#endif
@@ -43,12 +47,17 @@ struct memblock {
43 47
44extern struct memblock memblock; 48extern struct memblock memblock;
45extern int memblock_debug; 49extern int memblock_debug;
50#ifdef CONFIG_MOVABLE_NODE
51/* If movable_node boot option specified */
52extern bool movable_node_enabled;
53#endif /* CONFIG_MOVABLE_NODE */
46 54
47#define memblock_dbg(fmt, ...) \ 55#define memblock_dbg(fmt, ...) \
48 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 56 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
49 57
50phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, 58phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
51 phys_addr_t size, phys_addr_t align, int nid); 59 phys_addr_t start, phys_addr_t end,
60 int nid);
52phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, 61phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
53 phys_addr_t size, phys_addr_t align); 62 phys_addr_t size, phys_addr_t align);
54phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); 63phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -59,6 +68,28 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
59int memblock_free(phys_addr_t base, phys_addr_t size); 68int memblock_free(phys_addr_t base, phys_addr_t size);
60int memblock_reserve(phys_addr_t base, phys_addr_t size); 69int memblock_reserve(phys_addr_t base, phys_addr_t size);
61void memblock_trim_memory(phys_addr_t align); 70void memblock_trim_memory(phys_addr_t align);
71int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
72int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
73#ifdef CONFIG_MOVABLE_NODE
74static inline bool memblock_is_hotpluggable(struct memblock_region *m)
75{
76 return m->flags & MEMBLOCK_HOTPLUG;
77}
78
79static inline bool movable_node_is_enabled(void)
80{
81 return movable_node_enabled;
82}
83#else
84static inline bool memblock_is_hotpluggable(struct memblock_region *m)
85{
86 return false;
87}
88static inline bool movable_node_is_enabled(void)
89{
90 return false;
91}
92#endif
62 93
63#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 94#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
64int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, 95int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
@@ -87,7 +118,7 @@ void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start,
87/** 118/**
88 * for_each_free_mem_range - iterate through free memblock areas 119 * for_each_free_mem_range - iterate through free memblock areas
89 * @i: u64 used as loop variable 120 * @i: u64 used as loop variable
90 * @nid: node selector, %MAX_NUMNODES for all nodes 121 * @nid: node selector, %NUMA_NO_NODE for all nodes
91 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 122 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
92 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 123 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
93 * @p_nid: ptr to int for nid of the range, can be %NULL 124 * @p_nid: ptr to int for nid of the range, can be %NULL
@@ -107,7 +138,7 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
107/** 138/**
108 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas 139 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
109 * @i: u64 used as loop variable 140 * @i: u64 used as loop variable
110 * @nid: node selector, %MAX_NUMNODES for all nodes 141 * @nid: node selector, %NUMA_NO_NODE for all nodes
111 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 142 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
112 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 143 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
113 * @p_nid: ptr to int for nid of the range, can be %NULL 144 * @p_nid: ptr to int for nid of the range, can be %NULL
@@ -121,8 +152,21 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
121 i != (u64)ULLONG_MAX; \ 152 i != (u64)ULLONG_MAX; \
122 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) 153 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid))
123 154
155static inline void memblock_set_region_flags(struct memblock_region *r,
156 unsigned long flags)
157{
158 r->flags |= flags;
159}
160
161static inline void memblock_clear_region_flags(struct memblock_region *r,
162 unsigned long flags)
163{
164 r->flags &= ~flags;
165}
166
124#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 167#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
125int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); 168int memblock_set_node(phys_addr_t base, phys_addr_t size,
169 struct memblock_type *type, int nid);
126 170
127static inline void memblock_set_region_node(struct memblock_region *r, int nid) 171static inline void memblock_set_region_node(struct memblock_region *r, int nid)
128{ 172{
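
The new MEMBLOCK_HOTPLUG flag above is a plain bit in region->flags, set and cleared with OR / AND-NOT helpers and tested by memblock_is_hotpluggable(). A standalone model of that flag handling, using an illustrative region_model struct:

/*
 * Standalone model (not kernel code) of the per-region flag handling.
 */
#include <stdio.h>

#define MEMBLOCK_HOTPLUG 0x1UL	/* hotpluggable region */

struct region_model {
	unsigned long base;
	unsigned long size;
	unsigned long flags;
};

static void set_region_flags(struct region_model *r, unsigned long flags)
{
	r->flags |= flags;
}

static void clear_region_flags(struct region_model *r, unsigned long flags)
{
	r->flags &= ~flags;
}

static int is_hotpluggable(const struct region_model *r)
{
	return !!(r->flags & MEMBLOCK_HOTPLUG);
}

int main(void)
{
	struct region_model r = { 0x100000, 0x4000000, 0 };

	set_region_flags(&r, MEMBLOCK_HOTPLUG);
	printf("hotpluggable=%d\n", is_hotpluggable(&r));	/* 1 */
	clear_region_flags(&r, MEMBLOCK_HOTPLUG);
	printf("hotpluggable=%d\n", is_hotpluggable(&r));	/* 0 */
	return 0;
}
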
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 9fe426b30a41..5f1ea756aace 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -211,20 +211,8 @@ static inline void mpol_get(struct mempolicy *pol)
211{ 211{
212} 212}
213 213
214static inline struct mempolicy *mpol_dup(struct mempolicy *old)
215{
216 return NULL;
217}
218
219struct shared_policy {}; 214struct shared_policy {};
220 215
221static inline int mpol_set_shared_policy(struct shared_policy *info,
222 struct vm_area_struct *vma,
223 struct mempolicy *new)
224{
225 return -EINVAL;
226}
227
228static inline void mpol_shared_policy_init(struct shared_policy *sp, 216static inline void mpol_shared_policy_init(struct shared_policy *sp,
229 struct mempolicy *mpol) 217 struct mempolicy *mpol)
230{ 218{
@@ -234,12 +222,6 @@ static inline void mpol_free_shared_policy(struct shared_policy *p)
234{ 222{
235} 223}
236 224
237static inline struct mempolicy *
238mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
239{
240 return NULL;
241}
242
243#define vma_policy(vma) NULL 225#define vma_policy(vma) NULL
244 226
245static inline int 227static inline int
@@ -266,10 +248,6 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
266{ 248{
267} 249}
268 250
269static inline void mpol_fix_fork_child_flag(struct task_struct *p)
270{
271}
272
273static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, 251static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
274 unsigned long addr, gfp_t gfp_flags, 252 unsigned long addr, gfp_t gfp_flags,
275 struct mempolicy **mpol, nodemask_t **nodemask) 253 struct mempolicy **mpol, nodemask_t **nodemask)
@@ -284,12 +262,6 @@ static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
284 return false; 262 return false;
285} 263}
286 264
287static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
288 const nodemask_t *mask)
289{
290 return false;
291}
292
293static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 265static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
294 const nodemask_t *to, int flags) 266 const nodemask_t *to, int flags)
295{ 267{
@@ -307,10 +279,6 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
307} 279}
308#endif 280#endif
309 281
310static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
311{
312}
313
314static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, 282static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
315 unsigned long address) 283 unsigned long address)
316{ 284{
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f015c059e159..84a31ad0b791 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -35,16 +35,12 @@ enum migrate_reason {
35 35
36#ifdef CONFIG_MIGRATION 36#ifdef CONFIG_MIGRATION
37 37
38extern void putback_lru_pages(struct list_head *l);
39extern void putback_movable_pages(struct list_head *l); 38extern void putback_movable_pages(struct list_head *l);
40extern int migrate_page(struct address_space *, 39extern int migrate_page(struct address_space *,
41 struct page *, struct page *, enum migrate_mode); 40 struct page *, struct page *, enum migrate_mode);
42extern int migrate_pages(struct list_head *l, new_page_t x, 41extern int migrate_pages(struct list_head *l, new_page_t x,
43 unsigned long private, enum migrate_mode mode, int reason); 42 unsigned long private, enum migrate_mode mode, int reason);
44 43
45extern int fail_migrate_page(struct address_space *,
46 struct page *, struct page *);
47
48extern int migrate_prep(void); 44extern int migrate_prep(void);
49extern int migrate_prep_local(void); 45extern int migrate_prep_local(void);
50extern int migrate_vmas(struct mm_struct *mm, 46extern int migrate_vmas(struct mm_struct *mm,
@@ -59,7 +55,6 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
59 int extra_count); 55 int extra_count);
60#else 56#else
61 57
62static inline void putback_lru_pages(struct list_head *l) {}
63static inline void putback_movable_pages(struct list_head *l) {} 58static inline void putback_movable_pages(struct list_head *l) {}
64static inline int migrate_pages(struct list_head *l, new_page_t x, 59static inline int migrate_pages(struct list_head *l, new_page_t x,
65 unsigned long private, enum migrate_mode mode, int reason) 60 unsigned long private, enum migrate_mode mode, int reason)
@@ -86,7 +81,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
86 81
87/* Possible settings for the migrate_page() method in address_operations */ 82/* Possible settings for the migrate_page() method in address_operations */
88#define migrate_page NULL 83#define migrate_page NULL
89#define fail_migrate_page NULL
90 84
91#endif /* CONFIG_MIGRATION */ 85#endif /* CONFIG_MIGRATION */
92 86
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 35527173cf50..a512dd836931 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -57,6 +57,15 @@ extern int sysctl_legacy_va_layout;
57extern unsigned long sysctl_user_reserve_kbytes; 57extern unsigned long sysctl_user_reserve_kbytes;
58extern unsigned long sysctl_admin_reserve_kbytes; 58extern unsigned long sysctl_admin_reserve_kbytes;
59 59
60extern int sysctl_overcommit_memory;
61extern int sysctl_overcommit_ratio;
62extern unsigned long sysctl_overcommit_kbytes;
63
64extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *,
65 size_t *, loff_t *);
66extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
67 size_t *, loff_t *);
68
60#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) 69#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
61 70
62/* to align the pointer to the (next) page boundary */ 71/* to align the pointer to the (next) page boundary */
@@ -414,15 +423,44 @@ static inline int page_count(struct page *page)
414 return atomic_read(&compound_head(page)->_count); 423 return atomic_read(&compound_head(page)->_count);
415} 424}
416 425
426#ifdef CONFIG_HUGETLB_PAGE
427extern int PageHeadHuge(struct page *page_head);
428#else /* CONFIG_HUGETLB_PAGE */
429static inline int PageHeadHuge(struct page *page_head)
430{
431 return 0;
432}
433#endif /* CONFIG_HUGETLB_PAGE */
434
435static inline bool __compound_tail_refcounted(struct page *page)
436{
437 return !PageSlab(page) && !PageHeadHuge(page);
438}
439
440/*
441 * This takes a head page as parameter and tells if the
442 * tail page reference counting can be skipped.
443 *
444 * For this to be safe, PageSlab and PageHeadHuge must remain true on
445 * any given page where they return true here, until all tail pins
446 * have been released.
447 */
448static inline bool compound_tail_refcounted(struct page *page)
449{
450 VM_BUG_ON(!PageHead(page));
451 return __compound_tail_refcounted(page);
452}
453
417static inline void get_huge_page_tail(struct page *page) 454static inline void get_huge_page_tail(struct page *page)
418{ 455{
419 /* 456 /*
420 * __split_huge_page_refcount() cannot run 457 * __split_huge_page_refcount() cannot run from under us.
421 * from under us.
422 */ 458 */
459 VM_BUG_ON(!PageTail(page));
423 VM_BUG_ON(page_mapcount(page) < 0); 460 VM_BUG_ON(page_mapcount(page) < 0);
424 VM_BUG_ON(atomic_read(&page->_count) != 0); 461 VM_BUG_ON(atomic_read(&page->_count) != 0);
425 atomic_inc(&page->_mapcount); 462 if (compound_tail_refcounted(page->first_page))
463 atomic_inc(&page->_mapcount);
426} 464}
427 465
428extern bool __get_page_tail(struct page *page); 466extern bool __get_page_tail(struct page *page);
@@ -846,11 +884,14 @@ static __always_inline void *lowmem_page_address(const struct page *page)
846#endif 884#endif
847 885
848#if defined(WANT_PAGE_VIRTUAL) 886#if defined(WANT_PAGE_VIRTUAL)
849#define page_address(page) ((page)->virtual) 887static inline void *page_address(const struct page *page)
850#define set_page_address(page, address) \ 888{
851 do { \ 889 return page->virtual;
852 (page)->virtual = (address); \ 890}
853 } while(0) 891static inline void set_page_address(struct page *page, void *address)
892{
893 page->virtual = address;
894}
854#define page_address_init() do { } while(0) 895#define page_address_init() do { } while(0)
855#endif 896#endif
856 897
@@ -984,7 +1025,6 @@ extern void pagefault_out_of_memory(void);
984 * various contexts. 1025 * various contexts.
985 */ 1026 */
986#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ 1027#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
987#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */
988 1028
989extern void show_free_areas(unsigned int flags); 1029extern void show_free_areas(unsigned int flags);
990extern bool skip_free_areas_node(unsigned int flags, int nid); 1030extern bool skip_free_areas_node(unsigned int flags, int nid);
@@ -1318,6 +1358,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
1318 1358
1319#if USE_SPLIT_PTE_PTLOCKS 1359#if USE_SPLIT_PTE_PTLOCKS
1320#if ALLOC_SPLIT_PTLOCKS 1360#if ALLOC_SPLIT_PTLOCKS
1361void __init ptlock_cache_init(void);
1321extern bool ptlock_alloc(struct page *page); 1362extern bool ptlock_alloc(struct page *page);
1322extern void ptlock_free(struct page *page); 1363extern void ptlock_free(struct page *page);
1323 1364
@@ -1326,6 +1367,10 @@ static inline spinlock_t *ptlock_ptr(struct page *page)
1326 return page->ptl; 1367 return page->ptl;
1327} 1368}
1328#else /* ALLOC_SPLIT_PTLOCKS */ 1369#else /* ALLOC_SPLIT_PTLOCKS */
1370static inline void ptlock_cache_init(void)
1371{
1372}
1373
1329static inline bool ptlock_alloc(struct page *page) 1374static inline bool ptlock_alloc(struct page *page)
1330{ 1375{
1331 return true; 1376 return true;
@@ -1378,10 +1423,17 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
1378{ 1423{
1379 return &mm->page_table_lock; 1424 return &mm->page_table_lock;
1380} 1425}
1426static inline void ptlock_cache_init(void) {}
1381static inline bool ptlock_init(struct page *page) { return true; } 1427static inline bool ptlock_init(struct page *page) { return true; }
1382static inline void pte_lock_deinit(struct page *page) {} 1428static inline void pte_lock_deinit(struct page *page) {}
1383#endif /* USE_SPLIT_PTE_PTLOCKS */ 1429#endif /* USE_SPLIT_PTE_PTLOCKS */
1384 1430
1431static inline void pgtable_init(void)
1432{
1433 ptlock_cache_init();
1434 pgtable_cache_init();
1435}
1436
1385static inline bool pgtable_page_ctor(struct page *page) 1437static inline bool pgtable_page_ctor(struct page *page)
1386{ 1438{
1387 inc_zone_page_state(page, NR_PAGETABLE); 1439 inc_zone_page_state(page, NR_PAGETABLE);
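
compound_tail_refcounted(), added above, lets get_huge_page_tail() skip the tail-page _mapcount bump when the compound head is a slab or hugetlbfs page; per the comment, those predicates must stay true until all tail pins are released. A standalone model of that decision, with boolean fields standing in for PageSlab() and PageHeadHuge():

/*
 * Standalone model (not kernel code) of the tail-refcounting decision.
 */
#include <stdbool.h>
#include <stdio.h>

struct page_model {
	bool is_slab;		/* stands in for PageSlab() on the head */
	bool is_head_huge;	/* stands in for PageHeadHuge() on the head */
};

static bool compound_tail_refcounted_model(const struct page_model *head)
{
	return !head->is_slab && !head->is_head_huge;
}

int main(void)
{
	struct page_model thp_head  = { false, false };
	struct page_model huge_head = { false, true  };

	printf("THP head: refcount tails? %d\n",
	       compound_tail_refcounted_model(&thp_head));	/* 1 */
	printf("hugetlbfs head: refcount tails? %d\n",
	       compound_tail_refcounted_model(&huge_head));	/* 0 */
	return 0;
}
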
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 7f7f8dae4b1d..16373c8f5f57 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -9,6 +9,7 @@
9 9
10extern int sysctl_overcommit_memory; 10extern int sysctl_overcommit_memory;
11extern int sysctl_overcommit_ratio; 11extern int sysctl_overcommit_ratio;
12extern unsigned long sysctl_overcommit_kbytes;
12extern struct percpu_counter vm_committed_as; 13extern struct percpu_counter vm_committed_as;
13 14
14#ifdef CONFIG_SMP 15#ifdef CONFIG_SMP
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bd791e452ad7..5f2052c83154 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -490,6 +490,12 @@ struct zone {
490 unsigned long managed_pages; 490 unsigned long managed_pages;
491 491
492 /* 492 /*
493 * Number of MIGRATE_RESEVE page block. To maintain for just
494 * optimization. Protected by zone->lock.
495 */
496 int nr_migrate_reserve_block;
497
498 /*
493 * rarely used fields: 499 * rarely used fields:
494 */ 500 */
495 const char *name; 501 const char *name;
@@ -758,10 +764,7 @@ typedef struct pglist_data {
758 int kswapd_max_order; 764 int kswapd_max_order;
759 enum zone_type classzone_idx; 765 enum zone_type classzone_idx;
760#ifdef CONFIG_NUMA_BALANCING 766#ifdef CONFIG_NUMA_BALANCING
761 /* 767 /* Lock serializing the migrate rate limiting window */
762 * Lock serializing the per destination node AutoNUMA memory
763 * migration rate limiting data.
764 */
765 spinlock_t numabalancing_migrate_lock; 768 spinlock_t numabalancing_migrate_lock;
766 769
767 /* Rate limiting time interval */ 770 /* Rate limiting time interval */
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 7931efe71175..fb616942e4c7 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -94,78 +94,12 @@ extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);
94extern struct posix_acl *get_posix_acl(struct inode *, int); 94extern struct posix_acl *get_posix_acl(struct inode *, int);
95extern int set_posix_acl(struct inode *, int, struct posix_acl *); 95extern int set_posix_acl(struct inode *, int, struct posix_acl *);
96 96
97#ifdef CONFIG_FS_POSIX_ACL 97struct posix_acl **acl_by_type(struct inode *inode, int type);
98static inline struct posix_acl **acl_by_type(struct inode *inode, int type) 98struct posix_acl *get_cached_acl(struct inode *inode, int type);
99{ 99struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
100 switch (type) { 100void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
101 case ACL_TYPE_ACCESS: 101void forget_cached_acl(struct inode *inode, int type);
102 return &inode->i_acl; 102void forget_all_cached_acls(struct inode *inode);
103 case ACL_TYPE_DEFAULT:
104 return &inode->i_default_acl;
105 default:
106 BUG();
107 }
108}
109
110static inline struct posix_acl *get_cached_acl(struct inode *inode, int type)
111{
112 struct posix_acl **p = acl_by_type(inode, type);
113 struct posix_acl *acl = ACCESS_ONCE(*p);
114 if (acl) {
115 spin_lock(&inode->i_lock);
116 acl = *p;
117 if (acl != ACL_NOT_CACHED)
118 acl = posix_acl_dup(acl);
119 spin_unlock(&inode->i_lock);
120 }
121 return acl;
122}
123
124static inline struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
125{
126 return rcu_dereference(*acl_by_type(inode, type));
127}
128
129static inline void set_cached_acl(struct inode *inode,
130 int type,
131 struct posix_acl *acl)
132{
133 struct posix_acl **p = acl_by_type(inode, type);
134 struct posix_acl *old;
135 spin_lock(&inode->i_lock);
136 old = *p;
137 rcu_assign_pointer(*p, posix_acl_dup(acl));
138 spin_unlock(&inode->i_lock);
139 if (old != ACL_NOT_CACHED)
140 posix_acl_release(old);
141}
142
143static inline void forget_cached_acl(struct inode *inode, int type)
144{
145 struct posix_acl **p = acl_by_type(inode, type);
146 struct posix_acl *old;
147 spin_lock(&inode->i_lock);
148 old = *p;
149 *p = ACL_NOT_CACHED;
150 spin_unlock(&inode->i_lock);
151 if (old != ACL_NOT_CACHED)
152 posix_acl_release(old);
153}
154
155static inline void forget_all_cached_acls(struct inode *inode)
156{
157 struct posix_acl *old_access, *old_default;
158 spin_lock(&inode->i_lock);
159 old_access = inode->i_acl;
160 old_default = inode->i_default_acl;
161 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
162 spin_unlock(&inode->i_lock);
163 if (old_access != ACL_NOT_CACHED)
164 posix_acl_release(old_access);
165 if (old_default != ACL_NOT_CACHED)
166 posix_acl_release(old_default);
167}
168#endif
169 103
170static inline void cache_no_acl(struct inode *inode) 104static inline void cache_no_acl(struct inode *inode)
171{ 105{
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6dacb93a6d94..1da693d51255 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -184,13 +184,13 @@ static inline void page_dup_rmap(struct page *page)
184int page_referenced(struct page *, int is_locked, 184int page_referenced(struct page *, int is_locked,
185 struct mem_cgroup *memcg, unsigned long *vm_flags); 185 struct mem_cgroup *memcg, unsigned long *vm_flags);
186int page_referenced_one(struct page *, struct vm_area_struct *, 186int page_referenced_one(struct page *, struct vm_area_struct *,
187 unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); 187 unsigned long address, void *arg);
188 188
189#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) 189#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
190 190
191int try_to_unmap(struct page *, enum ttu_flags flags); 191int try_to_unmap(struct page *, enum ttu_flags flags);
192int try_to_unmap_one(struct page *, struct vm_area_struct *, 192int try_to_unmap_one(struct page *, struct vm_area_struct *,
193 unsigned long address, enum ttu_flags flags); 193 unsigned long address, void *arg);
194 194
195/* 195/*
196 * Called from mm/filemap_xip.c to unmap empty zero page 196 * Called from mm/filemap_xip.c to unmap empty zero page
@@ -236,10 +236,27 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
236int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); 236int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
237 237
238/* 238/*
239 * Called by migrate.c to remove migration ptes, but might be used more later. 239 * rmap_walk_control: To control rmap traversing for specific needs
240 *
241 * arg: passed to rmap_one() and invalid_vma()
242 * rmap_one: executed on each vma where page is mapped
243 * done: for checking traversing termination condition
244 * file_nonlinear: for handling file nonlinear mapping
245 * anon_lock: for getting anon_lock by optimized way rather than default
246 * invalid_vma: for skipping uninterested vma
240 */ 247 */
241int rmap_walk(struct page *page, int (*rmap_one)(struct page *, 248struct rmap_walk_control {
242 struct vm_area_struct *, unsigned long, void *), void *arg); 249 void *arg;
250 int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
251 unsigned long addr, void *arg);
252 int (*done)(struct page *page);
253 int (*file_nonlinear)(struct page *, struct address_space *,
254 struct vm_area_struct *vma);
255 struct anon_vma *(*anon_lock)(struct page *page);
256 bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
257};
258
259int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
243 260
244#else /* !CONFIG_MMU */ 261#else /* !CONFIG_MMU */
245 262
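
rmap_walk_control, introduced above, replaces the bare rmap_one function pointer with a control structure bundling the callback, an opaque arg and optional hooks such as done(). A userspace model of that walker shape; the types are simplified stand-ins, not the kernel's:

/*
 * Standalone model of a control-structure-driven reverse-map walk.
 */
#include <stdio.h>

struct vma_model { unsigned long start; };
struct page_model { struct vma_model *mapped_in[3]; int nr; };

struct walk_control_model {
	void *arg;
	int (*rmap_one)(struct page_model *page, struct vma_model *vma,
			void *arg);
	int (*done)(struct page_model *page);	/* non-zero stops the walk */
};

static int walk_model(struct page_model *page, struct walk_control_model *wc)
{
	int i, ret = 0;

	for (i = 0; i < page->nr; i++) {
		ret = wc->rmap_one(page, page->mapped_in[i], wc->arg);
		if (ret)
			break;
		if (wc->done && wc->done(page))
			break;
	}
	return ret;
}

static int count_one(struct page_model *page, struct vma_model *vma, void *arg)
{
	(void)page;
	(*(int *)arg)++;
	printf("visited vma at %#lx\n", vma->start);
	return 0;
}

int main(void)
{
	struct vma_model a = { 0x1000 }, b = { 0x2000 };
	struct page_model page = { { &a, &b }, 2 };
	int count = 0;
	struct walk_control_model wc = { &count, count_one, NULL };

	walk_model(&page, &wc);
	printf("page mapped in %d vmas\n", count);
	return 0;
}
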
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffccdad050b5..485234d2fd42 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -549,6 +549,7 @@ struct signal_struct {
549 atomic_t sigcnt; 549 atomic_t sigcnt;
550 atomic_t live; 550 atomic_t live;
551 int nr_threads; 551 int nr_threads;
552 struct list_head thread_head;
552 553
553 wait_queue_head_t wait_chldexit; /* for wait4() */ 554 wait_queue_head_t wait_chldexit; /* for wait4() */
554 555
@@ -1271,6 +1272,7 @@ struct task_struct {
1271 /* PID/PID hash table linkage. */ 1272 /* PID/PID hash table linkage. */
1272 struct pid_link pids[PIDTYPE_MAX]; 1273 struct pid_link pids[PIDTYPE_MAX];
1273 struct list_head thread_group; 1274 struct list_head thread_group;
1275 struct list_head thread_node;
1274 1276
1275 struct completion *vfork_done; /* for vfork() */ 1277 struct completion *vfork_done; /* for vfork() */
1276 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1278 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
@@ -2341,6 +2343,16 @@ extern bool current_is_single_threaded(void);
2341#define while_each_thread(g, t) \ 2343#define while_each_thread(g, t) \
2342 while ((t = next_thread(t)) != g) 2344 while ((t = next_thread(t)) != g)
2343 2345
2346#define __for_each_thread(signal, t) \
2347 list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
2348
2349#define for_each_thread(p, t) \
2350 __for_each_thread((p)->signal, t)
2351
2352/* Careful: this is a double loop, 'break' won't work as expected. */
2353#define for_each_process_thread(p, t) \
2354 for_each_process(p) for_each_thread(p, t)
2355
2344static inline int get_nr_threads(struct task_struct *tsk) 2356static inline int get_nr_threads(struct task_struct *tsk)
2345{ 2357{
2346 return tsk->signal->nr_threads; 2358 return tsk->signal->nr_threads;
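
The new for_each_process_thread() is flagged above as a double loop where break will not behave as expected. A standalone illustration of why, using simplified stand-in macros that expand to two nested for statements:

/*
 * Standalone illustration of the "double loop" caveat: when one macro
 * expands to two nested for statements, break only leaves the inner loop.
 */
#include <stdio.h>

#define for_each_outer(i)	for ((i) = 0; (i) < 3; (i)++)
#define for_each_inner(j)	for ((j) = 0; (j) < 3; (j)++)
#define for_each_pair(i, j)	for_each_outer(i) for_each_inner(j)

int main(void)
{
	int i, j, visited = 0;

	for_each_pair(i, j) {
		visited++;
		if (j == 0)
			break;		/* leaves only the inner loop */
	}
	printf("visited %d pairs, not 1: break did not stop the outer loop\n",
	       visited);	/* prints 3 */
	return 0;
}
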
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index fde1b3e94c7d..06f544ef2f6f 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -67,6 +67,48 @@ TRACE_EVENT(mm_compaction_migratepages,
67 __entry->nr_failed) 67 __entry->nr_failed)
68); 68);
69 69
70TRACE_EVENT(mm_compaction_begin,
71 TP_PROTO(unsigned long zone_start, unsigned long migrate_start,
72 unsigned long free_start, unsigned long zone_end),
73
74 TP_ARGS(zone_start, migrate_start, free_start, zone_end),
75
76 TP_STRUCT__entry(
77 __field(unsigned long, zone_start)
78 __field(unsigned long, migrate_start)
79 __field(unsigned long, free_start)
80 __field(unsigned long, zone_end)
81 ),
82
83 TP_fast_assign(
84 __entry->zone_start = zone_start;
85 __entry->migrate_start = migrate_start;
86 __entry->free_start = free_start;
87 __entry->zone_end = zone_end;
88 ),
89
90 TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu",
91 __entry->zone_start,
92 __entry->migrate_start,
93 __entry->free_start,
94 __entry->zone_end)
95);
96
97TRACE_EVENT(mm_compaction_end,
98 TP_PROTO(int status),
99
100 TP_ARGS(status),
101
102 TP_STRUCT__entry(
103 __field(int, status)
104 ),
105
106 TP_fast_assign(
107 __entry->status = status;
108 ),
109
110 TP_printk("status=%d", __entry->status)
111);
70 112
71#endif /* _TRACE_COMPACTION_H */ 113#endif /* _TRACE_COMPACTION_H */
72 114
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index ec2a6ccfd7e5..3075ffbb9a83 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -45,6 +45,32 @@ TRACE_EVENT(mm_migrate_pages,
45 __print_symbolic(__entry->reason, MIGRATE_REASON)) 45 __print_symbolic(__entry->reason, MIGRATE_REASON))
46); 46);
47 47
48TRACE_EVENT(mm_numa_migrate_ratelimit,
49
50 TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages),
51
52 TP_ARGS(p, dst_nid, nr_pages),
53
54 TP_STRUCT__entry(
55 __array( char, comm, TASK_COMM_LEN)
56 __field( pid_t, pid)
57 __field( int, dst_nid)
58 __field( unsigned long, nr_pages)
59 ),
60
61 TP_fast_assign(
62 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
63 __entry->pid = p->pid;
64 __entry->dst_nid = dst_nid;
65 __entry->nr_pages = nr_pages;
66 ),
67
68 TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu",
69 __entry->comm,
70 __entry->pid,
71 __entry->dst_nid,
72 __entry->nr_pages)
73);
48#endif /* _TRACE_MIGRATE_H */ 74#endif /* _TRACE_MIGRATE_H */
49 75
50/* This part must be outside protection */ 76/* This part must be outside protection */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 04c308413a5d..67e1bbf83695 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -443,6 +443,93 @@ TRACE_EVENT(sched_process_hang,
443); 443);
444#endif /* CONFIG_DETECT_HUNG_TASK */ 444#endif /* CONFIG_DETECT_HUNG_TASK */
445 445
446DECLARE_EVENT_CLASS(sched_move_task_template,
447
448 TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
449
450 TP_ARGS(tsk, src_cpu, dst_cpu),
451
452 TP_STRUCT__entry(
453 __field( pid_t, pid )
454 __field( pid_t, tgid )
455 __field( pid_t, ngid )
456 __field( int, src_cpu )
457 __field( int, src_nid )
458 __field( int, dst_cpu )
459 __field( int, dst_nid )
460 ),
461
462 TP_fast_assign(
463 __entry->pid = task_pid_nr(tsk);
464 __entry->tgid = task_tgid_nr(tsk);
465 __entry->ngid = task_numa_group_id(tsk);
466 __entry->src_cpu = src_cpu;
467 __entry->src_nid = cpu_to_node(src_cpu);
468 __entry->dst_cpu = dst_cpu;
469 __entry->dst_nid = cpu_to_node(dst_cpu);
470 ),
471
472 TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d",
473 __entry->pid, __entry->tgid, __entry->ngid,
474 __entry->src_cpu, __entry->src_nid,
475 __entry->dst_cpu, __entry->dst_nid)
476);
477
478/*
479 * Tracks migration of tasks from one runqueue to another. Can be used to
480 * detect if automatic NUMA balancing is bouncing between nodes
481 */
482DEFINE_EVENT(sched_move_task_template, sched_move_numa,
483 TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
484
485 TP_ARGS(tsk, src_cpu, dst_cpu)
486);
487
488DEFINE_EVENT(sched_move_task_template, sched_stick_numa,
489 TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
490
491 TP_ARGS(tsk, src_cpu, dst_cpu)
492);
493
494TRACE_EVENT(sched_swap_numa,
495
496 TP_PROTO(struct task_struct *src_tsk, int src_cpu,
497 struct task_struct *dst_tsk, int dst_cpu),
498
499 TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu),
500
501 TP_STRUCT__entry(
502 __field( pid_t, src_pid )
503 __field( pid_t, src_tgid )
504 __field( pid_t, src_ngid )
505 __field( int, src_cpu )
506 __field( int, src_nid )
507 __field( pid_t, dst_pid )
508 __field( pid_t, dst_tgid )
509 __field( pid_t, dst_ngid )
510 __field( int, dst_cpu )
511 __field( int, dst_nid )
512 ),
513
514 TP_fast_assign(
515 __entry->src_pid = task_pid_nr(src_tsk);
516 __entry->src_tgid = task_tgid_nr(src_tsk);
517 __entry->src_ngid = task_numa_group_id(src_tsk);
518 __entry->src_cpu = src_cpu;
519 __entry->src_nid = cpu_to_node(src_cpu);
520 __entry->dst_pid = task_pid_nr(dst_tsk);
521 __entry->dst_tgid = task_tgid_nr(dst_tsk);
522 __entry->dst_ngid = task_numa_group_id(dst_tsk);
523 __entry->dst_cpu = dst_cpu;
524 __entry->dst_nid = cpu_to_node(dst_cpu);
525 ),
526
527 TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
528 __entry->src_pid, __entry->src_tgid, __entry->src_ngid,
529 __entry->src_cpu, __entry->src_nid,
530 __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
531 __entry->dst_cpu, __entry->dst_nid)
532);
446#endif /* _TRACE_SCHED_H */ 533#endif /* _TRACE_SCHED_H */
447 534
448/* This part must be outside protection */ 535/* This part must be outside protection */
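
The sched_move_numa, sched_stick_numa and sched_swap_numa events defined above can be consumed like any other tracepoint. A hedged usage sketch that enables them through tracefs; the mount point is assumed to be /sys/kernel/debug/tracing and root privileges are required:

/*
 * Hedged usage sketch: enable the NUMA scheduler tracepoints via tracefs.
 * Paths and availability depend on the running kernel.
 */
#include <stdio.h>

static int echo_to(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	echo_to("/sys/kernel/debug/tracing/events/sched/sched_move_numa/enable", "1");
	echo_to("/sys/kernel/debug/tracing/events/sched/sched_stick_numa/enable", "1");
	echo_to("/sys/kernel/debug/tracing/events/sched/sched_swap_numa/enable", "1");
	puts("watch /sys/kernel/debug/tracing/trace_pipe for NUMA task migrations");
	return 0;
}
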
diff --git a/init/main.c b/init/main.c
index febc511e078a..f865261fb096 100644
--- a/init/main.c
+++ b/init/main.c
@@ -355,9 +355,11 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
355 */ 355 */
356static void __init setup_command_line(char *command_line) 356static void __init setup_command_line(char *command_line)
357{ 357{
358 saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); 358 saved_command_line =
359 initcall_command_line = alloc_bootmem(strlen (boot_command_line)+1); 359 memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
360 static_command_line = alloc_bootmem(strlen (command_line)+1); 360 initcall_command_line =
361 memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
362 static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0);
361 strcpy (saved_command_line, boot_command_line); 363 strcpy (saved_command_line, boot_command_line);
362 strcpy (static_command_line, command_line); 364 strcpy (static_command_line, command_line);
363} 365}
@@ -476,7 +478,7 @@ static void __init mm_init(void)
476 mem_init(); 478 mem_init();
477 kmem_cache_init(); 479 kmem_cache_init();
478 percpu_init_late(); 480 percpu_init_late();
479 pgtable_cache_init(); 481 pgtable_init();
480 vmalloc_init(); 482 vmalloc_init();
481} 483}
482 484
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 43c307dc9453..67ccf0e7cca9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk)
912} 912}
913 913
914static int audit_tree_handle_event(struct fsnotify_group *group, 914static int audit_tree_handle_event(struct fsnotify_group *group,
915 struct inode *to_tell,
915 struct fsnotify_mark *inode_mark, 916 struct fsnotify_mark *inode_mark,
916 struct fsnotify_mark *vfsmonut_mark, 917 struct fsnotify_mark *vfsmount_mark,
917 struct fsnotify_event *event) 918 u32 mask, void *data, int data_type,
919 const unsigned char *file_name)
918{ 920{
919 BUG(); 921 return 0;
920 return -EOPNOTSUPP;
921} 922}
922 923
923static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) 924static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
@@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
933 BUG_ON(atomic_read(&entry->refcnt) < 1); 934 BUG_ON(atomic_read(&entry->refcnt) < 1);
934} 935}
935 936
936static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
937 struct fsnotify_mark *inode_mark,
938 struct fsnotify_mark *vfsmount_mark,
939 __u32 mask, void *data, int data_type)
940{
941 return false;
942}
943
944static const struct fsnotify_ops audit_tree_ops = { 937static const struct fsnotify_ops audit_tree_ops = {
945 .handle_event = audit_tree_handle_event, 938 .handle_event = audit_tree_handle_event,
946 .should_send_event = audit_tree_send_event,
947 .free_group_priv = NULL,
948 .free_event_priv = NULL,
949 .freeing_mark = audit_tree_freeing_mark, 939 .freeing_mark = audit_tree_freeing_mark,
950}; 940};
951 941
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 22831c4d369c..2596fac5dcb4 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule)
465 } 465 }
466} 466}
467 467
468static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
469 struct fsnotify_mark *inode_mark,
470 struct fsnotify_mark *vfsmount_mark,
471 __u32 mask, void *data, int data_type)
472{
473 return true;
474}
475
476/* Update watch data in audit rules based on fsnotify events. */ 468/* Update watch data in audit rules based on fsnotify events. */
477static int audit_watch_handle_event(struct fsnotify_group *group, 469static int audit_watch_handle_event(struct fsnotify_group *group,
470 struct inode *to_tell,
478 struct fsnotify_mark *inode_mark, 471 struct fsnotify_mark *inode_mark,
479 struct fsnotify_mark *vfsmount_mark, 472 struct fsnotify_mark *vfsmount_mark,
480 struct fsnotify_event *event) 473 u32 mask, void *data, int data_type,
474 const unsigned char *dname)
481{ 475{
482 struct inode *inode; 476 struct inode *inode;
483 __u32 mask = event->mask;
484 const char *dname = event->file_name;
485 struct audit_parent *parent; 477 struct audit_parent *parent;
486 478
487 parent = container_of(inode_mark, struct audit_parent, mark); 479 parent = container_of(inode_mark, struct audit_parent, mark);
488 480
489 BUG_ON(group != audit_watch_group); 481 BUG_ON(group != audit_watch_group);
490 482
491 switch (event->data_type) { 483 switch (data_type) {
492 case (FSNOTIFY_EVENT_PATH): 484 case (FSNOTIFY_EVENT_PATH):
493 inode = event->path.dentry->d_inode; 485 inode = ((struct path *)data)->dentry->d_inode;
494 break; 486 break;
495 case (FSNOTIFY_EVENT_INODE): 487 case (FSNOTIFY_EVENT_INODE):
496 inode = event->inode; 488 inode = (struct inode *)data;
497 break; 489 break;
498 default: 490 default:
499 BUG(); 491 BUG();
@@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
512} 504}
513 505
514static const struct fsnotify_ops audit_watch_fsnotify_ops = { 506static const struct fsnotify_ops audit_watch_fsnotify_ops = {
515 .should_send_event = audit_watch_should_send_event,
516 .handle_event = audit_watch_handle_event, 507 .handle_event = audit_watch_handle_event,
517 .free_group_priv = NULL,
518 .freeing_mark = NULL,
519 .free_event_priv = NULL,
520}; 508};
521 509
522static int __init audit_watch_init(void) 510static int __init audit_watch_init(void)
diff --git a/kernel/exit.c b/kernel/exit.c
index a949819055d5..1e77fc645317 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 } 75 }
76 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
77 list_del_rcu(&p->thread_node);
77} 78}
78 79
79/* 80/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 294189fc7ac8..2f11bbe376b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1035,6 +1035,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1035 sig->nr_threads = 1; 1035 sig->nr_threads = 1;
1036 atomic_set(&sig->live, 1); 1036 atomic_set(&sig->live, 1);
1037 atomic_set(&sig->sigcnt, 1); 1037 atomic_set(&sig->sigcnt, 1);
1038
1039 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
1040 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1041 tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1042
1038 init_waitqueue_head(&sig->wait_chldexit); 1043 init_waitqueue_head(&sig->wait_chldexit);
1039 sig->curr_target = tsk; 1044 sig->curr_target = tsk;
1040 init_sigpending(&sig->shared_pending); 1045 init_sigpending(&sig->shared_pending);
@@ -1474,6 +1479,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1474 atomic_inc(&current->signal->sigcnt); 1479 atomic_inc(&current->signal->sigcnt);
1475 list_add_tail_rcu(&p->thread_group, 1480 list_add_tail_rcu(&p->thread_group,
1476 &p->group_leader->thread_group); 1481 &p->group_leader->thread_group);
1482 list_add_tail_rcu(&p->thread_node,
1483 &p->signal->thread_head);
1477 } 1484 }
1478 attach_pid(p, PIDTYPE_PID); 1485 attach_pid(p, PIDTYPE_PID);
1479 nr_threads++; 1486 nr_threads++;
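
The copy_signal() hunk above initialises the new thread list with two compound-literal assignments instead of INIT_LIST_HEAD() plus list_add(), as its comment notes. A standalone model with a minimal list_head showing that this leaves thread_head and thread_node on the same two-entry circular list:

/*
 * Standalone model (userspace, minimal list_head) of the initialisation
 * trick: pointing each of the two list heads at the other yields one
 * circular list with no separate INIT_LIST_HEAD() + list_add() step.
 */
#include <assert.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
#define LIST_HEAD_INIT(name) { &(name), &(name) }

struct signal_model { struct list_head thread_head; };
struct task_model   { struct list_head thread_node; };

int main(void)
{
	struct signal_model sig;
	struct task_model tsk;

	sig.thread_head = (struct list_head)LIST_HEAD_INIT(tsk.thread_node);
	tsk.thread_node = (struct list_head)LIST_HEAD_INIT(sig.thread_head);

	/* the two nodes now form one circular list: head <-> node */
	assert(sig.thread_head.next == &tsk.thread_node);
	assert(sig.thread_head.prev == &tsk.thread_node);
	assert(tsk.thread_node.next == &sig.thread_head);
	puts("thread_head and thread_node are linked");
	return 0;
}
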
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b38109e204af..d9f61a145802 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
637 BUG_ON(!region); 637 BUG_ON(!region);
638 } else 638 } else
639 /* This allocation cannot fail */ 639 /* This allocation cannot fail */
640 region = alloc_bootmem(sizeof(struct nosave_region)); 640 region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
641 region->start_pfn = start_pfn; 641 region->start_pfn = start_pfn;
642 region->end_pfn = end_pfn; 642 region->end_pfn = end_pfn;
643 list_add_tail(&region->list, &nosave_regions); 643 list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index be7c86bae576..f8b41bddc6dc 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -757,14 +757,10 @@ void __init setup_log_buf(int early)
757 return; 757 return;
758 758
759 if (early) { 759 if (early) {
760 unsigned long mem; 760 new_log_buf =
761 761 memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
762 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
763 if (!mem)
764 return;
765 new_log_buf = __va(mem);
766 } else { 762 } else {
767 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); 763 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
768 } 764 }
769 765
770 if (unlikely(!new_log_buf)) { 766 if (unlikely(!new_log_buf)) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3897e09e86a2..4d6964e49711 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out; 1109 goto out;
1110 1110
1111 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1112 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112 1113
1113out: 1114out:
@@ -4603,6 +4604,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
4603 4604
4604 /* TODO: This is not properly updating schedstats */ 4605 /* TODO: This is not properly updating schedstats */
4605 4606
4607 trace_sched_move_numa(p, curr_cpu, target_cpu);
4606 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4608 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4607} 4609}
4608 4610
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b24b6cfde9aa..867b0a4b0893 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1250,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
1250 p->numa_scan_period = task_scan_min(p); 1250 p->numa_scan_period = task_scan_min(p);
1251 1251
1252 if (env.best_task == NULL) { 1252 if (env.best_task == NULL) {
1253 int ret = migrate_task_to(p, env.best_cpu); 1253 ret = migrate_task_to(p, env.best_cpu);
1254 if (ret != 0)
1255 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1254 return ret; 1256 return ret;
1255 } 1257 }
1256 1258
1257 ret = migrate_swap(p, env.best_task); 1259 ret = migrate_swap(p, env.best_task);
1260 if (ret != 0)
1261 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1258 put_task_struct(env.best_task); 1262 put_task_struct(env.best_task);
1259 return ret; 1263 return ret;
1260} 1264}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8da99f905cf..332cefcdb04b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -95,8 +95,6 @@
95#if defined(CONFIG_SYSCTL) 95#if defined(CONFIG_SYSCTL)
96 96
97/* External variables not in a header file. */ 97/* External variables not in a header file. */
98extern int sysctl_overcommit_memory;
99extern int sysctl_overcommit_ratio;
100extern int max_threads; 98extern int max_threads;
101extern int suid_dumpable; 99extern int suid_dumpable;
102#ifdef CONFIG_COREDUMP 100#ifdef CONFIG_COREDUMP
@@ -1121,7 +1119,14 @@ static struct ctl_table vm_table[] = {
1121 .data = &sysctl_overcommit_ratio, 1119 .data = &sysctl_overcommit_ratio,
1122 .maxlen = sizeof(sysctl_overcommit_ratio), 1120 .maxlen = sizeof(sysctl_overcommit_ratio),
1123 .mode = 0644, 1121 .mode = 0644,
1124 .proc_handler = proc_dointvec, 1122 .proc_handler = overcommit_ratio_handler,
1123 },
1124 {
1125 .procname = "overcommit_kbytes",
1126 .data = &sysctl_overcommit_kbytes,
1127 .maxlen = sizeof(sysctl_overcommit_kbytes),
1128 .mode = 0644,
1129 .proc_handler = overcommit_kbytes_handler,
1125 }, 1130 },
1126 { 1131 {
1127 .procname = "page-cluster", 1132 .procname = "page-cluster",
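
The new vm_table entry exposes sysctl_overcommit_kbytes as /proc/sys/vm/overcommit_kbytes beside the existing overcommit_ratio, each behind its own handler (presumably so that setting one can retire the other; the handlers themselves are not in this hunk). A small userspace probe of the two files, assuming the usual procfs mapping of vm_table entries:

#include <stdio.h>

/* Read one vm sysctl as text; the paths follow the vm_table procnames above. */
static int read_sysctl(const char *path, char *buf, size_t len)
{
    FILE *f = fopen(path, "r");

    if (!f)
        return -1;
    if (!fgets(buf, (int)len, f)) {
        fclose(f);
        return -1;
    }
    fclose(f);
    return 0;
}

int main(void)
{
    char ratio[64], kbytes[64];

    if (read_sysctl("/proc/sys/vm/overcommit_ratio", ratio, sizeof(ratio)) == 0)
        printf("overcommit_ratio:  %s", ratio);
    if (read_sysctl("/proc/sys/vm/overcommit_kbytes", kbytes, sizeof(kbytes)) == 0)
        printf("overcommit_kbytes: %s", kbytes);
    else
        printf("overcommit_kbytes not present (kernel without this series?)\n");
    return 0;
}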
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6982094a7e74..900b63c1e899 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1584,8 +1584,16 @@ config DMA_API_DEBUG
1584 With this option you will be able to detect common bugs in device 1584 With this option you will be able to detect common bugs in device
1585 drivers like double-freeing of DMA mappings or freeing mappings that 1585 drivers like double-freeing of DMA mappings or freeing mappings that
1586 were never allocated. 1586 were never allocated.
1587 This option causes a performance degredation. Use only if you want 1587
1588 to debug device drivers. If unsure, say N. 1588 This also attempts to catch cases where a page owned by DMA is
1589 accessed by the cpu in a way that could cause data corruption. For
1590 example, this enables cow_user_page() to check that the source page is
1591 not undergoing DMA.
1592
1593 This option causes a performance degradation. Use only if you want to
1594 debug device drivers and dma interactions.
1595
1596 If unsure, say N.
1589 1597
1590source "samples/Kconfig" 1598source "samples/Kconfig"
1591 1599
diff --git a/lib/cpumask.c b/lib/cpumask.c
index d327b87c99b7..b810b753c607 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -140,7 +140,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var);
140 */ 140 */
141void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) 141void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
142{ 142{
143 *mask = alloc_bootmem(cpumask_size()); 143 *mask = memblock_virt_alloc(cpumask_size(), 0);
144} 144}
145 145
146/** 146/**
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var);
161 */ 161 */
162void __init free_bootmem_cpumask_var(cpumask_var_t mask) 162void __init free_bootmem_cpumask_var(cpumask_var_t mask)
163{ 163{
164 free_bootmem(__pa(mask), cpumask_size()); 164 memblock_free_early(__pa(mask), cpumask_size());
165} 165}
166#endif 166#endif
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index d87a17a819d0..c38083871f11 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -53,11 +53,26 @@ enum map_err_types {
53 53
54#define DMA_DEBUG_STACKTRACE_ENTRIES 5 54#define DMA_DEBUG_STACKTRACE_ENTRIES 5
55 55
56/**
57 * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
58 * @list: node on pre-allocated free_entries list
59 * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
60 * @type: single, page, sg, coherent
61 * @pfn: page frame of the start address
62 * @offset: offset of mapping relative to pfn
63 * @size: length of the mapping
64 * @direction: enum dma_data_direction
65 * @sg_call_ents: 'nents' from dma_map_sg
66 * @sg_mapped_ents: 'mapped_ents' from dma_map_sg
67 * @map_err_type: track whether dma_mapping_error() was checked
68 * @stacktrace: support backtraces when a violation is detected
69 */
56struct dma_debug_entry { 70struct dma_debug_entry {
57 struct list_head list; 71 struct list_head list;
58 struct device *dev; 72 struct device *dev;
59 int type; 73 int type;
60 phys_addr_t paddr; 74 unsigned long pfn;
75 size_t offset;
61 u64 dev_addr; 76 u64 dev_addr;
62 u64 size; 77 u64 size;
63 int direction; 78 int direction;
@@ -372,6 +387,11 @@ static void hash_bucket_del(struct dma_debug_entry *entry)
372 list_del(&entry->list); 387 list_del(&entry->list);
373} 388}
374 389
390static unsigned long long phys_addr(struct dma_debug_entry *entry)
391{
392 return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
393}
394
375/* 395/*
376 * Dump mapping entries for debugging purposes 396 * Dump mapping entries for debugging purposes
377 */ 397 */
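
dma-debug entries now carry (pfn, offset) instead of a precomputed physical address, and the new phys_addr() helper rebuilds it on demand via page_to_phys(pfn_to_page(pfn)) + offset. On a flat memory model that is simply (pfn << PAGE_SHIFT) + offset; a quick userspace check of that arithmetic, with a PAGE_SHIFT of 12 assumed:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumption: 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* flat-memory stand-in for the kernel's phys_addr() helper above */
static uint64_t phys_addr(unsigned long pfn, size_t offset)
{
    return ((uint64_t)pfn << PAGE_SHIFT) + offset;
}

int main(void)
{
    uint64_t phys = 0x12345678abcULL;   /* arbitrary physical address */
    unsigned long pfn = phys >> PAGE_SHIFT;
    size_t offset = phys & (PAGE_SIZE - 1);

    /* splitting into (pfn, offset) and recombining is lossless */
    assert(phys_addr(pfn, offset) == phys);
    printf("pfn=%#lx offset=%#zx -> phys=%#llx\n",
           pfn, offset, (unsigned long long)phys_addr(pfn, offset));
    return 0;
}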
@@ -389,9 +409,9 @@ void debug_dma_dump_mappings(struct device *dev)
389 list_for_each_entry(entry, &bucket->list, list) { 409 list_for_each_entry(entry, &bucket->list, list) {
390 if (!dev || dev == entry->dev) { 410 if (!dev || dev == entry->dev) {
391 dev_info(entry->dev, 411 dev_info(entry->dev,
392 "%s idx %d P=%Lx D=%Lx L=%Lx %s %s\n", 412 "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n",
393 type2name[entry->type], idx, 413 type2name[entry->type], idx,
394 (unsigned long long)entry->paddr, 414 phys_addr(entry), entry->pfn,
395 entry->dev_addr, entry->size, 415 entry->dev_addr, entry->size,
396 dir2name[entry->direction], 416 dir2name[entry->direction],
397 maperr2str[entry->map_err_type]); 417 maperr2str[entry->map_err_type]);
@@ -404,6 +424,133 @@ void debug_dma_dump_mappings(struct device *dev)
404EXPORT_SYMBOL(debug_dma_dump_mappings); 424EXPORT_SYMBOL(debug_dma_dump_mappings);
405 425
406/* 426/*
427 * For each page mapped (initial page in the case of
428 * dma_alloc_coherent/dma_map_{single|page}, or each page in a
429 * scatterlist) insert into this tree using the pfn as the key. At
430 * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If
431 * the pfn already exists at insertion time add a tag as a reference
432 * count for the overlapping mappings. For now, the overlap tracking
433 * just ensures that 'unmaps' balance 'maps' before marking the pfn
434 * idle, but we should also be flagging overlaps as an API violation.
435 *
436 * Memory usage is mostly constrained by the maximum number of available
437 * dma-debug entries in that we need a free dma_debug_entry before
438 * inserting into the tree. In the case of dma_map_{single|page} and
439 * dma_alloc_coherent there is only one dma_debug_entry and one pfn to
440 * track per event. dma_map_sg(), on the other hand,
441 * consumes a single dma_debug_entry, but inserts 'nents' entries into
442 * the tree.
443 *
444 * At any time debug_dma_assert_idle() can be called to trigger a
445 * warning if the given page is in the active set.
446 */
447static RADIX_TREE(dma_active_pfn, GFP_NOWAIT);
448static DEFINE_SPINLOCK(radix_lock);
449#define ACTIVE_PFN_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
450
451static int active_pfn_read_overlap(unsigned long pfn)
452{
453 int overlap = 0, i;
454
455 for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
456 if (radix_tree_tag_get(&dma_active_pfn, pfn, i))
457 overlap |= 1 << i;
458 return overlap;
459}
460
461static int active_pfn_set_overlap(unsigned long pfn, int overlap)
462{
463 int i;
464
465 if (overlap > ACTIVE_PFN_MAX_OVERLAP || overlap < 0)
466 return 0;
467
468 for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
469 if (overlap & 1 << i)
470 radix_tree_tag_set(&dma_active_pfn, pfn, i);
471 else
472 radix_tree_tag_clear(&dma_active_pfn, pfn, i);
473
474 return overlap;
475}
476
477static void active_pfn_inc_overlap(unsigned long pfn)
478{
479 int overlap = active_pfn_read_overlap(pfn);
480
481 overlap = active_pfn_set_overlap(pfn, ++overlap);
482
483 /* If we overflowed the overlap counter then we're potentially
484 * leaking dma-mappings. Otherwise, if maps and unmaps are
485 * balanced then this overflow may cause false negatives in
486 * debug_dma_assert_idle() as the pfn may be marked idle
487 * prematurely.
488 */
489 WARN_ONCE(overlap == 0,
490 "DMA-API: exceeded %d overlapping mappings of pfn %lx\n",
491 ACTIVE_PFN_MAX_OVERLAP, pfn);
492}
493
494static int active_pfn_dec_overlap(unsigned long pfn)
495{
496 int overlap = active_pfn_read_overlap(pfn);
497
498 return active_pfn_set_overlap(pfn, --overlap);
499}
500
501static int active_pfn_insert(struct dma_debug_entry *entry)
502{
503 unsigned long flags;
504 int rc;
505
506 spin_lock_irqsave(&radix_lock, flags);
507 rc = radix_tree_insert(&dma_active_pfn, entry->pfn, entry);
508 if (rc == -EEXIST)
509 active_pfn_inc_overlap(entry->pfn);
510 spin_unlock_irqrestore(&radix_lock, flags);
511
512 return rc;
513}
514
515static void active_pfn_remove(struct dma_debug_entry *entry)
516{
517 unsigned long flags;
518
519 spin_lock_irqsave(&radix_lock, flags);
520 if (active_pfn_dec_overlap(entry->pfn) == 0)
521 radix_tree_delete(&dma_active_pfn, entry->pfn);
522 spin_unlock_irqrestore(&radix_lock, flags);
523}
524
525/**
526 * debug_dma_assert_idle() - assert that a page is not undergoing dma
527 * @page: page to lookup in the dma_active_pfn tree
528 *
529 * Place a call to this routine in cases where the cpu touching the page
530 * before the dma completes (page is dma_unmapped) will lead to data
531 * corruption.
532 */
533void debug_dma_assert_idle(struct page *page)
534{
535 unsigned long flags;
536 struct dma_debug_entry *entry;
537
538 if (!page)
539 return;
540
541 spin_lock_irqsave(&radix_lock, flags);
542 entry = radix_tree_lookup(&dma_active_pfn, page_to_pfn(page));
543 spin_unlock_irqrestore(&radix_lock, flags);
544
545 if (!entry)
546 return;
547
548 err_printk(entry->dev, entry,
549 "DMA-API: cpu touching an active dma mapped page "
550 "[pfn=0x%lx]\n", entry->pfn);
551}
552
553/*
407 * Wrapper function for adding an entry to the hash. 554 * Wrapper function for adding an entry to the hash.
408 * This function takes care of locking itself. 555 * This function takes care of locking itself.
409 */ 556 */
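
The per-pfn overlap count above is not stored anywhere as an integer: it is spread across the radix tree's per-entry tag bits, so RADIX_TREE_MAX_TAGS bits give a small counter (ACTIVE_PFN_MAX_OVERLAP = 2^tags - 1) without enlarging the tree. A userspace sketch of the same encoding on a single entry, with three tag bits assumed here to mirror RADIX_TREE_MAX_TAGS:

#include <stdio.h>

#define MAX_TAGS    3                   /* assumption mirroring RADIX_TREE_MAX_TAGS */
#define MAX_OVERLAP ((1 << MAX_TAGS) - 1)

/* One entry's tag bits, standing in for radix_tree_tag_{get,set,clear}(). */
static unsigned int tags;

static int read_overlap(void)
{
    int overlap = 0, i;

    for (i = MAX_TAGS - 1; i >= 0; i--)
        if (tags & (1u << i))
            overlap |= 1 << i;
    return overlap;
}

static int set_overlap(int overlap)
{
    int i;

    if (overlap > MAX_OVERLAP || overlap < 0)
        return 0;                       /* reject, as the kernel code does */
    for (i = MAX_TAGS - 1; i >= 0; i--)
        if (overlap & (1 << i))
            tags |= 1u << i;
        else
            tags &= ~(1u << i);
    return overlap;
}

int main(void)
{
    int i;

    /* several maps followed by the matching unmaps on one pfn */
    for (i = 0; i < MAX_OVERLAP; i++)
        printf("inc -> %d\n", set_overlap(read_overlap() + 1));
    printf("inc -> %d (overflow; the kernel version WARN_ONCEs here)\n",
           set_overlap(read_overlap() + 1));
    for (i = 0; i < MAX_OVERLAP; i++)
        printf("dec -> %d\n", set_overlap(read_overlap() - 1));
    return 0;
}

As in active_pfn_inc_overlap(), an increment past the maximum is simply rejected, which is why the kernel backs it with a WARN_ONCE about potentially leaked or prematurely idle mappings.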
@@ -411,10 +558,21 @@ static void add_dma_entry(struct dma_debug_entry *entry)
411{ 558{
412 struct hash_bucket *bucket; 559 struct hash_bucket *bucket;
413 unsigned long flags; 560 unsigned long flags;
561 int rc;
414 562
415 bucket = get_hash_bucket(entry, &flags); 563 bucket = get_hash_bucket(entry, &flags);
416 hash_bucket_add(bucket, entry); 564 hash_bucket_add(bucket, entry);
417 put_hash_bucket(bucket, &flags); 565 put_hash_bucket(bucket, &flags);
566
567 rc = active_pfn_insert(entry);
568 if (rc == -ENOMEM) {
569 pr_err("DMA-API: pfn tracking ENOMEM, dma-debug disabled\n");
570 global_disable = true;
571 }
572
573 /* TODO: report -EEXIST errors here as overlapping mappings are
574 * not supported by the DMA API
575 */
418} 576}
419 577
420static struct dma_debug_entry *__dma_entry_alloc(void) 578static struct dma_debug_entry *__dma_entry_alloc(void)
@@ -469,6 +627,8 @@ static void dma_entry_free(struct dma_debug_entry *entry)
469{ 627{
470 unsigned long flags; 628 unsigned long flags;
471 629
630 active_pfn_remove(entry);
631
472 /* 632 /*
473 * add to beginning of the list - this way the entries are 633 * add to beginning of the list - this way the entries are
474 * more likely cache hot when they are reallocated. 634 * more likely cache hot when they are reallocated.
@@ -895,15 +1055,15 @@ static void check_unmap(struct dma_debug_entry *ref)
895 ref->dev_addr, ref->size, 1055 ref->dev_addr, ref->size,
896 type2name[entry->type], type2name[ref->type]); 1056 type2name[entry->type], type2name[ref->type]);
897 } else if ((entry->type == dma_debug_coherent) && 1057 } else if ((entry->type == dma_debug_coherent) &&
898 (ref->paddr != entry->paddr)) { 1058 (phys_addr(ref) != phys_addr(entry))) {
899 err_printk(ref->dev, entry, "DMA-API: device driver frees " 1059 err_printk(ref->dev, entry, "DMA-API: device driver frees "
900 "DMA memory with different CPU address " 1060 "DMA memory with different CPU address "
901 "[device address=0x%016llx] [size=%llu bytes] " 1061 "[device address=0x%016llx] [size=%llu bytes] "
902 "[cpu alloc address=0x%016llx] " 1062 "[cpu alloc address=0x%016llx] "
903 "[cpu free address=0x%016llx]", 1063 "[cpu free address=0x%016llx]",
904 ref->dev_addr, ref->size, 1064 ref->dev_addr, ref->size,
905 (unsigned long long)entry->paddr, 1065 phys_addr(entry),
906 (unsigned long long)ref->paddr); 1066 phys_addr(ref));
907 } 1067 }
908 1068
909 if (ref->sg_call_ents && ref->type == dma_debug_sg && 1069 if (ref->sg_call_ents && ref->type == dma_debug_sg &&
@@ -1052,7 +1212,8 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
1052 1212
1053 entry->dev = dev; 1213 entry->dev = dev;
1054 entry->type = dma_debug_page; 1214 entry->type = dma_debug_page;
1055 entry->paddr = page_to_phys(page) + offset; 1215 entry->pfn = page_to_pfn(page);
1216 entry->offset = offset,
1056 entry->dev_addr = dma_addr; 1217 entry->dev_addr = dma_addr;
1057 entry->size = size; 1218 entry->size = size;
1058 entry->direction = direction; 1219 entry->direction = direction;
@@ -1148,7 +1309,8 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
1148 1309
1149 entry->type = dma_debug_sg; 1310 entry->type = dma_debug_sg;
1150 entry->dev = dev; 1311 entry->dev = dev;
1151 entry->paddr = sg_phys(s); 1312 entry->pfn = page_to_pfn(sg_page(s));
1313 entry->offset = s->offset,
1152 entry->size = sg_dma_len(s); 1314 entry->size = sg_dma_len(s);
1153 entry->dev_addr = sg_dma_address(s); 1315 entry->dev_addr = sg_dma_address(s);
1154 entry->direction = direction; 1316 entry->direction = direction;
@@ -1198,7 +1360,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
1198 struct dma_debug_entry ref = { 1360 struct dma_debug_entry ref = {
1199 .type = dma_debug_sg, 1361 .type = dma_debug_sg,
1200 .dev = dev, 1362 .dev = dev,
1201 .paddr = sg_phys(s), 1363 .pfn = page_to_pfn(sg_page(s)),
1364 .offset = s->offset,
1202 .dev_addr = sg_dma_address(s), 1365 .dev_addr = sg_dma_address(s),
1203 .size = sg_dma_len(s), 1366 .size = sg_dma_len(s),
1204 .direction = dir, 1367 .direction = dir,
@@ -1233,7 +1396,8 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
1233 1396
1234 entry->type = dma_debug_coherent; 1397 entry->type = dma_debug_coherent;
1235 entry->dev = dev; 1398 entry->dev = dev;
1236 entry->paddr = virt_to_phys(virt); 1399 entry->pfn = page_to_pfn(virt_to_page(virt));
1400 entry->offset = (size_t) virt & PAGE_MASK;
1237 entry->size = size; 1401 entry->size = size;
1238 entry->dev_addr = dma_addr; 1402 entry->dev_addr = dma_addr;
1239 entry->direction = DMA_BIDIRECTIONAL; 1403 entry->direction = DMA_BIDIRECTIONAL;
@@ -1248,7 +1412,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
1248 struct dma_debug_entry ref = { 1412 struct dma_debug_entry ref = {
1249 .type = dma_debug_coherent, 1413 .type = dma_debug_coherent,
1250 .dev = dev, 1414 .dev = dev,
1251 .paddr = virt_to_phys(virt), 1415 .pfn = page_to_pfn(virt_to_page(virt)),
1416 .offset = (size_t) virt & PAGE_MASK,
1252 .dev_addr = addr, 1417 .dev_addr = addr,
1253 .size = size, 1418 .size = size,
1254 .direction = DMA_BIDIRECTIONAL, 1419 .direction = DMA_BIDIRECTIONAL,
@@ -1356,7 +1521,8 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
1356 struct dma_debug_entry ref = { 1521 struct dma_debug_entry ref = {
1357 .type = dma_debug_sg, 1522 .type = dma_debug_sg,
1358 .dev = dev, 1523 .dev = dev,
1359 .paddr = sg_phys(s), 1524 .pfn = page_to_pfn(sg_page(s)),
1525 .offset = s->offset,
1360 .dev_addr = sg_dma_address(s), 1526 .dev_addr = sg_dma_address(s),
1361 .size = sg_dma_len(s), 1527 .size = sg_dma_len(s),
1362 .direction = direction, 1528 .direction = direction,
@@ -1388,7 +1554,8 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
1388 struct dma_debug_entry ref = { 1554 struct dma_debug_entry ref = {
1389 .type = dma_debug_sg, 1555 .type = dma_debug_sg,
1390 .dev = dev, 1556 .dev = dev,
1391 .paddr = sg_phys(s), 1557 .pfn = page_to_pfn(sg_page(s)),
1558 .offset = s->offset,
1392 .dev_addr = sg_dma_address(s), 1559 .dev_addr = sg_dma_address(s),
1393 .size = sg_dma_len(s), 1560 .size = sg_dma_len(s),
1394 .direction = direction, 1561 .direction = direction,
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 5847a4921b8e..09225796991a 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -17,9 +17,6 @@ void show_mem(unsigned int filter)
17 printk("Mem-Info:\n"); 17 printk("Mem-Info:\n");
18 show_free_areas(filter); 18 show_free_areas(filter);
19 19
20 if (filter & SHOW_MEM_FILTER_PAGE_COUNT)
21 return;
22
23 for_each_online_pgdat(pgdat) { 20 for_each_online_pgdat(pgdat) {
24 unsigned long flags; 21 unsigned long flags;
25 int zoneid; 22 int zoneid;
@@ -46,4 +43,7 @@ void show_mem(unsigned int filter)
46 printk("%lu pages in pagetable cache\n", 43 printk("%lu pages in pagetable cache\n",
47 quicklist_total_size()); 44 quicklist_total_size());
48#endif 45#endif
46#ifdef CONFIG_MEMORY_FAILURE
47 printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
48#endif
49} 49}
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index e4399fa65ad6..615f3de4b5ce 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -172,8 +172,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
172 /* 172 /*
173 * Get the overflow emergency buffer 173 * Get the overflow emergency buffer
174 */ 174 */
175 v_overflow_buffer = alloc_bootmem_low_pages_nopanic( 175 v_overflow_buffer = memblock_virt_alloc_nopanic(
176 PAGE_ALIGN(io_tlb_overflow)); 176 PAGE_ALIGN(io_tlb_overflow),
177 PAGE_SIZE);
177 if (!v_overflow_buffer) 178 if (!v_overflow_buffer)
178 return -ENOMEM; 179 return -ENOMEM;
179 180
@@ -184,11 +185,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
184 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE 185 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
185 * between io_tlb_start and io_tlb_end. 186 * between io_tlb_start and io_tlb_end.
186 */ 187 */
187 io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); 188 io_tlb_list = memblock_virt_alloc(
189 PAGE_ALIGN(io_tlb_nslabs * sizeof(int)),
190 PAGE_SIZE);
188 for (i = 0; i < io_tlb_nslabs; i++) 191 for (i = 0; i < io_tlb_nslabs; i++)
189 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); 192 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
190 io_tlb_index = 0; 193 io_tlb_index = 0;
191 io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); 194 io_tlb_orig_addr = memblock_virt_alloc(
195 PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)),
196 PAGE_SIZE);
192 197
193 if (verbose) 198 if (verbose)
194 swiotlb_print_info(); 199 swiotlb_print_info();
@@ -215,13 +220,13 @@ swiotlb_init(int verbose)
215 bytes = io_tlb_nslabs << IO_TLB_SHIFT; 220 bytes = io_tlb_nslabs << IO_TLB_SHIFT;
216 221
217 /* Get IO TLB memory from the low pages */ 222 /* Get IO TLB memory from the low pages */
218 vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes)); 223 vstart = memblock_virt_alloc_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
219 if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) 224 if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
220 return; 225 return;
221 226
222 if (io_tlb_start) 227 if (io_tlb_start)
223 free_bootmem(io_tlb_start, 228 memblock_free_early(io_tlb_start,
224 PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); 229 PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
225 pr_warn("Cannot allocate SWIOTLB buffer"); 230 pr_warn("Cannot allocate SWIOTLB buffer");
226 no_iotlb_memory = true; 231 no_iotlb_memory = true;
227} 232}
@@ -357,14 +362,14 @@ void __init swiotlb_free(void)
357 free_pages((unsigned long)phys_to_virt(io_tlb_start), 362 free_pages((unsigned long)phys_to_virt(io_tlb_start),
358 get_order(io_tlb_nslabs << IO_TLB_SHIFT)); 363 get_order(io_tlb_nslabs << IO_TLB_SHIFT));
359 } else { 364 } else {
360 free_bootmem_late(io_tlb_overflow_buffer, 365 memblock_free_late(io_tlb_overflow_buffer,
361 PAGE_ALIGN(io_tlb_overflow)); 366 PAGE_ALIGN(io_tlb_overflow));
362 free_bootmem_late(__pa(io_tlb_orig_addr), 367 memblock_free_late(__pa(io_tlb_orig_addr),
363 PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); 368 PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
364 free_bootmem_late(__pa(io_tlb_list), 369 memblock_free_late(__pa(io_tlb_list),
365 PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); 370 PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
366 free_bootmem_late(io_tlb_start, 371 memblock_free_late(io_tlb_start,
367 PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); 372 PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
368 } 373 }
369 io_tlb_nslabs = 0; 374 io_tlb_nslabs = 0;
370} 375}
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..3a91a2ea3d34 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
459 unsigned long flags; 459 unsigned long flags;
460 bool locked = false; 460 bool locked = false;
461 struct page *page = NULL, *valid_page = NULL; 461 struct page *page = NULL, *valid_page = NULL;
462 bool skipped_async_unsuitable = false;
462 463
463 /* 464 /*
464 * Ensure that there are not too many pages isolated from the LRU 465 * Ensure that there are not too many pages isolated from the LRU
@@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
534 if (!cc->sync && last_pageblock_nr != pageblock_nr && 535 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
535 !migrate_async_suitable(get_pageblock_migratetype(page))) { 536 !migrate_async_suitable(get_pageblock_migratetype(page))) {
536 cc->finished_update_migrate = true; 537 cc->finished_update_migrate = true;
538 skipped_async_unsuitable = true;
537 goto next_pageblock; 539 goto next_pageblock;
538 } 540 }
539 541
@@ -627,8 +629,13 @@ next_pageblock:
627 if (locked) 629 if (locked)
628 spin_unlock_irqrestore(&zone->lru_lock, flags); 630 spin_unlock_irqrestore(&zone->lru_lock, flags);
629 631
630 /* Update the pageblock-skip if the whole pageblock was scanned */ 632 /*
631 if (low_pfn == end_pfn) 633 * Update the pageblock-skip information and cached scanner pfn,
634 * if the whole pageblock was scanned without isolating any page.
635 * This is not done when pageblock was skipped due to being unsuitable
636 * for async compaction, so that eventual sync compaction can try.
637 */
638 if (low_pfn == end_pfn && !skipped_async_unsuitable)
632 update_pageblock_skip(cc, valid_page, nr_isolated, true); 639 update_pageblock_skip(cc, valid_page, nr_isolated, true);
633 640
634 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 641 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -660,7 +667,7 @@ static void isolate_freepages(struct zone *zone,
660 * is the end of the pageblock the migration scanner is using. 667 * is the end of the pageblock the migration scanner is using.
661 */ 668 */
662 pfn = cc->free_pfn; 669 pfn = cc->free_pfn;
663 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 670 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
664 671
665 /* 672 /*
666 * Take care that if the migration scanner is at the end of the zone 673 * Take care that if the migration scanner is at the end of the zone
@@ -676,7 +683,7 @@ static void isolate_freepages(struct zone *zone,
676 * pages on cc->migratepages. We stop searching if the migrate 683 * pages on cc->migratepages. We stop searching if the migrate
677 * and free page scanners meet or enough free pages are isolated. 684 * and free page scanners meet or enough free pages are isolated.
678 */ 685 */
679 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 686 for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
680 pfn -= pageblock_nr_pages) { 687 pfn -= pageblock_nr_pages) {
681 unsigned long isolated; 688 unsigned long isolated;
682 689
@@ -738,7 +745,14 @@ static void isolate_freepages(struct zone *zone,
738 /* split_free_page does not map the pages */ 745 /* split_free_page does not map the pages */
739 map_pages(freelist); 746 map_pages(freelist);
740 747
741 cc->free_pfn = high_pfn; 748 /*
749 * If we crossed the migrate scanner, we want to keep it that way
750 * so that compact_finished() may detect this
751 */
752 if (pfn < low_pfn)
753 cc->free_pfn = max(pfn, zone->zone_start_pfn);
754 else
755 cc->free_pfn = high_pfn;
742 cc->nr_freepages = nr_freepages; 756 cc->nr_freepages = nr_freepages;
743} 757}
744 758
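
The free scanner now walks down one pageblock at a time and is allowed to reach the pageblock just above the migration scanner: low_pfn is rounded up with ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages), the loop condition becomes >=, and the final clamp records any crossing so compact_finished() can see free_pfn <= migrate_pfn. A userspace walk-through of that arithmetic, assuming a 512-page pageblock:

#include <stdio.h>

#define ALIGN(x, a)          (((x) + (a) - 1) & ~((a) - 1))
#define pageblock_nr_pages   512UL   /* assumption: 2 MiB pageblocks, 4 KiB pages */

int main(void)
{
    unsigned long zone_start_pfn = 0x10000;
    unsigned long migrate_pfn = 0x10a37;    /* migration scanner, mid-pageblock */
    unsigned long free_pfn = 0x11400;       /* free scanner, pageblock aligned */
    unsigned long low_pfn = ALIGN(migrate_pfn + 1, pageblock_nr_pages);
    unsigned long pfn;

    printf("migrate scanner at %#lx, scanning free pages down to %#lx\n",
           migrate_pfn, low_pfn);

    for (pfn = free_pfn; pfn >= low_pfn; pfn -= pageblock_nr_pages)
        printf("  scan pageblock starting at %#lx\n", pfn);

    /* mimic the final clamp: if we stepped past low_pfn, keep the crossing
     * visible so compact_finished() sees free_pfn <= migrate_pfn */
    if (pfn < low_pfn)
        free_pfn = (pfn > zone_start_pfn) ? pfn : zone_start_pfn;
    printf("cached free_pfn ends at %#lx\n", free_pfn);
    return 0;
}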
@@ -837,6 +851,10 @@ static int compact_finished(struct zone *zone,
837 851
838 /* Compaction run completes if the migrate and free scanner meet */ 852 /* Compaction run completes if the migrate and free scanner meet */
839 if (cc->free_pfn <= cc->migrate_pfn) { 853 if (cc->free_pfn <= cc->migrate_pfn) {
854 /* Let the next compaction start anew. */
855 zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
856 zone->compact_cached_free_pfn = zone_end_pfn(zone);
857
840 /* 858 /*
841 * Mark that the PG_migrate_skip information should be cleared 859 * Mark that the PG_migrate_skip information should be cleared
842 * by kswapd when it goes to sleep. kswapd does not set the 860 * by kswapd when it goes to sleep. kswapd does not set the
@@ -947,6 +965,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
947 } 965 }
948 966
949 /* 967 /*
968 * Clear pageblock skip if there were failures recently and compaction
969 * is about to be retried after being deferred. kswapd does not do
970 * this reset as it'll reset the cached information when going to sleep.
971 */
972 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
973 __reset_isolation_suitable(zone);
974
975 /*
950 * Setup to move all movable pages to the end of the zone. Used cached 976 * Setup to move all movable pages to the end of the zone. Used cached
951 * information on where the scanners should start but check that it 977 * information on where the scanners should start but check that it
952 * is initialised by ensuring the values are within zone boundaries. 978 * is initialised by ensuring the values are within zone boundaries.
@@ -962,13 +988,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
962 zone->compact_cached_migrate_pfn = cc->migrate_pfn; 988 zone->compact_cached_migrate_pfn = cc->migrate_pfn;
963 } 989 }
964 990
965 /* 991 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
966 * Clear pageblock skip if there were failures recently and compaction
967 * is about to be retried after being deferred. kswapd does not do
968 * this reset as it'll reset the cached information when going to sleep.
969 */
970 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
971 __reset_isolation_suitable(zone);
972 992
973 migrate_prep_local(); 993 migrate_prep_local();
974 994
@@ -1003,7 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1003 if (err) { 1023 if (err) {
1004 putback_movable_pages(&cc->migratepages); 1024 putback_movable_pages(&cc->migratepages);
1005 cc->nr_migratepages = 0; 1025 cc->nr_migratepages = 0;
1006 if (err == -ENOMEM) { 1026 /*
1027 * migrate_pages() may return -ENOMEM when scanners meet
1028 * and we want compact_finished() to detect it
1029 */
1030 if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
1007 ret = COMPACT_PARTIAL; 1031 ret = COMPACT_PARTIAL;
1008 goto out; 1032 goto out;
1009 } 1033 }
@@ -1015,6 +1039,8 @@ out:
1015 cc->nr_freepages -= release_freepages(&cc->freepages); 1039 cc->nr_freepages -= release_freepages(&cc->freepages);
1016 VM_BUG_ON(cc->nr_freepages != 0); 1040 VM_BUG_ON(cc->nr_freepages != 0);
1017 1041
1042 trace_mm_compaction_end(ret);
1043
1018 return ret; 1044 return ret;
1019} 1045}
1020 1046
@@ -1120,12 +1146,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1120 compact_zone(zone, cc); 1146 compact_zone(zone, cc);
1121 1147
1122 if (cc->order > 0) { 1148 if (cc->order > 0) {
1123 int ok = zone_watermark_ok(zone, cc->order, 1149 if (zone_watermark_ok(zone, cc->order,
1124 low_wmark_pages(zone), 0, 0); 1150 low_wmark_pages(zone), 0, 0))
1125 if (ok && cc->order >= zone->compact_order_failed) 1151 compaction_defer_reset(zone, cc->order, false);
1126 zone->compact_order_failed = cc->order + 1;
1127 /* Currently async compaction is never deferred. */ 1152 /* Currently async compaction is never deferred. */
1128 else if (!ok && cc->sync) 1153 else if (cc->sync)
1129 defer_compaction(zone, cc->order); 1154 defer_compaction(zone, cc->order);
1130 } 1155 }
1131 1156
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..04306b9de90d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
690 */ 690 */
691int PageHuge(struct page *page) 691int PageHuge(struct page *page)
692{ 692{
693 compound_page_dtor *dtor;
694
695 if (!PageCompound(page)) 693 if (!PageCompound(page))
696 return 0; 694 return 0;
697 695
698 page = compound_head(page); 696 page = compound_head(page);
699 dtor = get_compound_page_dtor(page); 697 return get_compound_page_dtor(page) == free_huge_page;
700
701 return dtor == free_huge_page;
702} 698}
703EXPORT_SYMBOL_GPL(PageHuge); 699EXPORT_SYMBOL_GPL(PageHuge);
704 700
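
PageHuge() here (and PageHeadHuge() below) shrinks to a single comparison: a compound page is "huge" exactly when its destructor is free_huge_page, so the temporary dtor variable adds nothing. The same identify-by-destructor idea in a toy userspace form, with invented obj/free_* names:

#include <stdio.h>

struct obj {
    void (*dtor)(struct obj *);     /* stand-in for the compound page dtor */
};

static void free_huge_obj(struct obj *o)  { (void)o; }
static void free_plain_obj(struct obj *o) { (void)o; }

/* mirrors the simplified PageHuge(): no temporary, just compare and return */
static int obj_is_huge(const struct obj *o)
{
    return o->dtor == free_huge_obj;
}

int main(void)
{
    struct obj huge  = { .dtor = free_huge_obj };
    struct obj plain = { .dtor = free_plain_obj };

    printf("huge: %d, plain: %d\n", obj_is_huge(&huge), obj_is_huge(&plain));
    return 0;
}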
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
708 */ 704 */
709int PageHeadHuge(struct page *page_head) 705int PageHeadHuge(struct page *page_head)
710{ 706{
711 compound_page_dtor *dtor;
712
713 if (!PageHead(page_head)) 707 if (!PageHead(page_head))
714 return 0; 708 return 0;
715 709
716 dtor = get_compound_page_dtor(page_head); 710 return get_compound_page_dtor(page_head) == free_huge_page;
717
718 return dtor == free_huge_page;
719} 711}
720EXPORT_SYMBOL_GPL(PageHeadHuge);
721 712
722pgoff_t __basepage_index(struct page *page) 713pgoff_t __basepage_index(struct page *page)
723{ 714{
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1280 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 1271 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1281 void *addr; 1272 void *addr;
1282 1273
1283 addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 1274 addr = memblock_virt_alloc_try_nid_nopanic(
1284 huge_page_size(h), huge_page_size(h), 0); 1275 huge_page_size(h), huge_page_size(h),
1285 1276 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
1286 if (addr) { 1277 if (addr) {
1287 /* 1278 /*
1288 * Use the beginning of the huge page to store the 1279 * Use the beginning of the huge page to store the
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void)
1322 1313
1323#ifdef CONFIG_HIGHMEM 1314#ifdef CONFIG_HIGHMEM
1324 page = pfn_to_page(m->phys >> PAGE_SHIFT); 1315 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1325 free_bootmem_late((unsigned long)m, 1316 memblock_free_late(__pa(m),
1326 sizeof(struct huge_bootmem_page)); 1317 sizeof(struct huge_bootmem_page));
1327#else 1318#else
1328 page = virt_to_page(m); 1319 page = virt_to_page(m);
1329#endif 1320#endif
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2355 int cow; 2346 int cow;
2356 struct hstate *h = hstate_vma(vma); 2347 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2348 unsigned long sz = huge_page_size(h);
2349 unsigned long mmun_start; /* For mmu_notifiers */
2350 unsigned long mmun_end; /* For mmu_notifiers */
2351 int ret = 0;
2358 2352
2359 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2353 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2360 2354
2355 mmun_start = vma->vm_start;
2356 mmun_end = vma->vm_end;
2357 if (cow)
2358 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
2359
2361 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2360 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2362 spinlock_t *src_ptl, *dst_ptl; 2361 spinlock_t *src_ptl, *dst_ptl;
2363 src_pte = huge_pte_offset(src, addr); 2362 src_pte = huge_pte_offset(src, addr);
2364 if (!src_pte) 2363 if (!src_pte)
2365 continue; 2364 continue;
2366 dst_pte = huge_pte_alloc(dst, addr, sz); 2365 dst_pte = huge_pte_alloc(dst, addr, sz);
2367 if (!dst_pte) 2366 if (!dst_pte) {
2368 goto nomem; 2367 ret = -ENOMEM;
2368 break;
2369 }
2369 2370
2370 /* If the pagetables are shared don't copy or take references */ 2371 /* If the pagetables are shared don't copy or take references */
2371 if (dst_pte == src_pte) 2372 if (dst_pte == src_pte)
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2386 spin_unlock(src_ptl); 2387 spin_unlock(src_ptl);
2387 spin_unlock(dst_ptl); 2388 spin_unlock(dst_ptl);
2388 } 2389 }
2389 return 0;
2390 2390
2391nomem: 2391 if (cow)
2392 return -ENOMEM; 2392 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
2393
2394 return ret;
2393} 2395}
2394 2396
2395static int is_hugetlb_entry_migration(pte_t pte) 2397static int is_hugetlb_entry_migration(pte_t pte)
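
Together with the mmu_notifier_invalidate_range_start() added before the loop, the copy path now has a single exit: a failed huge_pte_alloc() records -ENOMEM and breaks instead of jumping to a nomem label, so the matching _range_end() call always runs. A compact sketch of that paired begin/end around a breakable loop, using hypothetical range_begin()/range_end()/step() stand-ins:

#include <stdio.h>

/* hypothetical stand-ins for mmu_notifier_invalidate_range_{start,end}() */
static void range_begin(void) { printf("begin\n"); }
static void range_end(void)   { printf("end\n"); }

/* hypothetical per-iteration work; fails on the third step */
static int step(int i)
{
    return (i == 2) ? -1 : 0;
}

static int copy_range(int n, int need_notify)
{
    int i, ret = 0;

    if (need_notify)
        range_begin();

    for (i = 0; i < n; i++) {
        if (step(i)) {
            ret = -1;       /* remember the error ... */
            break;          /* ... but fall through to the end call */
        }
        printf("copied %d\n", i);
    }

    if (need_notify)
        range_end();        /* always paired with range_begin() */
    return ret;
}

int main(void)
{
    return copy_range(5, 1) ? 1 : 0;
}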
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3079same_page: 3081same_page:
3080 if (pages) { 3082 if (pages) {
3081 pages[i] = mem_map_offset(page, pfn_offset); 3083 pages[i] = mem_map_offset(page, pfn_offset);
3082 get_page(pages[i]); 3084 get_page_foll(pages[i]);
3083 } 3085 }
3084 3086
3085 if (vmas) 3087 if (vmas)
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
55 return 0; 55 return 0;
56 56
57inject: 57inject:
58 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); 58 pr_info("Injecting memory failure at pfn %#lx\n", pfn);
59 return memory_failure(pfn, 18, MF_COUNT_INCREASED); 59 return memory_failure(pfn, 18, MF_COUNT_INCREASED);
60} 60}
61 61
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692a..a346ba120e42 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -47,11 +47,9 @@ static inline void __get_page_tail_foll(struct page *page,
47 * page_cache_get_speculative()) on tail pages. 47 * page_cache_get_speculative()) on tail pages.
48 */ 48 */
49 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); 49 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
50 VM_BUG_ON(atomic_read(&page->_count) != 0);
51 VM_BUG_ON(page_mapcount(page) < 0);
52 if (get_page_head) 50 if (get_page_head)
53 atomic_inc(&page->first_page->_count); 51 atomic_inc(&page->first_page->_count);
54 atomic_inc(&page->_mapcount); 52 get_huge_page_tail(page);
55} 53}
56 54
57/* 55/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff79dc95..3df141e5f3e0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page,
1891 return new_page; 1891 return new_page;
1892} 1892}
1893 1893
1894int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, 1894int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
1895 unsigned long *vm_flags)
1896{ 1895{
1897 struct stable_node *stable_node; 1896 struct stable_node *stable_node;
1898 struct rmap_item *rmap_item; 1897 struct rmap_item *rmap_item;
1899 unsigned int mapcount = page_mapcount(page); 1898 int ret = SWAP_AGAIN;
1900 int referenced = 0;
1901 int search_new_forks = 0; 1899 int search_new_forks = 0;
1902 1900
1903 VM_BUG_ON(!PageKsm(page)); 1901 VM_BUG_ON(!PageKsm(page));
1902
1903 /*
1904 * Rely on the page lock to protect against concurrent modifications
1905 * to that page's node of the stable tree.
1906 */
1904 VM_BUG_ON(!PageLocked(page)); 1907 VM_BUG_ON(!PageLocked(page));
1905 1908
1906 stable_node = page_stable_node(page); 1909 stable_node = page_stable_node(page);
1907 if (!stable_node) 1910 if (!stable_node)
1908 return 0; 1911 return ret;
1909again: 1912again:
1910 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { 1913 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1911 struct anon_vma *anon_vma = rmap_item->anon_vma; 1914 struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1928,113 +1931,16 @@ again:
1928 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1931 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1929 continue; 1932 continue;
1930 1933
1931 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 1934 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1932 continue;
1933
1934 referenced += page_referenced_one(page, vma,
1935 rmap_item->address, &mapcount, vm_flags);
1936 if (!search_new_forks || !mapcount)
1937 break;
1938 }
1939 anon_vma_unlock_read(anon_vma);
1940 if (!mapcount)
1941 goto out;
1942 }
1943 if (!search_new_forks++)
1944 goto again;
1945out:
1946 return referenced;
1947}
1948
1949int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1950{
1951 struct stable_node *stable_node;
1952 struct rmap_item *rmap_item;
1953 int ret = SWAP_AGAIN;
1954 int search_new_forks = 0;
1955
1956 VM_BUG_ON(!PageKsm(page));
1957 VM_BUG_ON(!PageLocked(page));
1958
1959 stable_node = page_stable_node(page);
1960 if (!stable_node)
1961 return SWAP_FAIL;
1962again:
1963 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1964 struct anon_vma *anon_vma = rmap_item->anon_vma;
1965 struct anon_vma_chain *vmac;
1966 struct vm_area_struct *vma;
1967
1968 anon_vma_lock_read(anon_vma);
1969 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1970 0, ULONG_MAX) {
1971 vma = vmac->vma;
1972 if (rmap_item->address < vma->vm_start ||
1973 rmap_item->address >= vma->vm_end)
1974 continue;
1975 /*
1976 * Initially we examine only the vma which covers this
1977 * rmap_item; but later, if there is still work to do,
1978 * we examine covering vmas in other mms: in case they
1979 * were forked from the original since ksmd passed.
1980 */
1981 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1982 continue; 1935 continue;
1983 1936
1984 ret = try_to_unmap_one(page, vma, 1937 ret = rwc->rmap_one(page, vma,
1985 rmap_item->address, flags); 1938 rmap_item->address, rwc->arg);
1986 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1939 if (ret != SWAP_AGAIN) {
1987 anon_vma_unlock_read(anon_vma); 1940 anon_vma_unlock_read(anon_vma);
1988 goto out; 1941 goto out;
1989 } 1942 }
1990 } 1943 if (rwc->done && rwc->done(page)) {
1991 anon_vma_unlock_read(anon_vma);
1992 }
1993 if (!search_new_forks++)
1994 goto again;
1995out:
1996 return ret;
1997}
1998
1999#ifdef CONFIG_MIGRATION
2000int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
2001 struct vm_area_struct *, unsigned long, void *), void *arg)
2002{
2003 struct stable_node *stable_node;
2004 struct rmap_item *rmap_item;
2005 int ret = SWAP_AGAIN;
2006 int search_new_forks = 0;
2007
2008 VM_BUG_ON(!PageKsm(page));
2009 VM_BUG_ON(!PageLocked(page));
2010
2011 stable_node = page_stable_node(page);
2012 if (!stable_node)
2013 return ret;
2014again:
2015 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2016 struct anon_vma *anon_vma = rmap_item->anon_vma;
2017 struct anon_vma_chain *vmac;
2018 struct vm_area_struct *vma;
2019
2020 anon_vma_lock_read(anon_vma);
2021 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2022 0, ULONG_MAX) {
2023 vma = vmac->vma;
2024 if (rmap_item->address < vma->vm_start ||
2025 rmap_item->address >= vma->vm_end)
2026 continue;
2027 /*
2028 * Initially we examine only the vma which covers this
2029 * rmap_item; but later, if there is still work to do,
2030 * we examine covering vmas in other mms: in case they
2031 * were forked from the original since ksmd passed.
2032 */
2033 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2034 continue;
2035
2036 ret = rmap_one(page, vma, rmap_item->address, arg);
2037 if (ret != SWAP_AGAIN) {
2038 anon_vma_unlock_read(anon_vma); 1944 anon_vma_unlock_read(anon_vma);
2039 goto out; 1945 goto out;
2040 } 1946 }
@@ -2047,6 +1953,7 @@ out:
2047 return ret; 1953 return ret;
2048} 1954}
2049 1955
1956#ifdef CONFIG_MIGRATION
2050void ksm_migrate_page(struct page *newpage, struct page *oldpage) 1957void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2051{ 1958{
2052 struct stable_node *stable_node; 1959 struct stable_node *stable_node;
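
page_referenced_ksm() and try_to_unmap_ksm() collapse into the single rmap_walk_ksm() above, which iterates the stable node's rmap_items once and defers policy to the caller through struct rmap_walk_control: rmap_one() does the per-vma work, invalid_vma() optionally filters vmas, and done() allows an early exit. A minimal userspace model of that callback-struct walk, with invented item/vma types and an example counting policy:

#include <stdio.h>

#define WALK_AGAIN 0
#define WALK_STOP  1

struct vma  { int id; int writable; };
struct item { struct vma *vma; unsigned long addr; };

/* mirrors the shape of struct rmap_walk_control used in the diff */
struct walk_control {
    void *arg;
    int (*walk_one)(struct item *, void *arg);      /* like rmap_one */
    int (*invalid_vma)(struct vma *, void *arg);    /* optional filter */
    int (*done)(void *arg);                         /* optional early exit */
};

static int walk_items(struct item *items, int n, struct walk_control *wc)
{
    int i, ret = WALK_AGAIN;

    for (i = 0; i < n; i++) {
        if (wc->invalid_vma && wc->invalid_vma(items[i].vma, wc->arg))
            continue;
        ret = wc->walk_one(&items[i], wc->arg);
        if (ret != WALK_AGAIN)
            return ret;
        if (wc->done && wc->done(wc->arg))
            return ret;
    }
    return ret;
}

/* example policy: count mappings in writable vmas, stop after two */
static int count_one(struct item *it, void *arg)
{
    int *count = arg;

    printf("visit vma %d at %#lx\n", it->vma->id, it->addr);
    (*count)++;
    return WALK_AGAIN;
}

static int skip_readonly(struct vma *vma, void *arg)
{
    (void)arg;
    return !vma->writable;
}

static int two_is_enough(void *arg)
{
    return *(int *)arg >= 2;
}

int main(void)
{
    struct vma v[3] = { {1, 1}, {2, 0}, {3, 1} };
    struct item it[3] = { {&v[0], 0x1000}, {&v[1], 0x2000}, {&v[2], 0x3000} };
    int count = 0;
    struct walk_control wc = {
        .arg = &count, .walk_one = count_one,
        .invalid_vma = skip_readonly, .done = two_is_enough,
    };

    walk_items(it, 3, &wc);
    printf("%d mappings visited\n", count);
    return 0;
}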
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb5558..1c2ef2c7edab 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22 22
23#include <asm-generic/sections.h> 23#include <asm-generic/sections.h>
24#include <linux/io.h>
25
26#include "internal.h"
24 27
25static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 28static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
26static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 29static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = {
39}; 42};
40 43
41int memblock_debug __initdata_memblock; 44int memblock_debug __initdata_memblock;
45#ifdef CONFIG_MOVABLE_NODE
46bool movable_node_enabled __initdata_memblock = false;
47#endif
42static int memblock_can_resize __initdata_memblock; 48static int memblock_can_resize __initdata_memblock;
43static int memblock_memory_in_slab __initdata_memblock = 0; 49static int memblock_memory_in_slab __initdata_memblock = 0;
44static int memblock_reserved_in_slab __initdata_memblock = 0; 50static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
91 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 97 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
92 * @size: size of free area to find 98 * @size: size of free area to find
93 * @align: alignment of free area to find 99 * @align: alignment of free area to find
94 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 100 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
95 * 101 *
96 * Utility called from memblock_find_in_range_node(), find free area bottom-up. 102 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
97 * 103 *
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
123 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 129 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
124 * @size: size of free area to find 130 * @size: size of free area to find
125 * @align: alignment of free area to find 131 * @align: alignment of free area to find
126 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 132 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
127 * 133 *
128 * Utility called from memblock_find_in_range_node(), find free area top-down. 134 * Utility called from memblock_find_in_range_node(), find free area top-down.
129 * 135 *
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
154 160
155/** 161/**
156 * memblock_find_in_range_node - find free area in given range and node 162 * memblock_find_in_range_node - find free area in given range and node
157 * @start: start of candidate range
158 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
159 * @size: size of free area to find 163 * @size: size of free area to find
160 * @align: alignment of free area to find 164 * @align: alignment of free area to find
161 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 165 * @start: start of candidate range
166 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
167 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
162 * 168 *
163 * Find @size free area aligned to @align in the specified range and node. 169 * Find @size free area aligned to @align in the specified range and node.
164 * 170 *
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
173 * RETURNS: 179 * RETURNS:
174 * Found address on success, 0 on failure. 180 * Found address on success, 0 on failure.
175 */ 181 */
176phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 182phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
177 phys_addr_t end, phys_addr_t size, 183 phys_addr_t align, phys_addr_t start,
178 phys_addr_t align, int nid) 184 phys_addr_t end, int nid)
179{ 185{
180 int ret; 186 int ret;
181 phys_addr_t kernel_end; 187 phys_addr_t kernel_end;
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
238 phys_addr_t end, phys_addr_t size, 244 phys_addr_t end, phys_addr_t size,
239 phys_addr_t align) 245 phys_addr_t align)
240{ 246{
241 return memblock_find_in_range_node(start, end, size, align, 247 return memblock_find_in_range_node(size, align, start, end,
242 MAX_NUMNODES); 248 NUMA_NO_NODE);
243} 249}
244 250
245static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 251static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
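
"Any node" callers now pass NUMA_NO_NODE instead of MAX_NUMNODES, and further down in this file __next_free_mem_range() keeps accepting the old constant behind a WARN_ONCE so existing callers get a one-time nag rather than breakage. The usual shape of such a deprecation shim, sketched in userspace with a hypothetical warn_once() helper:

#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES 64         /* assumption: legacy "any node" sentinel */
#define NUMA_NO_NODE (-1)

/* hypothetical userspace WARN_ONCE: print the message the first time only */
static bool warn_once(bool cond, const char *msg)
{
    static bool warned;

    if (cond && !warned) {
        warned = true;
        fprintf(stderr, "WARNING: %s\n", msg);
    }
    return cond;
}

static void next_free_range(int nid)
{
    if (warn_once(nid == MAX_NUMNODES,
                  "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead"))
        nid = NUMA_NO_NODE;

    printf("walking free ranges for nid=%d\n", nid);
}

int main(void)
{
    next_free_range(MAX_NUMNODES);  /* old-style caller: warned, then honoured */
    next_free_range(MAX_NUMNODES);  /* second call stays quiet */
    next_free_range(NUMA_NO_NODE);  /* new-style caller */
    return 0;
}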
@@ -255,6 +261,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
255 type->cnt = 1; 261 type->cnt = 1;
256 type->regions[0].base = 0; 262 type->regions[0].base = 0;
257 type->regions[0].size = 0; 263 type->regions[0].size = 0;
264 type->regions[0].flags = 0;
258 memblock_set_region_node(&type->regions[0], MAX_NUMNODES); 265 memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
259 } 266 }
260} 267}
@@ -265,6 +272,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
265 if (memblock.reserved.regions == memblock_reserved_init_regions) 272 if (memblock.reserved.regions == memblock_reserved_init_regions)
266 return 0; 273 return 0;
267 274
275 /*
276 * Don't allow nobootmem allocator to free reserved memory regions
277 * array if
278 * - CONFIG_DEBUG_FS is enabled;
279 * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled;
280 * - reserved memory regions array have been resized during boot.
281 * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved"
282 * will show garbage instead of state of memory reservations.
283 */
284 if (IS_ENABLED(CONFIG_DEBUG_FS) &&
285 !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
286 return 0;
287
268 *addr = __pa(memblock.reserved.regions); 288 *addr = __pa(memblock.reserved.regions);
269 289
270 return PAGE_ALIGN(sizeof(struct memblock_region) * 290 return PAGE_ALIGN(sizeof(struct memblock_region) *
@@ -405,7 +425,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
405 425
406 if (this->base + this->size != next->base || 426 if (this->base + this->size != next->base ||
407 memblock_get_region_node(this) != 427 memblock_get_region_node(this) !=
408 memblock_get_region_node(next)) { 428 memblock_get_region_node(next) ||
429 this->flags != next->flags) {
409 BUG_ON(this->base + this->size > next->base); 430 BUG_ON(this->base + this->size > next->base);
410 i++; 431 i++;
411 continue; 432 continue;
@@ -425,13 +446,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
425 * @base: base address of the new region 446 * @base: base address of the new region
426 * @size: size of the new region 447 * @size: size of the new region
427 * @nid: node id of the new region 448 * @nid: node id of the new region
449 * @flags: flags of the new region
428 * 450 *
429 * Insert new memblock region [@base,@base+@size) into @type at @idx. 451 * Insert new memblock region [@base,@base+@size) into @type at @idx.
430 * @type must already have extra room to accomodate the new region. 452 * @type must already have extra room to accomodate the new region.
431 */ 453 */
432static void __init_memblock memblock_insert_region(struct memblock_type *type, 454static void __init_memblock memblock_insert_region(struct memblock_type *type,
433 int idx, phys_addr_t base, 455 int idx, phys_addr_t base,
434 phys_addr_t size, int nid) 456 phys_addr_t size,
457 int nid, unsigned long flags)
435{ 458{
436 struct memblock_region *rgn = &type->regions[idx]; 459 struct memblock_region *rgn = &type->regions[idx];
437 460
@@ -439,6 +462,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
439 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); 462 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
440 rgn->base = base; 463 rgn->base = base;
441 rgn->size = size; 464 rgn->size = size;
465 rgn->flags = flags;
442 memblock_set_region_node(rgn, nid); 466 memblock_set_region_node(rgn, nid);
443 type->cnt++; 467 type->cnt++;
444 type->total_size += size; 468 type->total_size += size;
@@ -450,6 +474,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
450 * @base: base address of the new region 474 * @base: base address of the new region
451 * @size: size of the new region 475 * @size: size of the new region
452 * @nid: nid of the new region 476 * @nid: nid of the new region
477 * @flags: flags of the new region
453 * 478 *
454 * Add new memblock region [@base,@base+@size) into @type. The new region 479 * Add new memblock region [@base,@base+@size) into @type. The new region
455 * is allowed to overlap with existing ones - overlaps don't affect already 480 * is allowed to overlap with existing ones - overlaps don't affect already
@@ -460,7 +485,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
460 * 0 on success, -errno on failure. 485 * 0 on success, -errno on failure.
461 */ 486 */
462static int __init_memblock memblock_add_region(struct memblock_type *type, 487static int __init_memblock memblock_add_region(struct memblock_type *type,
463 phys_addr_t base, phys_addr_t size, int nid) 488 phys_addr_t base, phys_addr_t size,
489 int nid, unsigned long flags)
464{ 490{
465 bool insert = false; 491 bool insert = false;
466 phys_addr_t obase = base; 492 phys_addr_t obase = base;
@@ -475,6 +501,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
475 WARN_ON(type->cnt != 1 || type->total_size); 501 WARN_ON(type->cnt != 1 || type->total_size);
476 type->regions[0].base = base; 502 type->regions[0].base = base;
477 type->regions[0].size = size; 503 type->regions[0].size = size;
504 type->regions[0].flags = flags;
478 memblock_set_region_node(&type->regions[0], nid); 505 memblock_set_region_node(&type->regions[0], nid);
479 type->total_size = size; 506 type->total_size = size;
480 return 0; 507 return 0;
@@ -505,7 +532,8 @@ repeat:
505 nr_new++; 532 nr_new++;
506 if (insert) 533 if (insert)
507 memblock_insert_region(type, i++, base, 534 memblock_insert_region(type, i++, base,
508 rbase - base, nid); 535 rbase - base, nid,
536 flags);
509 } 537 }
510 /* area below @rend is dealt with, forget about it */ 538 /* area below @rend is dealt with, forget about it */
511 base = min(rend, end); 539 base = min(rend, end);
@@ -515,7 +543,8 @@ repeat:
515 if (base < end) { 543 if (base < end) {
516 nr_new++; 544 nr_new++;
517 if (insert) 545 if (insert)
518 memblock_insert_region(type, i, base, end - base, nid); 546 memblock_insert_region(type, i, base, end - base,
547 nid, flags);
519 } 548 }
520 549
521 /* 550 /*
@@ -537,12 +566,13 @@ repeat:
537int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, 566int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
538 int nid) 567 int nid)
539{ 568{
540 return memblock_add_region(&memblock.memory, base, size, nid); 569 return memblock_add_region(&memblock.memory, base, size, nid, 0);
541} 570}
542 571
543int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 572int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
544{ 573{
545 return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); 574 return memblock_add_region(&memblock.memory, base, size,
575 MAX_NUMNODES, 0);
546} 576}
547 577
548/** 578/**
@@ -597,7 +627,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
597 rgn->size -= base - rbase; 627 rgn->size -= base - rbase;
598 type->total_size -= base - rbase; 628 type->total_size -= base - rbase;
599 memblock_insert_region(type, i, rbase, base - rbase, 629 memblock_insert_region(type, i, rbase, base - rbase,
600 memblock_get_region_node(rgn)); 630 memblock_get_region_node(rgn),
631 rgn->flags);
601 } else if (rend > end) { 632 } else if (rend > end) {
602 /* 633 /*
603 * @rgn intersects from above. Split and redo the 634 * @rgn intersects from above. Split and redo the
@@ -607,7 +638,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
607 rgn->size -= end - rbase; 638 rgn->size -= end - rbase;
608 type->total_size -= end - rbase; 639 type->total_size -= end - rbase;
609 memblock_insert_region(type, i--, rbase, end - rbase, 640 memblock_insert_region(type, i--, rbase, end - rbase,
610 memblock_get_region_node(rgn)); 641 memblock_get_region_node(rgn),
642 rgn->flags);
611 } else { 643 } else {
612 /* @rgn is fully contained, record it */ 644 /* @rgn is fully contained, record it */
613 if (!*end_rgn) 645 if (!*end_rgn)
@@ -643,28 +675,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
643{ 675{
644 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", 676 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
645 (unsigned long long)base, 677 (unsigned long long)base,
646 (unsigned long long)base + size, 678 (unsigned long long)base + size - 1,
647 (void *)_RET_IP_); 679 (void *)_RET_IP_);
648 680
649 return __memblock_remove(&memblock.reserved, base, size); 681 return __memblock_remove(&memblock.reserved, base, size);
650} 682}
651 683
652int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 684static int __init_memblock memblock_reserve_region(phys_addr_t base,
685 phys_addr_t size,
686 int nid,
687 unsigned long flags)
653{ 688{
654 struct memblock_type *_rgn = &memblock.reserved; 689 struct memblock_type *_rgn = &memblock.reserved;
655 690
656 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", 691 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
657 (unsigned long long)base, 692 (unsigned long long)base,
658 (unsigned long long)base + size, 693 (unsigned long long)base + size - 1,
659 (void *)_RET_IP_); 694 flags, (void *)_RET_IP_);
660 695
661 return memblock_add_region(_rgn, base, size, MAX_NUMNODES); 696 return memblock_add_region(_rgn, base, size, nid, flags);
697}
698
699int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
700{
701 return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
702}
703
704/**
705 * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
706 * @base: the base phys addr of the region
707 * @size: the size of the region
708 *
709 * This function isolates the region [@base, @base + @size), and marks it with flag
710 * MEMBLOCK_HOTPLUG.
711 *
712 * Return 0 on success, -errno on failure.
713 */
714int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
715{
716 struct memblock_type *type = &memblock.memory;
717 int i, ret, start_rgn, end_rgn;
718
719 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
720 if (ret)
721 return ret;
722
723 for (i = start_rgn; i < end_rgn; i++)
724 memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
725
726 memblock_merge_regions(type);
727 return 0;
728}
729
730/**
731 * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
732 * @base: the base phys addr of the region
733 * @size: the size of the region
734 *
735 * This function isolates the region [@base, @base + @size), and clears flag
736 * MEMBLOCK_HOTPLUG for the isolated regions.
737 *
738 * Return 0 on success, -errno on failure.
739 */
740int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
741{
742 struct memblock_type *type = &memblock.memory;
743 int i, ret, start_rgn, end_rgn;
744
745 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
746 if (ret)
747 return ret;
748
749 for (i = start_rgn; i < end_rgn; i++)
750 memblock_clear_region_flags(&type->regions[i],
751 MEMBLOCK_HOTPLUG);
752
753 memblock_merge_regions(type);
754 return 0;
662} 755}
663 756
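A minimal usage sketch of the two helpers above (not part of this patch; the caller, base and size are hypothetical and error handling is trimmed):

	/* Sketch: an early-boot caller, e.g. firmware/SRAT parsing, tagging a range. */
	static int __init example_mark_hotpluggable(void)
	{
		phys_addr_t base = 0x100000000ULL;	/* hypothetical 4 GiB offset */
		phys_addr_t size = 0x40000000ULL;	/* hypothetical 1 GiB range  */

		memblock_add(base, size);		/* range must be in memblock.memory */

		if (memblock_mark_hotplug(base, size))	/* set MEMBLOCK_HOTPLUG on it */
			return -EINVAL;

		memblock_clear_hotplug(base, size);	/* ...and the flag can be dropped again */
		return 0;
	}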
664/** 757/**
665 * __next_free_mem_range - next function for for_each_free_mem_range() 758 * __next_free_mem_range - next function for for_each_free_mem_range()
666 * @idx: pointer to u64 loop variable 759 * @idx: pointer to u64 loop variable
667 * @nid: node selector, %MAX_NUMNODES for all nodes 760 * @nid: node selector, %NUMA_NO_NODE for all nodes
668 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 761 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
669 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 762 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
670 * @out_nid: ptr to int for nid of the range, can be %NULL 763 * @out_nid: ptr to int for nid of the range, can be %NULL
@@ -693,13 +786,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
693 int mi = *idx & 0xffffffff; 786 int mi = *idx & 0xffffffff;
694 int ri = *idx >> 32; 787 int ri = *idx >> 32;
695 788
789 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
790 nid = NUMA_NO_NODE;
791
696 for ( ; mi < mem->cnt; mi++) { 792 for ( ; mi < mem->cnt; mi++) {
697 struct memblock_region *m = &mem->regions[mi]; 793 struct memblock_region *m = &mem->regions[mi];
698 phys_addr_t m_start = m->base; 794 phys_addr_t m_start = m->base;
699 phys_addr_t m_end = m->base + m->size; 795 phys_addr_t m_end = m->base + m->size;
700 796
701 /* only memory regions are associated with nodes, check it */ 797 /* only memory regions are associated with nodes, check it */
702 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 798 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
703 continue; 799 continue;
704 800
705 /* scan areas before each reservation for intersection */ 801 /* scan areas before each reservation for intersection */
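For reference, a short sketch (not from the patch) of how this iterator is normally driven; NUMA_NO_NODE walks every node, while passing MAX_NUMNODES now only earns the deprecation warning shown above:

	/* Sketch: sum all free boot memory across nodes. */
	static phys_addr_t __init example_count_free(void)
	{
		phys_addr_t start, end, total = 0;
		u64 i;

		for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
			total += end - start;

		return total;
	}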
@@ -740,12 +836,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
740/** 836/**
741 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 837 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
742 * @idx: pointer to u64 loop variable 838 * @idx: pointer to u64 loop variable
743 * @nid: node selector, %MAX_NUMNODES for all nodes 839 * @nid: node selector, %NUMA_NO_NODE for all nodes
744 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 840 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
745 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 841 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
746 * @out_nid: ptr to int for nid of the range, can be %NULL 842 * @out_nid: ptr to int for nid of the range, can be %NULL
747 * 843 *
748 * Reverse of __next_free_mem_range(). 844 * Reverse of __next_free_mem_range().
845 *
846 * The Linux kernel cannot migrate pages used by itself. Memory hotplug users won't
847 * be able to hot-remove hotpluggable memory used by the kernel. So this
848 * function skips hotpluggable regions if needed when allocating memory for the
849 * kernel.
749 */ 850 */
750void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, 851void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
751 phys_addr_t *out_start, 852 phys_addr_t *out_start,
@@ -756,6 +857,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
756 int mi = *idx & 0xffffffff; 857 int mi = *idx & 0xffffffff;
757 int ri = *idx >> 32; 858 int ri = *idx >> 32;
758 859
860 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
861 nid = NUMA_NO_NODE;
862
759 if (*idx == (u64)ULLONG_MAX) { 863 if (*idx == (u64)ULLONG_MAX) {
760 mi = mem->cnt - 1; 864 mi = mem->cnt - 1;
761 ri = rsv->cnt; 865 ri = rsv->cnt;
@@ -767,7 +871,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
767 phys_addr_t m_end = m->base + m->size; 871 phys_addr_t m_end = m->base + m->size;
768 872
769 /* only memory regions are associated with nodes, check it */ 873 /* only memory regions are associated with nodes, check it */
770 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 874 if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
875 continue;
876
877 /* skip hotpluggable memory regions if needed */
878 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
771 continue; 879 continue;
772 880
773 /* scan areas before each reservation for intersection */ 881 /* scan areas before each reservation for intersection */
@@ -837,18 +945,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
837 * memblock_set_node - set node ID on memblock regions 945 * memblock_set_node - set node ID on memblock regions
838 * @base: base of area to set node ID for 946 * @base: base of area to set node ID for
839 * @size: size of area to set node ID for 947 * @size: size of area to set node ID for
948 * @type: memblock type to set node ID for
840 * @nid: node ID to set 949 * @nid: node ID to set
841 * 950 *
842 * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. 951 * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
843 * Regions which cross the area boundaries are split as necessary. 952 * Regions which cross the area boundaries are split as necessary.
844 * 953 *
845 * RETURNS: 954 * RETURNS:
846 * 0 on success, -errno on failure. 955 * 0 on success, -errno on failure.
847 */ 956 */
848int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, 957int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
849 int nid) 958 struct memblock_type *type, int nid)
850{ 959{
851 struct memblock_type *type = &memblock.memory;
852 int start_rgn, end_rgn; 960 int start_rgn, end_rgn;
853 int i, ret; 961 int i, ret;
854 962
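A sketch of a caller under the widened signature (not from the patch); with @type explicit, node IDs can now be set on memblock.reserved as well as memblock.memory:

	/* Sketch: arch NUMA setup associating a range with a node. */
	static void __init example_set_node(phys_addr_t base, phys_addr_t size, int nid)
	{
		memblock_set_node(base, size, &memblock.memory, nid);
		memblock_set_node(base, size, &memblock.reserved, nid);
	}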
@@ -870,13 +978,13 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
870{ 978{
871 phys_addr_t found; 979 phys_addr_t found;
872 980
873 if (WARN_ON(!align)) 981 if (!align)
874 align = __alignof__(long long); 982 align = SMP_CACHE_BYTES;
875 983
876 /* align @size to avoid excessive fragmentation on reserved array */ 984 /* align @size to avoid excessive fragmentation on reserved array */
877 size = round_up(size, align); 985 size = round_up(size, align);
878 986
879 found = memblock_find_in_range_node(0, max_addr, size, align, nid); 987 found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
880 if (found && !memblock_reserve(found, size)) 988 if (found && !memblock_reserve(found, size))
881 return found; 989 return found;
882 990
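As the hunk above shows, memblock_find_in_range_node() now takes (size, align, start, end, nid). A hedged sketch of a direct caller (not from the patch; the wrapper name is illustrative):

	/* Sketch: find and pin a block below max_addr, preferring node nid. */
	static phys_addr_t __init example_find_and_reserve(phys_addr_t size,
							   phys_addr_t max_addr, int nid)
	{
		phys_addr_t found;

		found = memblock_find_in_range_node(size, SMP_CACHE_BYTES,
						    0, max_addr, nid);
		if (found)
			memblock_reserve(found, size);
		return found;
	}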
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
890 998
891phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 999phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
892{ 1000{
893 return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); 1001 return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
894} 1002}
895 1003
896phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 1004phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
920 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 1028 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
921} 1029}
922 1030
1031/**
1032 * memblock_virt_alloc_internal - allocate boot memory block
1033 * @size: size of memory block to be allocated in bytes
1034 * @align: alignment of the region and block's size
1035 * @min_addr: the lower bound of the memory region to allocate (phys address)
1036 * @max_addr: the upper bound of the memory region to allocate (phys address)
1037 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1038 *
1039 * The @min_addr limit is dropped if it cannot be satisfied and the allocation
1040 * will fall back to memory below @min_addr. Also, allocation may fall back
1041 * to any node in the system if the specified node cannot
1042 * hold the requested memory.
1043 *
1044 * The allocation is performed from memory region limited by
1045 * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
1046 *
1047 * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
1048 *
1049 * The phys address of allocated boot memory block is converted to virtual and
1050 * allocated memory is reset to 0.
1051 *
1052 * In addition, the function sets min_count to 0 using kmemleak_alloc for the
1053 * allocated boot memory block, so that it is never reported as a leak.
1054 *
1055 * RETURNS:
1056 * Virtual address of allocated memory block on success, NULL on failure.
1057 */
1058static void * __init memblock_virt_alloc_internal(
1059 phys_addr_t size, phys_addr_t align,
1060 phys_addr_t min_addr, phys_addr_t max_addr,
1061 int nid)
1062{
1063 phys_addr_t alloc;
1064 void *ptr;
1065
1066 if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
1067 nid = NUMA_NO_NODE;
1068
1069 /*
1070 * Detect any accidental use of these APIs after slab is ready, as at
1071 * this moment memblock may be deinitialized already and its
1072 * internal data may be destroyed (after execution of free_all_bootmem)
1073 */
1074 if (WARN_ON_ONCE(slab_is_available()))
1075 return kzalloc_node(size, GFP_NOWAIT, nid);
1076
1077 if (!align)
1078 align = SMP_CACHE_BYTES;
1079
1080 /* align @size to avoid excessive fragmentation on reserved array */
1081 size = round_up(size, align);
1082
1083again:
1084 alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
1085 nid);
1086 if (alloc)
1087 goto done;
1088
1089 if (nid != NUMA_NO_NODE) {
1090 alloc = memblock_find_in_range_node(size, align, min_addr,
1091 max_addr, NUMA_NO_NODE);
1092 if (alloc)
1093 goto done;
1094 }
1095
1096 if (min_addr) {
1097 min_addr = 0;
1098 goto again;
1099 } else {
1100 goto error;
1101 }
1102
1103done:
1104 memblock_reserve(alloc, size);
1105 ptr = phys_to_virt(alloc);
1106 memset(ptr, 0, size);
1107
1108 /*
1109 * The min_count is set to 0 so that bootmem allocated blocks
1110 * are never reported as leaks. This is because many of these blocks
1111 * are only referred via the physical address which is not
1112 * looked up by kmemleak.
1113 */
1114 kmemleak_alloc(ptr, size, 0, 0);
1115
1116 return ptr;
1117
1118error:
1119 return NULL;
1120}
1121
1122/**
1123 * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
1124 * @size: size of memory block to be allocated in bytes
1125 * @align: alignment of the region and block's size
1126 * @min_addr: the lower bound of the memory region from where the allocation
1127 * is preferred (phys address)
1128 * @max_addr: the upper bound of the memory region from where the allocation
1129 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1130 * allocate only from memory limited by memblock.current_limit value
1131 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1132 *
1133 * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
1134 * additional debug information (including caller info), if enabled.
1135 *
1136 * RETURNS:
1137 * Virtual address of allocated memory block on success, NULL on failure.
1138 */
1139void * __init memblock_virt_alloc_try_nid_nopanic(
1140 phys_addr_t size, phys_addr_t align,
1141 phys_addr_t min_addr, phys_addr_t max_addr,
1142 int nid)
1143{
1144 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1145 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1146 (u64)max_addr, (void *)_RET_IP_);
1147 return memblock_virt_alloc_internal(size, align, min_addr,
1148 max_addr, nid);
1149}
1150
1151/**
1152 * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
1153 * @size: size of memory block to be allocated in bytes
1154 * @align: alignment of the region and block's size
1155 * @min_addr: the lower bound of the memory region from where the allocation
1156 * is preferred (phys address)
1157 * @max_addr: the upper bound of the memory region from where the allocation
1158 * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
1159 * allocate only from memory limited by memblock.current_limit value
1160 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
1161 *
1162 * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
1163 * which provides debug information (including caller info), if enabled,
1164 * and panics if the request cannot be satisfied.
1165 *
1166 * RETURNS:
1167 * Virtual address of allocated memory block on success, NULL on failure.
1168 */
1169void * __init memblock_virt_alloc_try_nid(
1170 phys_addr_t size, phys_addr_t align,
1171 phys_addr_t min_addr, phys_addr_t max_addr,
1172 int nid)
1173{
1174 void *ptr;
1175
1176 memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
1177 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1178 (u64)max_addr, (void *)_RET_IP_);
1179 ptr = memblock_virt_alloc_internal(size, align,
1180 min_addr, max_addr, nid);
1181 if (ptr)
1182 return ptr;
1183
1184 panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
1185 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
1186 (u64)max_addr);
1187 return NULL;
1188}
1189
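A minimal usage sketch of the new virtual-address allocator (not from the patch; the size and node are placeholders):

	/* Sketch: allocate a zeroed, node-local boot table; panics on failure. */
	static void * __init example_early_table(phys_addr_t bytes, int nid)
	{
		return memblock_virt_alloc_try_nid(bytes, SMP_CACHE_BYTES,
						   0, BOOTMEM_ALLOC_ACCESSIBLE, nid);
	}

	/* The _nopanic variant above returns NULL instead, so its callers check the result. */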
1190/**
1191 * __memblock_free_early - free boot memory block
1192 * @base: phys starting address of the boot memory block
1193 * @size: size of the boot memory block in bytes
1194 *
1195 * Free a boot memory block previously allocated by the memblock_virt_alloc_xx() API.
1196 * The freed memory will not be released to the buddy allocator.
1197 */
1198void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
1199{
1200 memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
1201 __func__, (u64)base, (u64)base + size - 1,
1202 (void *)_RET_IP_);
1203 kmemleak_free_part(__va(base), size);
1204 __memblock_remove(&memblock.reserved, base, size);
1205}
1206
1207/*
1208 * __memblock_free_late - free bootmem block pages directly to buddy allocator
1209 * @addr: phys starting address of the boot memory block
1210 * @size: size of the boot memory block in bytes
1211 *
1212 * This is only useful when the bootmem allocator has already been torn
1213 * down, but we are still initializing the system. Pages are released directly
1214 * to the buddy allocator, no bootmem metadata is updated because it is gone.
1215 */
1216void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
1217{
1218 u64 cursor, end;
1219
1220 memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
1221 __func__, (u64)base, (u64)base + size - 1,
1222 (void *)_RET_IP_);
1223 kmemleak_free_part(__va(base), size);
1224 cursor = PFN_UP(base);
1225 end = PFN_DOWN(base + size);
1226
1227 for (; cursor < end; cursor++) {
1228 __free_pages_bootmem(pfn_to_page(cursor), 0);
1229 totalram_pages++;
1230 }
1231}
923 1232
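And the matching teardown, sketched (not from the patch); which call applies depends on whether free_all_bootmem() has already handed memory to the buddy allocator, a decision the caller must make:

	/* Sketch: releasing a block obtained from the memblock_virt_alloc_* API. */
	static void example_release(phys_addr_t base, phys_addr_t size, bool late)
	{
		if (late)
			__memblock_free_late(base, size);	/* pages go to the buddy allocator */
		else
			__memblock_free_early(base, size);	/* range simply returns to memblock */
	}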
924/* 1233/*
925 * Remaining API functions 1234 * Remaining API functions
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
1101static void __init_memblock memblock_dump(struct memblock_type *type, char *name) 1410static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
1102{ 1411{
1103 unsigned long long base, size; 1412 unsigned long long base, size;
1413 unsigned long flags;
1104 int i; 1414 int i;
1105 1415
1106 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); 1416 pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
1111 1421
1112 base = rgn->base; 1422 base = rgn->base;
1113 size = rgn->size; 1423 size = rgn->size;
1424 flags = rgn->flags;
1114#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 1425#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1115 if (memblock_get_region_node(rgn) != MAX_NUMNODES) 1426 if (memblock_get_region_node(rgn) != MAX_NUMNODES)
1116 snprintf(nid_buf, sizeof(nid_buf), " on node %d", 1427 snprintf(nid_buf, sizeof(nid_buf), " on node %d",
1117 memblock_get_region_node(rgn)); 1428 memblock_get_region_node(rgn));
1118#endif 1429#endif
1119 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", 1430 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
1120 name, i, base, base + size - 1, size, nid_buf); 1431 name, i, base, base + size - 1, size, nid_buf, flags);
1121 } 1432 }
1122} 1433}
1123 1434
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7caff36180cd..67dd2a881433 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1688,13 +1688,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1688 */ 1688 */
1689void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1689void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1690{ 1690{
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1693 /* 1691 /*
1694 * Need a buffer in BSS, can't rely on allocations. The code relies 1692 * protects memcg_name and makes sure that parallel ooms do not
1695 * on the assumption that OOM is serialized for memory controller. 1693 * interleave
1696 * If this assumption is broken, revisit this code.
1697 */ 1694 */
1695 static DEFINE_SPINLOCK(oom_info_lock);
1696 struct cgroup *task_cgrp;
1697 struct cgroup *mem_cgrp;
1698 static char memcg_name[PATH_MAX]; 1698 static char memcg_name[PATH_MAX];
1699 int ret; 1699 int ret;
1700 struct mem_cgroup *iter; 1700 struct mem_cgroup *iter;
@@ -1703,6 +1703,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1703 if (!p) 1703 if (!p)
1704 return; 1704 return;
1705 1705
1706 spin_lock(&oom_info_lock);
1706 rcu_read_lock(); 1707 rcu_read_lock();
1707 1708
1708 mem_cgrp = memcg->css.cgroup; 1709 mem_cgrp = memcg->css.cgroup;
@@ -1771,6 +1772,7 @@ done:
1771 1772
1772 pr_cont("\n"); 1773 pr_cont("\n");
1773 } 1774 }
1775 spin_unlock(&oom_info_lock);
1774} 1776}
1775 1777
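The hunk above is an instance of a common pattern: a static lock guarding a static scratch buffer so concurrent reports cannot interleave. A generic sketch (names here are illustrative, not kernel API):

	static void example_report(const char *what)
	{
		static DEFINE_SPINLOCK(report_lock);
		static char buf[128];

		spin_lock(&report_lock);
		snprintf(buf, sizeof(buf), "report: %s", what);
		pr_info("%s\n", buf);
		spin_unlock(&report_lock);
	}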
1776/* 1778/*
@@ -3000,7 +3002,8 @@ static DEFINE_MUTEX(set_limit_mutex);
3000static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 3002static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
3001{ 3003{
3002 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 3004 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
3003 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 3005 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) ==
3006 KMEM_ACCOUNTED_MASK;
3004} 3007}
3005 3008
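Why the "== KMEM_ACCOUNTED_MASK" comparison matters, in a self-contained sketch (the flag values below are made up, not the real KMEM_ACCOUNTED_* bits): with a multi-bit mask, "flags & mask" is true as soon as any one bit is set, while the full comparison requires every bit.

	#define EX_ACTIVE	(1 << 0)	/* illustrative bit */
	#define EX_ACTIVATED	(1 << 1)	/* illustrative bit */
	#define EX_MASK		(EX_ACTIVE | EX_ACTIVATED)

	static bool example_all_bits(unsigned long flags)
	{
		/* False for EX_ACTIVE alone; "flags & EX_MASK" would already be true. */
		return (flags & EX_MASK) == EX_MASK;
	}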
3006/* 3009/*
@@ -3126,7 +3129,7 @@ int memcg_cache_id(struct mem_cgroup *memcg)
3126 * But when we create a new cache, we can call this as well if its parent 3129 * But when we create a new cache, we can call this as well if its parent
3127 * is kmem-limited. That will have to hold set_limit_mutex as well. 3130 * is kmem-limited. That will have to hold set_limit_mutex as well.
3128 */ 3131 */
3129int memcg_update_cache_sizes(struct mem_cgroup *memcg) 3132static int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3130{ 3133{
3131 int num, ret; 3134 int num, ret;
3132 3135
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fabe55046c1d..b25ed321e667 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
611} 611}
612 612
613/* 613/*
614 * Dirty cache page page 614 * Dirty pagecache page
615 * Issues: when the error hit a hole page the error is not properly 615 * Issues: when the error hit a hole page the error is not properly
616 * propagated. 616 * propagated.
617 */ 617 */
@@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags)
1585 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1585 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1586 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1586 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1587 if (ret) { 1587 if (ret) {
1588 putback_lru_pages(&pagelist); 1588 if (!list_empty(&pagelist)) {
1589 list_del(&page->lru);
1590 dec_zone_page_state(page, NR_ISOLATED_ANON +
1591 page_is_file_cache(page));
1592 putback_lru_page(page);
1593 }
1594
1589 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1595 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1590 pfn, ret, page->flags); 1596 pfn, ret, page->flags);
1591 if (ret > 0) 1597 if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9e57d2..86487dfa5e59 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,6 +59,7 @@
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h> 60#include <linux/migrate.h>
61#include <linux/string.h> 61#include <linux/string.h>
62#include <linux/dma-debug.h>
62 63
63#include <asm/io.h> 64#include <asm/io.h>
64#include <asm/pgalloc.h> 65#include <asm/pgalloc.h>
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2559 2560
2560static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2561static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2561{ 2562{
2563 debug_dma_assert_idle(src);
2564
2562 /* 2565 /*
2563 * If the source page was a PFN mapping, we don't have 2566 * If the source page was a PFN mapping, we don't have
2564 * a "struct page" for it. We do a best-effort copy by 2567 * a "struct page" for it. We do a best-effort copy by
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src,
4272#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4275#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
4273 4276
4274#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS 4277#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4278
4279static struct kmem_cache *page_ptl_cachep;
4280
4281void __init ptlock_cache_init(void)
4282{
4283 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4284 SLAB_PANIC, NULL);
4285}
4286
4275bool ptlock_alloc(struct page *page) 4287bool ptlock_alloc(struct page *page)
4276{ 4288{
4277 spinlock_t *ptl; 4289 spinlock_t *ptl;
4278 4290
4279 ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); 4291 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4280 if (!ptl) 4292 if (!ptl)
4281 return false; 4293 return false;
4282 page->ptl = ptl; 4294 page->ptl = ptl;
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page)
4285 4297
4286void ptlock_free(struct page *page) 4298void ptlock_free(struct page *page)
4287{ 4299{
4288 kfree(page->ptl); 4300 kmem_cache_free(page_ptl_cachep, page->ptl);
4289} 4301}
4290#endif 4302#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235502db..cc2ab37220b7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
9#include <linux/swap.h> 9#include <linux/swap.h>
10#include <linux/interrupt.h> 10#include <linux/interrupt.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/bootmem.h>
13#include <linux/compiler.h> 12#include <linux/compiler.h>
14#include <linux/export.h> 13#include <linux/export.h>
15#include <linux/pagevec.h> 14#include <linux/pagevec.h>
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
269} 268}
270 269
271/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 270/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
272 * alloc_bootmem_node_nopanic() */ 271 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
273static int __ref ensure_zone_is_initialized(struct zone *zone, 272static int __ref ensure_zone_is_initialized(struct zone *zone,
274 unsigned long start_pfn, unsigned long num_pages) 273 unsigned long start_pfn, unsigned long num_pages)
275{ 274{
@@ -1446,6 +1445,7 @@ static int __init cmdline_parse_movable_node(char *p)
1446 * the kernel away from hotpluggable memory. 1445 * the kernel away from hotpluggable memory.
1447 */ 1446 */
1448 memblock_set_bottom_up(true); 1447 memblock_set_bottom_up(true);
1448 movable_node_enabled = true;
1449#else 1449#else
1450 pr_warn("movable_node option not supported\n"); 1450 pr_warn("movable_node option not supported\n");
1451#endif 1451#endif
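For context (not part of the patch): movable_node_enabled is the flag the memblock allocator consults in the hunks above to skip hotpluggable ranges. It is switched on from the kernel command line, for example (the other parameters are placeholders):

	vmlinuz root=/dev/sda1 ... movable_node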
diff --git a/mm/migrate.c b/mm/migrate.c
index 9194375b2307..a8025befc323 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -72,28 +72,12 @@ int migrate_prep_local(void)
72} 72}
73 73
74/* 74/*
75 * Add isolated pages on the list back to the LRU under page lock
76 * to avoid leaking evictable pages back onto unevictable list.
77 */
78void putback_lru_pages(struct list_head *l)
79{
80 struct page *page;
81 struct page *page2;
82
83 list_for_each_entry_safe(page, page2, l, lru) {
84 list_del(&page->lru);
85 dec_zone_page_state(page, NR_ISOLATED_ANON +
86 page_is_file_cache(page));
87 putback_lru_page(page);
88 }
89}
90
91/*
92 * Put previously isolated pages back onto the appropriate lists 75 * Put previously isolated pages back onto the appropriate lists
93 * from where they were once taken off for compaction/migration. 76 * from where they were once taken off for compaction/migration.
94 * 77 *
95 * This function shall be used instead of putback_lru_pages(), 78 * This function shall be used whenever the isolated pageset has been
96 * whenever the isolated pageset has been built by isolate_migratepages_range() 79 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
80 * and isolate_huge_page().
97 */ 81 */
98void putback_movable_pages(struct list_head *l) 82void putback_movable_pages(struct list_head *l)
99{ 83{
@@ -199,7 +183,12 @@ out:
199 */ 183 */
200static void remove_migration_ptes(struct page *old, struct page *new) 184static void remove_migration_ptes(struct page *old, struct page *new)
201{ 185{
202 rmap_walk(new, remove_migration_pte, old); 186 struct rmap_walk_control rwc = {
187 .rmap_one = remove_migration_pte,
188 .arg = old,
189 };
190
191 rmap_walk(new, &rwc);
203} 192}
204 193
205/* 194/*
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
563 * Migration functions 552 * Migration functions
564 ***********************************************************/ 553 ***********************************************************/
565 554
566/* Always fail migration. Used for mappings that are not movable */
567int fail_migrate_page(struct address_space *mapping,
568 struct page *newpage, struct page *page)
569{
570 return -EIO;
571}
572EXPORT_SYMBOL(fail_migrate_page);
573
574/* 555/*
575 * Common logic to directly migrate a single page suitable for 556 * Common logic to directly migrate a single page suitable for
576 * pages that do not use PagePrivate/PagePrivate2. 557 * pages that do not use PagePrivate/PagePrivate2.
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1008{ 989{
1009 int rc = 0; 990 int rc = 0;
1010 int *result = NULL; 991 int *result = NULL;
1011 struct page *new_hpage = get_new_page(hpage, private, &result); 992 struct page *new_hpage;
1012 struct anon_vma *anon_vma = NULL; 993 struct anon_vma *anon_vma = NULL;
1013 994
1014 /* 995 /*
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1018 * tables or check whether the hugepage is pmd-based or not before 999 * tables or check whether the hugepage is pmd-based or not before
1019 * kicking migration. 1000 * kicking migration.
1020 */ 1001 */
1021 if (!hugepage_migration_support(page_hstate(hpage))) 1002 if (!hugepage_migration_support(page_hstate(hpage))) {
1003 putback_active_hugepage(hpage);
1022 return -ENOSYS; 1004 return -ENOSYS;
1005 }
1023 1006
1007 new_hpage = get_new_page(hpage, private, &result);
1024 if (!new_hpage) 1008 if (!new_hpage)
1025 return -ENOMEM; 1009 return -ENOMEM;
1026 1010
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1120 nr_succeeded++; 1104 nr_succeeded++;
1121 break; 1105 break;
1122 default: 1106 default:
1123 /* Permanent failure */ 1107 /*
1108 * Permanent failure (-EBUSY, -ENOSYS, etc.):
1109 * unlike -EAGAIN case, the failed page is
1110 * removed from migration page list and not
1111 * retried in the next outer loop.
1112 */
1124 nr_failed++; 1113 nr_failed++;
1125 break; 1114 break;
1126 } 1115 }
@@ -1594,31 +1583,38 @@ bool migrate_ratelimited(int node)
1594} 1583}
1595 1584
1596/* Returns true if the node is migrate rate-limited after the update */ 1585/* Returns true if the node is migrate rate-limited after the update */
1597bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) 1586static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1587 unsigned long nr_pages)
1598{ 1588{
1599 bool rate_limited = false;
1600
1601 /* 1589 /*
1602 * Rate-limit the amount of data that is being migrated to a node. 1590 * Rate-limit the amount of data that is being migrated to a node.
1603 * Optimal placement is no good if the memory bus is saturated and 1591 * Optimal placement is no good if the memory bus is saturated and
1604 * all the time is being spent migrating! 1592 * all the time is being spent migrating!
1605 */ 1593 */
1606 spin_lock(&pgdat->numabalancing_migrate_lock);
1607 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { 1594 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1595 spin_lock(&pgdat->numabalancing_migrate_lock);
1608 pgdat->numabalancing_migrate_nr_pages = 0; 1596 pgdat->numabalancing_migrate_nr_pages = 0;
1609 pgdat->numabalancing_migrate_next_window = jiffies + 1597 pgdat->numabalancing_migrate_next_window = jiffies +
1610 msecs_to_jiffies(migrate_interval_millisecs); 1598 msecs_to_jiffies(migrate_interval_millisecs);
1599 spin_unlock(&pgdat->numabalancing_migrate_lock);
1611 } 1600 }
1612 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) 1601 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1613 rate_limited = true; 1602 trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1614 else 1603 nr_pages);
1615 pgdat->numabalancing_migrate_nr_pages += nr_pages; 1604 return true;
1616 spin_unlock(&pgdat->numabalancing_migrate_lock); 1605 }
1617 1606
1618 return rate_limited; 1607 /*
1608 * This is an unlocked non-atomic update so errors are possible.
1609 * The consequence is failing to migrate when we potentially should
1610 * have, which is not severe enough to warrant locking. If it is ever
1611 * a problem, it can be converted to a per-cpu counter.
1612 */
1613 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1614 return false;
1619} 1615}
1620 1616
1621int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1617static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1622{ 1618{
1623 int page_lru; 1619 int page_lru;
1624 1620
@@ -1705,7 +1701,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1705 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1701 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1706 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); 1702 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1707 if (nr_remaining) { 1703 if (nr_remaining) {
1708 putback_lru_pages(&migratepages); 1704 if (!list_empty(&migratepages)) {
1705 list_del(&page->lru);
1706 dec_zone_page_state(page, NR_ISOLATED_ANON +
1707 page_is_file_cache(page));
1708 putback_lru_page(page);
1709 }
1709 isolated = 0; 1710 isolated = 0;
1710 } else 1711 } else
1711 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1712 count_vm_numa_event(NUMA_PAGE_MIGRATE);
diff --git a/mm/mlock.c b/mm/mlock.c
index 192e6eebe4f2..10819ed4df3e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
709 709
710 lru_add_drain_all(); /* flush pagevec */ 710 lru_add_drain_all(); /* flush pagevec */
711 711
712 down_write(&current->mm->mmap_sem);
713 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 712 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
714 start &= PAGE_MASK; 713 start &= PAGE_MASK;
715 714
716 locked = len >> PAGE_SHIFT;
717 locked += current->mm->locked_vm;
718
719 lock_limit = rlimit(RLIMIT_MEMLOCK); 715 lock_limit = rlimit(RLIMIT_MEMLOCK);
720 lock_limit >>= PAGE_SHIFT; 716 lock_limit >>= PAGE_SHIFT;
717 locked = len >> PAGE_SHIFT;
718
719 down_write(&current->mm->mmap_sem);
720
721 locked += current->mm->locked_vm;
721 722
722 /* check against resource limits */ 723 /* check against resource limits */
723 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 724 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
724 error = do_mlock(start, len, 1); 725 error = do_mlock(start, len, 1);
726
725 up_write(&current->mm->mmap_sem); 727 up_write(&current->mm->mmap_sem);
726 if (!error) 728 if (!error)
727 error = __mm_populate(start, len, 0); 729 error = __mm_populate(start, len, 0);
@@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
732{ 734{
733 int ret; 735 int ret;
734 736
735 down_write(&current->mm->mmap_sem);
736 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 737 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
737 start &= PAGE_MASK; 738 start &= PAGE_MASK;
739
740 down_write(&current->mm->mmap_sem);
738 ret = do_mlock(start, len, 0); 741 ret = do_mlock(start, len, 0);
739 up_write(&current->mm->mmap_sem); 742 up_write(&current->mm->mmap_sem);
743
740 return ret; 744 return ret;
741} 745}
742 746
@@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags)
781 if (flags & MCL_CURRENT) 785 if (flags & MCL_CURRENT)
782 lru_add_drain_all(); /* flush pagevec */ 786 lru_add_drain_all(); /* flush pagevec */
783 787
784 down_write(&current->mm->mmap_sem);
785
786 lock_limit = rlimit(RLIMIT_MEMLOCK); 788 lock_limit = rlimit(RLIMIT_MEMLOCK);
787 lock_limit >>= PAGE_SHIFT; 789 lock_limit >>= PAGE_SHIFT;
788 790
789 ret = -ENOMEM; 791 ret = -ENOMEM;
792 down_write(&current->mm->mmap_sem);
793
790 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 794 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
791 capable(CAP_IPC_LOCK)) 795 capable(CAP_IPC_LOCK))
792 ret = do_mlockall(flags); 796 ret = do_mlockall(flags);
diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d785f1e..a0e7153a79e6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
86 86
87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
89unsigned long sysctl_overcommit_kbytes __read_mostly;
89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 90int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 91unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
91unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 92unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -1190,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
1190 return hint; 1191 return hint;
1191} 1192}
1192 1193
1194static inline int mlock_future_check(struct mm_struct *mm,
1195 unsigned long flags,
1196 unsigned long len)
1197{
1198 unsigned long locked, lock_limit;
1199
1200 /* mlock MCL_FUTURE? */
1201 if (flags & VM_LOCKED) {
1202 locked = len >> PAGE_SHIFT;
1203 locked += mm->locked_vm;
1204 lock_limit = rlimit(RLIMIT_MEMLOCK);
1205 lock_limit >>= PAGE_SHIFT;
1206 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1207 return -EAGAIN;
1208 }
1209 return 0;
1210}
1211
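A worked example of the check this helper centralizes (all numbers are hypothetical): with 4 KiB pages and RLIMIT_MEMLOCK = 64 KiB, lock_limit = 64 KiB >> 12 = 16 pages; a VM_LOCKED request of len = 32 KiB (8 pages) on top of mm->locked_vm = 10 pages gives locked = 18 > 16, so the call returns -EAGAIN unless the task has CAP_IPC_LOCK.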
1193/* 1212/*
1194 * The caller must hold down_write(&current->mm->mmap_sem). 1213 * The caller must hold down_write(&current->mm->mmap_sem).
1195 */ 1214 */
@@ -1251,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1251 if (!can_do_mlock()) 1270 if (!can_do_mlock())
1252 return -EPERM; 1271 return -EPERM;
1253 1272
1254 /* mlock MCL_FUTURE? */ 1273 if (mlock_future_check(mm, vm_flags, len))
1255 if (vm_flags & VM_LOCKED) { 1274 return -EAGAIN;
1256 unsigned long locked, lock_limit;
1257 locked = len >> PAGE_SHIFT;
1258 locked += mm->locked_vm;
1259 lock_limit = rlimit(RLIMIT_MEMLOCK);
1260 lock_limit >>= PAGE_SHIFT;
1261 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1262 return -EAGAIN;
1263 }
1264 1275
1265 if (file) { 1276 if (file) {
1266 struct inode *inode = file_inode(file); 1277 struct inode *inode = file_inode(file);
@@ -2591,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2591 if (error & ~PAGE_MASK) 2602 if (error & ~PAGE_MASK)
2592 return error; 2603 return error;
2593 2604
2594 /* 2605 error = mlock_future_check(mm, mm->def_flags, len);
2595 * mlock MCL_FUTURE? 2606 if (error)
2596 */ 2607 return error;
2597 if (mm->def_flags & VM_LOCKED) {
2598 unsigned long locked, lock_limit;
2599 locked = len >> PAGE_SHIFT;
2600 locked += mm->locked_vm;
2601 lock_limit = rlimit(RLIMIT_MEMLOCK);
2602 lock_limit >>= PAGE_SHIFT;
2603 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2604 return -EAGAIN;
2605 }
2606 2608
2607 /* 2609 /*
2608 * mm->mmap_sem is required to protect against another thread 2610 * mm->mmap_sem is required to protect against another thread
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bb53a6591aea..7332c1785744 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
23#include <linux/mmu_notifier.h> 23#include <linux/mmu_notifier.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/ksm.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
63 64
64 ptent = *pte; 65 ptent = *pte;
65 page = vm_normal_page(vma, addr, oldpte); 66 page = vm_normal_page(vma, addr, oldpte);
66 if (page) { 67 if (page && !PageKsm(page)) {
67 if (!pte_numa(oldpte)) { 68 if (!pte_numa(oldpte)) {
68 ptent = pte_mknuma(ptent); 69 ptent = pte_mknuma(ptent);
69 set_pte_at(mm, addr, pte, ptent); 70 set_pte_at(mm, addr, pte, ptent);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d374655..19121ceb8874 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
41 if (limit > memblock.current_limit) 41 if (limit > memblock.current_limit)
42 limit = memblock.current_limit; 42 limit = memblock.current_limit;
43 43
44 addr = memblock_find_in_range_node(goal, limit, size, align, nid); 44 addr = memblock_find_in_range_node(size, align, goal, limit, nid);
45 if (!addr) 45 if (!addr)
46 return NULL; 46 return NULL;
47 47
@@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void)
117 phys_addr_t start, end, size; 117 phys_addr_t start, end, size;
118 u64 i; 118 u64 i;
119 119
120 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 120 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
121 count += __free_memory_core(start, end); 121 count += __free_memory_core(start, end);
122 122
123 /* free range that is used for reserved array if we allocate it */ 123 /* free range that is used for reserved array if we allocate it */
@@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void)
161 reset_all_zones_managed_pages(); 161 reset_all_zones_managed_pages();
162 162
163 /* 163 /*
164 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 164 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
165 * because in some case like Node0 doesn't have RAM installed 165 * because in some case like Node0 doesn't have RAM installed
166 * low ram will be on Node1 166 * low ram will be on Node1
167 */ 167 */
@@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
215 215
216restart: 216restart:
217 217
218 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); 218 ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
219 219
220 if (ptr) 220 if (ptr)
221 return ptr; 221 return ptr;
@@ -299,7 +299,7 @@ again:
299 if (ptr) 299 if (ptr)
300 return ptr; 300 return ptr;
301 301
302 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, 302 ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
303 goal, limit); 303 goal, limit);
304 if (ptr) 304 if (ptr)
305 return ptr; 305 return ptr;
diff --git a/mm/nommu.c b/mm/nommu.c
index fec093adad9c..8740213b1647 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn;
60struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
62int sysctl_overcommit_ratio = 50; /* default is 50% */ 62int sysctl_overcommit_ratio = 50; /* default is 50% */
63unsigned long sysctl_overcommit_kbytes __read_mostly;
63int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
64int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
65unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 66unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e4a600a6163..054ff47c4478 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
47#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
48/** 48/**
49 * has_intersects_mems_allowed() - check task eligibility for kill 49 * has_intersects_mems_allowed() - check task eligibility for kill
50 * @tsk: task struct of which task to consider 50 * @start: task struct of which task to consider
51 * @mask: nodemask passed to page allocator for mempolicy ooms 51 * @mask: nodemask passed to page allocator for mempolicy ooms
52 * 52 *
53 * Task eligibility is determined by whether or not a candidate task, @tsk, 53 * Task eligibility is determined by whether or not a candidate task, @tsk,
54 * shares the same mempolicy nodes as current if it is bound by such a policy 54 * shares the same mempolicy nodes as current if it is bound by such a policy
55 * and whether or not it has the same set of allowed cpuset nodes. 55 * and whether or not it has the same set of allowed cpuset nodes.
56 */ 56 */
57static bool has_intersects_mems_allowed(struct task_struct *tsk, 57static bool has_intersects_mems_allowed(struct task_struct *start,
58 const nodemask_t *mask) 58 const nodemask_t *mask)
59{ 59{
60 struct task_struct *start = tsk; 60 struct task_struct *tsk;
61 bool ret = false;
61 62
62 do { 63 rcu_read_lock();
64 for_each_thread(start, tsk) {
63 if (mask) { 65 if (mask) {
64 /* 66 /*
65 * If this is a mempolicy constrained oom, tsk's 67 * If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
67 * mempolicy intersects current, otherwise it may be 69 * mempolicy intersects current, otherwise it may be
68 * needlessly killed. 70 * needlessly killed.
69 */ 71 */
70 if (mempolicy_nodemask_intersects(tsk, mask)) 72 ret = mempolicy_nodemask_intersects(tsk, mask);
71 return true;
72 } else { 73 } else {
73 /* 74 /*
74 * This is not a mempolicy constrained oom, so only 75 * This is not a mempolicy constrained oom, so only
75 * check the mems of tsk's cpuset. 76 * check the mems of tsk's cpuset.
76 */ 77 */
77 if (cpuset_mems_allowed_intersects(current, tsk)) 78 ret = cpuset_mems_allowed_intersects(current, tsk);
78 return true;
79 } 79 }
80 } while_each_thread(start, tsk); 80 if (ret)
81 break;
82 }
83 rcu_read_unlock();
81 84
82 return false; 85 return ret;
83} 86}
84#else 87#else
85static bool has_intersects_mems_allowed(struct task_struct *tsk, 88static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
97 */ 100 */
98struct task_struct *find_lock_task_mm(struct task_struct *p) 101struct task_struct *find_lock_task_mm(struct task_struct *p)
99{ 102{
100 struct task_struct *t = p; 103 struct task_struct *t;
101 104
102 do { 105 rcu_read_lock();
106
107 for_each_thread(p, t) {
103 task_lock(t); 108 task_lock(t);
104 if (likely(t->mm)) 109 if (likely(t->mm))
105 return t; 110 goto found;
106 task_unlock(t); 111 task_unlock(t);
107 } while_each_thread(p, t); 112 }
113 t = NULL;
114found:
115 rcu_read_unlock();
108 116
109 return NULL; 117 return t;
110} 118}
111 119
112/* return true if the task is not adequate as candidate victim task. */ 120/* return true if the task is not adequate as candidate victim task. */
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
301 unsigned long chosen_points = 0; 309 unsigned long chosen_points = 0;
302 310
303 rcu_read_lock(); 311 rcu_read_lock();
304 do_each_thread(g, p) { 312 for_each_process_thread(g, p) {
305 unsigned int points; 313 unsigned int points;
306 314
307 switch (oom_scan_process_thread(p, totalpages, nodemask, 315 switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
323 chosen = p; 331 chosen = p;
324 chosen_points = points; 332 chosen_points = points;
325 } 333 }
326 } while_each_thread(g, p); 334 }
327 if (chosen) 335 if (chosen)
328 get_task_struct(chosen); 336 get_task_struct(chosen);
329 rcu_read_unlock(); 337 rcu_read_unlock();
@@ -406,7 +414,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
406{ 414{
407 struct task_struct *victim = p; 415 struct task_struct *victim = p;
408 struct task_struct *child; 416 struct task_struct *child;
409 struct task_struct *t = p; 417 struct task_struct *t;
410 struct mm_struct *mm; 418 struct mm_struct *mm;
411 unsigned int victim_points = 0; 419 unsigned int victim_points = 0;
412 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 420 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +445,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
437 * still freeing memory. 445 * still freeing memory.
438 */ 446 */
439 read_lock(&tasklist_lock); 447 read_lock(&tasklist_lock);
440 do { 448 for_each_thread(p, t) {
441 list_for_each_entry(child, &t->children, sibling) { 449 list_for_each_entry(child, &t->children, sibling) {
442 unsigned int child_points; 450 unsigned int child_points;
443 451
@@ -455,13 +463,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
455 get_task_struct(victim); 463 get_task_struct(victim);
456 } 464 }
457 } 465 }
458 } while_each_thread(p, t); 466 }
459 read_unlock(&tasklist_lock); 467 read_unlock(&tasklist_lock);
460 468
461 rcu_read_lock();
462 p = find_lock_task_mm(victim); 469 p = find_lock_task_mm(victim);
463 if (!p) { 470 if (!p) {
464 rcu_read_unlock();
465 put_task_struct(victim); 471 put_task_struct(victim);
466 return; 472 return;
467 } else if (victim != p) { 473 } else if (victim != p) {
@@ -487,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
487 * That thread will now get access to memory reserves since it has a 493 * That thread will now get access to memory reserves since it has a
488 * pending fatal signal. 494 * pending fatal signal.
489 */ 495 */
496 rcu_read_lock();
490 for_each_process(p) 497 for_each_process(p)
491 if (p->mm == mm && !same_thread_group(p, victim) && 498 if (p->mm == mm && !same_thread_group(p, victim) &&
492 !(p->flags & PF_KTHREAD)) { 499 !(p->flags & PF_KTHREAD)) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5248fe070aa4..533e2147d14f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2072,13 +2072,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2072 return; 2072 return;
2073 2073
2074 /* 2074 /*
2075 * Walking all memory to count page types is very expensive and should
2076 * be inhibited in non-blockable contexts.
2077 */
2078 if (!(gfp_mask & __GFP_WAIT))
2079 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2080
2081 /*
2082 * This documents exceptions given to allocations in certain 2075 * This documents exceptions given to allocations in certain
2083 * contexts that are allowed to allocate outside current's set 2076 * contexts that are allowed to allocate outside current's set
2084 * of allowed nodes. 2077 * of allowed nodes.
@@ -2242,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2242 preferred_zone, migratetype); 2235 preferred_zone, migratetype);
2243 if (page) { 2236 if (page) {
2244 preferred_zone->compact_blockskip_flush = false; 2237 preferred_zone->compact_blockskip_flush = false;
2245 preferred_zone->compact_considered = 0; 2238 compaction_defer_reset(preferred_zone, order, true);
2246 preferred_zone->compact_defer_shift = 0;
2247 if (order >= preferred_zone->compact_order_failed)
2248 preferred_zone->compact_order_failed = order + 1;
2249 count_vm_event(COMPACTSUCCESS); 2239 count_vm_event(COMPACTSUCCESS);
2250 return page; 2240 return page;
2251 } 2241 }
@@ -2535,8 +2525,15 @@ rebalance:
2535 } 2525 }
2536 2526
2537 /* Atomic allocations - we can't balance anything */ 2527 /* Atomic allocations - we can't balance anything */
2538 if (!wait) 2528 if (!wait) {
2529 /*
2530 * All existing users of the deprecated __GFP_NOFAIL are
2531 * blockable, so warn of any new users that actually allow this
2532 * type of allocation to fail.
2533 */
2534 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
2539 goto nopage; 2535 goto nopage;
2536 }
2540 2537
2541 /* Avoid recursion of direct reclaim */ 2538 /* Avoid recursion of direct reclaim */
2542 if (current->flags & PF_MEMALLOC) 2539 if (current->flags & PF_MEMALLOC)
@@ -3901,6 +3898,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3901 struct page *page; 3898 struct page *page;
3902 unsigned long block_migratetype; 3899 unsigned long block_migratetype;
3903 int reserve; 3900 int reserve;
3901 int old_reserve;
3904 3902
3905 /* 3903 /*
3906 * Get the start pfn, end pfn and the number of blocks to reserve 3904 * Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3920,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3922 * future allocation of hugepages at runtime. 3920 * future allocation of hugepages at runtime.
3923 */ 3921 */
3924 reserve = min(2, reserve); 3922 reserve = min(2, reserve);
3923 old_reserve = zone->nr_migrate_reserve_block;
3924
3925 /* On memory hot-add, we almost always need to do nothing */
3926 if (reserve == old_reserve)
3927 return;
3928 zone->nr_migrate_reserve_block = reserve;
3925 3929
3926 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3930 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3927 if (!pfn_valid(pfn)) 3931 if (!pfn_valid(pfn))
@@ -3959,6 +3963,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3959 reserve--; 3963 reserve--;
3960 continue; 3964 continue;
3961 } 3965 }
3966 } else if (!old_reserve) {
3967 /*
3968 * At boot time we don't need to scan the whole zone
3969 * for turning off MIGRATE_RESERVE.
3970 */
3971 break;
3962 } 3972 }
3963 3973
3964 /* 3974 /*
@@ -4209,7 +4219,6 @@ static noinline __init_refok
4209int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4219int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4210{ 4220{
4211 int i; 4221 int i;
4212 struct pglist_data *pgdat = zone->zone_pgdat;
4213 size_t alloc_size; 4222 size_t alloc_size;
4214 4223
4215 /* 4224 /*
@@ -4225,7 +4234,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4225 4234
4226 if (!slab_is_available()) { 4235 if (!slab_is_available()) {
4227 zone->wait_table = (wait_queue_head_t *) 4236 zone->wait_table = (wait_queue_head_t *)
4228 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4237 memblock_virt_alloc_node_nopanic(
4238 alloc_size, zone->zone_pgdat->node_id);
4229 } else { 4239 } else {
4230 /* 4240 /*
4231 * This case means that a zone whose size was 0 gets new memory 4241 * This case means that a zone whose size was 0 gets new memory
@@ -4345,13 +4355,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4345#endif 4355#endif
4346 4356
4347/** 4357/**
4348 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4358 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
4349 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4359 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4350 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4360 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
4351 * 4361 *
4352 * If an architecture guarantees that all ranges registered with 4362 * If an architecture guarantees that all ranges registered with
4353 * add_active_ranges() contain no holes and may be freed, this 4363 * add_active_ranges() contain no holes and may be freed, this
4354 * this function may be used instead of calling free_bootmem() manually. 4364 * this function may be used instead of calling memblock_free_early_nid()
4365 * manually.
4355 */ 4366 */
4356void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4367void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4357{ 4368{
@@ -4363,9 +4374,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4363 end_pfn = min(end_pfn, max_low_pfn); 4374 end_pfn = min(end_pfn, max_low_pfn);
4364 4375
4365 if (start_pfn < end_pfn) 4376 if (start_pfn < end_pfn)
4366 free_bootmem_node(NODE_DATA(this_nid), 4377 memblock_free_early_nid(PFN_PHYS(start_pfn),
4367 PFN_PHYS(start_pfn), 4378 (end_pfn - start_pfn) << PAGE_SHIFT,
4368 (end_pfn - start_pfn) << PAGE_SHIFT); 4379 this_nid);
4369 } 4380 }
4370} 4381}
4371 4382
@@ -4636,8 +4647,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
4636 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4647 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4637 zone->pageblock_flags = NULL; 4648 zone->pageblock_flags = NULL;
4638 if (usemapsize) 4649 if (usemapsize)
4639 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4650 zone->pageblock_flags =
4640 usemapsize); 4651 memblock_virt_alloc_node_nopanic(usemapsize,
4652 pgdat->node_id);
4641} 4653}
4642#else 4654#else
4643static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4655static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4843,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4831 size = (end - start) * sizeof(struct page); 4843 size = (end - start) * sizeof(struct page);
4832 map = alloc_remap(pgdat->node_id, size); 4844 map = alloc_remap(pgdat->node_id, size);
4833 if (!map) 4845 if (!map)
4834 map = alloc_bootmem_node_nopanic(pgdat, size); 4846 map = memblock_virt_alloc_node_nopanic(size,
4847 pgdat->node_id);
4835 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4848 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4836 } 4849 }
4837#ifndef CONFIG_NEED_MULTIPLE_NODES 4850#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5012,9 +5025,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5012 nodemask_t saved_node_state = node_states[N_MEMORY]; 5025 nodemask_t saved_node_state = node_states[N_MEMORY];
5013 unsigned long totalpages = early_calculate_totalpages(); 5026 unsigned long totalpages = early_calculate_totalpages();
5014 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5027 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5028 struct memblock_type *type = &memblock.memory;
5029
5030 /* Need to find movable_zone earlier when movable_node is specified. */
5031 find_usable_zone_for_movable();
5032
5033 /*
5034 * If movable_node is specified, ignore kernelcore and movablecore
5035 * options.
5036 */
5037 if (movable_node_is_enabled()) {
5038 for (i = 0; i < type->cnt; i++) {
5039 if (!memblock_is_hotpluggable(&type->regions[i]))
5040 continue;
5041
5042 nid = type->regions[i].nid;
5043
5044 usable_startpfn = PFN_DOWN(type->regions[i].base);
5045 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5046 min(usable_startpfn, zone_movable_pfn[nid]) :
5047 usable_startpfn;
5048 }
5049
5050 goto out2;
5051 }
5015 5052
5016 /* 5053 /*
5017 * If movablecore was specified, calculate what size of 5054 * If movablecore=nn[KMG] was specified, calculate what size of
5018 * kernelcore that corresponds so that memory usable for 5055 * kernelcore that corresponds so that memory usable for
5019 * any allocation type is evenly spread. If both kernelcore 5056 * any allocation type is evenly spread. If both kernelcore
5020 * and movablecore are specified, then the value of kernelcore 5057 * and movablecore are specified, then the value of kernelcore
@@ -5040,7 +5077,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5040 goto out; 5077 goto out;
5041 5078
5042 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5079 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5043 find_usable_zone_for_movable();
5044 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5080 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5045 5081
5046restart: 5082restart:
@@ -5131,6 +5167,7 @@ restart:
5131 if (usable_nodes && required_kernelcore > usable_nodes) 5167 if (usable_nodes && required_kernelcore > usable_nodes)
5132 goto restart; 5168 goto restart;
5133 5169
5170out2:
5134 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5171 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5135 for (nid = 0; nid < MAX_NUMNODES; nid++) 5172 for (nid = 0; nid < MAX_NUMNODES; nid++)
5136 zone_movable_pfn[nid] = 5173 zone_movable_pfn[nid] =
@@ -5857,7 +5894,7 @@ void *__init alloc_large_system_hash(const char *tablename,
5857 do { 5894 do {
5858 size = bucketsize << log2qty; 5895 size = bucketsize << log2qty;
5859 if (flags & HASH_EARLY) 5896 if (flags & HASH_EARLY)
5860 table = alloc_bootmem_nopanic(size); 5897 table = memblock_virt_alloc_nopanic(size, 0);
5861 else if (hashdist) 5898 else if (hashdist)
5862 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5899 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5863 else { 5900 else {
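Taken together, the page_alloc.c hunks above swap every remaining bootmem call (alloc_bootmem_node_nopanic(), alloc_bootmem_nopanic(), free_bootmem_node()) for the memblock_virt_alloc_* family, which is addressed by node id rather than by pglist_data, and they teach find_zone_movable_pfns_for_nodes() to seed zone_movable_pfn[] from hotpluggable memblock regions when movable_node is in effect. A short sketch of the changed calling convention, assuming the memblock helpers are declared as this series uses them; the wrapper name and its caller are illustrative, not from the patch:

/* Roughly what the converted call sites look like after this patch. */
static void * __init zone_table_alloc_sketch(struct zone *zone, size_t alloc_size)
{
        /*
         * Before: alloc_bootmem_node_nopanic(zone->zone_pgdat, alloc_size);
         * After:  only the node id is needed, not the pglist_data pointer.
         */
        return memblock_virt_alloc_node_nopanic(alloc_size,
                                                zone->zone_pgdat->node_id);
}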
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3bd0b8e6ab12..cfd162882c00 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)
54 54
55 table_size = sizeof(struct page_cgroup) * nr_pages; 55 table_size = sizeof(struct page_cgroup) * nr_pages;
56 56
57 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), 57 base = memblock_virt_alloc_try_nid_nopanic(
58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
59 BOOTMEM_ALLOC_ACCESSIBLE, nid);
59 if (!base) 60 if (!base)
60 return -ENOMEM; 61 return -ENOMEM;
61 NODE_DATA(nid)->node_page_cgroup = base; 62 NODE_DATA(nid)->node_page_cgroup = base;
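alloc_node_page_cgroup() keeps its old placement policy (PAGE_SIZE alignment, floor of __pa(MAX_DMA_ADDRESS)) but now spells it out through memblock_virt_alloc_try_nid_nopanic(); because this is the _nopanic flavour the caller still checks for NULL itself. A hedged sketch of that pattern, where the function name, table_size and the static pointer are illustrative only:

static void *example_table;

static int __init node_table_alloc_sketch(int nid, size_t table_size)
{
        /* size, align, min_addr, max_addr (BOOTMEM_ALLOC_ACCESSIBLE), nid:
         * the same argument order as the page_cgroup hunk above. */
        example_table = memblock_virt_alloc_try_nid_nopanic(table_size,
                                        PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
                                        BOOTMEM_ALLOC_ACCESSIBLE, nid);
        if (!example_table)
                return -ENOMEM; /* _nopanic: failure is for the caller to handle */
        return 0;
}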
diff --git a/mm/percpu.c b/mm/percpu.c
index afbf352ae580..036cfe07050f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1063 __alignof__(ai->groups[0].cpu_map[0])); 1063 __alignof__(ai->groups[0].cpu_map[0]));
1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); 1064 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1065 1065
1066 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); 1066 ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
1067 if (!ptr) 1067 if (!ptr)
1068 return NULL; 1068 return NULL;
1069 ai = ptr; 1069 ai = ptr;
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1088 */ 1088 */
1089void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) 1089void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1090{ 1090{
1091 free_bootmem(__pa(ai), ai->__ai_size); 1091 memblock_free_early(__pa(ai), ai->__ai_size);
1092} 1092}
1093 1093
1094/** 1094/**
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); 1246 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1247 1247
1248 /* process group information and build config tables accordingly */ 1248 /* process group information and build config tables accordingly */
1249 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1249 group_offsets = memblock_virt_alloc(ai->nr_groups *
1250 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); 1250 sizeof(group_offsets[0]), 0);
1251 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); 1251 group_sizes = memblock_virt_alloc(ai->nr_groups *
1252 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); 1252 sizeof(group_sizes[0]), 0);
1253 unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
1254 unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
1253 1255
1254 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1256 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1255 unit_map[cpu] = UINT_MAX; 1257 unit_map[cpu] = UINT_MAX;
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1311 * empty chunks. 1313 * empty chunks.
1312 */ 1314 */
1313 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; 1315 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1314 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); 1316 pcpu_slot = memblock_virt_alloc(
1317 pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
1315 for (i = 0; i < pcpu_nr_slots; i++) 1318 for (i = 0; i < pcpu_nr_slots; i++)
1316 INIT_LIST_HEAD(&pcpu_slot[i]); 1319 INIT_LIST_HEAD(&pcpu_slot[i]);
1317 1320
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1322 * covers static area + reserved area (mostly used for module 1325 * covers static area + reserved area (mostly used for module
1323 * static percpu allocation). 1326 * static percpu allocation).
1324 */ 1327 */
1325 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1328 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1326 INIT_LIST_HEAD(&schunk->list); 1329 INIT_LIST_HEAD(&schunk->list);
1327 schunk->base_addr = base_addr; 1330 schunk->base_addr = base_addr;
1328 schunk->map = smap; 1331 schunk->map = smap;
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1346 1349
1347 /* init dynamic chunk if necessary */ 1350 /* init dynamic chunk if necessary */
1348 if (dyn_size) { 1351 if (dyn_size) {
1349 dchunk = alloc_bootmem(pcpu_chunk_struct_size); 1352 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1350 INIT_LIST_HEAD(&dchunk->list); 1353 INIT_LIST_HEAD(&dchunk->list);
1351 dchunk->base_addr = base_addr; 1354 dchunk->base_addr = base_addr;
1352 dchunk->map = dmap; 1355 dchunk->map = dmap;
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1626 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; 1629 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1627 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); 1630 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1628 1631
1629 areas = alloc_bootmem_nopanic(areas_size); 1632 areas = memblock_virt_alloc_nopanic(areas_size, 0);
1630 if (!areas) { 1633 if (!areas) {
1631 rc = -ENOMEM; 1634 rc = -ENOMEM;
1632 goto out_free; 1635 goto out_free;
@@ -1712,7 +1715,7 @@ out_free_areas:
1712out_free: 1715out_free:
1713 pcpu_free_alloc_info(ai); 1716 pcpu_free_alloc_info(ai);
1714 if (areas) 1717 if (areas)
1715 free_bootmem(__pa(areas), areas_size); 1718 memblock_free_early(__pa(areas), areas_size);
1716 return rc; 1719 return rc;
1717} 1720}
1718#endif /* BUILD_EMBED_FIRST_CHUNK */ 1721#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
1760 /* unaligned allocations can't be freed, round up to page size */ 1763 /* unaligned allocations can't be freed, round up to page size */
1761 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * 1764 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1762 sizeof(pages[0])); 1765 sizeof(pages[0]));
1763 pages = alloc_bootmem(pages_size); 1766 pages = memblock_virt_alloc(pages_size, 0);
1764 1767
1765 /* allocate pages */ 1768 /* allocate pages */
1766 j = 0; 1769 j = 0;
@@ -1823,7 +1826,7 @@ enomem:
1823 free_fn(page_address(pages[j]), PAGE_SIZE); 1826 free_fn(page_address(pages[j]), PAGE_SIZE);
1824 rc = -ENOMEM; 1827 rc = -ENOMEM;
1825out_free_ar: 1828out_free_ar:
1826 free_bootmem(__pa(pages), pages_size); 1829 memblock_free_early(__pa(pages), pages_size);
1827 pcpu_free_alloc_info(ai); 1830 pcpu_free_alloc_info(ai);
1828 return rc; 1831 return rc;
1829} 1832}
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset);
1848static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, 1851static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
1849 size_t align) 1852 size_t align)
1850{ 1853{
1851 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 1854 return memblock_virt_alloc_from_nopanic(
1855 size, align, __pa(MAX_DMA_ADDRESS));
1852} 1856}
1853 1857
1854static void __init pcpu_dfl_fc_free(void *ptr, size_t size) 1858static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
1855{ 1859{
1856 free_bootmem(__pa(ptr), size); 1860 memblock_free_early(__pa(ptr), size);
1857} 1861}
1858 1862
1859void __init setup_per_cpu_areas(void) 1863void __init setup_per_cpu_areas(void)
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void)
1896 void *fc; 1900 void *fc;
1897 1901
1898 ai = pcpu_alloc_alloc_info(1, 1); 1902 ai = pcpu_alloc_alloc_info(1, 1);
1899 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1903 fc = memblock_virt_alloc_from_nopanic(unit_size,
1904 PAGE_SIZE,
1905 __pa(MAX_DMA_ADDRESS));
1900 if (!ai || !fc) 1906 if (!ai || !fc)
1901 panic("Failed to allocate memory for percpu areas."); 1907 panic("Failed to allocate memory for percpu areas.");
1902 /* kmemleak tracks the percpu allocations separately */ 1908 /* kmemleak tracks the percpu allocations separately */
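Every boot-time allocation in percpu.c moves in matched pairs: alloc_bootmem()/alloc_bootmem_nopanic() become memblock_virt_alloc()/memblock_virt_alloc_nopanic(), whose second argument is the requested alignment (the call sites pass 0, which appears to mean the default), and each free_bootmem(__pa(p), size) becomes memblock_free_early(__pa(p), size). A small sketch of the paired pattern with a made-up buffer; pcpu_scratch_sketch and nr_items are not from the patch:

static void __init pcpu_scratch_sketch(size_t nr_items)
{
        unsigned long *buf;
        size_t sz = nr_items * sizeof(*buf);

        buf = memblock_virt_alloc(sz, 0);   /* non-_nopanic: assumed fatal on failure */
        /* ... use buf while setting up the first chunk ... */
        memblock_free_early(__pa(buf), sz); /* hand the range back to memblock */
}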
diff --git a/mm/rmap.c b/mm/rmap.c
index 068522d8502a..962e2a1e13a0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
660 return 1; 660 return 1;
661} 661}
662 662
663struct page_referenced_arg {
664 int mapcount;
665 int referenced;
666 unsigned long vm_flags;
667 struct mem_cgroup *memcg;
668};
663/* 669/*
664 * Subfunctions of page_referenced: page_referenced_one called 670 * arg: page_referenced_arg will be passed
665 * repeatedly from either page_referenced_anon or page_referenced_file.
666 */ 671 */
667int page_referenced_one(struct page *page, struct vm_area_struct *vma, 672int page_referenced_one(struct page *page, struct vm_area_struct *vma,
668 unsigned long address, unsigned int *mapcount, 673 unsigned long address, void *arg)
669 unsigned long *vm_flags)
670{ 674{
671 struct mm_struct *mm = vma->vm_mm; 675 struct mm_struct *mm = vma->vm_mm;
672 spinlock_t *ptl; 676 spinlock_t *ptl;
673 int referenced = 0; 677 int referenced = 0;
678 struct page_referenced_arg *pra = arg;
674 679
675 if (unlikely(PageTransHuge(page))) { 680 if (unlikely(PageTransHuge(page))) {
676 pmd_t *pmd; 681 pmd_t *pmd;
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
682 pmd = page_check_address_pmd(page, mm, address, 687 pmd = page_check_address_pmd(page, mm, address,
683 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); 688 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
684 if (!pmd) 689 if (!pmd)
685 goto out; 690 return SWAP_AGAIN;
686 691
687 if (vma->vm_flags & VM_LOCKED) { 692 if (vma->vm_flags & VM_LOCKED) {
688 spin_unlock(ptl); 693 spin_unlock(ptl);
689 *mapcount = 0; /* break early from loop */ 694 pra->vm_flags |= VM_LOCKED;
690 *vm_flags |= VM_LOCKED; 695 return SWAP_FAIL; /* To break the loop */
691 goto out;
692 } 696 }
693 697
694 /* go ahead even if the pmd is pmd_trans_splitting() */ 698 /* go ahead even if the pmd is pmd_trans_splitting() */
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
704 */ 708 */
705 pte = page_check_address(page, mm, address, &ptl, 0); 709 pte = page_check_address(page, mm, address, &ptl, 0);
706 if (!pte) 710 if (!pte)
707 goto out; 711 return SWAP_AGAIN;
708 712
709 if (vma->vm_flags & VM_LOCKED) { 713 if (vma->vm_flags & VM_LOCKED) {
710 pte_unmap_unlock(pte, ptl); 714 pte_unmap_unlock(pte, ptl);
711 *mapcount = 0; /* break early from loop */ 715 pra->vm_flags |= VM_LOCKED;
712 *vm_flags |= VM_LOCKED; 716 return SWAP_FAIL; /* To break the loop */
713 goto out;
714 } 717 }
715 718
716 if (ptep_clear_flush_young_notify(vma, address, pte)) { 719 if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
727 pte_unmap_unlock(pte, ptl); 730 pte_unmap_unlock(pte, ptl);
728 } 731 }
729 732
730 (*mapcount)--; 733 if (referenced) {
731 734 pra->referenced++;
732 if (referenced) 735 pra->vm_flags |= vma->vm_flags;
733 *vm_flags |= vma->vm_flags;
734out:
735 return referenced;
736}
737
738static int page_referenced_anon(struct page *page,
739 struct mem_cgroup *memcg,
740 unsigned long *vm_flags)
741{
742 unsigned int mapcount;
743 struct anon_vma *anon_vma;
744 pgoff_t pgoff;
745 struct anon_vma_chain *avc;
746 int referenced = 0;
747
748 anon_vma = page_lock_anon_vma_read(page);
749 if (!anon_vma)
750 return referenced;
751
752 mapcount = page_mapcount(page);
753 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
754 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
755 struct vm_area_struct *vma = avc->vma;
756 unsigned long address = vma_address(page, vma);
757 /*
758 * If we are reclaiming on behalf of a cgroup, skip
759 * counting on behalf of references from different
760 * cgroups
761 */
762 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
763 continue;
764 referenced += page_referenced_one(page, vma, address,
765 &mapcount, vm_flags);
766 if (!mapcount)
767 break;
768 } 736 }
769 737
770 page_unlock_anon_vma_read(anon_vma); 738 pra->mapcount--;
771 return referenced; 739 if (!pra->mapcount)
740 return SWAP_SUCCESS; /* To break the loop */
741
742 return SWAP_AGAIN;
772} 743}
773 744
774/** 745static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
775 * page_referenced_file - referenced check for object-based rmap
776 * @page: the page we're checking references on.
777 * @memcg: target memory control group
778 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
779 *
780 * For an object-based mapped page, find all the places it is mapped and
781 * check/clear the referenced flag. This is done by following the page->mapping
782 * pointer, then walking the chain of vmas it holds. It returns the number
783 * of references it found.
784 *
785 * This function is only called from page_referenced for object-based pages.
786 */
787static int page_referenced_file(struct page *page,
788 struct mem_cgroup *memcg,
789 unsigned long *vm_flags)
790{ 746{
791 unsigned int mapcount; 747 struct page_referenced_arg *pra = arg;
792 struct address_space *mapping = page->mapping; 748 struct mem_cgroup *memcg = pra->memcg;
793 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
794 struct vm_area_struct *vma;
795 int referenced = 0;
796
797 /*
798 * The caller's checks on page->mapping and !PageAnon have made
799 * sure that this is a file page: the check for page->mapping
800 * excludes the case just before it gets set on an anon page.
801 */
802 BUG_ON(PageAnon(page));
803
804 /*
805 * The page lock not only makes sure that page->mapping cannot
806 * suddenly be NULLified by truncation, it makes sure that the
807 * structure at mapping cannot be freed and reused yet,
808 * so we can safely take mapping->i_mmap_mutex.
809 */
810 BUG_ON(!PageLocked(page));
811
812 mutex_lock(&mapping->i_mmap_mutex);
813 749
814 /* 750 if (!mm_match_cgroup(vma->vm_mm, memcg))
815 * i_mmap_mutex does not stabilize mapcount at all, but mapcount 751 return true;
816 * is more likely to be accurate if we note it after spinning.
817 */
818 mapcount = page_mapcount(page);
819
820 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
821 unsigned long address = vma_address(page, vma);
822 /*
823 * If we are reclaiming on behalf of a cgroup, skip
824 * counting on behalf of references from different
825 * cgroups
826 */
827 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
828 continue;
829 referenced += page_referenced_one(page, vma, address,
830 &mapcount, vm_flags);
831 if (!mapcount)
832 break;
833 }
834 752
835 mutex_unlock(&mapping->i_mmap_mutex); 753 return false;
836 return referenced;
837} 754}
838 755
839/** 756/**
@@ -851,41 +768,57 @@ int page_referenced(struct page *page,
851 struct mem_cgroup *memcg, 768 struct mem_cgroup *memcg,
852 unsigned long *vm_flags) 769 unsigned long *vm_flags)
853{ 770{
854 int referenced = 0; 771 int ret;
855 int we_locked = 0; 772 int we_locked = 0;
773 struct page_referenced_arg pra = {
774 .mapcount = page_mapcount(page),
775 .memcg = memcg,
776 };
777 struct rmap_walk_control rwc = {
778 .rmap_one = page_referenced_one,
779 .arg = (void *)&pra,
780 .anon_lock = page_lock_anon_vma_read,
781 };
856 782
857 *vm_flags = 0; 783 *vm_flags = 0;
858 if (page_mapped(page) && page_rmapping(page)) { 784 if (!page_mapped(page))
859 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 785 return 0;
860 we_locked = trylock_page(page); 786
861 if (!we_locked) { 787 if (!page_rmapping(page))
862 referenced++; 788 return 0;
863 goto out; 789
864 } 790 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
865 } 791 we_locked = trylock_page(page);
866 if (unlikely(PageKsm(page))) 792 if (!we_locked)
867 referenced += page_referenced_ksm(page, memcg, 793 return 1;
868 vm_flags);
869 else if (PageAnon(page))
870 referenced += page_referenced_anon(page, memcg,
871 vm_flags);
872 else if (page->mapping)
873 referenced += page_referenced_file(page, memcg,
874 vm_flags);
875 if (we_locked)
876 unlock_page(page);
877 } 794 }
878out: 795
879 return referenced; 796 /*
797 * If we are reclaiming on behalf of a cgroup, skip
798 * counting on behalf of references from different
799 * cgroups
800 */
801 if (memcg) {
802 rwc.invalid_vma = invalid_page_referenced_vma;
803 }
804
805 ret = rmap_walk(page, &rwc);
806 *vm_flags = pra.vm_flags;
807
808 if (we_locked)
809 unlock_page(page);
810
811 return pra.referenced;
880} 812}
881 813
882static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, 814static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
883 unsigned long address) 815 unsigned long address, void *arg)
884{ 816{
885 struct mm_struct *mm = vma->vm_mm; 817 struct mm_struct *mm = vma->vm_mm;
886 pte_t *pte; 818 pte_t *pte;
887 spinlock_t *ptl; 819 spinlock_t *ptl;
888 int ret = 0; 820 int ret = 0;
821 int *cleaned = arg;
889 822
890 pte = page_check_address(page, mm, address, &ptl, 1); 823 pte = page_check_address(page, mm, address, &ptl, 1);
891 if (!pte) 824 if (!pte)
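The hunk above retires page_referenced_anon()/page_referenced_file(): per-walk state now travels in struct page_referenced_arg, and page_referenced() simply fills in a struct rmap_walk_control and lets rmap_walk() visit every vma that maps the page. Restricted to the fields this diff actually exercises (.rmap_one, .arg, .anon_lock, .invalid_vma), a client of the new walker looks roughly like this; my_arg, my_count_one and my_count are invented names for illustration:

struct my_arg {
        int hits;               /* accumulated across all vmas mapping the page */
};

static int my_count_one(struct page *page, struct vm_area_struct *vma,
                        unsigned long address, void *arg)
{
        struct my_arg *ma = arg;

        ma->hits++;             /* per-vma work goes here */
        return SWAP_AGAIN;      /* anything else stops the walk early */
}

static int my_count(struct page *page)
{
        struct my_arg ma = { .hits = 0 };
        struct rmap_walk_control rwc = {
                .rmap_one  = my_count_one,
                .arg       = &ma,
                .anon_lock = page_lock_anon_vma_read,
        };

        rmap_walk(page, &rwc);  /* file pages are expected to be locked */
        return ma.hits;
}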
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
904 837
905 pte_unmap_unlock(pte, ptl); 838 pte_unmap_unlock(pte, ptl);
906 839
907 if (ret) 840 if (ret) {
908 mmu_notifier_invalidate_page(mm, address); 841 mmu_notifier_invalidate_page(mm, address);
842 (*cleaned)++;
843 }
909out: 844out:
910 return ret; 845 return SWAP_AGAIN;
911} 846}
912 847
913static int page_mkclean_file(struct address_space *mapping, struct page *page) 848static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
914{ 849{
915 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 850 if (vma->vm_flags & VM_SHARED)
916 struct vm_area_struct *vma; 851 return 0;
917 int ret = 0;
918
919 BUG_ON(PageAnon(page));
920 852
921 mutex_lock(&mapping->i_mmap_mutex); 853 return 1;
922 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
923 if (vma->vm_flags & VM_SHARED) {
924 unsigned long address = vma_address(page, vma);
925 ret += page_mkclean_one(page, vma, address);
926 }
927 }
928 mutex_unlock(&mapping->i_mmap_mutex);
929 return ret;
930} 854}
931 855
932int page_mkclean(struct page *page) 856int page_mkclean(struct page *page)
933{ 857{
934 int ret = 0; 858 int cleaned = 0;
859 struct address_space *mapping;
860 struct rmap_walk_control rwc = {
861 .arg = (void *)&cleaned,
862 .rmap_one = page_mkclean_one,
863 .invalid_vma = invalid_mkclean_vma,
864 };
935 865
936 BUG_ON(!PageLocked(page)); 866 BUG_ON(!PageLocked(page));
937 867
938 if (page_mapped(page)) { 868 if (!page_mapped(page))
939 struct address_space *mapping = page_mapping(page); 869 return 0;
940 if (mapping)
941 ret = page_mkclean_file(mapping, page);
942 }
943 870
944 return ret; 871 mapping = page_mapping(page);
872 if (!mapping)
873 return 0;
874
875 rmap_walk(page, &rwc);
876
877 return cleaned;
945} 878}
946EXPORT_SYMBOL_GPL(page_mkclean); 879EXPORT_SYMBOL_GPL(page_mkclean);
947 880
@@ -1177,17 +1110,17 @@ out:
1177} 1110}
1178 1111
1179/* 1112/*
1180 * Subfunctions of try_to_unmap: try_to_unmap_one called 1113 * @arg: enum ttu_flags will be passed to this argument
1181 * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
1182 */ 1114 */
1183int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1115int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1184 unsigned long address, enum ttu_flags flags) 1116 unsigned long address, void *arg)
1185{ 1117{
1186 struct mm_struct *mm = vma->vm_mm; 1118 struct mm_struct *mm = vma->vm_mm;
1187 pte_t *pte; 1119 pte_t *pte;
1188 pte_t pteval; 1120 pte_t pteval;
1189 spinlock_t *ptl; 1121 spinlock_t *ptl;
1190 int ret = SWAP_AGAIN; 1122 int ret = SWAP_AGAIN;
1123 enum ttu_flags flags = (enum ttu_flags)arg;
1191 1124
1192 pte = page_check_address(page, mm, address, &ptl, 0); 1125 pte = page_check_address(page, mm, address, &ptl, 0);
1193 if (!pte) 1126 if (!pte)
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1426 return ret; 1359 return ret;
1427} 1360}
1428 1361
1429bool is_vma_temporary_stack(struct vm_area_struct *vma) 1362static int try_to_unmap_nonlinear(struct page *page,
1430{ 1363 struct address_space *mapping, struct vm_area_struct *vma)
1431 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1432
1433 if (!maybe_stack)
1434 return false;
1435
1436 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1437 VM_STACK_INCOMPLETE_SETUP)
1438 return true;
1439
1440 return false;
1441}
1442
1443/**
1444 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1445 * rmap method
1446 * @page: the page to unmap/unlock
1447 * @flags: action and flags
1448 *
1449 * Find all the mappings of a page using the mapping pointer and the vma chains
1450 * contained in the anon_vma struct it points to.
1451 *
1452 * This function is only called from try_to_unmap/try_to_munlock for
1453 * anonymous pages.
1454 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1455 * where the page was found will be held for write. So, we won't recheck
1456 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1457 * 'LOCKED.
1458 */
1459static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1460{
1461 struct anon_vma *anon_vma;
1462 pgoff_t pgoff;
1463 struct anon_vma_chain *avc;
1464 int ret = SWAP_AGAIN;
1465
1466 anon_vma = page_lock_anon_vma_read(page);
1467 if (!anon_vma)
1468 return ret;
1469
1470 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1471 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1472 struct vm_area_struct *vma = avc->vma;
1473 unsigned long address;
1474
1475 /*
1476 * During exec, a temporary VMA is setup and later moved.
1477 * The VMA is moved under the anon_vma lock but not the
1478 * page tables leading to a race where migration cannot
1479 * find the migration ptes. Rather than increasing the
1480 * locking requirements of exec(), migration skips
1481 * temporary VMAs until after exec() completes.
1482 */
1483 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1484 is_vma_temporary_stack(vma))
1485 continue;
1486
1487 address = vma_address(page, vma);
1488 ret = try_to_unmap_one(page, vma, address, flags);
1489 if (ret != SWAP_AGAIN || !page_mapped(page))
1490 break;
1491 }
1492
1493 page_unlock_anon_vma_read(anon_vma);
1494 return ret;
1495}
1496
1497/**
1498 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1499 * @page: the page to unmap/unlock
1500 * @flags: action and flags
1501 *
1502 * Find all the mappings of a page using the mapping pointer and the vma chains
1503 * contained in the address_space struct it points to.
1504 *
1505 * This function is only called from try_to_unmap/try_to_munlock for
1506 * object-based pages.
1507 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1508 * where the page was found will be held for write. So, we won't recheck
1509 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1510 * 'LOCKED.
1511 */
1512static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1513{ 1364{
1514 struct address_space *mapping = page->mapping;
1515 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1516 struct vm_area_struct *vma;
1517 int ret = SWAP_AGAIN; 1365 int ret = SWAP_AGAIN;
1518 unsigned long cursor; 1366 unsigned long cursor;
1519 unsigned long max_nl_cursor = 0; 1367 unsigned long max_nl_cursor = 0;
1520 unsigned long max_nl_size = 0; 1368 unsigned long max_nl_size = 0;
1521 unsigned int mapcount; 1369 unsigned int mapcount;
1522 1370
1523 if (PageHuge(page)) 1371 list_for_each_entry(vma,
1524 pgoff = page->index << compound_order(page); 1372 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1525 1373
1526 mutex_lock(&mapping->i_mmap_mutex);
1527 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1528 unsigned long address = vma_address(page, vma);
1529 ret = try_to_unmap_one(page, vma, address, flags);
1530 if (ret != SWAP_AGAIN || !page_mapped(page))
1531 goto out;
1532 }
1533
1534 if (list_empty(&mapping->i_mmap_nonlinear))
1535 goto out;
1536
1537 /*
1538 * We don't bother to try to find the munlocked page in nonlinears.
1539 * It's costly. Instead, later, page reclaim logic may call
1540 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1541 */
1542 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1543 goto out;
1544
1545 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1546 shared.nonlinear) {
1547 cursor = (unsigned long) vma->vm_private_data; 1374 cursor = (unsigned long) vma->vm_private_data;
1548 if (cursor > max_nl_cursor) 1375 if (cursor > max_nl_cursor)
1549 max_nl_cursor = cursor; 1376 max_nl_cursor = cursor;
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1553 } 1380 }
1554 1381
1555 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ 1382 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
1556 ret = SWAP_FAIL; 1383 return SWAP_FAIL;
1557 goto out;
1558 } 1384 }
1559 1385
1560 /* 1386 /*
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1566 */ 1392 */
1567 mapcount = page_mapcount(page); 1393 mapcount = page_mapcount(page);
1568 if (!mapcount) 1394 if (!mapcount)
1569 goto out; 1395 return ret;
1396
1570 cond_resched(); 1397 cond_resched();
1571 1398
1572 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1399 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1574 max_nl_cursor = CLUSTER_SIZE; 1401 max_nl_cursor = CLUSTER_SIZE;
1575 1402
1576 do { 1403 do {
1577 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1404 list_for_each_entry(vma,
1578 shared.nonlinear) { 1405 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1406
1579 cursor = (unsigned long) vma->vm_private_data; 1407 cursor = (unsigned long) vma->vm_private_data;
1580 while ( cursor < max_nl_cursor && 1408 while (cursor < max_nl_cursor &&
1581 cursor < vma->vm_end - vma->vm_start) { 1409 cursor < vma->vm_end - vma->vm_start) {
1582 if (try_to_unmap_cluster(cursor, &mapcount, 1410 if (try_to_unmap_cluster(cursor, &mapcount,
1583 vma, page) == SWAP_MLOCK) 1411 vma, page) == SWAP_MLOCK)
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1585 cursor += CLUSTER_SIZE; 1413 cursor += CLUSTER_SIZE;
1586 vma->vm_private_data = (void *) cursor; 1414 vma->vm_private_data = (void *) cursor;
1587 if ((int)mapcount <= 0) 1415 if ((int)mapcount <= 0)
1588 goto out; 1416 return ret;
1589 } 1417 }
1590 vma->vm_private_data = (void *) max_nl_cursor; 1418 vma->vm_private_data = (void *) max_nl_cursor;
1591 } 1419 }
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1600 */ 1428 */
1601 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) 1429 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1602 vma->vm_private_data = NULL; 1430 vma->vm_private_data = NULL;
1603out: 1431
1604 mutex_unlock(&mapping->i_mmap_mutex);
1605 return ret; 1432 return ret;
1606} 1433}
1607 1434
1435bool is_vma_temporary_stack(struct vm_area_struct *vma)
1436{
1437 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1438
1439 if (!maybe_stack)
1440 return false;
1441
1442 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1443 VM_STACK_INCOMPLETE_SETUP)
1444 return true;
1445
1446 return false;
1447}
1448
1449static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1450{
1451 return is_vma_temporary_stack(vma);
1452}
1453
1454static int page_not_mapped(struct page *page)
1455{
1456 return !page_mapped(page);
1457};
1458
1608/** 1459/**
1609 * try_to_unmap - try to remove all page table mappings to a page 1460 * try_to_unmap - try to remove all page table mappings to a page
1610 * @page: the page to get unmapped 1461 * @page: the page to get unmapped
@@ -1622,16 +1473,29 @@ out:
1622int try_to_unmap(struct page *page, enum ttu_flags flags) 1473int try_to_unmap(struct page *page, enum ttu_flags flags)
1623{ 1474{
1624 int ret; 1475 int ret;
1476 struct rmap_walk_control rwc = {
1477 .rmap_one = try_to_unmap_one,
1478 .arg = (void *)flags,
1479 .done = page_not_mapped,
1480 .file_nonlinear = try_to_unmap_nonlinear,
1481 .anon_lock = page_lock_anon_vma_read,
1482 };
1625 1483
1626 BUG_ON(!PageLocked(page));
1627 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); 1484 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1628 1485
1629 if (unlikely(PageKsm(page))) 1486 /*
1630 ret = try_to_unmap_ksm(page, flags); 1487 * During exec, a temporary VMA is setup and later moved.
1631 else if (PageAnon(page)) 1488 * The VMA is moved under the anon_vma lock but not the
1632 ret = try_to_unmap_anon(page, flags); 1489 * page tables leading to a race where migration cannot
1633 else 1490 * find the migration ptes. Rather than increasing the
1634 ret = try_to_unmap_file(page, flags); 1491 * locking requirements of exec(), migration skips
1492 * temporary VMAs until after exec() completes.
1493 */
1494 if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page))
1495 rwc.invalid_vma = invalid_migration_vma;
1496
1497 ret = rmap_walk(page, &rwc);
1498
1635 if (ret != SWAP_MLOCK && !page_mapped(page)) 1499 if (ret != SWAP_MLOCK && !page_mapped(page))
1636 ret = SWAP_SUCCESS; 1500 ret = SWAP_SUCCESS;
1637 return ret; 1501 return ret;
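try_to_unmap() now smuggles its ttu_flags through the walk control: the enum is cast to void * for .arg and recovered inside try_to_unmap_one(), .done = page_not_mapped stops the walk as soon as no mappings remain, and .invalid_vma is only installed for the migration case. The scalar-through-void-pointer idiom in isolation, with hypothetical names (my_action, my_action_one, my_walk) and an extra (unsigned long) hop to keep the casts size-clean:

enum my_action { MY_COUNT, MY_CLEAR };

static int my_action_one(struct page *page, struct vm_area_struct *vma,
                         unsigned long address, void *arg)
{
        enum my_action act = (enum my_action)(unsigned long)arg;

        if (act == MY_CLEAR) {
                /* the "clear" variant's per-pte work would go here */
        }
        return SWAP_AGAIN;
}

static int my_page_not_mapped(struct page *page)
{
        return !page_mapped(page);      /* same early-stop test the patch uses */
}

static int my_walk(struct page *page, enum my_action act)
{
        struct rmap_walk_control rwc = {
                .rmap_one  = my_action_one,
                .arg       = (void *)(unsigned long)act,
                .done      = my_page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,
        };

        return rmap_walk(page, &rwc);
}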
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1654 */ 1518 */
1655int try_to_munlock(struct page *page) 1519int try_to_munlock(struct page *page)
1656{ 1520{
1521 int ret;
1522 struct rmap_walk_control rwc = {
1523 .rmap_one = try_to_unmap_one,
1524 .arg = (void *)TTU_MUNLOCK,
1525 .done = page_not_mapped,
1526 /*
1527 * We don't bother to try to find the munlocked page in
1528 * nonlinears. It's costly. Instead, later, page reclaim logic
1529 * may call try_to_unmap() and recover PG_mlocked lazily.
1530 */
1531 .file_nonlinear = NULL,
1532 .anon_lock = page_lock_anon_vma_read,
1533
1534 };
1535
1657 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1536 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1658 1537
1659 if (unlikely(PageKsm(page))) 1538 ret = rmap_walk(page, &rwc);
1660 return try_to_unmap_ksm(page, TTU_MUNLOCK); 1539 return ret;
1661 else if (PageAnon(page))
1662 return try_to_unmap_anon(page, TTU_MUNLOCK);
1663 else
1664 return try_to_unmap_file(page, TTU_MUNLOCK);
1665} 1540}
1666 1541
1667void __put_anon_vma(struct anon_vma *anon_vma) 1542void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma)
1674 anon_vma_free(anon_vma); 1549 anon_vma_free(anon_vma);
1675} 1550}
1676 1551
1677#ifdef CONFIG_MIGRATION 1552static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1678/* 1553 struct rmap_walk_control *rwc)
1679 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
1680 * Called by migrate.c to remove migration ptes, but might be used more later.
1681 */
1682static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1683 struct vm_area_struct *, unsigned long, void *), void *arg)
1684{ 1554{
1685 struct anon_vma *anon_vma; 1555 struct anon_vma *anon_vma;
1686 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1556
1687 struct anon_vma_chain *avc; 1557 if (rwc->anon_lock)
1688 int ret = SWAP_AGAIN; 1558 return rwc->anon_lock(page);
1689 1559
1690 /* 1560 /*
1691 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() 1561 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1695 */ 1565 */
1696 anon_vma = page_anon_vma(page); 1566 anon_vma = page_anon_vma(page);
1697 if (!anon_vma) 1567 if (!anon_vma)
1698 return ret; 1568 return NULL;
1569
1699 anon_vma_lock_read(anon_vma); 1570 anon_vma_lock_read(anon_vma);
1571 return anon_vma;
1572}
1573
1574/*
1575 * rmap_walk_anon - do something to anonymous page using the object-based
1576 * rmap method
1577 * @page: the page to be handled
1578 * @rwc: control variable according to each walk type
1579 *
1580 * Find all the mappings of a page using the mapping pointer and the vma chains
1581 * contained in the anon_vma struct it points to.
1582 *
1583 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1584 * where the page was found will be held for write. So, we won't recheck
1585 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1586 * LOCKED.
1587 */
1588static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1589{
1590 struct anon_vma *anon_vma;
1591 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1592 struct anon_vma_chain *avc;
1593 int ret = SWAP_AGAIN;
1594
1595 anon_vma = rmap_walk_anon_lock(page, rwc);
1596 if (!anon_vma)
1597 return ret;
1598
1700 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1599 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1701 struct vm_area_struct *vma = avc->vma; 1600 struct vm_area_struct *vma = avc->vma;
1702 unsigned long address = vma_address(page, vma); 1601 unsigned long address = vma_address(page, vma);
1703 ret = rmap_one(page, vma, address, arg); 1602
1603 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1604 continue;
1605
1606 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1704 if (ret != SWAP_AGAIN) 1607 if (ret != SWAP_AGAIN)
1705 break; 1608 break;
1609 if (rwc->done && rwc->done(page))
1610 break;
1706 } 1611 }
1707 anon_vma_unlock_read(anon_vma); 1612 anon_vma_unlock_read(anon_vma);
1708 return ret; 1613 return ret;
1709} 1614}
1710 1615
1711static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, 1616/*
1712 struct vm_area_struct *, unsigned long, void *), void *arg) 1617 * rmap_walk_file - do something to file page using the object-based rmap method
1618 * @page: the page to be handled
1619 * @rwc: control variable according to each walk type
1620 *
1621 * Find all the mappings of a page using the mapping pointer and the vma chains
1622 * contained in the address_space struct it points to.
1623 *
1624 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1625 * where the page was found will be held for write. So, we won't recheck
1626 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1627 * LOCKED.
1628 */
1629static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1713{ 1630{
1714 struct address_space *mapping = page->mapping; 1631 struct address_space *mapping = page->mapping;
1715 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1632 pgoff_t pgoff = page->index << compound_order(page);
1716 struct vm_area_struct *vma; 1633 struct vm_area_struct *vma;
1717 int ret = SWAP_AGAIN; 1634 int ret = SWAP_AGAIN;
1718 1635
1636 /*
1637 * The page lock not only makes sure that page->mapping cannot
1638 * suddenly be NULLified by truncation, it makes sure that the
1639 * structure at mapping cannot be freed and reused yet,
1640 * so we can safely take mapping->i_mmap_mutex.
1641 */
1642 VM_BUG_ON(!PageLocked(page));
1643
1719 if (!mapping) 1644 if (!mapping)
1720 return ret; 1645 return ret;
1721 mutex_lock(&mapping->i_mmap_mutex); 1646 mutex_lock(&mapping->i_mmap_mutex);
1722 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1647 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1723 unsigned long address = vma_address(page, vma); 1648 unsigned long address = vma_address(page, vma);
1724 ret = rmap_one(page, vma, address, arg); 1649
1650 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1651 continue;
1652
1653 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1725 if (ret != SWAP_AGAIN) 1654 if (ret != SWAP_AGAIN)
1726 break; 1655 goto done;
1656 if (rwc->done && rwc->done(page))
1657 goto done;
1727 } 1658 }
1728 /* 1659
1729 * No nonlinear handling: being always shared, nonlinear vmas 1660 if (!rwc->file_nonlinear)
1730 * never contain migration ptes. Decide what to do about this 1661 goto done;
1731 * limitation to linear when we need rmap_walk() on nonlinear. 1662
1732 */ 1663 if (list_empty(&mapping->i_mmap_nonlinear))
1664 goto done;
1665
1666 ret = rwc->file_nonlinear(page, mapping, vma);
1667
1668done:
1733 mutex_unlock(&mapping->i_mmap_mutex); 1669 mutex_unlock(&mapping->i_mmap_mutex);
1734 return ret; 1670 return ret;
1735} 1671}
1736 1672
1737int rmap_walk(struct page *page, int (*rmap_one)(struct page *, 1673int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
1738 struct vm_area_struct *, unsigned long, void *), void *arg)
1739{ 1674{
1740 VM_BUG_ON(!PageLocked(page));
1741
1742 if (unlikely(PageKsm(page))) 1675 if (unlikely(PageKsm(page)))
1743 return rmap_walk_ksm(page, rmap_one, arg); 1676 return rmap_walk_ksm(page, rwc);
1744 else if (PageAnon(page)) 1677 else if (PageAnon(page))
1745 return rmap_walk_anon(page, rmap_one, arg); 1678 return rmap_walk_anon(page, rwc);
1746 else 1679 else
1747 return rmap_walk_file(page, rmap_one, arg); 1680 return rmap_walk_file(page, rwc);
1748} 1681}
1749#endif /* CONFIG_MIGRATION */
1750 1682
1751#ifdef CONFIG_HUGETLB_PAGE 1683#ifdef CONFIG_HUGETLB_PAGE
1752/* 1684/*
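With the helpers above, rmap_walk() becomes the single reverse-map traversal: it dispatches to rmap_walk_ksm(), rmap_walk_anon() or rmap_walk_file() by page type, and every client (page_referenced, page_mkclean, try_to_unmap, try_to_munlock, migration) supplies the same hooks instead of open-coding the anon and file interval-tree loops. The design is the familiar control-structure-plus-callbacks generalisation; a self-contained userspace C toy showing the same shape (none of these names exist in the kernel):

#include <stdio.h>

struct item { int id; int shared; };

struct walk_control {
        int  (*visit)(struct item *it, void *arg);  /* plays the role of .rmap_one    */
        int  (*skip)(struct item *it, void *arg);   /* plays the role of .invalid_vma */
        int  (*done)(void *arg);                    /* plays the role of .done        */
        void *arg;
};

static void walk(struct item *items, int n, struct walk_control *wc)
{
        for (int i = 0; i < n; i++) {
                if (wc->skip && wc->skip(&items[i], wc->arg))
                        continue;               /* filtered out before the callback */
                if (wc->visit(&items[i], wc->arg))
                        break;                  /* non-zero return stops the walk */
                if (wc->done && wc->done(wc->arg))
                        break;                  /* client says it has seen enough */
        }
}

static int count_shared(struct item *it, void *arg) { ++*(int *)arg; return 0; }
static int skip_private(struct item *it, void *arg) { return !it->shared; }

int main(void)
{
        struct item v[] = { { 1, 1 }, { 2, 0 }, { 3, 1 } };
        int hits = 0;
        struct walk_control wc = {
                .visit = count_shared,
                .skip  = skip_private,
                .arg   = &hits,
        };

        walk(v, 3, &wc);
        printf("%d shared items\n", hits);      /* prints "2 shared items" */
        return 0;
}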
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 27eeab3be757..4cba9c2783a1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 40 unsigned long align,
41 unsigned long goal) 41 unsigned long goal)
42{ 42{
43 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); 43 return memblock_virt_alloc_try_nid(size, align, goal,
44 BOOTMEM_ALLOC_ACCESSIBLE, node);
44} 45}
45 46
46static void *vmemmap_buf; 47static void *vmemmap_buf;
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
226 227
227 if (vmemmap_buf_start) { 228 if (vmemmap_buf_start) {
228 /* need to free left buf */ 229 /* need to free left buf */
229 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 230 memblock_free_early(__pa(vmemmap_buf),
231 vmemmap_buf_end - vmemmap_buf);
230 vmemmap_buf = NULL; 232 vmemmap_buf = NULL;
231 vmemmap_buf_end = NULL; 233 vmemmap_buf_end = NULL;
232 } 234 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0e9590..63c3ea5c119c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
69 else 69 else
70 section = kzalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else { 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = memblock_virt_alloc_node(array_size, nid);
73 } 73 }
74 74
75 return section; 75 return section;
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
279 limit = goal + (1UL << PA_SECTION_SHIFT); 279 limit = goal + (1UL << PA_SECTION_SHIFT);
280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT); 280 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
281again: 281again:
282 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, 282 p = memblock_virt_alloc_try_nid_nopanic(size,
283 SMP_CACHE_BYTES, goal, limit); 283 SMP_CACHE_BYTES, goal, limit,
284 nid);
284 if (!p && limit) { 285 if (!p && limit) {
285 limit = 0; 286 limit = 0;
286 goto again; 287 goto again;
@@ -331,7 +332,7 @@ static unsigned long * __init
331sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 332sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
332 unsigned long size) 333 unsigned long size)
333{ 334{
334 return alloc_bootmem_node_nopanic(pgdat, size); 335 return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
335} 336}
336 337
337static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 338static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
376 return map; 377 return map;
377 378
378 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); 379 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
379 map = __alloc_bootmem_node_high(NODE_DATA(nid), size, 380 map = memblock_virt_alloc_try_nid(size,
380 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 381 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
382 BOOTMEM_ALLOC_ACCESSIBLE, nid);
381 return map; 383 return map;
382} 384}
383void __init sparse_mem_maps_populate_node(struct page **map_map, 385void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
401 } 403 }
402 404
403 size = PAGE_ALIGN(size); 405 size = PAGE_ALIGN(size);
404 map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, 406 map = memblock_virt_alloc_try_nid(size * map_count,
405 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 407 PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
408 BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
406 if (map) { 409 if (map) {
407 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 410 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
408 if (!present_section_nr(pnum)) 411 if (!present_section_nr(pnum))
@@ -545,7 +548,7 @@ void __init sparse_init(void)
545 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 548 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
546 */ 549 */
547 size = sizeof(unsigned long *) * NR_MEM_SECTIONS; 550 size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
548 usemap_map = alloc_bootmem(size); 551 usemap_map = memblock_virt_alloc(size, 0);
549 if (!usemap_map) 552 if (!usemap_map)
550 panic("can not allocate usemap_map\n"); 553 panic("can not allocate usemap_map\n");
551 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, 554 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
@@ -553,7 +556,7 @@ void __init sparse_init(void)
553 556
554#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 557#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
555 size2 = sizeof(struct page *) * NR_MEM_SECTIONS; 558 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
556 map_map = alloc_bootmem(size2); 559 map_map = memblock_virt_alloc(size2, 0);
557 if (!map_map) 560 if (!map_map)
558 panic("can not allocate map_map\n"); 561 panic("can not allocate map_map\n");
559 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, 562 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
@@ -583,9 +586,9 @@ void __init sparse_init(void)
583 vmemmap_populate_print_last(); 586 vmemmap_populate_print_last();
584 587
585#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 588#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
586 free_bootmem(__pa(map_map), size2); 589 memblock_free_early(__pa(map_map), size2);
587#endif 590#endif
588 free_bootmem(__pa(usemap_map), size); 591 memblock_free_early(__pa(usemap_map), size);
589} 592}
590 593
591#ifdef CONFIG_MEMORY_HOTPLUG 594#ifdef CONFIG_MEMORY_HOTPLUG
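sparse.c (and sparse-vmemmap.c just before it) move to the same memblock helpers; the one caller with a twist is sparse_early_usemaps_alloc_pgdat_section(), which first confines the allocation to [goal, goal + one section) and, on failure, retries with limit = 0, that is, anywhere accessible. A compact sketch of that try-then-relax pattern mirroring the hunk above; the wrapper name and section_size parameter are mine:

static void * __init alloc_near_goal_sketch(int nid, size_t size,
                                            phys_addr_t goal,
                                            phys_addr_t section_size)
{
        phys_addr_t limit = goal + section_size;
        void *p;

again:
        p = memblock_virt_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES,
                                                goal, limit, nid);
        if (!p && limit) {
                limit = 0;              /* drop the upper bound and retry */
                goto again;
        }
        return p;
}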
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aaabd03..d1100b619e61 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
35 34
36#include "internal.h" 35#include "internal.h"
37 36
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page)
82 81
83static void put_compound_page(struct page *page) 82static void put_compound_page(struct page *page)
84{ 83{
85 if (unlikely(PageTail(page))) { 84 struct page *page_head;
86 /* __split_huge_page_refcount can run under us */
87 struct page *page_head = compound_trans_head(page);
88
89 if (likely(page != page_head &&
90 get_page_unless_zero(page_head))) {
91 unsigned long flags;
92 85
86 if (likely(!PageTail(page))) {
87 if (put_page_testzero(page)) {
93 /* 88 /*
94 * THP can not break up slab pages so avoid taking 89 * By the time all refcounts have been released
95 * compound_lock(). Slab performs non-atomic bit ops 90 * split_huge_page cannot run anymore from under us.
96 * on page->flags for better performance. In particular
97 * slab_unlock() in slub used to be a hot path. It is
98 * still hot on arches that do not support
99 * this_cpu_cmpxchg_double().
100 */ 91 */
101 if (PageSlab(page_head) || PageHeadHuge(page_head)) { 92 if (PageHead(page))
102 if (likely(PageTail(page))) { 93 __put_compound_page(page);
103 /* 94 else
104 * __split_huge_page_refcount 95 __put_single_page(page);
105 * cannot race here. 96 }
106 */ 97 return;
107 VM_BUG_ON(!PageHead(page_head)); 98 }
108 atomic_dec(&page->_mapcount); 99
109 if (put_page_testzero(page_head)) 100 /* __split_huge_page_refcount can run under us */
110 VM_BUG_ON(1); 101 page_head = compound_trans_head(page);
111 if (put_page_testzero(page_head)) 102
112 __put_compound_page(page_head); 103 /*
113 return; 104 * THP can not break up slab pages so avoid taking
114 } else 105 * compound_lock() and skip the tail page refcounting (in
115 /* 106 * _mapcount) too. Slab performs non-atomic bit ops on
116 * __split_huge_page_refcount 107 * page->flags for better performance. In particular
117 * run before us, "page" was a 108 * slab_unlock() in slub used to be a hot path. It is still
118 * THP tail. The split 109 * hot on arches that do not support
119 * page_head has been freed 110 * this_cpu_cmpxchg_double().
120 * and reallocated as slab or 111 *
121 * hugetlbfs page of smaller 112 * If "page" is part of a slab or hugetlbfs page it cannot be
122 * order (only possible if 113 * splitted and the head page cannot change from under us. And
123 * reallocated as slab on 114 * if "page" is part of a THP page under splitting, if the
124 * x86). 115 * head page pointed by the THP tail isn't a THP head anymore,
125 */ 116 * we'll find PageTail clear after smp_rmb() and we'll treat
126 goto skip_lock; 117 * it as a single page.
127 } 118 */
119 if (!__compound_tail_refcounted(page_head)) {
120 /*
121 * If "page" is a THP tail, we must read the tail page
122 * flags after the head page flags. The
123 * split_huge_page side enforces write memory barriers
124 * between clearing PageTail and before the head page
125 * can be freed and reallocated.
126 */
127 smp_rmb();
128 if (likely(PageTail(page))) {
128 /* 129 /*
129 * page_head wasn't a dangling pointer but it 130 * __split_huge_page_refcount cannot race
130 * may not be a head page anymore by the time 131 * here.
131 * we obtain the lock. That is ok as long as it
132 * can't be freed from under us.
133 */ 132 */
134 flags = compound_lock_irqsave(page_head); 133 VM_BUG_ON(!PageHead(page_head));
135 if (unlikely(!PageTail(page))) { 134 VM_BUG_ON(page_mapcount(page) != 0);
136 /* __split_huge_page_refcount run before us */ 135 if (put_page_testzero(page_head)) {
137 compound_unlock_irqrestore(page_head, flags); 136 /*
138skip_lock: 137 * If this is the tail of a slab
139 if (put_page_testzero(page_head)) { 138 * compound page, the tail pin must
140 /* 139 * not be the last reference held on
141 * The head page may have been 140 * the page, because the PG_slab
142 * freed and reallocated as a 141 * cannot be cleared before all tail
143 * compound page of smaller 142 * pins (which skips the _mapcount
144 * order and then freed again. 143 * tail refcounting) have been
145 * All we know is that it 144 * released. For hugetlbfs the tail
146 * cannot have become: a THP 145 * pin may be the last reference on
147 * page, a compound page of 146 * the page instead, because
148 * higher order, a tail page. 147 * PageHeadHuge will not go away until
149 * That is because we still 148 * the compound page enters the buddy
150 * hold the refcount of the 149 * allocator.
151 * split THP tail and 150 */
152 * page_head was the THP head 151 VM_BUG_ON(PageSlab(page_head));
153 * before the split. 152 __put_compound_page(page_head);
154 */
155 if (PageHead(page_head))
156 __put_compound_page(page_head);
157 else
158 __put_single_page(page_head);
159 }
160out_put_single:
161 if (put_page_testzero(page))
162 __put_single_page(page);
163 return;
164 } 153 }
165 VM_BUG_ON(page_head != page->first_page); 154 return;
155 } else
166 /* 156 /*
167 * We can release the refcount taken by 157 * __split_huge_page_refcount run before us,
168 * get_page_unless_zero() now that 158 * "page" was a THP tail. The split page_head
169 * __split_huge_page_refcount() is blocked on 159 * has been freed and reallocated as slab or
170 * the compound_lock. 160 * hugetlbfs page of smaller order (only
161 * possible if reallocated as slab on x86).
171 */ 162 */
172 if (put_page_testzero(page_head)) 163 goto out_put_single;
173 VM_BUG_ON(1); 164 }
174 /* __split_huge_page_refcount will wait now */
175 VM_BUG_ON(page_mapcount(page) <= 0);
176 atomic_dec(&page->_mapcount);
177 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
178 VM_BUG_ON(atomic_read(&page->_count) != 0);
179 compound_unlock_irqrestore(page_head, flags);
180 165
166 if (likely(page != page_head && get_page_unless_zero(page_head))) {
167 unsigned long flags;
168
169 /*
170 * page_head wasn't a dangling pointer but it may not
171 * be a head page anymore by the time we obtain the
172 * lock. That is ok as long as it can't be freed from
173 * under us.
174 */
175 flags = compound_lock_irqsave(page_head);
176 if (unlikely(!PageTail(page))) {
177 /* __split_huge_page_refcount run before us */
178 compound_unlock_irqrestore(page_head, flags);
181 if (put_page_testzero(page_head)) { 179 if (put_page_testzero(page_head)) {
180 /*
181 * The head page may have been freed
182 * and reallocated as a compound page
183 * of smaller order and then freed
184 * again. All we know is that it
185 * cannot have become: a THP page, a
186 * compound page of higher order, a
187 * tail page. That is because we
188 * still hold the refcount of the
189 * split THP tail and page_head was
190 * the THP head before the split.
191 */
182 if (PageHead(page_head)) 192 if (PageHead(page_head))
183 __put_compound_page(page_head); 193 __put_compound_page(page_head);
184 else 194 else
185 __put_single_page(page_head); 195 __put_single_page(page_head);
186 } 196 }
187 } else { 197out_put_single:
188 /* page_head is a dangling pointer */ 198 if (put_page_testzero(page))
189 VM_BUG_ON(PageTail(page)); 199 __put_single_page(page);
190 goto out_put_single; 200 return;
191 } 201 }
192 } else if (put_page_testzero(page)) { 202 VM_BUG_ON(page_head != page->first_page);
193 if (PageHead(page)) 203 /*
194 __put_compound_page(page); 204 * We can release the refcount taken by
195 else 205 * get_page_unless_zero() now that
196 __put_single_page(page); 206 * __split_huge_page_refcount() is blocked on the
207 * compound_lock.
208 */
209 if (put_page_testzero(page_head))
210 VM_BUG_ON(1);
211 /* __split_huge_page_refcount will wait now */
212 VM_BUG_ON(page_mapcount(page) <= 0);
213 atomic_dec(&page->_mapcount);
214 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
215 VM_BUG_ON(atomic_read(&page->_count) != 0);
216 compound_unlock_irqrestore(page_head, flags);
217
218 if (put_page_testzero(page_head)) {
219 if (PageHead(page_head))
220 __put_compound_page(page_head);
221 else
222 __put_single_page(page_head);
223 }
224 } else {
225 /* page_head is a dangling pointer */
226 VM_BUG_ON(PageTail(page));
227 goto out_put_single;
197 } 228 }
198} 229}
199 230
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page)
          * split_huge_page().
          */
         unsigned long flags;
-        bool got = false;
+        bool got;
         struct page *page_head = compound_trans_head(page);
 
-        if (likely(page != page_head && get_page_unless_zero(page_head))) {
-                /* Ref to put_compound_page() comment. */
-                if (PageSlab(page_head) || PageHeadHuge(page_head)) {
+        /* Ref to put_compound_page() comment. */
+        if (!__compound_tail_refcounted(page_head)) {
+                smp_rmb();
                 if (likely(PageTail(page))) {
                         /*
                          * This is a hugetlbfs page or a slab
                          * page. __split_huge_page_refcount
                          * cannot race here.
                          */
                         VM_BUG_ON(!PageHead(page_head));
-                        __get_page_tail_foll(page, false);
+                        __get_page_tail_foll(page, true);
                         return true;
                 } else {
                         /*
                          * __split_huge_page_refcount run
                          * before us, "page" was a THP
                          * tail. The split page_head has been
                          * freed and reallocated as slab or
                          * hugetlbfs page of smaller order
                          * (only possible if reallocated as
                          * slab on x86).
                          */
-                        put_page(page_head);
-                        return false;
-                }
+                        return false;
                 }
+        }
 
+        got = false;
+        if (likely(page != page_head && get_page_unless_zero(page_head))) {
                 /*
                  * page_head wasn't a dangling pointer but it
                  * may not be a head page anymore by the time
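The restructured __get_page_tail() above keys off __compound_tail_refcounted(), which centralizes the PageSlab(page_head) || PageHeadHuge(page_head) test that the removed lines open-coded: tail-page refcounting in _mapcount is only used for compound pages that are neither slab nor hugetlbfs, i.e. THP. A minimal sketch of the helper as it is expected to read in include/linux/mm.h in this series (name taken from the hunk, body inferred from the check it replaces):

/* True if tail pages of this compound page carry their own pin count
 * in _mapcount; slab and hugetlbfs heads opt out of that scheme. */
static inline bool __compound_tail_refcounted(struct page *page)
{
        return !PageSlab(page) && !PageHeadHuge(page);
}
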
diff --git a/mm/util.c b/mm/util.c
index 808f375648e7..a24aa22f2473 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page)
         return mapping;
 }
 
+int overcommit_ratio_handler(struct ctl_table *table, int write,
+                             void __user *buffer, size_t *lenp,
+                             loff_t *ppos)
+{
+        int ret;
+
+        ret = proc_dointvec(table, write, buffer, lenp, ppos);
+        if (ret == 0 && write)
+                sysctl_overcommit_kbytes = 0;
+        return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write,
+                              void __user *buffer, size_t *lenp,
+                              loff_t *ppos)
+{
+        int ret;
+
+        ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+        if (ret == 0 && write)
+                sysctl_overcommit_ratio = 0;
+        return ret;
+}
+
 /*
  * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
  */
 unsigned long vm_commit_limit(void)
 {
-        return ((totalram_pages - hugetlb_total_pages())
-                * sysctl_overcommit_ratio / 100) + total_swap_pages;
+        unsigned long allowed;
+
+        if (sysctl_overcommit_kbytes)
+                allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
+        else
+                allowed = ((totalram_pages - hugetlb_total_pages())
+                           * sysctl_overcommit_ratio / 100);
+        allowed += total_swap_pages;
+
+        return allowed;
 }
 
 
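With this change, CommitLimit is either the absolute vm.overcommit_kbytes value (converted from KiB to pages) or, when that is zero, the traditional vm.overcommit_ratio percentage of non-hugetlb RAM; swap is added on top in both cases, and each sysctl handler zeroes the other knob on write so only one policy is in effect at a time. A small userspace sketch of the same arithmetic, using made-up machine values and an assumed 4 KiB page size (illustration only, not kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

/* Mirror of the vm_commit_limit() arithmetic above; page counts and
 * the result are in pages, overcommit_kbytes is in KiB. */
static unsigned long commit_limit(unsigned long totalram_pages,
                                  unsigned long hugetlb_pages,
                                  unsigned long total_swap_pages,
                                  unsigned long overcommit_kbytes,
                                  unsigned long overcommit_ratio)
{
        unsigned long allowed;

        if (overcommit_kbytes)
                allowed = overcommit_kbytes >> (PAGE_SHIFT - 10);
        else
                allowed = (totalram_pages - hugetlb_pages)
                          * overcommit_ratio / 100;
        return allowed + total_swap_pages;
}

int main(void)
{
        /* 4 GiB RAM, no hugetlb, 2 GiB swap, ratio 50%, kbytes unset:
         * limit = 2 GiB + 2 GiB = 1048576 pages. */
        printf("%lu pages\n", commit_limit(1048576, 0, 524288, 0, 50));

        /* Same machine with vm.overcommit_kbytes = 1048576 (1 GiB):
         * the ratio is ignored, limit = 1 GiB + 2 GiB = 786432 pages. */
        printf("%lu pages\n", commit_limit(1048576, 0, 524288, 1048576, 50));
        return 0;
}
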
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0fdf96803c5b..e4f0db2a3eae 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x)
 }
 
 /*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the physical pfn it maps to.
  */
-struct page *vmalloc_to_page(const void *vmalloc_addr)
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
 {
         unsigned long addr = (unsigned long) vmalloc_addr;
-        struct page *page = NULL;
+        unsigned long pfn = 0;
         pgd_t *pgd = pgd_offset_k(addr);
 
         /*
@@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
                                 ptep = pte_offset_map(pmd, addr);
                                 pte = *ptep;
                                 if (pte_present(pte))
-                                        page = pte_page(pte);
+                                        pfn = pte_pfn(pte);
                                 pte_unmap(ptep);
                         }
                 }
         }
-        return page;
+        return pfn;
 }
-EXPORT_SYMBOL(vmalloc_to_page);
+EXPORT_SYMBOL(vmalloc_to_pfn);
 
 /*
- * Map a vmalloc()-space virtual address to the physical page frame number.
+ * Map a vmalloc()-space virtual address to the struct page.
  */
-unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
-        return page_to_pfn(vmalloc_to_page(vmalloc_addr));
+        return pfn_to_page(vmalloc_to_pfn(vmalloc_addr));
 }
-EXPORT_SYMBOL(vmalloc_to_pfn);
+EXPORT_SYMBOL(vmalloc_to_page);
 
 
 /*** Global kva allocator ***/
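After this swap the page-table walk lives in vmalloc_to_pfn() and vmalloc_to_page() becomes a thin pfn_to_page() wrapper around it, so callers that only need the frame number no longer take a detour through struct page. A hypothetical module snippet showing how the two exported helpers relate (demo code under assumed names, not part of the patch):

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static int __init vmap_lookup_demo_init(void)
{
        void *buf = vmalloc(PAGE_SIZE);

        if (!buf)
                return -ENOMEM;

        /* Both lookups walk the same kernel page tables; the second is
         * just pfn_to_page() applied to the first. */
        pr_info("vmalloc addr %p -> pfn %#lx, struct page %p\n",
                buf, vmalloc_to_pfn(buf), vmalloc_to_page(buf));

        vfree(buf);
        return 0;
}

static void __exit vmap_lookup_demo_exit(void)
{
}

module_init(vmap_lookup_demo_init);
module_exit(vmap_lookup_demo_exit);
MODULE_LICENSE("GPL");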