commit    6c0f568e84a3cfc775682311d65205462c3f3bc1 (patch)
tree      5105a137a9ea2459d55e895d3c096bbd31274724
parent    c82199061009d1561e31e17fca5e47a87cb7ff4c (diff)
parent    559ec2f8fd50981821621f52db5e1a8ffcf8d792 (diff)
author    Linus Torvalds <torvalds@linux-foundation.org>  2015-09-05 17:27:38 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-09-05 17:27:38 -0400
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton:
- a few misc things
- Andy's "ambient capabilities"
- fs/notify updates
- the ocfs2 queue
- kernel/watchdog.c updates and feature work.
- some of MM. Includes Andrea's userfaultfd feature.
[ Hadn't noticed that userfaultfd was 'default y' when applying the
patches, so that got fixed in this merge instead. We do _not_ mark
new features that nobody uses yet 'default y' - Linus ]
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
mm/hugetlb.c: make vma_has_reserves() return bool
mm/madvise.c: make madvise_behaviour_valid() return bool
mm/memory.c: make tlb_next_batch() return bool
mm/dmapool.c: change is_page_busy() return from int to bool
mm: remove struct node_active_region
mremap: simplify the "overlap" check in mremap_to()
mremap: don't do unnecessary checks if new_len == old_len
mremap: don't do mm_populate(new_addr) on failure
mm: move ->mremap() from file_operations to vm_operations_struct
mremap: don't leak new_vma if f_op->mremap() fails
mm/hugetlb.c: make vma_shareable() return bool
mm: make GUP handle pfn mapping unless FOLL_GET is requested
mm: fix status code which move_pages() returns for zero page
mm: memcontrol: bring back the VM_BUG_ON() in mem_cgroup_swapout()
genalloc: add support of multiple gen_pools per device
genalloc: add name arg to gen_pool_get() and devm_gen_pool_create()
mm/memblock: WARN_ON when nid differs from overlap region
Documentation/features/vm: add feature description and arch support status for batched TLB flush after unmap
mm: defer flush of writable TLB entries
mm: send one IPI per CPU to TLB flush all entries after unmapping pages
...
143 files changed, 5174 insertions, 1062 deletions
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
new file mode 100644
index 000000000000..261b92e2fb1a
--- /dev/null
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -0,0 +1,40 @@
+#
+# Feature name:          batch-unmap-tlb-flush
+# Kconfig:       ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+# description:          arch supports deferral of TLB flush until multiple pages are unmapped
+#
+    -----------------------
+    |         arch |status|
+    -----------------------
+    |       alpha: | TODO |
+    |         arc: | TODO |
+    |         arm: | TODO |
+    |       arm64: | TODO |
+    |       avr32: |  ..  |
+    |    blackfin: | TODO |
+    |         c6x: |  ..  |
+    |        cris: |  ..  |
+    |         frv: |  ..  |
+    |       h8300: |  ..  |
+    |     hexagon: | TODO |
+    |        ia64: | TODO |
+    |        m32r: | TODO |
+    |        m68k: |  ..  |
+    |       metag: | TODO |
+    |  microblaze: |  ..  |
+    |        mips: | TODO |
+    |     mn10300: | TODO |
+    |       nios2: |  ..  |
+    |    openrisc: |  ..  |
+    |      parisc: | TODO |
+    |     powerpc: | TODO |
+    |        s390: | TODO |
+    |       score: |  ..  |
+    |          sh: | TODO |
+    |       sparc: | TODO |
+    |        tile: | TODO |
+    |          um: |  ..  |
+    |   unicore32: |  ..  |
+    |         x86: |  ok  |
+    |      xtensa: | TODO |
+    -----------------------
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 64df08db4657..39ac6546d4a4 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -303,6 +303,7 @@ Code  Seq#(hex)	Include File		Comments
 0xA3	80-8F	Port ACL		in development:
 					<mailto:tlewis@mindspring.com>
 0xA3	90-9F	linux/dtlk.h
+0xAA	00-3F	linux/uapi/linux/userfaultfd.h
 0xAB	00-1F	linux/nbd.h
 0xAC	00-1F	linux/raw.h
 0xAD	00	Netfilter device	in development:
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
new file mode 100644
index 000000000000..70a3c94d1941
--- /dev/null
+++ b/Documentation/vm/userfaultfd.txt
@@ -0,0 +1,144 @@
+= Userfaultfd =
+
+== Objective ==
+
+Userfaults allow the implementation of on-demand paging from userland
+and, more generally, they allow userland to take control of various
+memory page faults, something otherwise only the kernel code could do.
+
+For example, userfaults allow a proper and more optimal implementation
+of the PROT_NONE+SIGSEGV trick.
+
+== Design ==
+
+Userfaults are delivered and resolved through the userfaultfd syscall.
+
+The userfaultfd (aside from registering and unregistering virtual
+memory ranges) provides two primary functionalities:
+
+1) a read/POLLIN protocol to notify a userland thread of the faults
+   happening
+
+2) various UFFDIO_* ioctls that can manage the virtual memory regions
+   registered in the userfaultfd, allowing userland to efficiently
+   resolve the userfaults it receives via 1) or to manage the virtual
+   memory in the background
+
+The real advantage of userfaults compared to regular virtual memory
+management via mremap/mprotect is that userfaults in all their
+operations never involve heavyweight structures like vmas (in fact the
+userfaultfd runtime load never takes the mmap_sem for writing).
+
+Vmas are not suitable for page- (or hugepage-) granular fault tracking
+when dealing with virtual address spaces that could span
+terabytes. Too many vmas would be needed for that.
+
+The userfaultfd, once opened by invoking the syscall, can also be
+passed over unix domain sockets to a manager process, so the same
+manager process could handle the userfaults of a multitude of
+different processes without them being aware of what is going on
+(unless, of course, they later try to use the userfaultfd themselves
+on the same region the manager is already tracking, which is a corner
+case that would currently return -EBUSY).
+
+== API ==
+
+When first opened, the userfaultfd must be enabled by invoking the
+UFFDIO_API ioctl with a uffdio_api.api value set to UFFD_API (or a
+later API version), which specifies the read/POLLIN protocol userland
+intends to speak on the UFFD and the uffdio_api.features userland
+requires. If successful (i.e. if the requested uffdio_api.api is
+spoken also by the running kernel and the requested features are
+going to be enabled), the UFFDIO_API ioctl will return into
+uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of,
+respectively, all the available features of the read(2) protocol and
+the generic ioctls available.
+
+Once the userfaultfd has been enabled, the UFFDIO_REGISTER ioctl
+should be invoked (if present in the returned uffdio_api.ioctls
+bitmask) to register a memory range in the userfaultfd by setting the
+uffdio_register structure accordingly. The uffdio_register.mode
+bitmask specifies to the kernel which kind of faults to track for
+the range (UFFDIO_REGISTER_MODE_MISSING would track missing
+pages). The UFFDIO_REGISTER ioctl will return the
+uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
+userfaults on the range registered. Not all ioctls will necessarily be
+supported for all memory types, depending on the underlying virtual
+memory backend (anonymous memory vs tmpfs vs real filebacked
+mappings).
+
+Userland can use the uffdio_register.ioctls to manage the virtual
+address space in the background (to add or potentially also remove
+memory from the userfaultfd registered range). This means a userfault
+could trigger just before userland maps the user-faulted page in the
+background.
+
+The primary ioctl to resolve userfaults is UFFDIO_COPY. It atomically
+copies a page into the userfault registered range and wakes up the
+blocked userfaults (unless uffdio_copy.mode &
+UFFDIO_COPY_MODE_DONTWAKE is set). The other ioctls work similarly to
+UFFDIO_COPY. They're atomic in the sense that nothing can ever see a
+half-copied page, since it'll keep userfaulting until the copy has
+finished.
+
+== QEMU/KVM ==
+
+QEMU/KVM is using the userfaultfd syscall to implement postcopy live
+migration. Postcopy live migration is one form of memory
+externalization consisting of a virtual machine running with part or
+all of its memory residing on a different node in the cloud. The
+userfaultfd abstraction is generic enough that not a single line of
+KVM kernel code had to be modified in order to add postcopy live
+migration to QEMU.
+
+Guest async page faults, FOLL_NOWAIT and all other GUP features work
+just fine in combination with userfaults. Userfaults trigger async
+page faults in the guest scheduler, so guest processes that aren't
+waiting for userfaults (i.e. network bound) can keep running in the
+guest vcpus.
+
+It is generally beneficial to run one pass of precopy live migration
+just before starting postcopy live migration, in order to avoid
+generating userfaults for readonly guest regions.
+
+The implementation of postcopy live migration currently uses one
+single bidirectional socket, but in the future two different sockets
+will be used (to reduce the latency of the userfaults to the minimum
+possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
+
+The QEMU in the source node writes all pages that it knows are missing
+in the destination node into the socket, and the migration thread of
+the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
+ioctls on the userfaultfd in order to map the received pages into the
+guest (UFFDIO_ZEROPAGE is used if the source page was a zero page).
+
+A different postcopy thread in the destination node listens with
+poll() on the userfaultfd in parallel. When a POLLIN event is
+generated after a userfault triggers, the postcopy thread read()s from
+the userfaultfd and receives the fault address (or -EAGAIN in case the
+userfault was already resolved and woken by a UFFDIO_COPY|ZEROPAGE run
+by the parallel QEMU migration thread).
+
+After the QEMU postcopy thread (running in the destination node) gets
+the userfault address, it writes the information about the missing
+page into the socket. The QEMU source node receives the information,
+roughly "seeks" to that page address and continues sending all
+remaining missing pages from that new page offset. Soon after that
+(just the time to flush the tcp_wmem queue through the network) the
+migration thread in the QEMU running in the destination node will
+receive the page that triggered the userfault and will map it as
+usual with UFFDIO_COPY|ZEROPAGE (without actually knowing if it
+was spontaneously sent by the source or if it was an urgent page
+requested through a userfault).
+
+By the time the userfaults start, the QEMU in the destination node
+doesn't need to keep any per-page state bitmap for the live migration
+around; only a single per-page bitmap has to be maintained in the
+QEMU running in the source node, to know which pages are still
+missing in the destination node. The bitmap in the source node is
+checked to find which missing pages to send in round robin, and we
+seek over it when receiving incoming userfaults. After sending each
+page the bitmap is of course updated accordingly. This is also useful
+to avoid sending the same page twice (in case the userfault is read
+by the postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the
+migration thread).
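To make the API section above concrete, here is a minimal userland sketch of the handshake/register/poll/read/UFFDIO_COPY cycle it describes. This is an illustration, not a hardened implementation: error checking is largely omitted, __NR_userfaultfd must be provided by the installed headers (323 on x86-64 per the syscall table change below), and it must be built with -pthread on a kernel with CONFIG_USERFAULTFD.

#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static long page;
static char *area;

static void *toucher(void *arg)
{
	/* First read of the registered page blocks until UFFDIO_COPY. */
	printf("faulting thread read: %c\n", area[0]);
	return NULL;
}

int main(void)
{
	page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);

	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);		/* handshake: kernel fills api.features/api.ioctls */

	area = mmap(NULL, page, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);	/* track missing pages in [area, area+page) */

	pthread_t t;
	pthread_create(&t, NULL, toucher, NULL);

	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	poll(&pfd, 1, -1);			/* wait for the fault notification */

	struct uffd_msg msg;
	read(uffd, &msg, sizeof(msg));		/* msg.arg.pagefault.address is the faulting address */

	char *src = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 'A', page);
	struct uffdio_copy copy = {
		.dst = msg.arg.pagefault.address & ~((__u64)page - 1),
		.src = (unsigned long)src,
		.len = page,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);	/* atomically map the page and wake the faulter */

	pthread_join(t, NULL);
	return 0;
}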
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 265ffeb2037e..80e277cfcc8b 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void)
 		return;
 	}
 
-	sram_pool = gen_pool_get(&pdev->dev);
+	sram_pool = gen_pool_get(&pdev->dev, NULL);
 	if (!sram_pool) {
 		pr_warn("%s: sram pool unavailable!\n", __func__);
 		return;
diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c
index 1885676c23c0..532d4b08276d 100644
--- a/arch/arm/mach-imx/pm-imx5.c
+++ b/arch/arm/mach-imx/pm-imx5.c
@@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram(
 		goto put_node;
 	}
 
-	ocram_pool = gen_pool_get(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev, NULL);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c
index 93ecf559d06d..8ff8fc0b261c 100644
--- a/arch/arm/mach-imx/pm-imx6.c
+++ b/arch/arm/mach-imx/pm-imx6.c
@@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
 		goto put_node;
 	}
 
-	ocram_pool = gen_pool_get(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev, NULL);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c
index 6a4199f2bffb..c378ab0c2431 100644
--- a/arch/arm/mach-socfpga/pm.c
+++ b/arch/arm/mach-socfpga/pm.c
@@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void)
 		goto put_node;
 	}
 
-	ocram_pool = gen_pool_get(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev, NULL);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 2790b6a64157..17f486233db0 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -488,7 +488,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 int arch_add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat;
-	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
@@ -517,7 +517,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #ifdef CONFIG_MEMORY_HOTREMOVE
 int arch_remove_memory(u64 start, u64 size)
 {
-	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	struct zone *zone;
 	int ret;
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index bce52ba66206..05713d190247 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -33,8 +33,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
 	/* Don't allow bogus node assignment */
 	BUG_ON(nid >= MAX_NUMNODES || nid <= 0);
 
-	start_pfn = start >> PAGE_SHIFT;
-	end_pfn = end >> PAGE_SHIFT;
+	start_pfn = PFN_DOWN(start);
+	end_pfn = PFN_DOWN(end);
 
 	pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
 			 PAGE_KERNEL);
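For reference, the PFN_DOWN() conversions above swap open-coded shifts for the long-standing helpers from <linux/pfn.h>, which read roughly as follows (a sketch; exact casts can vary by tree):

/* Address <-> page-frame-number helpers (sketch of <linux/pfn.h>). */
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)	/* round up */
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)			/* round down */
#define PFN_PHYS(x)	((phys_addr_t)(x) << PAGE_SHIFT)	/* PFN back to address */

The behavior is identical to the removed shifts; the named macro just states the rounding direction explicitly.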
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 48f7433dac6f..117e2f373e50 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF		if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH	if SMP
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION	if X86_32
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 25e3cf1cd8fd..477bfa6db370 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -380,3 +380,4 @@
 371	i386	recvfrom		sys_recvfrom			compat_sys_recvfrom
 372	i386	recvmsg			sys_recvmsg			compat_sys_recvmsg
 373	i386	shutdown		sys_shutdown
+374	i386	userfaultfd		sys_userfaultfd
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..81c490634db9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,7 @@
 320	common	kexec_file_load		sys_kexec_file_load
 321	common	bpf			sys_bpf
 322	64	execveat		stub_execveat
+323	common	userfaultfd		sys_userfaultfd
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
 
 #endif /* SMP */
 
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() {		\
+	inc_irq_stat(irq_tlb_count);	\
+	local_flush_tlb();		\
+}
+
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, mm, start, end)	\
 	native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3f124d553c5a..cd9b6d0b10bf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
@@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	watchdog_nmi_disable_all();
+	if (lockup_detector_suspend() != 0) {
+		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+		return 0;
+	}
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	watchdog_nmi_enable_all();
+	lockup_detector_resume();
 
 	get_online_cpus();
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 90b924acd982..8ddb5d0d66fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 	info.flush_end = end;
 
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+	trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
 	if (is_uv_system()) {
 		unsigned int cpu;
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 31df474d72f4..560751bad294 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -392,6 +392,16 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
 	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
 		int page_nid;
 
+		/*
+		 * memory block could have several absent sections from start.
+		 * skip pfn range from absent section
+		 */
+		if (!pfn_present(pfn)) {
+			pfn = round_down(pfn + PAGES_PER_SECTION,
+					 PAGES_PER_SECTION) - 1;
+			continue;
+		}
+
 		page_nid = get_nid_for_pfn(pfn);
 		if (page_nid < 0)
 			continue;
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c
index 58f65486de33..284ac4c934ba 100644
--- a/drivers/media/platform/coda/coda-common.c
+++ b/drivers/media/platform/coda/coda-common.c
@@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev)
 	/* Get IRAM pool from device tree or platform data */
 	pool = of_gen_pool_get(np, "iram", 0);
 	if (!pool && pdata)
-		pool = gen_pool_get(pdata->iram_dev);
+		pool = gen_pool_get(pdata->iram_dev, NULL);
 	if (!pool) {
 		dev_err(&pdev->dev, "iram pool not available\n");
 		return -ENOMEM;
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 15c33cc34a80..431e1dd528bc 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev)
 	if (IS_ERR(sram->virt_base))
 		return PTR_ERR(sram->virt_base);
 
-	sram->pool = devm_gen_pool_create(sram->dev,
-					  ilog2(SRAM_GRANULARITY), -1);
-	if (!sram->pool)
-		return -ENOMEM;
+	sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY),
+					  NUMA_NO_NODE, NULL);
+	if (IS_ERR(sram->pool))
+		return PTR_ERR(sram->pool);
 
 	ret = sram_reserve_regions(sram, res);
 	if (ret)
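The genalloc conversions above all follow from the two API changes in this series: gen_pool_get() and devm_gen_pool_create() gain a name argument so a single device can own several pools, and devm_gen_pool_create() now reports failure via ERR_PTR() rather than NULL. A minimal caller sketch under those assumptions (the device pointer and pool names here are illustrative, not from the tree):

#include <linux/genalloc.h>
#include <linux/numa.h>

/* Hypothetical probe-time setup: one device exporting two named pools. */
static int example_pools_init(struct device *dev)
{
	struct gen_pool *iram, *sram;

	/* Order-5 (32-byte) allocation granularity; the name tells the pools apart. */
	iram = devm_gen_pool_create(dev, 5, NUMA_NO_NODE, "iram");
	if (IS_ERR(iram))
		return PTR_ERR(iram);		/* errors now come back as ERR_PTR */

	sram = devm_gen_pool_create(dev, 5, NUMA_NO_NODE, "sram");
	if (IS_ERR(sram))
		return PTR_ERR(sram);

	/* Later lookups select a pool by name; NULL matches an unnamed pool,
	 * which is why the single-pool callers above just pass NULL.
	 */
	return gen_pool_get(dev, "iram") ? 0 : -ENODEV;
}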
diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig
index ba97efc3bf70..071280643db7 100644
--- a/drivers/video/console/Kconfig
+++ b/drivers/video/console/Kconfig
@@ -9,7 +9,7 @@ config VGA_CONSOLE
 	depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \
 		!SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \
 		(!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \
-		!ARM64
+		!ARM64 && !ARC
 	default y
 	help
 	  Saying Y here will allow you to use Linux in text mode through a
diff --git a/fs/Makefile b/fs/Makefile
index 09e051fefc5b..f79cf4043e60 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
+obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)		+= aio.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FILE_LOCKING)	+= locks.o
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@
 	}
 }
 
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	vma->vm_flags |= VM_DONTEXPAND;
-	vma->vm_ops = &generic_file_vm_ops;
-	return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
 {
+	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
 	struct kioctx_table *table;
 	int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
 	return res;
 }
 
+static const struct vm_operations_struct aio_ring_vm_ops = {
+	.mremap		= aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_DONTEXPAND;
+	vma->vm_ops = &aio_ring_vm_ops;
+	return 0;
+}
+
 static const struct file_operations aio_ring_fops = {
 	.mmap = aio_ring_mmap,
-	.mremap	= aio_ring_remap,
 };
 
 #if IS_ENABLED(CONFIG_MIGRATION)
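The aio conversion above is the model for the relocated hook: after "mm: move ->mremap() from file_operations to vm_operations_struct", mremap() notifies vma->vm_ops->mremap() instead of file->f_op->mremap(), and the callback receives only the VMA. A hedged sketch of a minimal implementer after this change (example_* names are illustrative):

#include <linux/mm.h>
#include <linux/printk.h>

/* Called by mremap() once the mapping has moved; the file, previously
 * passed in via f_op->mremap(), is still reachable as vma->vm_file.
 */
static int example_mremap(struct vm_area_struct *vma)
{
	pr_debug("mapping of %pD moved to %#lx\n", vma->vm_file, vma->vm_start);
	return 0;	/* non-zero would fail the mremap() */
}

static const struct vm_operations_struct example_vm_ops = {
	.mremap = example_mremap,
};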
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c321b9..7b6bfcbf801c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
 		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
 	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+		seq_show_option(m, "snapdirname", fsopt->snapdir_name);
 
 	return 0;
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a9fb6b53126..6a1119e87fbb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	struct sockaddr *srcaddr;
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
-	seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
+	seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
 	cifs_show_security(s, tcon->ses);
 	cifs_show_cache_flavor(s, cifs_sb);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
 		seq_puts(s, ",multiuser");
 	else if (tcon->ses->user_name)
-		seq_printf(s, ",username=%s", tcon->ses->user_name);
+		seq_show_option(s, "username", tcon->ses->user_name);
 
 	if (tcon->ses->domainName)
-		seq_printf(s, ",domain=%s", tcon->ses->domainName);
+		seq_show_option(s, "domain", tcon->ses->domainName);
 
 	if (srcaddr->sa_family != AF_UNSPEC) {
 		struct sockaddr_in *saddr4;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee3878262a49..a63c7b0a10cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1776,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 	}
 
 	if (sbi->s_qf_names[USRQUOTA])
-		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+		seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
 
 	if (sbi->s_qf_names[GRPQUOTA])
-		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+		seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
 #endif
 }
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445947e1..894fb01a91da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 	if (is_ancestor(root, sdp->sd_master_dir))
 		seq_puts(s, ",meta");
 	if (args->ar_lockproto[0])
-		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+		seq_show_option(s, "lockproto", args->ar_lockproto);
 	if (args->ar_locktable[0])
-		seq_printf(s, ",locktable=%s", args->ar_locktable);
+		seq_show_option(s, "locktable", args->ar_locktable);
 	if (args->ar_hostdata[0])
-		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+		seq_show_option(s, "hostdata", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_puts(s, ",spectator");
 	if (args->ar_localflocks)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 55c03b9e9070..4574fdd3d421 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
 	struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
 
 	if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
-		seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+		seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
 	if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
-		seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+		seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
 	seq_printf(seq, ",uid=%u,gid=%u",
 		   from_kuid_munged(&init_user_ns, sbi->s_uid),
 		   from_kgid_munged(&init_user_ns, sbi->s_gid));
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c90b72ee676d..bb806e58c977 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
 
 	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
-		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+		seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
 	if (sbi->type != HFSPLUS_DEF_CR_TYPE)
-		seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+		seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
 	seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
 		   from_kuid_munged(&init_user_ns, sbi->uid),
 		   from_kgid_munged(&init_user_ns, sbi->gid));
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 059597b23f67..2ac99db3750e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
 	size_t offset = strlen(root_ino) + 1;
 
 	if (strlen(root_path) > offset)
-		seq_printf(seq, ",%s", root_path + offset);
+		seq_show_option(seq, root_path + offset, NULL);
 
 	if (append)
 		seq_puts(seq, ",append");
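All of the seq_printf() to seq_show_option() conversions above serve one purpose: mount option values that may contain commas or newlines get escaped before being shown in /proc/mounts, so a crafted value cannot inject fake options into the output. A rough sketch of what the helper does; the exact escape set used upstream may differ from what is assumed here:

#include <linux/seq_file.h>

/* Hedged sketch: emit ",name=value" with both parts escaped so that
 * separators embedded in the strings cannot forge extra options.
 */
static inline void example_show_option(struct seq_file *m, const char *name,
				       const char *value)
{
	seq_putc(m, ',');
	seq_escape(m, name, ",= \t\n\\");	/* seq_escape() is the stock seq_file escaper */
	if (value) {
		seq_putc(m, '=');
		seq_escape(m, value, ", \t\n\\");
	}
}

The _n variant used by hfs/hfsplus handles fixed-length, possibly unterminated strings (the 4-byte creator/type codes) by bounding the copy before escaping.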
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 44523f4a6084..6faaf710e563 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct inode *inode;
+	bool free = false;
 
 	inode = file_inode(filp);
 	if (!S_ISDIR(inode->i_mode))
@@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 
 	/* nothing else could have found us thanks to the dnotify_groups
 	   mark_mutex */
-	if (dn_mark->dn == NULL)
-		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
+	if (dn_mark->dn == NULL) {
+		fsnotify_detach_mark(fsn_mark);
+		free = true;
+	}
 
 	mutex_unlock(&dnotify_group->mark_mutex);
 
+	if (free)
+		fsnotify_free_mark(fsn_mark);
 	fsnotify_put_mark(fsn_mark);
 }
 
@@ -362,9 +367,10 @@ out:
 	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
-
+		fsnotify_detach_mark(fsn_mark);
 	mutex_unlock(&dnotify_group->mark_mutex);
+	if (destroy)
+		fsnotify_free_mark(fsn_mark);
 	fsnotify_put_mark(fsn_mark);
 out_err:
 	if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf275500a665..8e8e6bcd1d43 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark_locked(fsn_mark, group);
+		fsnotify_detach_mark(fsn_mark);
 	mutex_unlock(&group->mark_mutex);
+	if (destroy_mark)
+		fsnotify_free_mark(fsn_mark);
 
 	fsnotify_put_mark(fsn_mark);
 	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark_locked(fsn_mark, group);
+		fsnotify_detach_mark(fsn_mark);
 	mutex_unlock(&group->mark_mutex);
+	if (destroy_mark)
+		fsnotify_free_mark(fsn_mark);
 
 	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 58b7cdb63da9..6b6f0d472ae8 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	struct inotify_inode_mark *inode_mark;
 	struct inode *inode;
 
-	if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
+	    !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
 		return;
 
 	inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index dd3fb0b17be7..db39de2dd4cb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,7 +26,6 @@
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
-#include "../mount.h"
 
 /*
  * Clear all of the marks on an inode when it is being evicted from core
@@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		mnt = NULL;
 
 	/*
+	 * Optimization: srcu_read_lock() has a memory barrier which can
+	 * be expensive. It protects walking the *_fsnotify_marks lists.
+	 * However, if we do not walk the lists, we do not have to do
+	 * SRCU because we have no references to any objects and do not
+	 * need SRCU to keep them "alive".
+	 */
+	if (hlist_empty(&to_tell->i_fsnotify_marks) &&
+	    (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+		return 0;
+	/*
 	 * if this is a modify event we may need to clear the ignored masks
 	 * otherwise return if neither the inode nor the vfsmount care about
 	 * this type of event.
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 13a00be516d2..b44c68a857e7 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,6 +6,8 @@
 #include <linux/srcu.h>
 #include <linux/types.h>
 
+#include "../mount.h"
+
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
@@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
 /* inode specific destruction of a mark */
 extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Destroy all marks in the given list */
-extern void fsnotify_destroy_marks(struct list_head *to_free);
 /* Find mark belonging to given group in the list of marks */
 extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
 						struct fsnotify_group *group);
-/* run the list of all marks associated with inode and flag them to be freed */
-extern void fsnotify_clear_marks_by_inode(struct inode *inode);
-/* run the list of all marks associated with vfsmount and flag them to be freed */
-extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
+/* Destroy all marks in the given list protected by 'lock' */
+extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* run the list of all marks associated with inode and destroy them */
+static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+	fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+}
+/* run the list of all marks associated with vfsmount and destroy them */
+static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
+			       &mnt->mnt_root->d_lock);
+}
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
  * about events that happen to its children.
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3daf513ee99e..474a3ce1b5e1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
 }
 
 /*
- * Given an inode, destroy all of the marks associated with that inode.
- */
-void fsnotify_clear_marks_by_inode(struct inode *inode)
-{
-	struct fsnotify_mark *mark;
-	struct hlist_node *n;
-	LIST_HEAD(free_list);
-
-	spin_lock(&inode->i_lock);
-	hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
-		list_add(&mark->free_list, &free_list);
-		hlist_del_init_rcu(&mark->obj_list);
-		fsnotify_get_mark(mark);
-	}
-	spin_unlock(&inode->i_lock);
-
-	fsnotify_destroy_marks(&free_list);
-}
-
-/*
  * Given a group clear all of the inode marks associated with that group.
  */
 void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 39ddcaf0918f..fc0df4442f7b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -122,26 +122,27 @@
 }
 
 /*
- * Any time a mark is getting freed we end up here.
- * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the mark->lock
+ * Remove mark from inode / vfsmount list, group list, drop inode reference
+ * if we got one.
+ *
+ * Must be called with group->mark_mutex held.
  */
-void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
-				  struct fsnotify_group *group)
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
 {
 	struct inode *inode = NULL;
+	struct fsnotify_group *group = mark->group;
 
 	BUG_ON(!mutex_is_locked(&group->mark_mutex));
 
 	spin_lock(&mark->lock);
 
 	/* something else already called this function on this mark */
-	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
 		spin_unlock(&mark->lock);
 		return;
 	}
 
-	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
 
 	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
 		inode = mark->inode;
@@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
 		fsnotify_destroy_vfsmount_mark(mark);
 	else
 		BUG();
+	/*
+	 * Note that we didn't update flags telling whether inode cares about
+	 * what's happening with children. We update these flags from
+	 * __fsnotify_parent() lazily when next event happens on one of our
+	 * children.
+	 */
 
 	list_del_init(&mark->g_list);
 
@@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
 
 	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
 		iput(inode);
-	/* release lock temporarily */
-	mutex_unlock(&group->mark_mutex);
+
+	atomic_dec(&group->num_marks);
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+	struct fsnotify_group *group = mark->group;
+
+	spin_lock(&mark->lock);
+	/* something else already called this function on this mark */
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+		spin_unlock(&mark->lock);
+		return;
+	}
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+	spin_unlock(&mark->lock);
 
 	spin_lock(&destroy_lock);
 	list_add(&mark->g_list, &destroy_list);
 	spin_unlock(&destroy_lock);
 	wake_up(&destroy_waitq);
-	/*
-	 * We don't necessarily have a ref on mark from caller so the above destroy
-	 * may have actually freed it, unless this group provides a 'freeing_mark'
-	 * function which must be holding a reference.
-	 */
 
 	/*
 	 * Some groups like to know that marks are being freed. This is a
@@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
 	 */
 	if (group->ops->freeing_mark)
 		group->ops->freeing_mark(mark, group);
-
-	/*
-	 * __fsnotify_update_child_dentry_flags(inode);
-	 *
-	 * I really want to call that, but we can't, we have no idea if the inode
-	 * still exists the second we drop the mark->lock.
-	 *
-	 * The next time an event arrive to this inode from one of it's children
-	 * __fsnotify_parent will see that the inode doesn't care about it's
-	 * children and will update all of these flags then. So really this
-	 * is just a lazy update (and could be a perf win...)
-	 */
-
-	atomic_dec(&group->num_marks);
-
-	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
 }
 
 void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 			   struct fsnotify_group *group)
 {
 	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
-	fsnotify_destroy_mark_locked(mark, group);
+	fsnotify_detach_mark(mark);
 	mutex_unlock(&group->mark_mutex);
+	fsnotify_free_mark(mark);
 }
 
-/*
- * Destroy all marks in the given list. The marks must be already detached from
- * the original inode / vfsmount.
- */
-void fsnotify_destroy_marks(struct list_head *to_free)
+void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
 {
-	struct fsnotify_mark *mark, *lmark;
-	struct fsnotify_group *group;
-
-	list_for_each_entry_safe(mark, lmark, to_free, free_list) {
-		spin_lock(&mark->lock);
-		fsnotify_get_group(mark->group);
-		group = mark->group;
-		spin_unlock(&mark->lock);
+	struct fsnotify_mark *mark;
 
-		fsnotify_destroy_mark(mark, group);
+	while (1) {
+		/*
+		 * We have to be careful since we can race with e.g.
+		 * fsnotify_clear_marks_by_group() and once we drop 'lock',
+		 * mark can get removed from the obj_list and destroyed. But
+		 * we are holding mark reference so mark cannot be freed and
+		 * calling fsnotify_destroy_mark() more than once is fine.
+		 */
+		spin_lock(lock);
+		if (hlist_empty(head)) {
+			spin_unlock(lock);
+			break;
+		}
+		mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
+		/*
+		 * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
+		 * since inode / mount is going away anyway. So just remove
+		 * mark from the list.
+		 */
+		hlist_del_init_rcu(&mark->obj_list);
+		fsnotify_get_mark(mark);
+		spin_unlock(lock);
+		fsnotify_destroy_mark(mark, mark->group);
 		fsnotify_put_mark(mark);
-		fsnotify_put_group(group);
 	}
 }
 
@@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 	 * inode->i_lock
 	 */
 	spin_lock(&mark->lock);
-	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
 
 	fsnotify_get_group(group);
 	mark->group = group;
@@ -438,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
 		}
 		mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
 		fsnotify_get_mark(mark);
-		fsnotify_destroy_mark_locked(mark, group);
+		fsnotify_detach_mark(mark);
 		mutex_unlock(&group->mark_mutex);
+		fsnotify_free_mark(mark);
 		fsnotify_put_mark(mark);
 	}
 }
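The net effect of the mark.c rework is a two-phase teardown: detach the mark under the group's mark_mutex, then free it (queueing it for SRCU-delayed destruction) only after the mutex is dropped, which removes the old unlock/relock dance inside fsnotify_destroy_mark_locked(). A sketch of the calling convention that the converted dnotify/fanotify call sites above now follow:

#include <linux/fsnotify_backend.h>
#include <linux/mutex.h>

/* Two-phase mark removal as used by the converted callers: phase 1
 * (detach) runs under group->mark_mutex; phase 2 (free) must run after
 * the mutex is dropped, since it hands the mark to the SRCU-waiting
 * destroyer thread and may notify group->ops->freeing_mark().
 */
static void example_remove_mark(struct fsnotify_mark *mark,
				struct fsnotify_group *group)
{
	mutex_lock(&group->mark_mutex);
	fsnotify_detach_mark(mark);	/* unhook from object + group lists */
	mutex_unlock(&group->mark_mutex);

	fsnotify_free_mark(mark);	/* queue for SRCU-delayed freeing */
	fsnotify_put_mark(mark);	/* drop the caller's reference */
}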
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 326b148e623c..a8fcab68faef 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -28,25 +28,6 @@
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
-#include "../mount.h"
-
-void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
-{
-	struct fsnotify_mark *mark;
-	struct hlist_node *n;
-	struct mount *m = real_mount(mnt);
-	LIST_HEAD(free_list);
-
-	spin_lock(&mnt->mnt_root->d_lock);
-	hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
-		list_add(&mark->free_list, &free_list);
-		hlist_del_init_rcu(&mark->obj_list);
-		fsnotify_get_mark(mark);
-	}
-	spin_unlock(&mnt->mnt_root->d_lock);
-
-	fsnotify_destroy_marks(&free_list);
-}
 
 void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
 {
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index c1128bcbeb5e..d1a853585b53 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c | |||
@@ -2204,17 +2204,12 @@ get_ctx_vol_failed: | |||
2204 | return true; | 2204 | return true; |
2205 | #ifdef NTFS_RW | 2205 | #ifdef NTFS_RW |
2206 | iput_usnjrnl_err_out: | 2206 | iput_usnjrnl_err_out: |
2207 | if (vol->usnjrnl_j_ino) | 2207 | iput(vol->usnjrnl_j_ino); |
2208 | iput(vol->usnjrnl_j_ino); | 2208 | iput(vol->usnjrnl_max_ino); |
2209 | if (vol->usnjrnl_max_ino) | 2209 | iput(vol->usnjrnl_ino); |
2210 | iput(vol->usnjrnl_max_ino); | ||
2211 | if (vol->usnjrnl_ino) | ||
2212 | iput(vol->usnjrnl_ino); | ||
2213 | iput_quota_err_out: | 2210 | iput_quota_err_out: |
2214 | if (vol->quota_q_ino) | 2211 | iput(vol->quota_q_ino); |
2215 | iput(vol->quota_q_ino); | 2212 | iput(vol->quota_ino); |
2216 | if (vol->quota_ino) | ||
2217 | iput(vol->quota_ino); | ||
2218 | iput(vol->extend_ino); | 2213 | iput(vol->extend_ino); |
2219 | #endif /* NTFS_RW */ | 2214 | #endif /* NTFS_RW */ |
2220 | iput_sec_err_out: | 2215 | iput_sec_err_out: |
@@ -2223,8 +2218,7 @@ iput_root_err_out: | |||
2223 | iput(vol->root_ino); | 2218 | iput(vol->root_ino); |
2224 | iput_logfile_err_out: | 2219 | iput_logfile_err_out: |
2225 | #ifdef NTFS_RW | 2220 | #ifdef NTFS_RW |
2226 | if (vol->logfile_ino) | 2221 | iput(vol->logfile_ino); |
2227 | iput(vol->logfile_ino); | ||
2228 | iput_vol_err_out: | 2222 | iput_vol_err_out: |
2229 | #endif /* NTFS_RW */ | 2223 | #endif /* NTFS_RW */ |
2230 | iput(vol->vol_ino); | 2224 | iput(vol->vol_ino); |
@@ -2254,8 +2248,7 @@ iput_mftbmp_err_out: | |||
2254 | iput(vol->mftbmp_ino); | 2248 | iput(vol->mftbmp_ino); |
2255 | iput_mirr_err_out: | 2249 | iput_mirr_err_out: |
2256 | #ifdef NTFS_RW | 2250 | #ifdef NTFS_RW |
2257 | if (vol->mftmirr_ino) | 2251 | iput(vol->mftmirr_ino); |
2258 | iput(vol->mftmirr_ino); | ||
2259 | #endif /* NTFS_RW */ | 2252 | #endif /* NTFS_RW */ |
2260 | return false; | 2253 | return false; |
2261 | } | 2254 | } |
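These ntfs hunks rely on iput() returning immediately when handed a NULL inode, so each unwind label can release its pointer unconditionally; inodes that were never loaded are still NULL and are simply skipped. A tiny sketch of the idiom:

    #include <linux/fs.h>

    /* iput(NULL) is a no-op, so error unwinding can release every
     * inode pointer without a per-pointer guard. */
    static void put_inodes(struct inode *a, struct inode *b, struct inode *c)
    {
            iput(a);
            iput(b);
            iput(c);
    }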
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index c58a1bcfda0f..0cdf497c91ef 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c | |||
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle, | |||
284 | 284 | ||
285 | int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) | 285 | int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) |
286 | { | 286 | { |
287 | return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); | 287 | struct buffer_head *bh = NULL; |
288 | int status = 0; | ||
289 | |||
290 | status = ocfs2_inode_lock(inode, &bh, 1); | ||
291 | if (status < 0) { | ||
292 | if (status != -ENOENT) | ||
293 | mlog_errno(status); | ||
294 | return status; | ||
295 | } | ||
296 | status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); | ||
297 | ocfs2_inode_unlock(inode, 1); | ||
298 | brelse(bh); | ||
299 | return status; | ||
288 | } | 300 | } |
289 | 301 | ||
290 | struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) | 302 | struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) |
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) | |||
292 | struct ocfs2_super *osb; | 304 | struct ocfs2_super *osb; |
293 | struct buffer_head *di_bh = NULL; | 305 | struct buffer_head *di_bh = NULL; |
294 | struct posix_acl *acl; | 306 | struct posix_acl *acl; |
295 | int ret = -EAGAIN; | 307 | int ret; |
296 | 308 | ||
297 | osb = OCFS2_SB(inode->i_sb); | 309 | osb = OCFS2_SB(inode->i_sb); |
298 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) | 310 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) |
299 | return NULL; | 311 | return NULL; |
300 | 312 | ret = ocfs2_inode_lock(inode, &di_bh, 0); | |
301 | ret = ocfs2_read_inode_block(inode, &di_bh); | 313 | if (ret < 0) { |
302 | if (ret < 0) | 314 | if (ret != -ENOENT) |
315 | mlog_errno(ret); | ||
303 | return ERR_PTR(ret); | 316 | return ERR_PTR(ret); |
317 | } | ||
304 | 318 | ||
305 | acl = ocfs2_get_acl_nolock(inode, type, di_bh); | 319 | acl = ocfs2_get_acl_nolock(inode, type, di_bh); |
306 | 320 | ||
321 | ocfs2_inode_unlock(inode, 0); | ||
307 | brelse(di_bh); | 322 | brelse(di_bh); |
308 | |||
309 | return acl; | 323 | return acl; |
310 | } | 324 | } |
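Both ACL entry points now bracket their work with the cluster-wide inode lock rather than a bare inode-block read, so one node cannot read or write ACLs against a dinode another node is changing. A sketch of the bracket, assuming the usual ocfs2 convention that the final argument selects shared (0) versus exclusive (1) access:

    static int acl_op(struct inode *inode)
    {
            struct buffer_head *bh = NULL;
            int ret;

            ret = ocfs2_inode_lock(inode, &bh, 0);  /* 0 = shared, 1 = exclusive */
            if (ret < 0) {
                    if (ret != -ENOENT)             /* racing unlink: not noisy */
                            mlog_errno(ret);
                    return ret;
            }
            /* ... read (or, under an exclusive lock, update) ACL state ... */
            ocfs2_inode_unlock(inode, 0);           /* level must match the lock */
            brelse(bh);
            return 0;
    }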
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5997c00a1515..86181d6526dc 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb, | |||
908 | */ | 908 | */ |
909 | 909 | ||
910 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 910 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { |
911 | ocfs2_error(sb, | 911 | rc = ocfs2_error(sb, |
912 | "Extent block #%llu has bad signature %.*s", | 912 | "Extent block #%llu has bad signature %.*s\n", |
913 | (unsigned long long)bh->b_blocknr, 7, | 913 | (unsigned long long)bh->b_blocknr, 7, |
914 | eb->h_signature); | 914 | eb->h_signature); |
915 | return -EINVAL; | 915 | goto bail; |
916 | } | 916 | } |
917 | 917 | ||
918 | if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { | 918 | if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { |
919 | ocfs2_error(sb, | 919 | rc = ocfs2_error(sb, |
920 | "Extent block #%llu has an invalid h_blkno " | 920 | "Extent block #%llu has an invalid h_blkno of %llu\n", |
921 | "of %llu", | 921 | (unsigned long long)bh->b_blocknr, |
922 | (unsigned long long)bh->b_blocknr, | 922 | (unsigned long long)le64_to_cpu(eb->h_blkno)); |
923 | (unsigned long long)le64_to_cpu(eb->h_blkno)); | 923 | goto bail; |
924 | return -EINVAL; | ||
925 | } | 924 | } |
926 | 925 | ||
927 | if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { | 926 | if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { |
928 | ocfs2_error(sb, | 927 | rc = ocfs2_error(sb, |
929 | "Extent block #%llu has an invalid " | 928 | "Extent block #%llu has an invalid h_fs_generation of #%u\n", |
930 | "h_fs_generation of #%u", | 929 | (unsigned long long)bh->b_blocknr, |
931 | (unsigned long long)bh->b_blocknr, | 930 | le32_to_cpu(eb->h_fs_generation)); |
932 | le32_to_cpu(eb->h_fs_generation)); | 931 | goto bail; |
933 | return -EINVAL; | ||
934 | } | 932 | } |
935 | 933 | bail: | |
936 | return 0; | 934 | return rc; |
937 | } | 935 | } |
938 | 936 | ||
939 | int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, | 937 | int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, |
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, | |||
1446 | while(le16_to_cpu(el->l_tree_depth) > 1) { | 1444 | while(le16_to_cpu(el->l_tree_depth) > 1) { |
1447 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | 1445 | if (le16_to_cpu(el->l_next_free_rec) == 0) { |
1448 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 1446 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
1449 | "Owner %llu has empty " | 1447 | "Owner %llu has empty extent list (next_free_rec == 0)\n", |
1450 | "extent list (next_free_rec == 0)", | ||
1451 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); | 1448 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); |
1452 | status = -EIO; | 1449 | status = -EIO; |
1453 | goto bail; | 1450 | goto bail; |
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, | |||
1456 | blkno = le64_to_cpu(el->l_recs[i].e_blkno); | 1453 | blkno = le64_to_cpu(el->l_recs[i].e_blkno); |
1457 | if (!blkno) { | 1454 | if (!blkno) { |
1458 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 1455 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
1459 | "Owner %llu has extent " | 1456 | "Owner %llu has extent list where extent # %d has no physical block start\n", |
1460 | "list where extent # %d has no physical " | ||
1461 | "block start", | ||
1462 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); | 1457 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); |
1463 | status = -EIO; | 1458 | status = -EIO; |
1464 | goto bail; | 1459 | goto bail; |
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, | |||
1788 | while (el->l_tree_depth) { | 1783 | while (el->l_tree_depth) { |
1789 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | 1784 | if (le16_to_cpu(el->l_next_free_rec) == 0) { |
1790 | ocfs2_error(ocfs2_metadata_cache_get_super(ci), | 1785 | ocfs2_error(ocfs2_metadata_cache_get_super(ci), |
1791 | "Owner %llu has empty extent list at " | 1786 | "Owner %llu has empty extent list at depth %u\n", |
1792 | "depth %u\n", | ||
1793 | (unsigned long long)ocfs2_metadata_cache_owner(ci), | 1787 | (unsigned long long)ocfs2_metadata_cache_owner(ci), |
1794 | le16_to_cpu(el->l_tree_depth)); | 1788 | le16_to_cpu(el->l_tree_depth)); |
1795 | ret = -EROFS; | 1789 | ret = -EROFS; |
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, | |||
1814 | blkno = le64_to_cpu(el->l_recs[i].e_blkno); | 1808 | blkno = le64_to_cpu(el->l_recs[i].e_blkno); |
1815 | if (blkno == 0) { | 1809 | if (blkno == 0) { |
1816 | ocfs2_error(ocfs2_metadata_cache_get_super(ci), | 1810 | ocfs2_error(ocfs2_metadata_cache_get_super(ci), |
1817 | "Owner %llu has bad blkno in extent list " | 1811 | "Owner %llu has bad blkno in extent list at depth %u (index %d)\n", |
1818 | "at depth %u (index %d)\n", | ||
1819 | (unsigned long long)ocfs2_metadata_cache_owner(ci), | 1812 | (unsigned long long)ocfs2_metadata_cache_owner(ci), |
1820 | le16_to_cpu(el->l_tree_depth), i); | 1813 | le16_to_cpu(el->l_tree_depth), i); |
1821 | ret = -EROFS; | 1814 | ret = -EROFS; |
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, | |||
1836 | if (le16_to_cpu(el->l_next_free_rec) > | 1829 | if (le16_to_cpu(el->l_next_free_rec) > |
1837 | le16_to_cpu(el->l_count)) { | 1830 | le16_to_cpu(el->l_count)) { |
1838 | ocfs2_error(ocfs2_metadata_cache_get_super(ci), | 1831 | ocfs2_error(ocfs2_metadata_cache_get_super(ci), |
1839 | "Owner %llu has bad count in extent list " | 1832 | "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n", |
1840 | "at block %llu (next free=%u, count=%u)\n", | ||
1841 | (unsigned long long)ocfs2_metadata_cache_owner(ci), | 1833 | (unsigned long long)ocfs2_metadata_cache_owner(ci), |
1842 | (unsigned long long)bh->b_blocknr, | 1834 | (unsigned long long)bh->b_blocknr, |
1843 | le16_to_cpu(el->l_next_free_rec), | 1835 | le16_to_cpu(el->l_next_free_rec), |
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, | |||
2116 | 2108 | ||
2117 | if (left_el->l_next_free_rec != left_el->l_count) { | 2109 | if (left_el->l_next_free_rec != left_el->l_count) { |
2118 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 2110 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
2119 | "Inode %llu has non-full interior leaf node %llu" | 2111 | "Inode %llu has non-full interior leaf node %llu (next free = %u)\n", |
2120 | "(next free = %u)", | ||
2121 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), | 2112 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), |
2122 | (unsigned long long)left_leaf_bh->b_blocknr, | 2113 | (unsigned long long)left_leaf_bh->b_blocknr, |
2123 | le16_to_cpu(left_el->l_next_free_rec)); | 2114 | le16_to_cpu(left_el->l_next_free_rec)); |
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, | |||
2256 | * If we got here, we never found a valid node where | 2247 | * If we got here, we never found a valid node where |
2257 | * the tree indicated one should be. | 2248 | * the tree indicated one should be. |
2258 | */ | 2249 | */ |
2259 | ocfs2_error(sb, | 2250 | ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", |
2260 | "Invalid extent tree at extent block %llu\n", | ||
2261 | (unsigned long long)blkno); | 2251 | (unsigned long long)blkno); |
2262 | ret = -EROFS; | 2252 | ret = -EROFS; |
2263 | goto out; | 2253 | goto out; |
@@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, | |||
2872 | * If we got here, we never found a valid node where | 2862 | * If we got here, we never found a valid node where |
2873 | * the tree indicated one should be. | 2863 | * the tree indicated one should be. |
2874 | */ | 2864 | */ |
2875 | ocfs2_error(sb, | 2865 | ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", |
2876 | "Invalid extent tree at extent block %llu\n", | ||
2877 | (unsigned long long)blkno); | 2866 | (unsigned long long)blkno); |
2878 | ret = -EROFS; | 2867 | ret = -EROFS; |
2879 | goto out; | 2868 | goto out; |
@@ -3131,6 +3120,30 @@ out: | |||
3131 | return ret; | 3120 | return ret; |
3132 | } | 3121 | } |
3133 | 3122 | ||
3123 | static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb, | ||
3124 | struct ocfs2_extent_tree *et, | ||
3125 | struct ocfs2_path *path, | ||
3126 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
3127 | { | ||
3128 | handle_t *handle; | ||
3129 | int ret; | ||
3130 | int credits = path->p_tree_depth * 2 + 1; | ||
3131 | |||
3132 | handle = ocfs2_start_trans(osb, credits); | ||
3133 | if (IS_ERR(handle)) { | ||
3134 | ret = PTR_ERR(handle); | ||
3135 | mlog_errno(ret); | ||
3136 | return ret; | ||
3137 | } | ||
3138 | |||
3139 | ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); | ||
3140 | if (ret) | ||
3141 | mlog_errno(ret); | ||
3142 | |||
3143 | ocfs2_commit_trans(osb, handle); | ||
3144 | return ret; | ||
3145 | } | ||
3146 | |||
3134 | /* | 3147 | /* |
3135 | * Left rotation of btree records. | 3148 | * Left rotation of btree records. |
3136 | * | 3149 | * |
@@ -3200,7 +3213,7 @@ rightmost_no_delete: | |||
3200 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | 3213 | if (le16_to_cpu(el->l_next_free_rec) == 0) { |
3201 | ret = -EIO; | 3214 | ret = -EIO; |
3202 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 3215 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
3203 | "Owner %llu has empty extent block at %llu", | 3216 | "Owner %llu has empty extent block at %llu\n", |
3204 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), | 3217 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), |
3205 | (unsigned long long)le64_to_cpu(eb->h_blkno)); | 3218 | (unsigned long long)le64_to_cpu(eb->h_blkno)); |
3206 | goto out; | 3219 | goto out; |
@@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, | |||
3930 | next_free = le16_to_cpu(el->l_next_free_rec); | 3943 | next_free = le16_to_cpu(el->l_next_free_rec); |
3931 | if (next_free == 0) { | 3944 | if (next_free == 0) { |
3932 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 3945 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
3933 | "Owner %llu has a bad extent list", | 3946 | "Owner %llu has a bad extent list\n", |
3934 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); | 3947 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); |
3935 | ret = -EIO; | 3948 | ret = -EIO; |
3936 | return; | 3949 | return; |
@@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
4355 | bh = path_leaf_bh(left_path); | 4368 | bh = path_leaf_bh(left_path); |
4356 | eb = (struct ocfs2_extent_block *)bh->b_data; | 4369 | eb = (struct ocfs2_extent_block *)bh->b_data; |
4357 | ocfs2_error(sb, | 4370 | ocfs2_error(sb, |
4358 | "Extent block #%llu has an " | 4371 | "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", |
4359 | "invalid l_next_free_rec of " | ||
4360 | "%d. It should have " | ||
4361 | "matched the l_count of %d", | ||
4362 | (unsigned long long)le64_to_cpu(eb->h_blkno), | 4372 | (unsigned long long)le64_to_cpu(eb->h_blkno), |
4363 | le16_to_cpu(new_el->l_next_free_rec), | 4373 | le16_to_cpu(new_el->l_next_free_rec), |
4364 | le16_to_cpu(new_el->l_count)); | 4374 | le16_to_cpu(new_el->l_count)); |
@@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, | |||
4413 | bh = path_leaf_bh(right_path); | 4423 | bh = path_leaf_bh(right_path); |
4414 | eb = (struct ocfs2_extent_block *)bh->b_data; | 4424 | eb = (struct ocfs2_extent_block *)bh->b_data; |
4415 | ocfs2_error(sb, | 4425 | ocfs2_error(sb, |
4416 | "Extent block #%llu has an " | 4426 | "Extent block #%llu has an invalid l_next_free_rec of %d\n", |
4417 | "invalid l_next_free_rec of %d", | ||
4418 | (unsigned long long)le64_to_cpu(eb->h_blkno), | 4427 | (unsigned long long)le64_to_cpu(eb->h_blkno), |
4419 | le16_to_cpu(new_el->l_next_free_rec)); | 4428 | le16_to_cpu(new_el->l_next_free_rec)); |
4420 | status = -EINVAL; | 4429 | status = -EINVAL; |
@@ -4970,10 +4979,9 @@ leftright: | |||
4970 | split_index = ocfs2_search_extent_list(el, cpos); | 4979 | split_index = ocfs2_search_extent_list(el, cpos); |
4971 | if (split_index == -1) { | 4980 | if (split_index == -1) { |
4972 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 4981 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
4973 | "Owner %llu has an extent at cpos %u " | 4982 | "Owner %llu has an extent at cpos %u which can no longer be found\n", |
4974 | "which can no longer be found.\n", | 4983 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), |
4975 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), | 4984 | cpos); |
4976 | cpos); | ||
4977 | ret = -EROFS; | 4985 | ret = -EROFS; |
4978 | goto out; | 4986 | goto out; |
4979 | } | 4987 | } |
@@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle, | |||
5158 | index = ocfs2_search_extent_list(el, cpos); | 5166 | index = ocfs2_search_extent_list(el, cpos); |
5159 | if (index == -1) { | 5167 | if (index == -1) { |
5160 | ocfs2_error(sb, | 5168 | ocfs2_error(sb, |
5161 | "Owner %llu has an extent at cpos %u which can no " | 5169 | "Owner %llu has an extent at cpos %u which can no longer be found\n", |
5162 | "longer be found.\n", | 5170 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), |
5163 | (unsigned long long) | 5171 | cpos); |
5164 | ocfs2_metadata_cache_owner(et->et_ci), cpos); | ||
5165 | ret = -EROFS; | 5172 | ret = -EROFS; |
5166 | goto out; | 5173 | goto out; |
5167 | } | 5174 | } |
@@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode, | |||
5228 | cpos, len, phys); | 5235 | cpos, len, phys); |
5229 | 5236 | ||
5230 | if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { | 5237 | if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { |
5231 | ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " | 5238 | ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n", |
5232 | "that are being written to, but the feature bit " | ||
5233 | "is not set in the super block.", | ||
5234 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 5239 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
5235 | ret = -EROFS; | 5240 | ret = -EROFS; |
5236 | goto out; | 5241 | goto out; |
@@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle, | |||
5514 | index = ocfs2_search_extent_list(el, cpos); | 5519 | index = ocfs2_search_extent_list(el, cpos); |
5515 | if (index == -1) { | 5520 | if (index == -1) { |
5516 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 5521 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
5517 | "Owner %llu has an extent at cpos %u which can no " | 5522 | "Owner %llu has an extent at cpos %u which can no longer be found\n", |
5518 | "longer be found.\n", | ||
5519 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), | 5523 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), |
5520 | cpos); | 5524 | cpos); |
5521 | ret = -EROFS; | 5525 | ret = -EROFS; |
@@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle, | |||
5580 | index = ocfs2_search_extent_list(el, cpos); | 5584 | index = ocfs2_search_extent_list(el, cpos); |
5581 | if (index == -1) { | 5585 | if (index == -1) { |
5582 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 5586 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
5583 | "Owner %llu: split at cpos %u lost record.", | 5587 | "Owner %llu: split at cpos %u lost record\n", |
5584 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), | 5588 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), |
5585 | cpos); | 5589 | cpos); |
5586 | ret = -EROFS; | 5590 | ret = -EROFS; |
@@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle, | |||
5596 | ocfs2_rec_clusters(el, rec); | 5600 | ocfs2_rec_clusters(el, rec); |
5597 | if (rec_range != trunc_range) { | 5601 | if (rec_range != trunc_range) { |
5598 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), | 5602 | ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), |
5599 | "Owner %llu: error after split at cpos %u" | 5603 | "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n", |
5600 | "trunc len %u, existing record is (%u,%u)", | ||
5601 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), | 5604 | (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), |
5602 | cpos, len, le32_to_cpu(rec->e_cpos), | 5605 | cpos, len, le32_to_cpu(rec->e_cpos), |
5603 | ocfs2_rec_clusters(el, rec)); | 5606 | ocfs2_rec_clusters(el, rec)); |
@@ -6175,7 +6178,7 @@ bail: | |||
6175 | iput(tl_inode); | 6178 | iput(tl_inode); |
6176 | brelse(tl_bh); | 6179 | brelse(tl_bh); |
6177 | 6180 | ||
6178 | if (status < 0 && (*tl_copy)) { | 6181 | if (status < 0) { |
6179 | kfree(*tl_copy); | 6182 | kfree(*tl_copy); |
6180 | *tl_copy = NULL; | 6183 | *tl_copy = NULL; |
6181 | mlog_errno(status); | 6184 | mlog_errno(status); |
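Same idea as the iput() cleanups earlier in this series: kfree() ignores a NULL pointer, so the extra *tl_copy test bought nothing. A sketch of the free-and-poison idiom:

    #include <linux/slab.h>

    /* kfree(NULL) is a no-op: free unconditionally, then poison the
     * pointer so a retry starts from a clean state. */
    static void reset_copy(void **copy)
    {
            kfree(*copy);
            *copy = NULL;
    }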
@@ -7108,15 +7111,23 @@ start: | |||
7108 | * to check it up here before changing the tree. | 7111 | * to check it up here before changing the tree. |
7109 | */ | 7112 | */ |
7110 | if (root_el->l_tree_depth && rec->e_int_clusters == 0) { | 7113 | if (root_el->l_tree_depth && rec->e_int_clusters == 0) { |
7111 | ocfs2_error(inode->i_sb, "Inode %lu has an empty " | 7114 | mlog(ML_ERROR, "Inode %lu has an empty " |
7112 | "extent record, depth %u\n", inode->i_ino, | 7115 | "extent record, depth %u\n", inode->i_ino, |
7113 | le16_to_cpu(root_el->l_tree_depth)); | 7116 | le16_to_cpu(root_el->l_tree_depth)); |
7114 | status = -EROFS; | 7117 | status = ocfs2_remove_rightmost_empty_extent(osb, |
7115 | goto bail; | 7118 | &et, path, &dealloc); |
7119 | if (status) { | ||
7120 | mlog_errno(status); | ||
7121 | goto bail; | ||
7122 | } | ||
7123 | |||
7124 | ocfs2_reinit_path(path, 1); | ||
7125 | goto start; | ||
7126 | } else { | ||
7127 | trunc_cpos = le32_to_cpu(rec->e_cpos); | ||
7128 | trunc_len = 0; | ||
7129 | blkno = 0; | ||
7116 | } | 7130 | } |
7117 | trunc_cpos = le32_to_cpu(rec->e_cpos); | ||
7118 | trunc_len = 0; | ||
7119 | blkno = 0; | ||
7120 | } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { | 7131 | } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { |
7121 | /* | 7132 | /* |
7122 | * Truncate entire record. | 7133 | * Truncate entire record. |
@@ -7204,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, | |||
7204 | !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || | 7215 | !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || |
7205 | !ocfs2_supports_inline_data(osb)) { | 7216 | !ocfs2_supports_inline_data(osb)) { |
7206 | ocfs2_error(inode->i_sb, | 7217 | ocfs2_error(inode->i_sb, |
7207 | "Inline data flags for inode %llu don't agree! " | 7218 | "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", |
7208 | "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", | ||
7209 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 7219 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
7210 | le16_to_cpu(di->i_dyn_features), | 7220 | le16_to_cpu(di->i_dyn_features), |
7211 | OCFS2_I(inode)->ip_dyn_features, | 7221 | OCFS2_I(inode)->ip_dyn_features, |
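The truncate hunk above stops treating an empty rightmost extent record as fatal: it repairs the tree in a small transaction, resets the lookup path, and restarts from the top instead of returning -EROFS. A sketch of that recover-and-restart shape, with hypothetical helpers standing in for the ocfs2 tree machinery:

    /* hypothetical helpers standing in for the ocfs2 tree operations */
    int rightmost_record_is_empty(void);
    int repair_rightmost(void);      /* runs in its own small transaction */
    void reset_lookup_state(void);

    static int truncate_loop(void)
    {
            int status;

    restart:
            if (rightmost_record_is_empty()) {
                    status = repair_rightmost();
                    if (status)
                            return status;
                    reset_lookup_state();
                    goto restart;            /* re-walk the repaired tree */
            }
            /* ... normal truncate work continues here ... */
            return 0;
    }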
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0f5fd9db8194..64b11d90eca6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, | |||
227 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | 227 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; |
228 | 228 | ||
229 | if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { | 229 | if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { |
230 | ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", | 230 | ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n", |
231 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 231 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
232 | return -EROFS; | 232 | return -EROFS; |
233 | } | 233 | } |
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, | |||
237 | if (size > PAGE_CACHE_SIZE || | 237 | if (size > PAGE_CACHE_SIZE || |
238 | size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { | 238 | size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { |
239 | ocfs2_error(inode->i_sb, | 239 | ocfs2_error(inode->i_sb, |
240 | "Inode %llu has with inline data has bad size: %Lu", | 240 | "Inode %llu has with inline data has bad size: %Lu\n", |
241 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 241 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
242 | (unsigned long long)size); | 242 | (unsigned long long)size); |
243 | return -EROFS; | 243 | return -EROFS; |
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
533 | 533 | ||
534 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 534 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
535 | 535 | ||
536 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
537 | |||
536 | /* This figures out the size of the next contiguous block, and | 538 | /* This figures out the size of the next contiguous block, and |
537 | * our logical offset */ | 539 | * our logical offset */ |
538 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, | 540 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
539 | &contig_blocks, &ext_flags); | 541 | &contig_blocks, &ext_flags); |
542 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
543 | |||
540 | if (ret) { | 544 | if (ret) { |
541 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | 545 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", |
542 | (unsigned long long)iblock); | 546 | (unsigned long long)iblock); |
@@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
557 | 561 | ||
558 | alloc_locked = 1; | 562 | alloc_locked = 1; |
559 | 563 | ||
564 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
565 | |||
560 | /* fill hole, allocate blocks can't be larger than the size | 566 | /* fill hole, allocate blocks can't be larger than the size |
561 | * of the hole */ | 567 | * of the hole */ |
562 | clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); | 568 | clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); |
@@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
569 | ret = ocfs2_extend_allocation(inode, cpos, | 575 | ret = ocfs2_extend_allocation(inode, cpos, |
570 | clusters_to_alloc, 0); | 576 | clusters_to_alloc, 0); |
571 | if (ret < 0) { | 577 | if (ret < 0) { |
578 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
572 | mlog_errno(ret); | 579 | mlog_errno(ret); |
573 | goto bail; | 580 | goto bail; |
574 | } | 581 | } |
@@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
576 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, | 583 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
577 | &contig_blocks, &ext_flags); | 584 | &contig_blocks, &ext_flags); |
578 | if (ret < 0) { | 585 | if (ret < 0) { |
586 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
579 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | 587 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", |
580 | (unsigned long long)iblock); | 588 | (unsigned long long)iblock); |
581 | ret = -EIO; | 589 | ret = -EIO; |
582 | goto bail; | 590 | goto bail; |
583 | } | 591 | } |
592 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
584 | } | 593 | } |
585 | 594 | ||
586 | /* | 595 | /* |
@@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
627 | mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); | 636 | mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); |
628 | } | 637 | } |
629 | 638 | ||
630 | ocfs2_iocb_clear_rw_locked(iocb); | 639 | /* Let rw unlock to be done later to protect append direct io write */ |
640 | if (offset + bytes <= i_size_read(inode)) { | ||
641 | ocfs2_iocb_clear_rw_locked(iocb); | ||
631 | 642 | ||
632 | level = ocfs2_iocb_rw_locked_level(iocb); | 643 | level = ocfs2_iocb_rw_locked_level(iocb); |
633 | ocfs2_rw_unlock(inode, level); | 644 | ocfs2_rw_unlock(inode, level); |
645 | } | ||
634 | } | 646 | } |
635 | 647 | ||
636 | static int ocfs2_releasepage(struct page *page, gfp_t wait) | 648 | static int ocfs2_releasepage(struct page *page, gfp_t wait) |
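The direct-IO hunks above add a consistent ip_alloc_sem discipline: extent-map lookups take the semaphore shared, while the hole-filling path takes it exclusive and must drop it on every failure branch before jumping to the error label. A sketch of the pairing, with lookup_blocks()/allocate_blocks() as hypothetical stand-ins:

    #include <linux/rwsem.h>

    int lookup_blocks(void);     /* hypothetical: map existing extents */
    int allocate_blocks(void);   /* hypothetical: fill a hole */

    static int map_or_fill(struct rw_semaphore *alloc_sem)
    {
            int ret;

            down_read(alloc_sem);            /* lookups may run concurrently */
            ret = lookup_blocks();
            up_read(alloc_sem);
            if (ret >= 0)
                    return ret;

            down_write(alloc_sem);           /* mutating the extent map */
            ret = allocate_blocks();
            up_write(alloc_sem);             /* dropped on success and failure */
            return ret;
    }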
@@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
832 | 844 | ||
833 | /* zeroing out the previously allocated cluster tail | 845 | /* zeroing out the previously allocated cluster tail |
834 | * that but not zeroed */ | 846 | * that but not zeroed */ |
835 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | 847 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
848 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
836 | ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, | 849 | ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, |
837 | zero_len_tail, cluster_align_tail); | 850 | zero_len_tail, cluster_align_tail); |
838 | else | 851 | up_read(&OCFS2_I(inode)->ip_alloc_sem); |
852 | } else { | ||
853 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
839 | ret = ocfs2_direct_IO_extend_no_holes(osb, inode, | 854 | ret = ocfs2_direct_IO_extend_no_holes(osb, inode, |
840 | offset); | 855 | offset); |
856 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
857 | } | ||
841 | if (ret < 0) { | 858 | if (ret < 0) { |
842 | mlog_errno(ret); | 859 | mlog_errno(ret); |
843 | ocfs2_inode_unlock(inode, 1); | 860 | ocfs2_inode_unlock(inode, 1); |
@@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
857 | written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, | 874 | written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, |
858 | offset, ocfs2_direct_IO_get_blocks, | 875 | offset, ocfs2_direct_IO_get_blocks, |
859 | ocfs2_dio_end_io, NULL, 0); | 876 | ocfs2_dio_end_io, NULL, 0); |
860 | if (unlikely(written < 0)) { | 877 | /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ |
878 | if ((written < 0) && (written != -EIOCBQUEUED)) { | ||
861 | loff_t i_size = i_size_read(inode); | 879 | loff_t i_size = i_size_read(inode); |
862 | 880 | ||
863 | if (offset + count > i_size) { | 881 | if (offset + count > i_size) { |
@@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | |||
876 | 894 | ||
877 | ocfs2_inode_unlock(inode, 1); | 895 | ocfs2_inode_unlock(inode, 1); |
878 | brelse(di_bh); | 896 | brelse(di_bh); |
897 | di_bh = NULL; | ||
879 | goto clean_orphan; | 898 | goto clean_orphan; |
880 | } | 899 | } |
881 | } | 900 | } |
882 | 901 | ||
883 | ocfs2_inode_unlock(inode, 1); | 902 | ocfs2_inode_unlock(inode, 1); |
884 | brelse(di_bh); | 903 | brelse(di_bh); |
904 | di_bh = NULL; | ||
885 | 905 | ||
886 | ret = jbd2_journal_force_commit(journal); | 906 | ret = jbd2_journal_force_commit(journal); |
887 | if (ret < 0) | 907 | if (ret < 0) |
@@ -936,10 +956,12 @@ clean_orphan: | |||
936 | if (tmp_ret < 0) { | 956 | if (tmp_ret < 0) { |
937 | ret = tmp_ret; | 957 | ret = tmp_ret; |
938 | mlog_errno(ret); | 958 | mlog_errno(ret); |
959 | brelse(di_bh); | ||
939 | goto out; | 960 | goto out; |
940 | } | 961 | } |
941 | 962 | ||
942 | ocfs2_inode_unlock(inode, 1); | 963 | ocfs2_inode_unlock(inode, 1); |
964 | brelse(di_bh); | ||
943 | 965 | ||
944 | tmp_ret = jbd2_journal_force_commit(journal); | 966 | tmp_ret = jbd2_journal_force_commit(journal); |
945 | if (tmp_ret < 0) { | 967 | if (tmp_ret < 0) { |
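Note the di_bh = NULL assignments added after each brelse(): both branches fall through to the shared clean_orphan unwind, and poisoning the pointer keeps that path from dropping the same buffer_head reference twice. A sketch of the idiom:

    #include <linux/buffer_head.h>

    /* Release a buffer_head and poison the caller's pointer so a
     * later shared unwind cannot brelse() it a second time. */
    static void release_di_bh(struct buffer_head **bh)
    {
            brelse(*bh);    /* brelse(NULL) is a no-op */
            *bh = NULL;
    }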
@@ -2185,10 +2207,7 @@ try_again: | |||
2185 | if (ret) | 2207 | if (ret) |
2186 | goto out_commit; | 2208 | goto out_commit; |
2187 | } | 2209 | } |
2188 | /* | 2210 | |
2189 | * We don't want this to fail in ocfs2_write_end(), so do it | ||
2190 | * here. | ||
2191 | */ | ||
2192 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, | 2211 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, |
2193 | OCFS2_JOURNAL_ACCESS_WRITE); | 2212 | OCFS2_JOURNAL_ACCESS_WRITE); |
2194 | if (ret) { | 2213 | if (ret) { |
@@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
2345 | loff_t pos, unsigned len, unsigned copied, | 2364 | loff_t pos, unsigned len, unsigned copied, |
2346 | struct page *page, void *fsdata) | 2365 | struct page *page, void *fsdata) |
2347 | { | 2366 | { |
2348 | int i; | 2367 | int i, ret; |
2349 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); | 2368 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); |
2350 | struct inode *inode = mapping->host; | 2369 | struct inode *inode = mapping->host; |
2351 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2370 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
@@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
2354 | handle_t *handle = wc->w_handle; | 2373 | handle_t *handle = wc->w_handle; |
2355 | struct page *tmppage; | 2374 | struct page *tmppage; |
2356 | 2375 | ||
2376 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, | ||
2377 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2378 | if (ret) { | ||
2379 | copied = ret; | ||
2380 | mlog_errno(ret); | ||
2381 | goto out; | ||
2382 | } | ||
2383 | |||
2357 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | 2384 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { |
2358 | ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); | 2385 | ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); |
2359 | goto out_write_size; | 2386 | goto out_write_size; |
@@ -2409,6 +2436,7 @@ out_write_size: | |||
2409 | ocfs2_update_inode_fsync_trans(handle, inode, 1); | 2436 | ocfs2_update_inode_fsync_trans(handle, inode, 1); |
2410 | ocfs2_journal_dirty(handle, wc->w_di_bh); | 2437 | ocfs2_journal_dirty(handle, wc->w_di_bh); |
2411 | 2438 | ||
2439 | out: | ||
2412 | /* unlock pages before dealloc since it needs acquiring j_trans_barrier | 2440 | /* unlock pages before dealloc since it needs acquiring j_trans_barrier |
2413 | * lock, or it will cause a deadlock since journal commit threads holds | 2441 | * lock, or it will cause a deadlock since journal commit threads holds |
2414 | * this lock and will ask for the page lock when flushing the data. | 2442 | * this lock and will ask for the page lock when flushing the data. |
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1edcb141f639..fe50ded1b4ce 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c | |||
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, | |||
316 | bh = bhs[i]; | 316 | bh = bhs[i]; |
317 | 317 | ||
318 | if (!(flags & OCFS2_BH_READAHEAD)) { | 318 | if (!(flags & OCFS2_BH_READAHEAD)) { |
319 | if (status) { | ||
320 | /* Clear the rest of the buffers on error */ | ||
321 | put_bh(bh); | ||
322 | bhs[i] = NULL; | ||
323 | continue; | ||
324 | } | ||
319 | /* We know this can't have changed as we hold the | 325 | /* We know this can't have changed as we hold the |
320 | * owner sem. Avoid doing any work on the bh if the | 326 | * owner sem. Avoid doing any work on the bh if the |
321 | * journal has it. */ | 327 | * journal has it. */ |
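Once one read in the batch has failed, the loop now releases every remaining buffer head and clears its slot, so the caller never touches a partially valid array. Roughly, as a sketch:

    #include <linux/buffer_head.h>

    /* Drop the remaining buffer heads after a read error and clear
     * their slots (sketch; 'from' is the first slot not yet handed
     * back to the caller). */
    static void drop_remaining(struct buffer_head **bhs, int from, int nr)
    {
            int i;

            for (i = from; i < nr; i++) {
                    put_bh(bhs[i]);
                    bhs[i] = NULL;
            }
    }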
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 140de3c93d2e..fa15debcc02b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -36,7 +36,7 @@ | |||
36 | #include <linux/debugfs.h> | 36 | #include <linux/debugfs.h> |
37 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
38 | #include <linux/bitmap.h> | 38 | #include <linux/bitmap.h> |
39 | 39 | #include <linux/ktime.h> | |
40 | #include "heartbeat.h" | 40 | #include "heartbeat.h" |
41 | #include "tcp.h" | 41 | #include "tcp.h" |
42 | #include "nodemanager.h" | 42 | #include "nodemanager.h" |
@@ -1060,37 +1060,6 @@ bail: | |||
1060 | return ret; | 1060 | return ret; |
1061 | } | 1061 | } |
1062 | 1062 | ||
1063 | /* Subtract b from a, storing the result in a. a *must* have a larger | ||
1064 | * value than b. */ | ||
1065 | static void o2hb_tv_subtract(struct timeval *a, | ||
1066 | struct timeval *b) | ||
1067 | { | ||
1068 | /* just return 0 when a is after b */ | ||
1069 | if (a->tv_sec < b->tv_sec || | ||
1070 | (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { | ||
1071 | a->tv_sec = 0; | ||
1072 | a->tv_usec = 0; | ||
1073 | return; | ||
1074 | } | ||
1075 | |||
1076 | a->tv_sec -= b->tv_sec; | ||
1077 | a->tv_usec -= b->tv_usec; | ||
1078 | while ( a->tv_usec < 0 ) { | ||
1079 | a->tv_sec--; | ||
1080 | a->tv_usec += 1000000; | ||
1081 | } | ||
1082 | } | ||
1083 | |||
1084 | static unsigned int o2hb_elapsed_msecs(struct timeval *start, | ||
1085 | struct timeval *end) | ||
1086 | { | ||
1087 | struct timeval res = *end; | ||
1088 | |||
1089 | o2hb_tv_subtract(&res, start); | ||
1090 | |||
1091 | return res.tv_sec * 1000 + res.tv_usec / 1000; | ||
1092 | } | ||
1093 | |||
1094 | /* | 1063 | /* |
1095 | * we ride the region ref that the region dir holds. before the region | 1064 | * we ride the region ref that the region dir holds. before the region |
1096 | * dir is removed and drops it ref it will wait to tear down this | 1065 | * dir is removed and drops it ref it will wait to tear down this |
@@ -1101,7 +1070,7 @@ static int o2hb_thread(void *data) | |||
1101 | int i, ret; | 1070 | int i, ret; |
1102 | struct o2hb_region *reg = data; | 1071 | struct o2hb_region *reg = data; |
1103 | struct o2hb_bio_wait_ctxt write_wc; | 1072 | struct o2hb_bio_wait_ctxt write_wc; |
1104 | struct timeval before_hb, after_hb; | 1073 | ktime_t before_hb, after_hb; |
1105 | unsigned int elapsed_msec; | 1074 | unsigned int elapsed_msec; |
1106 | 1075 | ||
1107 | mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); | 1076 | mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); |
@@ -1118,18 +1087,18 @@ static int o2hb_thread(void *data) | |||
1118 | * hr_timeout_ms between disk writes. On busy systems | 1087 | * hr_timeout_ms between disk writes. On busy systems |
1119 | * this should result in a heartbeat which is less | 1088 | * this should result in a heartbeat which is less |
1120 | * likely to time itself out. */ | 1089 | * likely to time itself out. */ |
1121 | do_gettimeofday(&before_hb); | 1090 | before_hb = ktime_get_real(); |
1122 | 1091 | ||
1123 | ret = o2hb_do_disk_heartbeat(reg); | 1092 | ret = o2hb_do_disk_heartbeat(reg); |
1124 | 1093 | ||
1125 | do_gettimeofday(&after_hb); | 1094 | after_hb = ktime_get_real(); |
1126 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); | 1095 | |
1096 | elapsed_msec = (unsigned int) | ||
1097 | ktime_ms_delta(after_hb, before_hb); | ||
1127 | 1098 | ||
1128 | mlog(ML_HEARTBEAT, | 1099 | mlog(ML_HEARTBEAT, |
1129 | "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", | 1100 | "start = %lld, end = %lld, msec = %u, ret = %d\n", |
1130 | before_hb.tv_sec, (unsigned long) before_hb.tv_usec, | 1101 | before_hb.tv64, after_hb.tv64, elapsed_msec, ret); |
1131 | after_hb.tv_sec, (unsigned long) after_hb.tv_usec, | ||
1132 | elapsed_msec, ret); | ||
1133 | 1102 | ||
1134 | if (!kthread_should_stop() && | 1103 | if (!kthread_should_stop() && |
1135 | elapsed_msec < reg->hr_timeout_ms) { | 1104 | elapsed_msec < reg->hr_timeout_ms) { |
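The hand-rolled timeval subtraction disappears because ktime_ms_delta() already computes a signed millisecond delta between two ktime_t samples; a backwards clock step yields a negative value rather than an underflow. A minimal sketch, with do_work() as a hypothetical stand-in for the disk heartbeat:

    #include <linux/ktime.h>

    void do_work(void);     /* hypothetical stand-in for the heartbeat I/O */

    static unsigned int timed_work_ms(void)
    {
            ktime_t before, after;

            before = ktime_get_real();
            do_work();
            after = ktime_get_real();

            /* signed s64 delta; no manual tv_sec/tv_usec borrowing */
            return (unsigned int)ktime_ms_delta(after, before);
    }

For pure interval measurement the monotonic ktime_get() is usually preferable, since ktime_get_real() can jump when the wall clock is adjusted; the heartbeat code keeps real time here, presumably so logged timestamps stay meaningful, and the signed delta at least keeps a backward step from blowing up the timeout math.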
@@ -1619,17 +1588,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) | |||
1619 | struct o2hb_disk_slot *slot; | 1588 | struct o2hb_disk_slot *slot; |
1620 | 1589 | ||
1621 | reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); | 1590 | reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); |
1622 | if (reg->hr_tmp_block == NULL) { | 1591 | if (reg->hr_tmp_block == NULL) |
1623 | mlog_errno(-ENOMEM); | ||
1624 | return -ENOMEM; | 1592 | return -ENOMEM; |
1625 | } | ||
1626 | 1593 | ||
1627 | reg->hr_slots = kcalloc(reg->hr_blocks, | 1594 | reg->hr_slots = kcalloc(reg->hr_blocks, |
1628 | sizeof(struct o2hb_disk_slot), GFP_KERNEL); | 1595 | sizeof(struct o2hb_disk_slot), GFP_KERNEL); |
1629 | if (reg->hr_slots == NULL) { | 1596 | if (reg->hr_slots == NULL) |
1630 | mlog_errno(-ENOMEM); | ||
1631 | return -ENOMEM; | 1597 | return -ENOMEM; |
1632 | } | ||
1633 | 1598 | ||
1634 | for(i = 0; i < reg->hr_blocks; i++) { | 1599 | for(i = 0; i < reg->hr_blocks; i++) { |
1635 | slot = ®->hr_slots[i]; | 1600 | slot = ®->hr_slots[i]; |
@@ -1645,17 +1610,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) | |||
1645 | 1610 | ||
1646 | reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), | 1611 | reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), |
1647 | GFP_KERNEL); | 1612 | GFP_KERNEL); |
1648 | if (!reg->hr_slot_data) { | 1613 | if (!reg->hr_slot_data) |
1649 | mlog_errno(-ENOMEM); | ||
1650 | return -ENOMEM; | 1614 | return -ENOMEM; |
1651 | } | ||
1652 | 1615 | ||
1653 | for(i = 0; i < reg->hr_num_pages; i++) { | 1616 | for(i = 0; i < reg->hr_num_pages; i++) { |
1654 | page = alloc_page(GFP_KERNEL); | 1617 | page = alloc_page(GFP_KERNEL); |
1655 | if (!page) { | 1618 | if (!page) |
1656 | mlog_errno(-ENOMEM); | ||
1657 | return -ENOMEM; | 1619 | return -ENOMEM; |
1658 | } | ||
1659 | 1620 | ||
1660 | reg->hr_slot_data[i] = page; | 1621 | reg->hr_slot_data[i] = page; |
1661 | 1622 | ||
@@ -1687,10 +1648,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) | |||
1687 | struct o2hb_disk_heartbeat_block *hb_block; | 1648 | struct o2hb_disk_heartbeat_block *hb_block; |
1688 | 1649 | ||
1689 | ret = o2hb_read_slots(reg, reg->hr_blocks); | 1650 | ret = o2hb_read_slots(reg, reg->hr_blocks); |
1690 | if (ret) { | 1651 | if (ret) |
1691 | mlog_errno(ret); | ||
1692 | goto out; | 1652 | goto out; |
1693 | } | ||
1694 | 1653 | ||
1695 | /* We only want to get an idea of the values initially in each | 1654 | /* We only want to get an idea of the values initially in each |
1696 | * slot, so we do no verification - o2hb_check_slot will | 1655 | * slot, so we do no verification - o2hb_check_slot will |
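The dropped mlog_errno(-ENOMEM) calls were redundant: unless __GFP_NOWARN is passed, a failed kmalloc() or alloc_page() already logs a warning with a backtrace, so the caller only needs to propagate -ENOMEM. As a sketch:

    #include <linux/errno.h>
    #include <linux/slab.h>

    /* A failed kmalloc() already warns, so just hand back -ENOMEM
     * without logging a second time. */
    static int setup_block(void **out, size_t len)
    {
            *out = kmalloc(len, GFP_KERNEL);
            if (!*out)
                    return -ENOMEM;
            return 0;
    }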
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 02878a83f0b4..ffecf89c8c1c 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) | |||
480 | 480 | ||
481 | trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); | 481 | trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); |
482 | if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { | 482 | if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { |
483 | rc = -EINVAL; | 483 | rc = ocfs2_error(dir->i_sb, |
484 | ocfs2_error(dir->i_sb, | 484 | "Invalid dirblock #%llu: signature = %.*s\n", |
485 | "Invalid dirblock #%llu: " | 485 | (unsigned long long)bh->b_blocknr, 7, |
486 | "signature = %.*s\n", | 486 | trailer->db_signature); |
487 | (unsigned long long)bh->b_blocknr, 7, | ||
488 | trailer->db_signature); | ||
489 | goto out; | 487 | goto out; |
490 | } | 488 | } |
491 | if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { | 489 | if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { |
492 | rc = -EINVAL; | 490 | rc = ocfs2_error(dir->i_sb, |
493 | ocfs2_error(dir->i_sb, | 491 | "Directory block #%llu has an invalid db_blkno of %llu\n", |
494 | "Directory block #%llu has an invalid " | 492 | (unsigned long long)bh->b_blocknr, |
495 | "db_blkno of %llu", | 493 | (unsigned long long)le64_to_cpu(trailer->db_blkno)); |
496 | (unsigned long long)bh->b_blocknr, | ||
497 | (unsigned long long)le64_to_cpu(trailer->db_blkno)); | ||
498 | goto out; | 494 | goto out; |
499 | } | 495 | } |
500 | if (le64_to_cpu(trailer->db_parent_dinode) != | 496 | if (le64_to_cpu(trailer->db_parent_dinode) != |
501 | OCFS2_I(dir)->ip_blkno) { | 497 | OCFS2_I(dir)->ip_blkno) { |
502 | rc = -EINVAL; | 498 | rc = ocfs2_error(dir->i_sb, |
503 | ocfs2_error(dir->i_sb, | 499 | "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", |
504 | "Directory block #%llu on dinode " | 500 | (unsigned long long)bh->b_blocknr, |
505 | "#%llu has an invalid parent_dinode " | 501 | (unsigned long long)OCFS2_I(dir)->ip_blkno, |
506 | "of %llu", | 502 | (unsigned long long)le64_to_cpu(trailer->db_blkno)); |
507 | (unsigned long long)bh->b_blocknr, | ||
508 | (unsigned long long)OCFS2_I(dir)->ip_blkno, | ||
509 | (unsigned long long)le64_to_cpu(trailer->db_blkno)); | ||
510 | goto out; | 503 | goto out; |
511 | } | 504 | } |
512 | out: | 505 | out: |
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb, | |||
604 | } | 597 | } |
605 | 598 | ||
606 | if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { | 599 | if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { |
607 | ocfs2_error(sb, | 600 | ret = ocfs2_error(sb, |
608 | "Dir Index Root # %llu has bad signature %.*s", | 601 | "Dir Index Root # %llu has bad signature %.*s\n", |
609 | (unsigned long long)le64_to_cpu(dx_root->dr_blkno), | 602 | (unsigned long long)le64_to_cpu(dx_root->dr_blkno), |
610 | 7, dx_root->dr_signature); | 603 | 7, dx_root->dr_signature); |
611 | return -EINVAL; | ||
612 | } | 604 | } |
613 | 605 | ||
614 | return 0; | 606 | return ret; |
615 | } | 607 | } |
616 | 608 | ||
617 | static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, | 609 | static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, |
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, | |||
648 | } | 640 | } |
649 | 641 | ||
650 | if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { | 642 | if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { |
651 | ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", | 643 | ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", |
652 | 7, dx_leaf->dl_signature); | 644 | 7, dx_leaf->dl_signature); |
653 | return -EROFS; | ||
654 | } | 645 | } |
655 | 646 | ||
656 | return 0; | 647 | return ret; |
657 | } | 648 | } |
658 | 649 | ||
659 | static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, | 650 | static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, |
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, | |||
812 | el = &eb->h_list; | 803 | el = &eb->h_list; |
813 | 804 | ||
814 | if (el->l_tree_depth) { | 805 | if (el->l_tree_depth) { |
815 | ocfs2_error(inode->i_sb, | 806 | ret = ocfs2_error(inode->i_sb, |
816 | "Inode %lu has non zero tree depth in " | 807 | "Inode %lu has non zero tree depth in btree tree block %llu\n", |
817 | "btree tree block %llu\n", inode->i_ino, | 808 | inode->i_ino, |
818 | (unsigned long long)eb_bh->b_blocknr); | 809 | (unsigned long long)eb_bh->b_blocknr); |
819 | ret = -EROFS; | ||
820 | goto out; | 810 | goto out; |
821 | } | 811 | } |
822 | } | 812 | } |
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, | |||
832 | } | 822 | } |
833 | 823 | ||
834 | if (!found) { | 824 | if (!found) { |
835 | ocfs2_error(inode->i_sb, "Inode %lu has bad extent " | 825 | ret = ocfs2_error(inode->i_sb, |
836 | "record (%u, %u, 0) in btree", inode->i_ino, | 826 | "Inode %lu has bad extent record (%u, %u, 0) in btree\n", |
837 | le32_to_cpu(rec->e_cpos), | 827 | inode->i_ino, |
838 | ocfs2_rec_clusters(el, rec)); | 828 | le32_to_cpu(rec->e_cpos), |
839 | ret = -EROFS; | 829 | ocfs2_rec_clusters(el, rec)); |
840 | goto out; | 830 | goto out; |
841 | } | 831 | } |
842 | 832 | ||
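The dir.c hunks (and the alloc.c ones earlier) read as they do because this series makes ocfs2_error() return an errno, collapsing the assign-then-log pairs into a single statement. A sketch of the shape; the concrete return value is an assumption here (-EROFS, matching how these call sites propagate it):

    #include <linux/errno.h>
    #include <linux/printk.h>

    /* hypothetical stand-in for ocfs2_error(): report the corruption,
     * then hand back the errno the caller should propagate */
    static int report_corruption(const char *what, unsigned long long blkno)
    {
            pr_err("corruption: %s at block %llu\n", what, blkno);
            return -EROFS;
    }

Call sites then shrink to plain `return report_corruption(...);` or `rc = report_corruption(...); goto out;`, which is exactly the rewrite the hunks above perform.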
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7df88a6dd626..6918f30d02cd 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm, | |||
1465 | if (status == -ENOPROTOOPT) { | 1465 | if (status == -ENOPROTOOPT) { |
1466 | status = 0; | 1466 | status = 0; |
1467 | *response = JOIN_OK_NO_MAP; | 1467 | *response = JOIN_OK_NO_MAP; |
1468 | } else if (packet.code == JOIN_DISALLOW || | ||
1469 | packet.code == JOIN_OK_NO_MAP) { | ||
1470 | *response = packet.code; | ||
1471 | } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { | ||
1472 | mlog(ML_NOTICE, | ||
1473 | "This node requested DLM locking protocol %u.%u and " | ||
1474 | "filesystem locking protocol %u.%u. At least one of " | ||
1475 | "the protocol versions on node %d is not compatible, " | ||
1476 | "disconnecting\n", | ||
1477 | dlm->dlm_locking_proto.pv_major, | ||
1478 | dlm->dlm_locking_proto.pv_minor, | ||
1479 | dlm->fs_locking_proto.pv_major, | ||
1480 | dlm->fs_locking_proto.pv_minor, | ||
1481 | node); | ||
1482 | status = -EPROTO; | ||
1483 | *response = packet.code; | ||
1484 | } else if (packet.code == JOIN_OK) { | ||
1485 | *response = packet.code; | ||
1486 | /* Use the same locking protocol as the remote node */ | ||
1487 | dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; | ||
1488 | dlm->fs_locking_proto.pv_minor = packet.fs_minor; | ||
1489 | mlog(0, | ||
1490 | "Node %d responds JOIN_OK with DLM locking protocol " | ||
1491 | "%u.%u and fs locking protocol %u.%u\n", | ||
1492 | node, | ||
1493 | dlm->dlm_locking_proto.pv_major, | ||
1494 | dlm->dlm_locking_proto.pv_minor, | ||
1495 | dlm->fs_locking_proto.pv_major, | ||
1496 | dlm->fs_locking_proto.pv_minor); | ||
1497 | } else { | 1468 | } else { |
1498 | status = -EINVAL; | 1469 | *response = packet.code; |
1499 | mlog(ML_ERROR, "invalid response %d from node %u\n", | 1470 | switch (packet.code) { |
1500 | packet.code, node); | 1471 | case JOIN_DISALLOW: |
1472 | case JOIN_OK_NO_MAP: | ||
1473 | break; | ||
1474 | case JOIN_PROTOCOL_MISMATCH: | ||
1475 | mlog(ML_NOTICE, | ||
1476 | "This node requested DLM locking protocol %u.%u and " | ||
1477 | "filesystem locking protocol %u.%u. At least one of " | ||
1478 | "the protocol versions on node %d is not compatible, " | ||
1479 | "disconnecting\n", | ||
1480 | dlm->dlm_locking_proto.pv_major, | ||
1481 | dlm->dlm_locking_proto.pv_minor, | ||
1482 | dlm->fs_locking_proto.pv_major, | ||
1483 | dlm->fs_locking_proto.pv_minor, | ||
1484 | node); | ||
1485 | status = -EPROTO; | ||
1486 | break; | ||
1487 | case JOIN_OK: | ||
1488 | /* Use the same locking protocol as the remote node */ | ||
1489 | dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; | ||
1490 | dlm->fs_locking_proto.pv_minor = packet.fs_minor; | ||
1491 | mlog(0, | ||
1492 | "Node %d responds JOIN_OK with DLM locking protocol " | ||
1493 | "%u.%u and fs locking protocol %u.%u\n", | ||
1494 | node, | ||
1495 | dlm->dlm_locking_proto.pv_major, | ||
1496 | dlm->dlm_locking_proto.pv_minor, | ||
1497 | dlm->fs_locking_proto.pv_major, | ||
1498 | dlm->fs_locking_proto.pv_minor); | ||
1499 | break; | ||
1500 | default: | ||
1501 | status = -EINVAL; | ||
1502 | mlog(ML_ERROR, "invalid response %d from node %u\n", | ||
1503 | packet.code, node); | ||
1504 | /* Reset response to JOIN_DISALLOW */ | ||
1505 | *response = JOIN_DISALLOW; | ||
1506 | break; | ||
1507 | } | ||
1501 | } | 1508 | } |
1502 | 1509 | ||
1503 | mlog(0, "status %d, node %d response is %d\n", status, node, | 1510 | mlog(0, "status %d, node %d response is %d\n", status, node, |
@@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) | |||
1725 | 1732 | ||
1726 | o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, | 1733 | o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, |
1727 | dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); | 1734 | dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); |
1735 | o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, | ||
1736 | dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); | ||
1737 | |||
1728 | status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); | 1738 | status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); |
1729 | if (status) | 1739 | if (status) |
1730 | goto bail; | 1740 | goto bail; |
1731 | 1741 | ||
1732 | o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, | ||
1733 | dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); | ||
1734 | status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); | 1742 | status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); |
1735 | if (status) | 1743 | if (status) |
1736 | goto bail; | 1744 | goto bail; |
@@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) | |||
1845 | sizeof(struct dlm_exit_domain), | 1853 | sizeof(struct dlm_exit_domain), |
1846 | dlm_begin_exit_domain_handler, | 1854 | dlm_begin_exit_domain_handler, |
1847 | dlm, NULL, &dlm->dlm_domain_handlers); | 1855 | dlm, NULL, &dlm->dlm_domain_handlers); |
1848 | if (status) | ||
1849 | goto bail; | ||
1850 | 1856 | ||
1851 | bail: | 1857 | bail: |
1852 | if (status) | 1858 | if (status) |
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index fdf4b41d0609..46b8b2bbc95a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref) | |||
498 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, | 498 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, |
499 | res->lockname.name); | 499 | res->lockname.name); |
500 | 500 | ||
501 | spin_lock(&dlm->track_lock); | ||
502 | if (!list_empty(&res->tracking)) | ||
503 | list_del_init(&res->tracking); | ||
504 | else { | ||
505 | mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", | ||
506 | res->lockname.len, res->lockname.name); | ||
507 | dlm_print_one_lock_resource(res); | ||
508 | } | ||
509 | spin_unlock(&dlm->track_lock); | ||
510 | |||
511 | atomic_dec(&dlm->res_cur_count); | 501 | atomic_dec(&dlm->res_cur_count); |
512 | 502 | ||
513 | if (!hlist_unhashed(&res->hash_node) || | 503 | if (!hlist_unhashed(&res->hash_node) || |
@@ -795,8 +785,18 @@ lookup: | |||
795 | dlm_lockres_grab_inflight_ref(dlm, tmpres); | 785 | dlm_lockres_grab_inflight_ref(dlm, tmpres); |
796 | 786 | ||
797 | spin_unlock(&tmpres->spinlock); | 787 | spin_unlock(&tmpres->spinlock); |
798 | if (res) | 788 | if (res) { |
789 | spin_lock(&dlm->track_lock); | ||
790 | if (!list_empty(&res->tracking)) | ||
791 | list_del_init(&res->tracking); | ||
792 | else | ||
793 | mlog(ML_ERROR, "Resource %.*s not " | ||
794 | "on the Tracking list\n", | ||
795 | res->lockname.len, | ||
796 | res->lockname.name); | ||
797 | spin_unlock(&dlm->track_lock); | ||
799 | dlm_lockres_put(res); | 798 | dlm_lockres_put(res); |
799 | } | ||
800 | res = tmpres; | 800 | res = tmpres; |
801 | goto leave; | 801 | goto leave; |
802 | } | 802 | } |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ce12e0b1a31f..d0e436dc6437 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1776 | struct dlm_migratable_lockres *mres) | 1776 | struct dlm_migratable_lockres *mres) |
1777 | { | 1777 | { |
1778 | struct dlm_migratable_lock *ml; | 1778 | struct dlm_migratable_lock *ml; |
1779 | struct list_head *queue, *iter; | 1779 | struct list_head *queue; |
1780 | struct list_head *tmpq = NULL; | 1780 | struct list_head *tmpq = NULL; |
1781 | struct dlm_lock *newlock = NULL; | 1781 | struct dlm_lock *newlock = NULL; |
1782 | struct dlm_lockstatus *lksb = NULL; | 1782 | struct dlm_lockstatus *lksb = NULL; |
@@ -1821,9 +1821,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1821 | spin_lock(&res->spinlock); | 1821 | spin_lock(&res->spinlock); |
1822 | for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { | 1822 | for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { |
1823 | tmpq = dlm_list_idx_to_ptr(res, j); | 1823 | tmpq = dlm_list_idx_to_ptr(res, j); |
1824 | list_for_each(iter, tmpq) { | 1824 | list_for_each_entry(lock, tmpq, list) { |
1825 | lock = list_entry(iter, | ||
1826 | struct dlm_lock, list); | ||
1827 | if (lock->ml.cookie == ml->cookie) | 1825 | if (lock->ml.cookie == ml->cookie) |
1828 | break; | 1826 | break; |
1829 | lock = NULL; | 1827 | lock = NULL; |
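The dlmrecovery.c hunk is a mechanical cleanup: the open-coded list_for_each() plus list_entry() pair becomes list_for_each_entry(), which does the container lookup itself and retires the iter variable. A compilable userspace sketch of the macro and its use (demo_lock stands in for struct dlm_lock; typeof is a GCC/Clang extension, as in the kernel):

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    #define list_for_each_entry(pos, head, member)                           \
        for ((pos) = container_of((head)->next, typeof(*(pos)), member);     \
             &(pos)->member != (head);                                       \
             (pos) = container_of((pos)->member.next, typeof(*(pos)), member))

    struct demo_lock {                /* stand-in for struct dlm_lock */
        unsigned long cookie;
        struct list_head list;
    };

    int main(void)
    {
        struct list_head queue = { &queue, &queue };
        struct demo_lock a = { .cookie = 42 };
        struct demo_lock *lock;

        /* splice the single element in by hand */
        a.list.next = a.list.prev = &queue;
        queue.next = queue.prev = &a.list;

        list_for_each_entry(lock, &queue, list)
            if (lock->cookie == 42)
                printf("found cookie %lu\n", lock->cookie);
        return 0;
    }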
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 69aac6f088ad..2e5e6d5fffe8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c | |||
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, | |||
211 | 211 | ||
212 | __dlm_unhash_lockres(dlm, res); | 212 | __dlm_unhash_lockres(dlm, res); |
213 | 213 | ||
214 | spin_lock(&dlm->track_lock); | ||
215 | if (!list_empty(&res->tracking)) | ||
216 | list_del_init(&res->tracking); | ||
217 | else { | ||
218 | mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", | ||
219 | res->lockname.len, res->lockname.name); | ||
220 | __dlm_print_one_lock_resource(res); | ||
221 | } | ||
222 | spin_unlock(&dlm->track_lock); | ||
223 | |||
214 | /* lockres is not in the hash now. drop the flag and wake up | 224 | /* lockres is not in the hash now. drop the flag and wake up |
215 | * any processes waiting in dlm_get_lock_resource. */ | 225 | * any processes waiting in dlm_get_lock_resource. */ |
216 | if (!master) { | 226 | if (!master) { |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 23157e40dd74..1c91103c1333 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -3035,8 +3035,6 @@ local: | |||
3035 | ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); | 3035 | ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); |
3036 | 3036 | ||
3037 | osb->cconn = conn; | 3037 | osb->cconn = conn; |
3038 | |||
3039 | status = 0; | ||
3040 | bail: | 3038 | bail: |
3041 | if (status < 0) { | 3039 | if (status < 0) { |
3042 | ocfs2_dlm_shutdown_debug(osb); | 3040 | ocfs2_dlm_shutdown_debug(osb); |
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 767370b656ca..e4719e0a3f99 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, | |||
305 | 305 | ||
306 | if (el->l_tree_depth) { | 306 | if (el->l_tree_depth) { |
307 | ocfs2_error(inode->i_sb, | 307 | ocfs2_error(inode->i_sb, |
308 | "Inode %lu has non zero tree depth in " | 308 | "Inode %lu has non zero tree depth in leaf block %llu\n", |
309 | "leaf block %llu\n", inode->i_ino, | 309 | inode->i_ino, |
310 | (unsigned long long)eb_bh->b_blocknr); | 310 | (unsigned long long)eb_bh->b_blocknr); |
311 | ret = -EROFS; | 311 | ret = -EROFS; |
312 | goto out; | 312 | goto out; |
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, | |||
441 | 441 | ||
442 | if (el->l_tree_depth) { | 442 | if (el->l_tree_depth) { |
443 | ocfs2_error(inode->i_sb, | 443 | ocfs2_error(inode->i_sb, |
444 | "Inode %lu has non zero tree depth in " | 444 | "Inode %lu has non zero tree depth in leaf block %llu\n", |
445 | "leaf block %llu\n", inode->i_ino, | 445 | inode->i_ino, |
446 | (unsigned long long)eb_bh->b_blocknr); | 446 | (unsigned long long)eb_bh->b_blocknr); |
447 | ret = -EROFS; | 447 | ret = -EROFS; |
448 | goto out; | 448 | goto out; |
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, | |||
475 | BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); | 475 | BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); |
476 | 476 | ||
477 | if (!rec->e_blkno) { | 477 | if (!rec->e_blkno) { |
478 | ocfs2_error(inode->i_sb, "Inode %lu has bad extent " | 478 | ocfs2_error(inode->i_sb, |
479 | "record (%u, %u, 0)", inode->i_ino, | 479 | "Inode %lu has bad extent record (%u, %u, 0)\n", |
480 | inode->i_ino, | ||
480 | le32_to_cpu(rec->e_cpos), | 481 | le32_to_cpu(rec->e_cpos), |
481 | ocfs2_rec_clusters(el, rec)); | 482 | ocfs2_rec_clusters(el, rec)); |
482 | ret = -EROFS; | 483 | ret = -EROFS; |
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, | |||
564 | 565 | ||
565 | if (el->l_tree_depth) { | 566 | if (el->l_tree_depth) { |
566 | ocfs2_error(inode->i_sb, | 567 | ocfs2_error(inode->i_sb, |
567 | "Inode %lu has non zero tree depth in " | 568 | "Inode %lu has non zero tree depth in xattr leaf block %llu\n", |
568 | "xattr leaf block %llu\n", inode->i_ino, | 569 | inode->i_ino, |
569 | (unsigned long long)eb_bh->b_blocknr); | 570 | (unsigned long long)eb_bh->b_blocknr); |
570 | ret = -EROFS; | 571 | ret = -EROFS; |
571 | goto out; | 572 | goto out; |
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, | |||
582 | BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); | 583 | BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); |
583 | 584 | ||
584 | if (!rec->e_blkno) { | 585 | if (!rec->e_blkno) { |
585 | ocfs2_error(inode->i_sb, "Inode %lu has bad extent " | 586 | ocfs2_error(inode->i_sb, |
586 | "record (%u, %u, 0) in xattr", inode->i_ino, | 587 | "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", |
588 | inode->i_ino, | ||
587 | le32_to_cpu(rec->e_cpos), | 589 | le32_to_cpu(rec->e_cpos), |
588 | ocfs2_rec_clusters(el, rec)); | 590 | ocfs2_rec_clusters(el, rec)); |
589 | ret = -EROFS; | 591 | ret = -EROFS; |
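These extent_map.c hunks apply the CodingStyle rule that user-visible strings are never split across source lines, so the logged text stays greppable; the arguments wrap instead, and each message gains its terminating \n (which matters once the printk in __ocfs2_error() stops adding one, per the super.c hunk below). The shape of the change, with printf standing in for ocfs2_error():

    #include <stdio.h>

    int main(void)
    {
        unsigned long ino = 12345;
        unsigned long long blk = 67890;

        /* before: the literal is split, so grepping for the logged
         * message cannot find it in the source */
        printf("Inode %lu has non zero tree depth in "
               "leaf block %llu\n", ino, blk);

        /* after: one greppable literal; the arguments wrap instead */
        printf("Inode %lu has non zero tree depth in leaf block %llu\n",
               ino, blk);
        return 0;
    }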
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7210583b472f..0e5b4515f92e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1130,6 +1130,7 @@ out: | |||
1130 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | 1130 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) |
1131 | { | 1131 | { |
1132 | int status = 0, size_change; | 1132 | int status = 0, size_change; |
1133 | int inode_locked = 0; | ||
1133 | struct inode *inode = d_inode(dentry); | 1134 | struct inode *inode = d_inode(dentry); |
1134 | struct super_block *sb = inode->i_sb; | 1135 | struct super_block *sb = inode->i_sb; |
1135 | struct ocfs2_super *osb = OCFS2_SB(sb); | 1136 | struct ocfs2_super *osb = OCFS2_SB(sb); |
@@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
1178 | mlog_errno(status); | 1179 | mlog_errno(status); |
1179 | goto bail_unlock_rw; | 1180 | goto bail_unlock_rw; |
1180 | } | 1181 | } |
1182 | inode_locked = 1; | ||
1181 | 1183 | ||
1182 | if (size_change) { | 1184 | if (size_change) { |
1183 | status = inode_newsize_ok(inode, attr->ia_size); | 1185 | status = inode_newsize_ok(inode, attr->ia_size); |
@@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
1258 | bail_commit: | 1260 | bail_commit: |
1259 | ocfs2_commit_trans(osb, handle); | 1261 | ocfs2_commit_trans(osb, handle); |
1260 | bail_unlock: | 1262 | bail_unlock: |
1261 | ocfs2_inode_unlock(inode, 1); | 1263 | if (status) { |
1264 | ocfs2_inode_unlock(inode, 1); | ||
1265 | inode_locked = 0; | ||
1266 | } | ||
1262 | bail_unlock_rw: | 1267 | bail_unlock_rw: |
1263 | if (size_change) | 1268 | if (size_change) |
1264 | ocfs2_rw_unlock(inode, 1); | 1269 | ocfs2_rw_unlock(inode, 1); |
@@ -1274,6 +1279,8 @@ bail: | |||
1274 | if (status < 0) | 1279 | if (status < 0) |
1275 | mlog_errno(status); | 1280 | mlog_errno(status); |
1276 | } | 1281 | } |
1282 | if (inode_locked) | ||
1283 | ocfs2_inode_unlock(inode, 1); | ||
1277 | 1284 | ||
1278 | return status; | 1285 | return status; |
1279 | } | 1286 | } |
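The ocfs2_setattr() fix introduces an inode_locked flag so the function can tell, at the common exit, whether it still owns the cluster lock: error paths through bail_unlock drop it immediately as before, while the success path now keeps it held across the tail of the function and releases it exactly once at the end. A userspace sketch of the flag-guarded unlock (a pthread mutex stands in for ocfs2_inode_lock()/ocfs2_inode_unlock()):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

    static int setattr_demo(int fail)
    {
        int status = 0, inode_locked = 0;

        pthread_mutex_lock(&inode_lock);
        inode_locked = 1;

        if (fail)
            status = -1;

        /* bail_unlock: only the error path unlocks early */
        if (status) {
            pthread_mutex_unlock(&inode_lock);
            inode_locked = 0;
        }

        /* ... tail work that still relies on the lock on success ... */

        if (inode_locked)
            pthread_mutex_unlock(&inode_lock);
        return status;
    }

    int main(void)
    {
        printf("ok=%d err=%d\n", setattr_demo(0), setattr_demo(1));
        return 0;
    }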
@@ -2262,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, | |||
2262 | ssize_t written = 0; | 2269 | ssize_t written = 0; |
2263 | ssize_t ret; | 2270 | ssize_t ret; |
2264 | size_t count = iov_iter_count(from), orig_count; | 2271 | size_t count = iov_iter_count(from), orig_count; |
2265 | loff_t old_size; | ||
2266 | u32 old_clusters; | ||
2267 | struct file *file = iocb->ki_filp; | 2272 | struct file *file = iocb->ki_filp; |
2268 | struct inode *inode = file_inode(file); | 2273 | struct inode *inode = file_inode(file); |
2269 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2274 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
@@ -2271,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, | |||
2271 | OCFS2_MOUNT_COHERENCY_BUFFERED); | 2276 | OCFS2_MOUNT_COHERENCY_BUFFERED); |
2272 | int unaligned_dio = 0; | 2277 | int unaligned_dio = 0; |
2273 | int dropped_dio = 0; | 2278 | int dropped_dio = 0; |
2279 | int append_write = ((iocb->ki_pos + count) >= | ||
2280 | i_size_read(inode) ? 1 : 0); | ||
2274 | 2281 | ||
2275 | trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, | 2282 | trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, |
2276 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 2283 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
@@ -2290,8 +2297,9 @@ relock: | |||
2290 | /* | 2297 | /* |
2291 | * Concurrent O_DIRECT writes are allowed with | 2298 | * Concurrent O_DIRECT writes are allowed with |
2292 | * mount_option "coherency=buffered". | 2299 | * mount_option "coherency=buffered". |
2300 | * For append write, we must take rw EX. | ||
2293 | */ | 2301 | */ |
2294 | rw_level = (!direct_io || full_coherency); | 2302 | rw_level = (!direct_io || full_coherency || append_write); |
2295 | 2303 | ||
2296 | ret = ocfs2_rw_lock(inode, rw_level); | 2304 | ret = ocfs2_rw_lock(inode, rw_level); |
2297 | if (ret < 0) { | 2305 | if (ret < 0) { |
@@ -2364,13 +2372,6 @@ relock: | |||
2364 | ocfs2_iocb_set_unaligned_aio(iocb); | 2372 | ocfs2_iocb_set_unaligned_aio(iocb); |
2365 | } | 2373 | } |
2366 | 2374 | ||
2367 | /* | ||
2368 | * To later detect whether a journal commit for sync writes is | ||
2369 | * necessary, we sample i_size, and cluster count here. | ||
2370 | */ | ||
2371 | old_size = i_size_read(inode); | ||
2372 | old_clusters = OCFS2_I(inode)->ip_clusters; | ||
2373 | |||
2374 | /* communicate with ocfs2_dio_end_io */ | 2375 | /* communicate with ocfs2_dio_end_io */ |
2375 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | 2376 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
2376 | 2377 | ||
@@ -2378,6 +2379,20 @@ relock: | |||
2378 | /* buffered aio wouldn't have proper lock coverage today */ | 2379 | /* buffered aio wouldn't have proper lock coverage today */ |
2379 | BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); | 2380 | BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); |
2380 | 2381 | ||
2382 | /* | ||
2383 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | ||
2384 | * function pointer which is called when o_direct io completes so that | ||
2385 | * it can unlock our rw lock. | ||
2386 | * Unfortunately there are error cases which call end_io and others | ||
2387 | * that don't, so we don't have to unlock the rw_lock if either an | ||
2388 | * async dio is going to do it in the future or an end_io after an | ||
2389 | * error has already done it. | ||
2390 | */ | ||
2391 | if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { | ||
2392 | rw_level = -1; | ||
2393 | unaligned_dio = 0; | ||
2394 | } | ||
2395 | |||
2381 | if (unlikely(written <= 0)) | 2396 | if (unlikely(written <= 0)) |
2382 | goto no_sync; | 2397 | goto no_sync; |
2383 | 2398 | ||
@@ -2402,21 +2417,7 @@ relock: | |||
2402 | } | 2417 | } |
2403 | 2418 | ||
2404 | no_sync: | 2419 | no_sync: |
2405 | /* | 2420 | if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { |
2406 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | ||
2407 | * function pointer which is called when o_direct io completes so that | ||
2408 | * it can unlock our rw lock. | ||
2409 | * Unfortunately there are error cases which call end_io and others | ||
2410 | * that don't. so we don't have to unlock the rw_lock if either an | ||
2411 | * async dio is going to do it in the future or an end_io after an | ||
2412 | * error has already done it. | ||
2413 | */ | ||
2414 | if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { | ||
2415 | rw_level = -1; | ||
2416 | unaligned_dio = 0; | ||
2417 | } | ||
2418 | |||
2419 | if (unaligned_dio) { | ||
2420 | ocfs2_iocb_clear_unaligned_aio(iocb); | 2421 | ocfs2_iocb_clear_unaligned_aio(iocb); |
2421 | mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); | 2422 | mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); |
2422 | } | 2423 | } |
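Two changes interlock in ocfs2_file_write_iter(). First, the rw lock level now also depends on whether the write extends i_size: an appending write must take the lock exclusive even when O_DIRECT with coherency=buffered would otherwise allow a shared lock. Second, the end_io bookkeeping that can hand the rw lock to the async completion path moves above the no_sync label, so the unaligned-dio cleanup runs only while this task still owns that state. The lock-level decision in isolation (1 means EX, 0 shared; a sketch, not the kernel function):

    #include <stdio.h>

    static int pick_rw_level(int direct_io, int full_coherency,
                             long long pos, long long count,
                             long long i_size)
    {
        int append_write = (pos + count) >= i_size;

        /* concurrent O_DIRECT is allowed under coherency=buffered,
         * but an append write must still take rw EX */
        return !direct_io || full_coherency || append_write;
    }

    int main(void)
    {
        /* direct, coherency=buffered, inside i_size -> shared */
        printf("%d\n", pick_rw_level(1, 0, 0, 512, 4096));
        /* the same write, but extending the file -> exclusive */
        printf("%d\n", pick_rw_level(1, 0, 4096, 512, 4096));
        return 0;
    }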
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b254416dc8d9..8f87e05ee25d 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode) | |||
971 | int wipe, status; | 971 | int wipe, status; |
972 | sigset_t oldset; | 972 | sigset_t oldset; |
973 | struct buffer_head *di_bh = NULL; | 973 | struct buffer_head *di_bh = NULL; |
974 | struct ocfs2_dinode *di = NULL; | ||
974 | 975 | ||
975 | trace_ocfs2_delete_inode(inode->i_ino, | 976 | trace_ocfs2_delete_inode(inode->i_ino, |
976 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 977 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
@@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode) | |||
1025 | goto bail_unlock_nfs_sync; | 1026 | goto bail_unlock_nfs_sync; |
1026 | } | 1027 | } |
1027 | 1028 | ||
1029 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1030 | /* Skip inode deletion and wait for the dio orphan entry to be | ||
1031 | * recovered first */ | ||
1032 | if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { | ||
1033 | ocfs2_cleanup_delete_inode(inode, 0); | ||
1034 | goto bail_unlock_inode; | ||
1035 | } | ||
1036 | |||
1028 | /* Query the cluster. This will be the final decision made | 1037 | /* Query the cluster. This will be the final decision made |
1029 | * before we go ahead and wipe the inode. */ | 1038 | * before we go ahead and wipe the inode. */ |
1030 | status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); | 1039 | status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); |
@@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode) | |||
1191 | int ocfs2_drop_inode(struct inode *inode) | 1200 | int ocfs2_drop_inode(struct inode *inode) |
1192 | { | 1201 | { |
1193 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 1202 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
1194 | int res; | ||
1195 | 1203 | ||
1196 | trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, | 1204 | trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, |
1197 | inode->i_nlink, oi->ip_flags); | 1205 | inode->i_nlink, oi->ip_flags); |
1198 | 1206 | ||
1199 | if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) | 1207 | assert_spin_locked(&inode->i_lock); |
1200 | res = 1; | 1208 | inode->i_state |= I_WILL_FREE; |
1201 | else | 1209 | spin_unlock(&inode->i_lock); |
1202 | res = generic_drop_inode(inode); | 1210 | write_inode_now(inode, 1); |
1211 | spin_lock(&inode->i_lock); | ||
1212 | WARN_ON(inode->i_state & I_NEW); | ||
1213 | inode->i_state &= ~I_WILL_FREE; | ||
1203 | 1214 | ||
1204 | return res; | 1215 | return 1; |
1205 | } | 1216 | } |
1206 | 1217 | ||
1207 | /* | 1218 | /* |
@@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb, | |||
1350 | rc = -EINVAL; | 1361 | rc = -EINVAL; |
1351 | 1362 | ||
1352 | if (!OCFS2_IS_VALID_DINODE(di)) { | 1363 | if (!OCFS2_IS_VALID_DINODE(di)) { |
1353 | ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", | 1364 | rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", |
1354 | (unsigned long long)bh->b_blocknr, 7, | 1365 | (unsigned long long)bh->b_blocknr, 7, |
1355 | di->i_signature); | 1366 | di->i_signature); |
1356 | goto bail; | 1367 | goto bail; |
1357 | } | 1368 | } |
1358 | 1369 | ||
1359 | if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { | 1370 | if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { |
1360 | ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", | 1371 | rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", |
1361 | (unsigned long long)bh->b_blocknr, | 1372 | (unsigned long long)bh->b_blocknr, |
1362 | (unsigned long long)le64_to_cpu(di->i_blkno)); | 1373 | (unsigned long long)le64_to_cpu(di->i_blkno)); |
1363 | goto bail; | 1374 | goto bail; |
1364 | } | 1375 | } |
1365 | 1376 | ||
1366 | if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { | 1377 | if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { |
1367 | ocfs2_error(sb, | 1378 | rc = ocfs2_error(sb, |
1368 | "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", | 1379 | "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", |
1369 | (unsigned long long)bh->b_blocknr); | 1380 | (unsigned long long)bh->b_blocknr); |
1370 | goto bail; | 1381 | goto bail; |
1371 | } | 1382 | } |
1372 | 1383 | ||
1373 | if (le32_to_cpu(di->i_fs_generation) != | 1384 | if (le32_to_cpu(di->i_fs_generation) != |
1374 | OCFS2_SB(sb)->fs_generation) { | 1385 | OCFS2_SB(sb)->fs_generation) { |
1375 | ocfs2_error(sb, | 1386 | rc = ocfs2_error(sb, |
1376 | "Invalid dinode #%llu: fs_generation is %u\n", | 1387 | "Invalid dinode #%llu: fs_generation is %u\n", |
1377 | (unsigned long long)bh->b_blocknr, | 1388 | (unsigned long long)bh->b_blocknr, |
1378 | le32_to_cpu(di->i_fs_generation)); | 1389 | le32_to_cpu(di->i_fs_generation)); |
1379 | goto bail; | 1390 | goto bail; |
1380 | } | 1391 | } |
1381 | 1392 | ||
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5e86b247c821..ca3431ee7f24 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -81,8 +81,6 @@ struct ocfs2_inode_info | |||
81 | tid_t i_sync_tid; | 81 | tid_t i_sync_tid; |
82 | tid_t i_datasync_tid; | 82 | tid_t i_datasync_tid; |
83 | 83 | ||
84 | wait_queue_head_t append_dio_wq; | ||
85 | |||
86 | struct dquot *i_dquot[MAXQUOTAS]; | 84 | struct dquot *i_dquot[MAXQUOTAS]; |
87 | }; | 85 | }; |
88 | 86 | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7c099f7032fd..ff82b28462a6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) | |||
374 | mlog_errno(PTR_ERR(handle)); | 374 | mlog_errno(PTR_ERR(handle)); |
375 | 375 | ||
376 | if (is_journal_aborted(journal)) { | 376 | if (is_journal_aborted(journal)) { |
377 | ocfs2_abort(osb->sb, "Detected aborted journal"); | 377 | ocfs2_abort(osb->sb, "Detected aborted journal\n"); |
378 | handle = ERR_PTR(-EROFS); | 378 | handle = ERR_PTR(-EROFS); |
379 | } | 379 | } |
380 | } else { | 380 | } else { |
@@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle, | |||
668 | mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); | 668 | mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); |
669 | mlog(ML_ERROR, "b_blocknr=%llu\n", | 669 | mlog(ML_ERROR, "b_blocknr=%llu\n", |
670 | (unsigned long long)bh->b_blocknr); | 670 | (unsigned long long)bh->b_blocknr); |
671 | BUG(); | 671 | |
672 | lock_buffer(bh); | ||
673 | /* | ||
674 | * A previous attempt to write this buffer head failed. | ||
675 | * Nothing we can do but retry the write and hope for | ||
676 | * the best. | ||
677 | */ | ||
678 | if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { | ||
679 | clear_buffer_write_io_error(bh); | ||
680 | set_buffer_uptodate(bh); | ||
681 | } | ||
682 | |||
683 | if (!buffer_uptodate(bh)) { | ||
684 | unlock_buffer(bh); | ||
685 | return -EIO; | ||
686 | } | ||
687 | unlock_buffer(bh); | ||
672 | } | 688 | } |
673 | 689 | ||
674 | /* Set the current transaction information on the ci so | 690 | /* Set the current transaction information on the ci so |
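The __ocfs2_journal_access() hunk replaces a BUG() on a non-uptodate buffer with a recovery attempt: when the buffer carries a write-IO error from an earlier failed write, the error bit is cleared and the buffer re-marked uptodate so the write can be retried; a buffer that is still not uptodate after that returns -EIO instead of crashing the machine. The decision logic with plain flag bits (the BH_* values here are illustrative, not the kernel's buffer_head bits):

    #include <stdio.h>

    #define BH_UPTODATE       (1u << 0)
    #define BH_WRITE_IO_ERROR (1u << 1)

    static int journal_access_demo(unsigned int *bh_state)
    {
        if (!(*bh_state & BH_UPTODATE)) {
            /* a previous write failed: clear the error and retry */
            if (*bh_state & BH_WRITE_IO_ERROR) {
                *bh_state &= ~BH_WRITE_IO_ERROR;
                *bh_state |= BH_UPTODATE;
            }
            if (!(*bh_state & BH_UPTODATE))
                return -5;        /* -EIO: genuinely unreadable */
        }
        return 0;
    }

    int main(void)
    {
        unsigned int failed_write = BH_WRITE_IO_ERROR;  /* retryable */
        unsigned int never_read = 0;                    /* hopeless */

        printf("%d %d\n", journal_access_demo(&failed_write),
               journal_access_demo(&never_read));
        return 0;
    }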
@@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2170 | iter = oi->ip_next_orphan; | 2186 | iter = oi->ip_next_orphan; |
2171 | oi->ip_next_orphan = NULL; | 2187 | oi->ip_next_orphan = NULL; |
2172 | 2188 | ||
2189 | mutex_lock(&inode->i_mutex); | ||
2173 | ret = ocfs2_rw_lock(inode, 1); | 2190 | ret = ocfs2_rw_lock(inode, 1); |
2174 | if (ret < 0) { | 2191 | if (ret < 0) { |
2175 | mlog_errno(ret); | 2192 | mlog_errno(ret); |
@@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2193 | * ocfs2_delete_inode. */ | 2210 | * ocfs2_delete_inode. */ |
2194 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; | 2211 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; |
2195 | spin_unlock(&oi->ip_lock); | 2212 | spin_unlock(&oi->ip_lock); |
2196 | } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && | 2213 | } |
2214 | |||
2215 | if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && | ||
2197 | (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { | 2216 | (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { |
2198 | ret = ocfs2_truncate_file(inode, di_bh, | 2217 | ret = ocfs2_truncate_file(inode, di_bh, |
2199 | i_size_read(inode)); | 2218 | i_size_read(inode)); |
@@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
2206 | ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); | 2225 | ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); |
2207 | if (ret) | 2226 | if (ret) |
2208 | mlog_errno(ret); | 2227 | mlog_errno(ret); |
2209 | |||
2210 | wake_up(&OCFS2_I(inode)->append_dio_wq); | ||
2211 | } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ | 2228 | } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ |
2212 | unlock_inode: | 2229 | unlock_inode: |
2213 | ocfs2_inode_unlock(inode, 1); | 2230 | ocfs2_inode_unlock(inode, 1); |
2231 | brelse(di_bh); | ||
2232 | di_bh = NULL; | ||
2214 | unlock_rw: | 2233 | unlock_rw: |
2215 | ocfs2_rw_unlock(inode, 1); | 2234 | ocfs2_rw_unlock(inode, 1); |
2216 | next: | 2235 | next: |
2236 | mutex_unlock(&inode->i_mutex); | ||
2217 | iput(inode); | 2237 | iput(inode); |
2218 | brelse(di_bh); | ||
2219 | di_bh = NULL; | ||
2220 | inode = iter; | 2238 | inode = iter; |
2221 | } | 2239 | } |
2222 | 2240 | ||
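ocfs2_recover_orphans() now takes inode->i_mutex before the rw and inode cluster locks for each orphan, and the dinode buffer is released inside the loop, between the inode unlock and the rw unlock, rather than after iput() at the bottom; the wake_up on append_dio_wq disappears because the namei.c change below removes the last waiter. A sketch of the per-orphan ordering (userspace mutexes and malloc/free stand in for the cluster locks and brelse):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t i_mutex    = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t rw_lock    = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

    static void recover_one_orphan(void)
    {
        pthread_mutex_lock(&i_mutex);  /* new: taken before the locks */
        pthread_mutex_lock(&rw_lock);
        pthread_mutex_lock(&inode_lock);

        char *di_bh = malloc(512);     /* stands in for the dinode bh */
        /* ... check OCFS2_DIO_ORPHANED_FL, truncate or de-orphan ... */

        pthread_mutex_unlock(&inode_lock);
        free(di_bh);                   /* released inside the loop now */
        pthread_mutex_unlock(&rw_lock);
        pthread_mutex_unlock(&i_mutex);
        /* iput(inode) would follow here */
    }

    int main(void)
    {
        recover_one_orphan();
        puts("one orphan recovered");
        return 0;
    }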
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 857bbbcd39f3..0a4457fb0711 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c | |||
@@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, | |||
665 | #ifdef CONFIG_OCFS2_DEBUG_FS | 665 | #ifdef CONFIG_OCFS2_DEBUG_FS |
666 | if (le32_to_cpu(alloc->id1.bitmap1.i_used) != | 666 | if (le32_to_cpu(alloc->id1.bitmap1.i_used) != |
667 | ocfs2_local_alloc_count_bits(alloc)) { | 667 | ocfs2_local_alloc_count_bits(alloc)) { |
668 | ocfs2_error(osb->sb, "local alloc inode %llu says it has " | 668 | ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", |
669 | "%u used bits, but a count shows %u", | ||
670 | (unsigned long long)le64_to_cpu(alloc->i_blkno), | 669 | (unsigned long long)le64_to_cpu(alloc->i_blkno), |
671 | le32_to_cpu(alloc->id1.bitmap1.i_used), | 670 | le32_to_cpu(alloc->id1.bitmap1.i_used), |
672 | ocfs2_local_alloc_count_bits(alloc)); | 671 | ocfs2_local_alloc_count_bits(alloc)); |
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 56a768d06aa6..124471d26a73 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c | |||
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle, | |||
99 | 99 | ||
100 | index = ocfs2_search_extent_list(el, cpos); | 100 | index = ocfs2_search_extent_list(el, cpos); |
101 | if (index == -1) { | 101 | if (index == -1) { |
102 | ocfs2_error(inode->i_sb, | 102 | ret = ocfs2_error(inode->i_sb, |
103 | "Inode %llu has an extent at cpos %u which can no " | 103 | "Inode %llu has an extent at cpos %u which can no longer be found\n", |
104 | "longer be found.\n", | 104 | (unsigned long long)ino, cpos); |
105 | (unsigned long long)ino, cpos); | ||
106 | ret = -EROFS; | ||
107 | goto out; | 105 | goto out; |
108 | } | 106 | } |
109 | 107 | ||
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 948681e37cfd..b7dfac226b1e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -1035,11 +1035,6 @@ leave: | |||
1035 | if (handle) | 1035 | if (handle) |
1036 | ocfs2_commit_trans(osb, handle); | 1036 | ocfs2_commit_trans(osb, handle); |
1037 | 1037 | ||
1038 | if (child_locked) | ||
1039 | ocfs2_inode_unlock(inode, 1); | ||
1040 | |||
1041 | ocfs2_inode_unlock(dir, 1); | ||
1042 | |||
1043 | if (orphan_dir) { | 1038 | if (orphan_dir) { |
1044 | /* This was locked for us in ocfs2_prepare_orphan_dir() */ | 1039 | /* This was locked for us in ocfs2_prepare_orphan_dir() */ |
1045 | ocfs2_inode_unlock(orphan_dir, 1); | 1040 | ocfs2_inode_unlock(orphan_dir, 1); |
@@ -1047,6 +1042,11 @@ leave: | |||
1047 | iput(orphan_dir); | 1042 | iput(orphan_dir); |
1048 | } | 1043 | } |
1049 | 1044 | ||
1045 | if (child_locked) | ||
1046 | ocfs2_inode_unlock(inode, 1); | ||
1047 | |||
1048 | ocfs2_inode_unlock(dir, 1); | ||
1049 | |||
1050 | brelse(fe_bh); | 1050 | brelse(fe_bh); |
1051 | brelse(parent_node_bh); | 1051 | brelse(parent_node_bh); |
1052 | 1052 | ||
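The ocfs2_unlink() exit path is reordered so locks are released in the reverse of acquisition: the directory and child were locked first but used to be unlocked first as well; now the orphan directory goes first, then the child, then the parent. Releasing in LIFO order is the usual convention for keeping lock dependencies acyclic, and the ocfs2_rename() hunk below applies the same fix to its longer chain. In miniature (pthread mutexes for illustration):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t dir_lock    = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t child_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t orphan_lock = PTHREAD_MUTEX_INITIALIZER;

    int main(void)
    {
        pthread_mutex_lock(&dir_lock);
        pthread_mutex_lock(&child_lock);
        pthread_mutex_lock(&orphan_lock);

        /* ... directory entry removed, inode orphaned ... */

        pthread_mutex_unlock(&orphan_lock);  /* last taken, first dropped */
        pthread_mutex_unlock(&child_lock);
        pthread_mutex_unlock(&dir_lock);
        puts("released in reverse acquisition order");
        return 0;
    }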
@@ -1309,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir, | |||
1309 | } | 1309 | } |
1310 | parents_locked = 1; | 1310 | parents_locked = 1; |
1311 | 1311 | ||
1312 | if (!new_dir->i_nlink) { | ||
1313 | status = -EACCES; | ||
1314 | goto bail; | ||
1315 | } | ||
1316 | |||
1312 | /* make sure both dirs have bhs | 1317 | /* make sure both dirs have bhs |
1313 | * get an extra ref on old_dir_bh if old==new */ | 1318 | * get an extra ref on old_dir_bh if old==new */ |
1314 | if (!new_dir_bh) { | 1319 | if (!new_dir_bh) { |
@@ -1569,12 +1574,25 @@ static int ocfs2_rename(struct inode *old_dir, | |||
1569 | status = ocfs2_find_entry(old_dentry->d_name.name, | 1574 | status = ocfs2_find_entry(old_dentry->d_name.name, |
1570 | old_dentry->d_name.len, old_dir, | 1575 | old_dentry->d_name.len, old_dir, |
1571 | &old_entry_lookup); | 1576 | &old_entry_lookup); |
1572 | if (status) | 1577 | if (status) { |
1578 | if (!is_journal_aborted(osb->journal->j_journal)) { | ||
1579 | ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " | ||
1580 | "is not deleted.", | ||
1581 | new_dentry->d_name.len, new_dentry->d_name.name, | ||
1582 | old_dentry->d_name.len, old_dentry->d_name.name); | ||
1583 | } | ||
1573 | goto bail; | 1584 | goto bail; |
1585 | } | ||
1574 | 1586 | ||
1575 | status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); | 1587 | status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); |
1576 | if (status < 0) { | 1588 | if (status < 0) { |
1577 | mlog_errno(status); | 1589 | mlog_errno(status); |
1590 | if (!is_journal_aborted(osb->journal->j_journal)) { | ||
1591 | ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " | ||
1592 | "is not deleted.", | ||
1593 | new_dentry->d_name.len, new_dentry->d_name.name, | ||
1594 | old_dentry->d_name.len, old_dentry->d_name.name); | ||
1595 | } | ||
1578 | goto bail; | 1596 | goto bail; |
1579 | } | 1597 | } |
1580 | 1598 | ||
@@ -1633,21 +1651,9 @@ static int ocfs2_rename(struct inode *old_dir, | |||
1633 | ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); | 1651 | ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); |
1634 | status = 0; | 1652 | status = 0; |
1635 | bail: | 1653 | bail: |
1636 | if (rename_lock) | ||
1637 | ocfs2_rename_unlock(osb); | ||
1638 | |||
1639 | if (handle) | 1654 | if (handle) |
1640 | ocfs2_commit_trans(osb, handle); | 1655 | ocfs2_commit_trans(osb, handle); |
1641 | 1656 | ||
1642 | if (parents_locked) | ||
1643 | ocfs2_double_unlock(old_dir, new_dir); | ||
1644 | |||
1645 | if (old_child_locked) | ||
1646 | ocfs2_inode_unlock(old_inode, 1); | ||
1647 | |||
1648 | if (new_child_locked) | ||
1649 | ocfs2_inode_unlock(new_inode, 1); | ||
1650 | |||
1651 | if (orphan_dir) { | 1657 | if (orphan_dir) { |
1652 | /* This was locked for us in ocfs2_prepare_orphan_dir() */ | 1658 | /* This was locked for us in ocfs2_prepare_orphan_dir() */ |
1653 | ocfs2_inode_unlock(orphan_dir, 1); | 1659 | ocfs2_inode_unlock(orphan_dir, 1); |
@@ -1655,6 +1661,18 @@ bail: | |||
1655 | iput(orphan_dir); | 1661 | iput(orphan_dir); |
1656 | } | 1662 | } |
1657 | 1663 | ||
1664 | if (new_child_locked) | ||
1665 | ocfs2_inode_unlock(new_inode, 1); | ||
1666 | |||
1667 | if (old_child_locked) | ||
1668 | ocfs2_inode_unlock(old_inode, 1); | ||
1669 | |||
1670 | if (parents_locked) | ||
1671 | ocfs2_double_unlock(old_dir, new_dir); | ||
1672 | |||
1673 | if (rename_lock) | ||
1674 | ocfs2_rename_unlock(osb); | ||
1675 | |||
1658 | if (new_inode) | 1676 | if (new_inode) |
1659 | sync_mapping_buffers(old_inode->i_mapping); | 1677 | sync_mapping_buffers(old_inode->i_mapping); |
1660 | 1678 | ||
@@ -2601,27 +2619,6 @@ leave: | |||
2601 | return status; | 2619 | return status; |
2602 | } | 2620 | } |
2603 | 2621 | ||
2604 | static int ocfs2_dio_orphan_recovered(struct inode *inode) | ||
2605 | { | ||
2606 | int ret; | ||
2607 | struct buffer_head *di_bh = NULL; | ||
2608 | struct ocfs2_dinode *di = NULL; | ||
2609 | |||
2610 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | ||
2611 | if (ret < 0) { | ||
2612 | mlog_errno(ret); | ||
2613 | return 0; | ||
2614 | } | ||
2615 | |||
2616 | di = (struct ocfs2_dinode *) di_bh->b_data; | ||
2617 | ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); | ||
2618 | ocfs2_inode_unlock(inode, 1); | ||
2619 | brelse(di_bh); | ||
2620 | |||
2621 | return ret; | ||
2622 | } | ||
2623 | |||
2624 | #define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 | ||
2625 | int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, | 2622 | int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, |
2626 | struct inode *inode) | 2623 | struct inode *inode) |
2627 | { | 2624 | { |
@@ -2633,7 +2630,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, | |||
2633 | handle_t *handle = NULL; | 2630 | handle_t *handle = NULL; |
2634 | struct ocfs2_dinode *di = NULL; | 2631 | struct ocfs2_dinode *di = NULL; |
2635 | 2632 | ||
2636 | restart: | ||
2637 | status = ocfs2_inode_lock(inode, &di_bh, 1); | 2633 | status = ocfs2_inode_lock(inode, &di_bh, 1); |
2638 | if (status < 0) { | 2634 | if (status < 0) { |
2639 | mlog_errno(status); | 2635 | mlog_errno(status); |
@@ -2643,15 +2639,21 @@ restart: | |||
2643 | di = (struct ocfs2_dinode *) di_bh->b_data; | 2639 | di = (struct ocfs2_dinode *) di_bh->b_data; |
2644 | /* | 2640 | /* |
2645 | * Another append dio crashed? | 2641 | * Another append dio crashed? |
2646 | * If so, wait for recovery first. | 2642 | * If so, manually recover it first. |
2647 | */ | 2643 | */ |
2648 | if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { | 2644 | if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { |
2649 | ocfs2_inode_unlock(inode, 1); | 2645 | status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); |
2650 | brelse(di_bh); | 2646 | if (status < 0) { |
2651 | wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, | 2647 | if (status != -ENOSPC) |
2652 | ocfs2_dio_orphan_recovered(inode), | 2648 | mlog_errno(status); |
2653 | msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); | 2649 | goto bail_unlock_inode; |
2654 | goto restart; | 2650 | } |
2651 | |||
2652 | status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); | ||
2653 | if (status < 0) { | ||
2654 | mlog_errno(status); | ||
2655 | goto bail_unlock_inode; | ||
2656 | } | ||
2655 | } | 2657 | } |
2656 | 2658 | ||
2657 | status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, | 2659 | status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, |
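ocfs2_add_inode_to_orphan() loses its restart loop: rather than unlocking and sleeping on append_dio_wq until recovery clears OCFS2_DIO_ORPHANED_FL (the ocfs2_dio_orphan_recovered() helper and the wait queue are removed in this series), it now truncates the file and deletes the stale orphan entry itself while still holding the inode lock. A compilable sketch of the new in-place recovery (the flag value and both helpers are simplified stand-ins):

    #include <stdio.h>

    #define DIO_ORPHANED_FL (1u << 0)

    static int truncate_file(unsigned int *flags) { (void)flags; return 0; }

    static int del_inode_from_orphan(unsigned int *flags)
    {
        *flags &= ~DIO_ORPHANED_FL;
        return 0;
    }

    /* old: if orphaned, unlock, wait on append_dio_wq, goto restart;
     * new: recover in place under the lock, no wait queue needed */
    static int add_inode_to_orphan_demo(unsigned int *flags)
    {
        if (*flags & DIO_ORPHANED_FL) {
            int status = truncate_file(flags);
            if (status < 0)
                return status;
            status = del_inode_from_orphan(flags);
            if (status < 0)
                return status;
        }
        /* ... proceed to insert the inode into the orphan dir ... */
        return 0;
    }

    int main(void)
    {
        unsigned int flags = DIO_ORPHANED_FL;
        printf("status=%d flags=%u\n",
               add_inode_to_orphan_demo(&flags), flags);
        return 0;
    }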
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 690ddc60189b..7a0126267847 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -286,6 +286,8 @@ enum ocfs2_mount_options | |||
286 | OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ | 286 | OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ |
287 | 287 | ||
288 | OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ | 288 | OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ |
289 | OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ | ||
290 | OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ | ||
289 | }; | 291 | }; |
290 | 292 | ||
291 | #define OCFS2_OSB_SOFT_RO 0x0001 | 293 | #define OCFS2_OSB_SOFT_RO 0x0001 |
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index bb07004df72a..8a54fd8a4fa5 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c | |||
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, | |||
138 | 138 | ||
139 | if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { | 139 | if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { |
140 | ocfs2_error(inode->i_sb, | 140 | ocfs2_error(inode->i_sb, |
141 | "Quota file %llu is probably corrupted! Requested " | 141 | "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", |
142 | "to read block %Lu but file has size only %Lu\n", | ||
143 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 142 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
144 | (unsigned long long)v_block, | 143 | (unsigned long long)v_block, |
145 | (unsigned long long)i_size_read(inode)); | 144 | (unsigned long long)i_size_read(inode)); |
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7dc818b87cd8..e5d57cd32505 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, | |||
102 | 102 | ||
103 | 103 | ||
104 | if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { | 104 | if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { |
105 | ocfs2_error(sb, | 105 | rc = ocfs2_error(sb, |
106 | "Refcount block #%llu has bad signature %.*s", | 106 | "Refcount block #%llu has bad signature %.*s\n", |
107 | (unsigned long long)bh->b_blocknr, 7, | 107 | (unsigned long long)bh->b_blocknr, 7, |
108 | rb->rf_signature); | 108 | rb->rf_signature); |
109 | return -EINVAL; | 109 | goto out; |
110 | } | 110 | } |
111 | 111 | ||
112 | if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { | 112 | if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { |
113 | ocfs2_error(sb, | 113 | rc = ocfs2_error(sb, |
114 | "Refcount block #%llu has an invalid rf_blkno " | 114 | "Refcount block #%llu has an invalid rf_blkno of %llu\n", |
115 | "of %llu", | 115 | (unsigned long long)bh->b_blocknr, |
116 | (unsigned long long)bh->b_blocknr, | 116 | (unsigned long long)le64_to_cpu(rb->rf_blkno)); |
117 | (unsigned long long)le64_to_cpu(rb->rf_blkno)); | 117 | goto out; |
118 | return -EINVAL; | ||
119 | } | 118 | } |
120 | 119 | ||
121 | if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { | 120 | if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { |
122 | ocfs2_error(sb, | 121 | rc = ocfs2_error(sb, |
123 | "Refcount block #%llu has an invalid " | 122 | "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", |
124 | "rf_fs_generation of #%u", | 123 | (unsigned long long)bh->b_blocknr, |
125 | (unsigned long long)bh->b_blocknr, | 124 | le32_to_cpu(rb->rf_fs_generation)); |
126 | le32_to_cpu(rb->rf_fs_generation)); | 125 | goto out; |
127 | return -EINVAL; | ||
128 | } | 126 | } |
129 | 127 | out: | |
130 | return 0; | 128 | return rc; |
131 | } | 129 | } |
132 | 130 | ||
133 | static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, | 131 | static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, |
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, | |||
1102 | el = &eb->h_list; | 1100 | el = &eb->h_list; |
1103 | 1101 | ||
1104 | if (el->l_tree_depth) { | 1102 | if (el->l_tree_depth) { |
1105 | ocfs2_error(sb, | 1103 | ret = ocfs2_error(sb, |
1106 | "refcount tree %llu has non zero tree " | 1104 | "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", |
1107 | "depth in leaf btree tree block %llu\n", | 1105 | (unsigned long long)ocfs2_metadata_cache_owner(ci), |
1108 | (unsigned long long)ocfs2_metadata_cache_owner(ci), | 1106 | (unsigned long long)eb_bh->b_blocknr); |
1109 | (unsigned long long)eb_bh->b_blocknr); | ||
1110 | ret = -EROFS; | ||
1111 | goto out; | 1107 | goto out; |
1112 | } | 1108 | } |
1113 | } | 1109 | } |
@@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, | |||
2359 | cpos, len, phys); | 2355 | cpos, len, phys); |
2360 | 2356 | ||
2361 | if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { | 2357 | if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { |
2362 | ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " | 2358 | ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", |
2363 | "tree, but the feature bit is not set in the " | 2359 | inode->i_ino); |
2364 | "super block.", inode->i_ino); | ||
2365 | ret = -EROFS; | ||
2366 | goto out; | 2360 | goto out; |
2367 | } | 2361 | } |
2368 | 2362 | ||
@@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, | |||
2545 | u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); | 2539 | u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); |
2546 | 2540 | ||
2547 | if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { | 2541 | if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { |
2548 | ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " | 2542 | ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", |
2549 | "tree, but the feature bit is not set in the " | 2543 | inode->i_ino); |
2550 | "super block.", inode->i_ino); | ||
2551 | ret = -EROFS; | ||
2552 | goto out; | 2544 | goto out; |
2553 | } | 2545 | } |
2554 | 2546 | ||
@@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, | |||
2672 | el = &eb->h_list; | 2664 | el = &eb->h_list; |
2673 | 2665 | ||
2674 | if (el->l_tree_depth) { | 2666 | if (el->l_tree_depth) { |
2675 | ocfs2_error(inode->i_sb, | 2667 | ret = ocfs2_error(inode->i_sb, |
2676 | "Inode %lu has non zero tree depth in " | 2668 | "Inode %lu has non zero tree depth in leaf block %llu\n", |
2677 | "leaf block %llu\n", inode->i_ino, | 2669 | inode->i_ino, |
2678 | (unsigned long long)eb_bh->b_blocknr); | 2670 | (unsigned long long)eb_bh->b_blocknr); |
2679 | ret = -EROFS; | ||
2680 | goto out; | 2671 | goto out; |
2681 | } | 2672 | } |
2682 | } | 2673 | } |
@@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, | |||
3106 | 3097 | ||
3107 | index = ocfs2_search_extent_list(el, cpos); | 3098 | index = ocfs2_search_extent_list(el, cpos); |
3108 | if (index == -1) { | 3099 | if (index == -1) { |
3109 | ocfs2_error(sb, | 3100 | ret = ocfs2_error(sb, |
3110 | "Inode %llu has an extent at cpos %u which can no " | 3101 | "Inode %llu has an extent at cpos %u which can no longer be found\n", |
3111 | "longer be found.\n", | 3102 | (unsigned long long)ino, cpos); |
3112 | (unsigned long long)ino, cpos); | ||
3113 | ret = -EROFS; | ||
3114 | goto out; | 3103 | goto out; |
3115 | } | 3104 | } |
3116 | 3105 | ||
@@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) | |||
3376 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 3365 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
3377 | 3366 | ||
3378 | if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { | 3367 | if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { |
3379 | ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " | 3368 | return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", |
3380 | "tree, but the feature bit is not set in the " | 3369 | inode->i_ino); |
3381 | "super block.", inode->i_ino); | ||
3382 | return -EROFS; | ||
3383 | } | 3370 | } |
3384 | 3371 | ||
3385 | ocfs2_init_dealloc_ctxt(&context->dealloc); | 3372 | ocfs2_init_dealloc_ctxt(&context->dealloc); |
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4479029630bb..d83d2602cf2b 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) | |||
149 | brelse(ac->ac_bh); | 149 | brelse(ac->ac_bh); |
150 | ac->ac_bh = NULL; | 150 | ac->ac_bh = NULL; |
151 | ac->ac_resv = NULL; | 151 | ac->ac_resv = NULL; |
152 | if (ac->ac_find_loc_priv) { | 152 | kfree(ac->ac_find_loc_priv); |
153 | kfree(ac->ac_find_loc_priv); | 153 | ac->ac_find_loc_priv = NULL; |
154 | ac->ac_find_loc_priv = NULL; | ||
155 | } | ||
156 | } | 154 | } |
157 | 155 | ||
158 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | 156 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) |
@@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) | |||
167 | } | 165 | } |
168 | 166 | ||
169 | #define do_error(fmt, ...) \ | 167 | #define do_error(fmt, ...) \ |
170 | do{ \ | 168 | do { \ |
171 | if (resize) \ | 169 | if (resize) \ |
172 | mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ | 170 | mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ |
173 | else \ | 171 | else \ |
174 | ocfs2_error(sb, fmt, ##__VA_ARGS__); \ | 172 | return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ |
175 | } while (0) | 173 | } while (0) |
176 | 174 | ||
177 | static int ocfs2_validate_gd_self(struct super_block *sb, | 175 | static int ocfs2_validate_gd_self(struct super_block *sb, |
178 | struct buffer_head *bh, | 176 | struct buffer_head *bh, |
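The reworked do_error() depends on ocfs2_error() now returning an errno: in the non-resize case the macro returns straight out of the enclosing validator, which is why every call site in the hunks below drops its explicit return -EINVAL;. A userspace model of the macro (resize is a parameter here, whereas the kernel macro captures sb and resize from the enclosing scope; ##__VA_ARGS__ is a GCC/Clang extension). The hidden return is also why the macro is only safe inside functions that return int:

    #include <stdarg.h>
    #include <stdio.h>

    static int ocfs2_error_demo(const char *fmt, ...)
    {
        va_list ap;

        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
        return -30;                    /* -EROFS under errors=remount-ro */
    }

    #define do_error(resize, fmt, ...)                           \
        do {                                                     \
            if (resize)                                          \
                fprintf(stderr, fmt, ##__VA_ARGS__);             \
            else                                                 \
                return ocfs2_error_demo(fmt, ##__VA_ARGS__);     \
        } while (0)

    static int validate_demo(int resize, int bad)
    {
        if (bad)
            do_error(resize, "Group descriptor #%d has bad signature\n", 7);
        return 0;
    }

    int main(void)
    {
        printf("strict=%d resize=%d\n",
               validate_demo(0, 1), validate_demo(1, 1));
        return 0;
    }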
@@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb, | |||
181 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; | 179 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; |
182 | 180 | ||
183 | if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { | 181 | if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { |
184 | do_error("Group descriptor #%llu has bad signature %.*s", | 182 | do_error("Group descriptor #%llu has bad signature %.*s\n", |
185 | (unsigned long long)bh->b_blocknr, 7, | 183 | (unsigned long long)bh->b_blocknr, 7, |
186 | gd->bg_signature); | 184 | gd->bg_signature); |
187 | return -EINVAL; | ||
188 | } | 185 | } |
189 | 186 | ||
190 | if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { | 187 | if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { |
191 | do_error("Group descriptor #%llu has an invalid bg_blkno " | 188 | do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", |
192 | "of %llu", | ||
193 | (unsigned long long)bh->b_blocknr, | 189 | (unsigned long long)bh->b_blocknr, |
194 | (unsigned long long)le64_to_cpu(gd->bg_blkno)); | 190 | (unsigned long long)le64_to_cpu(gd->bg_blkno)); |
195 | return -EINVAL; | ||
196 | } | 191 | } |
197 | 192 | ||
198 | if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { | 193 | if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { |
199 | do_error("Group descriptor #%llu has an invalid " | 194 | do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", |
200 | "fs_generation of #%u", | ||
201 | (unsigned long long)bh->b_blocknr, | 195 | (unsigned long long)bh->b_blocknr, |
202 | le32_to_cpu(gd->bg_generation)); | 196 | le32_to_cpu(gd->bg_generation)); |
203 | return -EINVAL; | ||
204 | } | 197 | } |
205 | 198 | ||
206 | if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { | 199 | if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { |
207 | do_error("Group descriptor #%llu has bit count %u but " | 200 | do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", |
208 | "claims that %u are free", | ||
209 | (unsigned long long)bh->b_blocknr, | 201 | (unsigned long long)bh->b_blocknr, |
210 | le16_to_cpu(gd->bg_bits), | 202 | le16_to_cpu(gd->bg_bits), |
211 | le16_to_cpu(gd->bg_free_bits_count)); | 203 | le16_to_cpu(gd->bg_free_bits_count)); |
212 | return -EINVAL; | ||
213 | } | 204 | } |
214 | 205 | ||
215 | if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { | 206 | if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { |
216 | do_error("Group descriptor #%llu has bit count %u but " | 207 | do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", |
217 | "max bitmap bits of %u", | ||
218 | (unsigned long long)bh->b_blocknr, | 208 | (unsigned long long)bh->b_blocknr, |
219 | le16_to_cpu(gd->bg_bits), | 209 | le16_to_cpu(gd->bg_bits), |
220 | 8 * le16_to_cpu(gd->bg_size)); | 210 | 8 * le16_to_cpu(gd->bg_size)); |
221 | return -EINVAL; | ||
222 | } | 211 | } |
223 | 212 | ||
224 | return 0; | 213 | return 0; |
@@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, | |||
233 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; | 222 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; |
234 | 223 | ||
235 | if (di->i_blkno != gd->bg_parent_dinode) { | 224 | if (di->i_blkno != gd->bg_parent_dinode) { |
236 | do_error("Group descriptor #%llu has bad parent " | 225 | do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", |
237 | "pointer (%llu, expected %llu)", | ||
238 | (unsigned long long)bh->b_blocknr, | 226 | (unsigned long long)bh->b_blocknr, |
239 | (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), | 227 | (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), |
240 | (unsigned long long)le64_to_cpu(di->i_blkno)); | 228 | (unsigned long long)le64_to_cpu(di->i_blkno)); |
241 | return -EINVAL; | ||
242 | } | 229 | } |
243 | 230 | ||
244 | max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); | 231 | max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); |
245 | if (le16_to_cpu(gd->bg_bits) > max_bits) { | 232 | if (le16_to_cpu(gd->bg_bits) > max_bits) { |
246 | do_error("Group descriptor #%llu has bit count of %u", | 233 | do_error("Group descriptor #%llu has bit count of %u\n", |
247 | (unsigned long long)bh->b_blocknr, | 234 | (unsigned long long)bh->b_blocknr, |
248 | le16_to_cpu(gd->bg_bits)); | 235 | le16_to_cpu(gd->bg_bits)); |
249 | return -EINVAL; | ||
250 | } | 236 | } |
251 | 237 | ||
252 | /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ | 238 | /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ |
@@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, | |||
254 | le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || | 240 | le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || |
255 | ((le16_to_cpu(gd->bg_chain) == | 241 | ((le16_to_cpu(gd->bg_chain) == |
256 | le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { | 242 | le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { |
257 | do_error("Group descriptor #%llu has bad chain %u", | 243 | do_error("Group descriptor #%llu has bad chain %u\n", |
258 | (unsigned long long)bh->b_blocknr, | 244 | (unsigned long long)bh->b_blocknr, |
259 | le16_to_cpu(gd->bg_chain)); | 245 | le16_to_cpu(gd->bg_chain)); |
260 | return -EINVAL; | ||
261 | } | 246 | } |
262 | 247 | ||
263 | return 0; | 248 | return 0; |
@@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle, | |||
384 | struct super_block * sb = alloc_inode->i_sb; | 369 | struct super_block * sb = alloc_inode->i_sb; |
385 | 370 | ||
386 | if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { | 371 | if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { |
387 | ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " | 372 | status = ocfs2_error(alloc_inode->i_sb, |
388 | "b_blocknr (%llu)", | 373 | "group block (%llu) != b_blocknr (%llu)\n", |
389 | (unsigned long long)group_blkno, | 374 | (unsigned long long)group_blkno, |
390 | (unsigned long long) bg_bh->b_blocknr); | 375 | (unsigned long long) bg_bh->b_blocknr); |
391 | status = -EIO; | ||
392 | goto bail; | 376 | goto bail; |
393 | } | 377 | } |
394 | 378 | ||
@@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | |||
834 | BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); | 818 | BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); |
835 | 819 | ||
836 | if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { | 820 | if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { |
837 | ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", | 821 | status = ocfs2_error(alloc_inode->i_sb, |
838 | (unsigned long long)le64_to_cpu(fe->i_blkno)); | 822 | "Invalid chain allocator %llu\n", |
839 | status = -EIO; | 823 | (unsigned long long)le64_to_cpu(fe->i_blkno)); |
840 | goto bail; | 824 | goto bail; |
841 | } | 825 | } |
842 | 826 | ||
@@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle, | |||
1370 | 1354 | ||
1371 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); | 1355 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); |
1372 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { | 1356 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { |
1373 | ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" | 1357 | return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", |
1374 | " count %u but claims %u are freed. num_bits %d", | 1358 | (unsigned long long)le64_to_cpu(bg->bg_blkno), |
1375 | (unsigned long long)le64_to_cpu(bg->bg_blkno), | 1359 | le16_to_cpu(bg->bg_bits), |
1376 | le16_to_cpu(bg->bg_bits), | 1360 | le16_to_cpu(bg->bg_free_bits_count), |
1377 | le16_to_cpu(bg->bg_free_bits_count), num_bits); | 1361 | num_bits); |
1378 | return -EROFS; | ||
1379 | } | 1362 | } |
1380 | while(num_bits--) | 1363 | while(num_bits--) |
1381 | ocfs2_set_bit(bit_off++, bitmap); | 1364 | ocfs2_set_bit(bit_off++, bitmap); |
@@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, | |||
1905 | 1888 | ||
1906 | if (le32_to_cpu(fe->id1.bitmap1.i_used) >= | 1889 | if (le32_to_cpu(fe->id1.bitmap1.i_used) >= |
1907 | le32_to_cpu(fe->id1.bitmap1.i_total)) { | 1890 | le32_to_cpu(fe->id1.bitmap1.i_total)) { |
1908 | ocfs2_error(ac->ac_inode->i_sb, | 1891 | status = ocfs2_error(ac->ac_inode->i_sb, |
1909 | "Chain allocator dinode %llu has %u used " | 1892 | "Chain allocator dinode %llu has %u used bits but only %u total\n", |
1910 | "bits but only %u total.", | 1893 | (unsigned long long)le64_to_cpu(fe->i_blkno), |
1911 | (unsigned long long)le64_to_cpu(fe->i_blkno), | 1894 | le32_to_cpu(fe->id1.bitmap1.i_used), |
1912 | le32_to_cpu(fe->id1.bitmap1.i_used), | 1895 | le32_to_cpu(fe->id1.bitmap1.i_total)); |
1913 | le32_to_cpu(fe->id1.bitmap1.i_total)); | ||
1914 | status = -EIO; | ||
1915 | goto bail; | 1896 | goto bail; |
1916 | } | 1897 | } |
1917 | 1898 | ||
@@ -2429,12 +2410,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, | |||
2429 | } | 2410 | } |
2430 | le16_add_cpu(&bg->bg_free_bits_count, num_bits); | 2411 | le16_add_cpu(&bg->bg_free_bits_count, num_bits); |
2431 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { | 2412 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { |
2432 | ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" | 2413 | return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", |
2433 | " count %u but claims %u are freed. num_bits %d", | 2414 | (unsigned long long)le64_to_cpu(bg->bg_blkno), |
2434 | (unsigned long long)le64_to_cpu(bg->bg_blkno), | 2415 | le16_to_cpu(bg->bg_bits), |
2435 | le16_to_cpu(bg->bg_bits), | 2416 | le16_to_cpu(bg->bg_free_bits_count), |
2436 | le16_to_cpu(bg->bg_free_bits_count), num_bits); | 2417 | num_bits); |
2437 | return -EROFS; | ||
2438 | } | 2418 | } |
2439 | 2419 | ||
2440 | if (undo_fn) | 2420 | if (undo_fn) |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 403c5660b306..2de4c8a9340c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -192,6 +192,7 @@ enum { | |||
192 | Opt_resv_level, | 192 | Opt_resv_level, |
193 | Opt_dir_resv_level, | 193 | Opt_dir_resv_level, |
194 | Opt_journal_async_commit, | 194 | Opt_journal_async_commit, |
195 | Opt_err_cont, | ||
195 | Opt_err, | 196 | Opt_err, |
196 | }; | 197 | }; |
197 | 198 | ||
@@ -224,6 +225,7 @@ static const match_table_t tokens = { | |||
224 | {Opt_resv_level, "resv_level=%u"}, | 225 | {Opt_resv_level, "resv_level=%u"}, |
225 | {Opt_dir_resv_level, "dir_resv_level=%u"}, | 226 | {Opt_dir_resv_level, "dir_resv_level=%u"}, |
226 | {Opt_journal_async_commit, "journal_async_commit"}, | 227 | {Opt_journal_async_commit, "journal_async_commit"}, |
228 | {Opt_err_cont, "errors=continue"}, | ||
227 | {Opt_err, NULL} | 229 | {Opt_err, NULL} |
228 | }; | 230 | }; |
229 | 231 | ||
@@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
1330 | mopt->mount_opt |= OCFS2_MOUNT_NOINTR; | 1332 | mopt->mount_opt |= OCFS2_MOUNT_NOINTR; |
1331 | break; | 1333 | break; |
1332 | case Opt_err_panic: | 1334 | case Opt_err_panic: |
1335 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; | ||
1336 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; | ||
1333 | mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; | 1337 | mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; |
1334 | break; | 1338 | break; |
1335 | case Opt_err_ro: | 1339 | case Opt_err_ro: |
1340 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; | ||
1336 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; | 1341 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; |
1342 | mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; | ||
1343 | break; | ||
1344 | case Opt_err_cont: | ||
1345 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; | ||
1346 | mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; | ||
1347 | mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; | ||
1337 | break; | 1348 | break; |
1338 | case Opt_data_ordered: | 1349 | case Opt_data_ordered: |
1339 | mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; | 1350 | mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; |
@@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) | |||
1530 | 1541 | ||
1531 | if (opts & OCFS2_MOUNT_ERRORS_PANIC) | 1542 | if (opts & OCFS2_MOUNT_ERRORS_PANIC) |
1532 | seq_printf(s, ",errors=panic"); | 1543 | seq_printf(s, ",errors=panic"); |
1544 | else if (opts & OCFS2_MOUNT_ERRORS_CONT) | ||
1545 | seq_printf(s, ",errors=continue"); | ||
1533 | else | 1546 | else |
1534 | seq_printf(s, ",errors=remount-ro"); | 1547 | seq_printf(s, ",errors=remount-ro"); |
1535 | 1548 | ||
@@ -1550,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) | |||
1550 | seq_printf(s, ",localflocks,"); | 1563 | seq_printf(s, ",localflocks,"); |
1551 | 1564 | ||
1552 | if (osb->osb_cluster_stack[0]) | 1565 | if (osb->osb_cluster_stack[0]) |
1553 | seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, | 1566 | seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, |
1554 | osb->osb_cluster_stack); | 1567 | OCFS2_STACK_LABEL_LEN); |
1555 | if (opts & OCFS2_MOUNT_USRQUOTA) | 1568 | if (opts & OCFS2_MOUNT_USRQUOTA) |
1556 | seq_printf(s, ",usrquota"); | 1569 | seq_printf(s, ",usrquota"); |
1557 | if (opts & OCFS2_MOUNT_GRPQUOTA) | 1570 | if (opts & OCFS2_MOUNT_GRPQUOTA) |
@@ -1746,8 +1759,6 @@ static void ocfs2_inode_init_once(void *data) | |||
1746 | ocfs2_lock_res_init_once(&oi->ip_inode_lockres); | 1759 | ocfs2_lock_res_init_once(&oi->ip_inode_lockres); |
1747 | ocfs2_lock_res_init_once(&oi->ip_open_lockres); | 1760 | ocfs2_lock_res_init_once(&oi->ip_open_lockres); |
1748 | 1761 | ||
1749 | init_waitqueue_head(&oi->append_dio_wq); | ||
1750 | |||
1751 | ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), | 1762 | ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), |
1752 | &ocfs2_inode_caching_ops); | 1763 | &ocfs2_inode_caching_ops); |
1753 | 1764 | ||
@@ -2541,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) | |||
2541 | memset(osb, 0, sizeof(struct ocfs2_super)); | 2552 | memset(osb, 0, sizeof(struct ocfs2_super)); |
2542 | } | 2553 | } |
2543 | 2554 | ||
2544 | /* Put OCFS2 into a readonly state, or (if the user specifies it), | 2555 | /* Depending on the mount option passed, perform one of the following: |
2545 | * panic(). We do not support continue-on-error operation. */ | 2556 | * Put OCFS2 into a readonly state (default) |
2546 | static void ocfs2_handle_error(struct super_block *sb) | 2557 | * Return EIO so that only the process errs |
2558 | * Fix the error as if fsck.ocfs2 -y | ||
2559 | * panic | ||
2560 | */ | ||
2561 | static int ocfs2_handle_error(struct super_block *sb) | ||
2547 | { | 2562 | { |
2548 | struct ocfs2_super *osb = OCFS2_SB(sb); | 2563 | struct ocfs2_super *osb = OCFS2_SB(sb); |
2549 | 2564 | int rv = 0; | |
2550 | if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) | ||
2551 | panic("OCFS2: (device %s): panic forced after error\n", | ||
2552 | sb->s_id); | ||
2553 | 2565 | ||
2554 | ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); | 2566 | ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); |
2567 | pr_crit("On-disk corruption discovered. " | ||
2568 | "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); | ||
2555 | 2569 | ||
2556 | if (sb->s_flags & MS_RDONLY && | 2570 | if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { |
2557 | (ocfs2_is_soft_readonly(osb) || | 2571 | panic("OCFS2: (device %s): panic forced after error\n", |
2558 | ocfs2_is_hard_readonly(osb))) | 2572 | sb->s_id); |
2559 | return; | 2573 | } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { |
2560 | 2574 | pr_crit("OCFS2: Returning error to the calling process.\n"); | |
2561 | printk(KERN_CRIT "File system is now read-only due to the potential " | 2575 | rv = -EIO; |
2562 | "of on-disk corruption. Please run fsck.ocfs2 once the file " | 2576 | } else { /* default option */ |
2563 | "system is unmounted.\n"); | 2577 | rv = -EROFS; |
2564 | sb->s_flags |= MS_RDONLY; | 2578 | if (sb->s_flags & MS_RDONLY && |
2565 | ocfs2_set_ro_flag(osb, 0); | 2579 | (ocfs2_is_soft_readonly(osb) || |
2580 | ocfs2_is_hard_readonly(osb))) | ||
2581 | return rv; | ||
2582 | |||
2583 | pr_crit("OCFS2: File system is now read-only.\n"); | ||
2584 | sb->s_flags |= MS_RDONLY; | ||
2585 | ocfs2_set_ro_flag(osb, 0); | ||
2586 | } | ||
2587 | |||
2588 | return rv; | ||
2566 | } | 2589 | } |
2567 | 2590 | ||
2568 | void __ocfs2_error(struct super_block *sb, const char *function, | 2591 | int __ocfs2_error(struct super_block *sb, const char *function, |
2569 | const char *fmt, ...) | 2592 | const char *fmt, ...) |
2570 | { | 2593 | { |
2571 | struct va_format vaf; | 2594 | struct va_format vaf; |
@@ -2577,12 +2600,12 @@ void __ocfs2_error(struct super_block *sb, const char *function, | |||
2577 | 2600 | ||
2578 | /* Not using mlog here because we want to show the actual | 2601 | /* Not using mlog here because we want to show the actual |
2579 | * function the error came from. */ | 2602 | * function the error came from. */ |
2580 | printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", | 2603 | printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", |
2581 | sb->s_id, function, &vaf); | 2604 | sb->s_id, function, &vaf); |
2582 | 2605 | ||
2583 | va_end(args); | 2606 | va_end(args); |
2584 | 2607 | ||
2585 | ocfs2_handle_error(sb); | 2608 | return ocfs2_handle_error(sb); |
2586 | } | 2609 | } |
2587 | 2610 | ||
2588 | /* Handle critical errors. This is intentionally more drastic than | 2611 | /* Handle critical errors. This is intentionally more drastic than |
@@ -2599,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function, | |||
2599 | vaf.fmt = fmt; | 2622 | vaf.fmt = fmt; |
2600 | vaf.va = &args; | 2623 | vaf.va = &args; |
2601 | 2624 | ||
2602 | printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", | 2625 | printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", |
2603 | sb->s_id, function, &vaf); | 2626 | sb->s_id, function, &vaf); |
2604 | 2627 | ||
2605 | va_end(args); | 2628 | va_end(args); |
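
The switch from a void __ocfs2_error() to one that returns an errno lets detection sites propagate whatever the errors= mount option dictates instead of hard-coding a value, which is exactly what the xattr.c conversions further down do. A minimal sketch of the before/after calling convention, with a hypothetical bad_block() standing in for the real validators:

	/* Before: report, then return a hard-coded errno regardless of errors= */
	if (bad_block(blkno)) {				/* hypothetical check */
		ocfs2_error(sb, "block %llu is bad", blkno);
		return -EINVAL;
	}

	/* After: ocfs2_error() itself returns -EIO under errors=continue,
	 * -EROFS under the default errors=remount-ro, and does not return
	 * at all under errors=panic. */
	if (bad_block(blkno))
		return ocfs2_error(sb, "block %llu is bad", blkno);
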
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 74ff74cf78fe..b477d0b1c7b6 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h | |||
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, | |||
32 | int node_num); | 32 | int node_num); |
33 | 33 | ||
34 | __printf(3, 4) | 34 | __printf(3, 4) |
35 | void __ocfs2_error(struct super_block *sb, const char *function, | 35 | int __ocfs2_error(struct super_block *sb, const char *function, |
36 | const char *fmt, ...); | 36 | const char *fmt, ...); |
37 | 37 | ||
38 | #define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) | 38 | #define ocfs2_error(sb, fmt, ...) \ |
39 | __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) | ||
39 | 40 | ||
40 | __printf(3, 4) | 41 | __printf(3, 4) |
41 | void __ocfs2_abort(struct super_block *sb, const char *function, | 42 | void __ocfs2_abort(struct super_block *sb, const char *function, |
42 | const char *fmt, ...); | 43 | const char *fmt, ...); |
43 | 44 | ||
44 | #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) | 45 | #define ocfs2_abort(sb, fmt, ...) \ |
46 | __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) | ||
45 | 47 | ||
46 | /* | 48 | /* |
47 | * Void signal blockers, because in-kernel sigprocmask() only fails | 49 | * Void signal blockers, because in-kernel sigprocmask() only fails |
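
The macro rewrite above replaces the GCC-specific named-variadic form (args...) with the standard __VA_ARGS__ spelling; in both forms the ## pastes away the trailing comma when no variadic arguments are supplied. A small standalone C sketch of that comma-elision behavior:

	#include <stdio.h>

	#define log_err(fmt, ...) \
		fprintf(stderr, "ERROR: " fmt "\n", ##__VA_ARGS__)

	int main(void)
	{
		log_err("plain message");	/* ## removes the dangling comma */
		log_err("value = %d", 42);	/* normal expansion */
		return 0;
	}
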
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 889f3796a0d7..ebfdea78659b 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, | |||
499 | */ | 499 | */ |
500 | 500 | ||
501 | if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { | 501 | if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { |
502 | ocfs2_error(sb, | 502 | return ocfs2_error(sb, |
503 | "Extended attribute block #%llu has bad " | 503 | "Extended attribute block #%llu has bad signature %.*s\n", |
504 | "signature %.*s", | 504 | (unsigned long long)bh->b_blocknr, 7, |
505 | (unsigned long long)bh->b_blocknr, 7, | 505 | xb->xb_signature); |
506 | xb->xb_signature); | ||
507 | return -EINVAL; | ||
508 | } | 506 | } |
509 | 507 | ||
510 | if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { | 508 | if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { |
511 | ocfs2_error(sb, | 509 | return ocfs2_error(sb, |
512 | "Extended attribute block #%llu has an " | 510 | "Extended attribute block #%llu has an invalid xb_blkno of %llu\n", |
513 | "invalid xb_blkno of %llu", | 511 | (unsigned long long)bh->b_blocknr, |
514 | (unsigned long long)bh->b_blocknr, | 512 | (unsigned long long)le64_to_cpu(xb->xb_blkno)); |
515 | (unsigned long long)le64_to_cpu(xb->xb_blkno)); | ||
516 | return -EINVAL; | ||
517 | } | 513 | } |
518 | 514 | ||
519 | if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { | 515 | if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { |
520 | ocfs2_error(sb, | 516 | return ocfs2_error(sb, |
521 | "Extended attribute block #%llu has an invalid " | 517 | "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n", |
522 | "xb_fs_generation of #%u", | 518 | (unsigned long long)bh->b_blocknr, |
523 | (unsigned long long)bh->b_blocknr, | 519 | le32_to_cpu(xb->xb_fs_generation)); |
524 | le32_to_cpu(xb->xb_fs_generation)); | ||
525 | return -EINVAL; | ||
526 | } | 520 | } |
527 | 521 | ||
528 | return 0; | 522 | return 0; |
@@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, | |||
3694 | el = &eb->h_list; | 3688 | el = &eb->h_list; |
3695 | 3689 | ||
3696 | if (el->l_tree_depth) { | 3690 | if (el->l_tree_depth) { |
3697 | ocfs2_error(inode->i_sb, | 3691 | ret = ocfs2_error(inode->i_sb, |
3698 | "Inode %lu has non zero tree depth in " | 3692 | "Inode %lu has non zero tree depth in xattr tree block %llu\n", |
3699 | "xattr tree block %llu\n", inode->i_ino, | 3693 | inode->i_ino, |
3700 | (unsigned long long)eb_bh->b_blocknr); | 3694 | (unsigned long long)eb_bh->b_blocknr); |
3701 | ret = -EROFS; | ||
3702 | goto out; | 3695 | goto out; |
3703 | } | 3696 | } |
3704 | } | 3697 | } |
@@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, | |||
3713 | } | 3706 | } |
3714 | 3707 | ||
3715 | if (!e_blkno) { | 3708 | if (!e_blkno) { |
3716 | ocfs2_error(inode->i_sb, "Inode %lu has bad extent " | 3709 | ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", |
3717 | "record (%u, %u, 0) in xattr", inode->i_ino, | 3710 | inode->i_ino, |
3718 | le32_to_cpu(rec->e_cpos), | 3711 | le32_to_cpu(rec->e_cpos), |
3719 | ocfs2_rec_clusters(el, rec)); | 3712 | ocfs2_rec_clusters(el, rec)); |
3720 | ret = -EROFS; | ||
3721 | goto out; | 3713 | goto out; |
3722 | } | 3714 | } |
3723 | 3715 | ||
@@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, | |||
7334 | const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; | 7326 | const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; |
7335 | const size_t total_len = prefix_len + name_len + 1; | 7327 | const size_t total_len = prefix_len + name_len + 1; |
7336 | 7328 | ||
7329 | if (!capable(CAP_SYS_ADMIN)) | ||
7330 | return 0; | ||
7331 | |||
7337 | if (list && total_len <= list_size) { | 7332 | if (list && total_len <= list_size) { |
7338 | memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); | 7333 | memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); |
7339 | memcpy(list + prefix_len, name, name_len); | 7334 | memcpy(list + prefix_len, name, name_len); |
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7466ff339c66..79073d68b475 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c | |||
@@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) | |||
588 | struct super_block *sb = dentry->d_sb; | 588 | struct super_block *sb = dentry->d_sb; |
589 | struct ovl_fs *ufs = sb->s_fs_info; | 589 | struct ovl_fs *ufs = sb->s_fs_info; |
590 | 590 | ||
591 | seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); | 591 | seq_show_option(m, "lowerdir", ufs->config.lowerdir); |
592 | if (ufs->config.upperdir) { | 592 | if (ufs->config.upperdir) { |
593 | seq_printf(m, ",upperdir=%s", ufs->config.upperdir); | 593 | seq_show_option(m, "upperdir", ufs->config.upperdir); |
594 | seq_printf(m, ",workdir=%s", ufs->config.workdir); | 594 | seq_show_option(m, "workdir", ufs->config.workdir); |
595 | } | 595 | } |
596 | return 0; | 596 | return 0; |
597 | } | 597 | } |
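
These conversions (here and in the reiserfs and ocfs2 hunks) exist because printing a user-controlled string with a bare %s lets a crafted mount path embed ',' or '\n' and forge extra fields in /proc/mounts; seq_show_option() escapes the value first, and seq_show_option_n() does the same for fixed-length buffers. A hedged sketch of the pattern for some hypothetical filesystem's ->show_options:

	static int example_show_options(struct seq_file *m, struct dentry *root)
	{
		struct example_fs_info *info = root->d_sb->s_fs_info;	/* hypothetical */

		/* Unsafe: seq_printf(m, ",upperdir=%s", info->upperdir);
		 * a value containing ',' or '\n' would corrupt /proc/mounts. */

		/* Safe: the helper escapes the value before emitting it. */
		seq_show_option(m, "upperdir", info->upperdir);
		return 0;
	}
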
diff --git a/fs/proc/array.c b/fs/proc/array.c index ce065cf3104f..f60f0121e331 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header, | |||
308 | static inline void task_cap(struct seq_file *m, struct task_struct *p) | 308 | static inline void task_cap(struct seq_file *m, struct task_struct *p) |
309 | { | 309 | { |
310 | const struct cred *cred; | 310 | const struct cred *cred; |
311 | kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; | 311 | kernel_cap_t cap_inheritable, cap_permitted, cap_effective, |
312 | cap_bset, cap_ambient; | ||
312 | 313 | ||
313 | rcu_read_lock(); | 314 | rcu_read_lock(); |
314 | cred = __task_cred(p); | 315 | cred = __task_cred(p); |
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) | |||
316 | cap_permitted = cred->cap_permitted; | 317 | cap_permitted = cred->cap_permitted; |
317 | cap_effective = cred->cap_effective; | 318 | cap_effective = cred->cap_effective; |
318 | cap_bset = cred->cap_bset; | 319 | cap_bset = cred->cap_bset; |
320 | cap_ambient = cred->cap_ambient; | ||
319 | rcu_read_unlock(); | 321 | rcu_read_unlock(); |
320 | 322 | ||
321 | render_cap_t(m, "CapInh:\t", &cap_inheritable); | 323 | render_cap_t(m, "CapInh:\t", &cap_inheritable); |
322 | render_cap_t(m, "CapPrm:\t", &cap_permitted); | 324 | render_cap_t(m, "CapPrm:\t", &cap_permitted); |
323 | render_cap_t(m, "CapEff:\t", &cap_effective); | 325 | render_cap_t(m, "CapEff:\t", &cap_effective); |
324 | render_cap_t(m, "CapBnd:\t", &cap_bset); | 326 | render_cap_t(m, "CapBnd:\t", &cap_bset); |
327 | render_cap_t(m, "CapAmb:\t", &cap_ambient); | ||
325 | } | 328 | } |
326 | 329 | ||
327 | static inline void task_seccomp(struct seq_file *m, struct task_struct *p) | 330 | static inline void task_seccomp(struct seq_file *m, struct task_struct *p) |
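
The new CapAmb line surfaces the ambient capability set in /proc/<pid>/status next to the four existing sets. A hedged userspace sketch that raises one ambient capability through prctl() (the PR_CAP_AMBIENT constants come from the same patch series; the raise only succeeds when the capability is already present in both the permitted and inheritable sets):

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/capability.h>

	#ifndef PR_CAP_AMBIENT			/* values from the new uapi header */
	#define PR_CAP_AMBIENT		47
	#define PR_CAP_AMBIENT_IS_SET	1
	#define PR_CAP_AMBIENT_RAISE	2
	#endif

	int main(void)
	{
		/* An ambient capability survives execve() of an unprivileged
		 * binary, unlike the permitted/effective sets. */
		if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
			  CAP_NET_BIND_SERVICE, 0, 0))
			perror("PR_CAP_AMBIENT_RAISE");

		printf("ambient set: %ld\n",
		       (long)prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET,
				   CAP_NET_BIND_SERVICE, 0, 0));
		return 0;	/* CapAmb in /proc/self/status reflects this */
	}
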
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ca1e091881d4..3b4d8255e806 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | |||
597 | [ilog2(VM_HUGEPAGE)] = "hg", | 597 | [ilog2(VM_HUGEPAGE)] = "hg", |
598 | [ilog2(VM_NOHUGEPAGE)] = "nh", | 598 | [ilog2(VM_NOHUGEPAGE)] = "nh", |
599 | [ilog2(VM_MERGEABLE)] = "mg", | 599 | [ilog2(VM_MERGEABLE)] = "mg", |
600 | [ilog2(VM_UFFD_MISSING)]= "um", | ||
601 | [ilog2(VM_UFFD_WP)] = "uw", | ||
600 | }; | 602 | }; |
601 | size_t i; | 603 | size_t i; |
602 | 604 | ||
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0e4cf728126f..4a62fe8cc3bf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c | |||
@@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) | |||
714 | seq_puts(seq, ",acl"); | 714 | seq_puts(seq, ",acl"); |
715 | 715 | ||
716 | if (REISERFS_SB(s)->s_jdev) | 716 | if (REISERFS_SB(s)->s_jdev) |
717 | seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); | 717 | seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev); |
718 | 718 | ||
719 | if (journal->j_max_commit_age != journal->j_default_max_commit_age) | 719 | if (journal->j_max_commit_age != journal->j_default_max_commit_age) |
720 | seq_printf(seq, ",commit=%d", journal->j_max_commit_age); | 720 | seq_printf(seq, ",commit=%d", journal->j_max_commit_age); |
721 | 721 | ||
722 | #ifdef CONFIG_QUOTA | 722 | #ifdef CONFIG_QUOTA |
723 | if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) | 723 | if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) |
724 | seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); | 724 | seq_show_option(seq, "usrjquota", |
725 | REISERFS_SB(s)->s_qf_names[USRQUOTA]); | ||
725 | else if (opts & (1 << REISERFS_USRQUOTA)) | 726 | else if (opts & (1 << REISERFS_USRQUOTA)) |
726 | seq_puts(seq, ",usrquota"); | 727 | seq_puts(seq, ",usrquota"); |
727 | if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) | 728 | if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) |
728 | seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); | 729 | seq_show_option(seq, "grpjquota", |
730 | REISERFS_SB(s)->s_qf_names[GRPQUOTA]); | ||
729 | else if (opts & (1 << REISERFS_GRPQUOTA)) | 731 | else if (opts & (1 << REISERFS_GRPQUOTA)) |
730 | seq_puts(seq, ",grpquota"); | 732 | seq_puts(seq, ",grpquota"); |
731 | if (REISERFS_SB(s)->s_jquota_fmt) { | 733 | if (REISERFS_SB(s)->s_jquota_fmt) { |
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c new file mode 100644 index 000000000000..634e676072cb --- /dev/null +++ b/fs/userfaultfd.c | |||
@@ -0,0 +1,1330 @@ | |||
1 | /* | ||
2 | * fs/userfaultfd.c | ||
3 | * | ||
4 | * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> | ||
5 | * Copyright (C) 2008-2009 Red Hat, Inc. | ||
6 | * Copyright (C) 2015 Red Hat, Inc. | ||
7 | * | ||
8 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
9 | * the COPYING file in the top-level directory. | ||
10 | * | ||
11 | * Some part derived from fs/eventfd.c (anon inode setup) and | ||
12 | * mm/ksm.c (mm hashing). | ||
13 | */ | ||
14 | |||
15 | #include <linux/hashtable.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/poll.h> | ||
19 | #include <linux/slab.h> | ||
20 | #include <linux/seq_file.h> | ||
21 | #include <linux/file.h> | ||
22 | #include <linux/bug.h> | ||
23 | #include <linux/anon_inodes.h> | ||
24 | #include <linux/syscalls.h> | ||
25 | #include <linux/userfaultfd_k.h> | ||
26 | #include <linux/mempolicy.h> | ||
27 | #include <linux/ioctl.h> | ||
28 | #include <linux/security.h> | ||
29 | |||
30 | static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; | ||
31 | |||
32 | enum userfaultfd_state { | ||
33 | UFFD_STATE_WAIT_API, | ||
34 | UFFD_STATE_RUNNING, | ||
35 | }; | ||
36 | |||
37 | /* | ||
38 | * Start with fault_pending_wqh and fault_wqh so they're more likely | ||
39 | * to be in the same cacheline. | ||
40 | */ | ||
41 | struct userfaultfd_ctx { | ||
42 | /* waitqueue head for the pending (i.e. not read) userfaults */ | ||
43 | wait_queue_head_t fault_pending_wqh; | ||
44 | /* waitqueue head for the userfaults */ | ||
45 | wait_queue_head_t fault_wqh; | ||
46 | /* waitqueue head for the pseudo fd to wakeup poll/read */ | ||
47 | wait_queue_head_t fd_wqh; | ||
48 | /* a refile sequence protected by fault_pending_wqh lock */ | ||
49 | struct seqcount refile_seq; | ||
50 | /* pseudo fd refcounting */ | ||
51 | atomic_t refcount; | ||
52 | /* userfaultfd syscall flags */ | ||
53 | unsigned int flags; | ||
54 | /* state machine */ | ||
55 | enum userfaultfd_state state; | ||
56 | /* released */ | ||
57 | bool released; | ||
58 | /* mm with one or more vmas attached to this userfaultfd_ctx */ | ||
59 | struct mm_struct *mm; | ||
60 | }; | ||
61 | |||
62 | struct userfaultfd_wait_queue { | ||
63 | struct uffd_msg msg; | ||
64 | wait_queue_t wq; | ||
65 | struct userfaultfd_ctx *ctx; | ||
66 | }; | ||
67 | |||
68 | struct userfaultfd_wake_range { | ||
69 | unsigned long start; | ||
70 | unsigned long len; | ||
71 | }; | ||
72 | |||
73 | static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, | ||
74 | int wake_flags, void *key) | ||
75 | { | ||
76 | struct userfaultfd_wake_range *range = key; | ||
77 | int ret; | ||
78 | struct userfaultfd_wait_queue *uwq; | ||
79 | unsigned long start, len; | ||
80 | |||
81 | uwq = container_of(wq, struct userfaultfd_wait_queue, wq); | ||
82 | ret = 0; | ||
83 | /* len == 0 means wake all */ | ||
84 | start = range->start; | ||
85 | len = range->len; | ||
86 | if (len && (start > uwq->msg.arg.pagefault.address || | ||
87 | start + len <= uwq->msg.arg.pagefault.address)) | ||
88 | goto out; | ||
89 | ret = wake_up_state(wq->private, mode); | ||
90 | if (ret) | ||
91 | /* | ||
92 | * Wake only once, autoremove behavior. | ||
93 | * | ||
94 | * After the effect of list_del_init is visible to the | ||
95 | * other CPUs, the waitqueue may disappear from under | ||
96 | * us, see the !list_empty_careful() in | ||
97 | * handle_userfault(). try_to_wake_up() has an | ||
98 | * implicit smp_mb__before_spinlock, and the | ||
99 | * wq->private is read before calling the extern | ||
100 | * function "wake_up_state" (which in turn calls | ||
101 | * try_to_wake_up). While the spin_lock;spin_unlock; | ||
102 | * wouldn't be enough, the smp_mb__before_spinlock is | ||
103 | * enough to avoid an explicit smp_mb() here. | ||
104 | */ | ||
105 | list_del_init(&wq->task_list); | ||
106 | out: | ||
107 | return ret; | ||
108 | } | ||
109 | |||
110 | /** | ||
111 | * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd | ||
112 | * context. | ||
113 | * @ctx: [in] Pointer to the userfaultfd context. | ||
114 | * | ||
115 | * Returns: nothing; BUG()s if the reference count was already zero. | ||
116 | */ | ||
117 | static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) | ||
118 | { | ||
119 | if (!atomic_inc_not_zero(&ctx->refcount)) | ||
120 | BUG(); | ||
121 | } | ||
122 | |||
123 | /** | ||
124 | * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd | ||
125 | * context. | ||
126 | * @ctx: [in] Pointer to userfaultfd context. | ||
127 | * | ||
128 | * The userfaultfd context reference must have been previously acquired either | ||
129 | * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). | ||
130 | */ | ||
131 | static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) | ||
132 | { | ||
133 | if (atomic_dec_and_test(&ctx->refcount)) { | ||
134 | VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); | ||
135 | VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); | ||
136 | VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); | ||
137 | VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); | ||
138 | VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); | ||
139 | VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); | ||
140 | mmput(ctx->mm); | ||
141 | kmem_cache_free(userfaultfd_ctx_cachep, ctx); | ||
142 | } | ||
143 | } | ||
144 | |||
145 | static inline void msg_init(struct uffd_msg *msg) | ||
146 | { | ||
147 | BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); | ||
148 | /* | ||
149 | * Must use memset to zero out the padding or kernel data is | ||
150 | * leaked to userland. | ||
151 | */ | ||
152 | memset(msg, 0, sizeof(struct uffd_msg)); | ||
153 | } | ||
154 | |||
155 | static inline struct uffd_msg userfault_msg(unsigned long address, | ||
156 | unsigned int flags, | ||
157 | unsigned long reason) | ||
158 | { | ||
159 | struct uffd_msg msg; | ||
160 | msg_init(&msg); | ||
161 | msg.event = UFFD_EVENT_PAGEFAULT; | ||
162 | msg.arg.pagefault.address = address; | ||
163 | if (flags & FAULT_FLAG_WRITE) | ||
164 | /* | ||
165 | * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the | ||
166 | * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE | ||
167 | * was not set in a UFFD_EVENT_PAGEFAULT, it means it | ||
168 | * was a read fault, otherwise if set it means it's | ||
169 | * a write fault. | ||
170 | */ | ||
171 | msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; | ||
172 | if (reason & VM_UFFD_WP) | ||
173 | /* | ||
174 | * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the | ||
175 | * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was | ||
176 | * not set in a UFFD_EVENT_PAGEFAULT, it means it was | ||
177 | * a missing fault, otherwise if set it means it's a | ||
178 | * write protect fault. | ||
179 | */ | ||
180 | msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; | ||
181 | return msg; | ||
182 | } | ||
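
On the consumer side, userland distinguishes the fault type by testing these flag bits in the delivered message. A hedged fragment, using the uffd_msg layout and constants the series adds in linux/userfaultfd.h:

	#include <stdio.h>
	#include <linux/userfaultfd.h>

	static void classify_fault(const struct uffd_msg *msg)
	{
		if (msg->event != UFFD_EVENT_PAGEFAULT)
			return;

		/* WRITE set => write fault; clear => read fault on a missing
		 * page (given the kernel advertised the WRITE feature bit). */
		printf("%s fault at 0x%llx\n",
		       (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) ?
				"write" : "read",
		       (unsigned long long)msg->arg.pagefault.address);
	}
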
183 | |||
184 | /* | ||
185 | * Verify the pagetables are still not ok after having registered into | ||
186 | * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any | ||
187 | * userfault that has already been resolved, if userfaultfd_read and | ||
188 | * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different | ||
189 | * threads. | ||
190 | */ | ||
191 | static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, | ||
192 | unsigned long address, | ||
193 | unsigned long flags, | ||
194 | unsigned long reason) | ||
195 | { | ||
196 | struct mm_struct *mm = ctx->mm; | ||
197 | pgd_t *pgd; | ||
198 | pud_t *pud; | ||
199 | pmd_t *pmd, _pmd; | ||
200 | pte_t *pte; | ||
201 | bool ret = true; | ||
202 | |||
203 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | ||
204 | |||
205 | pgd = pgd_offset(mm, address); | ||
206 | if (!pgd_present(*pgd)) | ||
207 | goto out; | ||
208 | pud = pud_offset(pgd, address); | ||
209 | if (!pud_present(*pud)) | ||
210 | goto out; | ||
211 | pmd = pmd_offset(pud, address); | ||
212 | /* | ||
213 | * READ_ONCE must function as a barrier with narrower scope | ||
214 | * and it must be equivalent to: | ||
215 | * _pmd = *pmd; barrier(); | ||
216 | * | ||
217 | * This is to deal with the instability (as in | ||
218 | * pmd_trans_unstable) of the pmd. | ||
219 | */ | ||
220 | _pmd = READ_ONCE(*pmd); | ||
221 | if (!pmd_present(_pmd)) | ||
222 | goto out; | ||
223 | |||
224 | ret = false; | ||
225 | if (pmd_trans_huge(_pmd)) | ||
226 | goto out; | ||
227 | |||
228 | /* | ||
229 | * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it | ||
230 | * and use the standard pte_offset_map() instead of parsing _pmd. | ||
231 | */ | ||
232 | pte = pte_offset_map(pmd, address); | ||
233 | /* | ||
234 | * Lockless access: we're in a wait_event so it's ok if it | ||
235 | * changes under us. | ||
236 | */ | ||
237 | if (pte_none(*pte)) | ||
238 | ret = true; | ||
239 | pte_unmap(pte); | ||
240 | |||
241 | out: | ||
242 | return ret; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * The locking rules involved in returning VM_FAULT_RETRY depending on | ||
247 | * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and | ||
248 | * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" | ||
249 | * recommendation in __lock_page_or_retry is not an understatement. | ||
250 | * | ||
251 | * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released | ||
252 | * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is | ||
253 | * not set. | ||
254 | * | ||
255 | * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not | ||
256 | * set, VM_FAULT_RETRY can still be returned if and only if there are | ||
257 | * fatal_signal_pending()s, and the mmap_sem must be released before | ||
258 | * returning it. | ||
259 | */ | ||
260 | int handle_userfault(struct vm_area_struct *vma, unsigned long address, | ||
261 | unsigned int flags, unsigned long reason) | ||
262 | { | ||
263 | struct mm_struct *mm = vma->vm_mm; | ||
264 | struct userfaultfd_ctx *ctx; | ||
265 | struct userfaultfd_wait_queue uwq; | ||
266 | int ret; | ||
267 | bool must_wait, return_to_userland; | ||
268 | |||
269 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | ||
270 | |||
271 | ret = VM_FAULT_SIGBUS; | ||
272 | ctx = vma->vm_userfaultfd_ctx.ctx; | ||
273 | if (!ctx) | ||
274 | goto out; | ||
275 | |||
276 | BUG_ON(ctx->mm != mm); | ||
277 | |||
278 | VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); | ||
279 | VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); | ||
280 | |||
281 | /* | ||
282 | * If it's already released don't get it. This avoids looping | ||
283 | * in __get_user_pages if userfaultfd_release waits on the | ||
284 | * caller of handle_userfault to release the mmap_sem. | ||
285 | */ | ||
286 | if (unlikely(ACCESS_ONCE(ctx->released))) | ||
287 | goto out; | ||
288 | |||
289 | /* | ||
290 | * Check that we can return VM_FAULT_RETRY. | ||
291 | * | ||
292 | * NOTE: it should become possible to return VM_FAULT_RETRY | ||
293 | * even if FAULT_FLAG_TRIED is set without leading to gup() | ||
294 | * -EBUSY failures, if the userfaultfd is to be extended for | ||
295 | * VM_UFFD_WP tracking and we intend to arm the userfault | ||
296 | * without first stopping userland access to the memory. For | ||
297 | * VM_UFFD_MISSING userfaults this is enough for now. | ||
298 | */ | ||
299 | if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { | ||
300 | /* | ||
301 | * Validate the invariant that nowait must allow retry | ||
302 | * to be sure not to return SIGBUS erroneously on | ||
303 | * nowait invocations. | ||
304 | */ | ||
305 | BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); | ||
306 | #ifdef CONFIG_DEBUG_VM | ||
307 | if (printk_ratelimit()) { | ||
308 | printk(KERN_WARNING | ||
309 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); | ||
310 | dump_stack(); | ||
311 | } | ||
312 | #endif | ||
313 | goto out; | ||
314 | } | ||
315 | |||
316 | /* | ||
317 | * Handle nowait, not much to do other than tell it to retry | ||
318 | * and wait. | ||
319 | */ | ||
320 | ret = VM_FAULT_RETRY; | ||
321 | if (flags & FAULT_FLAG_RETRY_NOWAIT) | ||
322 | goto out; | ||
323 | |||
324 | /* take the reference before dropping the mmap_sem */ | ||
325 | userfaultfd_ctx_get(ctx); | ||
326 | |||
327 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); | ||
328 | uwq.wq.private = current; | ||
329 | uwq.msg = userfault_msg(address, flags, reason); | ||
330 | uwq.ctx = ctx; | ||
331 | |||
332 | return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == | ||
333 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); | ||
334 | |||
335 | spin_lock(&ctx->fault_pending_wqh.lock); | ||
336 | /* | ||
337 | * After the __add_wait_queue the uwq is visible to userland | ||
338 | * through poll/read(). | ||
339 | */ | ||
340 | __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); | ||
341 | /* | ||
342 | * The smp_mb() after __set_current_state prevents the reads | ||
343 | * following the spin_unlock from happening before the list_add in | ||
344 | * __add_wait_queue. | ||
345 | */ | ||
346 | set_current_state(return_to_userland ? TASK_INTERRUPTIBLE : | ||
347 | TASK_KILLABLE); | ||
348 | spin_unlock(&ctx->fault_pending_wqh.lock); | ||
349 | |||
350 | must_wait = userfaultfd_must_wait(ctx, address, flags, reason); | ||
351 | up_read(&mm->mmap_sem); | ||
352 | |||
353 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && | ||
354 | (return_to_userland ? !signal_pending(current) : | ||
355 | !fatal_signal_pending(current)))) { | ||
356 | wake_up_poll(&ctx->fd_wqh, POLLIN); | ||
357 | schedule(); | ||
358 | ret |= VM_FAULT_MAJOR; | ||
359 | } | ||
360 | |||
361 | __set_current_state(TASK_RUNNING); | ||
362 | |||
363 | if (return_to_userland) { | ||
364 | if (signal_pending(current) && | ||
365 | !fatal_signal_pending(current)) { | ||
366 | /* | ||
367 | * If we got a SIGSTOP or SIGCONT and this is | ||
368 | * a normal userland page fault, just let | ||
369 | * userland return so the signal will be | ||
370 | * handled and gdb debugging works. The page | ||
371 | * fault code immediately after we return from | ||
372 | * this function is going to release the | ||
373 | * mmap_sem and it's not depending on it | ||
374 | * (unlike gup would if we were not to return | ||
375 | * VM_FAULT_RETRY). | ||
376 | * | ||
377 | * If a fatal signal is pending we still take | ||
378 | * the streamlined VM_FAULT_RETRY failure path | ||
379 | * and there's no need to retake the mmap_sem | ||
380 | * in such case. | ||
381 | */ | ||
382 | down_read(&mm->mmap_sem); | ||
383 | ret = 0; | ||
384 | } | ||
385 | } | ||
386 | |||
387 | /* | ||
388 | * Here we race with the list_del; list_add in | ||
389 | * userfaultfd_ctx_read(), however because we don't ever run | ||
390 | * list_del_init() to refile across the two lists, the prev | ||
391 | * and next pointers will never point to self. list_add also | ||
392 | * would never let either of the two pointers point to | ||
393 | * self. So list_empty_careful won't risk seeing both pointers | ||
394 | * pointing to self at any time during the list refile. The | ||
395 | * only case where list_del_init() is called is the full | ||
396 | * removal in the wake function and there we don't re-list_add | ||
397 | * and it's fine not to block on the spinlock. The uwq on this | ||
398 | * kernel stack can be released after the list_del_init. | ||
399 | */ | ||
400 | if (!list_empty_careful(&uwq.wq.task_list)) { | ||
401 | spin_lock(&ctx->fault_pending_wqh.lock); | ||
402 | /* | ||
403 | * No need of list_del_init(), the uwq on the stack | ||
404 | * will be freed shortly anyway. | ||
405 | */ | ||
406 | list_del(&uwq.wq.task_list); | ||
407 | spin_unlock(&ctx->fault_pending_wqh.lock); | ||
408 | } | ||
409 | |||
410 | /* | ||
411 | * ctx may go away after this if the userfault pseudo fd is | ||
412 | * already released. | ||
413 | */ | ||
414 | userfaultfd_ctx_put(ctx); | ||
415 | |||
416 | out: | ||
417 | return ret; | ||
418 | } | ||
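
On the other side of this rendezvous, a monitor thread typically poll()s the descriptor, read()s the pending message, and resolves the fault with UFFDIO_COPY, which is what finally wakes the faulting task parked in the schedule() above. A hedged sketch with error handling trimmed (uffd, page, and page_size are assumed to be set up already):

	#include <poll.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	/* Resolve one fault by copying a prepared page into the hole. */
	static void handle_one_fault(int uffd, void *page,
				     unsigned long long page_size)
	{
		struct pollfd pfd = { .fd = uffd, .events = POLLIN };
		struct uffd_msg msg;
		struct uffdio_copy copy;

		poll(&pfd, 1, -1);		/* wait for a userfault */
		read(uffd, &msg, sizeof(msg));	/* fetch its description */

		copy.dst = msg.arg.pagefault.address & ~(page_size - 1);
		copy.src = (unsigned long)page;
		copy.len = page_size;
		copy.mode = 0;			/* wake the faulter right away */
		ioctl(uffd, UFFDIO_COPY, &copy);
	}
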
419 | |||
420 | static int userfaultfd_release(struct inode *inode, struct file *file) | ||
421 | { | ||
422 | struct userfaultfd_ctx *ctx = file->private_data; | ||
423 | struct mm_struct *mm = ctx->mm; | ||
424 | struct vm_area_struct *vma, *prev; | ||
425 | /* len == 0 means wake all */ | ||
426 | struct userfaultfd_wake_range range = { .len = 0, }; | ||
427 | unsigned long new_flags; | ||
428 | |||
429 | ACCESS_ONCE(ctx->released) = true; | ||
430 | |||
431 | /* | ||
432 | * Flush page faults out of all CPUs. NOTE: all page faults | ||
433 | * must be retried without returning VM_FAULT_SIGBUS if | ||
434 | * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx | ||
435 | * changes while handle_userfault released the mmap_sem. So | ||
436 | * it's critical that released is set to true (above), before | ||
437 | * taking the mmap_sem for writing. | ||
438 | */ | ||
439 | down_write(&mm->mmap_sem); | ||
440 | prev = NULL; | ||
441 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
442 | cond_resched(); | ||
443 | BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ | ||
444 | !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); | ||
445 | if (vma->vm_userfaultfd_ctx.ctx != ctx) { | ||
446 | prev = vma; | ||
447 | continue; | ||
448 | } | ||
449 | new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); | ||
450 | prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, | ||
451 | new_flags, vma->anon_vma, | ||
452 | vma->vm_file, vma->vm_pgoff, | ||
453 | vma_policy(vma), | ||
454 | NULL_VM_UFFD_CTX); | ||
455 | if (prev) | ||
456 | vma = prev; | ||
457 | else | ||
458 | prev = vma; | ||
459 | vma->vm_flags = new_flags; | ||
460 | vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; | ||
461 | } | ||
462 | up_write(&mm->mmap_sem); | ||
463 | |||
464 | /* | ||
465 | * After no new page faults can wait on this fault_*wqh, flush | ||
466 | * the last page faults that may have been already waiting on | ||
467 | * the fault_*wqh. | ||
468 | */ | ||
469 | spin_lock(&ctx->fault_pending_wqh.lock); | ||
470 | __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); | ||
471 | __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); | ||
472 | spin_unlock(&ctx->fault_pending_wqh.lock); | ||
473 | |||
474 | wake_up_poll(&ctx->fd_wqh, POLLHUP); | ||
475 | userfaultfd_ctx_put(ctx); | ||
476 | return 0; | ||
477 | } | ||
478 | |||
479 | /* fault_pending_wqh.lock must be held by the caller */ | ||
480 | static inline struct userfaultfd_wait_queue *find_userfault( | ||
481 | struct userfaultfd_ctx *ctx) | ||
482 | { | ||
483 | wait_queue_t *wq; | ||
484 | struct userfaultfd_wait_queue *uwq; | ||
485 | |||
486 | VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); | ||
487 | |||
488 | uwq = NULL; | ||
489 | if (!waitqueue_active(&ctx->fault_pending_wqh)) | ||
490 | goto out; | ||
491 | /* walk in reverse to provide FIFO behavior to read userfaults */ | ||
492 | wq = list_last_entry(&ctx->fault_pending_wqh.task_list, | ||
493 | typeof(*wq), task_list); | ||
494 | uwq = container_of(wq, struct userfaultfd_wait_queue, wq); | ||
495 | out: | ||
496 | return uwq; | ||
497 | } | ||
498 | |||
499 | static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) | ||
500 | { | ||
501 | struct userfaultfd_ctx *ctx = file->private_data; | ||
502 | unsigned int ret; | ||
503 | |||
504 | poll_wait(file, &ctx->fd_wqh, wait); | ||
505 | |||
506 | switch (ctx->state) { | ||
507 | case UFFD_STATE_WAIT_API: | ||
508 | return POLLERR; | ||
509 | case UFFD_STATE_RUNNING: | ||
510 | /* | ||
511 | * poll() never guarantees that read won't block. | ||
512 | * userfaults can be woken before they're read(). | ||
513 | */ | ||
514 | if (unlikely(!(file->f_flags & O_NONBLOCK))) | ||
515 | return POLLERR; | ||
516 | /* | ||
517 | * lockless access to see if there are pending faults. | ||
518 | * __pollwait's last action is the add_wait_queue, but | ||
519 | * the spin_unlock would allow the waitqueue_active to | ||
520 | * pass above the actual list_add inside | ||
521 | * add_wait_queue critical section. So use a full | ||
522 | * memory barrier to serialize the list_add write of | ||
523 | * add_wait_queue() with the waitqueue_active read | ||
524 | * below. | ||
525 | */ | ||
526 | ret = 0; | ||
527 | smp_mb(); | ||
528 | if (waitqueue_active(&ctx->fault_pending_wqh)) | ||
529 | ret = POLLIN; | ||
530 | return ret; | ||
531 | default: | ||
532 | BUG(); | ||
533 | } | ||
534 | } | ||
535 | |||
536 | static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, | ||
537 | struct uffd_msg *msg) | ||
538 | { | ||
539 | ssize_t ret; | ||
540 | DECLARE_WAITQUEUE(wait, current); | ||
541 | struct userfaultfd_wait_queue *uwq; | ||
542 | |||
543 | /* always take the fd_wqh lock before the fault_pending_wqh lock */ | ||
544 | spin_lock(&ctx->fd_wqh.lock); | ||
545 | __add_wait_queue(&ctx->fd_wqh, &wait); | ||
546 | for (;;) { | ||
547 | set_current_state(TASK_INTERRUPTIBLE); | ||
548 | spin_lock(&ctx->fault_pending_wqh.lock); | ||
549 | uwq = find_userfault(ctx); | ||
550 | if (uwq) { | ||
551 | /* | ||
552 | * Use a seqcount to repeat the lockless check | ||
553 | * in wake_userfault() to avoid missing | ||
554 | * wakeups because during the refile both | ||
555 | * waitqueue could become empty if this is the | ||
556 | * only userfault. | ||
557 | */ | ||
558 | write_seqcount_begin(&ctx->refile_seq); | ||
559 | |||
560 | /* | ||
561 | * The fault_pending_wqh.lock prevents the uwq | ||
562 | * from disappearing from under us. | ||
563 | * | ||
564 | * Refile this userfault from | ||
565 | * fault_pending_wqh to fault_wqh, it's not | ||
566 | * pending anymore after we read it. | ||
567 | * | ||
568 | * Use list_del() by hand (as | ||
569 | * userfaultfd_wake_function also uses | ||
570 | * list_del_init() by hand) to be sure nobody | ||
571 | * changes __remove_wait_queue() to use | ||
572 | * list_del_init() in turn breaking the | ||
573 | * !list_empty_careful() check in | ||
574 | * handle_userfault(). The uwq->wq.task_list | ||
575 | * must never be empty at any time during the | ||
576 | * refile, or the waitqueue could disappear | ||
577 | * from under us. The "wait_queue_head_t" | ||
578 | * parameter of __remove_wait_queue() is unused | ||
579 | * anyway. | ||
580 | */ | ||
581 | list_del(&uwq->wq.task_list); | ||
582 | __add_wait_queue(&ctx->fault_wqh, &uwq->wq); | ||
583 | |||
584 | write_seqcount_end(&ctx->refile_seq); | ||
585 | |||
586 | /* careful to always initialize msg if ret == 0 */ | ||
587 | *msg = uwq->msg; | ||
588 | spin_unlock(&ctx->fault_pending_wqh.lock); | ||
589 | ret = 0; | ||
590 | break; | ||
591 | } | ||
592 | spin_unlock(&ctx->fault_pending_wqh.lock); | ||
593 | if (signal_pending(current)) { | ||
594 | ret = -ERESTARTSYS; | ||
595 | break; | ||
596 | } | ||
597 | if (no_wait) { | ||
598 | ret = -EAGAIN; | ||
599 | break; | ||
600 | } | ||
601 | spin_unlock(&ctx->fd_wqh.lock); | ||
602 | schedule(); | ||
603 | spin_lock(&ctx->fd_wqh.lock); | ||
604 | } | ||
605 | __remove_wait_queue(&ctx->fd_wqh, &wait); | ||
606 | __set_current_state(TASK_RUNNING); | ||
607 | spin_unlock(&ctx->fd_wqh.lock); | ||
608 | |||
609 | return ret; | ||
610 | } | ||
611 | |||
612 | static ssize_t userfaultfd_read(struct file *file, char __user *buf, | ||
613 | size_t count, loff_t *ppos) | ||
614 | { | ||
615 | struct userfaultfd_ctx *ctx = file->private_data; | ||
616 | ssize_t _ret, ret = 0; | ||
617 | struct uffd_msg msg; | ||
618 | int no_wait = file->f_flags & O_NONBLOCK; | ||
619 | |||
620 | if (ctx->state == UFFD_STATE_WAIT_API) | ||
621 | return -EINVAL; | ||
622 | |||
623 | for (;;) { | ||
624 | if (count < sizeof(msg)) | ||
625 | return ret ? ret : -EINVAL; | ||
626 | _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); | ||
627 | if (_ret < 0) | ||
628 | return ret ? ret : _ret; | ||
629 | if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) | ||
630 | return ret ? ret : -EFAULT; | ||
631 | ret += sizeof(msg); | ||
632 | buf += sizeof(msg); | ||
633 | count -= sizeof(msg); | ||
634 | /* | ||
635 | * Allow reading more than one fault at a time, but only | ||
636 | * block if waiting for the very first one. | ||
637 | */ | ||
638 | no_wait = O_NONBLOCK; | ||
639 | } | ||
640 | } | ||
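
Because no_wait is forced to O_NONBLOCK after the first message, a single read() with a large enough buffer drains several pending faults in one call and only ever blocks while waiting for the first. A short fragment of such a batched read:

	struct uffd_msg msgs[16];
	ssize_t n = read(uffd, msgs, sizeof(msgs));	/* blocks only if none pending */

	for (ssize_t i = 0; n > 0 && i < n / (ssize_t)sizeof(msgs[0]); i++)
		classify_fault(&msgs[i]);	/* e.g. the helper sketched earlier */
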
641 | |||
642 | static void __wake_userfault(struct userfaultfd_ctx *ctx, | ||
643 | struct userfaultfd_wake_range *range) | ||
644 | { | ||
645 | unsigned long start, end; | ||
646 | |||
647 | start = range->start; | ||
648 | end = range->start + range->len; | ||
649 | |||
650 | spin_lock(&ctx->fault_pending_wqh.lock); | ||
651 | /* wake all in the range and autoremove */ | ||
652 | if (waitqueue_active(&ctx->fault_pending_wqh)) | ||
653 | __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, | ||
654 | range); | ||
655 | if (waitqueue_active(&ctx->fault_wqh)) | ||
656 | __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); | ||
657 | spin_unlock(&ctx->fault_pending_wqh.lock); | ||
658 | } | ||
659 | |||
660 | static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, | ||
661 | struct userfaultfd_wake_range *range) | ||
662 | { | ||
663 | unsigned seq; | ||
664 | bool need_wakeup; | ||
665 | |||
666 | /* | ||
667 | * To be sure waitqueue_active() is not reordered by the CPU | ||
668 | * before the pagetable update, use an explicit SMP memory | ||
669 | * barrier here. PT lock release or up_read(mmap_sem) only | ||
670 | * have release semantics, which can still allow the | ||
671 | * waitqueue_active() to be reordered before the pte update. | ||
672 | */ | ||
673 | smp_mb(); | ||
674 | |||
675 | /* | ||
676 | * Use waitqueue_active because the address space is very | ||
677 | * frequently changed atomically even when there are no | ||
678 | * userfaults yet. So we take the spinlock only when we're | ||
679 | * sure we have userfaults to wake. | ||
680 | */ | ||
681 | do { | ||
682 | seq = read_seqcount_begin(&ctx->refile_seq); | ||
683 | need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || | ||
684 | waitqueue_active(&ctx->fault_wqh); | ||
685 | cond_resched(); | ||
686 | } while (read_seqcount_retry(&ctx->refile_seq, seq)); | ||
687 | if (need_wakeup) | ||
688 | __wake_userfault(ctx, range); | ||
689 | } | ||
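
The refile_seq handshake above is an instance of the general seqcount pattern: the writer brackets the window in which the combined state is transiently inconsistent, and the lockless reader retries whenever its snapshot may have overlapped a write. A stripped-down kernel-idiom sketch of that pattern (illustrative only, not the uffd code; the writer is assumed to hold the protecting spinlock, as fault_pending_wqh.lock is held here):

	static seqcount_t seq;
	static int queue_a_len, queue_b_len;	/* invariant: the sum is stable */

	/* Writer: momentarily breaks the invariant while moving an item. */
	static void move_one(void)
	{
		write_seqcount_begin(&seq);
		queue_a_len--;
		queue_b_len++;
		write_seqcount_end(&seq);
	}

	/* Lockless reader: retries if it may have seen the torn state. */
	static int total_len(void)
	{
		unsigned int s;
		int sum;

		do {
			s = read_seqcount_begin(&seq);
			sum = queue_a_len + queue_b_len;
		} while (read_seqcount_retry(&seq, s));

		return sum;
	}
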
690 | |||
691 | static __always_inline int validate_range(struct mm_struct *mm, | ||
692 | __u64 start, __u64 len) | ||
693 | { | ||
694 | __u64 task_size = mm->task_size; | ||
695 | |||
696 | if (start & ~PAGE_MASK) | ||
697 | return -EINVAL; | ||
698 | if (len & ~PAGE_MASK) | ||
699 | return -EINVAL; | ||
700 | if (!len) | ||
701 | return -EINVAL; | ||
702 | if (start < mmap_min_addr) | ||
703 | return -EINVAL; | ||
704 | if (start >= task_size) | ||
705 | return -EINVAL; | ||
706 | if (len > task_size - start) | ||
707 | return -EINVAL; | ||
708 | return 0; | ||
709 | } | ||
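
The same alignment and wraparound constraints are cheap to mirror in userland before issuing any ioctl, which turns a surprising -EINVAL into an early local failure. A hedged mirror of the checks (mmap_min_addr and task_size are deliberately omitted, since userland cannot portably know them):

	#include <stdbool.h>
	#include <unistd.h>

	static bool uffd_range_ok(unsigned long long start, unsigned long long len)
	{
		unsigned long long mask = (unsigned long long)sysconf(_SC_PAGESIZE) - 1;

		return len && !(start & mask) && !(len & mask) &&
		       start + len > start;	/* reject wraparound */
	}
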
710 | |||
711 | static int userfaultfd_register(struct userfaultfd_ctx *ctx, | ||
712 | unsigned long arg) | ||
713 | { | ||
714 | struct mm_struct *mm = ctx->mm; | ||
715 | struct vm_area_struct *vma, *prev, *cur; | ||
716 | int ret; | ||
717 | struct uffdio_register uffdio_register; | ||
718 | struct uffdio_register __user *user_uffdio_register; | ||
719 | unsigned long vm_flags, new_flags; | ||
720 | bool found; | ||
721 | unsigned long start, end, vma_end; | ||
722 | |||
723 | user_uffdio_register = (struct uffdio_register __user *) arg; | ||
724 | |||
725 | ret = -EFAULT; | ||
726 | if (copy_from_user(&uffdio_register, user_uffdio_register, | ||
727 | sizeof(uffdio_register)-sizeof(__u64))) | ||
728 | goto out; | ||
729 | |||
730 | ret = -EINVAL; | ||
731 | if (!uffdio_register.mode) | ||
732 | goto out; | ||
733 | if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| | ||
734 | UFFDIO_REGISTER_MODE_WP)) | ||
735 | goto out; | ||
736 | vm_flags = 0; | ||
737 | if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) | ||
738 | vm_flags |= VM_UFFD_MISSING; | ||
739 | if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { | ||
740 | vm_flags |= VM_UFFD_WP; | ||
741 | /* | ||
742 | * FIXME: remove the below error constraint by | ||
743 | * implementing the wprotect tracking mode. | ||
744 | */ | ||
745 | ret = -EINVAL; | ||
746 | goto out; | ||
747 | } | ||
748 | |||
749 | ret = validate_range(mm, uffdio_register.range.start, | ||
750 | uffdio_register.range.len); | ||
751 | if (ret) | ||
752 | goto out; | ||
753 | |||
754 | start = uffdio_register.range.start; | ||
755 | end = start + uffdio_register.range.len; | ||
756 | |||
757 | down_write(&mm->mmap_sem); | ||
758 | vma = find_vma_prev(mm, start, &prev); | ||
759 | |||
760 | ret = -ENOMEM; | ||
761 | if (!vma) | ||
762 | goto out_unlock; | ||
763 | |||
764 | /* check that there's at least one vma in the range */ | ||
765 | ret = -EINVAL; | ||
766 | if (vma->vm_start >= end) | ||
767 | goto out_unlock; | ||
768 | |||
769 | /* | ||
770 | * Search for incompatible vmas. | ||
771 | * | ||
772 | * FIXME: this shall be relaxed later so that it doesn't fail | ||
773 | * on tmpfs backed vmas (in addition to the current allowance | ||
774 | * on anonymous vmas). | ||
775 | */ | ||
776 | found = false; | ||
777 | for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { | ||
778 | cond_resched(); | ||
779 | |||
780 | BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ | ||
781 | !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); | ||
782 | |||
783 | /* check for incompatible vmas */ | ||
784 | ret = -EINVAL; | ||
785 | if (cur->vm_ops) | ||
786 | goto out_unlock; | ||
787 | |||
788 | /* | ||
789 | * Check that this vma isn't already owned by a | ||
790 | * different userfaultfd. We can't allow more than one | ||
791 | * userfaultfd to own a single vma simultaneously or we | ||
792 | * wouldn't know which one to deliver the userfaults to. | ||
793 | */ | ||
794 | ret = -EBUSY; | ||
795 | if (cur->vm_userfaultfd_ctx.ctx && | ||
796 | cur->vm_userfaultfd_ctx.ctx != ctx) | ||
797 | goto out_unlock; | ||
798 | |||
799 | found = true; | ||
800 | } | ||
801 | BUG_ON(!found); | ||
802 | |||
803 | if (vma->vm_start < start) | ||
804 | prev = vma; | ||
805 | |||
806 | ret = 0; | ||
807 | do { | ||
808 | cond_resched(); | ||
809 | |||
810 | BUG_ON(vma->vm_ops); | ||
811 | BUG_ON(vma->vm_userfaultfd_ctx.ctx && | ||
812 | vma->vm_userfaultfd_ctx.ctx != ctx); | ||
813 | |||
814 | /* | ||
815 | * Nothing to do: this vma is already registered into this | ||
816 | * userfaultfd and with the right tracking mode too. | ||
817 | */ | ||
818 | if (vma->vm_userfaultfd_ctx.ctx == ctx && | ||
819 | (vma->vm_flags & vm_flags) == vm_flags) | ||
820 | goto skip; | ||
821 | |||
822 | if (vma->vm_start > start) | ||
823 | start = vma->vm_start; | ||
824 | vma_end = min(end, vma->vm_end); | ||
825 | |||
826 | new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; | ||
827 | prev = vma_merge(mm, prev, start, vma_end, new_flags, | ||
828 | vma->anon_vma, vma->vm_file, vma->vm_pgoff, | ||
829 | vma_policy(vma), | ||
830 | ((struct vm_userfaultfd_ctx){ ctx })); | ||
831 | if (prev) { | ||
832 | vma = prev; | ||
833 | goto next; | ||
834 | } | ||
835 | if (vma->vm_start < start) { | ||
836 | ret = split_vma(mm, vma, start, 1); | ||
837 | if (ret) | ||
838 | break; | ||
839 | } | ||
840 | if (vma->vm_end > end) { | ||
841 | ret = split_vma(mm, vma, end, 0); | ||
842 | if (ret) | ||
843 | break; | ||
844 | } | ||
845 | next: | ||
846 | /* | ||
847 | * In the vma_merge() successful mprotect-like case 8: | ||
848 | * the next vma was merged into the current one and | ||
849 | * the current one has not been updated yet. | ||
850 | */ | ||
851 | vma->vm_flags = new_flags; | ||
852 | vma->vm_userfaultfd_ctx.ctx = ctx; | ||
853 | |||
854 | skip: | ||
855 | prev = vma; | ||
856 | start = vma->vm_end; | ||
857 | vma = vma->vm_next; | ||
858 | } while (vma && vma->vm_start < end); | ||
859 | out_unlock: | ||
860 | up_write(&mm->mmap_sem); | ||
861 | if (!ret) { | ||
862 | /* | ||
863 | * Now that we have scanned all vmas we can already tell | ||
864 | * userland which ioctl methods are guaranteed to | ||
865 | * succeed on this range. | ||
866 | */ | ||
867 | if (put_user(UFFD_API_RANGE_IOCTLS, | ||
868 | &user_uffdio_register->ioctls)) | ||
869 | ret = -EFAULT; | ||
870 | } | ||
871 | out: | ||
872 | return ret; | ||
873 | } | ||
874 | |||
875 | static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, | ||
876 | unsigned long arg) | ||
877 | { | ||
878 | struct mm_struct *mm = ctx->mm; | ||
879 | struct vm_area_struct *vma, *prev, *cur; | ||
880 | int ret; | ||
881 | struct uffdio_range uffdio_unregister; | ||
882 | unsigned long new_flags; | ||
883 | bool found; | ||
884 | unsigned long start, end, vma_end; | ||
885 | const void __user *buf = (void __user *)arg; | ||
886 | |||
887 | ret = -EFAULT; | ||
888 | if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) | ||
889 | goto out; | ||
890 | |||
891 | ret = validate_range(mm, uffdio_unregister.start, | ||
892 | uffdio_unregister.len); | ||
893 | if (ret) | ||
894 | goto out; | ||
895 | |||
896 | start = uffdio_unregister.start; | ||
897 | end = start + uffdio_unregister.len; | ||
898 | |||
899 | down_write(&mm->mmap_sem); | ||
900 | vma = find_vma_prev(mm, start, &prev); | ||
901 | |||
902 | ret = -ENOMEM; | ||
903 | if (!vma) | ||
904 | goto out_unlock; | ||
905 | |||
906 | /* check that there's at least one vma in the range */ | ||
907 | ret = -EINVAL; | ||
908 | if (vma->vm_start >= end) | ||
909 | goto out_unlock; | ||
910 | |||
911 | /* | ||
912 | * Search for incompatible vmas. | ||
913 | * | ||
914 | * FIXME: this shall be relaxed later so that it doesn't fail | ||
915 | * on tmpfs backed vmas (in addition to the current allowance | ||
916 | * on anonymous vmas). | ||
917 | */ | ||
918 | found = false; | ||
919 | ret = -EINVAL; | ||
920 | for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { | ||
921 | cond_resched(); | ||
922 | |||
923 | BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ | ||
924 | !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); | ||
925 | |||
926 | /* | ||
927 | * Check for incompatible vmas. Not strictly required | ||
928 | * here, as incompatible vmas cannot have a | ||
929 | * userfaultfd_ctx registered on them, but this | ||
930 | * provides stricter behavior so that | ||
931 | * unregistration errors are noticed. | ||
932 | */ | ||
933 | if (cur->vm_ops) | ||
934 | goto out_unlock; | ||
935 | |||
936 | found = true; | ||
937 | } | ||
938 | BUG_ON(!found); | ||
939 | |||
940 | if (vma->vm_start < start) | ||
941 | prev = vma; | ||
942 | |||
943 | ret = 0; | ||
944 | do { | ||
945 | cond_resched(); | ||
946 | |||
947 | BUG_ON(vma->vm_ops); | ||
948 | |||
949 | /* | ||
950 | * Nothing to do: this vma is not registered with any | ||
951 | * userfaultfd, so there is nothing to unregister here. | ||
952 | */ | ||
953 | if (!vma->vm_userfaultfd_ctx.ctx) | ||
954 | goto skip; | ||
955 | |||
956 | if (vma->vm_start > start) | ||
957 | start = vma->vm_start; | ||
958 | vma_end = min(end, vma->vm_end); | ||
959 | |||
960 | new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); | ||
961 | prev = vma_merge(mm, prev, start, vma_end, new_flags, | ||
962 | vma->anon_vma, vma->vm_file, vma->vm_pgoff, | ||
963 | vma_policy(vma), | ||
964 | NULL_VM_UFFD_CTX); | ||
965 | if (prev) { | ||
966 | vma = prev; | ||
967 | goto next; | ||
968 | } | ||
969 | if (vma->vm_start < start) { | ||
970 | ret = split_vma(mm, vma, start, 1); | ||
971 | if (ret) | ||
972 | break; | ||
973 | } | ||
974 | if (vma->vm_end > end) { | ||
975 | ret = split_vma(mm, vma, end, 0); | ||
976 | if (ret) | ||
977 | break; | ||
978 | } | ||
979 | next: | ||
980 | /* | ||
981 | * In the vma_merge() successful mprotect-like case 8: | ||
982 | * the next vma was merged into the current one and | ||
983 | * the current one has not been updated yet. | ||
984 | */ | ||
985 | vma->vm_flags = new_flags; | ||
986 | vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; | ||
987 | |||
988 | skip: | ||
989 | prev = vma; | ||
990 | start = vma->vm_end; | ||
991 | vma = vma->vm_next; | ||
992 | } while (vma && vma->vm_start < end); | ||
993 | out_unlock: | ||
994 | up_write(&mm->mmap_sem); | ||
995 | out: | ||
996 | return ret; | ||
997 | } | ||
998 | |||
999 | /* | ||
1000 | * userfaultfd_wake may be used in combination with the | ||
1001 | * UFFDIO_*_MODE_DONTWAKE to wake up userfaults in batches. | ||
1002 | */ | ||
1003 | static int userfaultfd_wake(struct userfaultfd_ctx *ctx, | ||
1004 | unsigned long arg) | ||
1005 | { | ||
1006 | int ret; | ||
1007 | struct uffdio_range uffdio_wake; | ||
1008 | struct userfaultfd_wake_range range; | ||
1009 | const void __user *buf = (void __user *)arg; | ||
1010 | |||
1011 | ret = -EFAULT; | ||
1012 | if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) | ||
1013 | goto out; | ||
1014 | |||
1015 | ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); | ||
1016 | if (ret) | ||
1017 | goto out; | ||
1018 | |||
1019 | range.start = uffdio_wake.start; | ||
1020 | range.len = uffdio_wake.len; | ||
1021 | |||
1022 | /* | ||
1023 | * len == 0 means wake all and we don't want to wake all here, | ||
1024 | * so check it again to be sure. | ||
1025 | */ | ||
1026 | VM_BUG_ON(!range.len); | ||
1027 | |||
1028 | wake_userfault(ctx, &range); | ||
1029 | ret = 0; | ||
1030 | |||
1031 | out: | ||
1032 | return ret; | ||
1033 | } | ||
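
Pairing the *_MODE_DONTWAKE flags with one trailing UFFDIO_WAKE amortizes the wakeup over many resolved pages. A hedged fragment (base, pool, npages, and page_size are assumed to be set up already):

	/* Fill npages consecutive pages without waking anyone... */
	struct uffdio_copy copy = {
		.dst = base, .src = (unsigned long)pool,
		.len = page_size, .mode = UFFDIO_COPY_MODE_DONTWAKE,
	};
	for (int i = 0; i < npages; i++) {
		ioctl(uffd, UFFDIO_COPY, &copy);
		copy.dst += page_size;
		copy.src += page_size;
	}

	/* ...then wake the whole span with a single batched ioctl. */
	struct uffdio_range span = { .start = base, .len = npages * page_size };
	ioctl(uffd, UFFDIO_WAKE, &span);
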
1034 | |||
1035 | static int userfaultfd_copy(struct userfaultfd_ctx *ctx, | ||
1036 | unsigned long arg) | ||
1037 | { | ||
1038 | __s64 ret; | ||
1039 | struct uffdio_copy uffdio_copy; | ||
1040 | struct uffdio_copy __user *user_uffdio_copy; | ||
1041 | struct userfaultfd_wake_range range; | ||
1042 | |||
1043 | user_uffdio_copy = (struct uffdio_copy __user *) arg; | ||
1044 | |||
1045 | ret = -EFAULT; | ||
1046 | if (copy_from_user(&uffdio_copy, user_uffdio_copy, | ||
1047 | /* don't copy "copy" last field */ | ||
1048 | sizeof(uffdio_copy)-sizeof(__s64))) | ||
1049 | goto out; | ||
1050 | |||
1051 | ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); | ||
1052 | if (ret) | ||
1053 | goto out; | ||
1054 | /* | ||
1055 | * double check for wraparound just in case. copy_from_user() | ||
1056 | * will later check that uffdio_copy.src + uffdio_copy.len fits | ||
1057 | * in the userland range. | ||
1058 | */ | ||
1059 | ret = -EINVAL; | ||
1060 | if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) | ||
1061 | goto out; | ||
1062 | if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) | ||
1063 | goto out; | ||
1064 | |||
1065 | ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, | ||
1066 | uffdio_copy.len); | ||
1067 | if (unlikely(put_user(ret, &user_uffdio_copy->copy))) | ||
1068 | return -EFAULT; | ||
1069 | if (ret < 0) | ||
1070 | goto out; | ||
1071 | BUG_ON(!ret); | ||
1072 | /* len == 0 would wake all */ | ||
1073 | range.len = ret; | ||
1074 | if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { | ||
1075 | range.start = uffdio_copy.dst; | ||
1076 | wake_userfault(ctx, &range); | ||
1077 | } | ||
1078 | ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; | ||
1079 | out: | ||
1080 | return ret; | ||
1081 | } | ||
1082 | |||
1083 | static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, | ||
1084 | unsigned long arg) | ||
1085 | { | ||
1086 | __s64 ret; | ||
1087 | struct uffdio_zeropage uffdio_zeropage; | ||
1088 | struct uffdio_zeropage __user *user_uffdio_zeropage; | ||
1089 | struct userfaultfd_wake_range range; | ||
1090 | |||
1091 | user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; | ||
1092 | |||
1093 | ret = -EFAULT; | ||
1094 | if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, | ||
1095 | /* don't copy "zeropage" last field */ | ||
1096 | sizeof(uffdio_zeropage)-sizeof(__s64))) | ||
1097 | goto out; | ||
1098 | |||
1099 | ret = validate_range(ctx->mm, uffdio_zeropage.range.start, | ||
1100 | uffdio_zeropage.range.len); | ||
1101 | if (ret) | ||
1102 | goto out; | ||
1103 | ret = -EINVAL; | ||
1104 | if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) | ||
1105 | goto out; | ||
1106 | |||
1107 | ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, | ||
1108 | uffdio_zeropage.range.len); | ||
1109 | if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) | ||
1110 | return -EFAULT; | ||
1111 | if (ret < 0) | ||
1112 | goto out; | ||
1113 | /* len == 0 would wake all */ | ||
1114 | BUG_ON(!ret); | ||
1115 | range.len = ret; | ||
1116 | if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { | ||
1117 | range.start = uffdio_zeropage.range.start; | ||
1118 | wake_userfault(ctx, &range); | ||
1119 | } | ||
1120 | ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; | ||
1121 | out: | ||
1122 | return ret; | ||
1123 | } | ||
1124 | |||
1125 | /* | ||
1126 | * userland asks for a certain API version and we return which bits | ||
1127 | * and ioctl commands are implemented in this kernel for such API | ||
1128 | * version or -EINVAL if unknown. | ||
1129 | */ | ||
1130 | static int userfaultfd_api(struct userfaultfd_ctx *ctx, | ||
1131 | unsigned long arg) | ||
1132 | { | ||
1133 | struct uffdio_api uffdio_api; | ||
1134 | void __user *buf = (void __user *)arg; | ||
1135 | int ret; | ||
1136 | |||
1137 | ret = -EINVAL; | ||
1138 | if (ctx->state != UFFD_STATE_WAIT_API) | ||
1139 | goto out; | ||
1140 | ret = -EFAULT; | ||
1141 | if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) | ||
1142 | goto out; | ||
1143 | if (uffdio_api.api != UFFD_API || uffdio_api.features) { | ||
1144 | memset(&uffdio_api, 0, sizeof(uffdio_api)); | ||
1145 | if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) | ||
1146 | goto out; | ||
1147 | ret = -EINVAL; | ||
1148 | goto out; | ||
1149 | } | ||
1150 | uffdio_api.features = UFFD_API_FEATURES; | ||
1151 | uffdio_api.ioctls = UFFD_API_IOCTLS; | ||
1152 | ret = -EFAULT; | ||
1153 | if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) | ||
1154 | goto out; | ||
1155 | ctx->state = UFFD_STATE_RUNNING; | ||
1156 | ret = 0; | ||
1157 | out: | ||
1158 | return ret; | ||
1159 | } | ||
1160 | |||
1161 | static long userfaultfd_ioctl(struct file *file, unsigned cmd, | ||
1162 | unsigned long arg) | ||
1163 | { | ||
1164 | int ret = -EINVAL; | ||
1165 | struct userfaultfd_ctx *ctx = file->private_data; | ||
1166 | |||
1167 | if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) | ||
1168 | return -EINVAL; | ||
1169 | |||
1170 | switch(cmd) { | ||
1171 | case UFFDIO_API: | ||
1172 | ret = userfaultfd_api(ctx, arg); | ||
1173 | break; | ||
1174 | case UFFDIO_REGISTER: | ||
1175 | ret = userfaultfd_register(ctx, arg); | ||
1176 | break; | ||
1177 | case UFFDIO_UNREGISTER: | ||
1178 | ret = userfaultfd_unregister(ctx, arg); | ||
1179 | break; | ||
1180 | case UFFDIO_WAKE: | ||
1181 | ret = userfaultfd_wake(ctx, arg); | ||
1182 | break; | ||
1183 | case UFFDIO_COPY: | ||
1184 | ret = userfaultfd_copy(ctx, arg); | ||
1185 | break; | ||
1186 | case UFFDIO_ZEROPAGE: | ||
1187 | ret = userfaultfd_zeropage(ctx, arg); | ||
1188 | break; | ||
1189 | } | ||
1190 | return ret; | ||
1191 | } | ||
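
The dispatcher enforces the two-phase protocol spelled out above: every command except UFFDIO_API fails with -EINVAL until the API handshake completes. A hedged end-to-end setup fragment, using syscall(2) directly since libc had no userfaultfd wrapper at the time:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <linux/userfaultfd.h>

	static int uffd_setup(void **region, size_t len)
	{
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
		struct uffdio_api api = { .api = UFFD_API, .features = 0 };
		struct uffdio_register reg;

		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
			return -1;		/* the handshake must come first */

		*region = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		reg.range.start = (unsigned long)*region;
		reg.range.len = len;
		/* MODE_WP is still rejected with -EINVAL, per the FIXME above */
		reg.mode = UFFDIO_REGISTER_MODE_MISSING;
		if (ioctl(uffd, UFFDIO_REGISTER, &reg))
			return -1;

		return uffd;	/* reg.ioctls now lists the guaranteed ioctls */
	}
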
1192 | |||
1193 | #ifdef CONFIG_PROC_FS | ||
1194 | static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) | ||
1195 | { | ||
1196 | struct userfaultfd_ctx *ctx = f->private_data; | ||
1197 | wait_queue_t *wq; | ||
1198 | struct userfaultfd_wait_queue *uwq; | ||
1199 | unsigned long pending = 0, total = 0; | ||
1200 | |||
1201 | spin_lock(&ctx->fault_pending_wqh.lock); | ||
1202 | list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { | ||
1203 | uwq = container_of(wq, struct userfaultfd_wait_queue, wq); | ||
1204 | pending++; | ||
1205 | total++; | ||
1206 | } | ||
1207 | list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { | ||
1208 | uwq = container_of(wq, struct userfaultfd_wait_queue, wq); | ||
1209 | total++; | ||
1210 | } | ||
1211 | spin_unlock(&ctx->fault_pending_wqh.lock); | ||
1212 | |||
1213 | /* | ||
1214 | * If more protocols are added, they will all be shown here, | ||
1215 | * separated by a space, like this: | ||
1216 | * protocols: aa:... bb:... | ||
1217 | */ | ||
1218 | seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", | ||
1219 | pending, total, UFFD_API, UFFD_API_FEATURES, | ||
1220 | UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); | ||
1221 | } | ||
1222 | #endif | ||
1223 | |||
1224 | static const struct file_operations userfaultfd_fops = { | ||
1225 | #ifdef CONFIG_PROC_FS | ||
1226 | .show_fdinfo = userfaultfd_show_fdinfo, | ||
1227 | #endif | ||
1228 | .release = userfaultfd_release, | ||
1229 | .poll = userfaultfd_poll, | ||
1230 | .read = userfaultfd_read, | ||
1231 | .unlocked_ioctl = userfaultfd_ioctl, | ||
1232 | .compat_ioctl = userfaultfd_ioctl, | ||
1233 | .llseek = noop_llseek, | ||
1234 | }; | ||
1235 | |||
1236 | static void init_once_userfaultfd_ctx(void *mem) | ||
1237 | { | ||
1238 | struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; | ||
1239 | |||
1240 | init_waitqueue_head(&ctx->fault_pending_wqh); | ||
1241 | init_waitqueue_head(&ctx->fault_wqh); | ||
1242 | init_waitqueue_head(&ctx->fd_wqh); | ||
1243 | seqcount_init(&ctx->refile_seq); | ||
1244 | } | ||
1245 | |||
1246 | /** | ||
1247 | * userfaultfd_file_create - Creates a userfaultfd file pointer. | ||
1248 | * @flags: Flags for the userfaultfd file. | ||
1249 | * | ||
1250 | * This function creates a userfaultfd file pointer without installing | ||
1251 | * it into the fd table. This is useful when the userfaultfd file is | ||
1252 | * used during the initialization of data structures that require | ||
1253 | * extra setup after the userfaultfd creation. So the userfaultfd | ||
1254 | * creation is split into the file pointer creation phase, and the | ||
1255 | * file descriptor installation phase. In this way races with | ||
1256 | * userspace closing the newly installed file descriptor can be | ||
1257 | * avoided. Returns a userfaultfd file pointer, or a proper error | ||
1258 | * pointer. | ||
1259 | */ | ||
1260 | static struct file *userfaultfd_file_create(int flags) | ||
1261 | { | ||
1262 | struct file *file; | ||
1263 | struct userfaultfd_ctx *ctx; | ||
1264 | |||
1265 | BUG_ON(!current->mm); | ||
1266 | |||
1267 | /* Check the UFFD_* constants for consistency. */ | ||
1268 | BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); | ||
1269 | BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); | ||
1270 | |||
1271 | file = ERR_PTR(-EINVAL); | ||
1272 | if (flags & ~UFFD_SHARED_FCNTL_FLAGS) | ||
1273 | goto out; | ||
1274 | |||
1275 | file = ERR_PTR(-ENOMEM); | ||
1276 | ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); | ||
1277 | if (!ctx) | ||
1278 | goto out; | ||
1279 | |||
1280 | atomic_set(&ctx->refcount, 1); | ||
1281 | ctx->flags = flags; | ||
1282 | ctx->state = UFFD_STATE_WAIT_API; | ||
1283 | ctx->released = false; | ||
1284 | ctx->mm = current->mm; | ||
1285 | /* prevent the mm struct from being freed */ | ||
1286 | atomic_inc(&ctx->mm->mm_users); | ||
1287 | |||
1288 | file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, | ||
1289 | O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); | ||
1290 | if (IS_ERR(file)) | ||
1291 | kmem_cache_free(userfaultfd_ctx_cachep, ctx); | ||
1292 | out: | ||
1293 | return file; | ||
1294 | } | ||
1295 | |||
1296 | SYSCALL_DEFINE1(userfaultfd, int, flags) | ||
1297 | { | ||
1298 | int fd, error; | ||
1299 | struct file *file; | ||
1300 | |||
1301 | error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); | ||
1302 | if (error < 0) | ||
1303 | return error; | ||
1304 | fd = error; | ||
1305 | |||
1306 | file = userfaultfd_file_create(flags); | ||
1307 | if (IS_ERR(file)) { | ||
1308 | error = PTR_ERR(file); | ||
1309 | goto err_put_unused_fd; | ||
1310 | } | ||
1311 | fd_install(fd, file); | ||
1312 | |||
1313 | return fd; | ||
1314 | |||
1315 | err_put_unused_fd: | ||
1316 | put_unused_fd(fd); | ||
1317 | |||
1318 | return error; | ||
1319 | } | ||
1320 | |||
1321 | static int __init userfaultfd_init(void) | ||
1322 | { | ||
1323 | userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", | ||
1324 | sizeof(struct userfaultfd_ctx), | ||
1325 | 0, | ||
1326 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, | ||
1327 | init_once_userfaultfd_ctx); | ||
1328 | return 0; | ||
1329 | } | ||
1330 | __initcall(userfaultfd_init); | ||
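The UFFDIO_API handshake implemented by userfaultfd_api() above is mandatory: until the context leaves UFFD_STATE_WAIT_API, userfaultfd_ioctl() rejects every other command with -EINVAL. A minimal userspace sketch of creating the descriptor and completing the handshake, assuming a libc that exposes __NR_userfaultfd (error handling trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		perror("userfaultfd");
		return 1;
	}

	/* The fd starts in UFFD_STATE_WAIT_API; requesting features the
	 * kernel does not know zeroes the struct and fails with EINVAL. */
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
	if (ioctl(uffd, UFFDIO_API, &api) < 0) {
		perror("UFFDIO_API");
		return 1;
	}
	printf("features=%llx ioctls=%llx\n",
	       (unsigned long long)api.features,
	       (unsigned long long)api.ioctls);
	close(uffd);
	return 0;
}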
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1fb16562c159..bbd9b1f10ffb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c | |||
@@ -511,9 +511,9 @@ xfs_showargs( | |||
511 | seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); | 511 | seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); |
512 | 512 | ||
513 | if (mp->m_logname) | 513 | if (mp->m_logname) |
514 | seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); | 514 | seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); |
515 | if (mp->m_rtname) | 515 | if (mp->m_rtname) |
516 | seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); | 516 | seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); |
517 | 517 | ||
518 | if (mp->m_dalign > 0) | 518 | if (mp->m_dalign > 0) |
519 | seq_printf(m, "," MNTOPT_SUNIT "=%d", | 519 | seq_printf(m, "," MNTOPT_SUNIT "=%d", |
diff --git a/include/linux/cred.h b/include/linux/cred.h index 8b6c083e68a7..8d70e1361ecd 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h | |||
@@ -137,6 +137,7 @@ struct cred { | |||
137 | kernel_cap_t cap_permitted; /* caps we're permitted */ | 137 | kernel_cap_t cap_permitted; /* caps we're permitted */ |
138 | kernel_cap_t cap_effective; /* caps we can actually use */ | 138 | kernel_cap_t cap_effective; /* caps we can actually use */ |
139 | kernel_cap_t cap_bset; /* capability bounding set */ | 139 | kernel_cap_t cap_bset; /* capability bounding set */ |
140 | kernel_cap_t cap_ambient; /* Ambient capability set */ | ||
140 | #ifdef CONFIG_KEYS | 141 | #ifdef CONFIG_KEYS |
141 | unsigned char jit_keyring; /* default keyring to attach requested | 142 | unsigned char jit_keyring; /* default keyring to attach requested |
142 | * keys to */ | 143 | * keys to */ |
@@ -212,6 +213,13 @@ static inline void validate_process_creds(void) | |||
212 | } | 213 | } |
213 | #endif | 214 | #endif |
214 | 215 | ||
216 | static inline bool cap_ambient_invariant_ok(const struct cred *cred) | ||
217 | { | ||
218 | return cap_issubset(cred->cap_ambient, | ||
219 | cap_intersect(cred->cap_permitted, | ||
220 | cred->cap_inheritable)); | ||
221 | } | ||
222 | |||
215 | /** | 223 | /** |
216 | * get_new_cred - Get a reference on a new set of credentials | 224 | * get_new_cred - Get a reference on a new set of credentials |
217 | * @cred: The new credentials to reference | 225 | * @cred: The new credentials to reference |
diff --git a/include/linux/fs.h b/include/linux/fs.h index fbd780c33c5f..864203c10dbc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -1612,7 +1612,6 @@ struct file_operations { | |||
1612 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); | 1612 | long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); |
1613 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); | 1613 | long (*compat_ioctl) (struct file *, unsigned int, unsigned long); |
1614 | int (*mmap) (struct file *, struct vm_area_struct *); | 1614 | int (*mmap) (struct file *, struct vm_area_struct *); |
1615 | int (*mremap)(struct file *, struct vm_area_struct *); | ||
1616 | int (*open) (struct inode *, struct file *); | 1615 | int (*open) (struct inode *, struct file *); |
1617 | int (*flush) (struct file *, fl_owner_t id); | 1616 | int (*flush) (struct file *, fl_owner_t id); |
1618 | int (*release) (struct inode *, struct file *); | 1617 | int (*release) (struct inode *, struct file *); |
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 65a517dd32f7..e0727d77feaf 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h | |||
@@ -195,40 +195,49 @@ struct fsnotify_group { | |||
195 | #define FSNOTIFY_EVENT_INODE 2 | 195 | #define FSNOTIFY_EVENT_INODE 2 |
196 | 196 | ||
197 | /* | 197 | /* |
198 | * a mark is simply an object attached to an in core inode which allows an | 198 | * A mark is simply an object attached to an in core inode which allows an |
199 | * fsnotify listener to indicate they are either no longer interested in events | 199 | * fsnotify listener to indicate they are either no longer interested in events |
200 | * of a type matching mask or only interested in those events. | 200 | * of a type matching mask or only interested in those events. |
201 | * | 201 | * |
202 | * these are flushed when an inode is evicted from core and may be flushed | 202 | * These are flushed when an inode is evicted from core and may be flushed |
203 | * when the inode is modified (as seen by fsnotify_access). Some fsnotify users | 203 | * when the inode is modified (as seen by fsnotify_access). Some fsnotify |
204 | * (such as dnotify) will flush these when the open fd is closed and not at | 204 | * users (such as dnotify) will flush these when the open fd is closed and not |
205 | * inode eviction or modification. | 205 | * at inode eviction or modification. |
206 | * | ||
207 | * Text in brackets is showing the lock(s) protecting modifications of a | ||
208 | * particular entry. obj_lock means either inode->i_lock or | ||
209 | * mnt->mnt_root->d_lock depending on the mark type. | ||
206 | */ | 210 | */ |
207 | struct fsnotify_mark { | 211 | struct fsnotify_mark { |
208 | __u32 mask; /* mask this mark is for */ | 212 | /* Mask this mark is for [mark->lock, group->mark_mutex] */ |
209 | /* we hold ref for each i_list and g_list. also one ref for each 'thing' | 213 | __u32 mask; |
214 | /* We hold one for presence in g_list. Also one ref for each 'thing' | ||
210 | * in kernel that found and may be using this mark. */ | 215 | * in kernel that found and may be using this mark. */ |
211 | atomic_t refcnt; /* active things looking at this mark */ | 216 | atomic_t refcnt; |
212 | struct fsnotify_group *group; /* group this mark is for */ | 217 | /* Group this mark is for. Set on mark creation, stable until last ref |
213 | struct list_head g_list; /* list of marks by group->i_fsnotify_marks | 218 | * is dropped */ |
214 | * Also reused for queueing mark into | 219 | struct fsnotify_group *group; |
215 | * destroy_list when it's waiting for | 220 | /* List of marks by group->i_fsnotify_marks. Also reused for queueing |
216 | * the end of SRCU period before it can | 221 | * mark into destroy_list when it's waiting for the end of SRCU period |
217 | * be freed */ | 222 | * before it can be freed. [group->mark_mutex] */ |
218 | spinlock_t lock; /* protect group and inode */ | 223 | struct list_head g_list; |
219 | struct hlist_node obj_list; /* list of marks for inode / vfsmount */ | 224 | /* Protects inode / mnt pointers, flags, masks */ |
220 | struct list_head free_list; /* tmp list used when freeing this mark */ | 225 | spinlock_t lock; |
221 | union { | 226 | /* List of marks for inode / vfsmount [obj_lock] */ |
227 | struct hlist_node obj_list; | ||
228 | union { /* Object pointer [mark->lock, group->mark_mutex] */ | ||
222 | struct inode *inode; /* inode this mark is associated with */ | 229 | struct inode *inode; /* inode this mark is associated with */ |
223 | struct vfsmount *mnt; /* vfsmount this mark is associated with */ | 230 | struct vfsmount *mnt; /* vfsmount this mark is associated with */ |
224 | }; | 231 | }; |
225 | __u32 ignored_mask; /* events types to ignore */ | 232 | /* Events types to ignore [mark->lock, group->mark_mutex] */ |
233 | __u32 ignored_mask; | ||
226 | #define FSNOTIFY_MARK_FLAG_INODE 0x01 | 234 | #define FSNOTIFY_MARK_FLAG_INODE 0x01 |
227 | #define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 | 235 | #define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 |
228 | #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 | 236 | #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 |
229 | #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 | 237 | #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 |
230 | #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 | 238 | #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 |
231 | unsigned int flags; /* vfsmount or inode mark? */ | 239 | #define FSNOTIFY_MARK_FLAG_ATTACHED 0x20 |
240 | unsigned int flags; /* flags [mark->lock] */ | ||
232 | void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ | 241 | void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ |
233 | }; | 242 | }; |
234 | 243 | ||
@@ -345,8 +354,10 @@ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_ | |||
345 | /* given a group and a mark, flag mark to be freed when all references are dropped */ | 354 | /* given a group and a mark, flag mark to be freed when all references are dropped */ |
346 | extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, | 355 | extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, |
347 | struct fsnotify_group *group); | 356 | struct fsnotify_group *group); |
348 | extern void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, | 357 | /* detach mark from inode / mount list, group list, drop inode reference */ |
349 | struct fsnotify_group *group); | 358 | extern void fsnotify_detach_mark(struct fsnotify_mark *mark); |
359 | /* free mark */ | ||
360 | extern void fsnotify_free_mark(struct fsnotify_mark *mark); | ||
350 | /* run all the marks in a group, and clear all of the vfsmount marks */ | 361 | /* run all the marks in a group, and clear all of the vfsmount marks */ |
351 | extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); | 362 | extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); |
352 | /* run all the marks in a group, and clear all of the inode marks */ | 363 | /* run all the marks in a group, and clear all of the inode marks */ |
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 5383bb1394a1..7ff168d06967 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h | |||
@@ -59,6 +59,8 @@ struct gen_pool { | |||
59 | 59 | ||
60 | genpool_algo_t algo; /* allocation function */ | 60 | genpool_algo_t algo; /* allocation function */ |
61 | void *data; | 61 | void *data; |
62 | |||
63 | const char *name; | ||
62 | }; | 64 | }; |
63 | 65 | ||
64 | /* | 66 | /* |
@@ -118,8 +120,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, | |||
118 | unsigned long start, unsigned int nr, void *data); | 120 | unsigned long start, unsigned int nr, void *data); |
119 | 121 | ||
120 | extern struct gen_pool *devm_gen_pool_create(struct device *dev, | 122 | extern struct gen_pool *devm_gen_pool_create(struct device *dev, |
121 | int min_alloc_order, int nid); | 123 | int min_alloc_order, int nid, const char *name); |
122 | extern struct gen_pool *gen_pool_get(struct device *dev); | 124 | extern struct gen_pool *gen_pool_get(struct device *dev, const char *name); |
123 | 125 | ||
124 | bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, | 126 | bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, |
125 | size_t size); | 127 | size_t size); |
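With the new name argument a single struct device can own several gen_pools, and consumers select one by name instead of fetching the device's only pool. A hedged sketch of a driver using the changed signatures; the pool name "sram", the allocation order, and the mapping parameters are invented, and the NULL-on-failure convention of devm_gen_pool_create() is assumed for this version of the API:

#include <linux/device.h>
#include <linux/genalloc.h>

static int example_setup_pool(struct device *dev, void __iomem *sram,
			      phys_addr_t phys, size_t size)
{
	struct gen_pool *pool;
	int ret;

	/* min_alloc_order 5 => 32-byte granules; nid -1 => no NUMA node */
	pool = devm_gen_pool_create(dev, 5, -1, "sram");
	if (!pool)
		return -ENOMEM;

	ret = gen_pool_add_virt(pool, (unsigned long)sram, phys, size, -1);
	if (ret)
		return ret;

	/* Any later code holding the device can find this pool by name. */
	return gen_pool_get(dev, "sram") ? 0 : -ENODEV;
}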
diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 869b21dcf503..e691b6a23f72 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h | |||
@@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
11 | const char namefmt[], ...); | 11 | const char namefmt[], ...); |
12 | 12 | ||
13 | #define kthread_create(threadfn, data, namefmt, arg...) \ | 13 | #define kthread_create(threadfn, data, namefmt, arg...) \ |
14 | kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) | 14 | kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg) |
15 | 15 | ||
16 | 16 | ||
17 | struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | 17 | struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), |
diff --git a/include/linux/mm.h b/include/linux/mm.h index bf6f117fcf4d..8b257c43855b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp); | |||
124 | #define VM_MAYSHARE 0x00000080 | 124 | #define VM_MAYSHARE 0x00000080 |
125 | 125 | ||
126 | #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ | 126 | #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ |
127 | #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ | ||
127 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ | 128 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ |
128 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ | 129 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ |
130 | #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ | ||
129 | 131 | ||
130 | #define VM_LOCKED 0x00002000 | 132 | #define VM_LOCKED 0x00002000 |
131 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ | 133 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ |
@@ -245,6 +247,7 @@ struct vm_fault { | |||
245 | struct vm_operations_struct { | 247 | struct vm_operations_struct { |
246 | void (*open)(struct vm_area_struct * area); | 248 | void (*open)(struct vm_area_struct * area); |
247 | void (*close)(struct vm_area_struct * area); | 249 | void (*close)(struct vm_area_struct * area); |
250 | int (*mremap)(struct vm_area_struct * area); | ||
248 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); | 251 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); |
249 | void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); | 252 | void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); |
250 | 253 | ||
@@ -1833,7 +1836,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
1833 | extern struct vm_area_struct *vma_merge(struct mm_struct *, | 1836 | extern struct vm_area_struct *vma_merge(struct mm_struct *, |
1834 | struct vm_area_struct *prev, unsigned long addr, unsigned long end, | 1837 | struct vm_area_struct *prev, unsigned long addr, unsigned long end, |
1835 | unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, | 1838 | unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, |
1836 | struct mempolicy *); | 1839 | struct mempolicy *, struct vm_userfaultfd_ctx); |
1837 | extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); | 1840 | extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); |
1838 | extern int split_vma(struct mm_struct *, | 1841 | extern int split_vma(struct mm_struct *, |
1839 | struct vm_area_struct *, unsigned long addr, int new_below); | 1842 | struct vm_area_struct *, unsigned long addr, int new_below); |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 15549578d559..c8d0a73d64c4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -256,6 +256,16 @@ struct vm_region { | |||
256 | * this region */ | 256 | * this region */ |
257 | }; | 257 | }; |
258 | 258 | ||
259 | #ifdef CONFIG_USERFAULTFD | ||
260 | #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) | ||
261 | struct vm_userfaultfd_ctx { | ||
262 | struct userfaultfd_ctx *ctx; | ||
263 | }; | ||
264 | #else /* CONFIG_USERFAULTFD */ | ||
265 | #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) | ||
266 | struct vm_userfaultfd_ctx {}; | ||
267 | #endif /* CONFIG_USERFAULTFD */ | ||
268 | |||
259 | /* | 269 | /* |
260 | * This struct defines a memory VMM memory area. There is one of these | 270 | * This struct defines a memory VMM memory area. There is one of these |
261 | * per VM-area/task. A VM area is any part of the process virtual memory | 271 | * per VM-area/task. A VM area is any part of the process virtual memory |
@@ -322,6 +332,7 @@ struct vm_area_struct { | |||
322 | #ifdef CONFIG_NUMA | 332 | #ifdef CONFIG_NUMA |
323 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ | 333 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ |
324 | #endif | 334 | #endif |
335 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx; | ||
325 | }; | 336 | }; |
326 | 337 | ||
327 | struct core_thread { | 338 | struct core_thread { |
@@ -543,6 +554,7 @@ enum tlb_flush_reason { | |||
543 | TLB_REMOTE_SHOOTDOWN, | 554 | TLB_REMOTE_SHOOTDOWN, |
544 | TLB_LOCAL_SHOOTDOWN, | 555 | TLB_LOCAL_SHOOTDOWN, |
545 | TLB_LOCAL_MM_SHOOTDOWN, | 556 | TLB_LOCAL_MM_SHOOTDOWN, |
557 | TLB_REMOTE_SEND_IPI, | ||
546 | NR_TLB_FLUSH_REASONS, | 558 | NR_TLB_FLUSH_REASONS, |
547 | }; | 559 | }; |
548 | 560 | ||
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 754c25966a0a..ac00e2050943 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -690,14 +690,6 @@ struct zonelist { | |||
690 | #endif | 690 | #endif |
691 | }; | 691 | }; |
692 | 692 | ||
693 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
694 | struct node_active_region { | ||
695 | unsigned long start_pfn; | ||
696 | unsigned long end_pfn; | ||
697 | int nid; | ||
698 | }; | ||
699 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
700 | |||
701 | #ifndef CONFIG_DISCONTIGMEM | 693 | #ifndef CONFIG_DISCONTIGMEM |
702 | /* The array of struct pages - for discontigmem use pgdat->lmem_map */ | 694 | /* The array of struct pages - for discontigmem use pgdat->lmem_map */ |
703 | extern struct page *mem_map; | 695 | extern struct page *mem_map; |
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index f94da0e65dea..a91adf6e02f2 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h | |||
@@ -27,9 +27,7 @@ static inline void touch_nmi_watchdog(void) | |||
27 | #if defined(CONFIG_HARDLOCKUP_DETECTOR) | 27 | #if defined(CONFIG_HARDLOCKUP_DETECTOR) |
28 | extern void hardlockup_detector_disable(void); | 28 | extern void hardlockup_detector_disable(void); |
29 | #else | 29 | #else |
30 | static inline void hardlockup_detector_disable(void) | 30 | static inline void hardlockup_detector_disable(void) {} |
31 | { | ||
32 | } | ||
33 | #endif | 31 | #endif |
34 | 32 | ||
35 | /* | 33 | /* |
@@ -80,6 +78,17 @@ extern int proc_watchdog_thresh(struct ctl_table *, int , | |||
80 | void __user *, size_t *, loff_t *); | 78 | void __user *, size_t *, loff_t *); |
81 | extern int proc_watchdog_cpumask(struct ctl_table *, int, | 79 | extern int proc_watchdog_cpumask(struct ctl_table *, int, |
82 | void __user *, size_t *, loff_t *); | 80 | void __user *, size_t *, loff_t *); |
81 | extern int lockup_detector_suspend(void); | ||
82 | extern void lockup_detector_resume(void); | ||
83 | #else | ||
84 | static inline int lockup_detector_suspend(void) | ||
85 | { | ||
86 | return 0; | ||
87 | } | ||
88 | |||
89 | static inline void lockup_detector_resume(void) | ||
90 | { | ||
91 | } | ||
83 | #endif | 92 | #endif |
84 | 93 | ||
85 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI | 94 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI |
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c89c53a113a8..29446aeef36e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -89,6 +89,9 @@ enum ttu_flags { | |||
89 | TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ | 89 | TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ |
90 | TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ | 90 | TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ |
91 | TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ | 91 | TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ |
92 | TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible | ||
93 | * and caller guarantees they will | ||
94 | * do a final flush if necessary */ | ||
92 | }; | 95 | }; |
93 | 96 | ||
94 | #ifdef CONFIG_MMU | 97 | #ifdef CONFIG_MMU |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 119823decc46..a4ab9daa387c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1344,6 +1344,25 @@ enum perf_event_task_context { | |||
1344 | perf_nr_task_contexts, | 1344 | perf_nr_task_contexts, |
1345 | }; | 1345 | }; |
1346 | 1346 | ||
1347 | /* Track pages that require TLB flushes */ | ||
1348 | struct tlbflush_unmap_batch { | ||
1349 | /* | ||
1350 | * Each bit set is a CPU that potentially has a TLB entry for one of | ||
1351 | * the PFNs being flushed. See set_tlb_ubc_flush_pending(). | ||
1352 | */ | ||
1353 | struct cpumask cpumask; | ||
1354 | |||
1355 | /* True if any bit in cpumask is set */ | ||
1356 | bool flush_required; | ||
1357 | |||
1358 | /* | ||
1359 | * If true then the PTE was dirty when unmapped. The entry must be | ||
1360 | * flushed before IO is initiated or a stale TLB entry potentially | ||
1361 | * allows an update without redirtying the page. | ||
1362 | */ | ||
1363 | bool writable; | ||
1364 | }; | ||
1365 | |||
1347 | struct task_struct { | 1366 | struct task_struct { |
1348 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ | 1367 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ |
1349 | void *stack; | 1368 | void *stack; |
@@ -1700,6 +1719,10 @@ struct task_struct { | |||
1700 | unsigned long numa_pages_migrated; | 1719 | unsigned long numa_pages_migrated; |
1701 | #endif /* CONFIG_NUMA_BALANCING */ | 1720 | #endif /* CONFIG_NUMA_BALANCING */ |
1702 | 1721 | ||
1722 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH | ||
1723 | struct tlbflush_unmap_batch tlb_ubc; | ||
1724 | #endif | ||
1725 | |||
1703 | struct rcu_head rcu; | 1726 | struct rcu_head rcu; |
1704 | 1727 | ||
1705 | /* | 1728 | /* |
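struct tlbflush_unmap_batch is filled while pages are being unmapped and drained later with one flush covering every CPU set in the mask. A hedged sketch modeled on set_tlb_ubc_flush_pending() from this series' mm/rmap.c changes, not a verbatim copy:

#include <linux/mm.h>
#include <linux/sched.h>

static void example_defer_tlb_flush(struct mm_struct *mm, bool writable)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	/* Record every CPU that may hold a translation for this mm. */
	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
	tlb_ubc->flush_required = true;

	/* A dirty PTE must be flushed before IO starts, or a stale entry
	 * could let a CPU write without redirtying the page. */
	if (writable)
		tlb_ubc->writable = true;
}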
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 912a7c482649..d4c7271382cb 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h | |||
@@ -149,6 +149,41 @@ static inline struct user_namespace *seq_user_ns(struct seq_file *seq) | |||
149 | #endif | 149 | #endif |
150 | } | 150 | } |
151 | 151 | ||
152 | /** | ||
153 | * seq_show_option - display mount options with appropriate escapes. | ||
154 | * @m: the seq_file handle | ||
155 | * @name: the mount option name | ||
156 | * @value: the mount option name's value, can be NULL | ||
157 | */ | ||
158 | static inline void seq_show_option(struct seq_file *m, const char *name, | ||
159 | const char *value) | ||
160 | { | ||
161 | seq_putc(m, ','); | ||
162 | seq_escape(m, name, ",= \t\n\\"); | ||
163 | if (value) { | ||
164 | seq_putc(m, '='); | ||
165 | seq_escape(m, value, ", \t\n\\"); | ||
166 | } | ||
167 | } | ||
168 | |||
169 | /** | ||
170 | * seq_show_option_n - display mount options with appropriate escapes | ||
171 | * where @value must be a specific length. | ||
172 | * @m: the seq_file handle | ||
173 | * @name: the mount option name | ||
174 | * @value: the mount option's value, which cannot be NULL | ||
175 | * @length: the length of @value to display | ||
176 | * | ||
177 | * This is a macro since this uses "length" to define the size of the | ||
178 | * stack buffer. | ||
179 | */ | ||
180 | #define seq_show_option_n(m, name, value, length) { \ | ||
181 | char val_buf[length + 1]; \ | ||
182 | strncpy(val_buf, value, length); \ | ||
183 | val_buf[length] = '\0'; \ | ||
184 | seq_show_option(m, name, val_buf); \ | ||
185 | } | ||
186 | |||
152 | #define SEQ_START_TOKEN ((void *)1) | 187 | #define SEQ_START_TOKEN ((void *)1) |
153 | /* | 188 | /* |
154 | * Helpers for iteration over list_head-s in seq_files | 189 | * Helpers for iteration over list_head-s in seq_files |
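seq_escape() replaces each character in the escape set with its octal escape, so a value such as "log,dev" comes out as "log\054dev" rather than corrupting the comma-separated option list. A hedged sketch of a show_options handler converted to the helper; the names are invented:

#include <linux/seq_file.h>

static int example_show_options(struct seq_file *m, struct dentry *root)
{
	/* A value containing ',' would corrupt the option list if it were
	 * printed with a raw seq_printf(). */
	const char *logdev = "log,dev";

	/* Before: seq_printf(m, ",logdev=%s", logdev);
	 * after the conversion the ',' is emitted as \054. */
	seq_show_option(m, "logdev", logdev);
	return 0;
}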
diff --git a/include/linux/slab.h b/include/linux/slab.h index a99f0e5243e1..7e37d448ed91 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags); | |||
290 | void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); | 290 | void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); |
291 | void kmem_cache_free(struct kmem_cache *, void *); | 291 | void kmem_cache_free(struct kmem_cache *, void *); |
292 | 292 | ||
293 | /* | ||
294 | * Bulk allocation and freeing operations. These are accelerated in an | ||
295 | * allocator-specific way to avoid taking locks repeatedly or building | ||
296 | * metadata structures unnecessarily. | ||
297 | * | ||
298 | * Note that interrupts must be enabled when calling these functions. | ||
299 | */ | ||
300 | void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); | ||
301 | bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); | ||
302 | |||
293 | #ifdef CONFIG_NUMA | 303 | #ifdef CONFIG_NUMA |
294 | void *__kmalloc_node(size_t size, gfp_t flags, int node); | 304 | void *__kmalloc_node(size_t size, gfp_t flags, int node); |
295 | void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); | 305 | void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); |
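The bulk calls let an allocator take its locks once for a whole array of objects instead of once per object. A short sketch of the intended usage; the cache and batch size are illustrative:

#include <linux/slab.h>

#define EXAMPLE_BATCH 16

static int example_bulk(struct kmem_cache *cache)
{
	void *objs[EXAMPLE_BATCH];

	/* Interrupts must be enabled here, per the comment above. */
	if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, EXAMPLE_BATCH, objs))
		return -ENOMEM;

	/* ... initialize and use the objects ... */

	kmem_cache_free_bulk(cache, EXAMPLE_BATCH, objs);
	return 0;
}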
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index da3c593f9845..e6109a6cd8f6 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h | |||
@@ -48,7 +48,16 @@ struct smp_hotplug_thread { | |||
48 | const char *thread_comm; | 48 | const char *thread_comm; |
49 | }; | 49 | }; |
50 | 50 | ||
51 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); | 51 | int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, |
52 | const struct cpumask *cpumask); | ||
53 | |||
54 | static inline int | ||
55 | smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | ||
56 | { | ||
57 | return smpboot_register_percpu_thread_cpumask(plug_thread, | ||
58 | cpu_possible_mask); | ||
59 | } | ||
60 | |||
52 | void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); | 61 | void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); |
53 | int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, | 62 | int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, |
54 | const struct cpumask *); | 63 | const struct cpumask *); |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b45c45b8c829..08001317aee7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); | |||
810 | asmlinkage long sys_eventfd(unsigned int count); | 810 | asmlinkage long sys_eventfd(unsigned int count); |
811 | asmlinkage long sys_eventfd2(unsigned int count, int flags); | 811 | asmlinkage long sys_eventfd2(unsigned int count, int flags); |
812 | asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); | 812 | asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); |
813 | asmlinkage long sys_userfaultfd(int flags); | ||
813 | asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); | 814 | asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); |
814 | asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); | 815 | asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); |
815 | asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, | 816 | asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, |
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h new file mode 100644 index 000000000000..587480ad41b7 --- /dev/null +++ b/include/linux/userfaultfd_k.h | |||
@@ -0,0 +1,85 @@ | |||
1 | /* | ||
2 | * include/linux/userfaultfd_k.h | ||
3 | * | ||
4 | * Copyright (C) 2015 Red Hat, Inc. | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | #ifndef _LINUX_USERFAULTFD_K_H | ||
9 | #define _LINUX_USERFAULTFD_K_H | ||
10 | |||
11 | #ifdef CONFIG_USERFAULTFD | ||
12 | |||
13 | #include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */ | ||
14 | |||
15 | #include <linux/fcntl.h> | ||
16 | |||
17 | /* | ||
18 | * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining | ||
19 | * new flags, since they might collide with O_* ones. We want | ||
20 | * to re-use O_* flags that couldn't possibly have a meaning | ||
21 | * for userfaultfd, in order to leave a free define-space for | ||
22 | * shared O_* flags. | ||
23 | */ | ||
24 | #define UFFD_CLOEXEC O_CLOEXEC | ||
25 | #define UFFD_NONBLOCK O_NONBLOCK | ||
26 | |||
27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) | ||
28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) | ||
29 | |||
30 | extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, | ||
31 | unsigned int flags, unsigned long reason); | ||
32 | |||
33 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, | ||
34 | unsigned long src_start, unsigned long len); | ||
35 | extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, | ||
36 | unsigned long dst_start, | ||
37 | unsigned long len); | ||
38 | |||
39 | /* mm helpers */ | ||
40 | static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, | ||
41 | struct vm_userfaultfd_ctx vm_ctx) | ||
42 | { | ||
43 | return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; | ||
44 | } | ||
45 | |||
46 | static inline bool userfaultfd_missing(struct vm_area_struct *vma) | ||
47 | { | ||
48 | return vma->vm_flags & VM_UFFD_MISSING; | ||
49 | } | ||
50 | |||
51 | static inline bool userfaultfd_armed(struct vm_area_struct *vma) | ||
52 | { | ||
53 | return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); | ||
54 | } | ||
55 | |||
56 | #else /* CONFIG_USERFAULTFD */ | ||
57 | |||
58 | /* mm helpers */ | ||
59 | static inline int handle_userfault(struct vm_area_struct *vma, | ||
60 | unsigned long address, | ||
61 | unsigned int flags, | ||
62 | unsigned long reason) | ||
63 | { | ||
64 | return VM_FAULT_SIGBUS; | ||
65 | } | ||
66 | |||
67 | static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, | ||
68 | struct vm_userfaultfd_ctx vm_ctx) | ||
69 | { | ||
70 | return true; | ||
71 | } | ||
72 | |||
73 | static inline bool userfaultfd_missing(struct vm_area_struct *vma) | ||
74 | { | ||
75 | return false; | ||
76 | } | ||
77 | |||
78 | static inline bool userfaultfd_armed(struct vm_area_struct *vma) | ||
79 | { | ||
80 | return false; | ||
81 | } | ||
82 | |||
83 | #endif /* CONFIG_USERFAULTFD */ | ||
84 | |||
85 | #endif /* _LINUX_USERFAULTFD_K_H */ | ||
diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..d3d077228d4c 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) | |||
147 | 147 | ||
148 | typedef int wait_bit_action_f(struct wait_bit_key *); | 148 | typedef int wait_bit_action_f(struct wait_bit_key *); |
149 | void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); | 149 | void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); |
150 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); | 150 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, |
151 | void *key); | ||
151 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); | 152 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); |
152 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); | 153 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); |
153 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); | 154 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); |
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); | |||
179 | #define wake_up_poll(x, m) \ | 180 | #define wake_up_poll(x, m) \ |
180 | __wake_up(x, TASK_NORMAL, 1, (void *) (m)) | 181 | __wake_up(x, TASK_NORMAL, 1, (void *) (m)) |
181 | #define wake_up_locked_poll(x, m) \ | 182 | #define wake_up_locked_poll(x, m) \ |
182 | __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) | 183 | __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m)) |
183 | #define wake_up_interruptible_poll(x, m) \ | 184 | #define wake_up_interruptible_poll(x, m) \ |
184 | __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) | 185 | __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) |
185 | #define wake_up_interruptible_sync_poll(x, m) \ | 186 | #define wake_up_interruptible_sync_poll(x, m) \ |
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index f47feada5b42..d74a0e907b9e 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h | |||
@@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, | |||
140 | extern int watchdog_register_device(struct watchdog_device *); | 140 | extern int watchdog_register_device(struct watchdog_device *); |
141 | extern void watchdog_unregister_device(struct watchdog_device *); | 141 | extern void watchdog_unregister_device(struct watchdog_device *); |
142 | 142 | ||
143 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
144 | void watchdog_nmi_disable_all(void); | ||
145 | void watchdog_nmi_enable_all(void); | ||
146 | #else | ||
147 | static inline void watchdog_nmi_disable_all(void) {} | ||
148 | static inline void watchdog_nmi_enable_all(void) {} | ||
149 | #endif | ||
150 | |||
151 | #endif /* ifndef _LINUX_WATCHDOG_H */ | 143 | #endif /* ifndef _LINUX_WATCHDOG_H */ |
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h index 4250f364a6ca..bc8815f45f3b 100644 --- a/include/trace/events/tlb.h +++ b/include/trace/events/tlb.h | |||
@@ -11,7 +11,8 @@ | |||
11 | EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ | 11 | EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ |
12 | EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ | 12 | EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ |
13 | EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ | 13 | EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ |
14 | EMe( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) | 14 | EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ |
15 | EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) | ||
15 | 16 | ||
16 | /* | 17 | /* |
17 | * First define the enums in TLB_FLUSH_REASON to be exported to userspace | 18 | * First define the enums in TLB_FLUSH_REASON to be exported to userspace |
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index aafb9937b162..70ff1d9abf0d 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild | |||
@@ -456,3 +456,4 @@ header-y += xfrm.h | |||
456 | header-y += xilinx-v4l2-controls.h | 456 | header-y += xilinx-v4l2-controls.h |
457 | header-y += zorro.h | 457 | header-y += zorro.h |
458 | header-y += zorro_ids.h | 458 | header-y += zorro_ids.h |
459 | header-y += userfaultfd.h | ||
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 31891d9535e2..a8d0759a9e40 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h | |||
@@ -190,4 +190,11 @@ struct prctl_mm_map { | |||
190 | # define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ | 190 | # define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ |
191 | # define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ | 191 | # define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ |
192 | 192 | ||
193 | /* Control the ambient capability set */ | ||
194 | #define PR_CAP_AMBIENT 47 | ||
195 | # define PR_CAP_AMBIENT_IS_SET 1 | ||
196 | # define PR_CAP_AMBIENT_RAISE 2 | ||
197 | # define PR_CAP_AMBIENT_LOWER 3 | ||
198 | # define PR_CAP_AMBIENT_CLEAR_ALL 4 | ||
199 | |||
193 | #endif /* _LINUX_PRCTL_H */ | 200 | #endif /* _LINUX_PRCTL_H */ |
diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h index 985aac9e6bf8..35ac35cef217 100644 --- a/include/uapi/linux/securebits.h +++ b/include/uapi/linux/securebits.h | |||
@@ -43,9 +43,18 @@ | |||
43 | #define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS)) | 43 | #define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS)) |
44 | #define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED)) | 44 | #define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED)) |
45 | 45 | ||
46 | /* When set, a process cannot add new capabilities to its ambient set. */ | ||
47 | #define SECURE_NO_CAP_AMBIENT_RAISE 6 | ||
48 | #define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */ | ||
49 | |||
50 | #define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) | ||
51 | #define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \ | ||
52 | (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED)) | ||
53 | |||
46 | #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ | 54 | #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ |
47 | issecure_mask(SECURE_NO_SETUID_FIXUP) | \ | 55 | issecure_mask(SECURE_NO_SETUID_FIXUP) | \ |
48 | issecure_mask(SECURE_KEEP_CAPS)) | 56 | issecure_mask(SECURE_KEEP_CAPS) | \ |
57 | issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) | ||
49 | #define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) | 58 | #define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) |
50 | 59 | ||
51 | #endif /* _UAPI_LINUX_SECUREBITS_H */ | 60 | #endif /* _UAPI_LINUX_SECUREBITS_H */ |
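A hedged userspace sketch of the new prctl interface: raising a capability into the ambient set so it survives execve() of an unprivileged binary. This only succeeds if the capability is already in both the permitted and inheritable sets (the cap_ambient_invariant_ok() rule) and SECBIT_NO_CAP_AMBIENT_RAISE is not set:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/capability.h>
#include <linux/prctl.h>

int main(void)
{
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_RAW, 0, 0) < 0) {
		/* EPERM here if CAP_NET_RAW is missing from pP or pI, or
		 * if SECBIT_NO_CAP_AMBIENT_RAISE has been set (possibly
		 * locked) for this process tree. */
		perror("PR_CAP_AMBIENT_RAISE");
		return 1;
	}
	printf("ambient CAP_NET_RAW set: %ld\n",
	       (long)prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET,
			   CAP_NET_RAW, 0, 0));
	return 0;
}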
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h new file mode 100644 index 000000000000..df0e09bb7dd5 --- /dev/null +++ b/include/uapi/linux/userfaultfd.h | |||
@@ -0,0 +1,169 @@ | |||
1 | /* | ||
2 | * include/linux/userfaultfd.h | ||
3 | * | ||
4 | * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> | ||
5 | * Copyright (C) 2015 Red Hat, Inc. | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | #ifndef _LINUX_USERFAULTFD_H | ||
10 | #define _LINUX_USERFAULTFD_H | ||
11 | |||
12 | #include <linux/types.h> | ||
13 | |||
14 | #include <linux/compiler.h> | ||
15 | |||
16 | #define UFFD_API ((__u64)0xAA) | ||
17 | /* | ||
18 | * After implementing the respective features it will become: | ||
19 | * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ | ||
20 | * UFFD_FEATURE_EVENT_FORK) | ||
21 | */ | ||
22 | #define UFFD_API_FEATURES (0) | ||
23 | #define UFFD_API_IOCTLS \ | ||
24 | ((__u64)1 << _UFFDIO_REGISTER | \ | ||
25 | (__u64)1 << _UFFDIO_UNREGISTER | \ | ||
26 | (__u64)1 << _UFFDIO_API) | ||
27 | #define UFFD_API_RANGE_IOCTLS \ | ||
28 | ((__u64)1 << _UFFDIO_WAKE | \ | ||
29 | (__u64)1 << _UFFDIO_COPY | \ | ||
30 | (__u64)1 << _UFFDIO_ZEROPAGE) | ||
31 | |||
32 | /* | ||
33 | * Valid ioctl command number range with this API is from 0x00 to | ||
34 | * 0x3F. UFFDIO_API is the fixed number, everything else can be | ||
35 | * changed by implementing a different UFFD_API. If sticking to the | ||
36 | * same UFFD_API, more ioctls can be added and userland will be aware | ||
37 | * of which ioctls the running kernel implements through the ioctl | ||
38 | * command bitmask written by UFFDIO_API. | ||
39 | */ | ||
40 | #define _UFFDIO_REGISTER (0x00) | ||
41 | #define _UFFDIO_UNREGISTER (0x01) | ||
42 | #define _UFFDIO_WAKE (0x02) | ||
43 | #define _UFFDIO_COPY (0x03) | ||
44 | #define _UFFDIO_ZEROPAGE (0x04) | ||
45 | #define _UFFDIO_API (0x3F) | ||
46 | |||
47 | /* userfaultfd ioctl ids */ | ||
48 | #define UFFDIO 0xAA | ||
49 | #define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ | ||
50 | struct uffdio_api) | ||
51 | #define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ | ||
52 | struct uffdio_register) | ||
53 | #define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ | ||
54 | struct uffdio_range) | ||
55 | #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ | ||
56 | struct uffdio_range) | ||
57 | #define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ | ||
58 | struct uffdio_copy) | ||
59 | #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ | ||
60 | struct uffdio_zeropage) | ||
61 | |||
62 | /* read() structure */ | ||
63 | struct uffd_msg { | ||
64 | __u8 event; | ||
65 | |||
66 | __u8 reserved1; | ||
67 | __u16 reserved2; | ||
68 | __u32 reserved3; | ||
69 | |||
70 | union { | ||
71 | struct { | ||
72 | __u64 flags; | ||
73 | __u64 address; | ||
74 | } pagefault; | ||
75 | |||
76 | struct { | ||
77 | /* unused reserved fields */ | ||
78 | __u64 reserved1; | ||
79 | __u64 reserved2; | ||
80 | __u64 reserved3; | ||
81 | } reserved; | ||
82 | } arg; | ||
83 | } __packed; | ||
84 | |||
85 | /* | ||
86 | * Start at 0x12 and not at 0 to be more strict against bugs. | ||
87 | */ | ||
88 | #define UFFD_EVENT_PAGEFAULT 0x12 | ||
89 | #if 0 /* not available yet */ | ||
90 | #define UFFD_EVENT_FORK 0x13 | ||
91 | #endif | ||
92 | |||
93 | /* flags for UFFD_EVENT_PAGEFAULT */ | ||
94 | #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ | ||
95 | #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ | ||
96 | |||
97 | struct uffdio_api { | ||
98 | /* userland asks for an API number and the features to enable */ | ||
99 | __u64 api; | ||
100 | /* | ||
101 | * Kernel answers below with all the available features for | ||
102 | * the API; this notifies userland of which events and/or | ||
103 | * which flags for each event are enabled in the current | ||
104 | * kernel. | ||
105 | * | ||
106 | * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE | ||
107 | * are to be considered implicitly always enabled in all kernels as | ||
108 | * long as the uffdio_api.api requested matches UFFD_API. | ||
109 | */ | ||
110 | #if 0 /* not available yet */ | ||
111 | #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) | ||
112 | #define UFFD_FEATURE_EVENT_FORK (1<<1) | ||
113 | #endif | ||
114 | __u64 features; | ||
115 | |||
116 | __u64 ioctls; | ||
117 | }; | ||
118 | |||
119 | struct uffdio_range { | ||
120 | __u64 start; | ||
121 | __u64 len; | ||
122 | }; | ||
123 | |||
124 | struct uffdio_register { | ||
125 | struct uffdio_range range; | ||
126 | #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) | ||
127 | #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) | ||
128 | __u64 mode; | ||
129 | |||
130 | /* | ||
131 | * kernel answers which ioctl commands are available for the | ||
132 | * range, keep at the end as the last 8 bytes aren't read. | ||
133 | */ | ||
134 | __u64 ioctls; | ||
135 | }; | ||
136 | |||
137 | struct uffdio_copy { | ||
138 | __u64 dst; | ||
139 | __u64 src; | ||
140 | __u64 len; | ||
141 | /* | ||
142 | * There will be a wrprotection flag later that allows mapping | ||
143 | * pages wrprotected on the fly. Such a flag will be | ||
144 | * available if the wrprotection ioctls are implemented for the | ||
145 | * range, according to uffdio_register.ioctls. | ||
146 | */ | ||
147 | #define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) | ||
148 | __u64 mode; | ||
149 | |||
150 | /* | ||
151 | * "copy" is written by the ioctl and must be at the end: the | ||
152 | * copy_from_user will not read the last 8 bytes. | ||
153 | */ | ||
154 | __s64 copy; | ||
155 | }; | ||
156 | |||
157 | struct uffdio_zeropage { | ||
158 | struct uffdio_range range; | ||
159 | #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) | ||
160 | __u64 mode; | ||
161 | |||
162 | /* | ||
163 | * "zeropage" is written by the ioctl and must be at the end: | ||
164 | * the copy_from_user will not read the last 8 bytes. | ||
165 | */ | ||
166 | __s64 zeropage; | ||
167 | }; | ||
168 | |||
169 | #endif /* _LINUX_USERFAULTFD_H */ | ||
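Tying the uapi together, a hedged sketch of one iteration of a monitor thread: read a fault event, then resolve it with UFFDIO_ZEROPAGE. The descriptor is assumed registered with UFFDIO_REGISTER_MODE_MISSING elsewhere; error handling is trimmed:

#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int handle_one_fault(int uffd, long page_size)
{
	struct uffd_msg msg;
	struct uffdio_zeropage zp;

	if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
		return -1;
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		return -1;

	memset(&zp, 0, sizeof(zp));
	zp.range.start = msg.arg.pagefault.address & ~((__u64)page_size - 1);
	zp.range.len = page_size;
	zp.mode = 0;	/* no DONTWAKE: wake the faulting thread */

	if (ioctl(uffd, UFFDIO_ZEROPAGE, &zp) < 0)
		return -1;
	/* zp.zeropage holds the byte count installed or a negative errno;
	 * per the handler above, a short count would mean retrying. */
	return zp.zeropage == page_size ? 0 : -1;
}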
diff --git a/init/Kconfig b/init/Kconfig index bb9b4dd55889..2c0e50ef554a 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -883,6 +883,16 @@ config ARCH_SUPPORTS_NUMA_BALANCING | |||
883 | bool | 883 | bool |
884 | 884 | ||
885 | # | 885 | # |
886 | # For architectures that prefer to flush all TLBs after a number of pages | ||
887 | # are unmapped instead of sending one IPI per page to flush. The architecture | ||
888 | # must provide guarantees on what happens if a clean TLB cache entry is | ||
889 | # written after the unmap. Details are in mm/rmap.c near the check for | ||
890 | # should_defer_flush. The architecture should also consider if the full flush | ||
891 | # and the refill costs are offset by the savings of sending fewer IPIs. | ||
892 | config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH | ||
893 | bool | ||
894 | |||
895 | # | ||
886 | # For architectures that know their GCC __int128 support is sound | 896 | # For architectures that know their GCC __int128 support is sound |
887 | # | 897 | # |
888 | config ARCH_SUPPORTS_INT128 | 898 | config ARCH_SUPPORTS_INT128 |
@@ -1576,6 +1586,14 @@ config ADVISE_SYSCALLS | |||
1576 | applications use these syscalls, you can disable this option to save | 1586 | applications use these syscalls, you can disable this option to save |
1577 | space. | 1587 | space. |
1578 | 1588 | ||
1589 | config USERFAULTFD | ||
1590 | bool "Enable userfaultfd() system call" | ||
1591 | select ANON_INODES | ||
1592 | depends on MMU | ||
1593 | help | ||
1594 | Enable the userfaultfd() system call that allows userland to | ||
1595 | intercept and handle page faults. | ||
1596 | |||
1579 | config PCI_QUIRKS | 1597 | config PCI_QUIRKS |
1580 | default y | 1598 | default y |
1581 | bool "Enable PCI quirk workarounds" if EXPERT | 1599 | bool "Enable PCI quirk workarounds" if EXPERT |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f3f5cd5e2c0d..a8538e443784 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1342 | if (root != &cgrp_dfl_root) | 1342 | if (root != &cgrp_dfl_root) |
1343 | for_each_subsys(ss, ssid) | 1343 | for_each_subsys(ss, ssid) |
1344 | if (root->subsys_mask & (1 << ssid)) | 1344 | if (root->subsys_mask & (1 << ssid)) |
1345 | seq_printf(seq, ",%s", ss->legacy_name); | 1345 | seq_show_option(seq, ss->name, NULL); |
1346 | if (root->flags & CGRP_ROOT_NOPREFIX) | 1346 | if (root->flags & CGRP_ROOT_NOPREFIX) |
1347 | seq_puts(seq, ",noprefix"); | 1347 | seq_puts(seq, ",noprefix"); |
1348 | if (root->flags & CGRP_ROOT_XATTR) | 1348 | if (root->flags & CGRP_ROOT_XATTR) |
@@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1350 | 1350 | ||
1351 | spin_lock(&release_agent_path_lock); | 1351 | spin_lock(&release_agent_path_lock); |
1352 | if (strlen(root->release_agent_path)) | 1352 | if (strlen(root->release_agent_path)) |
1353 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1353 | seq_show_option(seq, "release_agent", |
1354 | root->release_agent_path); | ||
1354 | spin_unlock(&release_agent_path_lock); | 1355 | spin_unlock(&release_agent_path_lock); |
1355 | 1356 | ||
1356 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) | 1357 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) |
1357 | seq_puts(seq, ",clone_children"); | 1358 | seq_puts(seq, ",clone_children"); |
1358 | if (strlen(root->name)) | 1359 | if (strlen(root->name)) |
1359 | seq_printf(seq, ",name=%s", root->name); | 1360 | seq_show_option(seq, "name", root->name); |
1360 | return 0; | 1361 | return 0; |
1361 | } | 1362 | } |
1362 | 1363 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 03aa2e6de7a4..7d5f0f118a63 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
454 | tmp->vm_mm = mm; | 454 | tmp->vm_mm = mm; |
455 | if (anon_vma_fork(tmp, mpnt)) | 455 | if (anon_vma_fork(tmp, mpnt)) |
456 | goto fail_nomem_anon_vma_fork; | 456 | goto fail_nomem_anon_vma_fork; |
457 | tmp->vm_flags &= ~VM_LOCKED; | 457 | tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); |
458 | tmp->vm_next = tmp->vm_prev = NULL; | 458 | tmp->vm_next = tmp->vm_prev = NULL; |
459 | tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; | ||
459 | file = tmp->vm_file; | 460 | file = tmp->vm_file; |
460 | if (file) { | 461 | if (file) { |
461 | struct inode *inode = file_inode(file); | 462 | struct inode *inode = file_inode(file); |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 490924cc9e7c..9ff173dca1ae 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create) | |||
248 | * kthread_create_on_node - create a kthread. | 248 | * kthread_create_on_node - create a kthread. |
249 | * @threadfn: the function to run until signal_pending(current). | 249 | * @threadfn: the function to run until signal_pending(current). |
250 | * @data: data ptr for @threadfn. | 250 | * @data: data ptr for @threadfn. |
251 | * @node: memory node number. | 251 | * @node: task and thread structures for the thread are allocated on this node |
252 | * @namefmt: printf-style name for the thread. | 252 | * @namefmt: printf-style name for the thread. |
253 | * | 253 | * |
254 | * Description: This helper function creates and names a kernel | 254 | * Description: This helper function creates and names a kernel |
255 | * thread. The thread will be stopped: use wake_up_process() to start | 255 | * thread. The thread will be stopped: use wake_up_process() to start |
256 | * it. See also kthread_run(). | 256 | * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and |
257 | * is affine to all CPUs. | ||
257 | * | 258 | * |
258 | * If thread is going to be bound on a particular cpu, give its node | 259 | * If thread is going to be bound on a particular cpu, give its node |
259 | * in @node, to get NUMA affinity for kthread stack, or else give -1. | 260 | * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. |
260 | * When woken, the thread will run @threadfn() with @data as its | 261 | * When woken, the thread will run @threadfn() with @data as its |
261 | * argument. @threadfn() can either call do_exit() directly if it is a | 262 | * argument. @threadfn() can either call do_exit() directly if it is a |
262 | * standalone thread for which no one will call kthread_stop(), or | 263 | * standalone thread for which no one will call kthread_stop(), or |
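For completeness, an explicit call with the spelled-out node argument the macro now uses; a hedged sketch, not code from the patch:

#include <linux/kthread.h>
#include <linux/numa.h>
#include <linux/sched.h>

static int example_threadfn(void *data)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static struct task_struct *example_start(void)
{
	/* NUMA_NO_NODE: no preference for the task/stack allocations. */
	return kthread_create_on_node(example_threadfn, NULL,
				      NUMA_NO_NODE, "example/%d", 0);
}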
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e02672d12..272d9322bc5d 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | |||
106 | } | 106 | } |
107 | EXPORT_SYMBOL_GPL(__wake_up_locked); | 107 | EXPORT_SYMBOL_GPL(__wake_up_locked); |
108 | 108 | ||
109 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | 109 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, |
110 | void *key) | ||
110 | { | 111 | { |
111 | __wake_up_common(q, mode, 1, 0, key); | 112 | __wake_up_common(q, mode, nr, 0, key); |
112 | } | 113 | } |
113 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | 114 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); |
114 | 115 | ||
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | |||
283 | if (!list_empty(&wait->task_list)) | 284 | if (!list_empty(&wait->task_list)) |
284 | list_del_init(&wait->task_list); | 285 | list_del_init(&wait->task_list); |
285 | else if (waitqueue_active(q)) | 286 | else if (waitqueue_active(q)) |
286 | __wake_up_locked_key(q, mode, key); | 287 | __wake_up_locked_key(q, mode, 1, key); |
287 | spin_unlock_irqrestore(&q->lock, flags); | 288 | spin_unlock_irqrestore(&q->lock, flags); |
288 | } | 289 | } |
289 | EXPORT_SYMBOL(abort_exclusive_wait); | 290 | EXPORT_SYMBOL(abort_exclusive_wait); |
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 7c434c39f02a..a818cbc73e14 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data) | |||
113 | if (kthread_should_stop()) { | 113 | if (kthread_should_stop()) { |
114 | __set_current_state(TASK_RUNNING); | 114 | __set_current_state(TASK_RUNNING); |
115 | preempt_enable(); | 115 | preempt_enable(); |
116 | if (ht->cleanup) | 116 | /* cleanup must mirror setup */ |
117 | if (ht->cleanup && td->status != HP_THREAD_NONE) | ||
117 | ht->cleanup(td->cpu, cpu_online(td->cpu)); | 118 | ht->cleanup(td->cpu, cpu_online(td->cpu)); |
118 | kfree(td); | 119 | kfree(td); |
119 | return 0; | 120 | return 0; |
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) | |||
259 | { | 260 | { |
260 | unsigned int cpu; | 261 | unsigned int cpu; |
261 | 262 | ||
262 | /* Unpark any threads that were voluntarily parked. */ | ||
263 | for_each_cpu_not(cpu, ht->cpumask) { | ||
264 | if (cpu_online(cpu)) { | ||
265 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
266 | if (tsk) | ||
267 | kthread_unpark(tsk); | ||
268 | } | ||
269 | } | ||
270 | |||
271 | /* We need to destroy also the parked threads of offline cpus */ | 263 | /* We need to destroy also the parked threads of offline cpus */ |
272 | for_each_possible_cpu(cpu) { | 264 | for_each_possible_cpu(cpu) { |
273 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | 265 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); |
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) | |||
281 | } | 273 | } |
282 | 274 | ||
283 | /** | 275 | /** |
284 | * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug | 276 | * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related |
277 | * to hotplug | ||
285 | * @plug_thread: Hotplug thread descriptor | 278 | * @plug_thread: Hotplug thread descriptor |
279 | * @cpumask: The cpumask where threads run | ||
286 | * | 280 | * |
287 | * Creates and starts the threads on all online cpus. | 281 | * Creates and starts the threads on all online cpus. |
288 | */ | 282 | */ |
289 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | 283 | int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, |
284 | const struct cpumask *cpumask) | ||
290 | { | 285 | { |
291 | unsigned int cpu; | 286 | unsigned int cpu; |
292 | int ret = 0; | 287 | int ret = 0; |
293 | 288 | ||
294 | if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) | 289 | if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) |
295 | return -ENOMEM; | 290 | return -ENOMEM; |
296 | cpumask_copy(plug_thread->cpumask, cpu_possible_mask); | 291 | cpumask_copy(plug_thread->cpumask, cpumask); |
297 | 292 | ||
298 | get_online_cpus(); | 293 | get_online_cpus(); |
299 | mutex_lock(&smpboot_threads_lock); | 294 | mutex_lock(&smpboot_threads_lock); |
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
301 | ret = __smpboot_create_thread(plug_thread, cpu); | 296 | ret = __smpboot_create_thread(plug_thread, cpu); |
302 | if (ret) { | 297 | if (ret) { |
303 | smpboot_destroy_threads(plug_thread); | 298 | smpboot_destroy_threads(plug_thread); |
299 | free_cpumask_var(plug_thread->cpumask); | ||
304 | goto out; | 300 | goto out; |
305 | } | 301 | } |
306 | smpboot_unpark_thread(plug_thread, cpu); | 302 | if (cpumask_test_cpu(cpu, cpumask)) |
303 | smpboot_unpark_thread(plug_thread, cpu); | ||
307 | } | 304 | } |
308 | list_add(&plug_thread->list, &hotplug_threads); | 305 | list_add(&plug_thread->list, &hotplug_threads); |
309 | out: | 306 | out: |
@@ -311,7 +308,7 @@ out: | |||
311 | put_online_cpus(); | 308 | put_online_cpus(); |
312 | return ret; | 309 | return ret; |
313 | } | 310 | } |
314 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); | 311 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); |
315 | 312 | ||
316 | /** | 313 | /** |
317 | * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug | 314 | * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug |
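For context, a hedged sketch of registering per-cpu threads through the new cpumask-aware entry point (my_* names are placeholders; the watchdog conversion further down is the real in-tree user):

static DEFINE_PER_CPU(struct task_struct *, my_task);

static struct smp_hotplug_thread my_threads = {
	.store			= &my_task,
	.thread_should_run	= my_should_run,
	.thread_fn		= my_thread_fn,
	.thread_comm		= "my_thread/%u",
};

/* Threads are created on every possible CPU, but only those in
 * 'mask' are unparked; the rest stay parked until the mask changes. */
err = smpboot_register_percpu_thread_cpumask(&my_threads, mask);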
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ca7d84f438f1..03c3875d9958 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime); | |||
219 | cond_syscall(sys_eventfd); | 219 | cond_syscall(sys_eventfd); |
220 | cond_syscall(sys_eventfd2); | 220 | cond_syscall(sys_eventfd2); |
221 | cond_syscall(sys_memfd_create); | 221 | cond_syscall(sys_memfd_create); |
222 | cond_syscall(sys_userfaultfd); | ||
222 | 223 | ||
223 | /* performance counters: */ | 224 | /* performance counters: */ |
224 | cond_syscall(sys_perf_event_open); | 225 | cond_syscall(sys_perf_event_open); |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f65a0a06a8c0..88fefa68c516 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) | |||
39 | cred->cap_inheritable = CAP_EMPTY_SET; | 39 | cred->cap_inheritable = CAP_EMPTY_SET; |
40 | cred->cap_permitted = CAP_FULL_SET; | 40 | cred->cap_permitted = CAP_FULL_SET; |
41 | cred->cap_effective = CAP_FULL_SET; | 41 | cred->cap_effective = CAP_FULL_SET; |
42 | cred->cap_ambient = CAP_EMPTY_SET; | ||
42 | cred->cap_bset = CAP_FULL_SET; | 43 | cred->cap_bset = CAP_FULL_SET; |
43 | #ifdef CONFIG_KEYS | 44 | #ifdef CONFIG_KEYS |
44 | key_put(cred->request_key_auth); | 45 | key_put(cred->request_key_auth); |
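cred->cap_ambient is the new ambient capability set from this merge. A userspace sketch of raising a capability into it (constants ship with current prctl headers; older ones may need them defined by hand):

#include <sys/prctl.h>
#include <linux/capability.h>

/* The capability must already be in the permitted and inheritable
 * sets; once raised, it survives execve() of unprivileged binaries. */
if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
	  CAP_NET_BIND_SERVICE, 0, 0) < 0)
	perror("PR_CAP_AMBIENT_RAISE");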
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6ffa43f2993..64ed1c37bd1f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/irq_regs.h> | 24 | #include <asm/irq_regs.h> |
25 | #include <linux/kvm_para.h> | 25 | #include <linux/kvm_para.h> |
26 | #include <linux/perf_event.h> | 26 | #include <linux/perf_event.h> |
27 | #include <linux/kthread.h> | ||
27 | 28 | ||
28 | /* | 29 | /* |
29 | * The run state of the lockup detectors is controlled by the content of the | 30 | * The run state of the lockup detectors is controlled by the content of the |
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | |||
66 | #define for_each_watchdog_cpu(cpu) \ | 67 | #define for_each_watchdog_cpu(cpu) \ |
67 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) | 68 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) |
68 | 69 | ||
70 | /* | ||
71 | * The 'watchdog_running' variable is set to 1 when the watchdog threads | ||
72 | * are registered/started and is set to 0 when the watchdog threads are | ||
73 | * unregistered/stopped, so it is an indicator whether the threads exist. | ||
74 | */ | ||
69 | static int __read_mostly watchdog_running; | 75 | static int __read_mostly watchdog_running; |
76 | /* | ||
77 | * If a subsystem has a need to deactivate the watchdog temporarily, it | ||
78 | * can use the suspend/resume interface to achieve this. The content of | ||
79 | * the 'watchdog_suspended' variable reflects this state. Existing threads | ||
80 | * are parked/unparked by the lockup_detector_{suspend|resume} functions | ||
81 | * (see comment blocks pertaining to those functions for further details). | ||
82 | * | ||
83 | * 'watchdog_suspended' also prevents threads from being registered/started | ||
84 | * or unregistered/stopped via parameters in /proc/sys/kernel, so the state | ||
85 | * of 'watchdog_running' cannot change while the watchdog is deactivated | ||
86 | * temporarily (see related code in 'proc' handlers). | ||
87 | */ | ||
88 | static int __read_mostly watchdog_suspended; | ||
89 | |||
70 | static u64 __read_mostly sample_period; | 90 | static u64 __read_mostly sample_period; |
71 | 91 | ||
72 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 92 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu) | |||
613 | } | 633 | } |
614 | } | 634 | } |
615 | 635 | ||
616 | void watchdog_nmi_enable_all(void) | ||
617 | { | ||
618 | int cpu; | ||
619 | |||
620 | mutex_lock(&watchdog_proc_mutex); | ||
621 | |||
622 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
623 | goto unlock; | ||
624 | |||
625 | get_online_cpus(); | ||
626 | for_each_watchdog_cpu(cpu) | ||
627 | watchdog_nmi_enable(cpu); | ||
628 | put_online_cpus(); | ||
629 | |||
630 | unlock: | ||
631 | mutex_unlock(&watchdog_proc_mutex); | ||
632 | } | ||
633 | |||
634 | void watchdog_nmi_disable_all(void) | ||
635 | { | ||
636 | int cpu; | ||
637 | |||
638 | mutex_lock(&watchdog_proc_mutex); | ||
639 | |||
640 | if (!watchdog_running) | ||
641 | goto unlock; | ||
642 | |||
643 | get_online_cpus(); | ||
644 | for_each_watchdog_cpu(cpu) | ||
645 | watchdog_nmi_disable(cpu); | ||
646 | put_online_cpus(); | ||
647 | |||
648 | unlock: | ||
649 | mutex_unlock(&watchdog_proc_mutex); | ||
650 | } | ||
651 | #else | 636 | #else |
652 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 637 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } |
653 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | 638 | static void watchdog_nmi_disable(unsigned int cpu) { return; } |
654 | void watchdog_nmi_enable_all(void) {} | ||
655 | void watchdog_nmi_disable_all(void) {} | ||
656 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 639 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
657 | 640 | ||
658 | static struct smp_hotplug_thread watchdog_threads = { | 641 | static struct smp_hotplug_thread watchdog_threads = { |
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
666 | .unpark = watchdog_enable, | 649 | .unpark = watchdog_enable, |
667 | }; | 650 | }; |
668 | 651 | ||
669 | static void restart_watchdog_hrtimer(void *info) | 652 | /* |
653 | * park all watchdog threads that are specified in 'watchdog_cpumask' | ||
654 | */ | ||
655 | static int watchdog_park_threads(void) | ||
670 | { | 656 | { |
671 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); | 657 | int cpu, ret = 0; |
672 | int ret; | ||
673 | 658 | ||
659 | get_online_cpus(); | ||
660 | for_each_watchdog_cpu(cpu) { | ||
661 | ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); | ||
662 | if (ret) | ||
663 | break; | ||
664 | } | ||
665 | if (ret) { | ||
666 | for_each_watchdog_cpu(cpu) | ||
667 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
668 | } | ||
669 | put_online_cpus(); | ||
670 | |||
671 | return ret; | ||
672 | } | ||
673 | |||
674 | /* | ||
675 | * unpark all watchdog threads that are specified in 'watchdog_cpumask' | ||
676 | */ | ||
677 | static void watchdog_unpark_threads(void) | ||
678 | { | ||
679 | int cpu; | ||
680 | |||
681 | get_online_cpus(); | ||
682 | for_each_watchdog_cpu(cpu) | ||
683 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
684 | put_online_cpus(); | ||
685 | } | ||
686 | |||
687 | /* | ||
688 | * Suspend the hard and soft lockup detector by parking the watchdog threads. | ||
689 | */ | ||
690 | int lockup_detector_suspend(void) | ||
691 | { | ||
692 | int ret = 0; | ||
693 | |||
694 | mutex_lock(&watchdog_proc_mutex); | ||
674 | /* | 695 | /* |
675 | * No need to cancel and restart hrtimer if it is currently executing | 696 | * Multiple suspend requests can be active in parallel (counted by |
676 | * because it will reprogram itself with the new period now. | 697 | * the 'watchdog_suspended' variable). If the watchdog threads are |
677 | * We should never see it unqueued here because we are running per-cpu | 698 | * running, the first caller takes care that they will be parked. |
678 | * with interrupts disabled. | 699 | * The state of 'watchdog_running' cannot change while a suspend |
700 | * request is active (see related code in 'proc' handlers). | ||
679 | */ | 701 | */ |
680 | ret = hrtimer_try_to_cancel(hrtimer); | 702 | if (watchdog_running && !watchdog_suspended) |
681 | if (ret == 1) | 703 | ret = watchdog_park_threads(); |
682 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | 704 | |
683 | HRTIMER_MODE_REL_PINNED); | 705 | if (ret == 0) |
706 | watchdog_suspended++; | ||
707 | |||
708 | mutex_unlock(&watchdog_proc_mutex); | ||
709 | |||
710 | return ret; | ||
684 | } | 711 | } |
685 | 712 | ||
686 | static void update_watchdog(int cpu) | 713 | /* |
714 | * Resume the hard and soft lockup detector by unparking the watchdog threads. | ||
715 | */ | ||
716 | void lockup_detector_resume(void) | ||
687 | { | 717 | { |
718 | mutex_lock(&watchdog_proc_mutex); | ||
719 | |||
720 | watchdog_suspended--; | ||
688 | /* | 721 | /* |
689 | * Make sure that perf event counter will adopt to a new | 722 | * The watchdog threads are unparked if they were previously running |
690 | * sampling period. Updating the sampling period directly would | 723 | * and if there is no more active suspend request. |
691 | * be much nicer but we do not have an API for that now so | ||
692 | * let's use a big hammer. | ||
693 | * Hrtimer will adopt the new period on the next tick but this | ||
694 | * might be late already so we have to restart the timer as well. | ||
695 | */ | 724 | */ |
696 | watchdog_nmi_disable(cpu); | 725 | if (watchdog_running && !watchdog_suspended) |
697 | smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); | 726 | watchdog_unpark_threads(); |
698 | watchdog_nmi_enable(cpu); | 727 | |
728 | mutex_unlock(&watchdog_proc_mutex); | ||
699 | } | 729 | } |
700 | 730 | ||
701 | static void update_watchdog_all_cpus(void) | 731 | static void update_watchdog_all_cpus(void) |
702 | { | 732 | { |
703 | int cpu; | 733 | watchdog_park_threads(); |
704 | 734 | watchdog_unpark_threads(); | |
705 | get_online_cpus(); | ||
706 | for_each_watchdog_cpu(cpu) | ||
707 | update_watchdog(cpu); | ||
708 | put_online_cpus(); | ||
709 | } | 735 | } |
710 | 736 | ||
711 | static int watchdog_enable_all_cpus(void) | 737 | static int watchdog_enable_all_cpus(void) |
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void) | |||
713 | int err = 0; | 739 | int err = 0; |
714 | 740 | ||
715 | if (!watchdog_running) { | 741 | if (!watchdog_running) { |
716 | err = smpboot_register_percpu_thread(&watchdog_threads); | 742 | err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, |
743 | &watchdog_cpumask); | ||
717 | if (err) | 744 | if (err) |
718 | pr_err("Failed to create watchdog threads, disabled\n"); | 745 | pr_err("Failed to create watchdog threads, disabled\n"); |
719 | else { | 746 | else |
720 | if (smpboot_update_cpumask_percpu_thread( | ||
721 | &watchdog_threads, &watchdog_cpumask)) | ||
722 | pr_err("Failed to set cpumask for watchdog threads\n"); | ||
723 | watchdog_running = 1; | 747 | watchdog_running = 1; |
724 | } | ||
725 | } else { | 748 | } else { |
726 | /* | 749 | /* |
727 | * Enable/disable the lockup detectors or | 750 | * Enable/disable the lockup detectors or |
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, | |||
787 | 810 | ||
788 | mutex_lock(&watchdog_proc_mutex); | 811 | mutex_lock(&watchdog_proc_mutex); |
789 | 812 | ||
813 | if (watchdog_suspended) { | ||
814 | /* no parameter changes allowed while watchdog is suspended */ | ||
815 | err = -EAGAIN; | ||
816 | goto out; | ||
817 | } | ||
818 | |||
790 | /* | 819 | /* |
791 | * If the parameter is being read return the state of the corresponding | 820 | * If the parameter is being read return the state of the corresponding |
792 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the | 821 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the |
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, | |||
872 | 901 | ||
873 | mutex_lock(&watchdog_proc_mutex); | 902 | mutex_lock(&watchdog_proc_mutex); |
874 | 903 | ||
904 | if (watchdog_suspended) { | ||
905 | /* no parameter changes allowed while watchdog is suspended */ | ||
906 | err = -EAGAIN; | ||
907 | goto out; | ||
908 | } | ||
909 | |||
875 | old = ACCESS_ONCE(watchdog_thresh); | 910 | old = ACCESS_ONCE(watchdog_thresh); |
876 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 911 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
877 | 912 | ||
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
903 | int err; | 938 | int err; |
904 | 939 | ||
905 | mutex_lock(&watchdog_proc_mutex); | 940 | mutex_lock(&watchdog_proc_mutex); |
941 | |||
942 | if (watchdog_suspended) { | ||
943 | /* no parameter changes allowed while watchdog is suspended */ | ||
944 | err = -EAGAIN; | ||
945 | goto out; | ||
946 | } | ||
947 | |||
906 | err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); | 948 | err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); |
907 | if (!err && write) { | 949 | if (!err && write) { |
908 | /* Remove impossible cpus to keep sysctl output cleaner. */ | 950 | /* Remove impossible cpus to keep sysctl output cleaner. */ |
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
920 | pr_err("cpumask update failed\n"); | 962 | pr_err("cpumask update failed\n"); |
921 | } | 963 | } |
922 | } | 964 | } |
965 | out: | ||
923 | mutex_unlock(&watchdog_proc_mutex); | 966 | mutex_unlock(&watchdog_proc_mutex); |
924 | return err; | 967 | return err; |
925 | } | 968 | } |
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void) | |||
932 | 975 | ||
933 | #ifdef CONFIG_NO_HZ_FULL | 976 | #ifdef CONFIG_NO_HZ_FULL |
934 | if (tick_nohz_full_enabled()) { | 977 | if (tick_nohz_full_enabled()) { |
935 | if (!cpumask_empty(tick_nohz_full_mask)) | 978 | pr_info("Disabling watchdog on nohz_full cores by default\n"); |
936 | pr_info("Disabling watchdog on nohz_full cores by default\n"); | 979 | cpumask_copy(&watchdog_cpumask, housekeeping_mask); |
937 | cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, | ||
938 | tick_nohz_full_mask); | ||
939 | } else | 980 | } else |
940 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); | 981 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); |
941 | #else | 982 | #else |
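The suspend/resume pair is reference-counted through 'watchdog_suspended'; the intended calling convention, sketched with a hypothetical caller:

static int do_watchdog_unfriendly_work(void)
{
	int err;

	err = lockup_detector_suspend();	/* parks watchdog threads */
	if (err)
		return err;

	/* ... work that would otherwise trip the lockup detectors ... */

	lockup_detector_resume();	/* unparks when the count drops to 0 */
	return 0;
}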
diff --git a/lib/genalloc.c b/lib/genalloc.c index daf0afb6d979..116a166b096f 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c | |||
@@ -160,6 +160,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid) | |||
160 | pool->min_alloc_order = min_alloc_order; | 160 | pool->min_alloc_order = min_alloc_order; |
161 | pool->algo = gen_pool_first_fit; | 161 | pool->algo = gen_pool_first_fit; |
162 | pool->data = NULL; | 162 | pool->data = NULL; |
163 | pool->name = NULL; | ||
163 | } | 164 | } |
164 | return pool; | 165 | return pool; |
165 | } | 166 | } |
@@ -252,8 +253,8 @@ void gen_pool_destroy(struct gen_pool *pool) | |||
252 | 253 | ||
253 | kfree(chunk); | 254 | kfree(chunk); |
254 | } | 255 | } |
256 | kfree_const(pool->name); | ||
255 | kfree(pool); | 257 | kfree(pool); |
256 | return; | ||
257 | } | 258 | } |
258 | EXPORT_SYMBOL(gen_pool_destroy); | 259 | EXPORT_SYMBOL(gen_pool_destroy); |
259 | 260 | ||
@@ -570,53 +571,88 @@ static void devm_gen_pool_release(struct device *dev, void *res) | |||
570 | gen_pool_destroy(*(struct gen_pool **)res); | 571 | gen_pool_destroy(*(struct gen_pool **)res); |
571 | } | 572 | } |
572 | 573 | ||
574 | static int devm_gen_pool_match(struct device *dev, void *res, void *data) | ||
575 | { | ||
576 | struct gen_pool **p = res; | ||
577 | |||
578 | /* NULL data matches only a pool without an assigned name */ | ||
579 | if (!data && !(*p)->name) | ||
580 | return 1; | ||
581 | |||
582 | if (!data || !(*p)->name) | ||
583 | return 0; | ||
584 | |||
585 | return !strcmp((*p)->name, data); | ||
586 | } | ||
587 | |||
588 | /** | ||
589 | * gen_pool_get - Obtain the gen_pool (if any) for a device | ||
590 | * @dev: device to retrieve the gen_pool from | ||
591 | * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device | ||
592 | * | ||
593 | * Returns the gen_pool for the device if one is present, or NULL. | ||
594 | */ | ||
595 | struct gen_pool *gen_pool_get(struct device *dev, const char *name) | ||
596 | { | ||
597 | struct gen_pool **p; | ||
598 | |||
599 | p = devres_find(dev, devm_gen_pool_release, devm_gen_pool_match, | ||
600 | (void *)name); | ||
601 | if (!p) | ||
602 | return NULL; | ||
603 | return *p; | ||
604 | } | ||
605 | EXPORT_SYMBOL_GPL(gen_pool_get); | ||
606 | |||
573 | /** | 607 | /** |
574 | * devm_gen_pool_create - managed gen_pool_create | 608 | * devm_gen_pool_create - managed gen_pool_create |
575 | * @dev: device that provides the gen_pool | 609 | * @dev: device that provides the gen_pool |
576 | * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents | 610 | * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents |
577 | * @nid: node id of the node the pool structure should be allocated on, or -1 | 611 | * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes |
612 | * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device | ||
578 | * | 613 | * |
579 | * Create a new special memory pool that can be used to manage special purpose | 614 | * Create a new special memory pool that can be used to manage special purpose |
580 | * memory not managed by the regular kmalloc/kfree interface. The pool will be | 615 | * memory not managed by the regular kmalloc/kfree interface. The pool will be |
581 | * automatically destroyed by the device management code. | 616 | * automatically destroyed by the device management code. |
582 | */ | 617 | */ |
583 | struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, | 618 | struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, |
584 | int nid) | 619 | int nid, const char *name) |
585 | { | 620 | { |
586 | struct gen_pool **ptr, *pool; | 621 | struct gen_pool **ptr, *pool; |
622 | const char *pool_name = NULL; | ||
623 | |||
624 | /* Check that genpool to be created is uniquely addressed on device */ | ||
625 | if (gen_pool_get(dev, name)) | ||
626 | return ERR_PTR(-EINVAL); | ||
627 | |||
628 | if (name) { | ||
629 | pool_name = kstrdup_const(name, GFP_KERNEL); | ||
630 | if (!pool_name) | ||
631 | return ERR_PTR(-ENOMEM); | ||
632 | } | ||
587 | 633 | ||
588 | ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); | 634 | ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); |
589 | if (!ptr) | 635 | if (!ptr) |
590 | return NULL; | 636 | goto free_pool_name; |
591 | 637 | ||
592 | pool = gen_pool_create(min_alloc_order, nid); | 638 | pool = gen_pool_create(min_alloc_order, nid); |
593 | if (pool) { | 639 | if (!pool) |
594 | *ptr = pool; | 640 | goto free_devres; |
595 | devres_add(dev, ptr); | 641 | |
596 | } else { | 642 | *ptr = pool; |
597 | devres_free(ptr); | 643 | pool->name = pool_name; |
598 | } | 644 | devres_add(dev, ptr); |
599 | 645 | ||
600 | return pool; | 646 | return pool; |
601 | } | ||
602 | EXPORT_SYMBOL(devm_gen_pool_create); | ||
603 | 647 | ||
604 | /** | 648 | free_devres: |
605 | * gen_pool_get - Obtain the gen_pool (if any) for a device | 649 | devres_free(ptr); |
606 | * @dev: device to retrieve the gen_pool from | 650 | free_pool_name: |
607 | * | 651 | kfree_const(pool_name); |
608 | * Returns the gen_pool for the device if one is present, or NULL. | ||
609 | */ | ||
610 | struct gen_pool *gen_pool_get(struct device *dev) | ||
611 | { | ||
612 | struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, | ||
613 | NULL); | ||
614 | 652 | ||
615 | if (!p) | 653 | return ERR_PTR(-ENOMEM); |
616 | return NULL; | ||
617 | return *p; | ||
618 | } | 654 | } |
619 | EXPORT_SYMBOL_GPL(gen_pool_get); | 655 | EXPORT_SYMBOL(devm_gen_pool_create); |
620 | 656 | ||
621 | #ifdef CONFIG_OF | 657 | #ifdef CONFIG_OF |
622 | /** | 658 | /** |
@@ -633,16 +669,30 @@ struct gen_pool *of_gen_pool_get(struct device_node *np, | |||
633 | const char *propname, int index) | 669 | const char *propname, int index) |
634 | { | 670 | { |
635 | struct platform_device *pdev; | 671 | struct platform_device *pdev; |
636 | struct device_node *np_pool; | 672 | struct device_node *np_pool, *parent; |
673 | const char *name = NULL; | ||
674 | struct gen_pool *pool = NULL; | ||
637 | 675 | ||
638 | np_pool = of_parse_phandle(np, propname, index); | 676 | np_pool = of_parse_phandle(np, propname, index); |
639 | if (!np_pool) | 677 | if (!np_pool) |
640 | return NULL; | 678 | return NULL; |
679 | |||
641 | pdev = of_find_device_by_node(np_pool); | 680 | pdev = of_find_device_by_node(np_pool); |
681 | if (!pdev) { | ||
682 | /* Check if named gen_pool is created by parent node device */ | ||
683 | parent = of_get_parent(np_pool); | ||
684 | pdev = of_find_device_by_node(parent); | ||
685 | of_node_put(parent); | ||
686 | |||
687 | of_property_read_string(np_pool, "label", &name); | ||
688 | if (!name) | ||
689 | name = np_pool->name; | ||
690 | } | ||
691 | if (pdev) | ||
692 | pool = gen_pool_get(&pdev->dev, name); | ||
642 | of_node_put(np_pool); | 693 | of_node_put(np_pool); |
643 | if (!pdev) | 694 | |
644 | return NULL; | 695 | return pool; |
645 | return gen_pool_get(&pdev->dev); | ||
646 | } | 696 | } |
647 | EXPORT_SYMBOL_GPL(of_gen_pool_get); | 697 | EXPORT_SYMBOL_GPL(of_gen_pool_get); |
648 | #endif /* CONFIG_OF */ | 698 | #endif /* CONFIG_OF */ |
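With the name argument, several pools can coexist on one device and be looked up independently. A sketch assuming a driver with a device 'dev' and an SRAM region already mapped at 'virt'/'phys':

struct gen_pool *pool;

pool = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "sram-fast");
if (IS_ERR(pool))		/* note: now ERR_PTR, no longer NULL */
	return PTR_ERR(pool);
gen_pool_add_virt(pool, (unsigned long)virt, phys, size, NUMA_NO_NODE);

/* elsewhere in the driver, or from a child device's code: */
pool = gen_pool_get(dev, "sram-fast");
if (pool)
	buf = (void *)gen_pool_alloc(pool, 256);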
diff --git a/mm/Makefile b/mm/Makefile index 98c4eaeabdcb..b424d5e5b6ff 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o | |||
78 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o | 78 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o |
79 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o | 79 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o |
80 | obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o | 80 | obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o |
81 | obj-$(CONFIG_USERFAULTFD) += userfaultfd.o | ||

diff --git a/mm/dmapool.c b/mm/dmapool.c index fd5fe4342e93..59d10d16f0a5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) | |||
242 | return page; | 242 | return page; |
243 | } | 243 | } |
244 | 244 | ||
245 | static inline int is_page_busy(struct dma_page *page) | 245 | static inline bool is_page_busy(struct dma_page *page) |
246 | { | 246 | { |
247 | return page->in_use != 0; | 247 | return page->in_use != 0; |
248 | } | 248 | } |
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -12,7 +12,9 @@ | |||
12 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
13 | #include <linux/rwsem.h> | 13 | #include <linux/rwsem.h> |
14 | #include <linux/hugetlb.h> | 14 | #include <linux/hugetlb.h> |
15 | |||
15 | #include <asm/pgtable.h> | 16 | #include <asm/pgtable.h> |
17 | #include <asm/tlbflush.h> | ||
16 | 18 | ||
17 | #include "internal.h" | 19 | #include "internal.h" |
18 | 20 | ||
@@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma, | |||
32 | return NULL; | 34 | return NULL; |
33 | } | 35 | } |
34 | 36 | ||
37 | static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, | ||
38 | pte_t *pte, unsigned int flags) | ||
39 | { | ||
40 | /* No page to get reference */ | ||
41 | if (flags & FOLL_GET) | ||
42 | return -EFAULT; | ||
43 | |||
44 | if (flags & FOLL_TOUCH) { | ||
45 | pte_t entry = *pte; | ||
46 | |||
47 | if (flags & FOLL_WRITE) | ||
48 | entry = pte_mkdirty(entry); | ||
49 | entry = pte_mkyoung(entry); | ||
50 | |||
51 | if (!pte_same(*pte, entry)) { | ||
52 | set_pte_at(vma->vm_mm, address, pte, entry); | ||
53 | update_mmu_cache(vma, address, pte); | ||
54 | } | ||
55 | } | ||
56 | |||
57 | /* Proper page table entry exists, but no corresponding struct page */ | ||
58 | return -EEXIST; | ||
59 | } | ||
60 | |||
35 | static struct page *follow_page_pte(struct vm_area_struct *vma, | 61 | static struct page *follow_page_pte(struct vm_area_struct *vma, |
36 | unsigned long address, pmd_t *pmd, unsigned int flags) | 62 | unsigned long address, pmd_t *pmd, unsigned int flags) |
37 | { | 63 | { |
@@ -73,10 +99,21 @@ retry: | |||
73 | 99 | ||
74 | page = vm_normal_page(vma, address, pte); | 100 | page = vm_normal_page(vma, address, pte); |
75 | if (unlikely(!page)) { | 101 | if (unlikely(!page)) { |
76 | if ((flags & FOLL_DUMP) || | 102 | if (flags & FOLL_DUMP) { |
77 | !is_zero_pfn(pte_pfn(pte))) | 103 | /* Avoid special (like zero) pages in core dumps */ |
78 | goto bad_page; | 104 | page = ERR_PTR(-EFAULT); |
79 | page = pte_page(pte); | 105 | goto out; |
106 | } | ||
107 | |||
108 | if (is_zero_pfn(pte_pfn(pte))) { | ||
109 | page = pte_page(pte); | ||
110 | } else { | ||
111 | int ret; | ||
112 | |||
113 | ret = follow_pfn_pte(vma, address, ptep, flags); | ||
114 | page = ERR_PTR(ret); | ||
115 | goto out; | ||
116 | } | ||
80 | } | 117 | } |
81 | 118 | ||
82 | if (flags & FOLL_GET) | 119 | if (flags & FOLL_GET) |
@@ -114,12 +151,9 @@ retry: | |||
114 | unlock_page(page); | 151 | unlock_page(page); |
115 | } | 152 | } |
116 | } | 153 | } |
154 | out: | ||
117 | pte_unmap_unlock(ptep, ptl); | 155 | pte_unmap_unlock(ptep, ptl); |
118 | return page; | 156 | return page; |
119 | bad_page: | ||
120 | pte_unmap_unlock(ptep, ptl); | ||
121 | return ERR_PTR(-EFAULT); | ||
122 | |||
123 | no_page: | 157 | no_page: |
124 | pte_unmap_unlock(ptep, ptl); | 158 | pte_unmap_unlock(ptep, ptl); |
125 | if (!pte_none(pte)) | 159 | if (!pte_none(pte)) |
@@ -489,9 +523,15 @@ retry: | |||
489 | goto next_page; | 523 | goto next_page; |
490 | } | 524 | } |
491 | BUG(); | 525 | BUG(); |
492 | } | 526 | } else if (PTR_ERR(page) == -EEXIST) { |
493 | if (IS_ERR(page)) | 527 | /* |
528 | * Proper page table entry exists, but no corresponding | ||
529 | * struct page. | ||
530 | */ | ||
531 | goto next_page; | ||
532 | } else if (IS_ERR(page)) { | ||
494 | return i ? i : PTR_ERR(page); | 533 | return i ? i : PTR_ERR(page); |
534 | } | ||
495 | if (pages) { | 535 | if (pages) { |
496 | pages[i] = page; | 536 | pages[i] = page; |
497 | flush_anon_page(vma, page, start); | 537 | flush_anon_page(vma, page, start); |
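The net effect of follow_pfn_pte(): a valid PTE with no struct page yields -EFAULT when the caller demanded a reference (FOLL_GET) and -EEXIST otherwise, which __get_user_pages() above turns into "skip this page". What a follow_page() caller now sees over a pfn mapping, sketched:

page = follow_page(vma, addr, 0);	/* no FOLL_GET */
if (IS_ERR(page)) {
	if (PTR_ERR(page) == -EEXIST)
		return 0;	/* mapped, but nothing to pin or inspect */
	return PTR_ERR(page);
}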
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 097c7a4bfbd9..279a818a39b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/pagemap.h> | 23 | #include <linux/pagemap.h> |
24 | #include <linux/migrate.h> | 24 | #include <linux/migrate.h> |
25 | #include <linux/hashtable.h> | 25 | #include <linux/hashtable.h> |
26 | #include <linux/userfaultfd_k.h> | ||
26 | 27 | ||
27 | #include <asm/tlb.h> | 28 | #include <asm/tlb.h> |
28 | #include <asm/pgalloc.h> | 29 | #include <asm/pgalloc.h> |
@@ -716,21 +717,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) | |||
716 | 717 | ||
717 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 718 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
718 | struct vm_area_struct *vma, | 719 | struct vm_area_struct *vma, |
719 | unsigned long haddr, pmd_t *pmd, | 720 | unsigned long address, pmd_t *pmd, |
720 | struct page *page, gfp_t gfp) | 721 | struct page *page, gfp_t gfp, |
722 | unsigned int flags) | ||
721 | { | 723 | { |
722 | struct mem_cgroup *memcg; | 724 | struct mem_cgroup *memcg; |
723 | pgtable_t pgtable; | 725 | pgtable_t pgtable; |
724 | spinlock_t *ptl; | 726 | spinlock_t *ptl; |
727 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
725 | 728 | ||
726 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 729 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
727 | 730 | ||
728 | if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) | 731 | if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { |
729 | return VM_FAULT_OOM; | 732 | put_page(page); |
733 | count_vm_event(THP_FAULT_FALLBACK); | ||
734 | return VM_FAULT_FALLBACK; | ||
735 | } | ||
730 | 736 | ||
731 | pgtable = pte_alloc_one(mm, haddr); | 737 | pgtable = pte_alloc_one(mm, haddr); |
732 | if (unlikely(!pgtable)) { | 738 | if (unlikely(!pgtable)) { |
733 | mem_cgroup_cancel_charge(page, memcg); | 739 | mem_cgroup_cancel_charge(page, memcg); |
740 | put_page(page); | ||
734 | return VM_FAULT_OOM; | 741 | return VM_FAULT_OOM; |
735 | } | 742 | } |
736 | 743 | ||
@@ -750,6 +757,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
750 | pte_free(mm, pgtable); | 757 | pte_free(mm, pgtable); |
751 | } else { | 758 | } else { |
752 | pmd_t entry; | 759 | pmd_t entry; |
760 | |||
761 | /* Deliver the page fault to userland */ | ||
762 | if (userfaultfd_missing(vma)) { | ||
763 | int ret; | ||
764 | |||
765 | spin_unlock(ptl); | ||
766 | mem_cgroup_cancel_charge(page, memcg); | ||
767 | put_page(page); | ||
768 | pte_free(mm, pgtable); | ||
769 | ret = handle_userfault(vma, address, flags, | ||
770 | VM_UFFD_MISSING); | ||
771 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | ||
772 | return ret; | ||
773 | } | ||
774 | |||
753 | entry = mk_huge_pmd(page, vma->vm_page_prot); | 775 | entry = mk_huge_pmd(page, vma->vm_page_prot); |
754 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 776 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
755 | page_add_new_anon_rmap(page, vma, haddr); | 777 | page_add_new_anon_rmap(page, vma, haddr); |
@@ -760,6 +782,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
760 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 782 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
761 | atomic_long_inc(&mm->nr_ptes); | 783 | atomic_long_inc(&mm->nr_ptes); |
762 | spin_unlock(ptl); | 784 | spin_unlock(ptl); |
785 | count_vm_event(THP_FAULT_ALLOC); | ||
763 | } | 786 | } |
764 | 787 | ||
765 | return 0; | 788 | return 0; |
@@ -771,19 +794,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | |||
771 | } | 794 | } |
772 | 795 | ||
773 | /* Caller must hold page table lock. */ | 796 | /* Caller must hold page table lock. */ |
774 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 797 | static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
775 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 798 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
776 | struct page *zero_page) | 799 | struct page *zero_page) |
777 | { | 800 | { |
778 | pmd_t entry; | 801 | pmd_t entry; |
779 | if (!pmd_none(*pmd)) | ||
780 | return false; | ||
781 | entry = mk_pmd(zero_page, vma->vm_page_prot); | 802 | entry = mk_pmd(zero_page, vma->vm_page_prot); |
782 | entry = pmd_mkhuge(entry); | 803 | entry = pmd_mkhuge(entry); |
783 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 804 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
784 | set_pmd_at(mm, haddr, pmd, entry); | 805 | set_pmd_at(mm, haddr, pmd, entry); |
785 | atomic_long_inc(&mm->nr_ptes); | 806 | atomic_long_inc(&mm->nr_ptes); |
786 | return true; | ||
787 | } | 807 | } |
788 | 808 | ||
789 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 809 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
@@ -806,6 +826,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
806 | pgtable_t pgtable; | 826 | pgtable_t pgtable; |
807 | struct page *zero_page; | 827 | struct page *zero_page; |
808 | bool set; | 828 | bool set; |
829 | int ret; | ||
809 | pgtable = pte_alloc_one(mm, haddr); | 830 | pgtable = pte_alloc_one(mm, haddr); |
810 | if (unlikely(!pgtable)) | 831 | if (unlikely(!pgtable)) |
811 | return VM_FAULT_OOM; | 832 | return VM_FAULT_OOM; |
@@ -816,14 +837,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
816 | return VM_FAULT_FALLBACK; | 837 | return VM_FAULT_FALLBACK; |
817 | } | 838 | } |
818 | ptl = pmd_lock(mm, pmd); | 839 | ptl = pmd_lock(mm, pmd); |
819 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | 840 | ret = 0; |
820 | zero_page); | 841 | set = false; |
821 | spin_unlock(ptl); | 842 | if (pmd_none(*pmd)) { |
843 | if (userfaultfd_missing(vma)) { | ||
844 | spin_unlock(ptl); | ||
845 | ret = handle_userfault(vma, address, flags, | ||
846 | VM_UFFD_MISSING); | ||
847 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | ||
848 | } else { | ||
849 | set_huge_zero_page(pgtable, mm, vma, | ||
850 | haddr, pmd, | ||
851 | zero_page); | ||
852 | spin_unlock(ptl); | ||
853 | set = true; | ||
854 | } | ||
855 | } else | ||
856 | spin_unlock(ptl); | ||
822 | if (!set) { | 857 | if (!set) { |
823 | pte_free(mm, pgtable); | 858 | pte_free(mm, pgtable); |
824 | put_huge_zero_page(); | 859 | put_huge_zero_page(); |
825 | } | 860 | } |
826 | return 0; | 861 | return ret; |
827 | } | 862 | } |
828 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); | 863 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
829 | page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); | 864 | page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); |
@@ -831,14 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
831 | count_vm_event(THP_FAULT_FALLBACK); | 866 | count_vm_event(THP_FAULT_FALLBACK); |
832 | return VM_FAULT_FALLBACK; | 867 | return VM_FAULT_FALLBACK; |
833 | } | 868 | } |
834 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { | 869 | return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, |
835 | put_page(page); | 870 | flags); |
836 | count_vm_event(THP_FAULT_FALLBACK); | ||
837 | return VM_FAULT_FALLBACK; | ||
838 | } | ||
839 | |||
840 | count_vm_event(THP_FAULT_ALLOC); | ||
841 | return 0; | ||
842 | } | 871 | } |
843 | 872 | ||
844 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 873 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
@@ -873,16 +902,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
873 | */ | 902 | */ |
874 | if (is_huge_zero_pmd(pmd)) { | 903 | if (is_huge_zero_pmd(pmd)) { |
875 | struct page *zero_page; | 904 | struct page *zero_page; |
876 | bool set; | ||
877 | /* | 905 | /* |
878 | * get_huge_zero_page() will never allocate a new page here, | 906 | * get_huge_zero_page() will never allocate a new page here, |
879 | * since we already have a zero page to copy. It just takes a | 907 | * since we already have a zero page to copy. It just takes a |
880 | * reference. | 908 | * reference. |
881 | */ | 909 | */ |
882 | zero_page = get_huge_zero_page(); | 910 | zero_page = get_huge_zero_page(); |
883 | set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, | 911 | set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, |
884 | zero_page); | 912 | zero_page); |
885 | BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ | ||
886 | ret = 0; | 913 | ret = 0; |
887 | goto out_unlock; | 914 | goto out_unlock; |
888 | } | 915 | } |
@@ -2133,7 +2160,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2133 | _pte++, address += PAGE_SIZE) { | 2160 | _pte++, address += PAGE_SIZE) { |
2134 | pte_t pteval = *_pte; | 2161 | pte_t pteval = *_pte; |
2135 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { | 2162 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { |
2136 | if (++none_or_zero <= khugepaged_max_ptes_none) | 2163 | if (!userfaultfd_armed(vma) && |
2164 | ++none_or_zero <= khugepaged_max_ptes_none) | ||
2137 | continue; | 2165 | continue; |
2138 | else | 2166 | else |
2139 | goto out; | 2167 | goto out; |
@@ -2586,7 +2614,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2586 | _pte++, _address += PAGE_SIZE) { | 2614 | _pte++, _address += PAGE_SIZE) { |
2587 | pte_t pteval = *_pte; | 2615 | pte_t pteval = *_pte; |
2588 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { | 2616 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { |
2589 | if (++none_or_zero <= khugepaged_max_ptes_none) | 2617 | if (!userfaultfd_armed(vma) && |
2618 | ++none_or_zero <= khugepaged_max_ptes_none) | ||
2590 | continue; | 2619 | continue; |
2591 | else | 2620 | else |
2592 | goto out_unmap; | 2621 | goto out_unmap; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c3087089d8..51ae41d0fbc0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -616,7 +616,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | |||
616 | } | 616 | } |
617 | 617 | ||
618 | /* Returns true if the VMA has associated reserve pages */ | 618 | /* Returns true if the VMA has associated reserve pages */ |
619 | static int vma_has_reserves(struct vm_area_struct *vma, long chg) | 619 | static bool vma_has_reserves(struct vm_area_struct *vma, long chg) |
620 | { | 620 | { |
621 | if (vma->vm_flags & VM_NORESERVE) { | 621 | if (vma->vm_flags & VM_NORESERVE) { |
622 | /* | 622 | /* |
@@ -629,23 +629,23 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) | |||
629 | * properly, so add work-around here. | 629 | * properly, so add work-around here. |
630 | */ | 630 | */ |
631 | if (vma->vm_flags & VM_MAYSHARE && chg == 0) | 631 | if (vma->vm_flags & VM_MAYSHARE && chg == 0) |
632 | return 1; | 632 | return true; |
633 | else | 633 | else |
634 | return 0; | 634 | return false; |
635 | } | 635 | } |
636 | 636 | ||
637 | /* Shared mappings always use reserves */ | 637 | /* Shared mappings always use reserves */ |
638 | if (vma->vm_flags & VM_MAYSHARE) | 638 | if (vma->vm_flags & VM_MAYSHARE) |
639 | return 1; | 639 | return true; |
640 | 640 | ||
641 | /* | 641 | /* |
642 | * Only the process that called mmap() has reserves for | 642 | * Only the process that called mmap() has reserves for |
643 | * private mappings. | 643 | * private mappings. |
644 | */ | 644 | */ |
645 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | 645 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
646 | return 1; | 646 | return true; |
647 | 647 | ||
648 | return 0; | 648 | return false; |
649 | } | 649 | } |
650 | 650 | ||
651 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 651 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
@@ -3779,7 +3779,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, | |||
3779 | return saddr; | 3779 | return saddr; |
3780 | } | 3780 | } |
3781 | 3781 | ||
3782 | static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) | 3782 | static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) |
3783 | { | 3783 | { |
3784 | unsigned long base = addr & PUD_MASK; | 3784 | unsigned long base = addr & PUD_MASK; |
3785 | unsigned long end = base + PUD_SIZE; | 3785 | unsigned long end = base + PUD_SIZE; |
@@ -3789,8 +3789,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) | |||
3789 | */ | 3789 | */ |
3790 | if (vma->vm_flags & VM_MAYSHARE && | 3790 | if (vma->vm_flags & VM_MAYSHARE && |
3791 | vma->vm_start <= base && end <= vma->vm_end) | 3791 | vma->vm_start <= base && end <= vma->vm_end) |
3792 | return 1; | 3792 | return true; |
3793 | return 0; | 3793 | return false; |
3794 | } | 3794 | } |
3795 | 3795 | ||
3796 | /* | 3796 | /* |
diff --git a/mm/internal.h b/mm/internal.h index 36b23f1e2ca6..1195dd2d6a2b 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -426,4 +426,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
426 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ | 426 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ |
427 | #define ALLOC_FAIR 0x100 /* fair zone allocation */ | 427 | #define ALLOC_FAIR 0x100 /* fair zone allocation */ |
428 | 428 | ||
429 | enum ttu_flags; | ||
430 | struct tlbflush_unmap_batch; | ||
431 | |||
432 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH | ||
433 | void try_to_unmap_flush(void); | ||
434 | void try_to_unmap_flush_dirty(void); | ||
435 | #else | ||
436 | static inline void try_to_unmap_flush(void) | ||
437 | { | ||
438 | } | ||
439 | static inline void try_to_unmap_flush_dirty(void) | ||
440 | { | ||
441 | } | ||
442 | |||
443 | #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ | ||
429 | #endif /* __MM_INTERNAL_H */ | 444 | #endif /* __MM_INTERNAL_H */ |
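These hooks exist so reclaim can defer and coalesce TLB shootdowns. A heavily simplified sketch of the pattern (the real logic lives in mm/rmap.c and mm/vmscan.c; TTU_BATCH_FLUSH is the flag this series adds):

list_for_each_entry(page, page_list, lru)
	try_to_unmap(page, TTU_UNMAP | TTU_BATCH_FLUSH);

/* one IPI per CPU for the whole batch, not one per unmapped page */
try_to_unmap_flush();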
diff --git a/mm/madvise.c b/mm/madvise.c index 64bb8a22110c..ce3a4222c7e7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma, | |||
103 | 103 | ||
104 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 104 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
105 | *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, | 105 | *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, |
106 | vma->vm_file, pgoff, vma_policy(vma)); | 106 | vma->vm_file, pgoff, vma_policy(vma), |
107 | vma->vm_userfaultfd_ctx); | ||
107 | if (*prev) { | 108 | if (*prev) { |
108 | vma = *prev; | 109 | vma = *prev; |
109 | goto success; | 110 | goto success; |
@@ -385,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
385 | } | 386 | } |
386 | } | 387 | } |
387 | 388 | ||
388 | static int | 389 | static bool |
389 | madvise_behavior_valid(int behavior) | 390 | madvise_behavior_valid(int behavior) |
390 | { | 391 | { |
391 | switch (behavior) { | 392 | switch (behavior) { |
@@ -407,10 +408,10 @@ madvise_behavior_valid(int behavior) | |||
407 | #endif | 408 | #endif |
408 | case MADV_DONTDUMP: | 409 | case MADV_DONTDUMP: |
409 | case MADV_DODUMP: | 410 | case MADV_DODUMP: |
410 | return 1; | 411 | return true; |
411 | 412 | ||
412 | default: | 413 | default: |
413 | return 0; | 414 | return false; |
414 | } | 415 | } |
415 | } | 416 | } |
416 | 417 | ||
diff --git a/mm/memblock.c b/mm/memblock.c index 87108e77e476..95ce68c6da8a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -566,6 +566,9 @@ repeat: | |||
566 | * area, insert that portion. | 566 | * area, insert that portion. |
567 | */ | 567 | */ |
568 | if (rbase > base) { | 568 | if (rbase > base) { |
569 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
570 | WARN_ON(nid != memblock_get_region_node(rgn)); | ||
571 | #endif | ||
569 | nr_new++; | 572 | nr_new++; |
570 | if (insert) | 573 | if (insert) |
571 | memblock_insert_region(type, i++, base, | 574 | memblock_insert_region(type, i++, base, |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index acb93c554f6e..1af057575ce9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -5965,7 +5965,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | |||
5965 | if (!mem_cgroup_is_root(memcg)) | 5965 | if (!mem_cgroup_is_root(memcg)) |
5966 | page_counter_uncharge(&memcg->memory, 1); | 5966 | page_counter_uncharge(&memcg->memory, 1); |
5967 | 5967 | ||
5968 | /* Caller disabled preemption with mapping->tree_lock */ | 5968 | /* |
5969 | * Interrupts should be disabled here because the caller holds the | ||
5970 | * mapping->tree_lock lock which is taken with interrupts-off. It is | ||
5971 | * important here to have the interrupts disabled because it is the | ||
5971 | * only synchronisation we have for updating the per-CPU variables. | ||
5973 | */ | ||
5974 | VM_BUG_ON(!irqs_disabled()); | ||
5969 | mem_cgroup_charge_statistics(memcg, page, -1); | 5975 | mem_cgroup_charge_statistics(memcg, page, -1); |
5970 | memcg_check_events(memcg, page); | 5976 | memcg_check_events(memcg, page); |
5971 | } | 5977 | } |
diff --git a/mm/memory.c b/mm/memory.c index 388dcf9aa283..bb04d8f2f86c 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -61,6 +61,7 @@ | |||
61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
62 | #include <linux/dma-debug.h> | 62 | #include <linux/dma-debug.h> |
63 | #include <linux/debugfs.h> | 63 | #include <linux/debugfs.h> |
64 | #include <linux/userfaultfd_k.h> | ||
64 | 65 | ||
65 | #include <asm/io.h> | 66 | #include <asm/io.h> |
66 | #include <asm/pgalloc.h> | 67 | #include <asm/pgalloc.h> |
@@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task) | |||
180 | 181 | ||
181 | #ifdef HAVE_GENERIC_MMU_GATHER | 182 | #ifdef HAVE_GENERIC_MMU_GATHER |
182 | 183 | ||
183 | static int tlb_next_batch(struct mmu_gather *tlb) | 184 | static bool tlb_next_batch(struct mmu_gather *tlb) |
184 | { | 185 | { |
185 | struct mmu_gather_batch *batch; | 186 | struct mmu_gather_batch *batch; |
186 | 187 | ||
187 | batch = tlb->active; | 188 | batch = tlb->active; |
188 | if (batch->next) { | 189 | if (batch->next) { |
189 | tlb->active = batch->next; | 190 | tlb->active = batch->next; |
190 | return 1; | 191 | return true; |
191 | } | 192 | } |
192 | 193 | ||
193 | if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) | 194 | if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) |
194 | return 0; | 195 | return false; |
195 | 196 | ||
196 | batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); | 197 | batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); |
197 | if (!batch) | 198 | if (!batch) |
198 | return 0; | 199 | return false; |
199 | 200 | ||
200 | tlb->batch_count++; | 201 | tlb->batch_count++; |
201 | batch->next = NULL; | 202 | batch->next = NULL; |
@@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb) | |||
205 | tlb->active->next = batch; | 206 | tlb->active->next = batch; |
206 | tlb->active = batch; | 207 | tlb->active = batch; |
207 | 208 | ||
208 | return 1; | 209 | return true; |
209 | } | 210 | } |
210 | 211 | ||
211 | /* tlb_gather_mmu | 212 | /* tlb_gather_mmu |
@@ -2685,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2685 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2686 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2686 | if (!pte_none(*page_table)) | 2687 | if (!pte_none(*page_table)) |
2687 | goto unlock; | 2688 | goto unlock; |
2689 | /* Deliver the page fault to userland, check inside PT lock */ | ||
2690 | if (userfaultfd_missing(vma)) { | ||
2691 | pte_unmap_unlock(page_table, ptl); | ||
2692 | return handle_userfault(vma, address, flags, | ||
2693 | VM_UFFD_MISSING); | ||
2694 | } | ||
2688 | goto setpte; | 2695 | goto setpte; |
2689 | } | 2696 | } |
2690 | 2697 | ||
@@ -2713,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2713 | if (!pte_none(*page_table)) | 2720 | if (!pte_none(*page_table)) |
2714 | goto release; | 2721 | goto release; |
2715 | 2722 | ||
2723 | /* Deliver the page fault to userland, check inside PT lock */ | ||
2724 | if (userfaultfd_missing(vma)) { | ||
2725 | pte_unmap_unlock(page_table, ptl); | ||
2726 | mem_cgroup_cancel_charge(page, memcg); | ||
2727 | page_cache_release(page); | ||
2728 | return handle_userfault(vma, address, flags, | ||
2729 | VM_UFFD_MISSING); | ||
2730 | } | ||
2731 | |||
2716 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2732 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2717 | page_add_new_anon_rmap(page, vma, address); | 2733 | page_add_new_anon_rmap(page, vma, address); |
2718 | mem_cgroup_commit_charge(page, memcg, false); | 2734 | mem_cgroup_commit_charge(page, memcg, false); |
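handle_userfault() above is the kernel half of userfaultfd. A compressed userspace sketch of resolving a missing-page fault ('area', 'len', 'page_sz' and 'src_page' are placeholders; error handling elided):

#include <linux/userfaultfd.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>

int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);

struct uffdio_api api = { .api = UFFD_API };
ioctl(uffd, UFFDIO_API, &api);

struct uffdio_register reg = {
	.range = { .start = (unsigned long)area, .len = len },
	.mode  = UFFDIO_REGISTER_MODE_MISSING,
};
ioctl(uffd, UFFDIO_REGISTER, &reg);

struct uffd_msg msg;
read(uffd, &msg, sizeof(msg));		/* blocks until a fault arrives */
if (msg.event == UFFD_EVENT_PAGEFAULT) {
	struct uffdio_copy copy = {
		.dst = msg.arg.pagefault.address & ~(page_sz - 1),
		.src = (unsigned long)src_page,
		.len = page_sz,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);	/* wakes the faulting thread */
}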
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6da82bcb0a8b..8fd97dac538a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1248,6 +1248,14 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
1248 | 1248 | ||
1249 | mem_hotplug_begin(); | 1249 | mem_hotplug_begin(); |
1250 | 1250 | ||
1251 | /* | ||
1252 | * Add new range to memblock so that when hotadd_new_pgdat() is called | ||
1253 | * to allocate new pgdat, get_pfn_range_for_nid() will be able to find | ||
1254 | * this new range and calculate total pages correctly. The range will | ||
1255 | * be removed at hot-remove time. | ||
1256 | */ | ||
1257 | memblock_add_node(start, size, nid); | ||
1258 | |||
1251 | new_node = !node_online(nid); | 1259 | new_node = !node_online(nid); |
1252 | if (new_node) { | 1260 | if (new_node) { |
1253 | pgdat = hotadd_new_pgdat(nid, start); | 1261 | pgdat = hotadd_new_pgdat(nid, start); |
@@ -1277,7 +1285,6 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
1277 | 1285 | ||
1278 | /* create new memmap entry */ | 1286 | /* create new memmap entry */ |
1279 | firmware_map_add_hotplug(start, start + size, "System RAM"); | 1287 | firmware_map_add_hotplug(start, start + size, "System RAM"); |
1280 | memblock_add_node(start, size, nid); | ||
1281 | 1288 | ||
1282 | goto out; | 1289 | goto out; |
1283 | 1290 | ||
@@ -1286,6 +1293,7 @@ error: | |||
1286 | if (new_pgdat) | 1293 | if (new_pgdat) |
1287 | rollback_node_hotadd(nid, pgdat); | 1294 | rollback_node_hotadd(nid, pgdat); |
1288 | release_memory_resource(res); | 1295 | release_memory_resource(res); |
1296 | memblock_remove(start, size); | ||
1289 | 1297 | ||
1290 | out: | 1298 | out: |
1291 | mem_hotplug_done(); | 1299 | mem_hotplug_done(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 99d4c1d0b858..a7f1e0d1d6b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
722 | pgoff = vma->vm_pgoff + | 722 | pgoff = vma->vm_pgoff + |
723 | ((vmstart - vma->vm_start) >> PAGE_SHIFT); | 723 | ((vmstart - vma->vm_start) >> PAGE_SHIFT); |
724 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, | 724 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
725 | vma->anon_vma, vma->vm_file, pgoff, | 725 | vma->anon_vma, vma->vm_file, pgoff, |
726 | new_pol); | 726 | new_pol, vma->vm_userfaultfd_ctx); |
727 | if (prev) { | 727 | if (prev) { |
728 | vma = prev; | 728 | vma = prev; |
729 | next = vma->vm_next; | 729 | next = vma->vm_next; |
diff --git a/mm/migrate.c b/mm/migrate.c index eb4267107d1f..5c08cab5419e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -1226,7 +1226,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1226 | if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) | 1226 | if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) |
1227 | goto set_status; | 1227 | goto set_status; |
1228 | 1228 | ||
1229 | page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); | 1229 | /* FOLL_DUMP to ignore special (like zero) pages */ |
1230 | page = follow_page(vma, pp->addr, | ||
1231 | FOLL_GET | FOLL_SPLIT | FOLL_DUMP); | ||
1230 | 1232 | ||
1231 | err = PTR_ERR(page); | 1233 | err = PTR_ERR(page); |
1232 | if (IS_ERR(page)) | 1234 | if (IS_ERR(page)) |
@@ -1236,10 +1238,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1236 | if (!page) | 1238 | if (!page) |
1237 | goto set_status; | 1239 | goto set_status; |
1238 | 1240 | ||
1239 | /* Use PageReserved to check for zero page */ | ||
1240 | if (PageReserved(page)) | ||
1241 | goto put_and_set; | ||
1242 | |||
1243 | pp->page = page; | 1241 | pp->page = page; |
1244 | err = page_to_nid(page); | 1242 | err = page_to_nid(page); |
1245 | 1243 | ||
@@ -1396,18 +1394,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, | |||
1396 | if (!vma || addr < vma->vm_start) | 1394 | if (!vma || addr < vma->vm_start) |
1397 | goto set_status; | 1395 | goto set_status; |
1398 | 1396 | ||
1399 | page = follow_page(vma, addr, 0); | 1397 | /* FOLL_DUMP to ignore special (like zero) pages */ |
1398 | page = follow_page(vma, addr, FOLL_DUMP); | ||
1400 | 1399 | ||
1401 | err = PTR_ERR(page); | 1400 | err = PTR_ERR(page); |
1402 | if (IS_ERR(page)) | 1401 | if (IS_ERR(page)) |
1403 | goto set_status; | 1402 | goto set_status; |
1404 | 1403 | ||
1405 | err = -ENOENT; | 1404 | err = page ? page_to_nid(page) : -ENOENT; |
1406 | /* Use PageReserved to check for zero page */ | ||
1407 | if (!page || PageReserved(page)) | ||
1408 | goto set_status; | ||
1409 | |||
1410 | err = page_to_nid(page); | ||
1411 | set_status: | 1405 | set_status: |
1412 | *status = err; | 1406 | *status = err; |
1413 | 1407 | ||
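User-visible consequence: in move_pages()'s query mode a zero-page mapping now reports -EFAULT in status[] rather than hiding behind -ENOENT. A sketch ('addr' is a placeholder; link with -lnuma):

#include <numaif.h>
#include <stdio.h>

void *pages[1] = { addr };
int status[1];

/* nodes == NULL: query mode, report each page's node in status[] */
if (move_pages(0, 1, pages, NULL, status, 0) == 0) {
	if (status[0] >= 0)
		printf("on node %d\n", status[0]);
	else if (status[0] == -ENOENT)
		printf("not mapped\n");
	else if (status[0] == -EFAULT)
		printf("zero page or pfn mapping\n");
}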
diff --git a/mm/mlock.c b/mm/mlock.c index 6fd2cf15e868..25936680064f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
510 | 510 | ||
511 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 511 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
512 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, | 512 | *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, |
513 | vma->vm_file, pgoff, vma_policy(vma)); | 513 | vma->vm_file, pgoff, vma_policy(vma), |
514 | vma->vm_userfaultfd_ctx); | ||
514 | if (*prev) { | 515 | if (*prev) { |
515 | vma = *prev; | 516 | vma = *prev; |
516 | goto success; | 517 | goto success; |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/memory.h> | 42 | #include <linux/memory.h> |
43 | #include <linux/printk.h> | 43 | #include <linux/printk.h> |
44 | #include <linux/userfaultfd_k.h> | ||
44 | 45 | ||
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
46 | #include <asm/cacheflush.h> | 47 | #include <asm/cacheflush.h> |
@@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
919 | * per-vma resources, so we don't attempt to merge those. | 920 | * per-vma resources, so we don't attempt to merge those. |
920 | */ | 921 | */ |
921 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 922 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
922 | struct file *file, unsigned long vm_flags) | 923 | struct file *file, unsigned long vm_flags, |
924 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx) | ||
923 | { | 925 | { |
924 | /* | 926 | /* |
925 | * VM_SOFTDIRTY should not prevent from VMA merging, if we | 927 | * VM_SOFTDIRTY should not prevent from VMA merging, if we |
@@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, | |||
935 | return 0; | 937 | return 0; |
936 | if (vma->vm_ops && vma->vm_ops->close) | 938 | if (vma->vm_ops && vma->vm_ops->close) |
937 | return 0; | 939 | return 0; |
940 | if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) | ||
941 | return 0; | ||
938 | return 1; | 942 | return 1; |
939 | } | 943 | } |
940 | 944 | ||
@@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, | |||
965 | */ | 969 | */ |
966 | static int | 970 | static int |
967 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, | 971 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, |
968 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 972 | struct anon_vma *anon_vma, struct file *file, |
973 | pgoff_t vm_pgoff, | ||
974 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx) | ||
969 | { | 975 | { |
970 | if (is_mergeable_vma(vma, file, vm_flags) && | 976 | if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && |
971 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { | 977 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
972 | if (vma->vm_pgoff == vm_pgoff) | 978 | if (vma->vm_pgoff == vm_pgoff) |
973 | return 1; | 979 | return 1; |
@@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, | |||
984 | */ | 990 | */ |
985 | static int | 991 | static int |
986 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | 992 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, |
987 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 993 | struct anon_vma *anon_vma, struct file *file, |
994 | pgoff_t vm_pgoff, | ||
995 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx) | ||
988 | { | 996 | { |
989 | if (is_mergeable_vma(vma, file, vm_flags) && | 997 | if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && |
990 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { | 998 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
991 | pgoff_t vm_pglen; | 999 | pgoff_t vm_pglen; |
992 | vm_pglen = vma_pages(vma); | 1000 | vm_pglen = vma_pages(vma); |
@@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
1029 | struct vm_area_struct *prev, unsigned long addr, | 1037 | struct vm_area_struct *prev, unsigned long addr, |
1030 | unsigned long end, unsigned long vm_flags, | 1038 | unsigned long end, unsigned long vm_flags, |
1031 | struct anon_vma *anon_vma, struct file *file, | 1039 | struct anon_vma *anon_vma, struct file *file, |
1032 | pgoff_t pgoff, struct mempolicy *policy) | 1040 | pgoff_t pgoff, struct mempolicy *policy, |
1041 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx) | ||
1033 | { | 1042 | { |
1034 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 1043 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
1035 | struct vm_area_struct *area, *next; | 1044 | struct vm_area_struct *area, *next; |
@@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
1056 | if (prev && prev->vm_end == addr && | 1065 | if (prev && prev->vm_end == addr && |
1057 | mpol_equal(vma_policy(prev), policy) && | 1066 | mpol_equal(vma_policy(prev), policy) && |
1058 | can_vma_merge_after(prev, vm_flags, | 1067 | can_vma_merge_after(prev, vm_flags, |
1059 | anon_vma, file, pgoff)) { | 1068 | anon_vma, file, pgoff, |
1069 | vm_userfaultfd_ctx)) { | ||
1060 | /* | 1070 | /* |
1061 | * OK, it can. Can we now merge in the successor as well? | 1071 | * OK, it can. Can we now merge in the successor as well? |
1062 | */ | 1072 | */ |
1063 | if (next && end == next->vm_start && | 1073 | if (next && end == next->vm_start && |
1064 | mpol_equal(policy, vma_policy(next)) && | 1074 | mpol_equal(policy, vma_policy(next)) && |
1065 | can_vma_merge_before(next, vm_flags, | 1075 | can_vma_merge_before(next, vm_flags, |
1066 | anon_vma, file, pgoff+pglen) && | 1076 | anon_vma, file, |
1077 | pgoff+pglen, | ||
1078 | vm_userfaultfd_ctx) && | ||
1067 | is_mergeable_anon_vma(prev->anon_vma, | 1079 | is_mergeable_anon_vma(prev->anon_vma, |
1068 | next->anon_vma, NULL)) { | 1080 | next->anon_vma, NULL)) { |
1069 | /* cases 1, 6 */ | 1081 | /* cases 1, 6 */ |
@@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
1084 | if (next && end == next->vm_start && | 1096 | if (next && end == next->vm_start && |
1085 | mpol_equal(policy, vma_policy(next)) && | 1097 | mpol_equal(policy, vma_policy(next)) && |
1086 | can_vma_merge_before(next, vm_flags, | 1098 | can_vma_merge_before(next, vm_flags, |
1087 | anon_vma, file, pgoff+pglen)) { | 1099 | anon_vma, file, pgoff+pglen, |
1100 | vm_userfaultfd_ctx)) { | ||
1088 | if (prev && addr < prev->vm_end) /* case 4 */ | 1101 | if (prev && addr < prev->vm_end) /* case 4 */ |
1089 | err = vma_adjust(prev, prev->vm_start, | 1102 | err = vma_adjust(prev, prev->vm_start, |
1090 | addr, prev->vm_pgoff, NULL); | 1103 | addr, prev->vm_pgoff, NULL); |
@@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1570 | /* | 1583 | /* |
1571 | * Can we just expand an old mapping? | 1584 | * Can we just expand an old mapping? |
1572 | */ | 1585 | */ |
1573 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, | 1586 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, |
1574 | NULL); | 1587 | NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); |
1575 | if (vma) | 1588 | if (vma) |
1576 | goto out; | 1589 | goto out; |
1577 | 1590 | ||
@@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2757 | 2770 | ||
2758 | /* Can we just expand an old private anonymous mapping? */ | 2771 | /* Can we just expand an old private anonymous mapping? */ |
2759 | vma = vma_merge(mm, prev, addr, addr + len, flags, | 2772 | vma = vma_merge(mm, prev, addr, addr + len, flags, |
2760 | NULL, NULL, pgoff, NULL); | 2773 | NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); |
2761 | if (vma) | 2774 | if (vma) |
2762 | goto out; | 2775 | goto out; |
2763 | 2776 | ||
@@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2913 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) | 2926 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) |
2914 | return NULL; /* should never get here */ | 2927 | return NULL; /* should never get here */ |
2915 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2928 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
2916 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | 2929 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), |
2930 | vma->vm_userfaultfd_ctx); | ||
2917 | if (new_vma) { | 2931 | if (new_vma) { |
2918 | /* | 2932 | /* |
2919 | * Source vma may have been merged into new_vma | 2933 | * Source vma may have been merged into new_vma |
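Every vma_merge() caller in this patch (mlock.c and mprotect.c included) now threads a struct vm_userfaultfd_ctx through to is_mergeable_vma(), and callers creating fresh mappings pass NULL_VM_UFFD_CTX. The rule this enforces is plain pointer equality of the userfaultfd context. A hedged sketch of that rule (the real helper is is_mergeable_vm_userfaultfd_ctx(); treat the snippet as illustrative, not verbatim):

    /* Illustrative only: two VMAs may merge only when they are watched by
     * the same userfaultfd (or both by none); otherwise a merge would
     * silently extend or drop a registered range. */
    struct userfaultfd_ctx;
    struct vm_userfaultfd_ctx { struct userfaultfd_ctx *ctx; };

    static inline int uffd_ctx_mergeable(struct vm_userfaultfd_ctx a,
                                         struct vm_userfaultfd_ctx b)
    {
            return a.ctx == b.ctx; /* NULL == NULL also merges: no uffd on either side */
    }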
diff --git a/mm/mprotect.c b/mm/mprotect.c index e7d6f1171ecb..ef5be8eaab00 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
292 | */ | 292 | */ |
293 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 293 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
294 | *pprev = vma_merge(mm, *pprev, start, end, newflags, | 294 | *pprev = vma_merge(mm, *pprev, start, end, newflags, |
295 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | 295 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), |
296 | vma->vm_userfaultfd_ctx); | ||
296 | if (*pprev) { | 297 | if (*pprev) { |
297 | vma = *pprev; | 298 | vma = *pprev; |
298 | goto success; | 299 | goto success; |
diff --git a/mm/mremap.c b/mm/mremap.c index a7c93eceb1c8..5a71cce8c6ea 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
276 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, | 276 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, |
277 | need_rmap_locks); | 277 | need_rmap_locks); |
278 | if (moved_len < old_len) { | 278 | if (moved_len < old_len) { |
279 | err = -ENOMEM; | ||
280 | } else if (vma->vm_ops && vma->vm_ops->mremap) { | ||
281 | err = vma->vm_ops->mremap(new_vma); | ||
282 | } | ||
283 | |||
284 | if (unlikely(err)) { | ||
279 | /* | 285 | /* |
280 | * On error, move entries back from new area to old, | 286 | * On error, move entries back from new area to old, |
281 | * which will succeed since page tables still there, | 287 | * which will succeed since page tables still there, |
@@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
286 | vma = new_vma; | 292 | vma = new_vma; |
287 | old_len = new_len; | 293 | old_len = new_len; |
288 | old_addr = new_addr; | 294 | old_addr = new_addr; |
289 | new_addr = -ENOMEM; | 295 | new_addr = err; |
290 | } else { | 296 | } else { |
291 | if (vma->vm_file && vma->vm_file->f_op->mremap) { | ||
292 | err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); | ||
293 | if (err < 0) { | ||
294 | move_page_tables(new_vma, new_addr, vma, | ||
295 | old_addr, moved_len, true); | ||
296 | return err; | ||
297 | } | ||
298 | } | ||
299 | arch_remap(mm, old_addr, old_addr + old_len, | 297 | arch_remap(mm, old_addr, old_addr + old_len, |
300 | new_addr, new_addr + new_len); | 298 | new_addr, new_addr + new_len); |
301 | } | 299 | } |
@@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
348 | { | 346 | { |
349 | struct mm_struct *mm = current->mm; | 347 | struct mm_struct *mm = current->mm; |
350 | struct vm_area_struct *vma = find_vma(mm, addr); | 348 | struct vm_area_struct *vma = find_vma(mm, addr); |
349 | unsigned long pgoff; | ||
351 | 350 | ||
352 | if (!vma || vma->vm_start > addr) | 351 | if (!vma || vma->vm_start > addr) |
353 | return ERR_PTR(-EFAULT); | 352 | return ERR_PTR(-EFAULT); |
@@ -359,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
359 | if (old_len > vma->vm_end - addr) | 358 | if (old_len > vma->vm_end - addr) |
360 | return ERR_PTR(-EFAULT); | 359 | return ERR_PTR(-EFAULT); |
361 | 360 | ||
361 | if (new_len == old_len) | ||
362 | return vma; | ||
363 | |||
362 | /* Need to be careful about a growing mapping */ | 364 | /* Need to be careful about a growing mapping */ |
363 | if (new_len > old_len) { | 365 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; |
364 | unsigned long pgoff; | 366 | pgoff += vma->vm_pgoff; |
365 | 367 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) | |
366 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) | 368 | return ERR_PTR(-EINVAL); |
367 | return ERR_PTR(-EFAULT); | 369 | |
368 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; | 370 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) |
369 | pgoff += vma->vm_pgoff; | 371 | return ERR_PTR(-EFAULT); |
370 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) | ||
371 | return ERR_PTR(-EINVAL); | ||
372 | } | ||
373 | 372 | ||
374 | if (vma->vm_flags & VM_LOCKED) { | 373 | if (vma->vm_flags & VM_LOCKED) { |
375 | unsigned long locked, lock_limit; | 374 | unsigned long locked, lock_limit; |
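With the new_len == old_len case returning early, the pgoff computation in vma_to_resize() now runs for every growing resize and doubles as an overflow guard: if the file offset in pages plus the new length in pages wraps, the unsigned sum comes out smaller than the starting offset. A runnable sketch of just that test:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* Mirrors the wrap check in vma_to_resize(): unsigned addition that
     * overflows yields a result below either operand. */
    static int pgoff_wraps(unsigned long pgoff, unsigned long new_len)
    {
            return pgoff + (new_len >> PAGE_SHIFT) < pgoff;
    }

    int main(void)
    {
            printf("%d\n", pgoff_wraps(100UL, 1UL << 20));     /* 0: fits */
            printf("%d\n", pgoff_wraps(~0UL - 10, 1UL << 20)); /* 1: wraps -> -EINVAL */
            return 0;
    }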
@@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, | |||
408 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | 407 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) |
409 | goto out; | 408 | goto out; |
410 | 409 | ||
411 | /* Check if the location we're moving into overlaps the | 410 | /* Ensure the old/new locations do not overlap */ |
412 | * old location at all, and fail if it does. | 411 | if (addr + old_len > new_addr && new_addr + new_len > addr) |
413 | */ | ||
414 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
415 | goto out; | ||
416 | |||
417 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
418 | goto out; | 412 | goto out; |
419 | 413 | ||
420 | ret = do_munmap(mm, new_addr, new_len); | 414 | ret = do_munmap(mm, new_addr, new_len); |
@@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
580 | ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); | 574 | ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); |
581 | } | 575 | } |
582 | out: | 576 | out: |
583 | if (ret & ~PAGE_MASK) | 577 | if (ret & ~PAGE_MASK) { |
584 | vm_unacct_memory(charged); | 578 | vm_unacct_memory(charged); |
579 | locked = 0; | ||
580 | } | ||
585 | up_write(¤t->mm->mmap_sem); | 581 | up_write(¤t->mm->mmap_sem); |
586 | if (locked && new_len > old_len) | 582 | if (locked && new_len > old_len) |
587 | mm_populate(new_addr + old_len, new_len - old_len); | 583 | mm_populate(new_addr + old_len, new_len - old_len); |
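The mremap_to() simplification above collapses two asymmetric comparisons into the canonical interval test: the half-open ranges [addr, addr+old_len) and [new_addr, new_addr+new_len) intersect exactly when each starts before the other ends. A runnable sketch:

    #include <stdio.h>

    /* Two half-open ranges [a, a+alen) and [b, b+blen) overlap iff each
     * begins before the other ends -- the check mremap_to() now uses. */
    static int ranges_overlap(unsigned long a, unsigned long alen,
                              unsigned long b, unsigned long blen)
    {
            return a + alen > b && b + blen > a;
    }

    int main(void)
    {
            printf("%d\n", ranges_overlap(0x1000, 0x2000, 0x2000, 0x1000)); /* 1 */
            printf("%d\n", ranges_overlap(0x1000, 0x1000, 0x2000, 0x1000)); /* 0: merely adjacent */
            return 0;
    }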
@@ -62,6 +62,8 @@ | |||
62 | 62 | ||
63 | #include <asm/tlbflush.h> | 63 | #include <asm/tlbflush.h> |
64 | 64 | ||
65 | #include <trace/events/tlb.h> | ||
66 | |||
65 | #include "internal.h" | 67 | #include "internal.h" |
66 | 68 | ||
67 | static struct kmem_cache *anon_vma_cachep; | 69 | static struct kmem_cache *anon_vma_cachep; |
@@ -583,6 +585,107 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
583 | return address; | 585 | return address; |
584 | } | 586 | } |
585 | 587 | ||
588 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH | ||
589 | static void percpu_flush_tlb_batch_pages(void *data) | ||
590 | { | ||
591 | /* | ||
592 | * All TLB entries are flushed on the assumption that it is | ||
593 | * cheaper to flush all TLBs and let them be refilled than | ||
594 | * flushing individual PFNs. Note that we do not track mm's | ||
595 | * to flush as that might simply be multiple full TLB flushes | ||
596 | * for no gain. | ||
597 | */ | ||
598 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); | ||
599 | flush_tlb_local(); | ||
600 | } | ||
601 | |||
602 | /* | ||
603 | * Flush TLB entries for recently unmapped pages from remote CPUs. It is | ||
604 | * important that, if a PTE was dirty when it was unmapped, it is flushed | ||
605 | * before any IO is initiated on the page, to prevent lost writes. Similarly, | ||
606 | * it must be flushed before freeing to prevent data leakage. | ||
607 | */ | ||
608 | void try_to_unmap_flush(void) | ||
609 | { | ||
610 | struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; | ||
611 | int cpu; | ||
612 | |||
613 | if (!tlb_ubc->flush_required) | ||
614 | return; | ||
615 | |||
616 | cpu = get_cpu(); | ||
617 | |||
618 | trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); | ||
619 | |||
620 | if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) | ||
621 | percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); | ||
622 | |||
623 | if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { | ||
624 | smp_call_function_many(&tlb_ubc->cpumask, | ||
625 | percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); | ||
626 | } | ||
627 | cpumask_clear(&tlb_ubc->cpumask); | ||
628 | tlb_ubc->flush_required = false; | ||
629 | tlb_ubc->writable = false; | ||
630 | put_cpu(); | ||
631 | } | ||
632 | |||
633 | /* Flush iff there are potentially writable TLB entries that can race with IO */ | ||
634 | void try_to_unmap_flush_dirty(void) | ||
635 | { | ||
636 | struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; | ||
637 | |||
638 | if (tlb_ubc->writable) | ||
639 | try_to_unmap_flush(); | ||
640 | } | ||
641 | |||
642 | static void set_tlb_ubc_flush_pending(struct mm_struct *mm, | ||
643 | struct page *page, bool writable) | ||
644 | { | ||
645 | struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; | ||
646 | |||
647 | cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); | ||
648 | tlb_ubc->flush_required = true; | ||
649 | |||
650 | /* | ||
651 | * If the PTE was dirty then it's best to assume it's writable. The | ||
652 | * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() | ||
653 | * before the page is queued for IO. | ||
654 | */ | ||
655 | if (writable) | ||
656 | tlb_ubc->writable = true; | ||
657 | } | ||
658 | |||
659 | /* | ||
660 | * Returns true if the TLB flush should be deferred to the end of a batch of | ||
661 | * unmap operations to reduce IPIs. | ||
662 | */ | ||
663 | static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) | ||
664 | { | ||
665 | bool should_defer = false; | ||
666 | |||
667 | if (!(flags & TTU_BATCH_FLUSH)) | ||
668 | return false; | ||
669 | |||
670 | /* If remote CPUs need to be flushed then defer the batched flush */ | ||
671 | if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) | ||
672 | should_defer = true; | ||
673 | put_cpu(); | ||
674 | |||
675 | return should_defer; | ||
676 | } | ||
677 | #else | ||
678 | static void set_tlb_ubc_flush_pending(struct mm_struct *mm, | ||
679 | struct page *page, bool writable) | ||
680 | { | ||
681 | } | ||
682 | |||
683 | static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) | ||
684 | { | ||
685 | return false; | ||
686 | } | ||
687 | #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ | ||
688 | |||
586 | /* | 689 | /* |
587 | * At what user virtual address is page expected in vma? | 690 | * At what user virtual address is page expected in vma? |
588 | * Caller should check the page is actually part of the vma. | 691 | * Caller should check the page is actually part of the vma. |
@@ -1220,7 +1323,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1220 | 1323 | ||
1221 | /* Nuke the page table entry. */ | 1324 | /* Nuke the page table entry. */ |
1222 | flush_cache_page(vma, address, page_to_pfn(page)); | 1325 | flush_cache_page(vma, address, page_to_pfn(page)); |
1223 | pteval = ptep_clear_flush(vma, address, pte); | 1326 | if (should_defer_flush(mm, flags)) { |
1327 | /* | ||
1328 | * We clear the PTE but do not flush so potentially a remote | ||
1329 | * CPU could still be writing to the page. If the entry was | ||
1330 | * previously clean then the architecture must guarantee that | ||
1331 | * a clear->dirty transition on a cached TLB entry is written | ||
1332 | * through and traps if the PTE is unmapped. | ||
1333 | */ | ||
1334 | pteval = ptep_get_and_clear(mm, address, pte); | ||
1335 | |||
1336 | set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval)); | ||
1337 | } else { | ||
1338 | pteval = ptep_clear_flush(vma, address, pte); | ||
1339 | } | ||
1224 | 1340 | ||
1225 | /* Move the dirty bit to the physical page now the pte is gone. */ | 1341 | /* Move the dirty bit to the physical page now the pte is gone. */ |
1226 | if (pte_dirty(pteval)) | 1342 | if (pte_dirty(pteval)) |
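The try_to_unmap_one() hunk above is where the deferral actually happens: the PTE is cleared without a flush, so a remote CPU may keep a stale, possibly writable, TLB entry until the batch is flushed. A hedged sketch of the deferred path reduced to its two essential steps (kernel context assumed, not standalone; names from the patch):

    /* Sketch: the deferred branch of try_to_unmap_one(). */
    static void defer_unmap_flush(struct mm_struct *mm, unsigned long address,
                                  pte_t *pte, struct page *page)
    {
            pte_t pteval;

            /* Clear the PTE but send no IPI: a remote CPU may retain a stale
             * TLB entry until the batch flush. */
            pteval = ptep_get_and_clear(mm, address, pte);

            /* Record the debt; pte_dirty() marks the batch as containing a
             * potentially writable entry so try_to_unmap_flush_dirty() will
             * flush before IO starts on the page. Safe only on architectures
             * that opted in via ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH, which
             * guarantees a clean->dirty transition through a cached entry is
             * written through or traps. */
            set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
    }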
@@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3416 | } | 3416 | } |
3417 | EXPORT_SYMBOL(kmem_cache_alloc); | 3417 | EXPORT_SYMBOL(kmem_cache_alloc); |
3418 | 3418 | ||
3419 | void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) | ||
3420 | { | ||
3421 | __kmem_cache_free_bulk(s, size, p); | ||
3422 | } | ||
3423 | EXPORT_SYMBOL(kmem_cache_free_bulk); | ||
3424 | |||
3425 | bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, | ||
3426 | void **p) | ||
3427 | { | ||
3428 | return __kmem_cache_alloc_bulk(s, flags, size, p); | ||
3429 | } | ||
3430 | EXPORT_SYMBOL(kmem_cache_alloc_bulk); | ||
3431 | |||
3419 | #ifdef CONFIG_TRACING | 3432 | #ifdef CONFIG_TRACING |
3420 | void * | 3433 | void * |
3421 | kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) | 3434 | kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) |
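The exported pair above gives SLAB the new generic bulk entry points; SLOB gains identical wrappers further down, and SLUB gets an optimized fastpath later in this patch. A hedged usage sketch from a caller's point of view (kernel context assumed; the all-or-nothing semantics follow the generic implementation in mm/slab_common.c):

    #include <linux/slab.h>

    #define NR_OBJS 16

    static int fill_batch(struct kmem_cache *cache)
    {
            void *objs[NR_OBJS];

            /* Either all NR_OBJS allocations succeed, or the partial batch is
             * freed internally and false is returned. The SLUB implementation
             * requires interrupts to be enabled here. */
            if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, NR_OBJS, objs))
                    return -ENOMEM;

            /* ... use objs[0..NR_OBJS-1] ... */

            kmem_cache_free_bulk(cache, NR_OBJS, objs);
            return 0;
    }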
@@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); | |||
163 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | 163 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
164 | size_t count, loff_t *ppos); | 164 | size_t count, loff_t *ppos); |
165 | 165 | ||
166 | /* | ||
167 | * Generic implementation of bulk operations | ||
168 | * These are useful for situations in which the allocator cannot | ||
169 | * perform optimizations. In that case segments of the objecct listed | ||
170 | * may be allocated or freed using these operations. | ||
171 | */ | ||
172 | void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); | ||
173 | bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); | ||
174 | |||
166 | #ifdef CONFIG_MEMCG_KMEM | 175 | #ifdef CONFIG_MEMCG_KMEM |
167 | /* | 176 | /* |
168 | * Iterate over all memcg caches of the given root cache. The caller must hold | 177 | * Iterate over all memcg caches of the given root cache. The caller must hold |
@@ -321,7 +330,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | |||
321 | return cachep; | 330 | return cachep; |
322 | 331 | ||
323 | pr_err("%s: Wrong slab cache. %s but object is from %s\n", | 332 | pr_err("%s: Wrong slab cache. %s but object is from %s\n", |
324 | __func__, cachep->name, s->name); | 333 | __func__, s->name, cachep->name); |
325 | WARN_ON_ONCE(1); | 334 | WARN_ON_ONCE(1); |
326 | return s; | 335 | return s; |
327 | } | 336 | } |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 86831105a09f..c26829fe4e37 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) | |||
104 | } | 104 | } |
105 | #endif | 105 | #endif |
106 | 106 | ||
107 | void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) | ||
108 | { | ||
109 | size_t i; | ||
110 | |||
111 | for (i = 0; i < nr; i++) | ||
112 | kmem_cache_free(s, p[i]); | ||
113 | } | ||
114 | |||
115 | bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, | ||
116 | void **p) | ||
117 | { | ||
118 | size_t i; | ||
119 | |||
120 | for (i = 0; i < nr; i++) { | ||
121 | void *x = p[i] = kmem_cache_alloc(s, flags); | ||
122 | if (!x) { | ||
123 | __kmem_cache_free_bulk(s, i, p); | ||
124 | return false; | ||
125 | } | ||
126 | } | ||
127 | return true; | ||
128 | } | ||
129 | |||
107 | #ifdef CONFIG_MEMCG_KMEM | 130 | #ifdef CONFIG_MEMCG_KMEM |
108 | void slab_init_memcg_params(struct kmem_cache *s) | 131 | void slab_init_memcg_params(struct kmem_cache *s) |
109 | { | 132 | { |
@@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b) | |||
611 | } | 611 | } |
612 | EXPORT_SYMBOL(kmem_cache_free); | 612 | EXPORT_SYMBOL(kmem_cache_free); |
613 | 613 | ||
614 | void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) | ||
615 | { | ||
616 | __kmem_cache_free_bulk(s, size, p); | ||
617 | } | ||
618 | EXPORT_SYMBOL(kmem_cache_free_bulk); | ||
619 | |||
620 | bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, | ||
621 | void **p) | ||
622 | { | ||
623 | return __kmem_cache_alloc_bulk(s, flags, size, p); | ||
624 | } | ||
625 | EXPORT_SYMBOL(kmem_cache_alloc_bulk); | ||
626 | |||
614 | int __kmem_cache_shutdown(struct kmem_cache *c) | 627 | int __kmem_cache_shutdown(struct kmem_cache *c) |
615 | { | 628 | { |
616 | /* No way to check for remaining objects */ | 629 | /* No way to check for remaining objects */ |
@@ -1306,6 +1306,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
1306 | kasan_slab_free(s, x); | 1306 | kasan_slab_free(s, x); |
1307 | } | 1307 | } |
1308 | 1308 | ||
1309 | static void setup_object(struct kmem_cache *s, struct page *page, | ||
1310 | void *object) | ||
1311 | { | ||
1312 | setup_object_debug(s, page, object); | ||
1313 | if (unlikely(s->ctor)) { | ||
1314 | kasan_unpoison_object_data(s, object); | ||
1315 | s->ctor(object); | ||
1316 | kasan_poison_object_data(s, object); | ||
1317 | } | ||
1318 | } | ||
1319 | |||
1309 | /* | 1320 | /* |
1310 | * Slab allocation and freeing | 1321 | * Slab allocation and freeing |
1311 | */ | 1322 | */ |
@@ -1336,6 +1347,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1336 | struct page *page; | 1347 | struct page *page; |
1337 | struct kmem_cache_order_objects oo = s->oo; | 1348 | struct kmem_cache_order_objects oo = s->oo; |
1338 | gfp_t alloc_gfp; | 1349 | gfp_t alloc_gfp; |
1350 | void *start, *p; | ||
1351 | int idx, order; | ||
1339 | 1352 | ||
1340 | flags &= gfp_allowed_mask; | 1353 | flags &= gfp_allowed_mask; |
1341 | 1354 | ||
@@ -1349,6 +1362,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1349 | * so we fall-back to the minimum order allocation. | 1362 | * so we fall-back to the minimum order allocation. |
1350 | */ | 1363 | */ |
1351 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; | 1364 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; |
1365 | if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) | ||
1366 | alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; | ||
1352 | 1367 | ||
1353 | page = alloc_slab_page(s, alloc_gfp, node, oo); | 1368 | page = alloc_slab_page(s, alloc_gfp, node, oo); |
1354 | if (unlikely(!page)) { | 1369 | if (unlikely(!page)) { |
@@ -1359,13 +1374,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1359 | * Try a lower order alloc if possible | 1374 | * Try a lower order alloc if possible |
1360 | */ | 1375 | */ |
1361 | page = alloc_slab_page(s, alloc_gfp, node, oo); | 1376 | page = alloc_slab_page(s, alloc_gfp, node, oo); |
1362 | 1377 | if (unlikely(!page)) | |
1363 | if (page) | 1378 | goto out; |
1364 | stat(s, ORDER_FALLBACK); | 1379 | stat(s, ORDER_FALLBACK); |
1365 | } | 1380 | } |
1366 | 1381 | ||
1367 | if (kmemcheck_enabled && page | 1382 | if (kmemcheck_enabled && |
1368 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1383 | !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1369 | int pages = 1 << oo_order(oo); | 1384 | int pages = 1 << oo_order(oo); |
1370 | 1385 | ||
1371 | kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); | 1386 | kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); |
@@ -1380,51 +1395,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1380 | kmemcheck_mark_unallocated_pages(page, pages); | 1395 | kmemcheck_mark_unallocated_pages(page, pages); |
1381 | } | 1396 | } |
1382 | 1397 | ||
1383 | if (flags & __GFP_WAIT) | ||
1384 | local_irq_disable(); | ||
1385 | if (!page) | ||
1386 | return NULL; | ||
1387 | |||
1388 | page->objects = oo_objects(oo); | 1398 | page->objects = oo_objects(oo); |
1389 | mod_zone_page_state(page_zone(page), | ||
1390 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | ||
1391 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | ||
1392 | 1 << oo_order(oo)); | ||
1393 | |||
1394 | return page; | ||
1395 | } | ||
1396 | |||
1397 | static void setup_object(struct kmem_cache *s, struct page *page, | ||
1398 | void *object) | ||
1399 | { | ||
1400 | setup_object_debug(s, page, object); | ||
1401 | if (unlikely(s->ctor)) { | ||
1402 | kasan_unpoison_object_data(s, object); | ||
1403 | s->ctor(object); | ||
1404 | kasan_poison_object_data(s, object); | ||
1405 | } | ||
1406 | } | ||
1407 | |||
1408 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | ||
1409 | { | ||
1410 | struct page *page; | ||
1411 | void *start; | ||
1412 | void *p; | ||
1413 | int order; | ||
1414 | int idx; | ||
1415 | |||
1416 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { | ||
1417 | pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); | ||
1418 | BUG(); | ||
1419 | } | ||
1420 | |||
1421 | page = allocate_slab(s, | ||
1422 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | ||
1423 | if (!page) | ||
1424 | goto out; | ||
1425 | 1399 | ||
1426 | order = compound_order(page); | 1400 | order = compound_order(page); |
1427 | inc_slabs_node(s, page_to_nid(page), page->objects); | ||
1428 | page->slab_cache = s; | 1401 | page->slab_cache = s; |
1429 | __SetPageSlab(page); | 1402 | __SetPageSlab(page); |
1430 | if (page_is_pfmemalloc(page)) | 1403 | if (page_is_pfmemalloc(page)) |
@@ -1448,10 +1421,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1448 | page->freelist = start; | 1421 | page->freelist = start; |
1449 | page->inuse = page->objects; | 1422 | page->inuse = page->objects; |
1450 | page->frozen = 1; | 1423 | page->frozen = 1; |
1424 | |||
1451 | out: | 1425 | out: |
1426 | if (flags & __GFP_WAIT) | ||
1427 | local_irq_disable(); | ||
1428 | if (!page) | ||
1429 | return NULL; | ||
1430 | |||
1431 | mod_zone_page_state(page_zone(page), | ||
1432 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | ||
1433 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | ||
1434 | 1 << oo_order(oo)); | ||
1435 | |||
1436 | inc_slabs_node(s, page_to_nid(page), page->objects); | ||
1437 | |||
1452 | return page; | 1438 | return page; |
1453 | } | 1439 | } |
1454 | 1440 | ||
1441 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | ||
1442 | { | ||
1443 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { | ||
1444 | pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); | ||
1445 | BUG(); | ||
1446 | } | ||
1447 | |||
1448 | return allocate_slab(s, | ||
1449 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | ||
1450 | } | ||
1451 | |||
1455 | static void __free_slab(struct kmem_cache *s, struct page *page) | 1452 | static void __free_slab(struct kmem_cache *s, struct page *page) |
1456 | { | 1453 | { |
1457 | int order = compound_order(page); | 1454 | int order = compound_order(page); |
@@ -2712,7 +2709,7 @@ redo: | |||
2712 | * Determine the currently cpus per cpu slab. | 2709 | * Determine the currently cpus per cpu slab. |
2713 | * The cpu may change afterward. However that does not matter since | 2710 | * The cpu may change afterward. However that does not matter since |
2714 | * data is retrieved via this pointer. If we are on the same cpu | 2711 | * data is retrieved via this pointer. If we are on the same cpu |
2715 | * during the cmpxchg then the free will succedd. | 2712 | * during the cmpxchg then the free will succeed. |
2716 | */ | 2713 | */ |
2717 | do { | 2714 | do { |
2718 | tid = this_cpu_read(s->cpu_slab->tid); | 2715 | tid = this_cpu_read(s->cpu_slab->tid); |
@@ -2750,6 +2747,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x) | |||
2750 | } | 2747 | } |
2751 | EXPORT_SYMBOL(kmem_cache_free); | 2748 | EXPORT_SYMBOL(kmem_cache_free); |
2752 | 2749 | ||
2750 | /* Note that interrupts must be enabled when calling this function. */ | ||
2751 | void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) | ||
2752 | { | ||
2753 | struct kmem_cache_cpu *c; | ||
2754 | struct page *page; | ||
2755 | int i; | ||
2756 | |||
2757 | local_irq_disable(); | ||
2758 | c = this_cpu_ptr(s->cpu_slab); | ||
2759 | |||
2760 | for (i = 0; i < size; i++) { | ||
2761 | void *object = p[i]; | ||
2762 | |||
2763 | BUG_ON(!object); | ||
2764 | /* kmem cache debug support */ | ||
2765 | s = cache_from_obj(s, object); | ||
2766 | if (unlikely(!s)) | ||
2767 | goto exit; | ||
2768 | slab_free_hook(s, object); | ||
2769 | |||
2770 | page = virt_to_head_page(object); | ||
2771 | |||
2772 | if (c->page == page) { | ||
2773 | /* Fastpath: local CPU free */ | ||
2774 | set_freepointer(s, object, c->freelist); | ||
2775 | c->freelist = object; | ||
2776 | } else { | ||
2777 | c->tid = next_tid(c->tid); | ||
2778 | local_irq_enable(); | ||
2779 | /* Slowpath: overhead locked cmpxchg_double_slab */ | ||
2780 | __slab_free(s, page, object, _RET_IP_); | ||
2781 | local_irq_disable(); | ||
2782 | c = this_cpu_ptr(s->cpu_slab); | ||
2783 | } | ||
2784 | } | ||
2785 | exit: | ||
2786 | c->tid = next_tid(c->tid); | ||
2787 | local_irq_enable(); | ||
2788 | } | ||
2789 | EXPORT_SYMBOL(kmem_cache_free_bulk); | ||
2790 | |||
2791 | /* Note that interrupts must be enabled when calling this function. */ | ||
2792 | bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, | ||
2793 | void **p) | ||
2794 | { | ||
2795 | struct kmem_cache_cpu *c; | ||
2796 | int i; | ||
2797 | |||
2798 | /* | ||
2799 | * Drain objects in the per cpu slab, while disabling local | ||
2800 | * IRQs, which protects against PREEMPT and interrupt | ||
2801 | * handlers invoking the normal fastpath. | ||
2802 | */ | ||
2803 | local_irq_disable(); | ||
2804 | c = this_cpu_ptr(s->cpu_slab); | ||
2805 | |||
2806 | for (i = 0; i < size; i++) { | ||
2807 | void *object = c->freelist; | ||
2808 | |||
2809 | if (unlikely(!object)) { | ||
2810 | local_irq_enable(); | ||
2811 | /* | ||
2812 | * Invoking the slow path likely has the side-effect | ||
2813 | * of re-populating the per-CPU c->freelist | ||
2814 | */ | ||
2815 | p[i] = __slab_alloc(s, flags, NUMA_NO_NODE, | ||
2816 | _RET_IP_, c); | ||
2817 | if (unlikely(!p[i])) { | ||
2818 | __kmem_cache_free_bulk(s, i, p); | ||
2819 | return false; | ||
2820 | } | ||
2821 | local_irq_disable(); | ||
2822 | c = this_cpu_ptr(s->cpu_slab); | ||
2823 | continue; /* goto for-loop */ | ||
2824 | } | ||
2825 | |||
2826 | /* kmem_cache debug support */ | ||
2827 | s = slab_pre_alloc_hook(s, flags); | ||
2828 | if (unlikely(!s)) { | ||
2829 | __kmem_cache_free_bulk(s, i, p); | ||
2830 | c->tid = next_tid(c->tid); | ||
2831 | local_irq_enable(); | ||
2832 | return false; | ||
2833 | } | ||
2834 | |||
2835 | c->freelist = get_freepointer(s, object); | ||
2836 | p[i] = object; | ||
2837 | |||
2838 | /* kmem_cache debug support */ | ||
2839 | slab_post_alloc_hook(s, flags, object); | ||
2840 | } | ||
2841 | c->tid = next_tid(c->tid); | ||
2842 | local_irq_enable(); | ||
2843 | |||
2844 | /* Clear memory outside IRQ disabled fastpath loop */ | ||
2845 | if (unlikely(flags & __GFP_ZERO)) { | ||
2846 | int j; | ||
2847 | |||
2848 | for (j = 0; j < i; j++) | ||
2849 | memset(p[j], 0, s->object_size); | ||
2850 | } | ||
2851 | |||
2852 | return true; | ||
2853 | } | ||
2854 | EXPORT_SYMBOL(kmem_cache_alloc_bulk); | ||
2855 | |||
2856 | |||
2753 | /* | 2857 | /* |
2754 | * Object placement in a slab is made very easy because we always start at | 2858 | * Object placement in a slab is made very easy because we always start at |
2755 | * offset 0. If we tune the size of the object to the alignment then we can | 2859 | * offset 0. If we tune the size of the object to the alignment then we can |
@@ -5181,7 +5285,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5181 | s->kobj.kset = cache_kset(s); | 5285 | s->kobj.kset = cache_kset(s); |
5182 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); | 5286 | err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); |
5183 | if (err) | 5287 | if (err) |
5184 | goto out_put_kobj; | 5288 | goto out; |
5185 | 5289 | ||
5186 | err = sysfs_create_group(&s->kobj, &slab_attr_group); | 5290 | err = sysfs_create_group(&s->kobj, &slab_attr_group); |
5187 | if (err) | 5291 | if (err) |
@@ -5208,8 +5312,6 @@ out: | |||
5208 | return err; | 5312 | return err; |
5209 | out_del_kobj: | 5313 | out_del_kobj: |
5210 | kobject_del(&s->kobj); | 5314 | kobject_del(&s->kobj); |
5211 | out_put_kobj: | ||
5212 | kobject_put(&s->kobj); | ||
5213 | goto out; | 5315 | goto out; |
5214 | } | 5316 | } |
5215 | 5317 | ||
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c new file mode 100644 index 000000000000..77fee9325a57 --- /dev/null +++ b/mm/userfaultfd.c | |||
@@ -0,0 +1,308 @@ | |||
1 | /* | ||
2 | * mm/userfaultfd.c | ||
3 | * | ||
4 | * Copyright (C) 2015 Red Hat, Inc. | ||
5 | * | ||
6 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
7 | * the COPYING file in the top-level directory. | ||
8 | */ | ||
9 | |||
10 | #include <linux/mm.h> | ||
11 | #include <linux/pagemap.h> | ||
12 | #include <linux/rmap.h> | ||
13 | #include <linux/swap.h> | ||
14 | #include <linux/swapops.h> | ||
15 | #include <linux/userfaultfd_k.h> | ||
16 | #include <linux/mmu_notifier.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | #include "internal.h" | ||
19 | |||
20 | static int mcopy_atomic_pte(struct mm_struct *dst_mm, | ||
21 | pmd_t *dst_pmd, | ||
22 | struct vm_area_struct *dst_vma, | ||
23 | unsigned long dst_addr, | ||
24 | unsigned long src_addr, | ||
25 | struct page **pagep) | ||
26 | { | ||
27 | struct mem_cgroup *memcg; | ||
28 | pte_t _dst_pte, *dst_pte; | ||
29 | spinlock_t *ptl; | ||
30 | void *page_kaddr; | ||
31 | int ret; | ||
32 | struct page *page; | ||
33 | |||
34 | if (!*pagep) { | ||
35 | ret = -ENOMEM; | ||
36 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); | ||
37 | if (!page) | ||
38 | goto out; | ||
39 | |||
40 | page_kaddr = kmap_atomic(page); | ||
41 | ret = copy_from_user(page_kaddr, | ||
42 | (const void __user *) src_addr, | ||
43 | PAGE_SIZE); | ||
44 | kunmap_atomic(page_kaddr); | ||
45 | |||
46 | /* fallback to copy_from_user outside mmap_sem */ | ||
47 | if (unlikely(ret)) { | ||
48 | ret = -EFAULT; | ||
49 | *pagep = page; | ||
50 | /* don't free the page */ | ||
51 | goto out; | ||
52 | } | ||
53 | } else { | ||
54 | page = *pagep; | ||
55 | *pagep = NULL; | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * The memory barrier inside __SetPageUptodate makes sure that | ||
60 | * preceding stores to the page contents become visible before | ||
61 | * the set_pte_at() write. | ||
62 | */ | ||
63 | __SetPageUptodate(page); | ||
64 | |||
65 | ret = -ENOMEM; | ||
66 | if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) | ||
67 | goto out_release; | ||
68 | |||
69 | _dst_pte = mk_pte(page, dst_vma->vm_page_prot); | ||
70 | if (dst_vma->vm_flags & VM_WRITE) | ||
71 | _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); | ||
72 | |||
73 | ret = -EEXIST; | ||
74 | dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); | ||
75 | if (!pte_none(*dst_pte)) | ||
76 | goto out_release_uncharge_unlock; | ||
77 | |||
78 | inc_mm_counter(dst_mm, MM_ANONPAGES); | ||
79 | page_add_new_anon_rmap(page, dst_vma, dst_addr); | ||
80 | mem_cgroup_commit_charge(page, memcg, false); | ||
81 | lru_cache_add_active_or_unevictable(page, dst_vma); | ||
82 | |||
83 | set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); | ||
84 | |||
85 | /* No need to invalidate - it was non-present before */ | ||
86 | update_mmu_cache(dst_vma, dst_addr, dst_pte); | ||
87 | |||
88 | pte_unmap_unlock(dst_pte, ptl); | ||
89 | ret = 0; | ||
90 | out: | ||
91 | return ret; | ||
92 | out_release_uncharge_unlock: | ||
93 | pte_unmap_unlock(dst_pte, ptl); | ||
94 | mem_cgroup_cancel_charge(page, memcg); | ||
95 | out_release: | ||
96 | page_cache_release(page); | ||
97 | goto out; | ||
98 | } | ||
99 | |||
100 | static int mfill_zeropage_pte(struct mm_struct *dst_mm, | ||
101 | pmd_t *dst_pmd, | ||
102 | struct vm_area_struct *dst_vma, | ||
103 | unsigned long dst_addr) | ||
104 | { | ||
105 | pte_t _dst_pte, *dst_pte; | ||
106 | spinlock_t *ptl; | ||
107 | int ret; | ||
108 | |||
109 | _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), | ||
110 | dst_vma->vm_page_prot)); | ||
111 | ret = -EEXIST; | ||
112 | dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); | ||
113 | if (!pte_none(*dst_pte)) | ||
114 | goto out_unlock; | ||
115 | set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); | ||
116 | /* No need to invalidate - it was non-present before */ | ||
117 | update_mmu_cache(dst_vma, dst_addr, dst_pte); | ||
118 | ret = 0; | ||
119 | out_unlock: | ||
120 | pte_unmap_unlock(dst_pte, ptl); | ||
121 | return ret; | ||
122 | } | ||
123 | |||
124 | static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) | ||
125 | { | ||
126 | pgd_t *pgd; | ||
127 | pud_t *pud; | ||
128 | pmd_t *pmd = NULL; | ||
129 | |||
130 | pgd = pgd_offset(mm, address); | ||
131 | pud = pud_alloc(mm, pgd, address); | ||
132 | if (pud) | ||
133 | /* | ||
134 | * Note that this is not run only because the pmd was | ||
135 | * missing: *pmd may already be established and, in | ||
136 | * turn, it may even be a trans_huge_pmd. | ||
137 | */ | ||
138 | pmd = pmd_alloc(mm, pud, address); | ||
139 | return pmd; | ||
140 | } | ||
141 | |||
142 | static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, | ||
143 | unsigned long dst_start, | ||
144 | unsigned long src_start, | ||
145 | unsigned long len, | ||
146 | bool zeropage) | ||
147 | { | ||
148 | struct vm_area_struct *dst_vma; | ||
149 | ssize_t err; | ||
150 | pmd_t *dst_pmd; | ||
151 | unsigned long src_addr, dst_addr; | ||
152 | long copied; | ||
153 | struct page *page; | ||
154 | |||
155 | /* | ||
156 | * Sanitize the command parameters: | ||
157 | */ | ||
158 | BUG_ON(dst_start & ~PAGE_MASK); | ||
159 | BUG_ON(len & ~PAGE_MASK); | ||
160 | |||
161 | /* Does the address range wrap, or is the span zero-sized? */ | ||
162 | BUG_ON(src_start + len <= src_start); | ||
163 | BUG_ON(dst_start + len <= dst_start); | ||
164 | |||
165 | src_addr = src_start; | ||
166 | dst_addr = dst_start; | ||
167 | copied = 0; | ||
168 | page = NULL; | ||
169 | retry: | ||
170 | down_read(&dst_mm->mmap_sem); | ||
171 | |||
172 | /* | ||
173 | * Make sure the vma is not shared, that the dst range is | ||
174 | * both valid and fully within a single existing vma. | ||
175 | */ | ||
176 | err = -EINVAL; | ||
177 | dst_vma = find_vma(dst_mm, dst_start); | ||
178 | if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) | ||
179 | goto out_unlock; | ||
180 | if (dst_start < dst_vma->vm_start || | ||
181 | dst_start + len > dst_vma->vm_end) | ||
182 | goto out_unlock; | ||
183 | |||
184 | /* | ||
185 | * Be strict and only allow __mcopy_atomic on userfaultfd | ||
186 | * registered ranges to prevent userland errors going | ||
187 | * unnoticed. As far as the VM consistency is concerned, it | ||
188 | * would be perfectly safe to remove this check, but there's | ||
189 | * no useful usage for __mcopy_atomic outside of userfaultfd | ||
190 | * registered ranges. This is after all why these are ioctls | ||
191 | * belonging to the userfaultfd and not syscalls. | ||
192 | */ | ||
193 | if (!dst_vma->vm_userfaultfd_ctx.ctx) | ||
194 | goto out_unlock; | ||
195 | |||
196 | /* | ||
197 | * FIXME: only allow copying on anonymous vmas, tmpfs should | ||
198 | * be added. | ||
199 | */ | ||
200 | if (dst_vma->vm_ops) | ||
201 | goto out_unlock; | ||
202 | |||
203 | /* | ||
204 | * Ensure the dst_vma has an anon_vma or this page | ||
205 | * would get a NULL anon_vma when moved in the | ||
206 | * dst_vma. | ||
207 | */ | ||
208 | err = -ENOMEM; | ||
209 | if (unlikely(anon_vma_prepare(dst_vma))) | ||
210 | goto out_unlock; | ||
211 | |||
212 | while (src_addr < src_start + len) { | ||
213 | pmd_t dst_pmdval; | ||
214 | |||
215 | BUG_ON(dst_addr >= dst_start + len); | ||
216 | |||
217 | dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); | ||
218 | if (unlikely(!dst_pmd)) { | ||
219 | err = -ENOMEM; | ||
220 | break; | ||
221 | } | ||
222 | |||
223 | dst_pmdval = pmd_read_atomic(dst_pmd); | ||
224 | /* | ||
225 | * If the dst_pmd is mapped as THP don't | ||
226 | * override it and just be strict. | ||
227 | */ | ||
228 | if (unlikely(pmd_trans_huge(dst_pmdval))) { | ||
229 | err = -EEXIST; | ||
230 | break; | ||
231 | } | ||
232 | if (unlikely(pmd_none(dst_pmdval)) && | ||
233 | unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, | ||
234 | dst_addr))) { | ||
235 | err = -ENOMEM; | ||
236 | break; | ||
237 | } | ||
238 | /* If a huge pmd materialized from under us, fail */ | ||
239 | if (unlikely(pmd_trans_huge(*dst_pmd))) { | ||
240 | err = -EFAULT; | ||
241 | break; | ||
242 | } | ||
243 | |||
244 | BUG_ON(pmd_none(*dst_pmd)); | ||
245 | BUG_ON(pmd_trans_huge(*dst_pmd)); | ||
246 | |||
247 | if (!zeropage) | ||
248 | err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, | ||
249 | dst_addr, src_addr, &page); | ||
250 | else | ||
251 | err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, | ||
252 | dst_addr); | ||
253 | |||
254 | cond_resched(); | ||
255 | |||
256 | if (unlikely(err == -EFAULT)) { | ||
257 | void *page_kaddr; | ||
258 | |||
259 | up_read(&dst_mm->mmap_sem); | ||
260 | BUG_ON(!page); | ||
261 | |||
262 | page_kaddr = kmap(page); | ||
263 | err = copy_from_user(page_kaddr, | ||
264 | (const void __user *) src_addr, | ||
265 | PAGE_SIZE); | ||
266 | kunmap(page); | ||
267 | if (unlikely(err)) { | ||
268 | err = -EFAULT; | ||
269 | goto out; | ||
270 | } | ||
271 | goto retry; | ||
272 | } else | ||
273 | BUG_ON(page); | ||
274 | |||
275 | if (!err) { | ||
276 | dst_addr += PAGE_SIZE; | ||
277 | src_addr += PAGE_SIZE; | ||
278 | copied += PAGE_SIZE; | ||
279 | |||
280 | if (fatal_signal_pending(current)) | ||
281 | err = -EINTR; | ||
282 | } | ||
283 | if (err) | ||
284 | break; | ||
285 | } | ||
286 | |||
287 | out_unlock: | ||
288 | up_read(&dst_mm->mmap_sem); | ||
289 | out: | ||
290 | if (page) | ||
291 | page_cache_release(page); | ||
292 | BUG_ON(copied < 0); | ||
293 | BUG_ON(err > 0); | ||
294 | BUG_ON(!copied && !err); | ||
295 | return copied ? copied : err; | ||
296 | } | ||
297 | |||
298 | ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, | ||
299 | unsigned long src_start, unsigned long len) | ||
300 | { | ||
301 | return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); | ||
302 | } | ||
303 | |||
304 | ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, | ||
305 | unsigned long len) | ||
306 | { | ||
307 | return __mcopy_atomic(dst_mm, start, 0, len, true); | ||
308 | } | ||
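mcopy_atomic() and mfill_zeropage() are the workers behind the new UFFDIO_COPY and UFFDIO_ZEROPAGE userfaultfd ioctls (wired up in fs/userfaultfd.c elsewhere in this series). A hedged sketch of the resolver side in userspace -- the fault-handling thread that feeds a page into a registered range; field names follow the uapi header, but treat the snippet as illustrative:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    /* Resolve a page-sized fault at 'dst' by copying from 'src'.
     * 'uffd' is a userfaultfd descriptor with 'dst' registered. */
    static long resolve_fault(int uffd, unsigned long dst, unsigned long src,
                              unsigned long page_size)
    {
            struct uffdio_copy copy;

            memset(&copy, 0, sizeof(copy));
            copy.dst = dst;  /* page-aligned, inside a registered range */
            copy.src = src;
            copy.len = page_size;
            copy.mode = 0;   /* wake the faulting thread when done */

            if (ioctl(uffd, UFFDIO_COPY, &copy))
                    return -1;
            return copy.copy; /* bytes copied, mirroring mcopy_atomic()'s return */
    }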
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8286938c70de..b1139039122a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1057 | * processes. Try to unmap it here. | 1057 | * processes. Try to unmap it here. |
1058 | */ | 1058 | */ |
1059 | if (page_mapped(page) && mapping) { | 1059 | if (page_mapped(page) && mapping) { |
1060 | switch (try_to_unmap(page, ttu_flags)) { | 1060 | switch (try_to_unmap(page, |
1061 | ttu_flags|TTU_BATCH_FLUSH)) { | ||
1061 | case SWAP_FAIL: | 1062 | case SWAP_FAIL: |
1062 | goto activate_locked; | 1063 | goto activate_locked; |
1063 | case SWAP_AGAIN: | 1064 | case SWAP_AGAIN: |
@@ -1097,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1097 | if (!sc->may_writepage) | 1098 | if (!sc->may_writepage) |
1098 | goto keep_locked; | 1099 | goto keep_locked; |
1099 | 1100 | ||
1100 | /* Page is dirty, try to write it out here */ | 1101 | /* |
1102 | * Page is dirty. Flush the TLB if a writable entry | ||
1103 | * potentially exists to avoid CPU writes after IO | ||
1104 | * starts and then write it out here. | ||
1105 | */ | ||
1106 | try_to_unmap_flush_dirty(); | ||
1101 | switch (pageout(page, mapping, sc)) { | 1107 | switch (pageout(page, mapping, sc)) { |
1102 | case PAGE_KEEP: | 1108 | case PAGE_KEEP: |
1103 | goto keep_locked; | 1109 | goto keep_locked; |
@@ -1208,6 +1214,7 @@ keep: | |||
1208 | } | 1214 | } |
1209 | 1215 | ||
1210 | mem_cgroup_uncharge_list(&free_pages); | 1216 | mem_cgroup_uncharge_list(&free_pages); |
1217 | try_to_unmap_flush(); | ||
1211 | free_hot_cold_page_list(&free_pages, true); | 1218 | free_hot_cold_page_list(&free_pages, true); |
1212 | 1219 | ||
1213 | list_splice(&ret_pages, page_list); | 1220 | list_splice(&ret_pages, page_list); |
@@ -2151,6 +2158,23 @@ out: | |||
2151 | } | 2158 | } |
2152 | } | 2159 | } |
2153 | 2160 | ||
2161 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH | ||
2162 | static void init_tlb_ubc(void) | ||
2163 | { | ||
2164 | /* | ||
2165 | * This deliberately does not clear the cpumask as it's expensive | ||
2166 | * and unnecessary. If there happens to be data in there then the | ||
2167 | * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and | ||
2168 | * then will be cleared. | ||
2169 | */ | ||
2170 | current->tlb_ubc.flush_required = false; | ||
2171 | } | ||
2172 | #else | ||
2173 | static inline void init_tlb_ubc(void) | ||
2174 | { | ||
2175 | } | ||
2176 | #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ | ||
2177 | |||
2154 | /* | 2178 | /* |
2155 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 2179 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
2156 | */ | 2180 | */ |
@@ -2185,6 +2209,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, | |||
2185 | scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && | 2209 | scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && |
2186 | sc->priority == DEF_PRIORITY); | 2210 | sc->priority == DEF_PRIORITY); |
2187 | 2211 | ||
2212 | init_tlb_ubc(); | ||
2213 | |||
2188 | blk_start_plug(&plug); | 2214 | blk_start_plug(&plug); |
2189 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2215 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
2190 | nr[LRU_INACTIVE_FILE]) { | 2216 | nr[LRU_INACTIVE_FILE]) { |
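Taken together, the vmscan.c hunks wire the batching into reclaim at three points, and the ordering is the whole story. A pseudo-C sketch of the sequence as patched (not standalone; all names from the hunks above):

    /* shrink_lruvec(): */
    init_tlb_ubc();                  /* arm a clean batch state */

    /* shrink_page_list(), per page: */
    try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH); /* defer the IPIs */

    try_to_unmap_flush_dirty();      /* before pageout(): a stale writable
                                        entry must not race with the IO */

    try_to_unmap_flush();            /* before free_hot_cold_page_list():
                                        nothing may write through stale
                                        entries into memory being reused */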
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f30329f72641..69a4d30a9ccf 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c | |||
@@ -517,8 +517,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) | |||
517 | struct ceph_options *opt = client->options; | 517 | struct ceph_options *opt = client->options; |
518 | size_t pos = m->count; | 518 | size_t pos = m->count; |
519 | 519 | ||
520 | if (opt->name) | 520 | if (opt->name) { |
521 | seq_printf(m, "name=%s,", opt->name); | 521 | seq_puts(m, "name="); |
522 | seq_escape(m, opt->name, ", \t\n\\"); | ||
523 | seq_putc(m, ','); | ||
524 | } | ||
522 | if (opt->key) | 525 | if (opt->key) |
523 | seq_puts(m, "secret=<hidden>,"); | 526 | seq_puts(m, "secret=<hidden>,"); |
524 | 527 | ||
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 337ca851a350..b140c092d226 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c | |||
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task) | |||
297 | clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); | 297 | clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); |
298 | ret = atomic_dec_and_test(&task->tk_count); | 298 | ret = atomic_dec_and_test(&task->tk_count); |
299 | if (waitqueue_active(wq)) | 299 | if (waitqueue_active(wq)) |
300 | __wake_up_locked_key(wq, TASK_NORMAL, &k); | 300 | __wake_up_locked_key(wq, TASK_NORMAL, 1, &k); |
301 | spin_unlock_irqrestore(&wq->lock, flags); | 301 | spin_unlock_irqrestore(&wq->lock, flags); |
302 | return ret; | 302 | return ret; |
303 | } | 303 | } |
diff --git a/scripts/Lindent b/scripts/Lindent index 9c4b3e2b7098..6d889de4e70b 100755 --- a/scripts/Lindent +++ b/scripts/Lindent | |||
@@ -1,6 +1,9 @@ | |||
1 | #!/bin/sh | 1 | #!/bin/sh |
2 | PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1" | 2 | PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1" |
3 | RES=`indent --version` | 3 | RES=`indent --version` |
4 | if [ "$RES" = "" ]; then | ||
5 | exit 1 | ||
6 | fi | ||
4 | V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1` | 7 | V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1` |
5 | V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2` | 8 | V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2` |
6 | V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3` | 9 | V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3` |
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 515c4c00e957..00d6d53c2681 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh | |||
@@ -14,11 +14,14 @@ declare -A cache | |||
14 | 14 | ||
15 | parse_symbol() { | 15 | parse_symbol() { |
16 | # The structure of symbol at this point is: | 16 | # The structure of symbol at this point is: |
17 | # [name]+[offset]/[total length] | 17 | # ([name]+[offset]/[total length]) |
18 | # | 18 | # |
19 | # For example: | 19 | # For example: |
20 | # do_basic_setup+0x9c/0xbf | 20 | # do_basic_setup+0x9c/0xbf |
21 | 21 | ||
22 | # Remove the enclosing parentheses | ||
23 | symbol=${symbol#\(} | ||
24 | symbol=${symbol%\)} | ||
22 | 25 | ||
23 | # Strip the symbol name so that we could look it up | 26 | # Strip the symbol name so that we could look it up |
24 | local name=${symbol%+*} | 27 | local name=${symbol%+*} |
diff --git a/scripts/kernel-doc b/scripts/kernel-doc index a7bf5f68aacb..9a08fb5c1af6 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc | |||
@@ -469,7 +469,7 @@ sub dump_section { | |||
469 | } else { | 469 | } else { |
470 | # print STDERR "other section '$name' = '$contents'\n"; | 470 | # print STDERR "other section '$name' = '$contents'\n"; |
471 | if (defined($sections{$name}) && ($sections{$name} ne "")) { | 471 | if (defined($sections{$name}) && ($sections{$name} ne "")) { |
472 | print STDERR "Error(${file}:$.): duplicate section name '$name'\n"; | 472 | print STDERR "${file}:$.: error: duplicate section name '$name'\n"; |
473 | ++$errors; | 473 | ++$errors; |
474 | } | 474 | } |
475 | $sections{$name} = $contents; | 475 | $sections{$name} = $contents; |
@@ -1820,7 +1820,7 @@ sub dump_struct($$) { | |||
1820 | }); | 1820 | }); |
1821 | } | 1821 | } |
1822 | else { | 1822 | else { |
1823 | print STDERR "Error(${file}:$.): Cannot parse struct or union!\n"; | 1823 | print STDERR "${file}:$.: error: Cannot parse struct or union!\n"; |
1824 | ++$errors; | 1824 | ++$errors; |
1825 | } | 1825 | } |
1826 | } | 1826 | } |
@@ -1841,7 +1841,7 @@ sub dump_enum($$) { | |||
1841 | push @parameterlist, $arg; | 1841 | push @parameterlist, $arg; |
1842 | if (!$parameterdescs{$arg}) { | 1842 | if (!$parameterdescs{$arg}) { |
1843 | $parameterdescs{$arg} = $undescribed; | 1843 | $parameterdescs{$arg} = $undescribed; |
1844 | print STDERR "Warning(${file}:$.): Enum value '$arg' ". | 1844 | print STDERR "${file}:$.: warning: Enum value '$arg' ". |
1845 | "not described in enum '$declaration_name'\n"; | 1845 | "not described in enum '$declaration_name'\n"; |
1846 | } | 1846 | } |
1847 | 1847 | ||
@@ -1859,7 +1859,7 @@ sub dump_enum($$) { | |||
1859 | }); | 1859 | }); |
1860 | } | 1860 | } |
1861 | else { | 1861 | else { |
1862 | print STDERR "Error(${file}:$.): Cannot parse enum!\n"; | 1862 | print STDERR "${file}:$.: error: Cannot parse enum!\n"; |
1863 | ++$errors; | 1863 | ++$errors; |
1864 | } | 1864 | } |
1865 | } | 1865 | } |
@@ -1887,7 +1887,7 @@ sub dump_typedef($$) { | |||
1887 | }); | 1887 | }); |
1888 | } | 1888 | } |
1889 | else { | 1889 | else { |
1890 | print STDERR "Error(${file}:$.): Cannot parse typedef!\n"; | 1890 | print STDERR "${file}:$.: error: Cannot parse typedef!\n"; |
1891 | ++$errors; | 1891 | ++$errors; |
1892 | } | 1892 | } |
1893 | } | 1893 | } |
@@ -2019,11 +2019,11 @@ sub push_parameter($$$) { | |||
2019 | $parameterdescs{$param_name} = $undescribed; | 2019 | $parameterdescs{$param_name} = $undescribed; |
2020 | 2020 | ||
2021 | if (($type eq 'function') || ($type eq 'enum')) { | 2021 | if (($type eq 'function') || ($type eq 'enum')) { |
2022 | print STDERR "Warning(${file}:$.): Function parameter ". | 2022 | print STDERR "${file}:$.: warning: Function parameter ". |
2023 | "or member '$param' not " . | 2023 | "or member '$param' not " . |
2024 | "described in '$declaration_name'\n"; | 2024 | "described in '$declaration_name'\n"; |
2025 | } | 2025 | } |
2026 | print STDERR "Warning(${file}:$.):" . | 2026 | print STDERR "${file}:$.: warning:" . |
2027 | " No description found for parameter '$param'\n"; | 2027 | " No description found for parameter '$param'\n"; |
2028 | ++$warnings; | 2028 | ++$warnings; |
2029 | } | 2029 | } |
@@ -2074,14 +2074,14 @@ sub check_sections($$$$$$) { | |||
2074 | } | 2074 | } |
2075 | if ($err) { | 2075 | if ($err) { |
2076 | if ($decl_type eq "function") { | 2076 | if ($decl_type eq "function") { |
2077 | print STDERR "Warning(${file}:$.): " . | 2077 | print STDERR "${file}:$.: warning: " . |
2078 | "Excess function parameter " . | 2078 | "Excess function parameter " . |
2079 | "'$sects[$sx]' " . | 2079 | "'$sects[$sx]' " . |
2080 | "description in '$decl_name'\n"; | 2080 | "description in '$decl_name'\n"; |
2081 | ++$warnings; | 2081 | ++$warnings; |
2082 | } else { | 2082 | } else { |
2083 | if ($nested !~ m/\Q$sects[$sx]\E/) { | 2083 | if ($nested !~ m/\Q$sects[$sx]\E/) { |
2084 | print STDERR "Warning(${file}:$.): " . | 2084 | print STDERR "${file}:$.: warning: " . |
2085 | "Excess struct/union/enum/typedef member " . | 2085 | "Excess struct/union/enum/typedef member " . |
2086 | "'$sects[$sx]' " . | 2086 | "'$sects[$sx]' " . |
2087 | "description in '$decl_name'\n"; | 2087 | "description in '$decl_name'\n"; |
@@ -2107,7 +2107,7 @@ sub check_return_section { | |||
2107 | 2107 | ||
2108 | if (!defined($sections{$section_return}) || | 2108 | if (!defined($sections{$section_return}) || |
2109 | $sections{$section_return} eq "") { | 2109 | $sections{$section_return} eq "") { |
2110 | print STDERR "Warning(${file}:$.): " . | 2110 | print STDERR "${file}:$.: warning: " . |
2111 | "No description found for return value of " . | 2111 | "No description found for return value of " . |
2112 | "'$declaration_name'\n"; | 2112 | "'$declaration_name'\n"; |
2113 | ++$warnings; | 2113 | ++$warnings; |
@@ -2186,7 +2186,7 @@ sub dump_function($$) { | |||
2186 | 2186 | ||
2187 | create_parameterlist($args, ',', $file); | 2187 | create_parameterlist($args, ',', $file); |
2188 | } else { | 2188 | } else { |
2189 | print STDERR "Warning(${file}:$.): cannot understand function prototype: '$prototype'\n"; | 2189 | print STDERR "${file}:$.: warning: cannot understand function prototype: '$prototype'\n"; |
2190 | return; | 2190 | return; |
2191 | } | 2191 | } |
2192 | 2192 | ||
@@ -2251,7 +2251,7 @@ sub tracepoint_munge($) { | |||
2251 | $tracepointargs = $1; | 2251 | $tracepointargs = $1; |
2252 | } | 2252 | } |
2253 | if (($tracepointname eq 0) || ($tracepointargs eq 0)) { | 2253 | if (($tracepointname eq 0) || ($tracepointargs eq 0)) { |
2254 | print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n". | 2254 | print STDERR "${file}:$.: warning: Unrecognized tracepoint format: \n". |
2255 | "$prototype\n"; | 2255 | "$prototype\n"; |
2256 | } else { | 2256 | } else { |
2257 | $prototype = "static inline void trace_$tracepointname($tracepointargs)"; | 2257 | $prototype = "static inline void trace_$tracepointname($tracepointargs)"; |
@@ -2450,7 +2450,7 @@ sub process_file($) { | |||
2450 | } | 2450 | } |
2451 | 2451 | ||
2452 | if (($declaration_purpose eq "") && $verbose) { | 2452 | if (($declaration_purpose eq "") && $verbose) { |
2453 | print STDERR "Warning(${file}:$.): missing initial short description on line:\n"; | 2453 | print STDERR "${file}:$.: warning: missing initial short description on line:\n"; |
2454 | print STDERR $_; | 2454 | print STDERR $_; |
2455 | ++$warnings; | 2455 | ++$warnings; |
2456 | } | 2456 | } |
@@ -2468,10 +2468,10 @@ sub process_file($) { | |||
2468 | } | 2468 | } |
2469 | 2469 | ||
2470 | if ($verbose) { | 2470 | if ($verbose) { |
2471 | print STDERR "Info(${file}:$.): Scanning doc for $identifier\n"; | 2471 | print STDERR "${file}:$.: info: Scanning doc for $identifier\n"; |
2472 | } | 2472 | } |
2473 | } else { | 2473 | } else { |
2474 | print STDERR "Warning(${file}:$.): Cannot understand $_ on line $.", | 2474 | print STDERR "${file}:$.: warning: Cannot understand $_ on line $.", |
2475 | " - I thought it was a doc line\n"; | 2475 | " - I thought it was a doc line\n"; |
2476 | ++$warnings; | 2476 | ++$warnings; |
2477 | $state = 0; | 2477 | $state = 0; |
@@ -2483,7 +2483,7 @@ sub process_file($) { | |||
2483 | 2483 | ||
2484 | if (($contents ne "") && ($contents ne "\n")) { | 2484 | if (($contents ne "") && ($contents ne "\n")) { |
2485 | if (!$in_doc_sect && $verbose) { | 2485 | if (!$in_doc_sect && $verbose) { |
2486 | print STDERR "Warning(${file}:$.): contents before sections\n"; | 2486 | print STDERR "${file}:$.: warning: contents before sections\n"; |
2487 | ++$warnings; | 2487 | ++$warnings; |
2488 | } | 2488 | } |
2489 | dump_section($file, $section, xml_escape($contents)); | 2489 | dump_section($file, $section, xml_escape($contents)); |
@@ -2509,7 +2509,7 @@ sub process_file($) { | |||
2509 | } | 2509 | } |
2510 | # look for doc_com + <text> + doc_end: | 2510 | # look for doc_com + <text> + doc_end: |
2511 | if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') { | 2511 | if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') { |
2512 | print STDERR "Warning(${file}:$.): suspicious ending line: $_"; | 2512 | print STDERR "${file}:$.: warning: suspicious ending line: $_"; |
2513 | ++$warnings; | 2513 | ++$warnings; |
2514 | } | 2514 | } |
2515 | 2515 | ||
@@ -2539,7 +2539,7 @@ sub process_file($) { | |||
2539 | } | 2539 | } |
2540 | } else { | 2540 | } else { |
2541 | # i dont know - bad line? ignore. | 2541 | # i dont know - bad line? ignore. |
2542 | print STDERR "Warning(${file}:$.): bad line: $_"; | 2542 | print STDERR "${file}:$.: warning: bad line: $_"; |
2543 | ++$warnings; | 2543 | ++$warnings; |
2544 | } | 2544 | } |
2545 | } elsif ($state == 5) { # scanning for split parameters | 2545 | } elsif ($state == 5) { # scanning for split parameters |
@@ -2631,7 +2631,7 @@ sub process_file($) { | |||
2631 | } | 2631 | } |
2632 | } | 2632 | } |
2633 | if ($initial_section_counter == $section_counter) { | 2633 | if ($initial_section_counter == $section_counter) { |
2634 | print STDERR "Warning(${file}): no structured comments found\n"; | 2634 | print STDERR "${file}:1: warning: no structured comments found\n"; |
2635 | if (($function_only == 1) && ($show_not_found == 1)) { | 2635 | if (($function_only == 1) && ($show_not_found == 1)) { |
2636 | print STDERR " Was looking for '$_'.\n" for keys %function_table; | 2636 | print STDERR " Was looking for '$_'.\n" for keys %function_table; |
2637 | } | 2637 | } |
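The kernel-doc hunks above convert every diagnostic from the script's old "Warning(file:line):" wrapper to the conventional compiler-style "file:line: level:" prefix, so tools that already parse gcc output (editors, log scanners) can jump straight to the offending location. For illustration, with a hypothetical file and line, the same warning before and after the change:

    Warning(drivers/foo/bar.c:123): No description found for parameter 'dev'
    drivers/foo/bar.c:123: warning: No description found for parameter 'dev'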
diff --git a/scripts/spelling.txt b/scripts/spelling.txt index bb8e4d0a1911..946caf3bd694 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt | |||
@@ -32,6 +32,7 @@ accoring||according | |||
32 | accout||account | 32 | accout||account |
33 | accquire||acquire | 33 | accquire||acquire |
34 | accquired||acquired | 34 | accquired||acquired |
35 | accross||across | ||
35 | acessable||accessible | 36 | acessable||accessible |
36 | acess||access | 37 | acess||access |
37 | achitecture||architecture | 38 | achitecture||architecture |
@@ -100,8 +101,10 @@ appropiate||appropriate | |||
100 | appropriatly||appropriately | 101 | appropriatly||appropriately |
101 | approriate||appropriate | 102 | approriate||appropriate |
102 | approriately||appropriately | 103 | approriately||appropriately |
104 | apropriate||appropriate | ||
103 | aquainted||acquainted | 105 | aquainted||acquainted |
104 | aquired||acquired | 106 | aquired||acquired |
107 | aquisition||acquisition | ||
105 | arbitary||arbitrary | 108 | arbitary||arbitrary |
106 | architechture||architecture | 109 | architechture||architecture |
107 | arguement||argument | 110 | arguement||argument |
@@ -111,6 +114,8 @@ arne't||aren't | |||
111 | arraival||arrival | 114 | arraival||arrival |
112 | artifical||artificial | 115 | artifical||artificial |
113 | artillary||artillery | 116 | artillary||artillery |
117 | asign||assign | ||
118 | assertation||assertion | ||
114 | assiged||assigned | 119 | assiged||assigned |
115 | assigment||assignment | 120 | assigment||assignment |
116 | assigments||assignments | 121 | assigments||assignments |
@@ -136,6 +141,7 @@ automatize||automate | |||
136 | automatized||automated | 141 | automatized||automated |
137 | automatizes||automates | 142 | automatizes||automates |
138 | autonymous||autonomous | 143 | autonymous||autonomous |
144 | auxillary||auxiliary | ||
139 | auxilliary||auxiliary | 145 | auxilliary||auxiliary |
140 | avaiable||available | 146 | avaiable||available |
141 | avaible||available | 147 | avaible||available |
@@ -187,6 +193,7 @@ capatibilities||capabilities | |||
187 | carefuly||carefully | 193 | carefuly||carefully |
188 | cariage||carriage | 194 | cariage||carriage |
189 | catagory||category | 195 | catagory||category |
196 | cehck||check | ||
190 | challange||challenge | 197 | challange||challenge |
191 | challanges||challenges | 198 | challanges||challenges |
192 | chanell||channel | 199 | chanell||channel |
@@ -199,6 +206,8 @@ charactor||character | |||
199 | charater||character | 206 | charater||character |
200 | charaters||characters | 207 | charaters||characters |
201 | charcter||character | 208 | charcter||character |
209 | chcek||check | ||
210 | chck||check | ||
202 | checksuming||checksumming | 211 | checksuming||checksumming |
203 | childern||children | 212 | childern||children |
204 | childs||children | 213 | childs||children |
@@ -231,6 +240,8 @@ compatability||compatibility | |||
231 | compatable||compatible | 240 | compatable||compatible |
232 | compatibiliy||compatibility | 241 | compatibiliy||compatibility |
233 | compatibilty||compatibility | 242 | compatibilty||compatibility |
243 | compatiblity||compatibility | ||
244 | competion||completion | ||
234 | compilant||compliant | 245 | compilant||compliant |
235 | compleatly||completely | 246 | compleatly||completely |
236 | completly||completely | 247 | completly||completely |
@@ -291,6 +302,7 @@ defferred||deferred | |||
291 | definate||definite | 302 | definate||definite |
292 | definately||definitely | 303 | definately||definitely |
293 | defintion||definition | 304 | defintion||definition |
305 | defintions||definitions | ||
294 | defualt||default | 306 | defualt||default |
295 | defult||default | 307 | defult||default |
296 | deivce||device | 308 | deivce||device |
@@ -306,6 +318,7 @@ depreacted||deprecated | |||
306 | depreacte||deprecate | 318 | depreacte||deprecate |
307 | desactivate||deactivate | 319 | desactivate||deactivate |
308 | desciptors||descriptors | 320 | desciptors||descriptors |
321 | descripton||description | ||
309 | descrition||description | 322 | descrition||description |
310 | descritptor||descriptor | 323 | descritptor||descriptor |
311 | desctiptor||descriptor | 324 | desctiptor||descriptor |
@@ -327,6 +340,7 @@ devided||divided | |||
327 | deviece||device | 340 | deviece||device |
328 | diable||disable | 341 | diable||disable |
329 | dictionnary||dictionary | 342 | dictionnary||dictionary |
343 | didnt||didn't | ||
330 | diferent||different | 344 | diferent||different |
331 | differrence||difference | 345 | differrence||difference |
332 | difinition||definition | 346 | difinition||definition |
@@ -344,6 +358,7 @@ docuentation||documentation | |||
344 | documantation||documentation | 358 | documantation||documentation |
345 | documentaion||documentation | 359 | documentaion||documentation |
346 | documment||document | 360 | documment||document |
361 | doesnt||doesn't | ||
347 | dorp||drop | 362 | dorp||drop |
348 | dosen||doesn | 363 | dosen||doesn |
349 | downlad||download | 364 | downlad||download |
@@ -450,11 +465,13 @@ grahical||graphical | |||
450 | grahpical||graphical | 465 | grahpical||graphical |
451 | grapic||graphic | 466 | grapic||graphic |
452 | guage||gauge | 467 | guage||gauge |
468 | guarenteed||guaranteed | ||
453 | guarentee||guarantee | 469 | guarentee||guarantee |
454 | halfs||halves | 470 | halfs||halves |
455 | hander||handler | 471 | hander||handler |
456 | handfull||handful | 472 | handfull||handful |
457 | hanled||handled | 473 | hanled||handled |
474 | happend||happened | ||
458 | harware||hardware | 475 | harware||hardware |
459 | heirarchically||hierarchically | 476 | heirarchically||hierarchically |
460 | helpfull||helpful | 477 | helpfull||helpful |
@@ -512,6 +529,7 @@ initialzed||initialized | |||
512 | initilization||initialization | 529 | initilization||initialization |
513 | initilize||initialize | 530 | initilize||initialize |
514 | inofficial||unofficial | 531 | inofficial||unofficial |
532 | insititute||institute | ||
515 | instal||install | 533 | instal||install |
516 | inteface||interface | 534 | inteface||interface |
517 | integreated||integrated | 535 | integreated||integrated |
@@ -546,6 +564,7 @@ invididual||individual | |||
546 | invokation||invocation | 564 | invokation||invocation |
547 | invokations||invocations | 565 | invokations||invocations |
548 | irrelevent||irrelevant | 566 | irrelevent||irrelevant |
567 | isnt||isn't | ||
549 | isssue||issue | 568 | isssue||issue |
550 | itslef||itself | 569 | itslef||itself |
551 | jave||java | 570 | jave||java |
@@ -558,6 +577,7 @@ langauage||language | |||
558 | langauge||language | 577 | langauge||language |
559 | langugage||language | 578 | langugage||language |
560 | lauch||launch | 579 | lauch||launch |
580 | layed||laid | ||
561 | leightweight||lightweight | 581 | leightweight||lightweight |
562 | lengh||length | 582 | lengh||length |
563 | lenght||length | 583 | lenght||length |
@@ -714,6 +734,7 @@ preceeding||preceding | |||
714 | preceed||precede | 734 | preceed||precede |
715 | precendence||precedence | 735 | precendence||precedence |
716 | precission||precision | 736 | precission||precision |
737 | preemptable||preemptible | ||
717 | prefered||preferred | 738 | prefered||preferred |
718 | prefferably||preferably | 739 | prefferably||preferably |
719 | premption||preemption | 740 | premption||preemption |
@@ -744,6 +765,7 @@ programers||programmers | |||
744 | programm||program | 765 | programm||program |
745 | programms||programs | 766 | programms||programs |
746 | progresss||progress | 767 | progresss||progress |
768 | promiscous||promiscuous | ||
747 | promps||prompts | 769 | promps||prompts |
748 | pronnounced||pronounced | 770 | pronnounced||pronounced |
749 | prononciation||pronunciation | 771 | prononciation||pronunciation |
@@ -817,6 +839,7 @@ reseting||resetting | |||
817 | resizeable||resizable | 839 | resizeable||resizable |
818 | resouces||resources | 840 | resouces||resources |
819 | resoures||resources | 841 | resoures||resources |
842 | responce||response | ||
820 | ressizes||resizes | 843 | ressizes||resizes |
821 | ressource||resource | 844 | ressource||resource |
822 | ressources||resources | 845 | ressources||resources |
@@ -869,6 +892,7 @@ setts||sets | |||
869 | settting||setting | 892 | settting||setting |
870 | shotdown||shutdown | 893 | shotdown||shutdown |
871 | shoud||should | 894 | shoud||should |
895 | shouldnt||shouldn't | ||
872 | shoule||should | 896 | shoule||should |
873 | shrinked||shrunk | 897 | shrinked||shrunk |
874 | siginificantly||significantly | 898 | siginificantly||significantly |
@@ -913,9 +937,11 @@ straming||streaming | |||
913 | struc||struct | 937 | struc||struct |
914 | structres||structures | 938 | structres||structures |
915 | stuct||struct | 939 | stuct||struct |
940 | stucture||structure | ||
916 | sturcture||structure | 941 | sturcture||structure |
917 | subdirectoires||subdirectories | 942 | subdirectoires||subdirectories |
918 | suble||subtle | 943 | suble||subtle |
944 | substract||subtract | ||
919 | succesfully||successfully | 945 | succesfully||successfully |
920 | succesful||successful | 946 | succesful||successful |
921 | successfull||successful | 947 | successfull||successful |
@@ -987,6 +1013,7 @@ unexpectd||unexpected | |||
987 | unexpeted||unexpected | 1013 | unexpeted||unexpected |
988 | unfortunatelly||unfortunately | 1014 | unfortunatelly||unfortunately |
989 | unifiy||unify | 1015 | unifiy||unify |
1016 | unintialized||uninitialized | ||
990 | unknonw||unknown | 1017 | unknonw||unknown |
991 | unknow||unknown | 1018 | unknow||unknown |
992 | unkown||unknown | 1019 | unkown||unknown |
@@ -1027,7 +1054,9 @@ virtiual||virtual | |||
1027 | visiters||visitors | 1054 | visiters||visitors |
1028 | vitual||virtual | 1055 | vitual||virtual |
1029 | wating||waiting | 1056 | wating||waiting |
1057 | wether||whether | ||
1030 | whataver||whatever | 1058 | whataver||whatever |
1059 | whcih||which | ||
1031 | whenver||whenever | 1060 | whenver||whenever |
1032 | wheter||whether | 1061 | wheter||whether |
1033 | whe||when | 1062 | whe||when |
diff --git a/security/commoncap.c b/security/commoncap.c index d103f5a4043d..1832cf701c3d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c | |||
@@ -267,6 +267,16 @@ int cap_capset(struct cred *new, | |||
267 | new->cap_effective = *effective; | 267 | new->cap_effective = *effective; |
268 | new->cap_inheritable = *inheritable; | 268 | new->cap_inheritable = *inheritable; |
269 | new->cap_permitted = *permitted; | 269 | new->cap_permitted = *permitted; |
270 | |||
271 | /* | ||
272 | * Mask off ambient bits that are no longer both permitted and | ||
273 | * inheritable. | ||
274 | */ | ||
275 | new->cap_ambient = cap_intersect(new->cap_ambient, | ||
276 | cap_intersect(*permitted, | ||
277 | *inheritable)); | ||
278 | if (WARN_ON(!cap_ambient_invariant_ok(new))) | ||
279 | return -EINVAL; | ||
270 | return 0; | 280 | return 0; |
271 | } | 281 | } |
272 | 282 | ||
@@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, | |||
347 | 357 | ||
348 | /* | 358 | /* |
349 | * pP' = (X & fP) | (pI & fI) | 359 | * pP' = (X & fP) | (pI & fI) |
360 | * The addition of pA' is handled later. | ||
350 | */ | 361 | */ |
351 | new->cap_permitted.cap[i] = | 362 | new->cap_permitted.cap[i] = |
352 | (new->cap_bset.cap[i] & permitted) | | 363 | (new->cap_bset.cap[i] & permitted) | |
@@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) | |||
474 | { | 485 | { |
475 | const struct cred *old = current_cred(); | 486 | const struct cred *old = current_cred(); |
476 | struct cred *new = bprm->cred; | 487 | struct cred *new = bprm->cred; |
477 | bool effective, has_cap = false; | 488 | bool effective, has_cap = false, is_setid; |
478 | int ret; | 489 | int ret; |
479 | kuid_t root_uid; | 490 | kuid_t root_uid; |
480 | 491 | ||
492 | if (WARN_ON(!cap_ambient_invariant_ok(old))) | ||
493 | return -EPERM; | ||
494 | |||
481 | effective = false; | 495 | effective = false; |
482 | ret = get_file_caps(bprm, &effective, &has_cap); | 496 | ret = get_file_caps(bprm, &effective, &has_cap); |
483 | if (ret < 0) | 497 | if (ret < 0) |
@@ -522,8 +536,9 @@ skip: | |||
522 | * | 536 | * |
523 | * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. | 537 | * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. |
524 | */ | 538 | */ |
525 | if ((!uid_eq(new->euid, old->uid) || | 539 | is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid); |
526 | !gid_eq(new->egid, old->gid) || | 540 | |
541 | if ((is_setid || | ||
527 | !cap_issubset(new->cap_permitted, old->cap_permitted)) && | 542 | !cap_issubset(new->cap_permitted, old->cap_permitted)) && |
528 | bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { | 543 | bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { |
529 | /* downgrade; they get no more than they had, and maybe less */ | 544 | /* downgrade; they get no more than they had, and maybe less */ |
@@ -539,10 +554,28 @@ skip: | |||
539 | new->suid = new->fsuid = new->euid; | 554 | new->suid = new->fsuid = new->euid; |
540 | new->sgid = new->fsgid = new->egid; | 555 | new->sgid = new->fsgid = new->egid; |
541 | 556 | ||
557 | /* File caps or setid cancels ambient. */ | ||
558 | if (has_cap || is_setid) | ||
559 | cap_clear(new->cap_ambient); | ||
560 | |||
561 | /* | ||
562 | * Now that we've computed pA', update pP' to give: | ||
563 | * pP' = (X & fP) | (pI & fI) | pA' | ||
564 | */ | ||
565 | new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); | ||
566 | |||
567 | /* | ||
568 | * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, | ||
569 | * this is the same as pE' = (fE ? pP' : 0) | pA'. | ||
570 | */ | ||
542 | if (effective) | 571 | if (effective) |
543 | new->cap_effective = new->cap_permitted; | 572 | new->cap_effective = new->cap_permitted; |
544 | else | 573 | else |
545 | cap_clear(new->cap_effective); | 574 | new->cap_effective = new->cap_ambient; |
575 | |||
576 | if (WARN_ON(!cap_ambient_invariant_ok(new))) | ||
577 | return -EPERM; | ||
578 | |||
546 | bprm->cap_effective = effective; | 579 | bprm->cap_effective = effective; |
547 | 580 | ||
548 | /* | 581 | /* |
@@ -557,7 +590,7 @@ skip: | |||
557 | * Number 1 above might fail if you don't have a full bset, but I think | 590 | * Number 1 above might fail if you don't have a full bset, but I think |
558 | * that is interesting information to audit. | 591 | * that is interesting information to audit. |
559 | */ | 592 | */ |
560 | if (!cap_isclear(new->cap_effective)) { | 593 | if (!cap_issubset(new->cap_effective, new->cap_ambient)) { |
561 | if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || | 594 | if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || |
562 | !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || | 595 | !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || |
563 | issecure(SECURE_NOROOT)) { | 596 | issecure(SECURE_NOROOT)) { |
@@ -568,6 +601,10 @@ skip: | |||
568 | } | 601 | } |
569 | 602 | ||
570 | new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); | 603 | new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); |
604 | |||
605 | if (WARN_ON(!cap_ambient_invariant_ok(new))) | ||
606 | return -EPERM; | ||
607 | |||
571 | return 0; | 608 | return 0; |
572 | } | 609 | } |
573 | 610 | ||
@@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm) | |||
589 | if (!uid_eq(cred->uid, root_uid)) { | 626 | if (!uid_eq(cred->uid, root_uid)) { |
590 | if (bprm->cap_effective) | 627 | if (bprm->cap_effective) |
591 | return 1; | 628 | return 1; |
592 | if (!cap_isclear(cred->cap_permitted)) | 629 | if (!cap_issubset(cred->cap_permitted, cred->cap_ambient)) |
593 | return 1; | 630 | return 1; |
594 | } | 631 | } |
595 | 632 | ||
@@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) | |||
691 | uid_eq(old->suid, root_uid)) && | 728 | uid_eq(old->suid, root_uid)) && |
692 | (!uid_eq(new->uid, root_uid) && | 729 | (!uid_eq(new->uid, root_uid) && |
693 | !uid_eq(new->euid, root_uid) && | 730 | !uid_eq(new->euid, root_uid) && |
694 | !uid_eq(new->suid, root_uid)) && | 731 | !uid_eq(new->suid, root_uid))) { |
695 | !issecure(SECURE_KEEP_CAPS)) { | 732 | if (!issecure(SECURE_KEEP_CAPS)) { |
696 | cap_clear(new->cap_permitted); | 733 | cap_clear(new->cap_permitted); |
697 | cap_clear(new->cap_effective); | 734 | cap_clear(new->cap_effective); |
735 | } | ||
736 | |||
737 | /* | ||
738 | * Pre-ambient programs expect setresuid to nonroot followed | ||
739 | * by exec to drop capabilities. We should make sure that | ||
740 | * this remains the case. | ||
741 | */ | ||
742 | cap_clear(new->cap_ambient); | ||
698 | } | 743 | } |
699 | if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) | 744 | if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) |
700 | cap_clear(new->cap_effective); | 745 | cap_clear(new->cap_effective); |
@@ -924,6 +969,44 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
924 | new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); | 969 | new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); |
925 | return commit_creds(new); | 970 | return commit_creds(new); |
926 | 971 | ||
972 | case PR_CAP_AMBIENT: | ||
973 | if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { | ||
974 | if (arg3 | arg4 | arg5) | ||
975 | return -EINVAL; | ||
976 | |||
977 | new = prepare_creds(); | ||
978 | if (!new) | ||
979 | return -ENOMEM; | ||
980 | cap_clear(new->cap_ambient); | ||
981 | return commit_creds(new); | ||
982 | } | ||
983 | |||
984 | if (((!cap_valid(arg3)) | arg4 | arg5)) | ||
985 | return -EINVAL; | ||
986 | |||
987 | if (arg2 == PR_CAP_AMBIENT_IS_SET) { | ||
988 | return !!cap_raised(current_cred()->cap_ambient, arg3); | ||
989 | } else if (arg2 != PR_CAP_AMBIENT_RAISE && | ||
990 | arg2 != PR_CAP_AMBIENT_LOWER) { | ||
991 | return -EINVAL; | ||
992 | } else { | ||
993 | if (arg2 == PR_CAP_AMBIENT_RAISE && | ||
994 | (!cap_raised(current_cred()->cap_permitted, arg3) || | ||
995 | !cap_raised(current_cred()->cap_inheritable, | ||
996 | arg3) || | ||
997 | issecure(SECURE_NO_CAP_AMBIENT_RAISE))) | ||
998 | return -EPERM; | ||
999 | |||
1000 | new = prepare_creds(); | ||
1001 | if (!new) | ||
1002 | return -ENOMEM; | ||
1003 | if (arg2 == PR_CAP_AMBIENT_RAISE) | ||
1004 | cap_raise(new->cap_ambient, arg3); | ||
1005 | else | ||
1006 | cap_lower(new->cap_ambient, arg3); | ||
1007 | return commit_creds(new); | ||
1008 | } | ||
1009 | |||
927 | default: | 1010 | default: |
928 | /* No functionality available - continue with default */ | 1011 | /* No functionality available - continue with default */ |
929 | return -ENOSYS; | 1012 | return -ENOSYS; |
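The commoncap.c changes above are the core of the new ambient set pA: PR_CAP_AMBIENT_RAISE is only honoured for a capability already in both pP and pI (and only while SECURE_NO_CAP_AMBIENT_RAISE is unset), pA is cleared by setid transitions and by file capabilities, and the exec transformation becomes pP' = (X & fP) | (pI & fI) | pA' with pE' = (fE ? pP' : pA'). A minimal userspace sketch of the intended usage pattern follows — the "./helper" path is a placeholder, and the prctl constants are copied from the selftest below in case libc headers lack them:

    #include <err.h>
    #include <linux/capability.h>
    #include <sys/prctl.h>
    #include <unistd.h>

    #ifndef PR_CAP_AMBIENT
    #define PR_CAP_AMBIENT       47
    #define PR_CAP_AMBIENT_RAISE 2
    #endif

    int main(void)
    {
            /* Needs CAP_NET_BIND_SERVICE in both pP and pI first;
             * see the cap_task_prctl() checks above. */
            if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
                      CAP_NET_BIND_SERVICE, 0, 0) != 0)
                    err(1, "PR_CAP_AMBIENT_RAISE");

            /* pA survives exec of a non-setid, no-file-caps binary and
             * pE' picks up pA', so the helper starts with the capability
             * effective. "./helper" is a placeholder binary. */
            execl("./helper", "helper", (char *)NULL);
            err(1, "execl");
    }

Note the asymmetry this preserves: a setuid or file-caps execve behaves exactly as before, because pA' is zeroed before pP' and pE' are computed.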
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index bd536cb221e2..43b4cddbf2b3 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c | |||
@@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork) | |||
848 | new->cap_inheritable = old->cap_inheritable; | 848 | new->cap_inheritable = old->cap_inheritable; |
849 | new->cap_permitted = old->cap_permitted; | 849 | new->cap_permitted = old->cap_permitted; |
850 | new->cap_effective = old->cap_effective; | 850 | new->cap_effective = old->cap_effective; |
851 | new->cap_ambient = old->cap_ambient; | ||
851 | new->cap_bset = old->cap_bset; | 852 | new->cap_bset = old->cap_bset; |
852 | 853 | ||
853 | new->jit_keyring = old->jit_keyring; | 854 | new->jit_keyring = old->jit_keyring; |
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 564079c5c49d..cdf4c589a391 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c | |||
@@ -1100,7 +1100,7 @@ static void selinux_write_opts(struct seq_file *m, | |||
1100 | seq_puts(m, prefix); | 1100 | seq_puts(m, prefix); |
1101 | if (has_comma) | 1101 | if (has_comma) |
1102 | seq_putc(m, '\"'); | 1102 | seq_putc(m, '\"'); |
1103 | seq_puts(m, opts->mnt_opts[i]); | 1103 | seq_escape(m, opts->mnt_opts[i], "\"\n\\"); |
1104 | if (has_comma) | 1104 | if (has_comma) |
1105 | seq_putc(m, '\"'); | 1105 | seq_putc(m, '\"'); |
1106 | } | 1106 | } |
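The one-line SELinux change above closes a /proc/self/mounts confusion: a crafted mount option value may itself contain '"', '\n' or '\\' and, emitted raw by seq_puts(), would corrupt the quoted, comma-separated option list. seq_escape() instead prints each of the listed characters as an octal escape; for a hypothetical context value containing an embedded quote, a"b should come out as a\042b.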
diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore new file mode 100644 index 000000000000..b732dd0d4738 --- /dev/null +++ b/tools/testing/selftests/capabilities/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | test_execve | ||
2 | validate_cap | ||
diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile new file mode 100644 index 000000000000..8c8f0c1f0889 --- /dev/null +++ b/tools/testing/selftests/capabilities/Makefile | |||
@@ -0,0 +1,18 @@ | |||
1 | all: | ||
2 | |||
3 | include ../lib.mk | ||
4 | |||
5 | .PHONY: all clean | ||
6 | |||
7 | TARGETS := validate_cap test_execve | ||
8 | TEST_PROGS := test_execve | ||
9 | |||
10 | CFLAGS := -O2 -g -std=gnu99 -Wall -lcap-ng | ||
11 | |||
12 | all: $(TARGETS) | ||
13 | |||
14 | clean: | ||
15 | $(RM) $(TARGETS) | ||
16 | |||
17 | $(TARGETS): %: %.c | ||
18 | $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl | ||
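A build note on the Makefile above: each test is compiled and linked in a single $(CC) step, so the -lcap-ng in CFLAGS lands on the link line and both binaries link against libcap-ng, which must therefore be installed (the cap-ng.h header plus the library). Assuming the usual selftests layout, make -C tools/testing/selftests/capabilities builds the binaries and ./test_execve runs the suite.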
diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c new file mode 100644 index 000000000000..10a21a958aaf --- /dev/null +++ b/tools/testing/selftests/capabilities/test_execve.c | |||
@@ -0,0 +1,427 @@ | |||
1 | #define _GNU_SOURCE | ||
2 | |||
3 | #include <cap-ng.h> | ||
4 | #include <err.h> | ||
5 | #include <linux/capability.h> | ||
6 | #include <stdbool.h> | ||
7 | #include <string.h> | ||
8 | #include <stdio.h> | ||
9 | #include <fcntl.h> | ||
10 | #include <errno.h> | ||
11 | #include <stdarg.h> | ||
12 | #include <sched.h> | ||
13 | #include <sys/mount.h> | ||
14 | #include <limits.h> | ||
15 | #include <libgen.h> | ||
16 | #include <malloc.h> | ||
17 | #include <sys/wait.h> | ||
18 | #include <sys/prctl.h> | ||
19 | #include <sys/stat.h> | ||
20 | |||
21 | #ifndef PR_CAP_AMBIENT | ||
22 | #define PR_CAP_AMBIENT 47 | ||
23 | # define PR_CAP_AMBIENT_IS_SET 1 | ||
24 | # define PR_CAP_AMBIENT_RAISE 2 | ||
25 | # define PR_CAP_AMBIENT_LOWER 3 | ||
26 | # define PR_CAP_AMBIENT_CLEAR_ALL 4 | ||
27 | #endif | ||
28 | |||
29 | static int nerrs; | ||
30 | |||
31 | static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap) | ||
32 | { | ||
33 | char buf[4096]; | ||
34 | int fd; | ||
35 | ssize_t written; | ||
36 | int buf_len; | ||
37 | |||
38 | buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); | ||
39 | if (buf_len < 0) { | ||
40 | err(1, "vsnprintf failed"); | ||
41 | } | ||
42 | if (buf_len >= sizeof(buf)) { | ||
43 | errx(1, "vsnprintf output truncated"); | ||
44 | } | ||
45 | |||
46 | fd = open(filename, O_WRONLY); | ||
47 | if (fd < 0) { | ||
48 | if ((errno == ENOENT) && enoent_ok) | ||
49 | return; | ||
50 | err(1, "open of %s failed", filename); | ||
51 | } | ||
52 | written = write(fd, buf, buf_len); | ||
53 | if (written != buf_len) { | ||
54 | if (written >= 0) { | ||
55 | errx(1, "short write to %s", filename); | ||
56 | } else { | ||
57 | err(1, "write to %s failed", filename); | ||
58 | } | ||
59 | } | ||
60 | if (close(fd) != 0) { | ||
61 | err(1, "close of %s failed", filename); | ||
62 | } | ||
63 | } | ||
64 | |||
65 | static void maybe_write_file(char *filename, char *fmt, ...) | ||
66 | { | ||
67 | va_list ap; | ||
68 | |||
69 | va_start(ap, fmt); | ||
70 | vmaybe_write_file(true, filename, fmt, ap); | ||
71 | va_end(ap); | ||
72 | } | ||
73 | |||
74 | static void write_file(char *filename, char *fmt, ...) | ||
75 | { | ||
76 | va_list ap; | ||
77 | |||
78 | va_start(ap, fmt); | ||
79 | vmaybe_write_file(false, filename, fmt, ap); | ||
80 | va_end(ap); | ||
81 | } | ||
82 | |||
83 | static bool create_and_enter_ns(uid_t inner_uid) | ||
84 | { | ||
85 | uid_t outer_uid; | ||
86 | gid_t outer_gid; | ||
87 | int i; | ||
88 | bool have_outer_privilege; | ||
89 | |||
90 | outer_uid = getuid(); | ||
91 | outer_gid = getgid(); | ||
92 | |||
93 | /* | ||
94 | * TODO: If we're already root, we could skip creating the userns. | ||
95 | */ | ||
96 | |||
97 | if (unshare(CLONE_NEWNS) == 0) { | ||
98 | printf("[NOTE]\tUsing global UIDs for tests\n"); | ||
99 | if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0) | ||
100 | err(1, "PR_SET_KEEPCAPS"); | ||
101 | if (setresuid(inner_uid, inner_uid, -1) != 0) | ||
102 | err(1, "setresuid"); | ||
103 | |||
104 | // Re-enable effective caps | ||
105 | capng_get_caps_process(); | ||
106 | for (i = 0; i < CAP_LAST_CAP; i++) | ||
107 | if (capng_have_capability(CAPNG_PERMITTED, i)) | ||
108 | capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i); | ||
109 | if (capng_apply(CAPNG_SELECT_CAPS) != 0) | ||
110 | err(1, "capng_apply"); | ||
111 | |||
112 | have_outer_privilege = true; | ||
113 | } else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) { | ||
114 | printf("[NOTE]\tUsing a user namespace for tests\n"); | ||
115 | maybe_write_file("/proc/self/setgroups", "deny"); | ||
116 | write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid); | ||
117 | write_file("/proc/self/gid_map", "0 %d 1", outer_gid); | ||
118 | |||
119 | have_outer_privilege = false; | ||
120 | } else { | ||
121 | errx(1, "must be root or be able to create a userns"); | ||
122 | } | ||
123 | |||
124 | if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) | ||
125 | err(1, "remount everything private"); | ||
126 | |||
127 | return have_outer_privilege; | ||
128 | } | ||
129 | |||
130 | static void chdir_to_tmpfs(void) | ||
131 | { | ||
132 | char cwd[PATH_MAX]; | ||
133 | if (getcwd(cwd, sizeof(cwd)) != cwd) | ||
134 | err(1, "getcwd"); | ||
135 | |||
136 | if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777") != 0) | ||
137 | err(1, "mount private tmpfs"); | ||
138 | |||
139 | if (chdir(cwd) != 0) | ||
140 | err(1, "chdir to private tmpfs"); | ||
141 | |||
142 | if (umount2(".", MNT_DETACH) != 0) | ||
143 | err(1, "detach private tmpfs"); | ||
144 | } | ||
145 | |||
146 | static void copy_fromat_to(int fromfd, const char *fromname, const char *toname) | ||
147 | { | ||
148 | int from = openat(fromfd, fromname, O_RDONLY); | ||
149 | if (from == -1) | ||
150 | err(1, "open copy source"); | ||
151 | |||
152 | int to = open(toname, O_CREAT | O_WRONLY | O_EXCL, 0700); | ||
153 | |||
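153 | if (to == -1) err(1, "open copy destination"); /* was unchecked */ | ||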
154 | while (true) { | ||
155 | char buf[4096]; | ||
156 | ssize_t sz = read(from, buf, sizeof(buf)); | ||
157 | if (sz == 0) | ||
158 | break; | ||
159 | if (sz < 0) | ||
160 | err(1, "read"); | ||
161 | |||
162 | if (write(to, buf, sz) != sz) | ||
163 | err(1, "write"); /* no short writes on tmpfs */ | ||
164 | } | ||
165 | |||
166 | close(from); | ||
167 | close(to); | ||
168 | } | ||
169 | |||
170 | static bool fork_wait(void) | ||
171 | { | ||
172 | pid_t child = fork(); | ||
173 | if (child == 0) { | ||
174 | nerrs = 0; | ||
175 | return true; | ||
176 | } else if (child > 0) { | ||
177 | int status; | ||
178 | if (waitpid(child, &status, 0) != child || | ||
179 | !WIFEXITED(status)) { | ||
180 | printf("[FAIL]\tChild died\n"); | ||
181 | nerrs++; | ||
182 | } else if (WEXITSTATUS(status) != 0) { | ||
183 | printf("[FAIL]\tChild failed\n"); | ||
184 | nerrs++; | ||
185 | } else { | ||
186 | printf("[OK]\tChild succeeded\n"); | ||
187 | } | ||
188 | |||
189 | return false; | ||
190 | } else { | ||
191 | err(1, "fork"); | ||
192 | } | ||
193 | } | ||
194 | |||
195 | static void exec_other_validate_cap(const char *name, | ||
196 | bool eff, bool perm, bool inh, bool ambient) | ||
197 | { | ||
198 | execl(name, name, (eff ? "1" : "0"), | ||
199 | (perm ? "1" : "0"), (inh ? "1" : "0"), (ambient ? "1" : "0"), | ||
200 | NULL); | ||
201 | err(1, "execl"); | ||
202 | } | ||
203 | |||
204 | static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient) | ||
205 | { | ||
206 | exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient); | ||
207 | } | ||
208 | |||
209 | static int do_tests(int uid, const char *our_path) | ||
210 | { | ||
211 | bool have_outer_privilege = create_and_enter_ns(uid); | ||
212 | |||
213 | int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY); | ||
214 | if (ourpath_fd == -1) | ||
215 | err(1, "open '%s'", our_path); | ||
216 | |||
217 | chdir_to_tmpfs(); | ||
218 | |||
219 | copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap"); | ||
220 | |||
221 | if (have_outer_privilege) { | ||
222 | gid_t gid = getegid(); | ||
223 | |||
224 | copy_fromat_to(ourpath_fd, "validate_cap", | ||
225 | "validate_cap_suidroot"); | ||
226 | if (chown("validate_cap_suidroot", 0, -1) != 0) | ||
227 | err(1, "chown"); | ||
228 | if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0) | ||
229 | err(1, "chmod"); | ||
230 | |||
231 | copy_fromat_to(ourpath_fd, "validate_cap", | ||
232 | "validate_cap_suidnonroot"); | ||
233 | if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0) | ||
234 | err(1, "chown"); | ||
235 | if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0) | ||
236 | err(1, "chmod"); | ||
237 | |||
238 | copy_fromat_to(ourpath_fd, "validate_cap", | ||
239 | "validate_cap_sgidroot"); | ||
240 | if (chown("validate_cap_sgidroot", -1, 0) != 0) | ||
241 | err(1, "chown"); | ||
242 | if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0) | ||
243 | err(1, "chmod"); | ||
244 | |||
245 | copy_fromat_to(ourpath_fd, "validate_cap", | ||
246 | "validate_cap_sgidnonroot"); | ||
247 | if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0) | ||
248 | err(1, "chown"); | ||
249 | if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0) | ||
250 | err(1, "chmod"); | ||
251 | } | ||
252 | |||
253 | capng_get_caps_process(); | ||
254 | |||
255 | /* Make sure that i starts out clear */ | ||
256 | capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); | ||
257 | if (capng_apply(CAPNG_SELECT_CAPS) != 0) | ||
258 | err(1, "capng_apply"); | ||
259 | |||
260 | if (uid == 0) { | ||
261 | printf("[RUN]\tRoot => ep\n"); | ||
262 | if (fork_wait()) | ||
263 | exec_validate_cap(true, true, false, false); | ||
264 | } else { | ||
265 | printf("[RUN]\tNon-root => no caps\n"); | ||
266 | if (fork_wait()) | ||
267 | exec_validate_cap(false, false, false, false); | ||
268 | } | ||
269 | |||
270 | printf("[OK]\tCheck cap_ambient manipulation rules\n"); | ||
271 | |||
272 | /* We should not be able to add ambient caps yet. */ | ||
273 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) { | ||
274 | if (errno == EINVAL) | ||
275 | printf("[FAIL]\tPR_CAP_AMBIENT_RAISE isn't supported\n"); | ||
276 | else | ||
277 | printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed eith EPERM on a non-inheritable cap\n"); | ||
278 | return 1; | ||
279 | } | ||
280 | printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n"); | ||
281 | |||
282 | capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW); | ||
283 | capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW); | ||
284 | capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW); | ||
285 | if (capng_apply(CAPNG_SELECT_CAPS) != 0) | ||
286 | err(1, "capng_apply"); | ||
287 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) { | ||
288 | printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n"); | ||
289 | return 1; | ||
290 | } | ||
291 | printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-permitted cap\n"); | ||
292 | |||
293 | capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); | ||
294 | if (capng_apply(CAPNG_SELECT_CAPS) != 0) | ||
295 | err(1, "capng_apply"); | ||
296 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { | ||
297 | printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have succeeded\n"); | ||
298 | return 1; | ||
299 | } | ||
300 | printf("[OK]\tPR_CAP_AMBIENT_RAISE worked\n"); | ||
301 | |||
302 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) { | ||
303 | printf("[FAIL]\tPR_CAP_AMBIENT_IS_SET is broken\n"); | ||
304 | return 1; | ||
305 | } | ||
306 | |||
307 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0) | ||
308 | err(1, "PR_CAP_AMBIENT_CLEAR_ALL"); | ||
309 | |||
310 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { | ||
311 | printf("[FAIL]\tPR_CAP_AMBIENT_CLEAR_ALL didn't work\n"); | ||
312 | return 1; | ||
313 | } | ||
314 | |||
315 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) | ||
316 | err(1, "PR_CAP_AMBIENT_RAISE"); | ||
317 | |||
318 | capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); | ||
319 | if (capng_apply(CAPNG_SELECT_CAPS) != 0) | ||
320 | err(1, "capng_apply"); | ||
321 | |||
322 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { | ||
323 | printf("[FAIL]\tDropping I should have dropped A\n"); | ||
324 | return 1; | ||
325 | } | ||
326 | |||
327 | printf("[OK]\tBasic manipulation appears to work\n"); | ||
328 | |||
329 | capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); | ||
330 | if (capng_apply(CAPNG_SELECT_CAPS) != 0) | ||
331 | err(1, "capng_apply"); | ||
332 | if (uid == 0) { | ||
333 | printf("[RUN]\tRoot +i => eip\n"); | ||
334 | if (fork_wait()) | ||
335 | exec_validate_cap(true, true, true, false); | ||
336 | } else { | ||
337 | printf("[RUN]\tNon-root +i => i\n"); | ||
338 | if (fork_wait()) | ||
339 | exec_validate_cap(false, false, true, false); | ||
340 | } | ||
341 | |||
342 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) | ||
343 | err(1, "PR_CAP_AMBIENT_RAISE"); | ||
344 | |||
345 | printf("[RUN]\tUID %d +ia => eipa\n", uid); | ||
346 | if (fork_wait()) | ||
347 | exec_validate_cap(true, true, true, true); | ||
348 | |||
349 | /* The remaining tests need real privilege */ | ||
350 | |||
351 | if (!have_outer_privilege) { | ||
352 | printf("[SKIP]\tSUID/SGID tests (needs privilege)\n"); | ||
353 | goto done; | ||
354 | } | ||
355 | |||
356 | if (uid == 0) { | ||
357 | printf("[RUN]\tRoot +ia, suidroot => eipa\n"); | ||
358 | if (fork_wait()) | ||
359 | exec_other_validate_cap("./validate_cap_suidroot", | ||
360 | true, true, true, true); | ||
361 | |||
362 | printf("[RUN]\tRoot +ia, suidnonroot => ip\n"); | ||
363 | if (fork_wait()) | ||
364 | exec_other_validate_cap("./validate_cap_suidnonroot", | ||
365 | false, true, true, false); | ||
366 | |||
367 | printf("[RUN]\tRoot +ia, sgidroot => eipa\n"); | ||
368 | if (fork_wait()) | ||
369 | exec_other_validate_cap("./validate_cap_sgidroot", | ||
370 | true, true, true, true); | ||
371 | |||
372 | if (fork_wait()) { | ||
373 | printf("[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n"); | ||
374 | if (setresgid(1, 1, 1) != 0) | ||
375 | err(1, "setresgid"); | ||
376 | exec_other_validate_cap("./validate_cap_sgidroot", | ||
377 | true, true, true, false); | ||
378 | } | ||
379 | |||
380 | printf("[RUN]\tRoot +ia, sgidnonroot => eip\n"); | ||
381 | if (fork_wait()) | ||
382 | exec_other_validate_cap("./validate_cap_sgidnonroot", | ||
383 | true, true, true, false); | ||
384 | } else { | ||
385 | printf("[RUN]\tNon-root +ia, sgidnonroot => i\n"); | ||
386 | exec_other_validate_cap("./validate_cap_sgidnonroot", | ||
387 | false, false, true, false); | ||
388 | |||
389 | if (fork_wait()) { | ||
390 | printf("[RUN]\tNon-root +ia, sgidroot => i\n"); | ||
391 | if (setresgid(1, 1, 1) != 0) | ||
392 | err(1, "setresgid"); | ||
393 | exec_other_validate_cap("./validate_cap_sgidroot", | ||
394 | false, false, true, false); | ||
395 | } | ||
396 | } | ||
397 | |||
398 | done: | ||
399 | return nerrs ? 1 : 0; | ||
400 | } | ||
401 | |||
402 | int main(int argc, char **argv) | ||
403 | { | ||
404 | char *tmp1, *tmp2, *our_path; | ||
405 | |||
406 | /* Find our path */ | ||
407 | tmp1 = strdup(argv[0]); | ||
408 | if (!tmp1) | ||
409 | err(1, "strdup"); | ||
410 | tmp2 = dirname(tmp1); | ||
411 | our_path = strdup(tmp2); | ||
412 | if (!our_path) | ||
413 | err(1, "strdup"); | ||
414 | free(tmp1); | ||
415 | |||
416 | if (fork_wait()) { | ||
417 | printf("[RUN]\t+++ Tests with uid == 0 +++\n"); | ||
418 | return do_tests(0, our_path); | ||
419 | } | ||
420 | |||
421 | if (fork_wait()) { | ||
422 | printf("[RUN]\t+++ Tests with uid != 0 +++\n"); | ||
423 | return do_tests(1, our_path); | ||
424 | } | ||
425 | |||
426 | return nerrs ? 1 : 0; | ||
427 | } | ||
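A key to the test output above: the letters in messages such as "Root +ia => eipa" name the capability sets expected to contain CAP_NET_BIND_SERVICE in the exec'd helper — e(ffective), i(nheritable), p(ermitted), a(mbient) — matching the four booleans passed to validate_cap. So "suidnonroot => ip" asserts that exec of a setuid-nonroot binary keeps the cap permitted and inheritable but drops it from the effective and ambient sets.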
diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c new file mode 100644 index 000000000000..dd3c45f7b23c --- /dev/null +++ b/tools/testing/selftests/capabilities/validate_cap.c | |||
@@ -0,0 +1,73 @@ | |||
1 | #include <cap-ng.h> | ||
2 | #include <err.h> | ||
3 | #include <linux/capability.h> | ||
4 | #include <stdbool.h> | ||
5 | #include <string.h> | ||
6 | #include <stdio.h> | ||
7 | #include <sys/prctl.h> | ||
8 | #include <sys/auxv.h> | ||
9 | |||
10 | #ifndef PR_CAP_AMBIENT | ||
11 | #define PR_CAP_AMBIENT 47 | ||
12 | # define PR_CAP_AMBIENT_IS_SET 1 | ||
13 | # define PR_CAP_AMBIENT_RAISE 2 | ||
14 | # define PR_CAP_AMBIENT_LOWER 3 | ||
15 | # define PR_CAP_AMBIENT_CLEAR_ALL 4 | ||
16 | #endif | ||
17 | |||
18 | #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19) | ||
19 | # define HAVE_GETAUXVAL | ||
20 | #endif | ||
21 | |||
22 | static bool bool_arg(char **argv, int i) | ||
23 | { | ||
24 | if (!strcmp(argv[i], "0")) | ||
25 | return false; | ||
26 | else if (!strcmp(argv[i], "1")) | ||
27 | return true; | ||
28 | else | ||
29 | errx(1, "wrong argv[%d]", i); | ||
30 | } | ||
31 | |||
32 | int main(int argc, char **argv) | ||
33 | { | ||
34 | const char *atsec = ""; | ||
35 | |||
36 | /* | ||
37 | * Be careful just in case a setgid or setcapped copy of this | ||
38 | * helper gets out. | ||
39 | */ | ||
40 | |||
41 | if (argc != 5) | ||
42 | errx(1, "wrong argc"); | ||
43 | |||
44 | #ifdef HAVE_GETAUXVAL | ||
45 | if (getauxval(AT_SECURE)) | ||
46 | atsec = " (AT_SECURE is set)"; | ||
47 | else | ||
48 | atsec = " (AT_SECURE is not set)"; | ||
49 | #endif | ||
50 | |||
51 | capng_get_caps_process(); | ||
52 | |||
53 | if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) { | ||
54 | printf("[FAIL]\tWrong effective state%s\n", atsec); | ||
55 | return 1; | ||
56 | } | ||
57 | if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) { | ||
58 | printf("[FAIL]\tWrong permitted state%s\n", atsec); | ||
59 | return 1; | ||
60 | } | ||
61 | if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) { | ||
62 | printf("[FAIL]\tWrong inheritable state%s\n", atsec); | ||
63 | return 1; | ||
64 | } | ||
65 | |||
66 | if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) { | ||
67 | printf("[FAIL]\tWrong ambient state%s\n", atsec); | ||
68 | return 1; | ||
69 | } | ||
70 | |||
71 | printf("[OK]\tCapabilities after execve were correct\n"); | ||
72 | return 0; | ||
73 | } | ||
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 231b9a031f6a..0d6854744b37 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile | |||
@@ -8,10 +8,13 @@ BINARIES += hugetlbfstest | |||
8 | BINARIES += map_hugetlb | 8 | BINARIES += map_hugetlb |
9 | BINARIES += thuge-gen | 9 | BINARIES += thuge-gen |
10 | BINARIES += transhuge-stress | 10 | BINARIES += transhuge-stress |
11 | BINARIES += userfaultfd | ||
11 | 12 | ||
12 | all: $(BINARIES) | 13 | all: $(BINARIES) |
13 | %: %.c | 14 | %: %.c |
14 | $(CC) $(CFLAGS) -o $@ $^ -lrt | 15 | $(CC) $(CFLAGS) -o $@ $^ -lrt |
16 | userfaultfd: userfaultfd.c | ||
17 | $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread | ||
15 | 18 | ||
16 | TEST_PROGS := run_vmtests | 19 | TEST_PROGS := run_vmtests |
17 | TEST_FILES := $(BINARIES) | 20 | TEST_FILES := $(BINARIES) |
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 49ece11ff7fd..831adeb5fc55 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests | |||
@@ -86,6 +86,17 @@ else | |||
86 | echo "[PASS]" | 86 | echo "[PASS]" |
87 | fi | 87 | fi |
88 | 88 | ||
89 | echo "--------------------" | ||
90 | echo "running userfaultfd" | ||
91 | echo "--------------------" | ||
92 | ./userfaultfd 128 32 | ||
93 | if [ $? -ne 0 ]; then | ||
94 | echo "[FAIL]" | ||
95 | exitcode=1 | ||
96 | else | ||
97 | echo "[PASS]" | ||
98 | fi | ||
99 | |||
89 | #cleanup | 100 | #cleanup |
90 | umount $mnt | 101 | umount $mnt |
91 | rm -rf $mnt | 102 | rm -rf $mnt |
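For reference, the new stanza runs the stress test as ./userfaultfd 128 32; per the header comment of the test added below, that is a 128 MiB bounce area exercised for 32 bounces.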
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c new file mode 100644 index 000000000000..0c0b83953352 --- /dev/null +++ b/tools/testing/selftests/vm/userfaultfd.c | |||
@@ -0,0 +1,636 @@ | |||
1 | /* | ||
2 | * Stress userfaultfd syscall. | ||
3 | * | ||
4 | * Copyright (C) 2015 Red Hat, Inc. | ||
5 | * | ||
6 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
7 | * the COPYING file in the top-level directory. | ||
8 | * | ||
9 | * This test allocates two virtual areas and bounces the physical | ||
10 | * memory across the two virtual areas (from area_src to area_dst) | ||
11 | * using userfaultfd. | ||
12 | * | ||
13 | * There are three threads running per CPU: | ||
14 | * | ||
15 | * 1) one per-CPU thread takes a per-page pthread_mutex in a random | ||
16 | * page of the area_dst (while the physical page may still be in | ||
17 | * area_src), and increments a per-page counter in the same page, | ||
18 | * and checks its value against a verification region. | ||
19 | * | ||
20 | * 2) another per-CPU thread handles the userfaults generated by | ||
21 | * thread 1 above. userfaultfd blocking reads or poll() modes are | ||
22 | * exercised interleaved. | ||
23 | * | ||
24 | * 3) one last per-CPU thread transfers the memory in the background | ||
25 | * at maximum bandwidth (if not already transferred by thread | ||
26 | * 2). Each cpu thread takes care of transferring a portion of the | ||
27 | * area. | ||
28 | * | ||
29 | * When all threads of type 3 have completed the transfer, one bounce is | ||
30 | * complete. area_src and area_dst are then swapped. All threads are | ||
31 | * respawned and so the bounce is immediately restarted in the | ||
32 | * opposite direction. | ||
33 | * | ||
34 | * The per-CPU threads of type 1, by triggering userfaults inside | ||
35 | * pthread_mutex_lock, also verify the atomicity of the memory | ||
36 | * transfer (UFFDIO_COPY). | ||
37 | * | ||
38 | * The program takes two parameters: the amounts of physical memory in | ||
39 | * megabytes (MiB) of the area and the number of bounces to execute. | ||
40 | * | ||
41 | * # 100MiB 99999 bounces | ||
42 | * ./userfaultfd 100 99999 | ||
43 | * | ||
44 | * # 1GiB 99 bounces | ||
45 | * ./userfaultfd 1000 99 | ||
46 | * | ||
47 | * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers | ||
48 | * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done | ||
49 | */ | ||
50 | |||
51 | #define _GNU_SOURCE | ||
52 | #include <stdio.h> | ||
53 | #include <errno.h> | ||
54 | #include <unistd.h> | ||
55 | #include <stdlib.h> | ||
56 | #include <sys/types.h> | ||
57 | #include <sys/stat.h> | ||
58 | #include <fcntl.h> | ||
59 | #include <time.h> | ||
60 | #include <signal.h> | ||
61 | #include <poll.h> | ||
62 | #include <string.h> | ||
63 | #include <sys/mman.h> | ||
64 | #include <sys/syscall.h> | ||
65 | #include <sys/ioctl.h> | ||
66 | #include <pthread.h> | ||
67 | #include "../../../../include/uapi/linux/userfaultfd.h" | ||
68 | |||
69 | #ifdef __x86_64__ | ||
70 | #define __NR_userfaultfd 323 | ||
71 | #elif defined(__i386__) | ||
72 | #define __NR_userfaultfd 359 | ||
73 | #elif defined(__powerpc__) | ||
74 | #define __NR_userfaultfd 364 | ||
75 | #else | ||
76 | #error "missing __NR_userfaultfd definition" | ||
77 | #endif | ||
78 | |||
79 | static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; | ||
80 | |||
81 | #define BOUNCE_RANDOM (1<<0) | ||
82 | #define BOUNCE_RACINGFAULTS (1<<1) | ||
83 | #define BOUNCE_VERIFY (1<<2) | ||
84 | #define BOUNCE_POLL (1<<3) | ||
85 | static int bounces; | ||
86 | |||
87 | static unsigned long long *count_verify; | ||
88 | static int uffd, finished, *pipefd; | ||
89 | static char *area_src, *area_dst; | ||
90 | static char *zeropage; | ||
91 | pthread_attr_t attr; | ||
92 | |||
93 | /* pthread_mutex_t starts at page offset 0 */ | ||
94 | #define area_mutex(___area, ___nr) \ | ||
95 | ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) | ||
96 | /* | ||
97 | * count is placed in the page after the pthread_mutex_t, naturally | ||
98 | * aligned to avoid misaligned-access faults on non-x86 archs. | ||
99 | */ | ||
100 | #define area_count(___area, ___nr) \ | ||
101 | ((volatile unsigned long long *) ((unsigned long) \ | ||
102 | ((___area) + (___nr)*page_size + \ | ||
103 | sizeof(pthread_mutex_t) + \ | ||
104 | sizeof(unsigned long long) - 1) & \ | ||
105 | ~(unsigned long)(sizeof(unsigned long long) \ | ||
106 | - 1))) | ||
107 | |||
108 | static int my_bcmp(char *str1, char *str2, size_t n) | ||
109 | { | ||
110 | unsigned long i; | ||
111 | for (i = 0; i < n; i++) | ||
112 | if (str1[i] != str2[i]) | ||
113 | return 1; | ||
114 | return 0; | ||
115 | } | ||
116 | |||
117 | static void *locking_thread(void *arg) | ||
118 | { | ||
119 | unsigned long cpu = (unsigned long) arg; | ||
120 | struct random_data rand; | ||
121 | unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */ | ||
122 | int32_t rand_nr; | ||
123 | unsigned long long count; | ||
124 | char randstate[64]; | ||
125 | unsigned int seed; | ||
126 | time_t start; | ||
127 | |||
128 | if (bounces & BOUNCE_RANDOM) { | ||
129 | seed = (unsigned int) time(NULL) - bounces; | ||
130 | if (!(bounces & BOUNCE_RACINGFAULTS)) | ||
131 | seed += cpu; | ||
132 | bzero(&rand, sizeof(rand)); | ||
133 | bzero(&randstate, sizeof(randstate)); | ||
134 | if (initstate_r(seed, randstate, sizeof(randstate), &rand)) | ||
135 | fprintf(stderr, "srandom_r error\n"), exit(1); | ||
136 | } else { | ||
137 | page_nr = -bounces; | ||
138 | if (!(bounces & BOUNCE_RACINGFAULTS)) | ||
139 | page_nr += cpu * nr_pages_per_cpu; | ||
140 | } | ||
141 | |||
142 | while (!finished) { | ||
143 | if (bounces & BOUNCE_RANDOM) { | ||
144 | if (random_r(&rand, &rand_nr)) | ||
145 | fprintf(stderr, "random_r 1 error\n"), exit(1); | ||
146 | page_nr = rand_nr; | ||
147 | if (sizeof(page_nr) > sizeof(rand_nr)) { | ||
148 | if (random_r(&rand, &rand_nr)) | ||
149 | fprintf(stderr, "random_r 2 error\n"), exit(1); | ||
150 | page_nr |= ((unsigned long) rand_nr) << 32; | ||
151 | } | ||
152 | } else | ||
153 | page_nr += 1; | ||
154 | page_nr %= nr_pages; | ||
155 | |||
156 | start = time(NULL); | ||
157 | if (bounces & BOUNCE_VERIFY) { | ||
158 | count = *area_count(area_dst, page_nr); | ||
159 | if (!count) | ||
160 | fprintf(stderr, | ||
161 | "page_nr %lu wrong count %Lu %Lu\n", | ||
162 | page_nr, count, | ||
163 | count_verify[page_nr]), exit(1); | ||
164 | |||
165 | |||
166 | /* | ||
167 | * We can't use bcmp (or memcmp) because that | ||
168 | * returns 0 erroneously if the memory is | ||
169 | * changing under it (even if the end of the | ||
170 | * page never changes and always | ||
171 | * differs). | ||
172 | */ | ||
173 | #if 1 | ||
174 | if (!my_bcmp(area_dst + page_nr * page_size, zeropage, | ||
175 | page_size)) | ||
176 | fprintf(stderr, | ||
177 | "my_bcmp page_nr %lu wrong count %Lu %Lu\n", | ||
178 | page_nr, count, | ||
179 | count_verify[page_nr]), exit(1); | ||
180 | #else | ||
181 | unsigned long loops; | ||
182 | |||
183 | loops = 0; | ||
184 | /* uncomment the below line to test with mutex */ | ||
185 | /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */ | ||
186 | while (!bcmp(area_dst + page_nr * page_size, zeropage, | ||
187 | page_size)) { | ||
188 | loops += 1; | ||
189 | if (loops > 10) | ||
190 | break; | ||
191 | } | ||
192 | /* uncomment below line to test with mutex */ | ||
193 | /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */ | ||
194 | if (loops) { | ||
195 | fprintf(stderr, | ||
196 | "page_nr %lu all zero thread %lu %p %lu\n", | ||
197 | page_nr, cpu, area_dst + page_nr * page_size, | ||
198 | loops); | ||
199 | if (loops > 10) | ||
200 | exit(1); | ||
201 | } | ||
202 | #endif | ||
203 | } | ||
204 | |||
205 | pthread_mutex_lock(area_mutex(area_dst, page_nr)); | ||
206 | count = *area_count(area_dst, page_nr); | ||
207 | if (count != count_verify[page_nr]) { | ||
208 | fprintf(stderr, | ||
209 | "page_nr %lu memory corruption %Lu %Lu\n", | ||
210 | page_nr, count, | ||
211 | count_verify[page_nr]), exit(1); | ||
212 | } | ||
213 | count++; | ||
214 | *area_count(area_dst, page_nr) = count_verify[page_nr] = count; | ||
215 | pthread_mutex_unlock(area_mutex(area_dst, page_nr)); | ||
216 | |||
217 | if (time(NULL) - start > 1) | ||
218 | fprintf(stderr, | ||
219 | "userfault too slow %ld " | ||
220 | "possible false positive with overcommit\n", | ||
221 | time(NULL) - start); | ||
222 | } | ||
223 | |||
224 | return NULL; | ||
225 | } | ||
226 | |||
227 | static int copy_page(unsigned long offset) | ||
228 | { | ||
229 | struct uffdio_copy uffdio_copy; | ||
230 | |||
231 | if (offset >= nr_pages * page_size) | ||
232 | fprintf(stderr, "unexpected offset %lu\n", | ||
233 | offset), exit(1); | ||
234 | uffdio_copy.dst = (unsigned long) area_dst + offset; | ||
235 | uffdio_copy.src = (unsigned long) area_src + offset; | ||
236 | uffdio_copy.len = page_size; | ||
237 | uffdio_copy.mode = 0; | ||
238 | uffdio_copy.copy = 0; | ||
239 | if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) { | ||
240 | /* real retval in ufdio_copy.copy */ | ||
241 | if (uffdio_copy.copy != -EEXIST) | ||
242 | fprintf(stderr, "UFFDIO_COPY error %Ld\n", | ||
243 | uffdio_copy.copy), exit(1); | ||
244 | } else if (uffdio_copy.copy != page_size) { | ||
245 | fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", | ||
246 | uffdio_copy.copy), exit(1); | ||
247 | } else | ||
248 | return 1; | ||
249 | return 0; | ||
250 | } | ||
251 | |||
252 | static void *uffd_poll_thread(void *arg) | ||
253 | { | ||
254 | unsigned long cpu = (unsigned long) arg; | ||
255 | struct pollfd pollfd[2]; | ||
256 | struct uffd_msg msg; | ||
257 | int ret; | ||
258 | unsigned long offset; | ||
259 | char tmp_chr; | ||
260 | unsigned long userfaults = 0; | ||
261 | |||
262 | pollfd[0].fd = uffd; | ||
263 | pollfd[0].events = POLLIN; | ||
264 | pollfd[1].fd = pipefd[cpu*2]; | ||
265 | pollfd[1].events = POLLIN; | ||
266 | |||
267 | for (;;) { | ||
268 | ret = poll(pollfd, 2, -1); | ||
269 | if (!ret) | ||
270 | fprintf(stderr, "poll error %d\n", ret), exit(1); | ||
271 | if (ret < 0) | ||
272 | perror("poll"), exit(1); | ||
273 | if (pollfd[1].revents & POLLIN) { | ||
274 | if (read(pollfd[1].fd, &tmp_chr, 1) != 1) | ||
275 | fprintf(stderr, "read pipefd error\n"), | ||
276 | exit(1); | ||
277 | break; | ||
278 | } | ||
279 | if (!(pollfd[0].revents & POLLIN)) | ||
280 | fprintf(stderr, "pollfd[0].revents %d\n", | ||
281 | pollfd[0].revents), exit(1); | ||
282 | ret = read(uffd, &msg, sizeof(msg)); | ||
283 | if (ret < 0) { | ||
284 | if (errno == EAGAIN) | ||
285 | continue; | ||
286 | perror("nonblocking read error"), exit(1); | ||
287 | } | ||
288 | if (msg.event != UFFD_EVENT_PAGEFAULT) | ||
289 | fprintf(stderr, "unexpected msg event %u\n", | ||
290 | msg.event), exit(1); | ||
291 | if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) | ||
292 | fprintf(stderr, "unexpected write fault\n"), exit(1); | ||
293 | offset = (char *)msg.arg.pagefault.address - area_dst; | ||
294 | offset &= ~(page_size-1); | ||
295 | if (copy_page(offset)) | ||
296 | userfaults++; | ||
297 | } | ||
298 | return (void *)userfaults; | ||
299 | } | ||
300 | |||
301 | pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; | ||
302 | |||
303 | static void *uffd_read_thread(void *arg) | ||
304 | { | ||
305 | unsigned long *this_cpu_userfaults; | ||
306 | struct uffd_msg msg; | ||
307 | unsigned long offset; | ||
308 | int ret; | ||
309 | |||
310 | this_cpu_userfaults = (unsigned long *) arg; | ||
311 | *this_cpu_userfaults = 0; | ||
312 | |||
313 | pthread_mutex_unlock(&uffd_read_mutex); | ||
314 | /* from here on, cancellation is ok */ | ||
315 | |||
316 | for (;;) { | ||
317 | ret = read(uffd, &msg, sizeof(msg)); | ||
318 | if (ret != sizeof(msg)) { | ||
319 | if (ret < 0) | ||
320 | perror("blocking read error"), exit(1); | ||
321 | else | ||
322 | fprintf(stderr, "short read\n"), exit(1); | ||
323 | } | ||
324 | if (msg.event != UFFD_EVENT_PAGEFAULT) | ||
325 | fprintf(stderr, "unexpected msg event %u\n", | ||
326 | msg.event), exit(1); | ||
327 | if (bounces & BOUNCE_VERIFY && | ||
328 | msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) | ||
329 | fprintf(stderr, "unexpected write fault\n"), exit(1); | ||
330 | offset = (char *)msg.arg.pagefault.address - area_dst; | ||
331 | offset &= ~(page_size-1); | ||
332 | if (copy_page(offset)) | ||
333 | (*this_cpu_userfaults)++; | ||
334 | } | ||
335 | return (void *)NULL; | ||
336 | } | ||
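
stress() below stops these readers with pthread_cancel(); read(2) is a cancellation point, and the thread deliberately drops the handshake mutex before entering the loop so it can never be cancelled while holding it. To make that window explicit, the setup could be bracketed as follows (a hypothetical variant, not what the test does):

	int oldstate;

	/* hypothetical: defer cancellation until the handshake is done */
	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
	this_cpu_userfaults = (unsigned long *) arg;
	*this_cpu_userfaults = 0;
	pthread_mutex_unlock(&uffd_read_mutex);
	pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
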
337 | |||
338 | static void *background_thread(void *arg) | ||
339 | { | ||
340 | unsigned long cpu = (unsigned long) arg; | ||
341 | unsigned long page_nr; | ||
342 | |||
343 | for (page_nr = cpu * nr_pages_per_cpu; | ||
344 | page_nr < (cpu+1) * nr_pages_per_cpu; | ||
345 | page_nr++) | ||
346 | copy_page(page_nr * page_size); | ||
347 | |||
348 | return NULL; | ||
349 | } | ||
350 | |||
351 | static int stress(unsigned long *userfaults) | ||
352 | { | ||
353 | unsigned long cpu; | ||
354 | pthread_t locking_threads[nr_cpus]; | ||
355 | pthread_t uffd_threads[nr_cpus]; | ||
356 | pthread_t background_threads[nr_cpus]; | ||
357 | void **_userfaults = (void **) userfaults; | ||
358 | |||
359 | finished = 0; | ||
360 | for (cpu = 0; cpu < nr_cpus; cpu++) { | ||
361 | if (pthread_create(&locking_threads[cpu], &attr, | ||
362 | locking_thread, (void *)cpu)) | ||
363 | return 1; | ||
364 | if (bounces & BOUNCE_POLL) { | ||
365 | if (pthread_create(&uffd_threads[cpu], &attr, | ||
366 | uffd_poll_thread, (void *)cpu)) | ||
367 | return 1; | ||
368 | } else { | ||
369 | if (pthread_create(&uffd_threads[cpu], &attr, | ||
370 | uffd_read_thread, | ||
371 | &_userfaults[cpu])) | ||
372 | return 1; | ||
373 | pthread_mutex_lock(&uffd_read_mutex); | ||
374 | } | ||
375 | if (pthread_create(&background_threads[cpu], &attr, | ||
376 | background_thread, (void *)cpu)) | ||
377 | return 1; | ||
378 | } | ||
379 | for (cpu = 0; cpu < nr_cpus; cpu++) | ||
380 | if (pthread_join(background_threads[cpu], NULL)) | ||
381 | return 1; | ||
382 | |||
383 | /* | ||
384 |  * Be strict and immediately zap area_src: the whole area has | ||
385 |  * been transferred already by the background threads. The | ||
386 |  * area_src could then be faulted in, in a racy way, by the | ||
387 |  * still-running uffd_threads reading zeropages after we zapped | ||
388 | * area_src (but they're guaranteed to get -EEXIST from | ||
389 | * UFFDIO_COPY without writing zero pages into area_dst | ||
390 | * because the background threads already completed). | ||
391 | */ | ||
392 | if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) { | ||
393 | perror("madvise"); | ||
394 | return 1; | ||
395 | } | ||
396 | |||
397 | for (cpu = 0; cpu < nr_cpus; cpu++) { | ||
398 | char c; | ||
399 | if (bounces & BOUNCE_POLL) { | ||
400 | if (write(pipefd[cpu*2+1], &c, 1) != 1) { | ||
401 | fprintf(stderr, "pipefd write error\n"); | ||
402 | return 1; | ||
403 | } | ||
404 | if (pthread_join(uffd_threads[cpu], &_userfaults[cpu])) | ||
405 | return 1; | ||
406 | } else { | ||
407 | if (pthread_cancel(uffd_threads[cpu])) | ||
408 | return 1; | ||
409 | if (pthread_join(uffd_threads[cpu], NULL)) | ||
410 | return 1; | ||
411 | } | ||
412 | } | ||
413 | |||
414 | finished = 1; | ||
415 | for (cpu = 0; cpu < nr_cpus; cpu++) | ||
416 | if (pthread_join(locking_threads[cpu], NULL)) | ||
417 | return 1; | ||
418 | |||
419 | return 0; | ||
420 | } | ||
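
Note the two routes the per-cpu fault counts take back to the caller: in poll mode pthread_join() stores the thread's (void *) return value straight into _userfaults[cpu], while in read mode the count is written through the pointer handed to uffd_read_thread at creation. The (void **) cast of the userfaults array relies on sizeof(unsigned long) == sizeof(void *), which holds on Linux. Summing the counts, should an aggregate be wanted, is then trivial (hypothetical helper):

	static unsigned long total_userfaults(const unsigned long *userfaults)
	{
		unsigned long cpu, total = 0;

		for (cpu = 0; cpu < nr_cpus; cpu++)
			total += userfaults[cpu];
		return total;
	}
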
421 | |||
422 | static int userfaultfd_stress(void) | ||
423 | { | ||
424 | void *area; | ||
425 | char *tmp_area; | ||
426 | unsigned long nr; | ||
427 | struct uffdio_register uffdio_register; | ||
428 | struct uffdio_api uffdio_api; | ||
429 | unsigned long cpu; | ||
430 | int uffd_flags; | ||
431 | unsigned long userfaults[nr_cpus]; | ||
432 | |||
433 | if (posix_memalign(&area, page_size, nr_pages * page_size)) { | ||
434 | fprintf(stderr, "out of memory\n"); | ||
435 | return 1; | ||
436 | } | ||
437 | area_src = area; | ||
438 | if (posix_memalign(&area, page_size, nr_pages * page_size)) { | ||
439 | fprintf(stderr, "out of memory\n"); | ||
440 | return 1; | ||
441 | } | ||
442 | area_dst = area; | ||
443 | |||
444 | uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); | ||
445 | if (uffd < 0) { | ||
446 | fprintf(stderr, | ||
447 | "userfaultfd syscall not available in this kernel\n"); | ||
448 | return 1; | ||
449 | } | ||
450 | uffd_flags = fcntl(uffd, F_GETFL, NULL); | ||
451 | |||
452 | uffdio_api.api = UFFD_API; | ||
453 | uffdio_api.features = 0; | ||
454 | if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { | ||
455 | fprintf(stderr, "UFFDIO_API\n"); | ||
456 | return 1; | ||
457 | } | ||
458 | if (uffdio_api.api != UFFD_API) { | ||
459 | fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api); | ||
460 | return 1; | ||
461 | } | ||
462 | |||
463 | count_verify = malloc(nr_pages * sizeof(unsigned long long)); | ||
464 | if (!count_verify) { | ||
465 | perror("count_verify"); | ||
466 | return 1; | ||
467 | } | ||
468 | |||
469 | for (nr = 0; nr < nr_pages; nr++) { | ||
470 | *area_mutex(area_src, nr) = (pthread_mutex_t) | ||
471 | PTHREAD_MUTEX_INITIALIZER; | ||
472 | count_verify[nr] = *area_count(area_src, nr) = 1; | ||
473 | } | ||
474 | |||
475 | pipefd = malloc(sizeof(int) * nr_cpus * 2); | ||
476 | if (!pipefd) { | ||
477 | perror("pipefd"); | ||
478 | return 1; | ||
479 | } | ||
480 | for (cpu = 0; cpu < nr_cpus; cpu++) { | ||
481 | if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) { | ||
482 | perror("pipe"); | ||
483 | return 1; | ||
484 | } | ||
485 | } | ||
486 | |||
487 | if (posix_memalign(&area, page_size, page_size)) { | ||
488 | fprintf(stderr, "out of memory\n"); | ||
489 | return 1; | ||
490 | } | ||
491 | zeropage = area; | ||
492 | bzero(zeropage, page_size); | ||
493 | |||
494 | pthread_mutex_lock(&uffd_read_mutex); | ||
495 | |||
496 | pthread_attr_init(&attr); | ||
497 | pthread_attr_setstacksize(&attr, 16*1024*1024); | ||
498 | |||
499 | while (bounces--) { | ||
500 | unsigned long expected_ioctls; | ||
501 | |||
502 | printf("bounces: %d, mode:", bounces); | ||
503 | if (bounces & BOUNCE_RANDOM) | ||
504 | printf(" rnd"); | ||
505 | if (bounces & BOUNCE_RACINGFAULTS) | ||
506 | printf(" racing"); | ||
507 | if (bounces & BOUNCE_VERIFY) | ||
508 | printf(" ver"); | ||
509 | if (bounces & BOUNCE_POLL) | ||
510 | printf(" poll"); | ||
511 | printf(", "); | ||
512 | fflush(stdout); | ||
513 | |||
514 | if (bounces & BOUNCE_POLL) | ||
515 | fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); | ||
516 | else | ||
517 | fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); | ||
518 | |||
519 | /* register */ | ||
520 | uffdio_register.range.start = (unsigned long) area_dst; | ||
521 | uffdio_register.range.len = nr_pages * page_size; | ||
522 | uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; | ||
523 | if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { | ||
524 | fprintf(stderr, "register failure\n"); | ||
525 | return 1; | ||
526 | } | ||
527 | expected_ioctls = (1 << _UFFDIO_WAKE) | | ||
528 | (1 << _UFFDIO_COPY) | | ||
529 | (1 << _UFFDIO_ZEROPAGE); | ||
530 | if ((uffdio_register.ioctls & expected_ioctls) != | ||
531 | expected_ioctls) { | ||
532 | fprintf(stderr, | ||
533 | "unexpected missing ioctl for anon memory\n"); | ||
534 | return 1; | ||
535 | } | ||
536 | |||
537 | /* | ||
538 | * The madvise done previously isn't enough: some | ||
539 | * uffd_thread could have read userfaults (one of | ||
540 | * those already resolved by the background thread) | ||
541 | * and it may be in the process of calling | ||
542 | * UFFDIO_COPY. UFFDIO_COPY will read the zapped | ||
543 | * area_src and it would map a zero page in it (of | ||
544 | * course such a UFFDIO_COPY is perfectly safe as it'd | ||
545 | * return -EEXIST). The problem comes at the next | ||
546 | * bounce though: that racing UFFDIO_COPY would | ||
547 |  * generate zeropages in the area_src, thus invalidating | ||
548 |  * the previous MADV_DONTNEED. Without this additional | ||
549 |  * MADV_DONTNEED, those zeropage leftovers in the | ||
550 | * area_src would lead to -EEXIST failure during the | ||
551 | * next bounce, effectively leaving a zeropage in the | ||
552 | * area_dst. | ||
553 | * | ||
554 |  * Try commenting out this madvise to see the memory | ||
555 |  * corruption being caught pretty quickly. | ||
556 |  * | ||
557 |  * khugepaged is also only inhibited from collapsing THP | ||
558 |  * after the UFFDIO_REGISTER, which is another reason the | ||
559 |  * MADV_DONTNEED must be repeated here. | ||
560 | */ | ||
561 | if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) { | ||
562 | perror("madvise 2"); | ||
563 | return 1; | ||
564 | } | ||
565 | |||
566 | /* bounce pass */ | ||
567 | if (stress(userfaults)) | ||
568 | return 1; | ||
569 | |||
570 | /* unregister */ | ||
571 | if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { | ||
572 | fprintf(stderr, "unregister failure\n"); | ||
573 | return 1; | ||
574 | } | ||
575 | |||
576 | /* verification */ | ||
577 | if (bounces & BOUNCE_VERIFY) { | ||
578 | for (nr = 0; nr < nr_pages; nr++) { | ||
579 | if (my_bcmp(area_dst, | ||
580 | area_dst + nr * page_size, | ||
581 | sizeof(pthread_mutex_t))) { | ||
582 | fprintf(stderr, | ||
583 | "error mutex 2 %lu\n", | ||
584 | nr); | ||
585 | bounces = 0; | ||
586 | } | ||
587 | if (*area_count(area_dst, nr) != count_verify[nr]) { | ||
588 | fprintf(stderr, | ||
589 | "error area_count %Lu %Lu %lu\n", | ||
590 | *area_count(area_dst, nr), | ||
591 | count_verify[nr], | ||
592 | nr); | ||
593 | bounces = 0; | ||
594 | } | ||
595 | } | ||
596 | } | ||
597 | |||
598 | /* prepare next bounce */ | ||
599 | tmp_area = area_src; | ||
600 | area_src = area_dst; | ||
601 | area_dst = tmp_area; | ||
602 | |||
603 | printf("userfaults:"); | ||
604 | for (cpu = 0; cpu < nr_cpus; cpu++) | ||
605 | printf(" %lu", userfaults[cpu]); | ||
606 | printf("\n"); | ||
607 | } | ||
608 | |||
609 | return 0; | ||
610 | } | ||
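
The test exits without explicit teardown, which is fine for a selftest: process exit releases everything, and closing a userfaultfd drops any registrations still attached to it. A longer-lived user would clean up explicitly; a sketch assuming the same globals (hypothetical helper, not in the test):

	static void uffd_teardown(void)
	{
		/* closing the userfaultfd also drops remaining registrations */
		close(uffd);
		free(pipefd);
		free(count_verify);
		/* the areas came from posix_memalign(), so plain free() applies */
		free(area_src);
		free(area_dst);
		free(zeropage);
	}
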
611 | |||
612 | int main(int argc, char **argv) | ||
613 | { | ||
614 | if (argc < 3) | ||
615 | fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); | ||
616 | nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); | ||
617 | page_size = sysconf(_SC_PAGE_SIZE); | ||
618 | if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) > | ||
619 | page_size) | ||
620 | fprintf(stderr, "Impossible to run this test\n"), exit(2); | ||
621 | nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / | ||
622 | nr_cpus; | ||
623 | if (!nr_pages_per_cpu) { | ||
624 | fprintf(stderr, "invalid MiB\n"); | ||
625 | fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); | ||
626 | } | ||
627 | bounces = atoi(argv[2]); | ||
628 | if (bounces <= 0) { | ||
629 | fprintf(stderr, "invalid bounces\n"); | ||
630 | fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); | ||
631 | } | ||
632 | nr_pages = nr_pages_per_cpu * nr_cpus; | ||
633 | printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", | ||
634 | nr_pages, nr_pages_per_cpu); | ||
635 | return userfaultfd_stress(); | ||
636 | } | ||
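
main() takes the total test-area size in MiB and the number of bounce passes, and splits the area evenly across the online cpus. A plausible standalone build and run, assuming a kernel exposing the userfaultfd syscall and the linux/userfaultfd.h uapi header (compiler flags are an assumption):

	$ gcc -O2 -pthread -o userfaultfd userfaultfd.c
	$ ./userfaultfd 128 32	# 128 MiB test area, 32 bounces
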