author     Linus Torvalds <torvalds@linux-foundation.org>  2016-03-25 19:59:11 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-03-25 19:59:11 -0400
commit     606c61a0579669c292dc5f5e1cf898edecfc0d53 (patch)
tree       569aa7e9b99571890bfccd7278bbc303cfa0a919
parent     15dbc136dff62ebefb03353cfb7d308d49b275f3 (diff)
parent     0fda2788b03c1868e2f20b3b7995b8cc2adf4715 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge fourth patch-bomb from Andrew Morton:
"A lot more stuff than expected, sorry. A bunch of ocfs2 reviewing was
finished off.
- mhocko's oom-reaper out-of-memory-handler changes
- ocfs2 fixes and features
- KASAN feature work
- various fixes"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (42 commits)
thp: fix typo in khugepaged_scan_pmd()
MAINTAINERS: fill entries for KASAN
mm/filemap: generic_file_read_iter(): check for zero reads unconditionally
kasan: test fix: warn if the UAF could not be detected in kmalloc_uaf2
mm, kasan: stackdepot implementation. Enable stackdepot for SLAB
arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections
mm, kasan: add GFP flags to KASAN API
mm, kasan: SLAB support
kasan: modify kmalloc_large_oob_right(), add kmalloc_pagealloc_oob_right()
include/linux/oom.h: remove undefined oom_kills_count()/note_oom_kill()
mm/page_alloc: prevent merging between isolated and other pageblocks
drivers/memstick/host/r592.c: avoid gcc-6 warning
ocfs2: extend enough credits for freeing one truncate record while replaying truncate records
ocfs2: extend transaction for ocfs2_remove_rightmost_path() and ocfs2_update_edge_lengths() before to avoid inconsistency between inode and et
ocfs2/dlm: move lock to the tail of grant queue while doing in-place convert
ocfs2: solve a problem of crossing the boundary in updating backups
ocfs2: fix occurring deadlock by changing ocfs2_wq from global to local
ocfs2/dlm: fix BUG in dlm_move_lockres_to_recovery_list
ocfs2/dlm: fix race between convert and recovery
ocfs2: fix a deadlock issue in ocfs2_dio_end_io_write()
...
79 files changed, 1770 insertions, 962 deletions
diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt
index aa1e0c91e368..7dd95b35cd7c 100644
--- a/Documentation/kasan.txt
+++ b/Documentation/kasan.txt
@@ -12,8 +12,7 @@ KASAN uses compile-time instrumentation for checking every memory access,
 therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
 required for detection of out-of-bounds accesses to stack or global variables.
 
-Currently KASAN is supported only for x86_64 architecture and requires the
-kernel to be built with the SLUB allocator.
+Currently KASAN is supported only for x86_64 architecture.
 
 1. Usage
 ========
@@ -27,7 +26,7 @@ inline are compiler instrumentation types. The former produces smaller binary
 the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC
 version 5.0 or later.
 
-Currently KASAN works only with the SLUB memory allocator.
+KASAN works with both SLUB and SLAB memory allocators.
 For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
 
 To disable instrumentation for specific files or directories, add a line
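
The documentation change above reflects the SLAB support added in this series. For reference, the kind of bug KASAN reports is what lib/test_kasan.c exercises; below is a minimal sketch of a use-after-free it would flag (the function name is illustrative, not from the patch set):

    #include <linux/slab.h>

    /* Hypothetical sketch: with CONFIG_KASAN=y the read below is reported
     * as a use-after-free, together with the allocation and free stacks
     * (for SLAB these are now recorded through the new stackdepot). */
    static noinline void kasan_uaf_sketch(void)
    {
    	char *ptr = kmalloc(128, GFP_KERNEL);

    	if (!ptr)
    		return;
    	kfree(ptr);
    	((volatile char *)ptr)[0];	/* KASAN: use-after-free read */
    }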
diff --git a/MAINTAINERS b/MAINTAINERS
index f07a174bbc81..df8cf6b924c6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6165,6 +6165,20 @@ S: Maintained
 F:	Documentation/hwmon/k8temp
 F:	drivers/hwmon/k8temp.c
 
+KASAN
+M:	Andrey Ryabinin <aryabinin@virtuozzo.com>
+R:	Alexander Potapenko <glider@google.com>
+R:	Dmitry Vyukov <dvyukov@google.com>
+L:	kasan-dev@googlegroups.com
+S:	Maintained
+F:	arch/*/include/asm/kasan.h
+F:	arch/*/mm/kasan_init*
+F:	Documentation/kasan.txt
+F:	include/linux/kasan.h
+F:	lib/test_kasan.c
+F:	mm/kasan/
+F:	scripts/Makefile.kasan
+
 KCONFIG
 M:	"Yann E. MORIN" <yann.morin.1998@free.fr>
 L:	linux-kbuild@vger.kernel.org
diff --git a/arch/arm/include/asm/exception.h b/arch/arm/include/asm/exception.h
index 5abaf5bbd985..bf1991263d2d 100644
--- a/arch/arm/include/asm/exception.h
+++ b/arch/arm/include/asm/exception.h
@@ -7,7 +7,7 @@
 #ifndef __ASM_ARM_EXCEPTION_H
 #define __ASM_ARM_EXCEPTION_H
 
-#include <linux/ftrace.h>
+#include <linux/interrupt.h>
 
 #define __exception __attribute__((section(".exception.text")))
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 1fab979daeaf..e2c6da096cef 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -108,6 +108,7 @@ SECTIONS
 	*(.exception.text)
 	__exception_text_end = .;
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	TEXT_TEXT
 	SCHED_TEXT
 	LOCK_TEXT
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 6cb7e1a6bc02..0c2eec490abf 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -18,7 +18,7 @@
 #ifndef __ASM_EXCEPTION_H
 #define __ASM_EXCEPTION_H
 
-#include <linux/ftrace.h>
+#include <linux/interrupt.h>
 
 #define __exception __attribute__((section(".exception.text")))
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 37f624df68fa..5a1939a74ff3 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -103,6 +103,7 @@ SECTIONS
 	*(.exception.text)
 	__exception_text_end = .;
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	TEXT_TEXT
 	SCHED_TEXT
 	LOCK_TEXT
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index c9eec84aa258..d920b959ff3a 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -35,6 +35,7 @@ SECTIONS
 #endif
 	LOCK_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	KPROBES_TEXT
 #ifdef CONFIG_ROMKERNEL
 	__sinittext = .;
diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S
index 5a6e141d1641..50bc10f97bcb 100644
--- a/arch/c6x/kernel/vmlinux.lds.S
+++ b/arch/c6x/kernel/vmlinux.lds.S
@@ -72,6 +72,7 @@ SECTIONS
 	SCHED_TEXT
 	LOCK_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	KPROBES_TEXT
 	*(.fixup)
 	*(.gnu.warning)
diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S
index e12055e88bfe..150ace92c7ad 100644
--- a/arch/metag/kernel/vmlinux.lds.S
+++ b/arch/metag/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.text.*)
 	*(.gnu.warning)
 	}
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index be9488d69734..0a47f0410554 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -36,6 +36,7 @@ SECTIONS {
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	. = ALIGN (4) ;
 	_etext = . ;
 	}
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 0a93e83cd014..54d653ee17e1 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -58,6 +58,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.text.*)
 	*(.fixup)
 	*(.gnu.warning)
diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S
index 326fab40a9de..e23e89539967 100644
--- a/arch/nios2/kernel/vmlinux.lds.S
+++ b/arch/nios2/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
 	SCHED_TEXT
 	LOCK_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	KPROBES_TEXT
 	} =0
 	_etext = .;
diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S
index 2d69a853b742..d936de4c07ca 100644
--- a/arch/openrisc/kernel/vmlinux.lds.S
+++ b/arch/openrisc/kernel/vmlinux.lds.S
@@ -50,6 +50,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.fixup)
 	*(.text.__*)
 	_etext = .;
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 308f29081d46..f3ead0b6ce46 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -72,6 +72,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.text.do_softirq)
 	*(.text.sys_exit)
 	*(.text.do_sigaltstack)
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index d41fd0af8980..2dd91f79de05 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -55,6 +55,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 
 #ifdef CONFIG_PPC32
 	*(.got1)
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 445657fe658c..0f41a8286378 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -28,6 +28,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} :text = 0x0700
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index db88cbf9eafd..235a4101999f 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	_etext = .;		/* End of text section */
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index f1a2f688b28a..aadd321aa05d 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -48,6 +48,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.gnu.warning)
 	} = 0
 	_etext = .;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index 0e059a0101ea..378f5d8d1ec8 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -45,6 +45,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	__fix_text_end = .;	/* tile-cpack won't rearrange before this */
 	ALIGN_FUNCTION();
 	*(.hottext*)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index adaae2c781c1..616ebd22ef9a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,6 +19,7 @@ endif
 KASAN_SANITIZE_head$(BITS).o := n
 KASAN_SANITIZE_dumpstack.o := n
 KASAN_SANITIZE_dumpstack_$(BITS).o := n
+KASAN_SANITIZE_stacktrace.o := n
 
 OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
 OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d239639e0c1d..4c941f88d405 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -101,6 +101,7 @@ SECTIONS
 	KPROBES_TEXT
 	ENTRY_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	/* End of text section */
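
The repeated SOFTIRQENTRY_TEXT additions above give softirq entry code its own linker section, parallel to IRQENTRY_TEXT, so that stack walkers (ftrace, KASAN reports) can classify a frame as a soft-IRQ boundary by its address range. A rough sketch of the annotation side, assuming the __softirq_entry marker this series introduces in <linux/interrupt.h> (the function below is hypothetical; in the kernel only __do_softirq() is tagged this way):

    #include <linux/interrupt.h>

    /* Functions marked __softirq_entry are emitted into .softirqentry.text,
     * which the linker scripts above now collect into kernel text. */
    static void __softirq_entry example_softirq_entry(void)
    {
    	/* softirq dispatch would run here; an unwinder can tell from the
    	 * PC's section that it crossed a softirq entry point. */
    }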
diff --git a/drivers/input/input-compat.c b/drivers/input/input-compat.c
index 64ca7113ff28..d84d20b9cec0 100644
--- a/drivers/input/input-compat.c
+++ b/drivers/input/input-compat.c
@@ -17,7 +17,7 @@
 int input_event_from_user(const char __user *buffer,
 			  struct input_event *event)
 {
-	if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) {
+	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 		struct input_event_compat compat_event;
 
 		if (copy_from_user(&compat_event, buffer,
@@ -41,7 +41,7 @@ int input_event_from_user(const char __user *buffer,
 int input_event_to_user(char __user *buffer,
 			const struct input_event *event)
 {
-	if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) {
+	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 		struct input_event_compat compat_event;
 
 		compat_event.time.tv_sec = event->time.tv_sec;
@@ -65,7 +65,7 @@ int input_event_to_user(char __user *buffer,
 int input_ff_effect_from_user(const char __user *buffer, size_t size,
 			      struct ff_effect *effect)
 {
-	if (INPUT_COMPAT_TEST) {
+	if (in_compat_syscall()) {
 		struct ff_effect_compat *compat_effect;
 
 		if (size != sizeof(struct ff_effect_compat))
diff --git a/drivers/input/input-compat.h b/drivers/input/input-compat.h
index 0f25878d5fa2..1563160a7af3 100644
--- a/drivers/input/input-compat.h
+++ b/drivers/input/input-compat.h
@@ -17,8 +17,6 @@
 
 #ifdef CONFIG_COMPAT
 
-#define INPUT_COMPAT_TEST in_compat_syscall()
-
 struct input_event_compat {
 	struct compat_timeval time;
 	__u16 type;
@@ -57,7 +55,7 @@ struct ff_effect_compat {
 
 static inline size_t input_event_size(void)
 {
-	return (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) ?
+	return (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) ?
 		sizeof(struct input_event_compat) : sizeof(struct input_event);
 }
 
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 880605959aa6..b87ffbd4547d 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1015,7 +1015,7 @@ static int input_bits_to_string(char *buf, int buf_size,
 {
 	int len = 0;
 
-	if (INPUT_COMPAT_TEST) {
+	if (in_compat_syscall()) {
 		u32 dword = bits >> 32;
 		if (dword || !skip_empty)
 			len += snprintf(buf, buf_size, "%x ", dword);
diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index 4eb9e4d94f46..abe1a927b332 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -664,7 +664,7 @@ struct uinput_ff_upload_compat {
 static int uinput_ff_upload_to_user(char __user *buffer,
 				    const struct uinput_ff_upload *ff_up)
 {
-	if (INPUT_COMPAT_TEST) {
+	if (in_compat_syscall()) {
 		struct uinput_ff_upload_compat ff_up_compat;
 
 		ff_up_compat.request_id = ff_up->request_id;
@@ -695,7 +695,7 @@ static int uinput_ff_upload_to_user(char __user *buffer,
 static int uinput_ff_upload_from_user(const char __user *buffer,
 				      struct uinput_ff_upload *ff_up)
 {
-	if (INPUT_COMPAT_TEST) {
+	if (in_compat_syscall()) {
 		struct uinput_ff_upload_compat ff_up_compat;
 
 		if (copy_from_user(&ff_up_compat, buffer,
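
The input conversions above drop the INPUT_COMPAT_TEST wrapper and call in_compat_syscall() directly, which tests how the current system call entered the kernel rather than the calling task's personality. A self-contained sketch of the pattern (the foo structures are hypothetical, not from these drivers):

    #include <linux/compat.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    struct foo { __u64 value; };
    struct foo_compat { __u32 value; };

    /* Copy in a structure whose layout differs for 32-bit callers. */
    static int foo_from_user(const void __user *buf, struct foo *out)
    {
    	if (in_compat_syscall()) {
    		struct foo_compat c;

    		if (copy_from_user(&c, buf, sizeof(c)))
    			return -EFAULT;
    		out->value = c.value;
    	} else if (copy_from_user(out, buf, sizeof(*out))) {
    		return -EFAULT;
    	}
    	return 0;
    }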
diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c
index ef09ba0289d7..d5cfb503b9d6 100644
--- a/drivers/memstick/host/r592.c
+++ b/drivers/memstick/host/r592.c
@@ -298,8 +298,7 @@ static int r592_transfer_fifo_dma(struct r592_device *dev)
 	sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ?
 			      PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE);
 
-	if (sg_count != 1 ||
-	    (sg_dma_len(&dev->req->sg) < dev->req->sg.length)) {
+	if (sg_count != 1 || sg_dma_len(&dev->req->sg) < R592_LFIFO_SIZE) {
 		message("problem in dma_map_sg");
 		return -EIO;
 	}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d002579c6f2b..70907d638b60 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
 	struct ocfs2_extent_block *eb;
 	u32 range;
 
-	/*
-	 * In normal tree rotation process, we will never touch the
-	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
-	 * doesn't reserve the credits for them either.
-	 *
-	 * But we do have a special case here which will update the rightmost
-	 * records for all the bh in the path.
-	 * So we have to allocate extra credits and access them.
-	 */
-	ret = ocfs2_extend_trans(handle, subtree_index);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
 	if (ret) {
 		mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
 		     right_path->p_node[subtree_root].bh->b_blocknr,
 		     right_path->p_tree_depth);
 
-		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
 						      orig_credits, left_path);
 		if (ret) {
 			mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-
 	ret = ocfs2_et_sanity_check(et);
 	if (ret)
 		goto out;
-	/*
-	 * There's two ways we handle this depending on
-	 * whether path is the only existing one.
-	 */
-	ret = ocfs2_extend_rotate_transaction(handle, 0,
-					      handle->h_buffer_credits,
-					      path);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
 
 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
 	if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
 	 */
 	if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
 	    le16_to_cpu(el->l_next_free_rec) == 1) {
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				right_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
 		ret = ocfs2_remove_rightmost_path(handle, et,
 						  right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
 	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
 
 	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 		/*
 		 * The merge code will need to create an empty
 		 * extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
 		 */
 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
 		/* The merge left us with an empty extent, remove it. */
 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
 		if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
 			goto out;
 		}
 
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
 		/*
 		 * Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
 	}
 
 	if (ctxt->c_split_covers_rec) {
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				path);
+		if (ret) {
+			mlog_errno(ret);
+			ret = 0;
+			goto out;
+		}
+
 		/*
 		 * The merge may have left an empty extent in
 		 * our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
 	struct ocfs2_extent_block *eb;
 
 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
 		if (ret) {
 			mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 
 		ocfs2_journal_dirty(handle, tl_bh);
 
-		/* TODO: Perhaps we can calculate the bulk of the
-		 * credits up front rather than extending like
-		 * this. */
-		status = ocfs2_extend_trans(handle,
-					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-
 		rec = tl->tl_recs[i];
 		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
 						     le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 				goto bail;
 			}
 		}
+
+		status = ocfs2_extend_trans(handle,
+				OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
 		i--;
 	}
 
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 		goto out_mutex;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
 	if (cancel)
 		cancel_delayed_work(&osb->osb_truncate_log_wq);
 
-	queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+	queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
 			   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
 	}
 }
@@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
 
 	if (tl_inode) {
 		cancel_delayed_work(&osb->osb_truncate_log_wq);
-		flush_workqueue(ocfs2_wq);
+		flush_workqueue(osb->ocfs2_wq);
 
 		status = ocfs2_flush_truncate_log(osb);
 		if (status < 0)
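
Each ocfs2 hunk above applies the same rule: extend the handle's credit reservation before a step, such as ocfs2_remove_rightmost_path(), that can dirty more journal buffers than the transaction originally reserved. A minimal sketch of the underlying jbd2 pattern, with illustrative credit counts:

    #include <linux/err.h>
    #include <linux/jbd2.h>

    static int credit_pattern_sketch(journal_t *journal)
    {
    	/* reserve an up-front estimate of buffers to be dirtied */
    	handle_t *handle = jbd2_journal_start(journal, 8);

    	if (IS_ERR(handle))
    		return PTR_ERR(handle);

    	/* top up *before* work that may exceed the reservation;
    	 * nonzero means the caller must restart the transaction */
    	if (jbd2_journal_extend(handle, 16)) {
    		/* a real caller would restart the transaction here */
    	}

    	return jbd2_journal_stop(handle);
    }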
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 043110e5212d..1581240a7ca0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -499,158 +499,6 @@ bail: | |||
499 | return status; | 499 | return status; |
500 | } | 500 | } |
501 | 501 | ||
502 | /* | ||
503 | * TODO: Make this into a generic get_blocks function. | ||
504 | * | ||
505 | * From do_direct_io in direct-io.c: | ||
506 | * "So what we do is to permit the ->get_blocks function to populate | ||
507 | * bh.b_size with the size of IO which is permitted at this offset and | ||
508 | * this i_blkbits." | ||
509 | * | ||
510 | * This function is called directly from get_more_blocks in direct-io.c. | ||
511 | * | ||
512 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | ||
513 | * fs_count, map_bh, dio->rw == WRITE); | ||
514 | */ | ||
515 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | ||
516 | struct buffer_head *bh_result, int create) | ||
517 | { | ||
518 | int ret; | ||
519 | u32 cpos = 0; | ||
520 | int alloc_locked = 0; | ||
521 | u64 p_blkno, inode_blocks, contig_blocks; | ||
522 | unsigned int ext_flags; | ||
523 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; | ||
524 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
525 | unsigned long len = bh_result->b_size; | ||
526 | unsigned int clusters_to_alloc = 0, contig_clusters = 0; | ||
527 | |||
528 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); | ||
529 | |||
530 | /* This function won't even be called if the request isn't all | ||
531 | * nicely aligned and of the right size, so there's no need | ||
532 | * for us to check any of that. */ | ||
533 | |||
534 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | ||
535 | |||
536 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
537 | |||
538 | /* This figures out the size of the next contiguous block, and | ||
539 | * our logical offset */ | ||
540 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, | ||
541 | &contig_blocks, &ext_flags); | ||
542 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
543 | |||
544 | if (ret) { | ||
545 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | ||
546 | (unsigned long long)iblock); | ||
547 | ret = -EIO; | ||
548 | goto bail; | ||
549 | } | ||
550 | |||
551 | /* We should already CoW the refcounted extent in case of create. */ | ||
552 | BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); | ||
553 | |||
554 | /* allocate blocks if no p_blkno is found, and create == 1 */ | ||
555 | if (!p_blkno && create) { | ||
556 | ret = ocfs2_inode_lock(inode, NULL, 1); | ||
557 | if (ret < 0) { | ||
558 | mlog_errno(ret); | ||
559 | goto bail; | ||
560 | } | ||
561 | |||
562 | alloc_locked = 1; | ||
563 | |||
564 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
565 | |||
566 | /* fill hole, allocate blocks can't be larger than the size | ||
567 | * of the hole */ | ||
568 | clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); | ||
569 | contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb, | ||
570 | contig_blocks); | ||
571 | if (clusters_to_alloc > contig_clusters) | ||
572 | clusters_to_alloc = contig_clusters; | ||
573 | |||
574 | /* allocate extent and insert them into the extent tree */ | ||
575 | ret = ocfs2_extend_allocation(inode, cpos, | ||
576 | clusters_to_alloc, 0); | ||
577 | if (ret < 0) { | ||
578 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
579 | mlog_errno(ret); | ||
580 | goto bail; | ||
581 | } | ||
582 | |||
583 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, | ||
584 | &contig_blocks, &ext_flags); | ||
585 | if (ret < 0) { | ||
586 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
587 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | ||
588 | (unsigned long long)iblock); | ||
589 | ret = -EIO; | ||
590 | goto bail; | ||
591 | } | ||
592 | set_buffer_new(bh_result); | ||
593 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
594 | } | ||
595 | |||
596 | /* | ||
597 | * get_more_blocks() expects us to describe a hole by clearing | ||
598 | * the mapped bit on bh_result(). | ||
599 | * | ||
600 | * Consider an unwritten extent as a hole. | ||
601 | */ | ||
602 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
603 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
604 | else | ||
605 | clear_buffer_mapped(bh_result); | ||
606 | |||
607 | /* make sure we don't map more than max_blocks blocks here as | ||
608 | that's all the kernel will handle at this point. */ | ||
609 | if (max_blocks < contig_blocks) | ||
610 | contig_blocks = max_blocks; | ||
611 | bh_result->b_size = contig_blocks << blocksize_bits; | ||
612 | bail: | ||
613 | if (alloc_locked) | ||
614 | ocfs2_inode_unlock(inode, 1); | ||
615 | return ret; | ||
616 | } | ||
617 | |||
618 | /* | ||
619 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | ||
620 | * particularly interested in the aio/dio case. We use the rw_lock DLM lock | ||
621 | * to protect io on one node from truncation on another. | ||
622 | */ | ||
623 | static int ocfs2_dio_end_io(struct kiocb *iocb, | ||
624 | loff_t offset, | ||
625 | ssize_t bytes, | ||
626 | void *private) | ||
627 | { | ||
628 | struct inode *inode = file_inode(iocb->ki_filp); | ||
629 | int level; | ||
630 | |||
631 | if (bytes <= 0) | ||
632 | return 0; | ||
633 | |||
634 | /* this io's submitter should not have unlocked this before we could */ | ||
635 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | ||
636 | |||
637 | if (ocfs2_iocb_is_unaligned_aio(iocb)) { | ||
638 | ocfs2_iocb_clear_unaligned_aio(iocb); | ||
639 | |||
640 | mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); | ||
641 | } | ||
642 | |||
643 | /* Let rw unlock to be done later to protect append direct io write */ | ||
644 | if (offset + bytes <= i_size_read(inode)) { | ||
645 | ocfs2_iocb_clear_rw_locked(iocb); | ||
646 | |||
647 | level = ocfs2_iocb_rw_locked_level(iocb); | ||
648 | ocfs2_rw_unlock(inode, level); | ||
649 | } | ||
650 | |||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | static int ocfs2_releasepage(struct page *page, gfp_t wait) | 502 | static int ocfs2_releasepage(struct page *page, gfp_t wait) |
655 | { | 503 | { |
656 | if (!page_has_buffers(page)) | 504 | if (!page_has_buffers(page)) |
@@ -658,363 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) | |||
658 | return try_to_free_buffers(page); | 506 | return try_to_free_buffers(page); |
659 | } | 507 | } |
660 | 508 | ||
661 | static int ocfs2_is_overwrite(struct ocfs2_super *osb, | ||
662 | struct inode *inode, loff_t offset) | ||
663 | { | ||
664 | int ret = 0; | ||
665 | u32 v_cpos = 0; | ||
666 | u32 p_cpos = 0; | ||
667 | unsigned int num_clusters = 0; | ||
668 | unsigned int ext_flags = 0; | ||
669 | |||
670 | v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); | ||
671 | ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, | ||
672 | &num_clusters, &ext_flags); | ||
673 | if (ret < 0) { | ||
674 | mlog_errno(ret); | ||
675 | return ret; | ||
676 | } | ||
677 | |||
678 | if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
679 | return 1; | ||
680 | |||
681 | return 0; | ||
682 | } | ||
683 | |||
684 | static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, | ||
685 | struct inode *inode, loff_t offset, | ||
686 | u64 zero_len, int cluster_align) | ||
687 | { | ||
688 | u32 p_cpos = 0; | ||
689 | u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); | ||
690 | unsigned int num_clusters = 0; | ||
691 | unsigned int ext_flags = 0; | ||
692 | int ret = 0; | ||
693 | |||
694 | if (offset <= i_size_read(inode) || cluster_align) | ||
695 | return 0; | ||
696 | |||
697 | ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, | ||
698 | &ext_flags); | ||
699 | if (ret < 0) { | ||
700 | mlog_errno(ret); | ||
701 | return ret; | ||
702 | } | ||
703 | |||
704 | if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { | ||
705 | u64 s = i_size_read(inode); | ||
706 | sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) + | ||
707 | (do_div(s, osb->s_clustersize) >> 9); | ||
708 | |||
709 | ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, | ||
710 | zero_len >> 9, GFP_NOFS, false); | ||
711 | if (ret < 0) | ||
712 | mlog_errno(ret); | ||
713 | } | ||
714 | |||
715 | return ret; | ||
716 | } | ||
717 | |||
718 | static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, | ||
719 | struct inode *inode, loff_t offset) | ||
720 | { | ||
721 | u64 zero_start, zero_len, total_zero_len; | ||
722 | u32 p_cpos = 0, clusters_to_add; | ||
723 | u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); | ||
724 | unsigned int num_clusters = 0; | ||
725 | unsigned int ext_flags = 0; | ||
726 | u32 size_div, offset_div; | ||
727 | int ret = 0; | ||
728 | |||
729 | { | ||
730 | u64 o = offset; | ||
731 | u64 s = i_size_read(inode); | ||
732 | |||
733 | offset_div = do_div(o, osb->s_clustersize); | ||
734 | size_div = do_div(s, osb->s_clustersize); | ||
735 | } | ||
736 | |||
737 | if (offset <= i_size_read(inode)) | ||
738 | return 0; | ||
739 | |||
740 | clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - | ||
741 | ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); | ||
742 | total_zero_len = offset - i_size_read(inode); | ||
743 | if (clusters_to_add) | ||
744 | total_zero_len -= offset_div; | ||
745 | |||
746 | /* Allocate clusters to fill out holes, and this is only needed | ||
747 | * when we add more than one clusters. Otherwise the cluster will | ||
748 | * be allocated during direct IO */ | ||
749 | if (clusters_to_add > 1) { | ||
750 | ret = ocfs2_extend_allocation(inode, | ||
751 | OCFS2_I(inode)->ip_clusters, | ||
752 | clusters_to_add - 1, 0); | ||
753 | if (ret) { | ||
754 | mlog_errno(ret); | ||
755 | goto out; | ||
756 | } | ||
757 | } | ||
758 | |||
759 | while (total_zero_len) { | ||
760 | ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, | ||
761 | &ext_flags); | ||
762 | if (ret < 0) { | ||
763 | mlog_errno(ret); | ||
764 | goto out; | ||
765 | } | ||
766 | |||
767 | zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + | ||
768 | size_div; | ||
769 | zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - | ||
770 | size_div; | ||
771 | zero_len = min(total_zero_len, zero_len); | ||
772 | |||
773 | if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { | ||
774 | ret = blkdev_issue_zeroout(osb->sb->s_bdev, | ||
775 | zero_start >> 9, zero_len >> 9, | ||
776 | GFP_NOFS, false); | ||
777 | if (ret < 0) { | ||
778 | mlog_errno(ret); | ||
779 | goto out; | ||
780 | } | ||
781 | } | ||
782 | |||
783 | total_zero_len -= zero_len; | ||
784 | v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); | ||
785 | |||
786 | /* Only at first iteration can be cluster not aligned. | ||
787 | * So set size_div to 0 for the rest */ | ||
788 | size_div = 0; | ||
789 | } | ||
790 | |||
791 | out: | ||
792 | return ret; | ||
793 | } | ||
794 | |||
795 | static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, | ||
796 | struct iov_iter *iter, | ||
797 | loff_t offset) | ||
798 | { | ||
799 | ssize_t ret = 0; | ||
800 | ssize_t written = 0; | ||
801 | bool orphaned = false; | ||
802 | int is_overwrite = 0; | ||
803 | struct file *file = iocb->ki_filp; | ||
804 | struct inode *inode = file_inode(file)->i_mapping->host; | ||
805 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
806 | struct buffer_head *di_bh = NULL; | ||
807 | size_t count = iter->count; | ||
808 | journal_t *journal = osb->journal->j_journal; | ||
809 | u64 zero_len_head, zero_len_tail; | ||
810 | int cluster_align_head, cluster_align_tail; | ||
811 | loff_t final_size = offset + count; | ||
812 | int append_write = offset >= i_size_read(inode) ? 1 : 0; | ||
813 | unsigned int num_clusters = 0; | ||
814 | unsigned int ext_flags = 0; | ||
815 | |||
816 | { | ||
817 | u64 o = offset; | ||
818 | u64 s = i_size_read(inode); | ||
819 | |||
820 | zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); | ||
821 | cluster_align_head = !zero_len_head; | ||
822 | |||
823 | zero_len_tail = osb->s_clustersize - | ||
824 | do_div(s, osb->s_clustersize); | ||
825 | if ((offset - i_size_read(inode)) < zero_len_tail) | ||
826 | zero_len_tail = offset - i_size_read(inode); | ||
827 | cluster_align_tail = !zero_len_tail; | ||
828 | } | ||
829 | |||
830 | /* | ||
831 | * when final_size > inode->i_size, inode->i_size will be | ||
832 | * updated after direct write, so add the inode to orphan | ||
833 | * dir first. | ||
834 | */ | ||
835 | if (final_size > i_size_read(inode)) { | ||
836 | ret = ocfs2_add_inode_to_orphan(osb, inode); | ||
837 | if (ret < 0) { | ||
838 | mlog_errno(ret); | ||
839 | goto out; | ||
840 | } | ||
841 | orphaned = true; | ||
842 | } | ||
843 | |||
844 | if (append_write) { | ||
845 | ret = ocfs2_inode_lock(inode, NULL, 1); | ||
846 | if (ret < 0) { | ||
847 | mlog_errno(ret); | ||
848 | goto clean_orphan; | ||
849 | } | ||
850 | |||
851 | /* zeroing out the previously allocated cluster tail | ||
852 | * that but not zeroed */ | ||
853 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | ||
854 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
855 | ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, | ||
856 | zero_len_tail, cluster_align_tail); | ||
857 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
858 | } else { | ||
859 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
860 | ret = ocfs2_direct_IO_extend_no_holes(osb, inode, | ||
861 | offset); | ||
862 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
863 | } | ||
864 | if (ret < 0) { | ||
865 | mlog_errno(ret); | ||
866 | ocfs2_inode_unlock(inode, 1); | ||
867 | goto clean_orphan; | ||
868 | } | ||
869 | |||
870 | is_overwrite = ocfs2_is_overwrite(osb, inode, offset); | ||
871 | if (is_overwrite < 0) { | ||
872 | mlog_errno(is_overwrite); | ||
873 | ret = is_overwrite; | ||
874 | ocfs2_inode_unlock(inode, 1); | ||
875 | goto clean_orphan; | ||
876 | } | ||
877 | |||
878 | ocfs2_inode_unlock(inode, 1); | ||
879 | } | ||
880 | |||
881 | written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, | ||
882 | offset, ocfs2_direct_IO_get_blocks, | ||
883 | ocfs2_dio_end_io, NULL, 0); | ||
884 | /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ | ||
885 | if ((written < 0) && (written != -EIOCBQUEUED)) { | ||
886 | loff_t i_size = i_size_read(inode); | ||
887 | |||
888 | if (offset + count > i_size) { | ||
889 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | ||
890 | if (ret < 0) { | ||
891 | mlog_errno(ret); | ||
892 | goto clean_orphan; | ||
893 | } | ||
894 | |||
895 | if (i_size == i_size_read(inode)) { | ||
896 | ret = ocfs2_truncate_file(inode, di_bh, | ||
897 | i_size); | ||
898 | if (ret < 0) { | ||
899 | if (ret != -ENOSPC) | ||
900 | mlog_errno(ret); | ||
901 | |||
902 | ocfs2_inode_unlock(inode, 1); | ||
903 | brelse(di_bh); | ||
904 | di_bh = NULL; | ||
905 | goto clean_orphan; | ||
906 | } | ||
907 | } | ||
908 | |||
909 | ocfs2_inode_unlock(inode, 1); | ||
910 | brelse(di_bh); | ||
911 | di_bh = NULL; | ||
912 | |||
913 | ret = jbd2_journal_force_commit(journal); | ||
914 | if (ret < 0) | ||
915 | mlog_errno(ret); | ||
916 | } | ||
917 | } else if (written > 0 && append_write && !is_overwrite && | ||
918 | !cluster_align_head) { | ||
919 | /* zeroing out the allocated cluster head */ | ||
920 | u32 p_cpos = 0; | ||
921 | u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); | ||
922 | |||
923 | ret = ocfs2_inode_lock(inode, NULL, 0); | ||
924 | if (ret < 0) { | ||
925 | mlog_errno(ret); | ||
926 | goto clean_orphan; | ||
927 | } | ||
928 | |||
929 | ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, | ||
930 | &num_clusters, &ext_flags); | ||
931 | if (ret < 0) { | ||
932 | mlog_errno(ret); | ||
933 | ocfs2_inode_unlock(inode, 0); | ||
934 | goto clean_orphan; | ||
935 | } | ||
936 | |||
937 | BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); | ||
938 | |||
939 | ret = blkdev_issue_zeroout(osb->sb->s_bdev, | ||
940 | (u64)p_cpos << (osb->s_clustersize_bits - 9), | ||
941 | zero_len_head >> 9, GFP_NOFS, false); | ||
942 | if (ret < 0) | ||
943 | mlog_errno(ret); | ||
944 | |||
945 | ocfs2_inode_unlock(inode, 0); | ||
946 | } | ||
947 | |||
948 | clean_orphan: | ||
949 | if (orphaned) { | ||
950 | int tmp_ret; | ||
951 | int update_isize = written > 0 ? 1 : 0; | ||
952 | loff_t end = update_isize ? offset + written : 0; | ||
953 | |||
954 | tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); | ||
955 | if (tmp_ret < 0) { | ||
956 | ret = tmp_ret; | ||
957 | mlog_errno(ret); | ||
958 | goto out; | ||
959 | } | ||
960 | |||
961 | tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, | ||
962 | update_isize, end); | ||
963 | if (tmp_ret < 0) { | ||
964 | ocfs2_inode_unlock(inode, 1); | ||
965 | ret = tmp_ret; | ||
966 | mlog_errno(ret); | ||
967 | brelse(di_bh); | ||
968 | goto out; | ||
969 | } | ||
970 | |||
971 | ocfs2_inode_unlock(inode, 1); | ||
972 | brelse(di_bh); | ||
973 | |||
974 | tmp_ret = jbd2_journal_force_commit(journal); | ||
975 | if (tmp_ret < 0) { | ||
976 | ret = tmp_ret; | ||
977 | mlog_errno(tmp_ret); | ||
978 | } | ||
979 | } | ||
980 | |||
981 | out: | ||
982 | if (ret >= 0) | ||
983 | ret = written; | ||
984 | return ret; | ||
985 | } | ||
986 | |||
987 | static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | ||
988 | loff_t offset) | ||
989 | { | ||
990 | struct file *file = iocb->ki_filp; | ||
991 | struct inode *inode = file_inode(file)->i_mapping->host; | ||
992 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
993 | int full_coherency = !(osb->s_mount_opt & | ||
994 | OCFS2_MOUNT_COHERENCY_BUFFERED); | ||
995 | |||
996 | /* | ||
997 | * Fallback to buffered I/O if we see an inode without | ||
998 | * extents. | ||
999 | */ | ||
1000 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | ||
1001 | return 0; | ||
1002 | |||
1003 | /* Fallback to buffered I/O if we are appending and | ||
1004 | * concurrent O_DIRECT writes are allowed. | ||
1005 | */ | ||
1006 | if (i_size_read(inode) <= offset && !full_coherency) | ||
1007 | return 0; | ||
1008 | |||
1009 | if (iov_iter_rw(iter) == READ) | ||
1010 | return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, | ||
1011 | iter, offset, | ||
1012 | ocfs2_direct_IO_get_blocks, | ||
1013 | ocfs2_dio_end_io, NULL, 0); | ||
1014 | else | ||
1015 | return ocfs2_direct_IO_write(iocb, iter, offset); | ||
1016 | } | ||
1017 | |||
1018 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, | 509 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, |
1019 | u32 cpos, | 510 | u32 cpos, |
1020 | unsigned int *start, | 511 | unsigned int *start, |
@@ -1201,6 +692,13 @@ next_bh: | |||
1201 | 692 | ||
1202 | #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) | 693 | #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) |
1203 | 694 | ||
695 | struct ocfs2_unwritten_extent { | ||
696 | struct list_head ue_node; | ||
697 | struct list_head ue_ip_node; | ||
698 | u32 ue_cpos; | ||
699 | u32 ue_phys; | ||
700 | }; | ||
701 | |||
1204 | /* | 702 | /* |
1205 | * Describe the state of a single cluster to be written to. | 703 | * Describe the state of a single cluster to be written to. |
1206 | */ | 704 | */ |
@@ -1212,7 +710,7 @@ struct ocfs2_write_cluster_desc { | |||
1212 | * filled. | 710 | * filled. |
1213 | */ | 711 | */ |
1214 | unsigned c_new; | 712 | unsigned c_new; |
1215 | unsigned c_unwritten; | 713 | unsigned c_clear_unwritten; |
1216 | unsigned c_needs_zero; | 714 | unsigned c_needs_zero; |
1217 | }; | 715 | }; |
1218 | 716 | ||
@@ -1224,6 +722,9 @@ struct ocfs2_write_ctxt { | |||
1224 | /* First cluster allocated in a nonsparse extend */ | 722 | /* First cluster allocated in a nonsparse extend */ |
1225 | u32 w_first_new_cpos; | 723 | u32 w_first_new_cpos; |
1226 | 724 | ||
725 | /* Type of caller. Must be one of buffer, mmap, direct. */ | ||
726 | ocfs2_write_type_t w_type; | ||
727 | |||
1227 | struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; | 728 | struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; |
1228 | 729 | ||
1229 | /* | 730 | /* |
@@ -1272,6 +773,8 @@ struct ocfs2_write_ctxt { | |||
1272 | struct buffer_head *w_di_bh; | 773 | struct buffer_head *w_di_bh; |
1273 | 774 | ||
1274 | struct ocfs2_cached_dealloc_ctxt w_dealloc; | 775 | struct ocfs2_cached_dealloc_ctxt w_dealloc; |
776 | |||
777 | struct list_head w_unwritten_list; | ||
1275 | }; | 778 | }; |
1276 | 779 | ||
1277 | void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) | 780 | void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) |
@@ -1310,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) | |||
1310 | ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); | 813 | ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); |
1311 | } | 814 | } |
1312 | 815 | ||
1313 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | 816 | static void ocfs2_free_unwritten_list(struct inode *inode, |
817 | struct list_head *head) | ||
1314 | { | 818 | { |
819 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
820 | struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL; | ||
821 | |||
822 | list_for_each_entry_safe(ue, tmp, head, ue_node) { | ||
823 | list_del(&ue->ue_node); | ||
824 | spin_lock(&oi->ip_lock); | ||
825 | list_del(&ue->ue_ip_node); | ||
826 | spin_unlock(&oi->ip_lock); | ||
827 | kfree(ue); | ||
828 | } | ||
829 | } | ||
830 | |||
831 | static void ocfs2_free_write_ctxt(struct inode *inode, | ||
832 | struct ocfs2_write_ctxt *wc) | ||
833 | { | ||
834 | ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); | ||
1315 | ocfs2_unlock_pages(wc); | 835 | ocfs2_unlock_pages(wc); |
1316 | brelse(wc->w_di_bh); | 836 | brelse(wc->w_di_bh); |
1317 | kfree(wc); | 837 | kfree(wc); |
@@ -1319,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | |||
1319 | 839 | ||
1320 | static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, | 840 | static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, |
1321 | struct ocfs2_super *osb, loff_t pos, | 841 | struct ocfs2_super *osb, loff_t pos, |
1322 | unsigned len, struct buffer_head *di_bh) | 842 | unsigned len, ocfs2_write_type_t type, |
843 | struct buffer_head *di_bh) | ||
1323 | { | 844 | { |
1324 | u32 cend; | 845 | u32 cend; |
1325 | struct ocfs2_write_ctxt *wc; | 846 | struct ocfs2_write_ctxt *wc; |
@@ -1334,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, | |||
1334 | wc->w_clen = cend - wc->w_cpos + 1; | 855 | wc->w_clen = cend - wc->w_cpos + 1; |
1335 | get_bh(di_bh); | 856 | get_bh(di_bh); |
1336 | wc->w_di_bh = di_bh; | 857 | wc->w_di_bh = di_bh; |
858 | wc->w_type = type; | ||
1337 | 859 | ||
1338 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | 860 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) |
1339 | wc->w_large_pages = 1; | 861 | wc->w_large_pages = 1; |
@@ -1341,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, | |||
1341 | wc->w_large_pages = 0; | 863 | wc->w_large_pages = 0; |
1342 | 864 | ||
1343 | ocfs2_init_dealloc_ctxt(&wc->w_dealloc); | 865 | ocfs2_init_dealloc_ctxt(&wc->w_dealloc); |
866 | INIT_LIST_HEAD(&wc->w_unwritten_list); | ||
1344 | 867 | ||
1345 | *wcp = wc; | 868 | *wcp = wc; |
1346 | 869 | ||
@@ -1401,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode, | |||
1401 | to = user_pos + user_len; | 924 | to = user_pos + user_len; |
1402 | struct page *tmppage; | 925 | struct page *tmppage; |
1403 | 926 | ||
1404 | ocfs2_zero_new_buffers(wc->w_target_page, from, to); | 927 | if (wc->w_target_page) |
928 | ocfs2_zero_new_buffers(wc->w_target_page, from, to); | ||
1405 | 929 | ||
1406 | for(i = 0; i < wc->w_num_pages; i++) { | 930 | for(i = 0; i < wc->w_num_pages; i++) { |
1407 | tmppage = wc->w_pages[i]; | 931 | tmppage = wc->w_pages[i]; |
1408 | 932 | ||
1409 | if (page_has_buffers(tmppage)) { | 933 | if (tmppage && page_has_buffers(tmppage)) { |
1410 | if (ocfs2_should_order_data(inode)) | 934 | if (ocfs2_should_order_data(inode)) |
1411 | ocfs2_jbd2_file_inode(wc->w_handle, inode); | 935 | ocfs2_jbd2_file_inode(wc->w_handle, inode); |
1412 | 936 | ||
@@ -1536,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |||
1536 | wc->w_num_pages = 1; | 1060 | wc->w_num_pages = 1; |
1537 | start = target_index; | 1061 | start = target_index; |
1538 | } | 1062 | } |
1063 | end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT; | ||
1539 | 1064 | ||
1540 | for(i = 0; i < wc->w_num_pages; i++) { | 1065 | for(i = 0; i < wc->w_num_pages; i++) { |
1541 | index = start + i; | 1066 | index = start + i; |
1542 | 1067 | ||
1543 | if (index == target_index && mmap_page) { | 1068 | if (index >= target_index && index <= end_index && |
1069 | wc->w_type == OCFS2_WRITE_MMAP) { | ||
1544 | /* | 1070 | /* |
1545 | * ocfs2_pagemkwrite() is a little different | 1071 | * ocfs2_pagemkwrite() is a little different |
1546 | * and wants us to directly use the page | 1072 | * and wants us to directly use the page |
@@ -1559,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |||
1559 | page_cache_get(mmap_page); | 1085 | page_cache_get(mmap_page); |
1560 | wc->w_pages[i] = mmap_page; | 1086 | wc->w_pages[i] = mmap_page; |
1561 | wc->w_target_locked = true; | 1087 | wc->w_target_locked = true; |
1088 | } else if (index >= target_index && index <= end_index && | ||
1089 | wc->w_type == OCFS2_WRITE_DIRECT) { | ||
1090 | /* Direct write has no mapping page. */ | ||
1091 | wc->w_pages[i] = NULL; | ||
1092 | continue; | ||
1562 | } else { | 1093 | } else { |
1563 | wc->w_pages[i] = find_or_create_page(mapping, index, | 1094 | wc->w_pages[i] = find_or_create_page(mapping, index, |
1564 | GFP_NOFS); | 1095 | GFP_NOFS); |
@@ -1583,19 +1114,20 @@ out: | |||
1583 | * Prepare a single cluster to be written into the file. | 1114 | * Prepare a single cluster to be written into the file. |
1584 | */ | 1115 | */ |
1585 | static int ocfs2_write_cluster(struct address_space *mapping, | 1116 | static int ocfs2_write_cluster(struct address_space *mapping, |
1586 | u32 phys, unsigned int unwritten, | 1117 | u32 *phys, unsigned int new, |
1118 | unsigned int clear_unwritten, | ||
1587 | unsigned int should_zero, | 1119 | unsigned int should_zero, |
1588 | struct ocfs2_alloc_context *data_ac, | 1120 | struct ocfs2_alloc_context *data_ac, |
1589 | struct ocfs2_alloc_context *meta_ac, | 1121 | struct ocfs2_alloc_context *meta_ac, |
1590 | struct ocfs2_write_ctxt *wc, u32 cpos, | 1122 | struct ocfs2_write_ctxt *wc, u32 cpos, |
1591 | loff_t user_pos, unsigned user_len) | 1123 | loff_t user_pos, unsigned user_len) |
1592 | { | 1124 | { |
1593 | int ret, i, new; | 1125 | int ret, i; |
1594 | u64 v_blkno, p_blkno; | 1126 | u64 p_blkno; |
1595 | struct inode *inode = mapping->host; | 1127 | struct inode *inode = mapping->host; |
1596 | struct ocfs2_extent_tree et; | 1128 | struct ocfs2_extent_tree et; |
1129 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | ||
1597 | 1130 | ||
1598 | new = phys == 0 ? 1 : 0; | ||
1599 | if (new) { | 1131 | if (new) { |
1600 | u32 tmp_pos; | 1132 | u32 tmp_pos; |
1601 | 1133 | ||
@@ -1605,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping, | |||
1605 | */ | 1137 | */ |
1606 | tmp_pos = cpos; | 1138 | tmp_pos = cpos; |
1607 | ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, | 1139 | ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, |
1608 | &tmp_pos, 1, 0, wc->w_di_bh, | 1140 | &tmp_pos, 1, !clear_unwritten, |
1609 | wc->w_handle, data_ac, | 1141 | wc->w_di_bh, wc->w_handle, |
1610 | meta_ac, NULL); | 1142 | data_ac, meta_ac, NULL); |
1611 | /* | 1143 | /* |
1612 | * This shouldn't happen because we must have already | 1144 | * This shouldn't happen because we must have already |
1613 | * calculated the correct meta data allocation required. The | 1145 | * calculated the correct meta data allocation required. The |
@@ -1624,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping, | |||
1624 | mlog_errno(ret); | 1156 | mlog_errno(ret); |
1625 | goto out; | 1157 | goto out; |
1626 | } | 1158 | } |
1627 | } else if (unwritten) { | 1159 | } else if (clear_unwritten) { |
1628 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), | 1160 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), |
1629 | wc->w_di_bh); | 1161 | wc->w_di_bh); |
1630 | ret = ocfs2_mark_extent_written(inode, &et, | 1162 | ret = ocfs2_mark_extent_written(inode, &et, |
1631 | wc->w_handle, cpos, 1, phys, | 1163 | wc->w_handle, cpos, 1, *phys, |
1632 | meta_ac, &wc->w_dealloc); | 1164 | meta_ac, &wc->w_dealloc); |
1633 | if (ret < 0) { | 1165 | if (ret < 0) { |
1634 | mlog_errno(ret); | 1166 | mlog_errno(ret); |
@@ -1636,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping, | |||
1636 | } | 1168 | } |
1637 | } | 1169 | } |
1638 | 1170 | ||
1639 | if (should_zero) | ||
1640 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); | ||
1641 | else | ||
1642 | v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; | ||
1643 | |||
1644 | /* | 1171 | /* |
1645 | * The only reason this should fail is due to an inability to | 1172 | * The only reason this should fail is due to an inability to |
1646 | * find the extent added. | 1173 | * find the extent added. |
1647 | */ | 1174 | */ |
1648 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | 1175 | ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL); |
1649 | NULL); | ||
1650 | if (ret < 0) { | 1176 | if (ret < 0) { |
1651 | mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " | 1177 | mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " |
1652 | "at logical block %llu", | 1178 | "at logical cluster %u", |
1653 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1179 | (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); |
1654 | (unsigned long long)v_blkno); | ||
1655 | goto out; | 1180 | goto out; |
1656 | } | 1181 | } |
1657 | 1182 | ||
1658 | BUG_ON(p_blkno == 0); | 1183 | BUG_ON(*phys == 0); |
1184 | |||
1185 | p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys); | ||
1186 | if (!should_zero) | ||
1187 | p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); | ||
1659 | 1188 | ||
1660 | for(i = 0; i < wc->w_num_pages; i++) { | 1189 | for(i = 0; i < wc->w_num_pages; i++) { |
1661 | int tmpret; | 1190 | int tmpret; |
1662 | 1191 | ||
1192 | /* This is the direct io target page. */ | ||
1193 | if (wc->w_pages[i] == NULL) { | ||
1194 | p_blkno++; | ||
1195 | continue; | ||
1196 | } | ||
1197 | |||
1663 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, | 1198 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, |
1664 | wc->w_pages[i], cpos, | 1199 | wc->w_pages[i], cpos, |
1665 | user_pos, user_len, | 1200 | user_pos, user_len, |
@@ -1706,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, | |||
1706 | if ((cluster_off + local_len) > osb->s_clustersize) | 1241 | if ((cluster_off + local_len) > osb->s_clustersize) |
1707 | local_len = osb->s_clustersize - cluster_off; | 1242 | local_len = osb->s_clustersize - cluster_off; |
1708 | 1243 | ||
1709 | ret = ocfs2_write_cluster(mapping, desc->c_phys, | 1244 | ret = ocfs2_write_cluster(mapping, &desc->c_phys, |
1710 | desc->c_unwritten, | 1245 | desc->c_new, |
1246 | desc->c_clear_unwritten, | ||
1711 | desc->c_needs_zero, | 1247 | desc->c_needs_zero, |
1712 | data_ac, meta_ac, | 1248 | data_ac, meta_ac, |
1713 | wc, desc->c_cpos, pos, local_len); | 1249 | wc, desc->c_cpos, pos, local_len); |
@@ -1778,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | |||
1778 | } | 1314 | } |
1779 | 1315 | ||
1780 | /* | 1316 | /* |
1317 | * Check if this extent is marked UNWRITTEN by direct io. If so, we need not | ||
1318 | * do the zero work, and must not clear UNWRITTEN since it will be cleared | ||
1319 | * by the direct io procedure. | ||
1320 | * If this is a new extent allocated by direct io, we should mark it in | ||
1321 | * the ip_unwritten_list. | ||
1322 | */ | ||
1323 | static int ocfs2_unwritten_check(struct inode *inode, | ||
1324 | struct ocfs2_write_ctxt *wc, | ||
1325 | struct ocfs2_write_cluster_desc *desc) | ||
1326 | { | ||
1327 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1328 | struct ocfs2_unwritten_extent *ue = NULL, *new = NULL; | ||
1329 | int ret = 0; | ||
1330 | |||
1331 | if (!desc->c_needs_zero) | ||
1332 | return 0; | ||
1333 | |||
1334 | retry: | ||
1335 | spin_lock(&oi->ip_lock); | ||
1336 | /* No need to zero, no matter buffered or direct: whoever owns the | ||
1337 | * cluster is doing the zeroing, and will clear unwritten after all | ||
1338 | * io on the cluster has finished. */ | ||
1339 | list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) { | ||
1340 | if (desc->c_cpos == ue->ue_cpos) { | ||
1341 | BUG_ON(desc->c_new); | ||
1342 | desc->c_needs_zero = 0; | ||
1343 | desc->c_clear_unwritten = 0; | ||
1344 | goto unlock; | ||
1345 | } | ||
1346 | } | ||
1347 | |||
1348 | if (wc->w_type != OCFS2_WRITE_DIRECT) | ||
1349 | goto unlock; | ||
1350 | |||
1351 | if (new == NULL) { | ||
1352 | spin_unlock(&oi->ip_lock); | ||
1353 | new = kmalloc(sizeof(struct ocfs2_unwritten_extent), | ||
1354 | GFP_NOFS); | ||
1355 | if (new == NULL) { | ||
1356 | ret = -ENOMEM; | ||
1357 | goto out; | ||
1358 | } | ||
1359 | goto retry; | ||
1360 | } | ||
1361 | /* This direct write will do the zeroing. */ | ||
1362 | new->ue_cpos = desc->c_cpos; | ||
1363 | new->ue_phys = desc->c_phys; | ||
1364 | desc->c_clear_unwritten = 0; | ||
1365 | list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); | ||
1366 | list_add_tail(&new->ue_node, &wc->w_unwritten_list); | ||
1367 | new = NULL; | ||
1368 | unlock: | ||
1369 | spin_unlock(&oi->ip_lock); | ||
1370 | out: | ||
1371 | if (new) | ||
1372 | kfree(new); | ||
1373 | return ret; | ||
1374 | } | ||
1375 | |||
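ocfs2_unwritten_check() threads its bookkeeping through a small record; a
minimal sketch of it, assuming only the fields implied by the usage above
(the real definition is added to aops.h elsewhere in this series):

	struct ocfs2_unwritten_extent {
		struct list_head	ue_node;	/* on wc->w_unwritten_list, later dwc->dw_zero_list */
		struct list_head	ue_ip_node;	/* on oi->ip_unwritten_list */
		u32			ue_cpos;	/* logical cluster offset */
		u32			ue_phys;	/* physical cluster, may be filled in later */
	};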
1376 | /* | ||
1781 | * Populate each single-cluster write descriptor in the write context | 1377 | * Populate each single-cluster write descriptor in the write context |
1782 | * with information about the i/o to be done. | 1378 | * with information about the i/o to be done. |
1783 | * | 1379 | * |
@@ -1852,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode, | |||
1852 | if (phys == 0) { | 1448 | if (phys == 0) { |
1853 | desc->c_new = 1; | 1449 | desc->c_new = 1; |
1854 | desc->c_needs_zero = 1; | 1450 | desc->c_needs_zero = 1; |
1451 | desc->c_clear_unwritten = 1; | ||
1855 | *clusters_to_alloc = *clusters_to_alloc + 1; | 1452 | *clusters_to_alloc = *clusters_to_alloc + 1; |
1856 | } | 1453 | } |
1857 | 1454 | ||
1858 | if (ext_flags & OCFS2_EXT_UNWRITTEN) { | 1455 | if (ext_flags & OCFS2_EXT_UNWRITTEN) { |
1859 | desc->c_unwritten = 1; | 1456 | desc->c_clear_unwritten = 1; |
1860 | desc->c_needs_zero = 1; | 1457 | desc->c_needs_zero = 1; |
1861 | } | 1458 | } |
1862 | 1459 | ||
1460 | ret = ocfs2_unwritten_check(inode, wc, desc); | ||
1461 | if (ret) { | ||
1462 | mlog_errno(ret); | ||
1463 | goto out; | ||
1464 | } | ||
1465 | |||
1863 | num_clusters--; | 1466 | num_clusters--; |
1864 | } | 1467 | } |
1865 | 1468 | ||
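How the three descriptor flags combine, condensed from the assignments above
and from ocfs2_unwritten_check():

	/*
	 * hole (phys == 0)        : c_new = 1, c_needs_zero = 1, c_clear_unwritten = 1
	 * extent UNWRITTEN on disk: c_needs_zero = 1, c_clear_unwritten = 1
	 * cluster already on ip_unwritten_list: both cleared again -- another
	 *     writer owns the zeroing and will clear UNWRITTEN itself
	 * OCFS2_WRITE_DIRECT      : c_clear_unwritten forced to 0; the extent is
	 *     queued so that dio completion clears UNWRITTEN instead
	 */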
@@ -2022,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, | |||
2022 | if (ret) | 1625 | if (ret) |
2023 | mlog_errno(ret); | 1626 | mlog_errno(ret); |
2024 | 1627 | ||
2025 | wc->w_first_new_cpos = | 1628 | /* There is no wc if this is called from direct io. */ |
2026 | ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); | 1629 | if (wc) |
1630 | wc->w_first_new_cpos = | ||
1631 | ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); | ||
2027 | 1632 | ||
2028 | return ret; | 1633 | return ret; |
2029 | } | 1634 | } |
@@ -2077,9 +1682,8 @@ out: | |||
2077 | return ret; | 1682 | return ret; |
2078 | } | 1683 | } |
2079 | 1684 | ||
2080 | int ocfs2_write_begin_nolock(struct file *filp, | 1685 | int ocfs2_write_begin_nolock(struct address_space *mapping, |
2081 | struct address_space *mapping, | 1686 | loff_t pos, unsigned len, ocfs2_write_type_t type, |
2082 | loff_t pos, unsigned len, unsigned flags, | ||
2083 | struct page **pagep, void **fsdata, | 1687 | struct page **pagep, void **fsdata, |
2084 | struct buffer_head *di_bh, struct page *mmap_page) | 1688 | struct buffer_head *di_bh, struct page *mmap_page) |
2085 | { | 1689 | { |
@@ -2096,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp, | |||
2096 | int try_free = 1, ret1; | 1700 | int try_free = 1, ret1; |
2097 | 1701 | ||
2098 | try_again: | 1702 | try_again: |
2099 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); | 1703 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh); |
2100 | if (ret) { | 1704 | if (ret) { |
2101 | mlog_errno(ret); | 1705 | mlog_errno(ret); |
2102 | return ret; | 1706 | return ret; |
@@ -2115,14 +1719,17 @@ try_again: | |||
2115 | } | 1719 | } |
2116 | } | 1720 | } |
2117 | 1721 | ||
2118 | if (ocfs2_sparse_alloc(osb)) | 1722 | /* Direct io changes i_size late, so do not zero the tail here. */ |
2119 | ret = ocfs2_zero_tail(inode, di_bh, pos); | 1723 | if (type != OCFS2_WRITE_DIRECT) { |
2120 | else | 1724 | if (ocfs2_sparse_alloc(osb)) |
2121 | ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, | 1725 | ret = ocfs2_zero_tail(inode, di_bh, pos); |
2122 | wc); | 1726 | else |
2123 | if (ret) { | 1727 | ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, |
2124 | mlog_errno(ret); | 1728 | len, wc); |
2125 | goto out; | 1729 | if (ret) { |
1730 | mlog_errno(ret); | ||
1731 | goto out; | ||
1732 | } | ||
2126 | } | 1733 | } |
2127 | 1734 | ||
2128 | ret = ocfs2_check_range_for_refcount(inode, pos, len); | 1735 | ret = ocfs2_check_range_for_refcount(inode, pos, len); |
@@ -2153,7 +1760,7 @@ try_again: | |||
2153 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1760 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
2154 | (long long)i_size_read(inode), | 1761 | (long long)i_size_read(inode), |
2155 | le32_to_cpu(di->i_clusters), | 1762 | le32_to_cpu(di->i_clusters), |
2156 | pos, len, flags, mmap_page, | 1763 | pos, len, type, mmap_page, |
2157 | clusters_to_alloc, extents_to_split); | 1764 | clusters_to_alloc, extents_to_split); |
2158 | 1765 | ||
2159 | /* | 1766 | /* |
@@ -2183,17 +1790,17 @@ try_again: | |||
2183 | 1790 | ||
2184 | credits = ocfs2_calc_extend_credits(inode->i_sb, | 1791 | credits = ocfs2_calc_extend_credits(inode->i_sb, |
2185 | &di->id2.i_list); | 1792 | &di->id2.i_list); |
2186 | 1793 | } else if (type == OCFS2_WRITE_DIRECT) | |
2187 | } | 1794 | /* direct write need not start a trans if no extents are allocated. */ |
1795 | goto success; | ||
2188 | 1796 | ||
2189 | /* | 1797 | /* |
2190 | * We have to zero sparse allocated clusters, unwritten extent clusters, | 1798 | * We have to zero sparse allocated clusters, unwritten extent clusters, |
2191 | * and non-sparse clusters we just extended. For non-sparse writes, | 1799 | * and non-sparse clusters we just extended. For non-sparse writes, |
2192 | * we know zeros will only be needed in the first and/or last cluster. | 1800 | * we know zeros will only be needed in the first and/or last cluster. |
2193 | */ | 1801 | */ |
2194 | if (clusters_to_alloc || extents_to_split || | 1802 | if (wc->w_clen && (wc->w_desc[0].c_needs_zero || |
2195 | (wc->w_clen && (wc->w_desc[0].c_needs_zero || | 1803 | wc->w_desc[wc->w_clen - 1].c_needs_zero)) |
2196 | wc->w_desc[wc->w_clen - 1].c_needs_zero))) | ||
2197 | cluster_of_pages = 1; | 1804 | cluster_of_pages = 1; |
2198 | else | 1805 | else |
2199 | cluster_of_pages = 0; | 1806 | cluster_of_pages = 0; |
@@ -2260,7 +1867,8 @@ try_again: | |||
2260 | ocfs2_free_alloc_context(meta_ac); | 1867 | ocfs2_free_alloc_context(meta_ac); |
2261 | 1868 | ||
2262 | success: | 1869 | success: |
2263 | *pagep = wc->w_target_page; | 1870 | if (pagep) |
1871 | *pagep = wc->w_target_page; | ||
2264 | *fsdata = wc; | 1872 | *fsdata = wc; |
2265 | return 0; | 1873 | return 0; |
2266 | out_quota: | 1874 | out_quota: |
@@ -2271,7 +1879,7 @@ out_commit: | |||
2271 | ocfs2_commit_trans(osb, handle); | 1879 | ocfs2_commit_trans(osb, handle); |
2272 | 1880 | ||
2273 | out: | 1881 | out: |
2274 | ocfs2_free_write_ctxt(wc); | 1882 | ocfs2_free_write_ctxt(inode, wc); |
2275 | 1883 | ||
2276 | if (data_ac) { | 1884 | if (data_ac) { |
2277 | ocfs2_free_alloc_context(data_ac); | 1885 | ocfs2_free_alloc_context(data_ac); |
@@ -2323,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, | |||
2323 | */ | 1931 | */ |
2324 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 1932 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
2325 | 1933 | ||
2326 | ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, | 1934 | ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, |
2327 | fsdata, di_bh, NULL); | 1935 | pagep, fsdata, di_bh, NULL); |
2328 | if (ret) { | 1936 | if (ret) { |
2329 | mlog_errno(ret); | 1937 | mlog_errno(ret); |
2330 | goto out_fail; | 1938 | goto out_fail; |
@@ -2381,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
2381 | handle_t *handle = wc->w_handle; | 1989 | handle_t *handle = wc->w_handle; |
2382 | struct page *tmppage; | 1990 | struct page *tmppage; |
2383 | 1991 | ||
2384 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, | 1992 | BUG_ON(!list_empty(&wc->w_unwritten_list)); |
2385 | OCFS2_JOURNAL_ACCESS_WRITE); | 1993 | |
2386 | if (ret) { | 1994 | if (handle) { |
2387 | copied = ret; | 1995 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), |
2388 | mlog_errno(ret); | 1996 | wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); |
2389 | goto out; | 1997 | if (ret) { |
1998 | copied = ret; | ||
1999 | mlog_errno(ret); | ||
2000 | goto out; | ||
2001 | } | ||
2390 | } | 2002 | } |
2391 | 2003 | ||
2392 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | 2004 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { |
@@ -2394,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
2394 | goto out_write_size; | 2006 | goto out_write_size; |
2395 | } | 2007 | } |
2396 | 2008 | ||
2397 | if (unlikely(copied < len)) { | 2009 | if (unlikely(copied < len) && wc->w_target_page) { |
2398 | if (!PageUptodate(wc->w_target_page)) | 2010 | if (!PageUptodate(wc->w_target_page)) |
2399 | copied = 0; | 2011 | copied = 0; |
2400 | 2012 | ||
2401 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, | 2013 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, |
2402 | start+len); | 2014 | start+len); |
2403 | } | 2015 | } |
2404 | flush_dcache_page(wc->w_target_page); | 2016 | if (wc->w_target_page) |
2017 | flush_dcache_page(wc->w_target_page); | ||
2405 | 2018 | ||
2406 | for(i = 0; i < wc->w_num_pages; i++) { | 2019 | for(i = 0; i < wc->w_num_pages; i++) { |
2407 | tmppage = wc->w_pages[i]; | 2020 | tmppage = wc->w_pages[i]; |
2408 | 2021 | ||
2022 | /* This is the direct io target page. */ | ||
2023 | if (tmppage == NULL) | ||
2024 | continue; | ||
2025 | |||
2409 | if (tmppage == wc->w_target_page) { | 2026 | if (tmppage == wc->w_target_page) { |
2410 | from = wc->w_target_from; | 2027 | from = wc->w_target_from; |
2411 | to = wc->w_target_to; | 2028 | to = wc->w_target_to; |
@@ -2424,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
2424 | } | 2041 | } |
2425 | 2042 | ||
2426 | if (page_has_buffers(tmppage)) { | 2043 | if (page_has_buffers(tmppage)) { |
2427 | if (ocfs2_should_order_data(inode)) | 2044 | if (handle && ocfs2_should_order_data(inode)) |
2428 | ocfs2_jbd2_file_inode(wc->w_handle, inode); | 2045 | ocfs2_jbd2_file_inode(handle, inode); |
2429 | block_commit_write(tmppage, from, to); | 2046 | block_commit_write(tmppage, from, to); |
2430 | } | 2047 | } |
2431 | } | 2048 | } |
2432 | 2049 | ||
2433 | out_write_size: | 2050 | out_write_size: |
2434 | pos += copied; | 2051 | /* Direct io does not update i_size here. */ |
2435 | if (pos > i_size_read(inode)) { | 2052 | if (wc->w_type != OCFS2_WRITE_DIRECT) { |
2436 | i_size_write(inode, pos); | 2053 | pos += copied; |
2437 | mark_inode_dirty(inode); | 2054 | if (pos > i_size_read(inode)) { |
2438 | } | 2055 | i_size_write(inode, pos); |
2439 | inode->i_blocks = ocfs2_inode_sector_count(inode); | 2056 | mark_inode_dirty(inode); |
2440 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | 2057 | } |
2441 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 2058 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
2442 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | 2059 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); |
2443 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | 2060 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
2444 | ocfs2_update_inode_fsync_trans(handle, inode, 1); | 2061 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); |
2445 | ocfs2_journal_dirty(handle, wc->w_di_bh); | 2062 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); |
2063 | ocfs2_update_inode_fsync_trans(handle, inode, 1); | ||
2064 | } | ||
2065 | if (handle) | ||
2066 | ocfs2_journal_dirty(handle, wc->w_di_bh); | ||
2446 | 2067 | ||
2447 | out: | 2068 | out: |
2448 | /* unlock pages before dealloc since it needs acquiring j_trans_barrier | 2069 | /* unlock pages before dealloc since it needs acquiring j_trans_barrier |
@@ -2452,7 +2073,8 @@ out: | |||
2452 | */ | 2073 | */ |
2453 | ocfs2_unlock_pages(wc); | 2074 | ocfs2_unlock_pages(wc); |
2454 | 2075 | ||
2455 | ocfs2_commit_trans(osb, handle); | 2076 | if (handle) |
2077 | ocfs2_commit_trans(osb, handle); | ||
2456 | 2078 | ||
2457 | ocfs2_run_deallocs(osb, &wc->w_dealloc); | 2079 | ocfs2_run_deallocs(osb, &wc->w_dealloc); |
2458 | 2080 | ||
@@ -2477,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, | |||
2477 | return ret; | 2099 | return ret; |
2478 | } | 2100 | } |
2479 | 2101 | ||
2102 | struct ocfs2_dio_write_ctxt { | ||
2103 | struct list_head dw_zero_list; | ||
2104 | unsigned dw_zero_count; | ||
2105 | int dw_orphaned; | ||
2106 | pid_t dw_writer_pid; | ||
2107 | }; | ||
2108 | |||
2109 | static struct ocfs2_dio_write_ctxt * | ||
2110 | ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc) | ||
2111 | { | ||
2112 | struct ocfs2_dio_write_ctxt *dwc = NULL; | ||
2113 | |||
2114 | if (bh->b_private) | ||
2115 | return bh->b_private; | ||
2116 | |||
2117 | dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS); | ||
2118 | if (dwc == NULL) | ||
2119 | return NULL; | ||
2120 | INIT_LIST_HEAD(&dwc->dw_zero_list); | ||
2121 | dwc->dw_zero_count = 0; | ||
2122 | dwc->dw_orphaned = 0; | ||
2123 | dwc->dw_writer_pid = task_pid_nr(current); | ||
2124 | bh->b_private = dwc; | ||
2125 | *alloc = 1; | ||
2126 | |||
2127 | return dwc; | ||
2128 | } | ||
2129 | |||
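The context stashed in bh->b_private travels to completion through the dio
core; a sketch of the assumed plumbing, paraphrased from fs/direct-io.c of
this era (not part of this patch):

	/* do_blockdev_direct_IO() reuses one map_bh for the whole request,
	 * so b_private persists across ocfs2_dio_get_block() calls ... */
	dio->private = map_bh.b_private;
	/* ... and is handed back at completion: */
	dio->end_io(dio->iocb, offset, transferred, dio->private);

which is why ocfs2_dio_end_io() below receives the ocfs2_dio_write_ctxt as
its 'private' argument.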
2130 | static void ocfs2_dio_free_write_ctx(struct inode *inode, | ||
2131 | struct ocfs2_dio_write_ctxt *dwc) | ||
2132 | { | ||
2133 | ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list); | ||
2134 | kfree(dwc); | ||
2135 | } | ||
2136 | |||
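ocfs2_free_unwritten_list() is not shown in this excerpt; a hedged sketch of
what the cleanup presumably does, given how the two lists are linked above
(names and locking follow ocfs2_unwritten_check(), not a verified copy):

	static void ocfs2_free_unwritten_list(struct inode *inode,
					      struct list_head *head)
	{
		struct ocfs2_inode_info *oi = OCFS2_I(inode);
		struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;

		/* drop each record from both lists under ip_lock, then free it */
		spin_lock(&oi->ip_lock);
		list_for_each_entry_safe(ue, tmp, head, ue_node) {
			list_del(&ue->ue_node);
			list_del(&ue->ue_ip_node);
			kfree(ue);
		}
		spin_unlock(&oi->ip_lock);
	}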
2137 | /* | ||
2138 | * TODO: Make this into a generic get_blocks function. | ||
2139 | * | ||
2140 | * From do_direct_io in direct-io.c: | ||
2141 | * "So what we do is to permit the ->get_blocks function to populate | ||
2142 | * bh.b_size with the size of IO which is permitted at this offset and | ||
2143 | * this i_blkbits." | ||
2144 | * | ||
2145 | * This function is called directly from get_more_blocks in direct-io.c. | ||
2146 | * | ||
2147 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | ||
2148 | * fs_count, map_bh, dio->rw == WRITE); | ||
2149 | */ | ||
2150 | static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, | ||
2151 | struct buffer_head *bh_result, int create) | ||
2152 | { | ||
2153 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
2154 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
2155 | struct ocfs2_write_ctxt *wc; | ||
2156 | struct ocfs2_write_cluster_desc *desc = NULL; | ||
2157 | struct ocfs2_dio_write_ctxt *dwc = NULL; | ||
2158 | struct buffer_head *di_bh = NULL; | ||
2159 | u64 p_blkno; | ||
2160 | loff_t pos = iblock << inode->i_sb->s_blocksize_bits; | ||
2161 | unsigned len, total_len = bh_result->b_size; | ||
2162 | int ret = 0, first_get_block = 0; | ||
2163 | |||
2164 | len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); | ||
2165 | len = min(total_len, len); | ||
2166 | |||
2167 | mlog(0, "get block of %lu at %llu:%u req %u\n", | ||
2168 | inode->i_ino, pos, len, total_len); | ||
2169 | |||
2170 | /* | ||
2171 | * Because we may need to change the file size in ocfs2_dio_end_io_write(), | ||
2172 | * or add the inode to the orphan dir, we cannot take the fast path | ||
2173 | * when the file size will change. | ||
2174 | */ | ||
2175 | if (pos + total_len <= i_size_read(inode)) { | ||
2176 | down_read(&oi->ip_alloc_sem); | ||
2177 | /* This is the fast path for re-write. */ | ||
2178 | ret = ocfs2_get_block(inode, iblock, bh_result, create); | ||
2179 | |||
2180 | up_read(&oi->ip_alloc_sem); | ||
2181 | |||
2182 | if (buffer_mapped(bh_result) && | ||
2183 | !buffer_new(bh_result) && | ||
2184 | ret == 0) | ||
2185 | goto out; | ||
2186 | |||
2187 | /* Clear state set by ocfs2_get_block. */ | ||
2188 | bh_result->b_state = 0; | ||
2189 | } | ||
2190 | |||
2191 | dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block); | ||
2192 | if (unlikely(dwc == NULL)) { | ||
2193 | ret = -ENOMEM; | ||
2194 | mlog_errno(ret); | ||
2195 | goto out; | ||
2196 | } | ||
2197 | |||
2198 | if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) > | ||
2199 | ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) && | ||
2200 | !dwc->dw_orphaned) { | ||
2201 | /* | ||
2202 | * When we are going to alloc extents beyond the file size, add the | ||
2203 | * inode to the orphan dir first, so we can reclaim the space if the | ||
2204 | * system crashes during the write. | ||
2205 | */ | ||
2206 | ret = ocfs2_add_inode_to_orphan(osb, inode); | ||
2207 | if (ret < 0) { | ||
2208 | mlog_errno(ret); | ||
2209 | goto out; | ||
2210 | } | ||
2211 | dwc->dw_orphaned = 1; | ||
2212 | } | ||
2213 | |||
2214 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | ||
2215 | if (ret) { | ||
2216 | mlog_errno(ret); | ||
2217 | goto out; | ||
2218 | } | ||
2219 | |||
2220 | down_write(&oi->ip_alloc_sem); | ||
2221 | |||
2222 | if (first_get_block) { | ||
2223 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | ||
2224 | ret = ocfs2_zero_tail(inode, di_bh, pos); | ||
2225 | else | ||
2226 | ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, | ||
2227 | total_len, NULL); | ||
2228 | if (ret < 0) { | ||
2229 | mlog_errno(ret); | ||
2230 | goto unlock; | ||
2231 | } | ||
2232 | } | ||
2233 | |||
2234 | ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len, | ||
2235 | OCFS2_WRITE_DIRECT, NULL, | ||
2236 | (void **)&wc, di_bh, NULL); | ||
2237 | if (ret) { | ||
2238 | mlog_errno(ret); | ||
2239 | goto unlock; | ||
2240 | } | ||
2241 | |||
2242 | desc = &wc->w_desc[0]; | ||
2243 | |||
2244 | p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys); | ||
2245 | BUG_ON(p_blkno == 0); | ||
2246 | p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1); | ||
2247 | |||
2248 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
2249 | bh_result->b_size = len; | ||
2250 | if (desc->c_needs_zero) | ||
2251 | set_buffer_new(bh_result); | ||
2252 | |||
2253 | /* end_io may sleep, which must not happen in irq context, so defer | ||
2254 | * completion to the dio work queue. */ | ||
2255 | set_buffer_defer_completion(bh_result); | ||
2256 | |||
2257 | if (!list_empty(&wc->w_unwritten_list)) { | ||
2258 | struct ocfs2_unwritten_extent *ue = NULL; | ||
2259 | |||
2260 | ue = list_first_entry(&wc->w_unwritten_list, | ||
2261 | struct ocfs2_unwritten_extent, | ||
2262 | ue_node); | ||
2263 | BUG_ON(ue->ue_cpos != desc->c_cpos); | ||
2264 | /* The physical address may be 0; fill it in. */ | ||
2265 | ue->ue_phys = desc->c_phys; | ||
2266 | |||
2267 | list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); | ||
2268 | dwc->dw_zero_count++; | ||
2269 | } | ||
2270 | |||
2271 | ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); | ||
2272 | BUG_ON(ret != len); | ||
2273 | ret = 0; | ||
2274 | unlock: | ||
2275 | up_write(&oi->ip_alloc_sem); | ||
2276 | ocfs2_inode_unlock(inode, 1); | ||
2277 | brelse(di_bh); | ||
2278 | out: | ||
2279 | if (ret < 0) | ||
2280 | ret = -EIO; | ||
2281 | return ret; | ||
2282 | } | ||
2283 | |||
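The cluster arithmetic in ocfs2_dio_get_block() is easier to see with
numbers; a worked example assuming 4K blocks and 32K clusters (so bpc == 8):

	/* pos = 40960 (8K into cluster 1), total_len = 65536:
	 *   len = 32768 - (40960 & 32767) = 24576
	 * so one call maps at most to the end of the current cluster, and the
	 * dio core comes back for the rest.
	 * iblock = 40960 >> 12 = 10:
	 *   p_blkno = ocfs2_clusters_to_blocks(sb, desc->c_phys) + (10 & 7)
	 *           = cluster base + 2
	 */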
2284 | static void ocfs2_dio_end_io_write(struct inode *inode, | ||
2285 | struct ocfs2_dio_write_ctxt *dwc, | ||
2286 | loff_t offset, | ||
2287 | ssize_t bytes) | ||
2288 | { | ||
2289 | struct ocfs2_cached_dealloc_ctxt dealloc; | ||
2290 | struct ocfs2_extent_tree et; | ||
2291 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
2292 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
2293 | struct ocfs2_unwritten_extent *ue = NULL; | ||
2294 | struct buffer_head *di_bh = NULL; | ||
2295 | struct ocfs2_dinode *di; | ||
2296 | struct ocfs2_alloc_context *data_ac = NULL; | ||
2297 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
2298 | handle_t *handle = NULL; | ||
2299 | loff_t end = offset + bytes; | ||
2300 | int ret = 0, credits = 0, locked = 0; | ||
2301 | |||
2302 | ocfs2_init_dealloc_ctxt(&dealloc); | ||
2303 | |||
2304 | /* We clear unwritten, delete the orphan entry and change i_size here. | ||
2305 | * If none of these is needed, we can skip all this. */ | ||
2306 | if (list_empty(&dwc->dw_zero_list) && | ||
2307 | end <= i_size_read(inode) && | ||
2308 | !dwc->dw_orphaned) | ||
2309 | goto out; | ||
2310 | |||
2311 | /* ocfs2_file_write_iter already holds i_mutex, so we need not lock | ||
2312 | * again if we are running in that context. */ | ||
2313 | if (dwc->dw_writer_pid != task_pid_nr(current)) { | ||
2314 | mutex_lock(&inode->i_mutex); | ||
2315 | locked = 1; | ||
2316 | } | ||
2317 | |||
2318 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | ||
2319 | if (ret < 0) { | ||
2320 | mlog_errno(ret); | ||
2321 | goto out; | ||
2322 | } | ||
2323 | |||
2324 | down_write(&oi->ip_alloc_sem); | ||
2325 | |||
2326 | /* Delete the orphan entry before acquiring i_mutex. */ | ||
2327 | if (dwc->dw_orphaned) { | ||
2328 | BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); | ||
2329 | |||
2330 | end = end > i_size_read(inode) ? end : 0; | ||
2331 | |||
2332 | ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, | ||
2333 | !!end, end); | ||
2334 | if (ret < 0) | ||
2335 | mlog_errno(ret); | ||
2336 | } | ||
2337 | |||
2338 | di = (struct ocfs2_dinode *)di_bh; | ||
2339 | |||
2340 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); | ||
2341 | |||
2342 | ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, | ||
2343 | &data_ac, &meta_ac); | ||
2344 | if (ret) { | ||
2345 | mlog_errno(ret); | ||
2346 | goto unlock; | ||
2347 | } | ||
2348 | |||
2349 | credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list); | ||
2350 | |||
2351 | handle = ocfs2_start_trans(osb, credits); | ||
2352 | if (IS_ERR(handle)) { | ||
2353 | ret = PTR_ERR(handle); | ||
2354 | mlog_errno(ret); | ||
2355 | goto unlock; | ||
2356 | } | ||
2357 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | ||
2358 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2359 | if (ret) { | ||
2360 | mlog_errno(ret); | ||
2361 | goto commit; | ||
2362 | } | ||
2363 | |||
2364 | list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { | ||
2365 | ret = ocfs2_mark_extent_written(inode, &et, handle, | ||
2366 | ue->ue_cpos, 1, | ||
2367 | ue->ue_phys, | ||
2368 | meta_ac, &dealloc); | ||
2369 | if (ret < 0) { | ||
2370 | mlog_errno(ret); | ||
2371 | break; | ||
2372 | } | ||
2373 | } | ||
2374 | |||
2375 | if (end > i_size_read(inode)) { | ||
2376 | ret = ocfs2_set_inode_size(handle, inode, di_bh, end); | ||
2377 | if (ret < 0) | ||
2378 | mlog_errno(ret); | ||
2379 | } | ||
2380 | commit: | ||
2381 | ocfs2_commit_trans(osb, handle); | ||
2382 | unlock: | ||
2383 | up_write(&oi->ip_alloc_sem); | ||
2384 | ocfs2_inode_unlock(inode, 1); | ||
2385 | brelse(di_bh); | ||
2386 | out: | ||
2387 | if (data_ac) | ||
2388 | ocfs2_free_alloc_context(data_ac); | ||
2389 | if (meta_ac) | ||
2390 | ocfs2_free_alloc_context(meta_ac); | ||
2391 | ocfs2_run_deallocs(osb, &dealloc); | ||
2392 | if (locked) | ||
2393 | mutex_unlock(&inode->i_mutex); | ||
2394 | ocfs2_dio_free_write_ctx(inode, dwc); | ||
2395 | } | ||
2396 | |||
2397 | /* | ||
2398 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | ||
2399 | * particularly interested in the aio/dio case. We use the rw_lock DLM lock | ||
2400 | * to protect io on one node from truncation on another. | ||
2401 | */ | ||
2402 | static int ocfs2_dio_end_io(struct kiocb *iocb, | ||
2403 | loff_t offset, | ||
2404 | ssize_t bytes, | ||
2405 | void *private) | ||
2406 | { | ||
2407 | struct inode *inode = file_inode(iocb->ki_filp); | ||
2408 | int level; | ||
2409 | |||
2410 | if (bytes <= 0) | ||
2411 | return 0; | ||
2412 | |||
2413 | /* this io's submitter should not have unlocked this before we could */ | ||
2414 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | ||
2415 | |||
2416 | if (private) | ||
2417 | ocfs2_dio_end_io_write(inode, private, offset, bytes); | ||
2418 | |||
2419 | ocfs2_iocb_clear_rw_locked(iocb); | ||
2420 | |||
2421 | level = ocfs2_iocb_rw_locked_level(iocb); | ||
2422 | ocfs2_rw_unlock(inode, level); | ||
2423 | return 0; | ||
2424 | } | ||
2425 | |||
2426 | static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | ||
2427 | loff_t offset) | ||
2428 | { | ||
2429 | struct file *file = iocb->ki_filp; | ||
2430 | struct inode *inode = file_inode(file)->i_mapping->host; | ||
2431 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
2432 | loff_t end = offset + iter->count; | ||
2433 | get_block_t *get_block; | ||
2434 | |||
2435 | /* | ||
2436 | * Fall back to buffered I/O if we see an inline-data inode, | ||
2437 | * which has no extent list. | ||
2438 | */ | ||
2439 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | ||
2440 | return 0; | ||
2441 | |||
2442 | /* Fall back to buffered I/O if we do not support append dio. */ | ||
2443 | if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb)) | ||
2444 | return 0; | ||
2445 | |||
2446 | if (iov_iter_rw(iter) == READ) | ||
2447 | get_block = ocfs2_get_block; | ||
2448 | else | ||
2449 | get_block = ocfs2_dio_get_block; | ||
2450 | |||
2451 | return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, | ||
2452 | iter, offset, get_block, | ||
2453 | ocfs2_dio_end_io, NULL, 0); | ||
2454 | } | ||
2455 | |||
2480 | const struct address_space_operations ocfs2_aops = { | 2456 | const struct address_space_operations ocfs2_aops = { |
2481 | .readpage = ocfs2_readpage, | 2457 | .readpage = ocfs2_readpage, |
2482 | .readpages = ocfs2_readpages, | 2458 | .readpages = ocfs2_readpages, |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 24e496d6bdcd..b1c9f28a57b1 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
47 | loff_t pos, unsigned len, unsigned copied, | 47 | loff_t pos, unsigned len, unsigned copied, |
48 | struct page *page, void *fsdata); | 48 | struct page *page, void *fsdata); |
49 | 49 | ||
50 | int ocfs2_write_begin_nolock(struct file *filp, | 50 | typedef enum { |
51 | struct address_space *mapping, | 51 | OCFS2_WRITE_BUFFER = 0, |
52 | loff_t pos, unsigned len, unsigned flags, | 52 | OCFS2_WRITE_DIRECT, |
53 | OCFS2_WRITE_MMAP, | ||
54 | } ocfs2_write_type_t; | ||
55 | |||
56 | int ocfs2_write_begin_nolock(struct address_space *mapping, | ||
57 | loff_t pos, unsigned len, ocfs2_write_type_t type, | ||
53 | struct page **pagep, void **fsdata, | 58 | struct page **pagep, void **fsdata, |
54 | struct buffer_head *di_bh, struct page *mmap_page); | 59 | struct buffer_head *di_bh, struct page *mmap_page); |
55 | 60 | ||
@@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) | |||
79 | enum ocfs2_iocb_lock_bits { | 84 | enum ocfs2_iocb_lock_bits { |
80 | OCFS2_IOCB_RW_LOCK = 0, | 85 | OCFS2_IOCB_RW_LOCK = 0, |
81 | OCFS2_IOCB_RW_LOCK_LEVEL, | 86 | OCFS2_IOCB_RW_LOCK_LEVEL, |
82 | OCFS2_IOCB_UNALIGNED_IO, | ||
83 | OCFS2_IOCB_NUM_LOCKS | 87 | OCFS2_IOCB_NUM_LOCKS |
84 | }; | 88 | }; |
85 | 89 | ||
@@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits { | |||
88 | #define ocfs2_iocb_rw_locked_level(iocb) \ | 92 | #define ocfs2_iocb_rw_locked_level(iocb) \ |
89 | test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) | 93 | test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) |
90 | 94 | ||
91 | #define ocfs2_iocb_set_unaligned_aio(iocb) \ | ||
92 | set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
93 | #define ocfs2_iocb_clear_unaligned_aio(iocb) \ | ||
94 | clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
95 | #define ocfs2_iocb_is_unaligned_aio(iocb) \ | ||
96 | test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
97 | |||
98 | #endif /* OCFS2_FILE_H */ | 95 | #endif /* OCFS2_FILE_H */ |
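The three write types map one-to-one onto the ocfs2_write_begin_nolock()
call sites in this series, all visible in the hunks of this diff:

	/* buffered write, fs/ocfs2/aops.c */
	ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
				 pagep, fsdata, di_bh, NULL);
	/* direct io, fs/ocfs2/aops.c -- no pagep, wc returned via fsdata */
	ocfs2_write_begin_nolock(inode->i_mapping, pos, len, OCFS2_WRITE_DIRECT,
				 NULL, (void **)&wc, di_bh, NULL);
	/* page_mkwrite, fs/ocfs2/mmap.c -- page supplied by the fault handler */
	ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
				 &locked_page, &fsdata, di_bh, page);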
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ef6a2ec494de..bd15929b5f92 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -1444,8 +1444,8 @@ static void o2hb_region_release(struct config_item *item) | |||
1444 | debugfs_remove(reg->hr_debug_dir); | 1444 | debugfs_remove(reg->hr_debug_dir); |
1445 | kfree(reg->hr_db_livenodes); | 1445 | kfree(reg->hr_db_livenodes); |
1446 | kfree(reg->hr_db_regnum); | 1446 | kfree(reg->hr_db_regnum); |
1447 | kfree(reg->hr_debug_elapsed_time); | 1447 | kfree(reg->hr_db_elapsed_time); |
1448 | kfree(reg->hr_debug_pinned); | 1448 | kfree(reg->hr_db_pinned); |
1449 | 1449 | ||
1450 | spin_lock(&o2hb_live_lock); | 1450 | spin_lock(&o2hb_live_lock); |
1451 | list_del(®->hr_all_item); | 1451 | list_del(®->hr_all_item); |
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index e36d63ff1783..cdeafb4e7ed6 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c | |||
@@ -212,6 +212,12 @@ grant: | |||
212 | if (lock->lksb->flags & DLM_LKSB_PUT_LVB) | 212 | if (lock->lksb->flags & DLM_LKSB_PUT_LVB) |
213 | memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); | 213 | memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); |
214 | 214 | ||
215 | /* | ||
216 | * Move the lock to the tail because it may be the only lock which has | ||
217 | * an invalid lvb. | ||
218 | */ | ||
219 | list_move_tail(&lock->list, &res->granted); | ||
220 | |||
215 | status = DLM_NORMAL; | 221 | status = DLM_NORMAL; |
216 | *call_ast = 1; | 222 | *call_ast = 1; |
217 | goto unlock_exit; | 223 | goto unlock_exit; |
@@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, | |||
262 | struct dlm_lock *lock, int flags, int type) | 268 | struct dlm_lock *lock, int flags, int type) |
263 | { | 269 | { |
264 | enum dlm_status status; | 270 | enum dlm_status status; |
271 | u8 old_owner = res->owner; | ||
265 | 272 | ||
266 | mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, | 273 | mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, |
267 | lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); | 274 | lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); |
@@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, | |||
287 | status = DLM_DENIED; | 294 | status = DLM_DENIED; |
288 | goto bail; | 295 | goto bail; |
289 | } | 296 | } |
297 | |||
298 | if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) { | ||
299 | mlog(0, "last convert request returned DLM_RECOVERING, but " | ||
300 | "owner has already queued and sent ast to me. res %.*s, " | ||
301 | "(cookie=%u:%llu, type=%d, conv=%d)\n", | ||
302 | res->lockname.len, res->lockname.name, | ||
303 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
304 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
305 | lock->ml.type, lock->ml.convert_type); | ||
306 | status = DLM_NORMAL; | ||
307 | goto bail; | ||
308 | } | ||
309 | |||
290 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | 310 | res->state |= DLM_LOCK_RES_IN_PROGRESS; |
291 | /* move lock to local convert queue */ | 311 | /* move lock to local convert queue */ |
292 | /* do not alter lock refcount. switching lists. */ | 312 | /* do not alter lock refcount. switching lists. */ |
@@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, | |||
316 | spin_lock(&res->spinlock); | 336 | spin_lock(&res->spinlock); |
317 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | 337 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; |
318 | lock->convert_pending = 0; | 338 | lock->convert_pending = 0; |
319 | /* if it failed, move it back to granted queue */ | 339 | /* if it failed, move it back to granted queue. |
340 | * if the master returned DLM_NORMAL and then went down before sending | ||
341 | * the ast, the lock may already have been moved to the granted queue; | ||
342 | * reset to DLM_RECOVERING and retry the convert */ | ||
320 | if (status != DLM_NORMAL) { | 343 | if (status != DLM_NORMAL) { |
321 | if (status != DLM_NOTQUEUED) | 344 | if (status != DLM_NOTQUEUED) |
322 | dlm_error(status); | 345 | dlm_error(status); |
323 | dlm_revert_pending_convert(res, lock); | 346 | dlm_revert_pending_convert(res, lock); |
347 | } else if ((res->state & DLM_LOCK_RES_RECOVERING) || | ||
348 | (old_owner != res->owner)) { | ||
349 | mlog(0, "res %.*s is in recovering or has been recovered.\n", | ||
350 | res->lockname.len, res->lockname.name); | ||
351 | status = DLM_RECOVERING; | ||
324 | } | 352 | } |
325 | bail: | 353 | bail: |
326 | spin_unlock(&res->spinlock); | 354 | spin_unlock(&res->spinlock); |
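The race the two new branches guard against reads most easily as a timeline,
reconstructed from the comments and mlog text above:

	/*
	 * requester                         master
	 * ---------                         ------
	 * send convert request  --------->  grants, queues ast
	 *                                   dies before replying
	 * sees DLM_RECOVERING               (recovery picks a new master)
	 * retries the convert:
	 *   ml.type == type && ml.convert_type == LKM_IVMODE
	 *     -> the grant already happened; return DLM_NORMAL
	 *   res recovering || owner changed
	 *     -> return DLM_RECOVERING so the caller retries cleanly
	 */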
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index cd38488a10fc..f6b313898763 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -2083,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | |||
2083 | dlm_lock_get(lock); | 2083 | dlm_lock_get(lock); |
2084 | if (lock->convert_pending) { | 2084 | if (lock->convert_pending) { |
2085 | /* move converting lock back to granted */ | 2085 | /* move converting lock back to granted */ |
2086 | BUG_ON(i != DLM_CONVERTING_LIST); | ||
2087 | mlog(0, "node died with convert pending " | 2086 | mlog(0, "node died with convert pending " |
2088 | "on %.*s. move back to granted list.\n", | 2087 | "on %.*s. move back to granted list.\n", |
2089 | res->lockname.len, res->lockname.name); | 2088 | res->lockname.len, res->lockname.name); |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7cb38fdca229..c18ab45f8d21 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1381,44 +1381,6 @@ out: | |||
1381 | return ret; | 1381 | return ret; |
1382 | } | 1382 | } |
1383 | 1383 | ||
1384 | /* | ||
1385 | * Will look for holes and unwritten extents in the range starting at | ||
1386 | * pos for count bytes (inclusive). | ||
1387 | */ | ||
1388 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | ||
1389 | size_t count) | ||
1390 | { | ||
1391 | int ret = 0; | ||
1392 | unsigned int extent_flags; | ||
1393 | u32 cpos, clusters, extent_len, phys_cpos; | ||
1394 | struct super_block *sb = inode->i_sb; | ||
1395 | |||
1396 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | ||
1397 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | ||
1398 | |||
1399 | while (clusters) { | ||
1400 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | ||
1401 | &extent_flags); | ||
1402 | if (ret < 0) { | ||
1403 | mlog_errno(ret); | ||
1404 | goto out; | ||
1405 | } | ||
1406 | |||
1407 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | ||
1408 | ret = 1; | ||
1409 | break; | ||
1410 | } | ||
1411 | |||
1412 | if (extent_len > clusters) | ||
1413 | extent_len = clusters; | ||
1414 | |||
1415 | clusters -= extent_len; | ||
1416 | cpos += extent_len; | ||
1417 | } | ||
1418 | out: | ||
1419 | return ret; | ||
1420 | } | ||
1421 | |||
1422 | static int ocfs2_write_remove_suid(struct inode *inode) | 1384 | static int ocfs2_write_remove_suid(struct inode *inode) |
1423 | { | 1385 | { |
1424 | int ret; | 1386 | int ret; |
@@ -2129,18 +2091,12 @@ out: | |||
2129 | 2091 | ||
2130 | static int ocfs2_prepare_inode_for_write(struct file *file, | 2092 | static int ocfs2_prepare_inode_for_write(struct file *file, |
2131 | loff_t pos, | 2093 | loff_t pos, |
2132 | size_t count, | 2094 | size_t count) |
2133 | int appending, | ||
2134 | int *direct_io, | ||
2135 | int *has_refcount) | ||
2136 | { | 2095 | { |
2137 | int ret = 0, meta_level = 0; | 2096 | int ret = 0, meta_level = 0; |
2138 | struct dentry *dentry = file->f_path.dentry; | 2097 | struct dentry *dentry = file->f_path.dentry; |
2139 | struct inode *inode = d_inode(dentry); | 2098 | struct inode *inode = d_inode(dentry); |
2140 | loff_t end; | 2099 | loff_t end; |
2141 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
2142 | int full_coherency = !(osb->s_mount_opt & | ||
2143 | OCFS2_MOUNT_COHERENCY_BUFFERED); | ||
2144 | 2100 | ||
2145 | /* | 2101 | /* |
2146 | * We start with a read level meta lock and only jump to an ex | 2102 | * We start with a read level meta lock and only jump to an ex |
@@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
2189 | pos, | 2145 | pos, |
2190 | count, | 2146 | count, |
2191 | &meta_level); | 2147 | &meta_level); |
2192 | if (has_refcount) | ||
2193 | *has_refcount = 1; | ||
2194 | if (direct_io) | ||
2195 | *direct_io = 0; | ||
2196 | } | 2148 | } |
2197 | 2149 | ||
2198 | if (ret < 0) { | 2150 | if (ret < 0) { |
@@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
2200 | goto out_unlock; | 2152 | goto out_unlock; |
2201 | } | 2153 | } |
2202 | 2154 | ||
2203 | /* | ||
2204 | * Skip the O_DIRECT checks if we don't need | ||
2205 | * them. | ||
2206 | */ | ||
2207 | if (!direct_io || !(*direct_io)) | ||
2208 | break; | ||
2209 | |||
2210 | /* | ||
2211 | * There's no sane way to do direct writes to an inode | ||
2212 | * with inline data. | ||
2213 | */ | ||
2214 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | ||
2215 | *direct_io = 0; | ||
2216 | break; | ||
2217 | } | ||
2218 | |||
2219 | /* | ||
2220 | * Allowing concurrent direct writes means | ||
2221 | * i_size changes wouldn't be synchronized, so | ||
2222 | * one node could wind up truncating another | ||
2223 | * nodes writes. | ||
2224 | */ | ||
2225 | if (end > i_size_read(inode) && !full_coherency) { | ||
2226 | *direct_io = 0; | ||
2227 | break; | ||
2228 | } | ||
2229 | |||
2230 | /* | ||
2231 | * Fallback to old way if the feature bit is not set. | ||
2232 | */ | ||
2233 | if (end > i_size_read(inode) && | ||
2234 | !ocfs2_supports_append_dio(osb)) { | ||
2235 | *direct_io = 0; | ||
2236 | break; | ||
2237 | } | ||
2238 | |||
2239 | /* | ||
2240 | * We don't fill holes during direct io, so | ||
2241 | * check for them here. If any are found, the | ||
2242 | * caller will have to retake some cluster | ||
2243 | * locks and initiate the io as buffered. | ||
2244 | */ | ||
2245 | ret = ocfs2_check_range_for_holes(inode, pos, count); | ||
2246 | if (ret == 1) { | ||
2247 | /* | ||
2248 | * Fallback to old way if the feature bit is not set. | ||
2249 | * Otherwise try dio first and then complete the rest | ||
2250 | * request through buffer io. | ||
2251 | */ | ||
2252 | if (!ocfs2_supports_append_dio(osb)) | ||
2253 | *direct_io = 0; | ||
2254 | ret = 0; | ||
2255 | } else if (ret < 0) | ||
2256 | mlog_errno(ret); | ||
2257 | break; | 2155 | break; |
2258 | } | 2156 | } |
2259 | 2157 | ||
2260 | out_unlock: | 2158 | out_unlock: |
2261 | trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, | 2159 | trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, |
2262 | pos, appending, count, | 2160 | pos, count); |
2263 | direct_io, has_refcount); | ||
2264 | 2161 | ||
2265 | if (meta_level >= 0) | 2162 | if (meta_level >= 0) |
2266 | ocfs2_inode_unlock(inode, meta_level); | 2163 | ocfs2_inode_unlock(inode, meta_level); |
@@ -2272,18 +2169,16 @@ out: | |||
2272 | static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, | 2169 | static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, |
2273 | struct iov_iter *from) | 2170 | struct iov_iter *from) |
2274 | { | 2171 | { |
2275 | int direct_io, appending, rw_level; | 2172 | int direct_io, rw_level; |
2276 | int can_do_direct, has_refcount = 0; | ||
2277 | ssize_t written = 0; | 2173 | ssize_t written = 0; |
2278 | ssize_t ret; | 2174 | ssize_t ret; |
2279 | size_t count = iov_iter_count(from), orig_count; | 2175 | size_t count = iov_iter_count(from); |
2280 | struct file *file = iocb->ki_filp; | 2176 | struct file *file = iocb->ki_filp; |
2281 | struct inode *inode = file_inode(file); | 2177 | struct inode *inode = file_inode(file); |
2282 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2178 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
2283 | int full_coherency = !(osb->s_mount_opt & | 2179 | int full_coherency = !(osb->s_mount_opt & |
2284 | OCFS2_MOUNT_COHERENCY_BUFFERED); | 2180 | OCFS2_MOUNT_COHERENCY_BUFFERED); |
2285 | int unaligned_dio = 0; | 2181 | void *saved_ki_complete = NULL; |
2286 | int dropped_dio = 0; | ||
2287 | int append_write = ((iocb->ki_pos + count) >= | 2182 | int append_write = ((iocb->ki_pos + count) >= |
2288 | i_size_read(inode) ? 1 : 0); | 2183 | i_size_read(inode) ? 1 : 0); |
2289 | 2184 | ||
@@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, | |||
2296 | if (count == 0) | 2191 | if (count == 0) |
2297 | return 0; | 2192 | return 0; |
2298 | 2193 | ||
2299 | appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; | ||
2300 | direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; | 2194 | direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; |
2301 | 2195 | ||
2302 | inode_lock(inode); | 2196 | inode_lock(inode); |
2303 | 2197 | ||
2304 | relock: | ||
2305 | /* | 2198 | /* |
2306 | * Concurrent O_DIRECT writes are allowed with | 2199 | * Concurrent O_DIRECT writes are allowed with |
2307 | * mount_option "coherency=buffered". | 2200 | * mount_option "coherency=buffered". |
@@ -2334,7 +2227,6 @@ relock: | |||
2334 | ocfs2_inode_unlock(inode, 1); | 2227 | ocfs2_inode_unlock(inode, 1); |
2335 | } | 2228 | } |
2336 | 2229 | ||
2337 | orig_count = iov_iter_count(from); | ||
2338 | ret = generic_write_checks(iocb, from); | 2230 | ret = generic_write_checks(iocb, from); |
2339 | if (ret <= 0) { | 2231 | if (ret <= 0) { |
2340 | if (ret) | 2232 | if (ret) |
@@ -2343,41 +2235,18 @@ relock: | |||
2343 | } | 2235 | } |
2344 | count = ret; | 2236 | count = ret; |
2345 | 2237 | ||
2346 | can_do_direct = direct_io; | 2238 | ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); |
2347 | ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending, | ||
2348 | &can_do_direct, &has_refcount); | ||
2349 | if (ret < 0) { | 2239 | if (ret < 0) { |
2350 | mlog_errno(ret); | 2240 | mlog_errno(ret); |
2351 | goto out; | 2241 | goto out; |
2352 | } | 2242 | } |
2353 | 2243 | ||
2354 | if (direct_io && !is_sync_kiocb(iocb)) | 2244 | if (direct_io && !is_sync_kiocb(iocb) && |
2355 | unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); | 2245 | ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) { |
2356 | |||
2357 | /* | ||
2358 | * We can't complete the direct I/O as requested, fall back to | ||
2359 | * buffered I/O. | ||
2360 | */ | ||
2361 | if (direct_io && !can_do_direct) { | ||
2362 | ocfs2_rw_unlock(inode, rw_level); | ||
2363 | |||
2364 | rw_level = -1; | ||
2365 | |||
2366 | direct_io = 0; | ||
2367 | iocb->ki_flags &= ~IOCB_DIRECT; | ||
2368 | iov_iter_reexpand(from, orig_count); | ||
2369 | dropped_dio = 1; | ||
2370 | goto relock; | ||
2371 | } | ||
2372 | |||
2373 | if (unaligned_dio) { | ||
2374 | /* | 2246 | /* |
2375 | * Wait on previous unaligned aio to complete before | 2247 | * Make it a sync io if it's an unaligned aio. |
2376 | * proceeding. | ||
2377 | */ | 2248 | */ |
2378 | mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); | 2249 | saved_ki_complete = xchg(&iocb->ki_complete, NULL); |
2379 | /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ | ||
2380 | ocfs2_iocb_set_unaligned_aio(iocb); | ||
2381 | } | 2250 | } |
2382 | 2251 | ||
2383 | /* communicate with ocfs2_dio_end_io */ | 2252 | /* communicate with ocfs2_dio_end_io */ |
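Parking ki_complete works because of a dio-core convention; a sketch of the
assumption (this helper lives in include/linux/fs.h of this era and is shown
for context, not added by the patch):

	static inline bool is_sync_kiocb(struct kiocb *kiocb)
	{
		return kiocb->ki_complete == NULL;
	}

With ki_complete swapped out, __blockdev_direct_IO treats the unaligned aio
as synchronous and waits inline, replacing the old ip_unaligned_aio mutex.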
@@ -2398,14 +2267,13 @@ relock: | |||
2398 | */ | 2267 | */ |
2399 | if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { | 2268 | if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { |
2400 | rw_level = -1; | 2269 | rw_level = -1; |
2401 | unaligned_dio = 0; | ||
2402 | } | 2270 | } |
2403 | 2271 | ||
2404 | if (unlikely(written <= 0)) | 2272 | if (unlikely(written <= 0)) |
2405 | goto no_sync; | 2273 | goto out; |
2406 | 2274 | ||
2407 | if (((file->f_flags & O_DSYNC) && !direct_io) || | 2275 | if (((file->f_flags & O_DSYNC) && !direct_io) || |
2408 | IS_SYNC(inode) || dropped_dio) { | 2276 | IS_SYNC(inode)) { |
2409 | ret = filemap_fdatawrite_range(file->f_mapping, | 2277 | ret = filemap_fdatawrite_range(file->f_mapping, |
2410 | iocb->ki_pos - written, | 2278 | iocb->ki_pos - written, |
2411 | iocb->ki_pos - 1); | 2279 | iocb->ki_pos - 1); |
@@ -2424,13 +2292,10 @@ relock: | |||
2424 | iocb->ki_pos - 1); | 2292 | iocb->ki_pos - 1); |
2425 | } | 2293 | } |
2426 | 2294 | ||
2427 | no_sync: | ||
2428 | if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { | ||
2429 | ocfs2_iocb_clear_unaligned_aio(iocb); | ||
2430 | mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); | ||
2431 | } | ||
2432 | |||
2433 | out: | 2295 | out: |
2296 | if (saved_ki_complete) | ||
2297 | xchg(&iocb->ki_complete, saved_ki_complete); | ||
2298 | |||
2434 | if (rw_level != -1) | 2299 | if (rw_level != -1) |
2435 | ocfs2_rw_unlock(inode, rw_level); | 2300 | ocfs2_rw_unlock(inode, rw_level); |
2436 | 2301 | ||
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index ba495beff1c2..12f4a9e9800f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -1170,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode) | |||
1170 | mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), | 1170 | mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), |
1171 | "Clear inode of %llu, inode has io markers\n", | 1171 | "Clear inode of %llu, inode has io markers\n", |
1172 | (unsigned long long)oi->ip_blkno); | 1172 | (unsigned long long)oi->ip_blkno); |
1173 | mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), | ||
1174 | "Clear inode of %llu, inode has unwritten extents\n", | ||
1175 | (unsigned long long)oi->ip_blkno); | ||
1173 | 1176 | ||
1174 | ocfs2_extent_map_trunc(inode, 0); | 1177 | ocfs2_extent_map_trunc(inode, 0); |
1175 | 1178 | ||
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 01635e016b3e..d8f3fc8d2551 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -43,9 +43,6 @@ struct ocfs2_inode_info | |||
43 | /* protects extended attribute changes on this inode */ | 43 | /* protects extended attribute changes on this inode */ |
44 | struct rw_semaphore ip_xattr_sem; | 44 | struct rw_semaphore ip_xattr_sem; |
45 | 45 | ||
46 | /* Number of outstanding AIO's which are not page aligned */ | ||
47 | struct mutex ip_unaligned_aio; | ||
48 | |||
49 | /* These fields are protected by ip_lock */ | 46 | /* These fields are protected by ip_lock */ |
50 | spinlock_t ip_lock; | 47 | spinlock_t ip_lock; |
51 | u32 ip_open_count; | 48 | u32 ip_open_count; |
@@ -57,6 +54,9 @@ struct ocfs2_inode_info | |||
57 | u32 ip_flags; /* see below */ | 54 | u32 ip_flags; /* see below */ |
58 | u32 ip_attr; /* inode attributes */ | 55 | u32 ip_attr; /* inode attributes */ |
59 | 56 | ||
57 | /* Record unwritten extents during direct io. */ | ||
58 | struct list_head ip_unwritten_list; | ||
59 | |||
60 | /* protected by recovery_lock. */ | 60 | /* protected by recovery_lock. */ |
61 | struct inode *ip_next_orphan; | 61 | struct inode *ip_next_orphan; |
62 | 62 | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 61b833b721d8..e607419cdfa4 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb) | |||
231 | /* At this point, we know that no more recovery threads can be | 231 | /* At this point, we know that no more recovery threads can be |
232 | * launched, so wait for any recovery completion work to | 232 | * launched, so wait for any recovery completion work to |
233 | * complete. */ | 233 | * complete. */ |
234 | flush_workqueue(ocfs2_wq); | 234 | flush_workqueue(osb->ocfs2_wq); |
235 | 235 | ||
236 | /* | 236 | /* |
237 | * Now that recovery is shut down, and the osb is about to be | 237 | * Now that recovery is shut down, and the osb is about to be |
@@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, | |||
1326 | 1326 | ||
1327 | spin_lock(&journal->j_lock); | 1327 | spin_lock(&journal->j_lock); |
1328 | list_add_tail(&item->lri_list, &journal->j_la_cleanups); | 1328 | list_add_tail(&item->lri_list, &journal->j_la_cleanups); |
1329 | queue_work(ocfs2_wq, &journal->j_recovery_work); | 1329 | queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work); |
1330 | spin_unlock(&journal->j_lock); | 1330 | spin_unlock(&journal->j_lock); |
1331 | } | 1331 | } |
1332 | 1332 | ||
@@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work) | |||
1968 | mutex_lock(&os->os_lock); | 1968 | mutex_lock(&os->os_lock); |
1969 | ocfs2_queue_orphan_scan(osb); | 1969 | ocfs2_queue_orphan_scan(osb); |
1970 | if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) | 1970 | if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) |
1971 | queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, | 1971 | queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, |
1972 | ocfs2_orphan_scan_timeout()); | 1972 | ocfs2_orphan_scan_timeout()); |
1973 | mutex_unlock(&os->os_lock); | 1973 | mutex_unlock(&os->os_lock); |
1974 | } | 1974 | } |
@@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) | |||
2008 | atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); | 2008 | atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); |
2009 | else { | 2009 | else { |
2010 | atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); | 2010 | atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); |
2011 | queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, | 2011 | queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, |
2012 | ocfs2_orphan_scan_timeout()); | 2012 | ocfs2_orphan_scan_timeout()); |
2013 | } | 2013 | } |
2014 | } | 2014 | } |
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7d62c43a2c3e..fe0d1f9571bb 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c | |||
@@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) | |||
386 | struct ocfs2_dinode *alloc = NULL; | 386 | struct ocfs2_dinode *alloc = NULL; |
387 | 387 | ||
388 | cancel_delayed_work(&osb->la_enable_wq); | 388 | cancel_delayed_work(&osb->la_enable_wq); |
389 | flush_workqueue(ocfs2_wq); | 389 | flush_workqueue(osb->ocfs2_wq); |
390 | 390 | ||
391 | if (osb->local_alloc_state == OCFS2_LA_UNUSED) | 391 | if (osb->local_alloc_state == OCFS2_LA_UNUSED) |
392 | goto out; | 392 | goto out; |
@@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb, | |||
1085 | } else { | 1085 | } else { |
1086 | osb->local_alloc_state = OCFS2_LA_DISABLED; | 1086 | osb->local_alloc_state = OCFS2_LA_DISABLED; |
1087 | } | 1087 | } |
1088 | queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, | 1088 | queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq, |
1089 | OCFS2_LA_ENABLE_INTERVAL); | 1089 | OCFS2_LA_ENABLE_INTERVAL); |
1090 | goto out_unlock; | 1090 | goto out_unlock; |
1091 | } | 1091 | } |
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 77ebc2bc1cca..9ea081f4e6e4 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, | |||
104 | if (page->index == last_index) | 104 | if (page->index == last_index) |
105 | len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; | 105 | len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; |
106 | 106 | ||
107 | ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, | 107 | ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, |
108 | &fsdata, di_bh, page); | 108 | &locked_page, &fsdata, di_bh, page); |
109 | if (ret) { | 109 | if (ret) { |
110 | if (ret != -ENOSPC) | 110 | if (ret != -ENOSPC) |
111 | mlog_errno(ret); | 111 | mlog_errno(ret); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7a0126267847..6cf6538a0651 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -464,6 +464,14 @@ struct ocfs2_super | |||
464 | struct ocfs2_refcount_tree *osb_ref_tree_lru; | 464 | struct ocfs2_refcount_tree *osb_ref_tree_lru; |
465 | 465 | ||
466 | struct mutex system_file_mutex; | 466 | struct mutex system_file_mutex; |
467 | |||
468 | /* | ||
469 | * OCFS2 needs to schedule several different types of work which | ||
470 | * require cluster locking, disk I/O, recovery waits, etc. Since these | ||
471 | * types of work tend to be heavy we avoid using the kernel events | ||
472 | * workqueue and schedule on our own. | ||
473 | */ | ||
474 | struct workqueue_struct *ocfs2_wq; | ||
467 | }; | 475 | }; |
468 | 476 | ||
469 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) | 477 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) |
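The hunk above moves the workqueue comment from super.c into the per-superblock structure: instead of one module-global ocfs2_wq, every mount now owns its queue, so draining one filesystem's work can no longer deadlock against another's. A minimal sketch of that per-instance lifecycle, using hypothetical foo_* names in place of the real mount/unmount paths:

#include <linux/errno.h>
#include <linux/workqueue.h>

struct foo_sb {
	struct workqueue_struct *wq;	/* per-mount, like osb->ocfs2_wq */
};

static int foo_fill_super(struct foo_sb *sb)
{
	/* Created once per mount, as ocfs2_initialize_super() now does. */
	sb->wq = create_singlethread_workqueue("foo_wq");
	if (!sb->wq)
		return -ENOMEM;
	return 0;
}

static void foo_kill_sb(struct foo_sb *sb)
{
	/* Drain and destroy only this mount's work (cf. ocfs2_delete_osb). */
	if (sb->wq) {
		flush_workqueue(sb->wq);
		destroy_workqueue(sb->wq);
	}
}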
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 24b7e7f591dc..f8f5fc5e6c05 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h | |||
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); | |||
1450 | 1450 | ||
1451 | TRACE_EVENT(ocfs2_prepare_inode_for_write, | 1451 | TRACE_EVENT(ocfs2_prepare_inode_for_write, |
1452 | TP_PROTO(unsigned long long ino, unsigned long long saved_pos, | 1452 | TP_PROTO(unsigned long long ino, unsigned long long saved_pos, |
1453 | int appending, unsigned long count, | 1453 | unsigned long count), |
1454 | int *direct_io, int *has_refcount), | 1454 | TP_ARGS(ino, saved_pos, count), |
1455 | TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), | ||
1456 | TP_STRUCT__entry( | 1455 | TP_STRUCT__entry( |
1457 | __field(unsigned long long, ino) | 1456 | __field(unsigned long long, ino) |
1458 | __field(unsigned long long, saved_pos) | 1457 | __field(unsigned long long, saved_pos) |
1459 | __field(int, appending) | ||
1460 | __field(unsigned long, count) | 1458 | __field(unsigned long, count) |
1461 | __field(int, direct_io) | ||
1462 | __field(int, has_refcount) | ||
1463 | ), | 1459 | ), |
1464 | TP_fast_assign( | 1460 | TP_fast_assign( |
1465 | __entry->ino = ino; | 1461 | __entry->ino = ino; |
1466 | __entry->saved_pos = saved_pos; | 1462 | __entry->saved_pos = saved_pos; |
1467 | __entry->appending = appending; | ||
1468 | __entry->count = count; | 1463 | __entry->count = count; |
1469 | __entry->direct_io = direct_io ? *direct_io : -1; | ||
1470 | __entry->has_refcount = has_refcount ? *has_refcount : -1; | ||
1471 | ), | 1464 | ), |
1472 | TP_printk("%llu %llu %d %lu %d %d", __entry->ino, | 1465 | TP_printk("%llu %llu %lu", __entry->ino, |
1473 | __entry->saved_pos, __entry->appending, __entry->count, | 1466 | __entry->saved_pos, __entry->count) |
1474 | __entry->direct_io, __entry->has_refcount) | ||
1475 | ); | 1467 | ); |
1476 | 1468 | ||
1477 | DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); | 1469 | DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); |
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 91bc674203ed..3892f3c079ca 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c | |||
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot) | |||
726 | dqgrab(dquot); | 726 | dqgrab(dquot); |
727 | /* First entry on list -> queue work */ | 727 | /* First entry on list -> queue work */ |
728 | if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) | 728 | if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) |
729 | queue_work(ocfs2_wq, &osb->dquot_drop_work); | 729 | queue_work(osb->ocfs2_wq, &osb->dquot_drop_work); |
730 | goto out; | 730 | goto out; |
731 | } | 731 | } |
732 | status = ocfs2_lock_global_qf(oinfo, 1); | 732 | status = ocfs2_lock_global_qf(oinfo, 1); |
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 576b9a04873f..18451e0fab81 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c | |||
@@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) | |||
196 | for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { | 196 | for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { |
197 | blkno = ocfs2_backup_super_blkno(inode->i_sb, i); | 197 | blkno = ocfs2_backup_super_blkno(inode->i_sb, i); |
198 | cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); | 198 | cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); |
199 | if (cluster > clusters) | 199 | if (cluster >= clusters) |
200 | break; | 200 | break; |
201 | 201 | ||
202 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); | 202 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); |
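The one-character fix above is an off-by-one at the volume boundary: valid cluster indices run from 0 to clusters - 1, so a backup superblock whose cluster index equals the new cluster count already lies past the end of the resized volume and must stop the loop too. A standalone sketch of the boundary test, with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t clusters = 65536;		/* clusters in the resized volume */
	uint32_t backup_cluster = 65536;	/* backup super right at the edge */

	/* The old test (>) updated this backup and wrote past the volume;
	 * >= also rejects the boundary case. */
	if (backup_cluster >= clusters)
		printf("skip backup at cluster %u of %u\n",
		       backup_cluster, clusters);
	return 0;
}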
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ccc9386c42c5..7db631e1c8b0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -80,12 +80,6 @@ static struct kmem_cache *ocfs2_inode_cachep; | |||
80 | struct kmem_cache *ocfs2_dquot_cachep; | 80 | struct kmem_cache *ocfs2_dquot_cachep; |
81 | struct kmem_cache *ocfs2_qf_chunk_cachep; | 81 | struct kmem_cache *ocfs2_qf_chunk_cachep; |
82 | 82 | ||
83 | /* OCFS2 needs to schedule several different types of work which | ||
84 | * require cluster locking, disk I/O, recovery waits, etc. Since these | ||
85 | * types of work tend to be heavy we avoid using the kernel events | ||
86 | * workqueue and schedule on our own. */ | ||
87 | struct workqueue_struct *ocfs2_wq = NULL; | ||
88 | |||
89 | static struct dentry *ocfs2_debugfs_root; | 83 | static struct dentry *ocfs2_debugfs_root; |
90 | 84 | ||
91 | MODULE_AUTHOR("Oracle"); | 85 | MODULE_AUTHOR("Oracle"); |
@@ -1613,33 +1607,25 @@ static int __init ocfs2_init(void) | |||
1613 | if (status < 0) | 1607 | if (status < 0) |
1614 | goto out2; | 1608 | goto out2; |
1615 | 1609 | ||
1616 | ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); | ||
1617 | if (!ocfs2_wq) { | ||
1618 | status = -ENOMEM; | ||
1619 | goto out3; | ||
1620 | } | ||
1621 | |||
1622 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); | 1610 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); |
1623 | if (!ocfs2_debugfs_root) { | 1611 | if (!ocfs2_debugfs_root) { |
1624 | status = -ENOMEM; | 1612 | status = -ENOMEM; |
1625 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | 1613 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); |
1626 | goto out4; | 1614 | goto out3; |
1627 | } | 1615 | } |
1628 | 1616 | ||
1629 | ocfs2_set_locking_protocol(); | 1617 | ocfs2_set_locking_protocol(); |
1630 | 1618 | ||
1631 | status = register_quota_format(&ocfs2_quota_format); | 1619 | status = register_quota_format(&ocfs2_quota_format); |
1632 | if (status < 0) | 1620 | if (status < 0) |
1633 | goto out4; | 1621 | goto out3; |
1634 | status = register_filesystem(&ocfs2_fs_type); | 1622 | status = register_filesystem(&ocfs2_fs_type); |
1635 | if (!status) | 1623 | if (!status) |
1636 | return 0; | 1624 | return 0; |
1637 | 1625 | ||
1638 | unregister_quota_format(&ocfs2_quota_format); | 1626 | unregister_quota_format(&ocfs2_quota_format); |
1639 | out4: | ||
1640 | destroy_workqueue(ocfs2_wq); | ||
1641 | debugfs_remove(ocfs2_debugfs_root); | ||
1642 | out3: | 1627 | out3: |
1628 | debugfs_remove(ocfs2_debugfs_root); | ||
1643 | ocfs2_free_mem_caches(); | 1629 | ocfs2_free_mem_caches(); |
1644 | out2: | 1630 | out2: |
1645 | exit_ocfs2_uptodate_cache(); | 1631 | exit_ocfs2_uptodate_cache(); |
@@ -1650,11 +1636,6 @@ out1: | |||
1650 | 1636 | ||
1651 | static void __exit ocfs2_exit(void) | 1637 | static void __exit ocfs2_exit(void) |
1652 | { | 1638 | { |
1653 | if (ocfs2_wq) { | ||
1654 | flush_workqueue(ocfs2_wq); | ||
1655 | destroy_workqueue(ocfs2_wq); | ||
1656 | } | ||
1657 | |||
1658 | unregister_quota_format(&ocfs2_quota_format); | 1639 | unregister_quota_format(&ocfs2_quota_format); |
1659 | 1640 | ||
1660 | debugfs_remove(ocfs2_debugfs_root); | 1641 | debugfs_remove(ocfs2_debugfs_root); |
@@ -1745,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data) | |||
1745 | spin_lock_init(&oi->ip_lock); | 1726 | spin_lock_init(&oi->ip_lock); |
1746 | ocfs2_extent_map_init(&oi->vfs_inode); | 1727 | ocfs2_extent_map_init(&oi->vfs_inode); |
1747 | INIT_LIST_HEAD(&oi->ip_io_markers); | 1728 | INIT_LIST_HEAD(&oi->ip_io_markers); |
1729 | INIT_LIST_HEAD(&oi->ip_unwritten_list); | ||
1748 | oi->ip_dir_start_lookup = 0; | 1730 | oi->ip_dir_start_lookup = 0; |
1749 | mutex_init(&oi->ip_unaligned_aio); | ||
1750 | init_rwsem(&oi->ip_alloc_sem); | 1731 | init_rwsem(&oi->ip_alloc_sem); |
1751 | init_rwsem(&oi->ip_xattr_sem); | 1732 | init_rwsem(&oi->ip_xattr_sem); |
1752 | mutex_init(&oi->ip_io_mutex); | 1733 | mutex_init(&oi->ip_io_mutex); |
@@ -2349,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2349 | } | 2330 | } |
2350 | cleancache_init_shared_fs(sb); | 2331 | cleancache_init_shared_fs(sb); |
2351 | 2332 | ||
2333 | osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); | ||
2334 | if (!osb->ocfs2_wq) { | ||
2335 | status = -ENOMEM; | ||
2336 | mlog_errno(status); | ||
2337 | } | ||
2338 | |||
2352 | bail: | 2339 | bail: |
2353 | return status; | 2340 | return status; |
2354 | } | 2341 | } |
@@ -2536,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) | |||
2536 | { | 2523 | { |
2537 | /* This function assumes that the caller has the main osb resource */ | 2524 | /* This function assumes that the caller has the main osb resource */ |
2538 | 2525 | ||
2526 | /* ocfs2_initialize_super() has already created this workqueue */ | ||
2527 | if (osb->ocfs2_wq) { | ||
2528 | flush_workqueue(osb->ocfs2_wq); | ||
2529 | destroy_workqueue(osb->ocfs2_wq); | ||
2530 | } | ||
2531 | |||
2539 | ocfs2_free_slot_info(osb); | 2532 | ocfs2_free_slot_info(osb); |
2540 | 2533 | ||
2541 | kfree(osb->osb_orphan_wipes); | 2534 | kfree(osb->osb_orphan_wipes); |
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index b477d0b1c7b6..b023e4f3d740 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h | |||
@@ -26,8 +26,6 @@ | |||
26 | #ifndef OCFS2_SUPER_H | 26 | #ifndef OCFS2_SUPER_H |
27 | #define OCFS2_SUPER_H | 27 | #define OCFS2_SUPER_H |
28 | 28 | ||
29 | extern struct workqueue_struct *ocfs2_wq; | ||
30 | |||
31 | int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, | 29 | int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, |
32 | int node_num); | 30 | int node_num); |
33 | 31 | ||
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8f5a12ab2f2b..339125bb4d2c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h | |||
@@ -456,7 +456,7 @@ | |||
456 | *(.entry.text) \ | 456 | *(.entry.text) \ |
457 | VMLINUX_SYMBOL(__entry_text_end) = .; | 457 | VMLINUX_SYMBOL(__entry_text_end) = .; |
458 | 458 | ||
459 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 459 | #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) |
460 | #define IRQENTRY_TEXT \ | 460 | #define IRQENTRY_TEXT \ |
461 | ALIGN_FUNCTION(); \ | 461 | ALIGN_FUNCTION(); \ |
462 | VMLINUX_SYMBOL(__irqentry_text_start) = .; \ | 462 | VMLINUX_SYMBOL(__irqentry_text_start) = .; \ |
@@ -466,6 +466,16 @@ | |||
466 | #define IRQENTRY_TEXT | 466 | #define IRQENTRY_TEXT |
467 | #endif | 467 | #endif |
468 | 468 | ||
469 | #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) | ||
470 | #define SOFTIRQENTRY_TEXT \ | ||
471 | ALIGN_FUNCTION(); \ | ||
472 | VMLINUX_SYMBOL(__softirqentry_text_start) = .; \ | ||
473 | *(.softirqentry.text) \ | ||
474 | VMLINUX_SYMBOL(__softirqentry_text_end) = .; | ||
475 | #else | ||
476 | #define SOFTIRQENTRY_TEXT | ||
477 | #endif | ||
478 | |||
469 | /* Section used for early init (in .S files) */ | 479 | /* Section used for early init (in .S files) */ |
470 | #define HEAD_TEXT *(.head.text) | 480 | #define HEAD_TEXT *(.head.text) |
471 | 481 | ||
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6d9df3f7e334..dea12a6e413b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h | |||
@@ -811,16 +811,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
811 | */ | 811 | */ |
812 | #define __notrace_funcgraph notrace | 812 | #define __notrace_funcgraph notrace |
813 | 813 | ||
814 | /* | ||
815 | * We want to which function is an entrypoint of a hardirq. | ||
816 | * That will help us to put a signal on output. | ||
817 | */ | ||
818 | #define __irq_entry __attribute__((__section__(".irqentry.text"))) | ||
819 | |||
820 | /* Limits of hardirq entrypoints */ | ||
821 | extern char __irqentry_text_start[]; | ||
822 | extern char __irqentry_text_end[]; | ||
823 | |||
824 | #define FTRACE_NOTRACE_DEPTH 65536 | 814 | #define FTRACE_NOTRACE_DEPTH 65536 |
825 | #define FTRACE_RETFUNC_DEPTH 50 | 815 | #define FTRACE_RETFUNC_DEPTH 50 |
826 | #define FTRACE_RETSTACK_ALLOC_SIZE 32 | 816 | #define FTRACE_RETSTACK_ALLOC_SIZE 32 |
@@ -857,7 +847,6 @@ static inline void unpause_graph_tracing(void) | |||
857 | #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ | 847 | #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ |
858 | 848 | ||
859 | #define __notrace_funcgraph | 849 | #define __notrace_funcgraph |
860 | #define __irq_entry | ||
861 | #define INIT_FTRACE_GRAPH | 850 | #define INIT_FTRACE_GRAPH |
862 | 851 | ||
863 | static inline void ftrace_graph_init_task(struct task_struct *t) { } | 852 | static inline void ftrace_graph_init_task(struct task_struct *t) { } |
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 358076eda364..9fcabeb07787 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h | |||
@@ -683,4 +683,24 @@ extern int early_irq_init(void); | |||
683 | extern int arch_probe_nr_irqs(void); | 683 | extern int arch_probe_nr_irqs(void); |
684 | extern int arch_early_irq_init(void); | 684 | extern int arch_early_irq_init(void); |
685 | 685 | ||
686 | #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) | ||
687 | /* | ||
688 | * We want to know which function is an entrypoint of a hardirq or a softirq. | ||
689 | */ | ||
690 | #define __irq_entry __attribute__((__section__(".irqentry.text"))) | ||
691 | #define __softirq_entry \ | ||
692 | __attribute__((__section__(".softirqentry.text"))) | ||
693 | |||
694 | /* Limits of hardirq entrypoints */ | ||
695 | extern char __irqentry_text_start[]; | ||
696 | extern char __irqentry_text_end[]; | ||
697 | /* Limits of softirq entrypoints */ | ||
698 | extern char __softirqentry_text_start[]; | ||
699 | extern char __softirqentry_text_end[]; | ||
700 | |||
701 | #else | ||
702 | #define __irq_entry | ||
703 | #define __softirq_entry | ||
704 | #endif | ||
705 | |||
686 | #endif | 706 | #endif |
diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 0fdc798e3ff7..737371b56044 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h | |||
@@ -48,19 +48,28 @@ void kasan_unpoison_task_stack(struct task_struct *task); | |||
48 | void kasan_alloc_pages(struct page *page, unsigned int order); | 48 | void kasan_alloc_pages(struct page *page, unsigned int order); |
49 | void kasan_free_pages(struct page *page, unsigned int order); | 49 | void kasan_free_pages(struct page *page, unsigned int order); |
50 | 50 | ||
51 | void kasan_cache_create(struct kmem_cache *cache, size_t *size, | ||
52 | unsigned long *flags); | ||
53 | |||
51 | void kasan_poison_slab(struct page *page); | 54 | void kasan_poison_slab(struct page *page); |
52 | void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); | 55 | void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); |
53 | void kasan_poison_object_data(struct kmem_cache *cache, void *object); | 56 | void kasan_poison_object_data(struct kmem_cache *cache, void *object); |
54 | 57 | ||
55 | void kasan_kmalloc_large(const void *ptr, size_t size); | 58 | void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); |
56 | void kasan_kfree_large(const void *ptr); | 59 | void kasan_kfree_large(const void *ptr); |
57 | void kasan_kfree(void *ptr); | 60 | void kasan_kfree(void *ptr); |
58 | void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); | 61 | void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, |
59 | void kasan_krealloc(const void *object, size_t new_size); | 62 | gfp_t flags); |
63 | void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); | ||
60 | 64 | ||
61 | void kasan_slab_alloc(struct kmem_cache *s, void *object); | 65 | void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); |
62 | void kasan_slab_free(struct kmem_cache *s, void *object); | 66 | void kasan_slab_free(struct kmem_cache *s, void *object); |
63 | 67 | ||
68 | struct kasan_cache { | ||
69 | int alloc_meta_offset; | ||
70 | int free_meta_offset; | ||
71 | }; | ||
72 | |||
64 | int kasan_module_alloc(void *addr, size_t size); | 73 | int kasan_module_alloc(void *addr, size_t size); |
65 | void kasan_free_shadow(const struct vm_struct *vm); | 74 | void kasan_free_shadow(const struct vm_struct *vm); |
66 | 75 | ||
@@ -76,20 +85,26 @@ static inline void kasan_disable_current(void) {} | |||
76 | static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} | 85 | static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} |
77 | static inline void kasan_free_pages(struct page *page, unsigned int order) {} | 86 | static inline void kasan_free_pages(struct page *page, unsigned int order) {} |
78 | 87 | ||
88 | static inline void kasan_cache_create(struct kmem_cache *cache, | ||
89 | size_t *size, | ||
90 | unsigned long *flags) {} | ||
91 | |||
79 | static inline void kasan_poison_slab(struct page *page) {} | 92 | static inline void kasan_poison_slab(struct page *page) {} |
80 | static inline void kasan_unpoison_object_data(struct kmem_cache *cache, | 93 | static inline void kasan_unpoison_object_data(struct kmem_cache *cache, |
81 | void *object) {} | 94 | void *object) {} |
82 | static inline void kasan_poison_object_data(struct kmem_cache *cache, | 95 | static inline void kasan_poison_object_data(struct kmem_cache *cache, |
83 | void *object) {} | 96 | void *object) {} |
84 | 97 | ||
85 | static inline void kasan_kmalloc_large(void *ptr, size_t size) {} | 98 | static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {} |
86 | static inline void kasan_kfree_large(const void *ptr) {} | 99 | static inline void kasan_kfree_large(const void *ptr) {} |
87 | static inline void kasan_kfree(void *ptr) {} | 100 | static inline void kasan_kfree(void *ptr) {} |
88 | static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, | 101 | static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, |
89 | size_t size) {} | 102 | size_t size, gfp_t flags) {} |
90 | static inline void kasan_krealloc(const void *object, size_t new_size) {} | 103 | static inline void kasan_krealloc(const void *object, size_t new_size, |
104 | gfp_t flags) {} | ||
91 | 105 | ||
92 | static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} | 106 | static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, |
107 | gfp_t flags) {} | ||
93 | static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} | 108 | static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} |
94 | 109 | ||
95 | static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } | 110 | static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 450fc977ed02..ed6407d1b7b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1132,6 +1132,8 @@ struct zap_details { | |||
1132 | struct address_space *check_mapping; /* Check page->mapping if set */ | 1132 | struct address_space *check_mapping; /* Check page->mapping if set */ |
1133 | pgoff_t first_index; /* Lowest page->index to unmap */ | 1133 | pgoff_t first_index; /* Lowest page->index to unmap */ |
1134 | pgoff_t last_index; /* Highest page->index to unmap */ | 1134 | pgoff_t last_index; /* Highest page->index to unmap */ |
1135 | bool ignore_dirty; /* Ignore dirty pages */ | ||
1136 | bool check_swap_entries; /* Check also swap entries */ | ||
1135 | }; | 1137 | }; |
1136 | 1138 | ||
1137 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | 1139 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, |
diff --git a/include/linux/oom.h b/include/linux/oom.h index 03e6257321f0..628a43242a34 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -76,8 +76,6 @@ extern unsigned long oom_badness(struct task_struct *p, | |||
76 | struct mem_cgroup *memcg, const nodemask_t *nodemask, | 76 | struct mem_cgroup *memcg, const nodemask_t *nodemask, |
77 | unsigned long totalpages); | 77 | unsigned long totalpages); |
78 | 78 | ||
79 | extern int oom_kills_count(void); | ||
80 | extern void note_oom_kill(void); | ||
81 | extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, | 79 | extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, |
82 | unsigned int points, unsigned long totalpages, | 80 | unsigned int points, unsigned long totalpages, |
83 | struct mem_cgroup *memcg, const char *message); | 81 | struct mem_cgroup *memcg, const char *message); |
@@ -91,7 +89,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, | |||
91 | 89 | ||
92 | extern bool out_of_memory(struct oom_control *oc); | 90 | extern bool out_of_memory(struct oom_control *oc); |
93 | 91 | ||
94 | extern void exit_oom_victim(void); | 92 | extern void exit_oom_victim(struct task_struct *tsk); |
95 | 93 | ||
96 | extern int register_oom_notifier(struct notifier_block *nb); | 94 | extern int register_oom_notifier(struct notifier_block *nb); |
97 | extern int unregister_oom_notifier(struct notifier_block *nb); | 95 | extern int unregister_oom_notifier(struct notifier_block *nb); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 589c4780b077..60bba7e032dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -426,6 +426,7 @@ extern signed long schedule_timeout(signed long timeout); | |||
426 | extern signed long schedule_timeout_interruptible(signed long timeout); | 426 | extern signed long schedule_timeout_interruptible(signed long timeout); |
427 | extern signed long schedule_timeout_killable(signed long timeout); | 427 | extern signed long schedule_timeout_killable(signed long timeout); |
428 | extern signed long schedule_timeout_uninterruptible(signed long timeout); | 428 | extern signed long schedule_timeout_uninterruptible(signed long timeout); |
429 | extern signed long schedule_timeout_idle(signed long timeout); | ||
429 | asmlinkage void schedule(void); | 430 | asmlinkage void schedule(void); |
430 | extern void schedule_preempt_disabled(void); | 431 | extern void schedule_preempt_disabled(void); |
431 | 432 | ||
@@ -1848,6 +1849,9 @@ struct task_struct { | |||
1848 | unsigned long task_state_change; | 1849 | unsigned long task_state_change; |
1849 | #endif | 1850 | #endif |
1850 | int pagefault_disabled; | 1851 | int pagefault_disabled; |
1852 | #ifdef CONFIG_MMU | ||
1853 | struct task_struct *oom_reaper_list; | ||
1854 | #endif | ||
1851 | /* CPU-specific state of this task */ | 1855 | /* CPU-specific state of this task */ |
1852 | struct thread_struct thread; | 1856 | struct thread_struct thread; |
1853 | /* | 1857 | /* |
diff --git a/include/linux/slab.h b/include/linux/slab.h index e4b568738ca3..508bd827e6dc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -92,6 +92,12 @@ | |||
92 | # define SLAB_ACCOUNT 0x00000000UL | 92 | # define SLAB_ACCOUNT 0x00000000UL |
93 | #endif | 93 | #endif |
94 | 94 | ||
95 | #ifdef CONFIG_KASAN | ||
96 | #define SLAB_KASAN 0x08000000UL | ||
97 | #else | ||
98 | #define SLAB_KASAN 0x00000000UL | ||
99 | #endif | ||
100 | |||
95 | /* The following flags affect the page allocator grouping pages by mobility */ | 101 | /* The following flags affect the page allocator grouping pages by mobility */ |
96 | #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ | 102 | #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ |
97 | #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ | 103 | #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ |
@@ -370,7 +376,7 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, | |||
370 | { | 376 | { |
371 | void *ret = kmem_cache_alloc(s, flags); | 377 | void *ret = kmem_cache_alloc(s, flags); |
372 | 378 | ||
373 | kasan_kmalloc(s, ret, size); | 379 | kasan_kmalloc(s, ret, size, flags); |
374 | return ret; | 380 | return ret; |
375 | } | 381 | } |
376 | 382 | ||
@@ -381,7 +387,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
381 | { | 387 | { |
382 | void *ret = kmem_cache_alloc_node(s, gfpflags, node); | 388 | void *ret = kmem_cache_alloc_node(s, gfpflags, node); |
383 | 389 | ||
384 | kasan_kmalloc(s, ret, size); | 390 | kasan_kmalloc(s, ret, size, gfpflags); |
385 | return ret; | 391 | return ret; |
386 | } | 392 | } |
387 | #endif /* CONFIG_TRACING */ | 393 | #endif /* CONFIG_TRACING */ |
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index e878ba35ae91..9edbbf352340 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h | |||
@@ -76,8 +76,22 @@ struct kmem_cache { | |||
76 | #ifdef CONFIG_MEMCG | 76 | #ifdef CONFIG_MEMCG |
77 | struct memcg_cache_params memcg_params; | 77 | struct memcg_cache_params memcg_params; |
78 | #endif | 78 | #endif |
79 | #ifdef CONFIG_KASAN | ||
80 | struct kasan_cache kasan_info; | ||
81 | #endif | ||
79 | 82 | ||
80 | struct kmem_cache_node *node[MAX_NUMNODES]; | 83 | struct kmem_cache_node *node[MAX_NUMNODES]; |
81 | }; | 84 | }; |
82 | 85 | ||
86 | static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, | ||
87 | void *x) { | ||
88 | void *object = x - (x - page->s_mem) % cache->size; | ||
89 | void *last_object = page->s_mem + (cache->num - 1) * cache->size; | ||
90 | |||
91 | if (unlikely(object > last_object)) | ||
92 | return last_object; | ||
93 | else | ||
94 | return object; | ||
95 | } | ||
96 | |||
83 | #endif /* _LINUX_SLAB_DEF_H */ | 97 | #endif /* _LINUX_SLAB_DEF_H */ |
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index ac5143f95ee6..665cd0cd18b8 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h | |||
@@ -130,4 +130,15 @@ static inline void *virt_to_obj(struct kmem_cache *s, | |||
130 | void object_err(struct kmem_cache *s, struct page *page, | 130 | void object_err(struct kmem_cache *s, struct page *page, |
131 | u8 *object, char *reason); | 131 | u8 *object, char *reason); |
132 | 132 | ||
133 | static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, | ||
134 | void *x) { | ||
135 | void *object = x - (x - page_address(page)) % cache->size; | ||
136 | void *last_object = page_address(page) + | ||
137 | (page->objects - 1) * cache->size; | ||
138 | if (unlikely(object > last_object)) | ||
139 | return last_object; | ||
140 | else | ||
141 | return object; | ||
142 | } | ||
143 | |||
133 | #endif /* _LINUX_SLUB_DEF_H */ | 144 | #endif /* _LINUX_SLUB_DEF_H */ |
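Both nearest_obj() helpers do the same thing for their allocator: round an interior pointer down to the start of the slab object containing it, and clamp to the last object on the page; only the slab base differs (page->s_mem for SLAB, page_address(page) for SLUB). A standalone illustration of the arithmetic, with assumed sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uintptr_t base = 0x1000;	/* slab base address */
	size_t size = 96;		/* cache->size: object plus metadata */
	unsigned int num = 42;		/* objects in this slab */
	uintptr_t x = base + 3 * size + 17;	/* 17 bytes into object #3 */

	uintptr_t object = x - (x - base) % size;	/* round down */
	uintptr_t last = base + (uintptr_t)(num - 1) * size;

	if (object > last)	/* pointer past the last object: clamp */
		object = last;

	/* Prints: object starts at 0x1120 (index 3) */
	printf("object starts at %#lx (index %lu)\n",
	       (unsigned long)object,
	       (unsigned long)((object - base) / size));
	return 0;
}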
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h new file mode 100644 index 000000000000..7978b3e2c1e1 --- /dev/null +++ b/include/linux/stackdepot.h | |||
@@ -0,0 +1,32 @@ | |||
1 | /* | ||
2 | * A generic stack depot implementation | ||
3 | * | ||
4 | * Author: Alexander Potapenko <glider@google.com> | ||
5 | * Copyright (C) 2016 Google, Inc. | ||
6 | * | ||
7 | * Based on code by Dmitry Chernenkov. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #ifndef _LINUX_STACKDEPOT_H | ||
22 | #define _LINUX_STACKDEPOT_H | ||
23 | |||
24 | typedef u32 depot_stack_handle_t; | ||
25 | |||
26 | struct stack_trace; | ||
27 | |||
28 | depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags); | ||
29 | |||
30 | void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace); | ||
31 | |||
32 | #endif | ||
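The new header is a two-call API: depot_save_stack() deduplicates a trace into the depot and hands back a compact 32-bit handle, and depot_fetch_stack() reconstitutes a stack_trace view from that handle. A sketch of a caller, modeled on how the KASAN patches in this series use it; the wrapper names and the 16-frame depth are illustrative:

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

static depot_stack_handle_t record_current_stack(gfp_t flags)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
	};

	save_stack_trace(&trace);
	/* Identical traces map to the same handle, so storing a handle
	 * per object is far cheaper than storing the frames. */
	return depot_save_stack(&trace, flags);
}

static void print_recorded_stack(depot_stack_handle_t handle)
{
	struct stack_trace trace;

	depot_fetch_stack(handle, &trace);	/* points into depot storage */
	print_stack_trace(&trace, 0);
}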
diff --git a/kernel/exit.c b/kernel/exit.c index 953d1a1c0387..fd90195667e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) | |||
435 | mm_update_next_owner(mm); | 435 | mm_update_next_owner(mm); |
436 | mmput(mm); | 436 | mmput(mm); |
437 | if (test_thread_flag(TIF_MEMDIE)) | 437 | if (test_thread_flag(TIF_MEMDIE)) |
438 | exit_oom_victim(); | 438 | exit_oom_victim(tsk); |
439 | } | 439 | } |
440 | 440 | ||
441 | static struct task_struct *find_alive_thread(struct task_struct *p) | 441 | static struct task_struct *find_alive_thread(struct task_struct *p) |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 8aae49dd7da8..17caf4b63342 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; } | |||
227 | static inline void lockdep_softirq_end(bool in_hardirq) { } | 227 | static inline void lockdep_softirq_end(bool in_hardirq) { } |
228 | #endif | 228 | #endif |
229 | 229 | ||
230 | asmlinkage __visible void __do_softirq(void) | 230 | asmlinkage __visible void __softirq_entry __do_softirq(void) |
231 | { | 231 | { |
232 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; | 232 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
233 | unsigned long old_flags = current->flags; | 233 | unsigned long old_flags = current->flags; |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d1798fa0c743..73164c3aa56b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
1566 | } | 1566 | } |
1567 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1567 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1568 | 1568 | ||
1569 | /* | ||
1570 | * Like schedule_timeout_uninterruptible(), except this task will not contribute | ||
1571 | * to load average. | ||
1572 | */ | ||
1573 | signed long __sched schedule_timeout_idle(signed long timeout) | ||
1574 | { | ||
1575 | __set_current_state(TASK_IDLE); | ||
1576 | return schedule_timeout(timeout); | ||
1577 | } | ||
1578 | EXPORT_SYMBOL(schedule_timeout_idle); | ||
1579 | |||
1569 | #ifdef CONFIG_HOTPLUG_CPU | 1580 | #ifdef CONFIG_HOTPLUG_CPU |
1570 | static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) | 1581 | static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) |
1571 | { | 1582 | { |
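schedule_timeout_idle() sleeps in TASK_IDLE: uninterruptible like its sibling, but excluded from the load average, which suits long-lived housekeeping threads (the oom-reaper kthread elsewhere in this series is the intended user). An illustrative polling loop; the work function is a hypothetical stand-in:

#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static void do_background_work(void)
{
	/* hypothetical stand-in for the thread's real work */
}

static int housekeeping_thread(void *unused)
{
	while (!kthread_should_stop()) {
		do_background_work();
		/* Nap for 100 ms without inflating loadavg. */
		schedule_timeout_idle(HZ / 10);
	}
	return 0;
}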
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 91d6a63a2ea7..3a0244ff7ea8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
10 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
11 | #include <linux/interrupt.h> | ||
11 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
12 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
13 | 14 | ||
diff --git a/lib/Kconfig b/lib/Kconfig index 133ebc0c1773..3cca1222578e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig | |||
@@ -536,4 +536,8 @@ config ARCH_HAS_PMEM_API | |||
536 | config ARCH_HAS_MMIO_FLUSH | 536 | config ARCH_HAS_MMIO_FLUSH |
537 | bool | 537 | bool |
538 | 538 | ||
539 | config STACKDEPOT | ||
540 | bool | ||
541 | select STACKTRACE | ||
542 | |||
539 | endmenu | 543 | endmenu |
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 0fee5acd5aa0..67d8c6838ba9 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan | |||
@@ -5,8 +5,9 @@ if HAVE_ARCH_KASAN | |||
5 | 5 | ||
6 | config KASAN | 6 | config KASAN |
7 | bool "KASan: runtime memory debugger" | 7 | bool "KASan: runtime memory debugger" |
8 | depends on SLUB_DEBUG | 8 | depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) |
9 | select CONSTRUCTORS | 9 | select CONSTRUCTORS |
10 | select STACKDEPOT if SLAB | ||
10 | help | 11 | help |
11 | Enables kernel address sanitizer - runtime memory debugger, | 12 | Enables kernel address sanitizer - runtime memory debugger, |
12 | designed to find out-of-bounds accesses and use-after-free bugs. | 13 | designed to find out-of-bounds accesses and use-after-free bugs. |
@@ -16,6 +17,8 @@ config KASAN | |||
16 | This feature consumes about 1/8 of available memory and brings about | 17 | This feature consumes about 1/8 of available memory and brings about |
17 | ~x3 performance slowdown. | 18 | ~x3 performance slowdown. |
18 | For better error detection enable CONFIG_STACKTRACE. | 19 | For better error detection enable CONFIG_STACKTRACE. |
20 | Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB | ||
21 | (the resulting kernel does not boot). | ||
19 | 22 | ||
20 | choice | 23 | choice |
21 | prompt "Instrumentation type" | 24 | prompt "Instrumentation type" |
diff --git a/lib/Makefile b/lib/Makefile index a1de5b61ff40..7bd6fd436c97 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
@@ -181,6 +181,9 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o | |||
181 | obj-$(CONFIG_STMP_DEVICE) += stmp_device.o | 181 | obj-$(CONFIG_STMP_DEVICE) += stmp_device.o |
182 | obj-$(CONFIG_IRQ_POLL) += irq_poll.o | 182 | obj-$(CONFIG_IRQ_POLL) += irq_poll.o |
183 | 183 | ||
184 | obj-$(CONFIG_STACKDEPOT) += stackdepot.o | ||
185 | KASAN_SANITIZE_stackdepot.o := n | ||
186 | |||
184 | libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ | 187 | libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ |
185 | fdt_empty_tree.o | 188 | fdt_empty_tree.o |
186 | $(foreach file, $(libfdt_files), \ | 189 | $(foreach file, $(libfdt_files), \ |
diff --git a/lib/stackdepot.c b/lib/stackdepot.c new file mode 100644 index 000000000000..654c9d87e83a --- /dev/null +++ b/lib/stackdepot.c | |||
@@ -0,0 +1,284 @@ | |||
1 | /* | ||
2 | * Generic stack depot for storing stack traces. | ||
3 | * | ||
4 | * Some debugging tools need to save stack traces of certain events which can | ||
5 | * be later presented to the user. For example, KASAN needs to save alloc and | ||
6 | * free stacks for each object, but storing two stack traces per object | ||
7 | * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for | ||
8 | * that). | ||
9 | * | ||
10 | * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc | ||
11 | * and free stacks repeat a lot, we save about 100x space. | ||
12 | * Stacks are never removed from depot, so we store them contiguously one after | ||
13 | * another in a contiguous memory allocation. | ||
14 | * | ||
15 | * Author: Alexander Potapenko <glider@google.com> | ||
16 | * Copyright (C) 2016 Google, Inc. | ||
17 | * | ||
18 | * Based on code by Dmitry Chernenkov. | ||
19 | * | ||
20 | * This program is free software; you can redistribute it and/or | ||
21 | * modify it under the terms of the GNU General Public License | ||
22 | * version 2 as published by the Free Software Foundation. | ||
23 | * | ||
24 | * This program is distributed in the hope that it will be useful, but | ||
25 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
26 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
27 | * General Public License for more details. | ||
28 | * | ||
29 | */ | ||
30 | |||
31 | #include <linux/gfp.h> | ||
32 | #include <linux/jhash.h> | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/mm.h> | ||
35 | #include <linux/percpu.h> | ||
36 | #include <linux/printk.h> | ||
37 | #include <linux/slab.h> | ||
38 | #include <linux/stacktrace.h> | ||
39 | #include <linux/stackdepot.h> | ||
40 | #include <linux/string.h> | ||
41 | #include <linux/types.h> | ||
42 | |||
43 | #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) | ||
44 | |||
45 | #define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */ | ||
46 | #define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) | ||
47 | #define STACK_ALLOC_ALIGN 4 | ||
48 | #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ | ||
49 | STACK_ALLOC_ALIGN) | ||
50 | #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - STACK_ALLOC_OFFSET_BITS) | ||
51 | #define STACK_ALLOC_SLABS_CAP 1024 | ||
52 | #define STACK_ALLOC_MAX_SLABS \ | ||
53 | (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ | ||
54 | (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) | ||
55 | |||
56 | /* The compact structure to store the reference to stacks. */ | ||
57 | union handle_parts { | ||
58 | depot_stack_handle_t handle; | ||
59 | struct { | ||
60 | u32 slabindex : STACK_ALLOC_INDEX_BITS; | ||
61 | u32 offset : STACK_ALLOC_OFFSET_BITS; | ||
62 | }; | ||
63 | }; | ||
64 | |||
65 | struct stack_record { | ||
66 | struct stack_record *next; /* Link in the hashtable */ | ||
67 | u32 hash; /* Hash in the hashtable */ | ||
68 | u32 size; /* Number of frames in the stack */ | ||
69 | union handle_parts handle; | ||
70 | unsigned long entries[1]; /* Variable-sized array of entries. */ | ||
71 | }; | ||
72 | |||
73 | static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; | ||
74 | |||
75 | static int depot_index; | ||
76 | static int next_slab_inited; | ||
77 | static size_t depot_offset; | ||
78 | static DEFINE_SPINLOCK(depot_lock); | ||
79 | |||
80 | static bool init_stack_slab(void **prealloc) | ||
81 | { | ||
82 | if (!*prealloc) | ||
83 | return false; | ||
84 | /* | ||
85 | * This smp_load_acquire() pairs with smp_store_release() to | ||
86 | * |next_slab_inited| below and in depot_alloc_stack(). | ||
87 | */ | ||
88 | if (smp_load_acquire(&next_slab_inited)) | ||
89 | return true; | ||
90 | if (stack_slabs[depot_index] == NULL) { | ||
91 | stack_slabs[depot_index] = *prealloc; | ||
92 | } else { | ||
93 | stack_slabs[depot_index + 1] = *prealloc; | ||
94 | /* | ||
95 | * This smp_store_release pairs with smp_load_acquire() from | ||
96 | * |next_slab_inited| above and in depot_save_stack(). | ||
97 | */ | ||
98 | smp_store_release(&next_slab_inited, 1); | ||
99 | } | ||
100 | *prealloc = NULL; | ||
101 | return true; | ||
102 | } | ||
103 | |||
104 | /* Allocation of a new stack in raw storage */ | ||
105 | static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, | ||
106 | u32 hash, void **prealloc, gfp_t alloc_flags) | ||
107 | { | ||
108 | int required_size = offsetof(struct stack_record, entries) + | ||
109 | sizeof(unsigned long) * size; | ||
110 | struct stack_record *stack; | ||
111 | |||
112 | required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); | ||
113 | |||
114 | if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { | ||
115 | if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { | ||
116 | WARN_ONCE(1, "Stack depot reached limit capacity"); | ||
117 | return NULL; | ||
118 | } | ||
119 | depot_index++; | ||
120 | depot_offset = 0; | ||
121 | /* | ||
122 | * smp_store_release() here pairs with smp_load_acquire() from | ||
123 | * |next_slab_inited| in depot_save_stack() and | ||
124 | * init_stack_slab(). | ||
125 | */ | ||
126 | if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) | ||
127 | smp_store_release(&next_slab_inited, 0); | ||
128 | } | ||
129 | init_stack_slab(prealloc); | ||
130 | if (stack_slabs[depot_index] == NULL) | ||
131 | return NULL; | ||
132 | |||
133 | stack = stack_slabs[depot_index] + depot_offset; | ||
134 | |||
135 | stack->hash = hash; | ||
136 | stack->size = size; | ||
137 | stack->handle.slabindex = depot_index; | ||
138 | stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; | ||
139 | memcpy(stack->entries, entries, size * sizeof(unsigned long)); | ||
140 | depot_offset += required_size; | ||
141 | |||
142 | return stack; | ||
143 | } | ||
144 | |||
145 | #define STACK_HASH_ORDER 20 | ||
146 | #define STACK_HASH_SIZE (1L << STACK_HASH_ORDER) | ||
147 | #define STACK_HASH_MASK (STACK_HASH_SIZE - 1) | ||
148 | #define STACK_HASH_SEED 0x9747b28c | ||
149 | |||
150 | static struct stack_record *stack_table[STACK_HASH_SIZE] = { | ||
151 | [0 ... STACK_HASH_SIZE - 1] = NULL | ||
152 | }; | ||
153 | |||
154 | /* Calculate hash for a stack */ | ||
155 | static inline u32 hash_stack(unsigned long *entries, unsigned int size) | ||
156 | { | ||
157 | return jhash2((u32 *)entries, | ||
158 | size * sizeof(unsigned long) / sizeof(u32), | ||
159 | STACK_HASH_SEED); | ||
160 | } | ||
161 | |||
162 | /* Find a stack that is equal to the one stored in entries in the hash */ | ||
163 | static inline struct stack_record *find_stack(struct stack_record *bucket, | ||
164 | unsigned long *entries, int size, | ||
165 | u32 hash) | ||
166 | { | ||
167 | struct stack_record *found; | ||
168 | |||
169 | for (found = bucket; found; found = found->next) { | ||
170 | if (found->hash == hash && | ||
171 | found->size == size && | ||
172 | !memcmp(entries, found->entries, | ||
173 | size * sizeof(unsigned long))) { | ||
174 | return found; | ||
175 | } | ||
176 | } | ||
177 | return NULL; | ||
178 | } | ||
179 | |||
180 | void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace) | ||
181 | { | ||
182 | union handle_parts parts = { .handle = handle }; | ||
183 | void *slab = stack_slabs[parts.slabindex]; | ||
184 | size_t offset = parts.offset << STACK_ALLOC_ALIGN; | ||
185 | struct stack_record *stack = slab + offset; | ||
186 | |||
187 | trace->nr_entries = trace->max_entries = stack->size; | ||
188 | trace->entries = stack->entries; | ||
189 | trace->skip = 0; | ||
190 | } | ||
191 | |||
192 | /** | ||
193 | * depot_save_stack - save stack in a stack depot. | ||
194 | * @trace - the stacktrace to save. | ||
195 | * @alloc_flags - flags for allocating additional memory if required. | ||
196 | * | ||
197 | * Returns the handle of the stack struct stored in depot. | ||
198 | */ | ||
199 | depot_stack_handle_t depot_save_stack(struct stack_trace *trace, | ||
200 | gfp_t alloc_flags) | ||
201 | { | ||
202 | u32 hash; | ||
203 | depot_stack_handle_t retval = 0; | ||
204 | struct stack_record *found = NULL, **bucket; | ||
205 | unsigned long flags; | ||
206 | struct page *page = NULL; | ||
207 | void *prealloc = NULL; | ||
208 | |||
209 | if (unlikely(trace->nr_entries == 0)) | ||
210 | goto fast_exit; | ||
211 | |||
212 | hash = hash_stack(trace->entries, trace->nr_entries); | ||
213 | /* Bad luck, we won't store this stack. */ | ||
214 | if (hash == 0) | ||
215 | goto exit; | ||
216 | |||
217 | bucket = &stack_table[hash & STACK_HASH_MASK]; | ||
218 | |||
219 | /* | ||
220 | * Fast path: look the stack trace up without locking. | ||
221 | * The smp_load_acquire() here pairs with smp_store_release() to | ||
222 | * |bucket| below. | ||
223 | */ | ||
224 | found = find_stack(smp_load_acquire(bucket), trace->entries, | ||
225 | trace->nr_entries, hash); | ||
226 | if (found) | ||
227 | goto exit; | ||
228 | |||
229 | /* | ||
230 | * Check if the current or the next stack slab needs to be initialized. | ||
231 | * If so, allocate the memory - we won't be able to do that under the | ||
232 | * lock. | ||
233 | * | ||
234 | * The smp_load_acquire() here pairs with smp_store_release() to | ||
235 | * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). | ||
236 | */ | ||
237 | if (unlikely(!smp_load_acquire(&next_slab_inited))) { | ||
238 | /* | ||
239 | * Zero out zone modifiers, as we don't have specific zone | ||
240 | * requirements. Keep the flags related to allocation in atomic | ||
241 | * contexts and I/O. | ||
242 | */ | ||
243 | alloc_flags &= ~GFP_ZONEMASK; | ||
244 | alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); | ||
245 | page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); | ||
246 | if (page) | ||
247 | prealloc = page_address(page); | ||
248 | } | ||
249 | |||
250 | spin_lock_irqsave(&depot_lock, flags); | ||
251 | |||
252 | found = find_stack(*bucket, trace->entries, trace->nr_entries, hash); | ||
253 | if (!found) { | ||
254 | struct stack_record *new = | ||
255 | depot_alloc_stack(trace->entries, trace->nr_entries, | ||
256 | hash, &prealloc, alloc_flags); | ||
257 | if (new) { | ||
258 | new->next = *bucket; | ||
259 | /* | ||
260 | * This smp_store_release() pairs with | ||
261 | * smp_load_acquire() from |bucket| above. | ||
262 | */ | ||
263 | smp_store_release(bucket, new); | ||
264 | found = new; | ||
265 | } | ||
266 | } else if (prealloc) { | ||
267 | /* | ||
268 | * We didn't need to store this stack trace, but let's keep | ||
269 | * the preallocated memory for the future. | ||
270 | */ | ||
271 | WARN_ON(!init_stack_slab(&prealloc)); | ||
272 | } | ||
273 | |||
274 | spin_unlock_irqrestore(&depot_lock, flags); | ||
275 | exit: | ||
276 | if (prealloc) { | ||
277 | /* Nobody used this memory, ok to free it. */ | ||
278 | free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); | ||
279 | } | ||
280 | if (found) | ||
281 | retval = found->handle.handle; | ||
282 | fast_exit: | ||
283 | return retval; | ||
284 | } | ||
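With the defaults above (PAGE_SHIFT 12, STACK_ALLOC_ORDER 2, STACK_ALLOC_ALIGN 4), each depot slab is 16 KB and records are 16-byte aligned, so 10 offset bits plus 22 index bits exactly fill the u32 handle; the 1024-slab cap bounds the depot at roughly 16 MB of unique traces. The kernel packs the two fields through the handle_parts bitfield union and leaves the layout to the compiler; the standalone sketch below does the equivalent packing by hand, so the exact field placement is an assumption:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define ORDER		2
#define ALIGN_SHIFT	4
#define OFFSET_BITS	(ORDER + PAGE_SHIFT - ALIGN_SHIFT)	/* 10 */
#define INDEX_BITS	(32 - OFFSET_BITS)			/* 22 */

int main(void)
{
	uint32_t slabindex = 3;
	size_t byte_offset = 0x260;	/* record offset, 16-byte aligned */

	/* Pack: the offset is stored in 16-byte units so it fits in 10 bits. */
	uint32_t handle = ((uint32_t)(byte_offset >> ALIGN_SHIFT) << INDEX_BITS)
			  | slabindex;

	/* Unpack, as depot_fetch_stack() does via the union. */
	uint32_t idx = handle & ((1u << INDEX_BITS) - 1);
	size_t off = (size_t)(handle >> INDEX_BITS) << ALIGN_SHIFT;

	printf("handle %#x -> slab %u, offset %#zx\n",
	       (unsigned)handle, idx, off);
	return 0;
}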
diff --git a/lib/test_kasan.c b/lib/test_kasan.c index c32f3b0048dc..82169fbf2453 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c | |||
@@ -65,11 +65,34 @@ static noinline void __init kmalloc_node_oob_right(void) | |||
65 | kfree(ptr); | 65 | kfree(ptr); |
66 | } | 66 | } |
67 | 67 | ||
68 | static noinline void __init kmalloc_large_oob_right(void) | 68 | #ifdef CONFIG_SLUB |
69 | static noinline void __init kmalloc_pagealloc_oob_right(void) | ||
69 | { | 70 | { |
70 | char *ptr; | 71 | char *ptr; |
71 | size_t size = KMALLOC_MAX_CACHE_SIZE + 10; | 72 | size_t size = KMALLOC_MAX_CACHE_SIZE + 10; |
72 | 73 | ||
74 | /* Allocate a chunk that does not fit into a SLUB cache to trigger | ||
75 | * the page allocator fallback. | ||
76 | */ | ||
77 | pr_info("kmalloc pagealloc allocation: out-of-bounds to right\n"); | ||
78 | ptr = kmalloc(size, GFP_KERNEL); | ||
79 | if (!ptr) { | ||
80 | pr_err("Allocation failed\n"); | ||
81 | return; | ||
82 | } | ||
83 | |||
84 | ptr[size] = 0; | ||
85 | kfree(ptr); | ||
86 | } | ||
87 | #endif | ||
88 | |||
89 | static noinline void __init kmalloc_large_oob_right(void) | ||
90 | { | ||
91 | char *ptr; | ||
92 | size_t size = KMALLOC_MAX_CACHE_SIZE - 256; | ||
93 | /* Allocate a chunk that is large enough, but still fits into a slab | ||
94 | * and does not trigger the page allocator fallback in SLUB. | ||
95 | */ | ||
73 | pr_info("kmalloc large allocation: out-of-bounds to right\n"); | 96 | pr_info("kmalloc large allocation: out-of-bounds to right\n"); |
74 | ptr = kmalloc(size, GFP_KERNEL); | 97 | ptr = kmalloc(size, GFP_KERNEL); |
75 | if (!ptr) { | 98 | if (!ptr) { |
@@ -271,6 +294,8 @@ static noinline void __init kmalloc_uaf2(void) | |||
271 | } | 294 | } |
272 | 295 | ||
273 | ptr1[40] = 'x'; | 296 | ptr1[40] = 'x'; |
297 | if (ptr1 == ptr2) | ||
298 | pr_err("Could not detect use-after-free: ptr1 == ptr2\n"); | ||
274 | kfree(ptr2); | 299 | kfree(ptr2); |
275 | } | 300 | } |
276 | 301 | ||
@@ -324,6 +349,9 @@ static int __init kmalloc_tests_init(void) | |||
324 | kmalloc_oob_right(); | 349 | kmalloc_oob_right(); |
325 | kmalloc_oob_left(); | 350 | kmalloc_oob_left(); |
326 | kmalloc_node_oob_right(); | 351 | kmalloc_node_oob_right(); |
352 | #ifdef CONFIG_SLUB | ||
353 | kmalloc_pagealloc_oob_right(); | ||
354 | #endif | ||
327 | kmalloc_large_oob_right(); | 355 | kmalloc_large_oob_right(); |
328 | kmalloc_oob_krealloc_more(); | 356 | kmalloc_oob_krealloc_more(); |
329 | kmalloc_oob_krealloc_less(); | 357 | kmalloc_oob_krealloc_less(); |
diff --git a/mm/Makefile b/mm/Makefile index f5e797cbd128..deb467edca2d 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -3,6 +3,7 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | KASAN_SANITIZE_slab_common.o := n | 5 | KASAN_SANITIZE_slab_common.o := n |
6 | KASAN_SANITIZE_slab.o := n | ||
6 | KASAN_SANITIZE_slub.o := n | 7 | KASAN_SANITIZE_slub.o := n |
7 | 8 | ||
8 | # These files are disabled because they produce non-interesting and/or | 9 | # These files are disabled because they produce non-interesting and/or |
diff --git a/mm/filemap.c b/mm/filemap.c index 7c00f105845e..a8c69c8c0a90 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -1840,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |||
1840 | ssize_t retval = 0; | 1840 | ssize_t retval = 0; |
1841 | loff_t *ppos = &iocb->ki_pos; | 1841 | loff_t *ppos = &iocb->ki_pos; |
1842 | loff_t pos = *ppos; | 1842 | loff_t pos = *ppos; |
1843 | size_t count = iov_iter_count(iter); | ||
1844 | |||
1845 | if (!count) | ||
1846 | goto out; /* skip atime */ | ||
1843 | 1847 | ||
1844 | if (iocb->ki_flags & IOCB_DIRECT) { | 1848 | if (iocb->ki_flags & IOCB_DIRECT) { |
1845 | struct address_space *mapping = file->f_mapping; | 1849 | struct address_space *mapping = file->f_mapping; |
1846 | struct inode *inode = mapping->host; | 1850 | struct inode *inode = mapping->host; |
1847 | size_t count = iov_iter_count(iter); | ||
1848 | loff_t size; | 1851 | loff_t size; |
1849 | 1852 | ||
1850 | if (!count) | ||
1851 | goto out; /* skip atime */ | ||
1852 | size = i_size_read(inode); | 1853 | size = i_size_read(inode); |
1853 | retval = filemap_write_and_wait_range(mapping, pos, | 1854 | retval = filemap_write_and_wait_range(mapping, pos, |
1854 | pos + count - 1); | 1855 | pos + count - 1); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fbfb1b8d6726..86f9f8b82f8e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -2578,7 +2578,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2578 | } | 2578 | } |
2579 | khugepaged_node_load[node]++; | 2579 | khugepaged_node_load[node]++; |
2580 | if (!PageLRU(page)) { | 2580 | if (!PageLRU(page)) { |
2581 | result = SCAN_SCAN_ABORT; | 2581 | result = SCAN_PAGE_LRU; |
2582 | goto out_unmap; | 2582 | goto out_unmap; |
2583 | } | 2583 | } |
2584 | if (PageLocked(page)) { | 2584 | if (PageLocked(page)) { |
diff --git a/mm/internal.h b/mm/internal.h index 7449392c6faa..b79abb6721cf 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -38,6 +38,11 @@ | |||
38 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 38 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
39 | unsigned long floor, unsigned long ceiling); | 39 | unsigned long floor, unsigned long ceiling); |
40 | 40 | ||
41 | void unmap_page_range(struct mmu_gather *tlb, | ||
42 | struct vm_area_struct *vma, | ||
43 | unsigned long addr, unsigned long end, | ||
44 | struct zap_details *details); | ||
45 | |||
41 | extern int __do_page_cache_readahead(struct address_space *mapping, | 46 | extern int __do_page_cache_readahead(struct address_space *mapping, |
42 | struct file *filp, pgoff_t offset, unsigned long nr_to_read, | 47 | struct file *filp, pgoff_t offset, unsigned long nr_to_read, |
43 | unsigned long lookahead_size); | 48 | unsigned long lookahead_size); |
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1ad20ade8c91..acb3b6c4dd89 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
@@ -17,7 +17,9 @@ | |||
17 | #define DISABLE_BRANCH_PROFILING | 17 | #define DISABLE_BRANCH_PROFILING |
18 | 18 | ||
19 | #include <linux/export.h> | 19 | #include <linux/export.h> |
20 | #include <linux/interrupt.h> | ||
20 | #include <linux/init.h> | 21 | #include <linux/init.h> |
22 | #include <linux/kasan.h> | ||
21 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
22 | #include <linux/kmemleak.h> | 24 | #include <linux/kmemleak.h> |
23 | #include <linux/linkage.h> | 25 | #include <linux/linkage.h> |
@@ -32,7 +34,6 @@ | |||
32 | #include <linux/string.h> | 34 | #include <linux/string.h> |
33 | #include <linux/types.h> | 35 | #include <linux/types.h> |
34 | #include <linux/vmalloc.h> | 36 | #include <linux/vmalloc.h> |
35 | #include <linux/kasan.h> | ||
36 | 37 | ||
37 | #include "kasan.h" | 38 | #include "kasan.h" |
38 | #include "../slab.h" | 39 | #include "../slab.h" |
@@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order) | |||
334 | KASAN_FREE_PAGE); | 335 | KASAN_FREE_PAGE); |
335 | } | 336 | } |
336 | 337 | ||
338 | #ifdef CONFIG_SLAB | ||
339 | /* | ||
340 | * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. | ||
341 | * For larger allocations larger redzones are used. | ||
342 | */ | ||
343 | static size_t optimal_redzone(size_t object_size) | ||
344 | { | ||
345 | int rz = | ||
346 | object_size <= 64 - 16 ? 16 : | ||
347 | object_size <= 128 - 32 ? 32 : | ||
348 | object_size <= 512 - 64 ? 64 : | ||
349 | object_size <= 4096 - 128 ? 128 : | ||
350 | object_size <= (1 << 14) - 256 ? 256 : | ||
351 | object_size <= (1 << 15) - 512 ? 512 : | ||
352 | object_size <= (1 << 16) - 1024 ? 1024 : 2048; | ||
353 | return rz; | ||
354 | } | ||
355 | |||
356 | void kasan_cache_create(struct kmem_cache *cache, size_t *size, | ||
357 | unsigned long *flags) | ||
358 | { | ||
359 | int redzone_adjust; | ||
360 | /* Make sure the adjusted size is still less than | ||
361 | * KMALLOC_MAX_CACHE_SIZE. | ||
362 | * TODO: this check is only useful for SLAB, but not SLUB. We'll need | ||
363 | * to skip it for SLUB when it starts using kasan_cache_create(). | ||
364 | */ | ||
365 | if (*size > KMALLOC_MAX_CACHE_SIZE - | ||
366 | sizeof(struct kasan_alloc_meta) - | ||
367 | sizeof(struct kasan_free_meta)) | ||
368 | return; | ||
369 | *flags |= SLAB_KASAN; | ||
370 | /* Add alloc meta. */ | ||
371 | cache->kasan_info.alloc_meta_offset = *size; | ||
372 | *size += sizeof(struct kasan_alloc_meta); | ||
373 | |||
374 | /* Add free meta. */ | ||
375 | if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || | ||
376 | cache->object_size < sizeof(struct kasan_free_meta)) { | ||
377 | cache->kasan_info.free_meta_offset = *size; | ||
378 | *size += sizeof(struct kasan_free_meta); | ||
379 | } | ||
380 | redzone_adjust = optimal_redzone(cache->object_size) - | ||
381 | (*size - cache->object_size); | ||
382 | if (redzone_adjust > 0) | ||
383 | *size += redzone_adjust; | ||
384 | *size = min(KMALLOC_MAX_CACHE_SIZE, | ||
385 | max(*size, | ||
386 | cache->object_size + | ||
387 | optimal_redzone(cache->object_size))); | ||
388 | } | ||
389 | #endif | ||
390 | |||
337 | void kasan_poison_slab(struct page *page) | 391 | void kasan_poison_slab(struct page *page) |
338 | { | 392 | { |
339 | kasan_poison_shadow(page_address(page), | 393 | kasan_poison_shadow(page_address(page), |
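
The ternary chain in optimal_redzone() above is pure arithmetic, so it can be exercised outside the kernel. A minimal userspace sketch, assuming nothing beyond the patch's own formula (the sample sizes in main() are illustrative):

#include <stdio.h>

/* Mirrors the adaptive redzone policy from the hunk above. */
static size_t optimal_redzone(size_t object_size)
{
        return object_size <= 64 - 16 ? 16 :
               object_size <= 128 - 32 ? 32 :
               object_size <= 512 - 64 ? 64 :
               object_size <= 4096 - 128 ? 128 :
               object_size <= (1 << 14) - 256 ? 256 :
               object_size <= (1 << 15) - 512 ? 512 :
               object_size <= (1 << 16) - 1024 ? 1024 : 2048;
}

int main(void)
{
        size_t sizes[] = { 16, 48, 96, 448, 4000, 65000 };
        size_t i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("object %6zu -> redzone %4zu\n",
                       sizes[i], optimal_redzone(sizes[i]));
        return 0;
}

Each threshold is chosen so that object plus redzone still packs neatly into a power-of-two slot, which is why the boundaries sit at 64-16, 128-32, and so on.
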
@@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) | |||
351 | kasan_poison_shadow(object, | 405 | kasan_poison_shadow(object, |
352 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), | 406 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), |
353 | KASAN_KMALLOC_REDZONE); | 407 | KASAN_KMALLOC_REDZONE); |
408 | #ifdef CONFIG_SLAB | ||
409 | if (cache->flags & SLAB_KASAN) { | ||
410 | struct kasan_alloc_meta *alloc_info = | ||
411 | get_alloc_info(cache, object); | ||
412 | alloc_info->state = KASAN_STATE_INIT; | ||
413 | } | ||
414 | #endif | ||
354 | } | 415 | } |
355 | 416 | ||
356 | void kasan_slab_alloc(struct kmem_cache *cache, void *object) | 417 | #ifdef CONFIG_SLAB |
418 | static inline int in_irqentry_text(unsigned long ptr) | ||
357 | { | 419 | { |
358 | kasan_kmalloc(cache, object, cache->object_size); | 420 | return (ptr >= (unsigned long)&__irqentry_text_start && |
421 | ptr < (unsigned long)&__irqentry_text_end) || | ||
422 | (ptr >= (unsigned long)&__softirqentry_text_start && | ||
423 | ptr < (unsigned long)&__softirqentry_text_end); | ||
424 | } | ||
425 | |||
426 | static inline void filter_irq_stacks(struct stack_trace *trace) | ||
427 | { | ||
428 | int i; | ||
429 | |||
430 | if (!trace->nr_entries) | ||
431 | return; | ||
432 | for (i = 0; i < trace->nr_entries; i++) | ||
433 | if (in_irqentry_text(trace->entries[i])) { | ||
434 | /* Include the irqentry function into the stack. */ | ||
435 | trace->nr_entries = i + 1; | ||
436 | break; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | static inline depot_stack_handle_t save_stack(gfp_t flags) | ||
441 | { | ||
442 | unsigned long entries[KASAN_STACK_DEPTH]; | ||
443 | struct stack_trace trace = { | ||
444 | .nr_entries = 0, | ||
445 | .entries = entries, | ||
446 | .max_entries = KASAN_STACK_DEPTH, | ||
447 | .skip = 0 | ||
448 | }; | ||
449 | |||
450 | save_stack_trace(&trace); | ||
451 | filter_irq_stacks(&trace); | ||
452 | if (trace.nr_entries != 0 && | ||
453 | trace.entries[trace.nr_entries-1] == ULONG_MAX) | ||
454 | trace.nr_entries--; | ||
455 | |||
456 | return depot_save_stack(&trace, flags); | ||
457 | } | ||
458 | |||
459 | static inline void set_track(struct kasan_track *track, gfp_t flags) | ||
460 | { | ||
461 | track->pid = current->pid; | ||
462 | track->stack = save_stack(flags); | ||
463 | } | ||
464 | |||
465 | struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, | ||
466 | const void *object) | ||
467 | { | ||
468 | BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); | ||
469 | return (void *)object + cache->kasan_info.alloc_meta_offset; | ||
470 | } | ||
471 | |||
472 | struct kasan_free_meta *get_free_info(struct kmem_cache *cache, | ||
473 | const void *object) | ||
474 | { | ||
475 | BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); | ||
476 | return (void *)object + cache->kasan_info.free_meta_offset; | ||
477 | } | ||
478 | #endif | ||
479 | |||
480 | void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) | ||
481 | { | ||
482 | kasan_kmalloc(cache, object, cache->object_size, flags); | ||
359 | } | 483 | } |
360 | 484 | ||
361 | void kasan_slab_free(struct kmem_cache *cache, void *object) | 485 | void kasan_slab_free(struct kmem_cache *cache, void *object) |
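
filter_irq_stacks() above exists so that traces taken inside hard/soft IRQ handlers all collapse to the interrupt entry frame instead of keying stackdepot on whatever code happened to be interrupted. The trimming logic is self-contained; a userspace sketch, with made-up address ranges standing in for the __irqentry_text_start/_end section bounds:

#include <stdio.h>

struct stack_trace {
        unsigned int nr_entries;
        unsigned long *entries;
};

static const unsigned long irqentry_start = 0x1000, irqentry_end = 0x2000;

static int in_irqentry_text(unsigned long ptr)
{
        return ptr >= irqentry_start && ptr < irqentry_end;
}

static void filter_irq_stacks(struct stack_trace *trace)
{
        unsigned int i;

        for (i = 0; i < trace->nr_entries; i++)
                if (in_irqentry_text(trace->entries[i])) {
                        /* Keep the irqentry frame itself, drop callers below it. */
                        trace->nr_entries = i + 1;
                        break;
                }
}

int main(void)
{
        unsigned long pcs[] = { 0x4000, 0x4100, 0x1800, 0x9000, 0x9100 };
        struct stack_trace trace = { 5, pcs };

        filter_irq_stacks(&trace);
        printf("kept %u of 5 frames\n", trace.nr_entries);  /* kept 3 */
        return 0;
}
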
@@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) | |||
367 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | 491 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) |
368 | return; | 492 | return; |
369 | 493 | ||
494 | #ifdef CONFIG_SLAB | ||
495 | if (cache->flags & SLAB_KASAN) { | ||
496 | struct kasan_free_meta *free_info = | ||
497 | get_free_info(cache, object); | ||
498 | struct kasan_alloc_meta *alloc_info = | ||
499 | get_alloc_info(cache, object); | ||
500 | alloc_info->state = KASAN_STATE_FREE; | ||
501 | set_track(&free_info->track, GFP_NOWAIT); | ||
502 | } | ||
503 | #endif | ||
504 | |||
370 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); | 505 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); |
371 | } | 506 | } |
372 | 507 | ||
373 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) | 508 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, |
509 | gfp_t flags) | ||
374 | { | 510 | { |
375 | unsigned long redzone_start; | 511 | unsigned long redzone_start; |
376 | unsigned long redzone_end; | 512 | unsigned long redzone_end; |
@@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) | |||
386 | kasan_unpoison_shadow(object, size); | 522 | kasan_unpoison_shadow(object, size); |
387 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | 523 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, |
388 | KASAN_KMALLOC_REDZONE); | 524 | KASAN_KMALLOC_REDZONE); |
525 | #ifdef CONFIG_SLAB | ||
526 | if (cache->flags & SLAB_KASAN) { | ||
527 | struct kasan_alloc_meta *alloc_info = | ||
528 | get_alloc_info(cache, object); | ||
529 | |||
530 | alloc_info->state = KASAN_STATE_ALLOC; | ||
531 | alloc_info->alloc_size = size; | ||
532 | set_track(&alloc_info->track, flags); | ||
533 | } | ||
534 | #endif | ||
389 | } | 535 | } |
390 | EXPORT_SYMBOL(kasan_kmalloc); | 536 | EXPORT_SYMBOL(kasan_kmalloc); |
391 | 537 | ||
392 | void kasan_kmalloc_large(const void *ptr, size_t size) | 538 | void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) |
393 | { | 539 | { |
394 | struct page *page; | 540 | struct page *page; |
395 | unsigned long redzone_start; | 541 | unsigned long redzone_start; |
@@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size) | |||
408 | KASAN_PAGE_REDZONE); | 554 | KASAN_PAGE_REDZONE); |
409 | } | 555 | } |
410 | 556 | ||
411 | void kasan_krealloc(const void *object, size_t size) | 557 | void kasan_krealloc(const void *object, size_t size, gfp_t flags) |
412 | { | 558 | { |
413 | struct page *page; | 559 | struct page *page; |
414 | 560 | ||
@@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size) | |||
418 | page = virt_to_head_page(object); | 564 | page = virt_to_head_page(object); |
419 | 565 | ||
420 | if (unlikely(!PageSlab(page))) | 566 | if (unlikely(!PageSlab(page))) |
421 | kasan_kmalloc_large(object, size); | 567 | kasan_kmalloc_large(object, size, flags); |
422 | else | 568 | else |
423 | kasan_kmalloc(page->slab_cache, object, size); | 569 | kasan_kmalloc(page->slab_cache, object, size, flags); |
424 | } | 570 | } |
425 | 571 | ||
426 | void kasan_kfree(void *ptr) | 572 | void kasan_kfree(void *ptr) |
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4f6c62e5c21e..30a2f0ba0e09 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define __MM_KASAN_KASAN_H | 2 | #define __MM_KASAN_KASAN_H |
3 | 3 | ||
4 | #include <linux/kasan.h> | 4 | #include <linux/kasan.h> |
5 | #include <linux/stackdepot.h> | ||
5 | 6 | ||
6 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) | 7 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) |
7 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) | 8 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) |
@@ -54,6 +55,42 @@ struct kasan_global { | |||
54 | #endif | 55 | #endif |
55 | }; | 56 | }; |
56 | 57 | ||
58 | /** | ||
59 | * Structures to keep alloc and free tracks * | ||
60 | */ | ||
61 | |||
62 | enum kasan_state { | ||
63 | KASAN_STATE_INIT, | ||
64 | KASAN_STATE_ALLOC, | ||
65 | KASAN_STATE_FREE | ||
66 | }; | ||
67 | |||
68 | #define KASAN_STACK_DEPTH 64 | ||
69 | |||
70 | struct kasan_track { | ||
71 | u32 pid; | ||
72 | depot_stack_handle_t stack; | ||
73 | }; | ||
74 | |||
75 | struct kasan_alloc_meta { | ||
76 | struct kasan_track track; | ||
77 | u32 state : 2; /* enum kasan_state */ | ||
78 | u32 alloc_size : 30; | ||
79 | u32 reserved; | ||
80 | }; | ||
81 | |||
82 | struct kasan_free_meta { | ||
83 | /* Allocator freelist pointer, unused by KASAN. */ | ||
84 | void **freelist; | ||
85 | struct kasan_track track; | ||
86 | }; | ||
87 | |||
88 | struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, | ||
89 | const void *object); | ||
90 | struct kasan_free_meta *get_free_info(struct kmem_cache *cache, | ||
91 | const void *object); | ||
92 | |||
93 | |||
57 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | 94 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) |
58 | { | 95 | { |
59 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) | 96 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) |
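
Taken together with kasan_cache_create() earlier in this patch, these structures sit after the object inside each slab slot. A standalone sketch of the resulting offsets, with stubbed typedefs and a hypothetical 96-byte cache that has a constructor (so the free meta cannot overlap the object):

#include <stddef.h>
#include <stdio.h>

typedef unsigned int u32;
typedef u32 depot_stack_handle_t;

struct kasan_track {
        u32 pid;
        depot_stack_handle_t stack;
};

struct kasan_alloc_meta {
        struct kasan_track track;
        u32 state : 2;          /* enum kasan_state */
        u32 alloc_size : 30;
        u32 reserved;
};

struct kasan_free_meta {
        void **freelist;        /* allocator freelist pointer */
        struct kasan_track track;
};

int main(void)
{
        size_t object_size = 96;        /* hypothetical cache */
        size_t alloc_off = object_size;
        size_t free_off = alloc_off + sizeof(struct kasan_alloc_meta);
        size_t end = free_off + sizeof(struct kasan_free_meta);

        printf("alloc meta at %zu, free meta at %zu, meta ends at %zu\n",
               alloc_off, free_off, end);
        return 0;
}

On a 64-bit build both structures come in well under the 32-byte ceiling that the BUILD_BUG_ONs in get_alloc_info()/get_free_info() enforce.
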
diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 745aa8f36028..60869a5a0124 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/printk.h> | 18 | #include <linux/printk.h> |
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/stackdepot.h> | ||
21 | #include <linux/stacktrace.h> | 22 | #include <linux/stacktrace.h> |
22 | #include <linux/string.h> | 23 | #include <linux/string.h> |
23 | #include <linux/types.h> | 24 | #include <linux/types.h> |
@@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr) | |||
115 | sizeof(init_thread_union.stack)); | 116 | sizeof(init_thread_union.stack)); |
116 | } | 117 | } |
117 | 118 | ||
119 | #ifdef CONFIG_SLAB | ||
120 | static void print_track(struct kasan_track *track) | ||
121 | { | ||
122 | pr_err("PID = %u\n", track->pid); | ||
123 | if (track->stack) { | ||
124 | struct stack_trace trace; | ||
125 | |||
126 | depot_fetch_stack(track->stack, &trace); | ||
127 | print_stack_trace(&trace, 0); | ||
128 | } else { | ||
129 | pr_err("(stack is not available)\n"); | ||
130 | } | ||
131 | } | ||
132 | |||
133 | static void object_err(struct kmem_cache *cache, struct page *page, | ||
134 | void *object, char *unused_reason) | ||
135 | { | ||
136 | struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); | ||
137 | struct kasan_free_meta *free_info; | ||
138 | |||
139 | dump_stack(); | ||
140 | pr_err("Object at %p, in cache %s\n", object, cache->name); | ||
141 | if (!(cache->flags & SLAB_KASAN)) | ||
142 | return; | ||
143 | switch (alloc_info->state) { | ||
144 | case KASAN_STATE_INIT: | ||
145 | pr_err("Object not allocated yet\n"); | ||
146 | break; | ||
147 | case KASAN_STATE_ALLOC: | ||
148 | pr_err("Object allocated with size %u bytes.\n", | ||
149 | alloc_info->alloc_size); | ||
150 | pr_err("Allocation:\n"); | ||
151 | print_track(&alloc_info->track); | ||
152 | break; | ||
153 | case KASAN_STATE_FREE: | ||
154 | pr_err("Object freed, allocated with size %u bytes\n", | ||
155 | alloc_info->alloc_size); | ||
156 | free_info = get_free_info(cache, object); | ||
157 | pr_err("Allocation:\n"); | ||
158 | print_track(&alloc_info->track); | ||
159 | pr_err("Deallocation:\n"); | ||
160 | print_track(&free_info->track); | ||
161 | break; | ||
162 | } | ||
163 | } | ||
164 | #endif | ||
165 | |||
118 | static void print_address_description(struct kasan_access_info *info) | 166 | static void print_address_description(struct kasan_access_info *info) |
119 | { | 167 | { |
120 | const void *addr = info->access_addr; | 168 | const void *addr = info->access_addr; |
@@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info) | |||
126 | if (PageSlab(page)) { | 174 | if (PageSlab(page)) { |
127 | void *object; | 175 | void *object; |
128 | struct kmem_cache *cache = page->slab_cache; | 176 | struct kmem_cache *cache = page->slab_cache; |
129 | void *last_object; | 177 | object = nearest_obj(cache, page, |
130 | 178 | (void *)info->access_addr); | |
131 | object = virt_to_obj(cache, page_address(page), addr); | ||
132 | last_object = page_address(page) + | ||
133 | page->objects * cache->size; | ||
134 | |||
135 | if (unlikely(object > last_object)) | ||
136 | object = last_object; /* we hit into padding */ | ||
137 | |||
138 | object_err(cache, page, object, | 179 | object_err(cache, page, object, |
139 | "kasan: bad access detected"); | 180 | "kasan: bad access detected"); |
140 | return; | 181 | return; |
141 | } | 182 | } |
142 | dump_page(page, "kasan: bad access detected"); | 183 | dump_page(page, "kasan: bad access detected"); |
@@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info) | |||
146 | if (!init_task_stack_addr(addr)) | 187 | if (!init_task_stack_addr(addr)) |
147 | pr_err("Address belongs to variable %pS\n", addr); | 188 | pr_err("Address belongs to variable %pS\n", addr); |
148 | } | 189 | } |
149 | |||
150 | dump_stack(); | 190 | dump_stack(); |
151 | } | 191 | } |
152 | 192 | ||
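
The lines removed from print_address_description() rounded an arbitrary access address down to an object boundary and clamped it into the slab; nearest_obj() now wraps the same idea for both allocators. A standalone sketch of that rounding and clamping, with made-up base address and sizes:

#include <stdio.h>

static unsigned long nearest_obj_sketch(unsigned long page_base,
                                        unsigned long obj_size,
                                        unsigned long nr_objs,
                                        unsigned long addr)
{
        unsigned long last = page_base + (nr_objs - 1) * obj_size;
        unsigned long obj = page_base +
                            (addr - page_base) / obj_size * obj_size;

        return obj > last ? last : obj; /* the access hit slab padding */
}

int main(void)
{
        unsigned long base = 0x10000;   /* 4096-byte slab of 96-byte objects */

        printf("%#lx\n", nearest_obj_sketch(base, 96, 42, base + 100));
        printf("%#lx\n", nearest_obj_sketch(base, 96, 42, base + 4090));
        return 0;
}
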
diff --git a/mm/memory.c b/mm/memory.c index 81dca0083fcd..098f00d05461 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1102,6 +1102,12 @@ again: | |||
1102 | 1102 | ||
1103 | if (!PageAnon(page)) { | 1103 | if (!PageAnon(page)) { |
1104 | if (pte_dirty(ptent)) { | 1104 | if (pte_dirty(ptent)) { |
1105 | /* | ||
1106 | * oom_reaper cannot tear down dirty | ||
1107 | * pages | ||
1108 | */ | ||
1109 | if (unlikely(details && details->ignore_dirty)) | ||
1110 | continue; | ||
1105 | force_flush = 1; | 1111 | force_flush = 1; |
1106 | set_page_dirty(page); | 1112 | set_page_dirty(page); |
1107 | } | 1113 | } |
@@ -1120,8 +1126,8 @@ again: | |||
1120 | } | 1126 | } |
1121 | continue; | 1127 | continue; |
1122 | } | 1128 | } |
1123 | /* If details->check_mapping, we leave swap entries. */ | 1129 | /* only check swap_entries if explicitly asked for in details */ |
1124 | if (unlikely(details)) | 1130 | if (unlikely(details && !details->check_swap_entries)) |
1125 | continue; | 1131 | continue; |
1126 | 1132 | ||
1127 | entry = pte_to_swp_entry(ptent); | 1133 | entry = pte_to_swp_entry(ptent); |
@@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | |||
1226 | return addr; | 1232 | return addr; |
1227 | } | 1233 | } |
1228 | 1234 | ||
1229 | static void unmap_page_range(struct mmu_gather *tlb, | 1235 | void unmap_page_range(struct mmu_gather *tlb, |
1230 | struct vm_area_struct *vma, | 1236 | struct vm_area_struct *vma, |
1231 | unsigned long addr, unsigned long end, | 1237 | unsigned long addr, unsigned long end, |
1232 | struct zap_details *details) | 1238 | struct zap_details *details) |
@@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb, | |||
1234 | pgd_t *pgd; | 1240 | pgd_t *pgd; |
1235 | unsigned long next; | 1241 | unsigned long next; |
1236 | 1242 | ||
1237 | if (details && !details->check_mapping) | ||
1238 | details = NULL; | ||
1239 | |||
1240 | BUG_ON(addr >= end); | 1243 | BUG_ON(addr >= end); |
1241 | tlb_start_vma(tlb, vma); | 1244 | tlb_start_vma(tlb, vma); |
1242 | pgd = pgd_offset(vma->vm_mm, addr); | 1245 | pgd = pgd_offset(vma->vm_mm, addr); |
@@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, | |||
2432 | void unmap_mapping_range(struct address_space *mapping, | 2435 | void unmap_mapping_range(struct address_space *mapping, |
2433 | loff_t const holebegin, loff_t const holelen, int even_cows) | 2436 | loff_t const holebegin, loff_t const holelen, int even_cows) |
2434 | { | 2437 | { |
2435 | struct zap_details details; | 2438 | struct zap_details details = { }; |
2436 | pgoff_t hba = holebegin >> PAGE_SHIFT; | 2439 | pgoff_t hba = holebegin >> PAGE_SHIFT; |
2437 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2440 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
2438 | 2441 | ||
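
The unmap_mapping_range() change is easy to miss but matters: zap_details just grew two flags (ignore_dirty, check_swap_entries) that ordinary callers never set, so an uninitialized struct on the stack could carry stack garbage into them. The empty initializer zeroes every member. A minimal sketch with a stand-in struct (field set reduced for brevity):

#include <stdbool.h>
#include <stdio.h>

struct zap_details_sketch {
        void *check_mapping;
        unsigned long first_index;
        unsigned long last_index;
        bool ignore_dirty;              /* new in this patch */
        bool check_swap_entries;        /* new in this patch */
};

int main(void)
{
        /* The empty initializer zeroes every member, so callers that
         * predate the new flags keep their old behaviour... */
        struct zap_details_sketch details = { };

        /* ...while the OOM reaper opts in explicitly. */
        struct zap_details_sketch reaper = {
                .ignore_dirty = true,
                .check_swap_entries = true,
        };

        printf("default: %d %d, reaper: %d %d\n",
               details.ignore_dirty, details.check_swap_entries,
               reaper.ignore_dirty, reaper.check_swap_entries);
        return 0;
}
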
diff --git a/mm/mempool.c b/mm/mempool.c index 07c383ddbbab..9b7a14a791cc 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element) | |||
112 | kasan_free_pages(element, (unsigned long)pool->pool_data); | 112 | kasan_free_pages(element, (unsigned long)pool->pool_data); |
113 | } | 113 | } |
114 | 114 | ||
115 | static void kasan_unpoison_element(mempool_t *pool, void *element) | 115 | static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) |
116 | { | 116 | { |
117 | if (pool->alloc == mempool_alloc_slab) | 117 | if (pool->alloc == mempool_alloc_slab) |
118 | kasan_slab_alloc(pool->pool_data, element); | 118 | kasan_slab_alloc(pool->pool_data, element, flags); |
119 | if (pool->alloc == mempool_kmalloc) | 119 | if (pool->alloc == mempool_kmalloc) |
120 | kasan_krealloc(element, (size_t)pool->pool_data); | 120 | kasan_krealloc(element, (size_t)pool->pool_data, flags); |
121 | if (pool->alloc == mempool_alloc_pages) | 121 | if (pool->alloc == mempool_alloc_pages) |
122 | kasan_alloc_pages(element, (unsigned long)pool->pool_data); | 122 | kasan_alloc_pages(element, (unsigned long)pool->pool_data); |
123 | } | 123 | } |
@@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element) | |||
130 | pool->elements[pool->curr_nr++] = element; | 130 | pool->elements[pool->curr_nr++] = element; |
131 | } | 131 | } |
132 | 132 | ||
133 | static void *remove_element(mempool_t *pool) | 133 | static void *remove_element(mempool_t *pool, gfp_t flags) |
134 | { | 134 | { |
135 | void *element = pool->elements[--pool->curr_nr]; | 135 | void *element = pool->elements[--pool->curr_nr]; |
136 | 136 | ||
137 | BUG_ON(pool->curr_nr < 0); | 137 | BUG_ON(pool->curr_nr < 0); |
138 | kasan_unpoison_element(pool, element); | 138 | kasan_unpoison_element(pool, element, flags); |
139 | check_element(pool, element); | 139 | check_element(pool, element); |
140 | return element; | 140 | return element; |
141 | } | 141 | } |
@@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool) | |||
154 | return; | 154 | return; |
155 | 155 | ||
156 | while (pool->curr_nr) { | 156 | while (pool->curr_nr) { |
157 | void *element = remove_element(pool); | 157 | void *element = remove_element(pool, GFP_KERNEL); |
158 | pool->free(element, pool->pool_data); | 158 | pool->free(element, pool->pool_data); |
159 | } | 159 | } |
160 | kfree(pool->elements); | 160 | kfree(pool->elements); |
@@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) | |||
250 | spin_lock_irqsave(&pool->lock, flags); | 250 | spin_lock_irqsave(&pool->lock, flags); |
251 | if (new_min_nr <= pool->min_nr) { | 251 | if (new_min_nr <= pool->min_nr) { |
252 | while (new_min_nr < pool->curr_nr) { | 252 | while (new_min_nr < pool->curr_nr) { |
253 | element = remove_element(pool); | 253 | element = remove_element(pool, GFP_KERNEL); |
254 | spin_unlock_irqrestore(&pool->lock, flags); | 254 | spin_unlock_irqrestore(&pool->lock, flags); |
255 | pool->free(element, pool->pool_data); | 255 | pool->free(element, pool->pool_data); |
256 | spin_lock_irqsave(&pool->lock, flags); | 256 | spin_lock_irqsave(&pool->lock, flags); |
@@ -347,7 +347,7 @@ repeat_alloc: | |||
347 | 347 | ||
348 | spin_lock_irqsave(&pool->lock, flags); | 348 | spin_lock_irqsave(&pool->lock, flags); |
349 | if (likely(pool->curr_nr)) { | 349 | if (likely(pool->curr_nr)) { |
350 | element = remove_element(pool); | 350 | element = remove_element(pool, gfp_temp); |
351 | spin_unlock_irqrestore(&pool->lock, flags); | 351 | spin_unlock_irqrestore(&pool->lock, flags); |
352 | /* paired with rmb in mempool_free(), read comment there */ | 352 | /* paired with rmb in mempool_free(), read comment there */ |
353 | smp_wmb(); | 353 | smp_wmb(); |
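
The mempool hooks above pick the matching KASAN call by comparing the pool's stored alloc callback against the known constructors, and as of this patch they also thread the caller's gfp mask through so the stack-saving allocation inside KASAN respects the allocation context (gfp_temp on the allocation path, GFP_KERNEL in the sleepable resize and destroy paths). A standalone sketch of that function-pointer dispatch pattern, with illustrative names and flag values:

#include <stdio.h>

typedef void *(*alloc_fn)(int gfp, void *pool_data);

static void *sketch_alloc_slab(int gfp, void *pd)  { (void)gfp; (void)pd; return NULL; }
static void *sketch_alloc_pages(int gfp, void *pd) { (void)gfp; (void)pd; return NULL; }

static void unpoison_element(alloc_fn alloc, int gfp)
{
        if (alloc == sketch_alloc_slab)
                printf("slab-backed pool: unpoison with gfp %#x\n", gfp);
        else if (alloc == sketch_alloc_pages)
                printf("page-backed pool: unpoison, no gfp needed\n");
}

int main(void)
{
        unpoison_element(sketch_alloc_slab, 0x20);      /* e.g. an atomic mask */
        unpoison_element(sketch_alloc_pages, 0);
        return 0;
}
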
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 06f7e1707847..b34d279a7ee6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -35,6 +35,11 @@ | |||
35 | #include <linux/freezer.h> | 35 | #include <linux/freezer.h> |
36 | #include <linux/ftrace.h> | 36 | #include <linux/ftrace.h> |
37 | #include <linux/ratelimit.h> | 37 | #include <linux/ratelimit.h> |
38 | #include <linux/kthread.h> | ||
39 | #include <linux/init.h> | ||
40 | |||
41 | #include <asm/tlb.h> | ||
42 | #include "internal.h" | ||
38 | 43 | ||
39 | #define CREATE_TRACE_POINTS | 44 | #define CREATE_TRACE_POINTS |
40 | #include <trace/events/oom.h> | 45 | #include <trace/events/oom.h> |
@@ -405,6 +410,172 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); | |||
405 | 410 | ||
406 | bool oom_killer_disabled __read_mostly; | 411 | bool oom_killer_disabled __read_mostly; |
407 | 412 | ||
413 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
414 | |||
415 | #ifdef CONFIG_MMU | ||
416 | /* | ||
417 | * OOM Reaper kernel thread which tries to reap the memory used by the OOM | ||
418 | * victim (if that is possible) to help the OOM killer to move on. | ||
419 | */ | ||
420 | static struct task_struct *oom_reaper_th; | ||
421 | static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); | ||
422 | static struct task_struct *oom_reaper_list; | ||
423 | static DEFINE_SPINLOCK(oom_reaper_lock); | ||
424 | |||
425 | |||
426 | static bool __oom_reap_task(struct task_struct *tsk) | ||
427 | { | ||
428 | struct mmu_gather tlb; | ||
429 | struct vm_area_struct *vma; | ||
430 | struct mm_struct *mm; | ||
431 | struct task_struct *p; | ||
432 | struct zap_details details = {.check_swap_entries = true, | ||
433 | .ignore_dirty = true}; | ||
434 | bool ret = true; | ||
435 | |||
436 | /* | ||
437 | * Make sure we find the associated mm_struct even when the particular | ||
438 | * thread has already terminated and cleared its mm. | ||
439 | * We might have race with exit path so consider our work done if there | ||
440 | * is no mm. | ||
441 | */ | ||
442 | p = find_lock_task_mm(tsk); | ||
443 | if (!p) | ||
444 | return true; | ||
445 | |||
446 | mm = p->mm; | ||
447 | if (!atomic_inc_not_zero(&mm->mm_users)) { | ||
448 | task_unlock(p); | ||
449 | return true; | ||
450 | } | ||
451 | |||
452 | task_unlock(p); | ||
453 | |||
454 | if (!down_read_trylock(&mm->mmap_sem)) { | ||
455 | ret = false; | ||
456 | goto out; | ||
457 | } | ||
458 | |||
459 | tlb_gather_mmu(&tlb, mm, 0, -1); | ||
460 | for (vma = mm->mmap ; vma; vma = vma->vm_next) { | ||
461 | if (is_vm_hugetlb_page(vma)) | ||
462 | continue; | ||
463 | |||
464 | /* | ||
465 | * mlocked VMAs require explicit munlocking before unmap. | ||
466 | * Let's keep it simple here and skip such VMAs. | ||
467 | */ | ||
468 | if (vma->vm_flags & VM_LOCKED) | ||
469 | continue; | ||
470 | |||
471 | /* | ||
472 | * Only anonymous pages have a good chance to be dropped | ||
473 | * without additional steps which we cannot afford as we | ||
474 | * are OOM already. | ||
475 | * | ||
476 | * We do not even care about fs backed pages because all | ||
477 | * which are reclaimable have already been reclaimed and | ||
478 | * we do not want to block exit_mmap by keeping mm ref | ||
479 | * count elevated without a good reason. | ||
480 | */ | ||
481 | if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) | ||
482 | unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, | ||
483 | &details); | ||
484 | } | ||
485 | tlb_finish_mmu(&tlb, 0, -1); | ||
486 | pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", | ||
487 | task_pid_nr(tsk), tsk->comm, | ||
488 | K(get_mm_counter(mm, MM_ANONPAGES)), | ||
489 | K(get_mm_counter(mm, MM_FILEPAGES)), | ||
490 | K(get_mm_counter(mm, MM_SHMEMPAGES))); | ||
491 | up_read(&mm->mmap_sem); | ||
492 | |||
493 | /* | ||
494 | * Clear TIF_MEMDIE because the task shouldn't be sitting on a | ||
495 | * reasonably reclaimable memory anymore. OOM killer can continue | ||
496 | * by selecting other victim if unmapping hasn't led to any | ||
497 | * improvements. This also means that selecting this task doesn't | ||
498 | * make any sense. | ||
499 | */ | ||
500 | tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; | ||
501 | exit_oom_victim(tsk); | ||
502 | out: | ||
503 | mmput(mm); | ||
504 | return ret; | ||
505 | } | ||
506 | |||
507 | #define MAX_OOM_REAP_RETRIES 10 | ||
508 | static void oom_reap_task(struct task_struct *tsk) | ||
509 | { | ||
510 | int attempts = 0; | ||
511 | |||
512 | /* Retry the down_read_trylock(mmap_sem) a few times */ | ||
513 | while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) | ||
514 | schedule_timeout_idle(HZ/10); | ||
515 | |||
516 | if (attempts > MAX_OOM_REAP_RETRIES) { | ||
517 | pr_info("oom_reaper: unable to reap pid:%d (%s)\n", | ||
518 | task_pid_nr(tsk), tsk->comm); | ||
519 | debug_show_all_locks(); | ||
520 | } | ||
521 | |||
522 | /* Drop a reference taken by wake_oom_reaper */ | ||
523 | put_task_struct(tsk); | ||
524 | } | ||
525 | |||
526 | static int oom_reaper(void *unused) | ||
527 | { | ||
528 | set_freezable(); | ||
529 | |||
530 | while (true) { | ||
531 | struct task_struct *tsk = NULL; | ||
532 | |||
533 | wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); | ||
534 | spin_lock(&oom_reaper_lock); | ||
535 | if (oom_reaper_list != NULL) { | ||
536 | tsk = oom_reaper_list; | ||
537 | oom_reaper_list = tsk->oom_reaper_list; | ||
538 | } | ||
539 | spin_unlock(&oom_reaper_lock); | ||
540 | |||
541 | if (tsk) | ||
542 | oom_reap_task(tsk); | ||
543 | } | ||
544 | |||
545 | return 0; | ||
546 | } | ||
547 | |||
548 | static void wake_oom_reaper(struct task_struct *tsk) | ||
549 | { | ||
550 | if (!oom_reaper_th || tsk->oom_reaper_list) | ||
551 | return; | ||
552 | |||
553 | get_task_struct(tsk); | ||
554 | |||
555 | spin_lock(&oom_reaper_lock); | ||
556 | tsk->oom_reaper_list = oom_reaper_list; | ||
557 | oom_reaper_list = tsk; | ||
558 | spin_unlock(&oom_reaper_lock); | ||
559 | wake_up(&oom_reaper_wait); | ||
560 | } | ||
561 | |||
562 | static int __init oom_init(void) | ||
563 | { | ||
564 | oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); | ||
565 | if (IS_ERR(oom_reaper_th)) { | ||
566 | pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", | ||
567 | PTR_ERR(oom_reaper_th)); | ||
568 | oom_reaper_th = NULL; | ||
569 | } | ||
570 | return 0; | ||
571 | } | ||
572 | subsys_initcall(oom_init) | ||
573 | #else | ||
574 | static void wake_oom_reaper(struct task_struct *tsk) | ||
575 | { | ||
576 | } | ||
577 | #endif | ||
578 | |||
408 | /** | 579 | /** |
409 | * mark_oom_victim - mark the given task as OOM victim | 580 | * mark_oom_victim - mark the given task as OOM victim |
410 | * @tsk: task to mark | 581 | * @tsk: task to mark |
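
The queue connecting wake_oom_reaper() to the kthread above is an intrusive LIFO: victims chain through tsk->oom_reaper_list under oom_reaper_lock and the reaper pops one task per wakeup. A minimal userspace sketch of that shape, with a pthread mutex standing in for the spinlock and a stand-in task struct:

#include <pthread.h>
#include <stdio.h>

struct task {
        const char *comm;
        struct task *oom_reaper_list;
};

static struct task *reaper_list;
static pthread_mutex_t reaper_lock = PTHREAD_MUTEX_INITIALIZER;

static void wake_reaper(struct task *tsk)
{
        if (tsk->oom_reaper_list)
                return;                 /* already queued */

        pthread_mutex_lock(&reaper_lock);
        tsk->oom_reaper_list = reaper_list;
        reaper_list = tsk;
        pthread_mutex_unlock(&reaper_lock);
}

static struct task *pop_victim(void)
{
        struct task *tsk;

        pthread_mutex_lock(&reaper_lock);
        tsk = reaper_list;
        if (tsk)
                reaper_list = tsk->oom_reaper_list;
        pthread_mutex_unlock(&reaper_lock);
        return tsk;
}

int main(void)
{
        struct task a = { "a", NULL }, b = { "b", NULL };
        struct task *t;

        wake_reaper(&a);
        wake_reaper(&b);
        while ((t = pop_victim()))
                printf("reaping %s\n", t->comm);        /* b first: LIFO */
        return 0;
}

In the kernel version the get_task_struct()/put_task_struct() pair, not the list linkage, is what keeps each queued task valid until the reaper is done with it.
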
@@ -431,9 +602,10 @@ void mark_oom_victim(struct task_struct *tsk) | |||
431 | /** | 602 | /** |
432 | * exit_oom_victim - note the exit of an OOM victim | 603 | * exit_oom_victim - note the exit of an OOM victim |
433 | */ | 604 | */ |
434 | void exit_oom_victim(void) | 605 | void exit_oom_victim(struct task_struct *tsk) |
435 | { | 606 | { |
436 | clear_thread_flag(TIF_MEMDIE); | 607 | if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) |
608 | return; | ||
437 | 609 | ||
438 | if (!atomic_dec_return(&oom_victims)) | 610 | if (!atomic_dec_return(&oom_victims)) |
439 | wake_up_all(&oom_victims_wait); | 611 | wake_up_all(&oom_victims_wait); |
@@ -494,7 +666,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) | |||
494 | return false; | 666 | return false; |
495 | } | 667 | } |
496 | 668 | ||
497 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
498 | /* | 669 | /* |
499 | * Must be called while holding a reference to p, which will be released upon | 670 | * Must be called while holding a reference to p, which will be released upon |
500 | * returning. | 671 | * returning. |
@@ -510,6 +681,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
510 | unsigned int victim_points = 0; | 681 | unsigned int victim_points = 0; |
511 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 682 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
512 | DEFAULT_RATELIMIT_BURST); | 683 | DEFAULT_RATELIMIT_BURST); |
684 | bool can_oom_reap = true; | ||
513 | 685 | ||
514 | /* | 686 | /* |
515 | * If the task is already exiting, don't alarm the sysadmin or kill | 687 | * If the task is already exiting, don't alarm the sysadmin or kill |
@@ -600,17 +772,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
600 | continue; | 772 | continue; |
601 | if (same_thread_group(p, victim)) | 773 | if (same_thread_group(p, victim)) |
602 | continue; | 774 | continue; |
603 | if (unlikely(p->flags & PF_KTHREAD)) | 775 | if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || |
604 | continue; | 776 | p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { |
605 | if (is_global_init(p)) | 777 | /* |
606 | continue; | 778 | * We cannot use oom_reaper for the mm shared by this |
607 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | 779 | * process because it wouldn't get killed and so the |
780 | * memory might be still used. | ||
781 | */ | ||
782 | can_oom_reap = false; | ||
608 | continue; | 783 | continue; |
609 | 784 | } | |
610 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 785 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
611 | } | 786 | } |
612 | rcu_read_unlock(); | 787 | rcu_read_unlock(); |
613 | 788 | ||
789 | if (can_oom_reap) | ||
790 | wake_oom_reaper(victim); | ||
791 | |||
614 | mmdrop(mm); | 792 | mmdrop(mm); |
615 | put_task_struct(victim); | 793 | put_task_struct(victim); |
616 | } | 794 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a762be57e46e..59de90d5d3a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -692,34 +692,28 @@ static inline void __free_one_page(struct page *page, | |||
692 | unsigned long combined_idx; | 692 | unsigned long combined_idx; |
693 | unsigned long uninitialized_var(buddy_idx); | 693 | unsigned long uninitialized_var(buddy_idx); |
694 | struct page *buddy; | 694 | struct page *buddy; |
695 | unsigned int max_order = MAX_ORDER; | 695 | unsigned int max_order; |
696 | |||
697 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | ||
696 | 698 | ||
697 | VM_BUG_ON(!zone_is_initialized(zone)); | 699 | VM_BUG_ON(!zone_is_initialized(zone)); |
698 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | 700 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); |
699 | 701 | ||
700 | VM_BUG_ON(migratetype == -1); | 702 | VM_BUG_ON(migratetype == -1); |
701 | if (is_migrate_isolate(migratetype)) { | 703 | if (likely(!is_migrate_isolate(migratetype))) |
702 | /* | ||
703 | * We restrict max order of merging to prevent merge | ||
704 | * between freepages on isolate pageblock and normal | ||
705 | * pageblock. Without this, pageblock isolation | ||
706 | * could cause incorrect freepage accounting. | ||
707 | */ | ||
708 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | ||
709 | } else { | ||
710 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 704 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
711 | } | ||
712 | 705 | ||
713 | page_idx = pfn & ((1 << max_order) - 1); | 706 | page_idx = pfn & ((1 << MAX_ORDER) - 1); |
714 | 707 | ||
715 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); | 708 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
716 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 709 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
717 | 710 | ||
711 | continue_merging: | ||
718 | while (order < max_order - 1) { | 712 | while (order < max_order - 1) { |
719 | buddy_idx = __find_buddy_index(page_idx, order); | 713 | buddy_idx = __find_buddy_index(page_idx, order); |
720 | buddy = page + (buddy_idx - page_idx); | 714 | buddy = page + (buddy_idx - page_idx); |
721 | if (!page_is_buddy(page, buddy, order)) | 715 | if (!page_is_buddy(page, buddy, order)) |
722 | break; | 716 | goto done_merging; |
723 | /* | 717 | /* |
724 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, | 718 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
725 | * merge with it and move up one order. | 719 | * merge with it and move up one order. |
@@ -736,6 +730,32 @@ static inline void __free_one_page(struct page *page, | |||
736 | page_idx = combined_idx; | 730 | page_idx = combined_idx; |
737 | order++; | 731 | order++; |
738 | } | 732 | } |
733 | if (max_order < MAX_ORDER) { | ||
734 | /* If we are here, it means order is >= pageblock_order. | ||
735 | * We want to prevent merge between freepages on isolate | ||
736 | * pageblock and normal pageblock. Without this, pageblock | ||
737 | * isolation could cause incorrect freepage or CMA accounting. | ||
738 | * | ||
739 | * We don't want to hit this code for the more frequent | ||
740 | * low-order merging. | ||
741 | */ | ||
742 | if (unlikely(has_isolate_pageblock(zone))) { | ||
743 | int buddy_mt; | ||
744 | |||
745 | buddy_idx = __find_buddy_index(page_idx, order); | ||
746 | buddy = page + (buddy_idx - page_idx); | ||
747 | buddy_mt = get_pageblock_migratetype(buddy); | ||
748 | |||
749 | if (migratetype != buddy_mt | ||
750 | && (is_migrate_isolate(migratetype) || | ||
751 | is_migrate_isolate(buddy_mt))) | ||
752 | goto done_merging; | ||
753 | } | ||
754 | max_order++; | ||
755 | goto continue_merging; | ||
756 | } | ||
757 | |||
758 | done_merging: | ||
739 | set_page_order(page, order); | 759 | set_page_order(page, order); |
740 | 760 | ||
741 | /* | 761 | /* |
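
All of the index manipulation in __free_one_page() reduces to two bit tricks visible in the hunks above: the buddy of a block at page_idx and a given order is page_idx ^ (1 << order), and the merged pair starts at buddy_idx & page_idx. A standalone sketch of a few merge steps (the order limit and the free/busy cutoff are made up):

#include <stdio.h>

#define MAX_ORDER_SKETCH 11     /* illustrative, like MAX_ORDER */

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

int main(void)
{
        unsigned long page_idx = 12;    /* freeing an order-2 block */
        unsigned int order = 2;
        unsigned int highest_free = 3;  /* pretend buddies are free up to order 3 */

        while (order < MAX_ORDER_SKETCH - 1 && order <= highest_free) {
                unsigned long buddy_idx = find_buddy_index(page_idx, order);

                printf("order %u: block %lu merges with buddy %lu\n",
                       order, page_idx, buddy_idx);
                page_idx &= buddy_idx;  /* combined index of the merged pair */
                order++;
        }
        printf("result: order-%u block at index %lu\n", order, page_idx);
        return 0;
}

The patch's continue_merging/done_merging split keeps this cheap loop untouched below pageblock_order and only consults pageblock migratetypes for the rare high-order steps where an isolated neighbor is possible.
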
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -2086,6 +2086,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2086 | } | 2086 | } |
2087 | #endif | 2087 | #endif |
2088 | 2088 | ||
2089 | kasan_cache_create(cachep, &size, &flags); | ||
2090 | |||
2089 | size = ALIGN(size, cachep->align); | 2091 | size = ALIGN(size, cachep->align); |
2090 | /* | 2092 | /* |
2091 | * We should restrict the number of objects in a slab to implement | 2093 | * We should restrict the number of objects in a slab to implement |
@@ -2387,8 +2389,13 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) | |||
2387 | * cache which they are a constructor for. Otherwise, deadlock. | 2389 | * cache which they are a constructor for. Otherwise, deadlock. |
2388 | * They must also be threaded. | 2390 | * They must also be threaded. |
2389 | */ | 2391 | */ |
2390 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2392 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { |
2393 | kasan_unpoison_object_data(cachep, | ||
2394 | objp + obj_offset(cachep)); | ||
2391 | cachep->ctor(objp + obj_offset(cachep)); | 2395 | cachep->ctor(objp + obj_offset(cachep)); |
2396 | kasan_poison_object_data( | ||
2397 | cachep, objp + obj_offset(cachep)); | ||
2398 | } | ||
2392 | 2399 | ||
2393 | if (cachep->flags & SLAB_RED_ZONE) { | 2400 | if (cachep->flags & SLAB_RED_ZONE) { |
2394 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2401 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
@@ -2409,6 +2416,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2409 | struct page *page) | 2416 | struct page *page) |
2410 | { | 2417 | { |
2411 | int i; | 2418 | int i; |
2419 | void *objp; | ||
2412 | 2420 | ||
2413 | cache_init_objs_debug(cachep, page); | 2421 | cache_init_objs_debug(cachep, page); |
2414 | 2422 | ||
@@ -2419,8 +2427,12 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2419 | 2427 | ||
2420 | for (i = 0; i < cachep->num; i++) { | 2428 | for (i = 0; i < cachep->num; i++) { |
2421 | /* constructor could break poison info */ | 2429 | /* constructor could break poison info */ |
2422 | if (DEBUG == 0 && cachep->ctor) | 2430 | if (DEBUG == 0 && cachep->ctor) { |
2423 | cachep->ctor(index_to_obj(cachep, page, i)); | 2431 | objp = index_to_obj(cachep, page, i); |
2432 | kasan_unpoison_object_data(cachep, objp); | ||
2433 | cachep->ctor(objp); | ||
2434 | kasan_poison_object_data(cachep, objp); | ||
2435 | } | ||
2424 | 2436 | ||
2425 | set_free_obj(page, i, i); | 2437 | set_free_obj(page, i, i); |
2426 | } | 2438 | } |
@@ -2550,6 +2562,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2550 | 2562 | ||
2551 | slab_map_pages(cachep, page, freelist); | 2563 | slab_map_pages(cachep, page, freelist); |
2552 | 2564 | ||
2565 | kasan_poison_slab(page); | ||
2553 | cache_init_objs(cachep, page); | 2566 | cache_init_objs(cachep, page); |
2554 | 2567 | ||
2555 | if (gfpflags_allow_blocking(local_flags)) | 2568 | if (gfpflags_allow_blocking(local_flags)) |
@@ -3316,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3316 | { | 3329 | { |
3317 | struct array_cache *ac = cpu_cache_get(cachep); | 3330 | struct array_cache *ac = cpu_cache_get(cachep); |
3318 | 3331 | ||
3332 | kasan_slab_free(cachep, objp); | ||
3333 | |||
3319 | check_irq_off(); | 3334 | check_irq_off(); |
3320 | kmemleak_free_recursive(objp, cachep->flags); | 3335 | kmemleak_free_recursive(objp, cachep->flags); |
3321 | objp = cache_free_debugcheck(cachep, objp, caller); | 3336 | objp = cache_free_debugcheck(cachep, objp, caller); |
@@ -3363,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3363 | { | 3378 | { |
3364 | void *ret = slab_alloc(cachep, flags, _RET_IP_); | 3379 | void *ret = slab_alloc(cachep, flags, _RET_IP_); |
3365 | 3380 | ||
3381 | kasan_slab_alloc(cachep, ret, flags); | ||
3366 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3382 | trace_kmem_cache_alloc(_RET_IP_, ret, |
3367 | cachep->object_size, cachep->size, flags); | 3383 | cachep->object_size, cachep->size, flags); |
3368 | 3384 | ||
@@ -3428,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) | |||
3428 | 3444 | ||
3429 | ret = slab_alloc(cachep, flags, _RET_IP_); | 3445 | ret = slab_alloc(cachep, flags, _RET_IP_); |
3430 | 3446 | ||
3447 | kasan_kmalloc(cachep, ret, size, flags); | ||
3431 | trace_kmalloc(_RET_IP_, ret, | 3448 | trace_kmalloc(_RET_IP_, ret, |
3432 | size, cachep->size, flags); | 3449 | size, cachep->size, flags); |
3433 | return ret; | 3450 | return ret; |
@@ -3451,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3451 | { | 3468 | { |
3452 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3469 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3453 | 3470 | ||
3471 | kasan_slab_alloc(cachep, ret, flags); | ||
3454 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3472 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
3455 | cachep->object_size, cachep->size, | 3473 | cachep->object_size, cachep->size, |
3456 | flags, nodeid); | 3474 | flags, nodeid); |
@@ -3469,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, | |||
3469 | 3487 | ||
3470 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3488 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3471 | 3489 | ||
3490 | kasan_kmalloc(cachep, ret, size, flags); | ||
3472 | trace_kmalloc_node(_RET_IP_, ret, | 3491 | trace_kmalloc_node(_RET_IP_, ret, |
3473 | size, cachep->size, | 3492 | size, cachep->size, |
3474 | flags, nodeid); | 3493 | flags, nodeid); |
@@ -3481,11 +3500,15 @@ static __always_inline void * | |||
3481 | __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) | 3500 | __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) |
3482 | { | 3501 | { |
3483 | struct kmem_cache *cachep; | 3502 | struct kmem_cache *cachep; |
3503 | void *ret; | ||
3484 | 3504 | ||
3485 | cachep = kmalloc_slab(size, flags); | 3505 | cachep = kmalloc_slab(size, flags); |
3486 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3506 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3487 | return cachep; | 3507 | return cachep; |
3488 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); | 3508 | ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); |
3509 | kasan_kmalloc(cachep, ret, size, flags); | ||
3510 | |||
3511 | return ret; | ||
3489 | } | 3512 | } |
3490 | 3513 | ||
3491 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3514 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
@@ -3519,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3519 | return cachep; | 3542 | return cachep; |
3520 | ret = slab_alloc(cachep, flags, caller); | 3543 | ret = slab_alloc(cachep, flags, caller); |
3521 | 3544 | ||
3545 | kasan_kmalloc(cachep, ret, size, flags); | ||
3522 | trace_kmalloc(caller, ret, | 3546 | trace_kmalloc(caller, ret, |
3523 | size, cachep->size, flags); | 3547 | size, cachep->size, flags); |
3524 | 3548 | ||
@@ -4290,10 +4314,18 @@ module_init(slab_proc_init); | |||
4290 | */ | 4314 | */ |
4291 | size_t ksize(const void *objp) | 4315 | size_t ksize(const void *objp) |
4292 | { | 4316 | { |
4317 | size_t size; | ||
4318 | |||
4293 | BUG_ON(!objp); | 4319 | BUG_ON(!objp); |
4294 | if (unlikely(objp == ZERO_SIZE_PTR)) | 4320 | if (unlikely(objp == ZERO_SIZE_PTR)) |
4295 | return 0; | 4321 | return 0; |
4296 | 4322 | ||
4297 | return virt_to_cache(objp)->object_size; | 4323 | size = virt_to_cache(objp)->object_size; |
4324 | /* We assume that ksize callers could use the whole allocated area, | ||
4325 | * so we need to unpoison this area. | ||
4326 | */ | ||
4327 | kasan_krealloc(objp, size, GFP_NOWAIT); | ||
4328 | |||
4329 | return size; | ||
4298 | } | 4330 | } |
4299 | EXPORT_SYMBOL(ksize); | 4331 | EXPORT_SYMBOL(ksize); |
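
The kasan_krealloc(objp, size, GFP_NOWAIT) call added to ksize() above re-legalizes the whole object, because a ksize() caller is entitled to use every byte the allocator really handed out, not just the bytes it asked for. GFP_NOWAIT fits here since ksize() has no gfp argument to forward and must not introduce sleeping. A hypothetical userspace analogue of such a caller, with glibc's malloc_usable_size() playing the role of ksize():

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        char *buf = malloc(10);
        size_t usable;

        if (!buf)
                return 1;
        usable = malloc_usable_size(buf);       /* >= 10 */

        /* A ksize()-style caller may legally touch bytes [10, usable),
         * which is exactly the span the kernel must unpoison. */
        memset(buf, 0, usable);
        printf("requested 10, usable %zu\n", usable);
        free(buf);
        return 0;
}
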
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -405,7 +405,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | |||
405 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 405 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
406 | kmemleak_alloc_recursive(object, s->object_size, 1, | 406 | kmemleak_alloc_recursive(object, s->object_size, 1, |
407 | s->flags, flags); | 407 | s->flags, flags); |
408 | kasan_slab_alloc(s, object); | 408 | kasan_slab_alloc(s, object, flags); |
409 | } | 409 | } |
410 | memcg_kmem_put_cache(s); | 410 | memcg_kmem_put_cache(s); |
411 | } | 411 | } |
diff --git a/mm/slab_common.c b/mm/slab_common.c index b2e379639a5b..3239bfd758e6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache; | |||
35 | */ | 35 | */ |
36 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 36 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
37 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ | 37 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ |
38 | SLAB_FAILSLAB) | 38 | SLAB_FAILSLAB | SLAB_KASAN) |
39 | 39 | ||
40 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ | 40 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ |
41 | SLAB_NOTRACK | SLAB_ACCOUNT) | 41 | SLAB_NOTRACK | SLAB_ACCOUNT) |
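
Adding SLAB_KASAN to SLAB_NEVER_MERGE keeps caches that carry KASAN metadata from being aliased with otherwise-compatible caches, since the per-cache alloc/free meta offsets computed in kasan_cache_create() would not survive merging. A sketch of the merge test this flag change affects (flag values are illustrative, not the kernel's):

#include <stdio.h>

#define SKETCH_SLAB_RED_ZONE    0x1UL
#define SKETCH_SLAB_KASAN       0x2UL
#define SKETCH_NEVER_MERGE      (SKETCH_SLAB_RED_ZONE | SKETCH_SLAB_KASAN)

static int cache_mergeable(unsigned long flags)
{
        return !(flags & SKETCH_NEVER_MERGE);
}

int main(void)
{
        printf("plain cache mergeable: %d\n", cache_mergeable(0));
        printf("kasan cache mergeable: %d\n",
               cache_mergeable(SKETCH_SLAB_KASAN));
        return 0;
}
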
@@ -1013,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | |||
1013 | page = alloc_kmem_pages(flags, order); | 1013 | page = alloc_kmem_pages(flags, order); |
1014 | ret = page ? page_address(page) : NULL; | 1014 | ret = page ? page_address(page) : NULL; |
1015 | kmemleak_alloc(ret, size, 1, flags); | 1015 | kmemleak_alloc(ret, size, 1, flags); |
1016 | kasan_kmalloc_large(ret, size); | 1016 | kasan_kmalloc_large(ret, size, flags); |
1017 | return ret; | 1017 | return ret; |
1018 | } | 1018 | } |
1019 | EXPORT_SYMBOL(kmalloc_order); | 1019 | EXPORT_SYMBOL(kmalloc_order); |
@@ -1192,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, | |||
1192 | ks = ksize(p); | 1192 | ks = ksize(p); |
1193 | 1193 | ||
1194 | if (ks >= new_size) { | 1194 | if (ks >= new_size) { |
1195 | kasan_krealloc((void *)p, new_size); | 1195 | kasan_krealloc((void *)p, new_size, flags); |
1196 | return (void *)p; | 1196 | return (void *)p; |
1197 | } | 1197 | } |
1198 | 1198 | ||
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -1313,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, | |||
1313 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | 1313 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) |
1314 | { | 1314 | { |
1315 | kmemleak_alloc(ptr, size, 1, flags); | 1315 | kmemleak_alloc(ptr, size, 1, flags); |
1316 | kasan_kmalloc_large(ptr, size); | 1316 | kasan_kmalloc_large(ptr, size, flags); |
1317 | } | 1317 | } |
1318 | 1318 | ||
1319 | static inline void kfree_hook(const void *x) | 1319 | static inline void kfree_hook(const void *x) |
@@ -2596,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) | |||
2596 | { | 2596 | { |
2597 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); | 2597 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
2598 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | 2598 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); |
2599 | kasan_kmalloc(s, ret, size); | 2599 | kasan_kmalloc(s, ret, size, gfpflags); |
2600 | return ret; | 2600 | return ret; |
2601 | } | 2601 | } |
2602 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 2602 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
@@ -2624,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
2624 | trace_kmalloc_node(_RET_IP_, ret, | 2624 | trace_kmalloc_node(_RET_IP_, ret, |
2625 | size, s->size, gfpflags, node); | 2625 | size, s->size, gfpflags, node); |
2626 | 2626 | ||
2627 | kasan_kmalloc(s, ret, size); | 2627 | kasan_kmalloc(s, ret, size, gfpflags); |
2628 | return ret; | 2628 | return ret; |
2629 | } | 2629 | } |
2630 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | 2630 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
@@ -3182,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node) | |||
3182 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 3182 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
3183 | init_tracking(kmem_cache_node, n); | 3183 | init_tracking(kmem_cache_node, n); |
3184 | #endif | 3184 | #endif |
3185 | kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); | 3185 | kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), |
3186 | GFP_KERNEL); | ||
3186 | init_kmem_cache_node(n); | 3187 | init_kmem_cache_node(n); |
3187 | inc_slabs_node(kmem_cache_node, node, page->objects); | 3188 | inc_slabs_node(kmem_cache_node, node, page->objects); |
3188 | 3189 | ||
@@ -3561,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
3561 | 3562 | ||
3562 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | 3563 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); |
3563 | 3564 | ||
3564 | kasan_kmalloc(s, ret, size); | 3565 | kasan_kmalloc(s, ret, size, flags); |
3565 | 3566 | ||
3566 | return ret; | 3567 | return ret; |
3567 | } | 3568 | } |
@@ -3606,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3606 | 3607 | ||
3607 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | 3608 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); |
3608 | 3609 | ||
3609 | kasan_kmalloc(s, ret, size); | 3610 | kasan_kmalloc(s, ret, size, flags); |
3610 | 3611 | ||
3611 | return ret; | 3612 | return ret; |
3612 | } | 3613 | } |
@@ -3635,7 +3636,7 @@ size_t ksize(const void *object) | |||
3635 | size_t size = __ksize(object); | 3636 | size_t size = __ksize(object); |
3636 | /* We assume that ksize callers could use whole allocated area, | 3637 | /* We assume that ksize callers could use whole allocated area, |
3637 | so we need unpoison this area. */ | 3638 | so we need unpoison this area. */ |
3638 | kasan_krealloc(object, size); | 3639 | kasan_krealloc(object, size, GFP_NOWAIT); |
3639 | return size; | 3640 | return size; |
3640 | } | 3641 | } |
3641 | EXPORT_SYMBOL(ksize); | 3642 | EXPORT_SYMBOL(ksize); |