 79 files changed, 1770 insertions(+), 962 deletions(-)
diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt
index aa1e0c91e368..7dd95b35cd7c 100644
--- a/Documentation/kasan.txt
+++ b/Documentation/kasan.txt
@@ -12,8 +12,7 @@ KASAN uses compile-time instrumentation for checking every memory access,
 therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
 required for detection of out-of-bounds accesses to stack or global variables.
 
-Currently KASAN is supported only for x86_64 architecture and requires the
-kernel to be built with the SLUB allocator.
+Currently KASAN is supported only for x86_64 architecture.
 
 1. Usage
 ========
@@ -27,7 +26,7 @@ inline are compiler instrumentation types. The former produces smaller binary
 the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC
 version 5.0 or later.
 
-Currently KASAN works only with the SLUB memory allocator.
+KASAN works with both SLUB and SLAB memory allocators.
 For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
 
 To disable instrumentation for specific files or directories, add a line
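
For reference: the Makefile convention that last documentation line refers to is the same one the arch/x86/kernel/Makefile hunk later in this diff uses. A minimal sketch of the two forms:

	# Disable KASAN instrumentation for a single object file:
	KASAN_SANITIZE_main.o := n
	# ...or for every file built in the current directory:
	KASAN_SANITIZE := n
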
diff --git a/MAINTAINERS b/MAINTAINERS
index f07a174bbc81..df8cf6b924c6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6165,6 +6165,20 @@ S:	Maintained
 F:	Documentation/hwmon/k8temp
 F:	drivers/hwmon/k8temp.c
 
+KASAN
+M:	Andrey Ryabinin <aryabinin@virtuozzo.com>
+R:	Alexander Potapenko <glider@google.com>
+R:	Dmitry Vyukov <dvyukov@google.com>
+L:	kasan-dev@googlegroups.com
+S:	Maintained
+F:	arch/*/include/asm/kasan.h
+F:	arch/*/mm/kasan_init*
+F:	Documentation/kasan.txt
+F:	include/linux/kasan.h
+F:	lib/test_kasan.c
+F:	mm/kasan/
+F:	scripts/Makefile.kasan
+
 KCONFIG
 M:	"Yann E. MORIN" <yann.morin.1998@free.fr>
 L:	linux-kbuild@vger.kernel.org
diff --git a/arch/arm/include/asm/exception.h b/arch/arm/include/asm/exception.h
index 5abaf5bbd985..bf1991263d2d 100644
--- a/arch/arm/include/asm/exception.h
+++ b/arch/arm/include/asm/exception.h
@@ -7,7 +7,7 @@
 #ifndef __ASM_ARM_EXCEPTION_H
 #define __ASM_ARM_EXCEPTION_H
 
-#include <linux/ftrace.h>
+#include <linux/interrupt.h>
 
 #define __exception	__attribute__((section(".exception.text")))
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 1fab979daeaf..e2c6da096cef 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -108,6 +108,7 @@ SECTIONS
 			*(.exception.text)
 			__exception_text_end = .;
 			IRQENTRY_TEXT
+			SOFTIRQENTRY_TEXT
 			TEXT_TEXT
 			SCHED_TEXT
 			LOCK_TEXT
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 6cb7e1a6bc02..0c2eec490abf 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -18,7 +18,7 @@
 #ifndef __ASM_EXCEPTION_H
 #define __ASM_EXCEPTION_H
 
-#include <linux/ftrace.h>
+#include <linux/interrupt.h>
 
 #define __exception	__attribute__((section(".exception.text")))
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 37f624df68fa..5a1939a74ff3 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -103,6 +103,7 @@ SECTIONS
 			*(.exception.text)
 			__exception_text_end = .;
 			IRQENTRY_TEXT
+			SOFTIRQENTRY_TEXT
 			TEXT_TEXT
 			SCHED_TEXT
 			LOCK_TEXT
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index c9eec84aa258..d920b959ff3a 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -35,6 +35,7 @@ SECTIONS
 #endif
 		LOCK_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		KPROBES_TEXT
 #ifdef CONFIG_ROMKERNEL
 		__sinittext = .;
diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S
index 5a6e141d1641..50bc10f97bcb 100644
--- a/arch/c6x/kernel/vmlinux.lds.S
+++ b/arch/c6x/kernel/vmlinux.lds.S
@@ -72,6 +72,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		KPROBES_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S
index e12055e88bfe..150ace92c7ad 100644
--- a/arch/metag/kernel/vmlinux.lds.S
+++ b/arch/metag/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@ SECTIONS
 	LOCK_TEXT
 	KPROBES_TEXT
 	IRQENTRY_TEXT
+	SOFTIRQENTRY_TEXT
 	*(.text.*)
 	*(.gnu.warning)
 	}
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index be9488d69734..0a47f0410554 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -36,6 +36,7 @@ SECTIONS {
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		. = ALIGN (4) ;
 		_etext = . ;
 	}
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 0a93e83cd014..54d653ee17e1 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -58,6 +58,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.text.*)
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S
index 326fab40a9de..e23e89539967 100644
--- a/arch/nios2/kernel/vmlinux.lds.S
+++ b/arch/nios2/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		KPROBES_TEXT
 	} =0
 	_etext = .;
diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S
index 2d69a853b742..d936de4c07ca 100644
--- a/arch/openrisc/kernel/vmlinux.lds.S
+++ b/arch/openrisc/kernel/vmlinux.lds.S
@@ -50,6 +50,7 @@ SECTIONS
 	  LOCK_TEXT
 	  KPROBES_TEXT
 	  IRQENTRY_TEXT
+	  SOFTIRQENTRY_TEXT
 	  *(.fixup)
 	  *(.text.__*)
 	  _etext = .;
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 308f29081d46..f3ead0b6ce46 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -72,6 +72,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.text.do_softirq)
 		*(.text.sys_exit)
 		*(.text.do_sigaltstack)
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index d41fd0af8980..2dd91f79de05 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -55,6 +55,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 
 #ifdef CONFIG_PPC32
 		*(.got1)
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 445657fe658c..0f41a8286378 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -28,6 +28,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
 	} :text = 0x0700
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index db88cbf9eafd..235a4101999f 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
 		_etext = .;		/* End of text section */
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index f1a2f688b28a..aadd321aa05d 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -48,6 +48,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.gnu.warning)
 	} = 0
 	_etext = .;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index 0e059a0101ea..378f5d8d1ec8 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -45,6 +45,7 @@ SECTIONS
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		__fix_text_end = .;	/* tile-cpack won't rearrange before this */
 		ALIGN_FUNCTION();
 		*(.hottext*)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index adaae2c781c1..616ebd22ef9a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,6 +19,7 @@ endif
 KASAN_SANITIZE_head$(BITS).o			:= n
 KASAN_SANITIZE_dumpstack.o			:= n
 KASAN_SANITIZE_dumpstack_$(BITS).o		:= n
+KASAN_SANITIZE_stacktrace.o			:= n
 
 OBJECT_FILES_NON_STANDARD_head_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d239639e0c1d..4c941f88d405 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -101,6 +101,7 @@ SECTIONS
 		KPROBES_TEXT
 		ENTRY_TEXT
 		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
 		/* End of text section */
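
Every linker-script hunk above adds the same thing: a dedicated .softirqentry.text section in the kernel image, mirroring the existing IRQENTRY_TEXT, so a stack walker (for example KASAN's report code) can tell softirq frames apart from the interrupted task's frames. The companion definitions are not part of this excerpt; as a sketch of how they look in include/asm-generic/vmlinux.lds.h and include/linux/interrupt.h around this series (which also explains why the exception.h files now include linux/interrupt.h rather than linux/ftrace.h):

	/* Sketch: collect functions marked __softirq_entry into one
	 * contiguous region with start/end symbols around it. */
	#define SOFTIRQENTRY_TEXT					\
			ALIGN_FUNCTION();				\
			VMLINUX_SYMBOL(__softirqentry_text_start) = .;	\
			*(.softirqentry.text)				\
			VMLINUX_SYMBOL(__softirqentry_text_end) = .;

	/* Marker applied to softirq entry points such as __do_softirq(): */
	#define __softirq_entry \
		__attribute__((__section__(".softirqentry.text")))
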
diff --git a/drivers/input/input-compat.c b/drivers/input/input-compat.c
index 64ca7113ff28..d84d20b9cec0 100644
--- a/drivers/input/input-compat.c
+++ b/drivers/input/input-compat.c
@@ -17,7 +17,7 @@
 int input_event_from_user(const char __user *buffer,
			  struct input_event *event)
 {
-	if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) {
+	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct input_event_compat compat_event;
 
		if (copy_from_user(&compat_event, buffer,
@@ -41,7 +41,7 @@ int input_event_from_user(const char __user *buffer,
 int input_event_to_user(char __user *buffer,
			const struct input_event *event)
 {
-	if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) {
+	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct input_event_compat compat_event;
 
		compat_event.time.tv_sec = event->time.tv_sec;
@@ -65,7 +65,7 @@ int input_event_to_user(char __user *buffer,
 int input_ff_effect_from_user(const char __user *buffer, size_t size,
			      struct ff_effect *effect)
 {
-	if (INPUT_COMPAT_TEST) {
+	if (in_compat_syscall()) {
		struct ff_effect_compat *compat_effect;
 
		if (size != sizeof(struct ff_effect_compat))
diff --git a/drivers/input/input-compat.h b/drivers/input/input-compat.h
index 0f25878d5fa2..1563160a7af3 100644
--- a/drivers/input/input-compat.h
+++ b/drivers/input/input-compat.h
@@ -17,8 +17,6 @@
 
 #ifdef CONFIG_COMPAT
 
-#define INPUT_COMPAT_TEST in_compat_syscall()
-
 struct input_event_compat {
	struct compat_timeval time;
	__u16 type;
@@ -57,7 +55,7 @@ struct ff_effect_compat {
 
 static inline size_t input_event_size(void)
 {
-	return (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) ?
+	return (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) ?
		sizeof(struct input_event_compat) : sizeof(struct input_event);
 }
 
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 880605959aa6..b87ffbd4547d 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1015,7 +1015,7 @@ static int input_bits_to_string(char *buf, int buf_size,
 {
	int len = 0;
 
-	if (INPUT_COMPAT_TEST) {
+	if (in_compat_syscall()) {
		u32 dword = bits >> 32;
		if (dword || !skip_empty)
			len += snprintf(buf, buf_size, "%x ", dword);
diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index 4eb9e4d94f46..abe1a927b332 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -664,7 +664,7 @@ struct uinput_ff_upload_compat {
 static int uinput_ff_upload_to_user(char __user *buffer,
				    const struct uinput_ff_upload *ff_up)
 {
-	if (INPUT_COMPAT_TEST) {
+	if (in_compat_syscall()) {
		struct uinput_ff_upload_compat ff_up_compat;
 
		ff_up_compat.request_id = ff_up->request_id;
@@ -695,7 +695,7 @@ static int uinput_ff_upload_to_user(char __user *buffer,
 static int uinput_ff_upload_from_user(const char __user *buffer,
				      struct uinput_ff_upload *ff_up)
 {
-	if (INPUT_COMPAT_TEST) {
+	if (in_compat_syscall()) {
		struct uinput_ff_upload_compat ff_up_compat;
 
		if (copy_from_user(&ff_up_compat, buffer,
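
As the input-compat.h hunk shows, INPUT_COMPAT_TEST was already just an alias for in_compat_syscall(), so these conversions inline the macro without changing behaviour. The property being tested is per-syscall, not per-task: a 64-bit task can still issue a request through the 32-bit entry path, and that entry path is what decides the expected structure layout. A minimal usage sketch; the helper name is illustrative, while input_event_to_user() and input_event_size() are the real functions from the hunks above:

	/* Sketch: write one event to userspace in whichever layout the
	 * current syscall's entry path expects. */
	static ssize_t example_put_event(char __user *buf,
					 const struct input_event *ev)
	{
		if (input_event_to_user(buf, ev))
			return -EFAULT;
		return input_event_size();	/* compat-aware size */
	}
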
diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c
index ef09ba0289d7..d5cfb503b9d6 100644
--- a/drivers/memstick/host/r592.c
+++ b/drivers/memstick/host/r592.c
@@ -298,8 +298,7 @@ static int r592_transfer_fifo_dma(struct r592_device *dev)
	sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ?
			      PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE);
 
-	if (sg_count != 1 ||
-		(sg_dma_len(&dev->req->sg) < dev->req->sg.length)) {
+	if (sg_count != 1 || sg_dma_len(&dev->req->sg) < R592_LFIFO_SIZE) {
		message("problem in dma_map_sg");
		return -EIO;
	}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d002579c6f2b..70907d638b60 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
	struct ocfs2_extent_block *eb;
	u32 range;
 
-	/*
-	 * In normal tree rotation process, we will never touch the
-	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
-	 * doesn't reserve the credits for them either.
-	 *
-	 * But we do have a special case here which will update the rightmost
-	 * records for all the bh in the path.
-	 * So we have to allocate extra credits and access them.
-	 */
-	ret = ocfs2_extend_trans(handle, subtree_index);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
		mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
	     right_path->p_node[subtree_root].bh->b_blocknr,
	     right_path->p_tree_depth);
 
-	ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      orig_credits, left_path);
	if (ret) {
		mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
 
-
	ret = ocfs2_et_sanity_check(et);
	if (ret)
		goto out;
-	/*
-	 * There's two ways we handle this depending on
-	 * whether path is the only existing one.
-	 */
-	ret = ocfs2_extend_rotate_transaction(handle, 0,
-					      handle->h_buffer_credits,
-					      path);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
 
	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
	 */
	if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
	    le16_to_cpu(el->l_next_free_rec) == 1) {
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				right_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
		ret = ocfs2_remove_rightmost_path(handle, et,
						  right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
 
	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
		/*
		 * The merge code will need to create an empty
		 * extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
		 */
		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+					handle->h_buffer_credits,
+					path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
		/* The merge left us with an empty extent, remove it. */
		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
		if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
			goto out;
		}
 
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+					handle->h_buffer_credits,
+					path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
		/*
		 * Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
	}
 
	if (ctxt->c_split_covers_rec) {
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				path);
+		if (ret) {
+			mlog_errno(ret);
+			ret = 0;
+			goto out;
+		}
+
		/*
		 * The merge may have left an empty extent in
		 * our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
	struct ocfs2_extent_block *eb;
 
	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+		/* extend credit for ocfs2_remove_rightmost_path */
+		ret = ocfs2_extend_rotate_transaction(handle, 0,
+				handle->h_buffer_credits,
+				path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
		if (ret) {
			mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 
		ocfs2_journal_dirty(handle, tl_bh);
 
-		/* TODO: Perhaps we can calculate the bulk of the
-		 * credits up front rather than extending like
-		 * this. */
-		status = ocfs2_extend_trans(handle,
-				OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-
		rec = tl->tl_recs[i];
		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
						    le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
				goto bail;
			}
		}
+
+		status = ocfs2_extend_trans(handle,
+				OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
		i--;
	}
 
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
		goto out_mutex;
	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
		if (cancel)
			cancel_delayed_work(&osb->osb_truncate_log_wq);
 
-		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+		queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
	}
 }
@@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
 
	if (tl_inode) {
		cancel_delayed_work(&osb->osb_truncate_log_wq);
-		flush_workqueue(ocfs2_wq);
+		flush_workqueue(osb->ocfs2_wq);
 
		status = ocfs2_flush_truncate_log(osb);
		if (status < 0)
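
A note on the pattern repeated through these alloc.c hunks: the jbd2 credit extension is hoisted out of ocfs2_remove_rightmost_path() and ocfs2_update_edge_lengths() to the call sites, before any buffer of the operation is journal-dirtied. The usual reason (hedged here, since the changelog is not part of this excerpt) is that ocfs2_extend_trans() can fall back to restarting the handle when the running transaction cannot grow, which commits everything dirtied so far; that is safe at a clean boundary but not halfway through a tree rotation. The call-site shape, taken from the hunks above:

	/* Make sure the handle can absorb the whole removal *before*
	 * touching the tree; ocfs2_extend_rotate_transaction() extends
	 * (or restarts) the handle as needed. */
	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);

The ocfs2_wq hunks are related housekeeping from the same series: truncate-log work is queued on a per-superblock workqueue, osb->ocfs2_wq, instead of a global one.
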
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 043110e5212d..1581240a7ca0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,158 +499,6 @@ bail:
	return status;
 }
 
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- *  "So what we do is to permit the ->get_blocks function to populate
- *   bh.b_size with the size of IO which is permitted at this offset and
- *   this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * 					fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
-				     struct buffer_head *bh_result, int create)
-{
-	int ret;
-	u32 cpos = 0;
-	int alloc_locked = 0;
-	u64 p_blkno, inode_blocks, contig_blocks;
-	unsigned int ext_flags;
-	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
-	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
-	unsigned long len = bh_result->b_size;
-	unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
-	cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
-	/* This function won't even be called if the request isn't all
-	 * nicely aligned and of the right size, so there's no need
-	 * for us to check any of that. */
-
-	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	/* This figures out the size of the next contiguous block, and
-	 * our logical offset */
-	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
-					  &contig_blocks, &ext_flags);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	if (ret) {
-		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
-		     (unsigned long long)iblock);
-		ret = -EIO;
-		goto bail;
-	}
-
-	/* We should already CoW the refcounted extent in case of create. */
-	BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
-	/* allocate blocks if no p_blkno is found, and create == 1 */
-	if (!p_blkno && create) {
-		ret = ocfs2_inode_lock(inode, NULL, 1);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto bail;
-		}
-
-		alloc_locked = 1;
-
-		down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
-		/* fill hole, allocate blocks can't be larger than the size
-		 * of the hole */
-		clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
-		contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
-				contig_blocks);
-		if (clusters_to_alloc > contig_clusters)
-			clusters_to_alloc = contig_clusters;
-
-		/* allocate extent and insert them into the extent tree */
-		ret = ocfs2_extend_allocation(inode, cpos,
-				clusters_to_alloc, 0);
-		if (ret < 0) {
-			up_write(&OCFS2_I(inode)->ip_alloc_sem);
-			mlog_errno(ret);
-			goto bail;
-		}
-
-		ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
-				&contig_blocks, &ext_flags);
-		if (ret < 0) {
-			up_write(&OCFS2_I(inode)->ip_alloc_sem);
-			mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
-					(unsigned long long)iblock);
-			ret = -EIO;
-			goto bail;
-		}
-		set_buffer_new(bh_result);
-		up_write(&OCFS2_I(inode)->ip_alloc_sem);
-	}
-
-	/*
-	 * get_more_blocks() expects us to describe a hole by clearing
-	 * the mapped bit on bh_result().
-	 *
-	 * Consider an unwritten extent as a hole.
-	 */
-	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
-		map_bh(bh_result, inode->i_sb, p_blkno);
-	else
-		clear_buffer_mapped(bh_result);
-
-	/* make sure we don't map more than max_blocks blocks here as
-	   that's all the kernel will handle at this point. */
-	if (max_blocks < contig_blocks)
-		contig_blocks = max_blocks;
-	bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
-	if (alloc_locked)
-		ocfs2_inode_unlock(inode, 1);
-	return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
- * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static int ocfs2_dio_end_io(struct kiocb *iocb,
-			    loff_t offset,
-			    ssize_t bytes,
-			    void *private)
-{
-	struct inode *inode = file_inode(iocb->ki_filp);
-	int level;
-
-	if (bytes <= 0)
-		return 0;
-
-	/* this io's submitter should not have unlocked this before we could */
-	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
-	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
-		ocfs2_iocb_clear_unaligned_aio(iocb);
-
-		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
-	}
-
-	/* Let rw unlock to be done later to protect append direct io write */
-	if (offset + bytes <= i_size_read(inode)) {
-		ocfs2_iocb_clear_rw_locked(iocb);
-
-		level = ocfs2_iocb_rw_locked_level(iocb);
-		ocfs2_rw_unlock(inode, level);
-	}
-
-	return 0;
-}
-
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
 {
	if (!page_has_buffers(page))
@@ -658,363 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
	return try_to_free_buffers(page);
 }
 
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
-		struct inode *inode, loff_t offset)
-{
-	int ret = 0;
-	u32 v_cpos = 0;
-	u32 p_cpos = 0;
-	unsigned int num_clusters = 0;
-	unsigned int ext_flags = 0;
-
-	v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-	ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
-			&num_clusters, &ext_flags);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
-		return 1;
-
-	return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
-		struct inode *inode, loff_t offset,
-		u64 zero_len, int cluster_align)
-{
-	u32 p_cpos = 0;
-	u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
-	unsigned int num_clusters = 0;
-	unsigned int ext_flags = 0;
-	int ret = 0;
-
-	if (offset <= i_size_read(inode) || cluster_align)
-		return 0;
-
-	ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
-			&ext_flags);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-		u64 s = i_size_read(inode);
-		sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
-			(do_div(s, osb->s_clustersize) >> 9);
-
-		ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
-				zero_len >> 9, GFP_NOFS, false);
-		if (ret < 0)
-			mlog_errno(ret);
-	}
-
-	return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
-		struct inode *inode, loff_t offset)
-{
-	u64 zero_start, zero_len, total_zero_len;
-	u32 p_cpos = 0, clusters_to_add;
-	u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
-	unsigned int num_clusters = 0;
-	unsigned int ext_flags = 0;
-	u32 size_div, offset_div;
-	int ret = 0;
-
-	{
-		u64 o = offset;
-		u64 s = i_size_read(inode);
-
-		offset_div = do_div(o, osb->s_clustersize);
-		size_div = do_div(s, osb->s_clustersize);
-	}
-
-	if (offset <= i_size_read(inode))
-		return 0;
-
-	clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
-		ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
-	total_zero_len = offset - i_size_read(inode);
-	if (clusters_to_add)
-		total_zero_len -= offset_div;
-
-	/* Allocate clusters to fill out holes, and this is only needed
-	 * when we add more than one clusters. Otherwise the cluster will
-	 * be allocated during direct IO */
-	if (clusters_to_add > 1) {
-		ret = ocfs2_extend_allocation(inode,
-				OCFS2_I(inode)->ip_clusters,
-				clusters_to_add - 1, 0);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	while (total_zero_len) {
-		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
-				&ext_flags);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-
-		zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
-			size_div;
-		zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
-			size_div;
-		zero_len = min(total_zero_len, zero_len);
-
-		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-			ret = blkdev_issue_zeroout(osb->sb->s_bdev,
-					zero_start >> 9, zero_len >> 9,
-					GFP_NOFS, false);
-			if (ret < 0) {
-				mlog_errno(ret);
-				goto out;
-			}
-		}
-
-		total_zero_len -= zero_len;
-		v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
-		/* Only at first iteration can be cluster not aligned.
-		 * So set size_div to 0 for the rest */
-		size_div = 0;
-	}
-
-out:
-	return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
-		struct iov_iter *iter,
-		loff_t offset)
-{
-	ssize_t ret = 0;
-	ssize_t written = 0;
-	bool orphaned = false;
-	int is_overwrite = 0;
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file_inode(file)->i_mapping->host;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct buffer_head *di_bh = NULL;
-	size_t count = iter->count;
-	journal_t *journal = osb->journal->j_journal;
-	u64 zero_len_head, zero_len_tail;
-	int cluster_align_head, cluster_align_tail;
-	loff_t final_size = offset + count;
-	int append_write = offset >= i_size_read(inode) ? 1 : 0;
-	unsigned int num_clusters = 0;
-	unsigned int ext_flags = 0;
-
-	{
-		u64 o = offset;
-		u64 s = i_size_read(inode);
-
-		zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
-		cluster_align_head = !zero_len_head;
-
-		zero_len_tail = osb->s_clustersize -
-			do_div(s, osb->s_clustersize);
-		if ((offset - i_size_read(inode)) < zero_len_tail)
-			zero_len_tail = offset - i_size_read(inode);
-		cluster_align_tail = !zero_len_tail;
-	}
-
-	/*
-	 * when final_size > inode->i_size, inode->i_size will be
-	 * updated after direct write, so add the inode to orphan
-	 * dir first.
-	 */
-	if (final_size > i_size_read(inode)) {
-		ret = ocfs2_add_inode_to_orphan(osb, inode);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-		orphaned = true;
-	}
-
-	if (append_write) {
-		ret = ocfs2_inode_lock(inode, NULL, 1);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto clean_orphan;
-		}
-
-		/* zeroing out the previously allocated cluster tail
-		 * that but not zeroed */
-		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-			down_read(&OCFS2_I(inode)->ip_alloc_sem);
-			ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
-					zero_len_tail, cluster_align_tail);
-			up_read(&OCFS2_I(inode)->ip_alloc_sem);
-		} else {
-			down_write(&OCFS2_I(inode)->ip_alloc_sem);
-			ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
-					offset);
-			up_write(&OCFS2_I(inode)->ip_alloc_sem);
-		}
-		if (ret < 0) {
-			mlog_errno(ret);
-			ocfs2_inode_unlock(inode, 1);
-			goto clean_orphan;
-		}
-
-		is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
-		if (is_overwrite < 0) {
-			mlog_errno(is_overwrite);
-			ret = is_overwrite;
-			ocfs2_inode_unlock(inode, 1);
-			goto clean_orphan;
-		}
-
-		ocfs2_inode_unlock(inode, 1);
-	}
-
-	written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				       offset, ocfs2_direct_IO_get_blocks,
-				       ocfs2_dio_end_io, NULL, 0);
-	/* overwrite aio may return -EIOCBQUEUED, and it is not an error */
-	if ((written < 0) && (written != -EIOCBQUEUED)) {
-		loff_t i_size = i_size_read(inode);
-
-		if (offset + count > i_size) {
-			ret = ocfs2_inode_lock(inode, &di_bh, 1);
-			if (ret < 0) {
-				mlog_errno(ret);
-				goto clean_orphan;
-			}
-
-			if (i_size == i_size_read(inode)) {
-				ret = ocfs2_truncate_file(inode, di_bh,
-						i_size);
-				if (ret < 0) {
-					if (ret != -ENOSPC)
-						mlog_errno(ret);
-
-					ocfs2_inode_unlock(inode, 1);
-					brelse(di_bh);
-					di_bh = NULL;
-					goto clean_orphan;
-				}
-			}
-
-			ocfs2_inode_unlock(inode, 1);
-			brelse(di_bh);
-			di_bh = NULL;
-
-			ret = jbd2_journal_force_commit(journal);
-			if (ret < 0)
-				mlog_errno(ret);
-		}
-	} else if (written > 0 && append_write && !is_overwrite &&
-			!cluster_align_head) {
-		/* zeroing out the allocated cluster head */
-		u32 p_cpos = 0;
-		u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
-		ret = ocfs2_inode_lock(inode, NULL, 0);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto clean_orphan;
-		}
-
-		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
-				&num_clusters, &ext_flags);
-		if (ret < 0) {
-			mlog_errno(ret);
-			ocfs2_inode_unlock(inode, 0);
-			goto clean_orphan;
-		}
-
-		BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
-		ret = blkdev_issue_zeroout(osb->sb->s_bdev,
-				(u64)p_cpos << (osb->s_clustersize_bits - 9),
-				zero_len_head >> 9, GFP_NOFS, false);
-		if (ret < 0)
-			mlog_errno(ret);
-
-		ocfs2_inode_unlock(inode, 0);
-	}
-
-clean_orphan:
-	if (orphaned) {
-		int tmp_ret;
-		int update_isize = written > 0 ? 1 : 0;
-		loff_t end = update_isize ? offset + written : 0;
-
-		tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
-		if (tmp_ret < 0) {
-			ret = tmp_ret;
-			mlog_errno(ret);
-			goto out;
-		}
-
-		tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
-				update_isize, end);
-		if (tmp_ret < 0) {
-			ocfs2_inode_unlock(inode, 1);
-			ret = tmp_ret;
-			mlog_errno(ret);
-			brelse(di_bh);
-			goto out;
-		}
-
-		ocfs2_inode_unlock(inode, 1);
-		brelse(di_bh);
-
-		tmp_ret = jbd2_journal_force_commit(journal);
-		if (tmp_ret < 0) {
-			ret = tmp_ret;
-			mlog_errno(tmp_ret);
-		}
-	}
-
-out:
-	if (ret >= 0)
-		ret = written;
-	return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t offset)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file_inode(file)->i_mapping->host;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	int full_coherency = !(osb->s_mount_opt &
-			       OCFS2_MOUNT_COHERENCY_BUFFERED);
-
-	/*
-	 * Fallback to buffered I/O if we see an inode without
-	 * extents.
-	 */
-	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return 0;
-
-	/* Fallback to buffered I/O if we are appending and
-	 * concurrent O_DIRECT writes are allowed.
-	 */
-	if (i_size_read(inode) <= offset && !full_coherency)
-		return 0;
-
-	if (iov_iter_rw(iter) == READ)
-		return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-					    iter, offset,
-					    ocfs2_direct_IO_get_blocks,
-					    ocfs2_dio_end_io, NULL, 0);
-	else
-		return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
					    u32 cpos,
					    unsigned int *start,
@@ -1201,6 +692,13 @@ next_bh:
 
 #define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
 
+struct ocfs2_unwritten_extent {
+	struct list_head	ue_node;
+	struct list_head	ue_ip_node;
+	u32			ue_cpos;
+	u32			ue_phys;
+};
+
 /*
 * Describe the state of a single cluster to be written to.
 */
@@ -1212,7 +710,7 @@ struct ocfs2_write_cluster_desc {
	 * filled.
	 */
	unsigned	c_new;
-	unsigned	c_unwritten;
+	unsigned	c_clear_unwritten;
	unsigned	c_needs_zero;
 };
 
@@ -1224,6 +722,9 @@ struct ocfs2_write_ctxt {
	/* First cluster allocated in a nonsparse extend */
	u32				w_first_new_cpos;
 
+	/* Type of caller. Must be one of buffer, mmap, direct.  */
+	ocfs2_write_type_t		w_type;
+
	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
	/*
@@ -1272,6 +773,8 @@ struct ocfs2_write_ctxt {
	struct buffer_head *w_di_bh;
 
	struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+	struct list_head w_unwritten_list;
 };
 
 void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1310,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
 }
 
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+				      struct list_head *head)
 {
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+
+	list_for_each_entry_safe(ue, tmp, head, ue_node) {
+		list_del(&ue->ue_node);
+		spin_lock(&oi->ip_lock);
+		list_del(&ue->ue_ip_node);
+		spin_unlock(&oi->ip_lock);
+		kfree(ue);
+	}
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc)
+{
+	ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
	ocfs2_unlock_pages(wc);
	brelse(wc->w_di_bh);
	kfree(wc);
@@ -1319,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
 
 static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
				  struct ocfs2_super *osb, loff_t pos,
-				  unsigned len, struct buffer_head *di_bh)
+				  unsigned len, ocfs2_write_type_t type,
+				  struct buffer_head *di_bh)
 {
	u32 cend;
	struct ocfs2_write_ctxt *wc;
@@ -1334,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
	wc->w_clen = cend - wc->w_cpos + 1;
	get_bh(di_bh);
	wc->w_di_bh = di_bh;
+	wc->w_type = type;
 
	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
		wc->w_large_pages = 1;
@@ -1341,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
		wc->w_large_pages = 0;
 
	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+	INIT_LIST_HEAD(&wc->w_unwritten_list);
 
	*wcp = wc;
 
@@ -1401,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
		to = user_pos + user_len;
	struct page *tmppage;
 
-	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+	if (wc->w_target_page)
+		ocfs2_zero_new_buffers(wc->w_target_page, from, to);
 
	for(i = 0; i < wc->w_num_pages; i++) {
		tmppage = wc->w_pages[i];
 
-		if (page_has_buffers(tmppage)) {
+		if (tmppage && page_has_buffers(tmppage)) {
			if (ocfs2_should_order_data(inode))
				ocfs2_jbd2_file_inode(wc->w_handle, inode);
 
@@ -1536,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
		wc->w_num_pages = 1;
		start = target_index;
	}
+	end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
 
	for(i = 0; i < wc->w_num_pages; i++) {
		index = start + i;
 
-		if (index == target_index && mmap_page) {
+		if (index >= target_index && index <= end_index &&
+		    wc->w_type == OCFS2_WRITE_MMAP) {
			/*
			 * ocfs2_pagemkwrite() is a little different
			 * and wants us to directly use the page
@@ -1559,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
			page_cache_get(mmap_page);
			wc->w_pages[i] = mmap_page;
			wc->w_target_locked = true;
+		} else if (index >= target_index && index <= end_index &&
+			   wc->w_type == OCFS2_WRITE_DIRECT) {
+			/* Direct write has no mapping page. */
+			wc->w_pages[i] = NULL;
+			continue;
		} else {
			wc->w_pages[i] = find_or_create_page(mapping, index,
							     GFP_NOFS);
@@ -1583,19 +1114,20 @@ out:
 * Prepare a single cluster for write one cluster into the file.
 */
 static int ocfs2_write_cluster(struct address_space *mapping,
-			       u32 phys, unsigned int unwritten,
+			       u32 *phys, unsigned int new,
+			       unsigned int clear_unwritten,
			       unsigned int should_zero,
			       struct ocfs2_alloc_context *data_ac,
			       struct ocfs2_alloc_context *meta_ac,
			       struct ocfs2_write_ctxt *wc, u32 cpos,
			       loff_t user_pos, unsigned user_len)
 {
-	int ret, i, new;
-	u64 v_blkno, p_blkno;
+	int ret, i;
+	u64 p_blkno;
	struct inode *inode = mapping->host;
	struct ocfs2_extent_tree et;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 
-	new = phys == 0 ? 1 : 0;
	if (new) {
		u32 tmp_pos;
 
@@ -1605,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
		 */
		tmp_pos = cpos;
		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
-					   &tmp_pos, 1, 0, wc->w_di_bh,
-					   wc->w_handle, data_ac,
-					   meta_ac, NULL);
+					   &tmp_pos, 1, !clear_unwritten,
+					   wc->w_di_bh, wc->w_handle,
+					   data_ac, meta_ac, NULL);
		/*
		 * This shouldn't happen because we must have already
		 * calculated the correct meta data allocation required. The
@@ -1624,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
			mlog_errno(ret);
			goto out;
		}
-	} else if (unwritten) {
+	} else if (clear_unwritten) {
		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
					      wc->w_di_bh);
		ret = ocfs2_mark_extent_written(inode, &et,
-						wc->w_handle, cpos, 1, phys,
+						wc->w_handle, cpos, 1, *phys,
| 1632 | meta_ac, &wc->w_dealloc); | 1164 | meta_ac, &wc->w_dealloc); |
| 1633 | if (ret < 0) { | 1165 | if (ret < 0) { |
| 1634 | mlog_errno(ret); | 1166 | mlog_errno(ret); |
| @@ -1636,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping, | |||
| 1636 | } | 1168 | } |
| 1637 | } | 1169 | } |
| 1638 | 1170 | ||
| 1639 | if (should_zero) | ||
| 1640 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); | ||
| 1641 | else | ||
| 1642 | v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; | ||
| 1643 | |||
| 1644 | /* | 1171 | /* |
| 1645 | * The only reason this should fail is due to an inability to | 1172 | * The only reason this should fail is due to an inability to |
| 1646 | * find the extent added. | 1173 | * find the extent added. |
| 1647 | */ | 1174 | */ |
| 1648 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | 1175 | ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL); |
| 1649 | NULL); | ||
| 1650 | if (ret < 0) { | 1176 | if (ret < 0) { |
| 1651 | mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " | 1177 | mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " |
| 1652 | "at logical block %llu", | 1178 | "at logical cluster %u", |
| 1653 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1179 | (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); |
| 1654 | (unsigned long long)v_blkno); | ||
| 1655 | goto out; | 1180 | goto out; |
| 1656 | } | 1181 | } |
| 1657 | 1182 | ||
| 1658 | BUG_ON(p_blkno == 0); | 1183 | BUG_ON(*phys == 0); |
| 1184 | |||
| 1185 | p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys); | ||
| 1186 | if (!should_zero) | ||
| 1187 | p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); | ||
| 1659 | 1188 | ||
| 1660 | for(i = 0; i < wc->w_num_pages; i++) { | 1189 | for(i = 0; i < wc->w_num_pages; i++) { |
| 1661 | int tmpret; | 1190 | int tmpret; |
| 1662 | 1191 | ||
| 1192 | /* This is the direct io target page. */ | ||
| 1193 | if (wc->w_pages[i] == NULL) { | ||
| 1194 | p_blkno++; | ||
| 1195 | continue; | ||
| 1196 | } | ||
| 1197 | |||
| 1663 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, | 1198 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, |
| 1664 | wc->w_pages[i], cpos, | 1199 | wc->w_pages[i], cpos, |
| 1665 | user_pos, user_len, | 1200 | user_pos, user_len, |
| @@ -1706,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, | |||
| 1706 | if ((cluster_off + local_len) > osb->s_clustersize) | 1241 | if ((cluster_off + local_len) > osb->s_clustersize) |
| 1707 | local_len = osb->s_clustersize - cluster_off; | 1242 | local_len = osb->s_clustersize - cluster_off; |
| 1708 | 1243 | ||
| 1709 | ret = ocfs2_write_cluster(mapping, desc->c_phys, | 1244 | ret = ocfs2_write_cluster(mapping, &desc->c_phys, |
| 1710 | desc->c_unwritten, | 1245 | desc->c_new, |
| 1246 | desc->c_clear_unwritten, | ||
| 1711 | desc->c_needs_zero, | 1247 | desc->c_needs_zero, |
| 1712 | data_ac, meta_ac, | 1248 | data_ac, meta_ac, |
| 1713 | wc, desc->c_cpos, pos, local_len); | 1249 | wc, desc->c_cpos, pos, local_len); |
| @@ -1778,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | |||
| 1778 | } | 1314 | } |
| 1779 | 1315 | ||
| 1780 | /* | 1316 | /* |
| 1317 | * Check whether this extent is marked UNWRITTEN by direct io. If so, we need | ||
| 1318 | * not do the zeroing work, and should not clear UNWRITTEN since it will be | ||
| 1319 | * cleared by the direct io procedure. | ||
| 1320 | * If this is a new extent allocated by direct io, we should mark it in | ||
| 1321 | * the ip_unwritten_list. | ||
| 1322 | */ | ||
| 1323 | static int ocfs2_unwritten_check(struct inode *inode, | ||
| 1324 | struct ocfs2_write_ctxt *wc, | ||
| 1325 | struct ocfs2_write_cluster_desc *desc) | ||
| 1326 | { | ||
| 1327 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
| 1328 | struct ocfs2_unwritten_extent *ue = NULL, *new = NULL; | ||
| 1329 | int ret = 0; | ||
| 1330 | |||
| 1331 | if (!desc->c_needs_zero) | ||
| 1332 | return 0; | ||
| 1333 | |||
| 1334 | retry: | ||
| 1335 | spin_lock(&oi->ip_lock); | ||
| 1336 | /* No need to zero, whether the io is buffered or direct: whoever is | ||
| 1337 | * zeroing the cluster is already doing so, and will clear unwritten | ||
| 1338 | * after all cluster io has finished. */ | ||
| 1339 | list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) { | ||
| 1340 | if (desc->c_cpos == ue->ue_cpos) { | ||
| 1341 | BUG_ON(desc->c_new); | ||
| 1342 | desc->c_needs_zero = 0; | ||
| 1343 | desc->c_clear_unwritten = 0; | ||
| 1344 | goto unlock; | ||
| 1345 | } | ||
| 1346 | } | ||
| 1347 | |||
| 1348 | if (wc->w_type != OCFS2_WRITE_DIRECT) | ||
| 1349 | goto unlock; | ||
| 1350 | |||
| 1351 | if (new == NULL) { | ||
| 1352 | spin_unlock(&oi->ip_lock); | ||
| 1353 | new = kmalloc(sizeof(struct ocfs2_unwritten_extent), | ||
| 1354 | GFP_NOFS); | ||
| 1355 | if (new == NULL) { | ||
| 1356 | ret = -ENOMEM; | ||
| 1357 | goto out; | ||
| 1358 | } | ||
| 1359 | goto retry; | ||
| 1360 | } | ||
| 1361 | /* This direct write will do the zeroing. */ | ||
| 1362 | new->ue_cpos = desc->c_cpos; | ||
| 1363 | new->ue_phys = desc->c_phys; | ||
| 1364 | desc->c_clear_unwritten = 0; | ||
| 1365 | list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); | ||
| 1366 | list_add_tail(&new->ue_node, &wc->w_unwritten_list); | ||
| 1367 | new = NULL; | ||
| 1368 | unlock: | ||
| 1369 | spin_unlock(&oi->ip_lock); | ||
| 1370 | out: | ||
| 1371 | kfree(new); /* kfree(NULL) is a no-op */ | ||
| 1373 | return ret; | ||
| 1374 | } | ||
| 1375 | |||
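The retry loop in ocfs2_unwritten_check() is the standard kernel pattern for allocating under a spinlock: kmalloc(GFP_NOFS) may sleep, so the lock is dropped, the node allocated, and the list re-scanned, since it may have changed while the lock was released. A minimal generic sketch of the same pattern, with illustrative struct and function names (this is not ocfs2 code):

    #include <linux/list.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct tracked {
            int key;
            struct list_head node;
    };

    static int track_key(spinlock_t *lock, struct list_head *head, int key)
    {
            struct tracked *t, *new = NULL;

    retry:
            spin_lock(lock);
            list_for_each_entry(t, head, node) {
                    if (t->key == key)
                            goto unlock;            /* already tracked */
            }
            if (!new) {
                    /* cannot sleep in kmalloc while holding the lock */
                    spin_unlock(lock);
                    new = kmalloc(sizeof(*new), GFP_NOFS);
                    if (!new)
                            return -ENOMEM;
                    goto retry;                     /* list may have changed */
            }
            new->key = key;
            list_add_tail(&new->node, head);
            new = NULL;
    unlock:
            spin_unlock(lock);
            kfree(new);                             /* no-op if consumed */
            return 0;
    }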
| 1376 | /* | ||
| 1781 | * Populate each single-cluster write descriptor in the write context | 1377 | * Populate each single-cluster write descriptor in the write context |
| 1782 | * with information about the i/o to be done. | 1378 | * with information about the i/o to be done. |
| 1783 | * | 1379 | * |
| @@ -1852,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode, | |||
| 1852 | if (phys == 0) { | 1448 | if (phys == 0) { |
| 1853 | desc->c_new = 1; | 1449 | desc->c_new = 1; |
| 1854 | desc->c_needs_zero = 1; | 1450 | desc->c_needs_zero = 1; |
| 1451 | desc->c_clear_unwritten = 1; | ||
| 1855 | *clusters_to_alloc = *clusters_to_alloc + 1; | 1452 | *clusters_to_alloc = *clusters_to_alloc + 1; |
| 1856 | } | 1453 | } |
| 1857 | 1454 | ||
| 1858 | if (ext_flags & OCFS2_EXT_UNWRITTEN) { | 1455 | if (ext_flags & OCFS2_EXT_UNWRITTEN) { |
| 1859 | desc->c_unwritten = 1; | 1456 | desc->c_clear_unwritten = 1; |
| 1860 | desc->c_needs_zero = 1; | 1457 | desc->c_needs_zero = 1; |
| 1861 | } | 1458 | } |
| 1862 | 1459 | ||
| 1460 | ret = ocfs2_unwritten_check(inode, wc, desc); | ||
| 1461 | if (ret) { | ||
| 1462 | mlog_errno(ret); | ||
| 1463 | goto out; | ||
| 1464 | } | ||
| 1465 | |||
| 1863 | num_clusters--; | 1466 | num_clusters--; |
| 1864 | } | 1467 | } |
| 1865 | 1468 | ||
| @@ -2022,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, | |||
| 2022 | if (ret) | 1625 | if (ret) |
| 2023 | mlog_errno(ret); | 1626 | mlog_errno(ret); |
| 2024 | 1627 | ||
| 2025 | wc->w_first_new_cpos = | 1628 | /* There is no wc if this is called from the direct io path. */ |
| 2026 | ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); | 1629 | if (wc) |
| 1630 | wc->w_first_new_cpos = | ||
| 1631 | ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); | ||
| 2027 | 1632 | ||
| 2028 | return ret; | 1633 | return ret; |
| 2029 | } | 1634 | } |
| @@ -2077,9 +1682,8 @@ out: | |||
| 2077 | return ret; | 1682 | return ret; |
| 2078 | } | 1683 | } |
| 2079 | 1684 | ||
| 2080 | int ocfs2_write_begin_nolock(struct file *filp, | 1685 | int ocfs2_write_begin_nolock(struct address_space *mapping, |
| 2081 | struct address_space *mapping, | 1686 | loff_t pos, unsigned len, ocfs2_write_type_t type, |
| 2082 | loff_t pos, unsigned len, unsigned flags, | ||
| 2083 | struct page **pagep, void **fsdata, | 1687 | struct page **pagep, void **fsdata, |
| 2084 | struct buffer_head *di_bh, struct page *mmap_page) | 1688 | struct buffer_head *di_bh, struct page *mmap_page) |
| 2085 | { | 1689 | { |
| @@ -2096,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp, | |||
| 2096 | int try_free = 1, ret1; | 1700 | int try_free = 1, ret1; |
| 2097 | 1701 | ||
| 2098 | try_again: | 1702 | try_again: |
| 2099 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); | 1703 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh); |
| 2100 | if (ret) { | 1704 | if (ret) { |
| 2101 | mlog_errno(ret); | 1705 | mlog_errno(ret); |
| 2102 | return ret; | 1706 | return ret; |
| @@ -2115,14 +1719,17 @@ try_again: | |||
| 2115 | } | 1719 | } |
| 2116 | } | 1720 | } |
| 2117 | 1721 | ||
| 2118 | if (ocfs2_sparse_alloc(osb)) | 1722 | /* Direct io changes i_size late, so do not zero the tail here. */ |
| 2119 | ret = ocfs2_zero_tail(inode, di_bh, pos); | 1723 | if (type != OCFS2_WRITE_DIRECT) { |
| 2120 | else | 1724 | if (ocfs2_sparse_alloc(osb)) |
| 2121 | ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, | 1725 | ret = ocfs2_zero_tail(inode, di_bh, pos); |
| 2122 | wc); | 1726 | else |
| 2123 | if (ret) { | 1727 | ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, |
| 2124 | mlog_errno(ret); | 1728 | len, wc); |
| 2125 | goto out; | 1729 | if (ret) { |
| 1730 | mlog_errno(ret); | ||
| 1731 | goto out; | ||
| 1732 | } | ||
| 2126 | } | 1733 | } |
| 2127 | 1734 | ||
| 2128 | ret = ocfs2_check_range_for_refcount(inode, pos, len); | 1735 | ret = ocfs2_check_range_for_refcount(inode, pos, len); |
| @@ -2153,7 +1760,7 @@ try_again: | |||
| 2153 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1760 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
| 2154 | (long long)i_size_read(inode), | 1761 | (long long)i_size_read(inode), |
| 2155 | le32_to_cpu(di->i_clusters), | 1762 | le32_to_cpu(di->i_clusters), |
| 2156 | pos, len, flags, mmap_page, | 1763 | pos, len, type, mmap_page, |
| 2157 | clusters_to_alloc, extents_to_split); | 1764 | clusters_to_alloc, extents_to_split); |
| 2158 | 1765 | ||
| 2159 | /* | 1766 | /* |
| @@ -2183,17 +1790,17 @@ try_again: | |||
| 2183 | 1790 | ||
| 2184 | credits = ocfs2_calc_extend_credits(inode->i_sb, | 1791 | credits = ocfs2_calc_extend_credits(inode->i_sb, |
| 2185 | &di->id2.i_list); | 1792 | &di->id2.i_list); |
| 2186 | 1793 | } else if (type == OCFS2_WRITE_DIRECT) | |
| 2187 | } | 1794 | /* a direct write need not start a trans if no extent is allocated. */ |
| 1795 | goto success; | ||
| 2188 | 1796 | ||
| 2189 | /* | 1797 | /* |
| 2190 | * We have to zero sparse allocated clusters, unwritten extent clusters, | 1798 | * We have to zero sparse allocated clusters, unwritten extent clusters, |
| 2191 | * and non-sparse clusters we just extended. For non-sparse writes, | 1799 | * and non-sparse clusters we just extended. For non-sparse writes, |
| 2192 | * we know zeros will only be needed in the first and/or last cluster. | 1800 | * we know zeros will only be needed in the first and/or last cluster. |
| 2193 | */ | 1801 | */ |
| 2194 | if (clusters_to_alloc || extents_to_split || | 1802 | if (wc->w_clen && (wc->w_desc[0].c_needs_zero || |
| 2195 | (wc->w_clen && (wc->w_desc[0].c_needs_zero || | 1803 | wc->w_desc[wc->w_clen - 1].c_needs_zero)) |
| 2196 | wc->w_desc[wc->w_clen - 1].c_needs_zero))) | ||
| 2197 | cluster_of_pages = 1; | 1804 | cluster_of_pages = 1; |
| 2198 | else | 1805 | else |
| 2199 | cluster_of_pages = 0; | 1806 | cluster_of_pages = 0; |
| @@ -2260,7 +1867,8 @@ try_again: | |||
| 2260 | ocfs2_free_alloc_context(meta_ac); | 1867 | ocfs2_free_alloc_context(meta_ac); |
| 2261 | 1868 | ||
| 2262 | success: | 1869 | success: |
| 2263 | *pagep = wc->w_target_page; | 1870 | if (pagep) |
| 1871 | *pagep = wc->w_target_page; | ||
| 2264 | *fsdata = wc; | 1872 | *fsdata = wc; |
| 2265 | return 0; | 1873 | return 0; |
| 2266 | out_quota: | 1874 | out_quota: |
| @@ -2271,7 +1879,7 @@ out_commit: | |||
| 2271 | ocfs2_commit_trans(osb, handle); | 1879 | ocfs2_commit_trans(osb, handle); |
| 2272 | 1880 | ||
| 2273 | out: | 1881 | out: |
| 2274 | ocfs2_free_write_ctxt(wc); | 1882 | ocfs2_free_write_ctxt(inode, wc); |
| 2275 | 1883 | ||
| 2276 | if (data_ac) { | 1884 | if (data_ac) { |
| 2277 | ocfs2_free_alloc_context(data_ac); | 1885 | ocfs2_free_alloc_context(data_ac); |
| @@ -2323,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, | |||
| 2323 | */ | 1931 | */ |
| 2324 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 1932 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
| 2325 | 1933 | ||
| 2326 | ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, | 1934 | ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, |
| 2327 | fsdata, di_bh, NULL); | 1935 | pagep, fsdata, di_bh, NULL); |
| 2328 | if (ret) { | 1936 | if (ret) { |
| 2329 | mlog_errno(ret); | 1937 | mlog_errno(ret); |
| 2330 | goto out_fail; | 1938 | goto out_fail; |
| @@ -2381,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
| 2381 | handle_t *handle = wc->w_handle; | 1989 | handle_t *handle = wc->w_handle; |
| 2382 | struct page *tmppage; | 1990 | struct page *tmppage; |
| 2383 | 1991 | ||
| 2384 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, | 1992 | BUG_ON(!list_empty(&wc->w_unwritten_list)); |
| 2385 | OCFS2_JOURNAL_ACCESS_WRITE); | 1993 | |
| 2386 | if (ret) { | 1994 | if (handle) { |
| 2387 | copied = ret; | 1995 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), |
| 2388 | mlog_errno(ret); | 1996 | wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); |
| 2389 | goto out; | 1997 | if (ret) { |
| 1998 | copied = ret; | ||
| 1999 | mlog_errno(ret); | ||
| 2000 | goto out; | ||
| 2001 | } | ||
| 2390 | } | 2002 | } |
| 2391 | 2003 | ||
| 2392 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | 2004 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { |
| @@ -2394,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
| 2394 | goto out_write_size; | 2006 | goto out_write_size; |
| 2395 | } | 2007 | } |
| 2396 | 2008 | ||
| 2397 | if (unlikely(copied < len)) { | 2009 | if (unlikely(copied < len) && wc->w_target_page) { |
| 2398 | if (!PageUptodate(wc->w_target_page)) | 2010 | if (!PageUptodate(wc->w_target_page)) |
| 2399 | copied = 0; | 2011 | copied = 0; |
| 2400 | 2012 | ||
| 2401 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, | 2013 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, |
| 2402 | start+len); | 2014 | start+len); |
| 2403 | } | 2015 | } |
| 2404 | flush_dcache_page(wc->w_target_page); | 2016 | if (wc->w_target_page) |
| 2017 | flush_dcache_page(wc->w_target_page); | ||
| 2405 | 2018 | ||
| 2406 | for(i = 0; i < wc->w_num_pages; i++) { | 2019 | for(i = 0; i < wc->w_num_pages; i++) { |
| 2407 | tmppage = wc->w_pages[i]; | 2020 | tmppage = wc->w_pages[i]; |
| 2408 | 2021 | ||
| 2022 | /* This is the direct io target page. */ | ||
| 2023 | if (tmppage == NULL) | ||
| 2024 | continue; | ||
| 2025 | |||
| 2409 | if (tmppage == wc->w_target_page) { | 2026 | if (tmppage == wc->w_target_page) { |
| 2410 | from = wc->w_target_from; | 2027 | from = wc->w_target_from; |
| 2411 | to = wc->w_target_to; | 2028 | to = wc->w_target_to; |
| @@ -2424,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
| 2424 | } | 2041 | } |
| 2425 | 2042 | ||
| 2426 | if (page_has_buffers(tmppage)) { | 2043 | if (page_has_buffers(tmppage)) { |
| 2427 | if (ocfs2_should_order_data(inode)) | 2044 | if (handle && ocfs2_should_order_data(inode)) |
| 2428 | ocfs2_jbd2_file_inode(wc->w_handle, inode); | 2045 | ocfs2_jbd2_file_inode(handle, inode); |
| 2429 | block_commit_write(tmppage, from, to); | 2046 | block_commit_write(tmppage, from, to); |
| 2430 | } | 2047 | } |
| 2431 | } | 2048 | } |
| 2432 | 2049 | ||
| 2433 | out_write_size: | 2050 | out_write_size: |
| 2434 | pos += copied; | 2051 | /* Direct io do not update i_size here. */ |
| 2435 | if (pos > i_size_read(inode)) { | 2052 | if (wc->w_type != OCFS2_WRITE_DIRECT) { |
| 2436 | i_size_write(inode, pos); | 2053 | pos += copied; |
| 2437 | mark_inode_dirty(inode); | 2054 | if (pos > i_size_read(inode)) { |
| 2438 | } | 2055 | i_size_write(inode, pos); |
| 2439 | inode->i_blocks = ocfs2_inode_sector_count(inode); | 2056 | mark_inode_dirty(inode); |
| 2440 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | 2057 | } |
| 2441 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 2058 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
| 2442 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | 2059 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); |
| 2443 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | 2060 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
| 2444 | ocfs2_update_inode_fsync_trans(handle, inode, 1); | 2061 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); |
| 2445 | ocfs2_journal_dirty(handle, wc->w_di_bh); | 2062 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); |
| 2063 | ocfs2_update_inode_fsync_trans(handle, inode, 1); | ||
| 2064 | } | ||
| 2065 | if (handle) | ||
| 2066 | ocfs2_journal_dirty(handle, wc->w_di_bh); | ||
| 2446 | 2067 | ||
| 2447 | out: | 2068 | out: |
| 2448 | /* unlock pages before dealloc since it needs acquiring j_trans_barrier | 2069 | /* unlock pages before dealloc since it needs acquiring j_trans_barrier |
| @@ -2452,7 +2073,8 @@ out: | |||
| 2452 | */ | 2073 | */ |
| 2453 | ocfs2_unlock_pages(wc); | 2074 | ocfs2_unlock_pages(wc); |
| 2454 | 2075 | ||
| 2455 | ocfs2_commit_trans(osb, handle); | 2076 | if (handle) |
| 2077 | ocfs2_commit_trans(osb, handle); | ||
| 2456 | 2078 | ||
| 2457 | ocfs2_run_deallocs(osb, &wc->w_dealloc); | 2079 | ocfs2_run_deallocs(osb, &wc->w_dealloc); |
| 2458 | 2080 | ||
| @@ -2477,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, | |||
| 2477 | return ret; | 2099 | return ret; |
| 2478 | } | 2100 | } |
| 2479 | 2101 | ||
| 2102 | struct ocfs2_dio_write_ctxt { | ||
| 2103 | struct list_head dw_zero_list; | ||
| 2104 | unsigned dw_zero_count; | ||
| 2105 | int dw_orphaned; | ||
| 2106 | pid_t dw_writer_pid; | ||
| 2107 | }; | ||
| 2108 | |||
| 2109 | static struct ocfs2_dio_write_ctxt * | ||
| 2110 | ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc) | ||
| 2111 | { | ||
| 2112 | struct ocfs2_dio_write_ctxt *dwc = NULL; | ||
| 2113 | |||
| 2114 | if (bh->b_private) | ||
| 2115 | return bh->b_private; | ||
| 2116 | |||
| 2117 | dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS); | ||
| 2118 | if (dwc == NULL) | ||
| 2119 | return NULL; | ||
| 2120 | INIT_LIST_HEAD(&dwc->dw_zero_list); | ||
| 2121 | dwc->dw_zero_count = 0; | ||
| 2122 | dwc->dw_orphaned = 0; | ||
| 2123 | dwc->dw_writer_pid = task_pid_nr(current); | ||
| 2124 | bh->b_private = dwc; | ||
| 2125 | *alloc = 1; | ||
| 2126 | |||
| 2127 | return dwc; | ||
| 2128 | } | ||
| 2129 | |||
| 2130 | static void ocfs2_dio_free_write_ctx(struct inode *inode, | ||
| 2131 | struct ocfs2_dio_write_ctxt *dwc) | ||
| 2132 | { | ||
| 2133 | ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list); | ||
| 2134 | kfree(dwc); | ||
| 2135 | } | ||
| 2136 | |||
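The buffer_head passed to get_block is reused across all calls belonging to one direct io request, and the dio core later hands bh->b_private to the end_io callback as its private argument; that is what lets ocfs2_dio_alloc_write_ctx() accumulate state across calls. A sketch of the mechanism with hypothetical names (the dio-core behavior is paraphrased from fs/direct-io.c, not quoted):

    struct my_dio_ctx {
            int pending;            /* work recorded for end_io */
    };

    static int my_dio_get_block(struct inode *inode, sector_t iblock,
                                struct buffer_head *bh_result, int create)
    {
            struct my_dio_ctx *ctx = bh_result->b_private;  /* NULL on 1st call */

            if (!ctx) {
                    ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
                    if (!ctx)
                            return -ENOMEM;
                    /* seen again on later calls of the same dio, and
                     * passed to end_io as the 'private' argument */
                    bh_result->b_private = ctx;
            }
            /* ... map the block and record work for end_io in ctx ... */
            return 0;
    }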
| 2137 | /* | ||
| 2138 | * TODO: Make this into a generic get_blocks function. | ||
| 2139 | * | ||
| 2140 | * From do_direct_io in direct-io.c: | ||
| 2141 | * "So what we do is to permit the ->get_blocks function to populate | ||
| 2142 | * bh.b_size with the size of IO which is permitted at this offset and | ||
| 2143 | * this i_blkbits." | ||
| 2144 | * | ||
| 2145 | * This function is called directly from get_more_blocks in direct-io.c. | ||
| 2146 | * | ||
| 2147 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | ||
| 2148 | * fs_count, map_bh, dio->rw == WRITE); | ||
| 2149 | */ | ||
| 2150 | static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, | ||
| 2151 | struct buffer_head *bh_result, int create) | ||
| 2152 | { | ||
| 2153 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 2154 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
| 2155 | struct ocfs2_write_ctxt *wc; | ||
| 2156 | struct ocfs2_write_cluster_desc *desc = NULL; | ||
| 2157 | struct ocfs2_dio_write_ctxt *dwc = NULL; | ||
| 2158 | struct buffer_head *di_bh = NULL; | ||
| 2159 | u64 p_blkno; | ||
| 2160 | loff_t pos = iblock << inode->i_sb->s_blocksize_bits; | ||
| 2161 | unsigned len, total_len = bh_result->b_size; | ||
| 2162 | int ret = 0, first_get_block = 0; | ||
| 2163 | |||
| 2164 | len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); | ||
| 2165 | len = min(total_len, len); | ||
| 2166 | |||
| 2167 | mlog(0, "get block of %lu at %llu:%u req %u\n", | ||
| 2168 | inode->i_ino, pos, len, total_len); | ||
| 2169 | |||
| 2170 | /* | ||
| 2171 | * Because we need to change file size in ocfs2_dio_end_io_write(), or | ||
| 2172 | * we may need to add it to orphan dir. So can not fall to fast path | ||
| 2173 | * while file size will be changed. | ||
| 2174 | */ | ||
| 2175 | if (pos + total_len <= i_size_read(inode)) { | ||
| 2176 | down_read(&oi->ip_alloc_sem); | ||
| 2177 | /* This is the fast path for rewrites. */ | ||
| 2178 | ret = ocfs2_get_block(inode, iblock, bh_result, create); | ||
| 2179 | |||
| 2180 | up_read(&oi->ip_alloc_sem); | ||
| 2181 | |||
| 2182 | if (buffer_mapped(bh_result) && | ||
| 2183 | !buffer_new(bh_result) && | ||
| 2184 | ret == 0) | ||
| 2185 | goto out; | ||
| 2186 | |||
| 2187 | /* Clear state set by ocfs2_get_block. */ | ||
| 2188 | bh_result->b_state = 0; | ||
| 2189 | } | ||
| 2190 | |||
| 2191 | dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block); | ||
| 2192 | if (unlikely(dwc == NULL)) { | ||
| 2193 | ret = -ENOMEM; | ||
| 2194 | mlog_errno(ret); | ||
| 2195 | goto out; | ||
| 2196 | } | ||
| 2197 | |||
| 2198 | if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) > | ||
| 2199 | ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) && | ||
| 2200 | !dwc->dw_orphaned) { | ||
| 2201 | /* | ||
| 2202 | * when we are going to alloc extents beyond file size, add the | ||
| 2203 | * inode to orphan dir, so we can recall those spaces when | ||
| 2204 | * system crashed during write. | ||
| 2205 | */ | ||
| 2206 | ret = ocfs2_add_inode_to_orphan(osb, inode); | ||
| 2207 | if (ret < 0) { | ||
| 2208 | mlog_errno(ret); | ||
| 2209 | goto out; | ||
| 2210 | } | ||
| 2211 | dwc->dw_orphaned = 1; | ||
| 2212 | } | ||
| 2213 | |||
| 2214 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | ||
| 2215 | if (ret) { | ||
| 2216 | mlog_errno(ret); | ||
| 2217 | goto out; | ||
| 2218 | } | ||
| 2219 | |||
| 2220 | down_write(&oi->ip_alloc_sem); | ||
| 2221 | |||
| 2222 | if (first_get_block) { | ||
| 2223 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | ||
| 2224 | ret = ocfs2_zero_tail(inode, di_bh, pos); | ||
| 2225 | else | ||
| 2226 | ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, | ||
| 2227 | total_len, NULL); | ||
| 2228 | if (ret < 0) { | ||
| 2229 | mlog_errno(ret); | ||
| 2230 | goto unlock; | ||
| 2231 | } | ||
| 2232 | } | ||
| 2233 | |||
| 2234 | ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len, | ||
| 2235 | OCFS2_WRITE_DIRECT, NULL, | ||
| 2236 | (void **)&wc, di_bh, NULL); | ||
| 2237 | if (ret) { | ||
| 2238 | mlog_errno(ret); | ||
| 2239 | goto unlock; | ||
| 2240 | } | ||
| 2241 | |||
| 2242 | desc = &wc->w_desc[0]; | ||
| 2243 | |||
| 2244 | p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys); | ||
| 2245 | BUG_ON(p_blkno == 0); | ||
| 2246 | p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1); | ||
| 2247 | |||
| 2248 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
| 2249 | bh_result->b_size = len; | ||
| 2250 | if (desc->c_needs_zero) | ||
| 2251 | set_buffer_new(bh_result); | ||
| 2252 | |||
| 2253 | /* end_io may sleep, which must not happen in irq context, so defer | ||
| 2254 | * it to the dio work queue. */ | ||
| 2255 | set_buffer_defer_completion(bh_result); | ||
| 2256 | |||
| 2257 | if (!list_empty(&wc->w_unwritten_list)) { | ||
| 2258 | struct ocfs2_unwritten_extent *ue = NULL; | ||
| 2259 | |||
| 2260 | ue = list_first_entry(&wc->w_unwritten_list, | ||
| 2261 | struct ocfs2_unwritten_extent, | ||
| 2262 | ue_node); | ||
| 2263 | BUG_ON(ue->ue_cpos != desc->c_cpos); | ||
| 2264 | /* The physical address may still be 0; fill it in now. */ | ||
| 2265 | ue->ue_phys = desc->c_phys; | ||
| 2266 | |||
| 2267 | list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); | ||
| 2268 | dwc->dw_zero_count++; | ||
| 2269 | } | ||
| 2270 | |||
| 2271 | ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); | ||
| 2272 | BUG_ON(ret != len); | ||
| 2273 | ret = 0; | ||
| 2274 | unlock: | ||
| 2275 | up_write(&oi->ip_alloc_sem); | ||
| 2276 | ocfs2_inode_unlock(inode, 1); | ||
| 2277 | brelse(di_bh); | ||
| 2278 | out: | ||
| 2279 | if (ret < 0) | ||
| 2280 | ret = -EIO; | ||
| 2281 | return ret; | ||
| 2282 | } | ||
| 2283 | |||
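The physical block above is computed in two steps: translate the physical cluster to its first block, then add iblock's offset inside the cluster. Because blocks-per-cluster (bpc) is always a power of two in ocfs2, the in-cluster offset reduces to a mask. A self-contained restatement of that arithmetic (a worked sketch, not ocfs2 code):

    #include <linux/types.h>

    /* bpc = clustersize / blocksize, a power of two in ocfs2 */
    static inline u64 blkno_in_cluster(u64 phys_cluster, u64 iblock, u64 bpc)
    {
            return phys_cluster * bpc + (iblock & (bpc - 1));
    }

    /* Example: 4KB clusters and 512B blocks give bpc = 8, so iblock 21
     * in physical cluster 100 maps to 100 * 8 + (21 & 7) = 805. */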
| 2284 | static void ocfs2_dio_end_io_write(struct inode *inode, | ||
| 2285 | struct ocfs2_dio_write_ctxt *dwc, | ||
| 2286 | loff_t offset, | ||
| 2287 | ssize_t bytes) | ||
| 2288 | { | ||
| 2289 | struct ocfs2_cached_dealloc_ctxt dealloc; | ||
| 2290 | struct ocfs2_extent_tree et; | ||
| 2291 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 2292 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
| 2293 | struct ocfs2_unwritten_extent *ue = NULL; | ||
| 2294 | struct buffer_head *di_bh = NULL; | ||
| 2295 | struct ocfs2_dinode *di; | ||
| 2296 | struct ocfs2_alloc_context *data_ac = NULL; | ||
| 2297 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
| 2298 | handle_t *handle = NULL; | ||
| 2299 | loff_t end = offset + bytes; | ||
| 2300 | int ret = 0, credits = 0, locked = 0; | ||
| 2301 | |||
| 2302 | ocfs2_init_dealloc_ctxt(&dealloc); | ||
| 2303 | |||
| 2304 | /* We clear unwritten flags, delete orphans, and change i_size here. If | ||
| 2305 | * none of these is needed, we can skip all this. */ | ||
| 2306 | if (list_empty(&dwc->dw_zero_list) && | ||
| 2307 | end <= i_size_read(inode) && | ||
| 2308 | !dwc->dw_orphaned) | ||
| 2309 | goto out; | ||
| 2310 | |||
| 2311 | /* ocfs2_file_write_iter already holds i_mutex, so we need not take it | ||
| 2312 | * again if we are in that context. */ | ||
| 2313 | if (dwc->dw_writer_pid != task_pid_nr(current)) { | ||
| 2314 | mutex_lock(&inode->i_mutex); | ||
| 2315 | locked = 1; | ||
| 2316 | } | ||
| 2317 | |||
| 2318 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | ||
| 2319 | if (ret < 0) { | ||
| 2320 | mlog_errno(ret); | ||
| 2321 | goto out; | ||
| 2322 | } | ||
| 2323 | |||
| 2324 | down_write(&oi->ip_alloc_sem); | ||
| 2325 | |||
| 2326 | /* Delete the orphan entry before acquiring i_mutex. */ | ||
| 2327 | if (dwc->dw_orphaned) { | ||
| 2328 | BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); | ||
| 2329 | |||
| 2330 | end = end > i_size_read(inode) ? end : 0; | ||
| 2331 | |||
| 2332 | ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, | ||
| 2333 | !!end, end); | ||
| 2334 | if (ret < 0) | ||
| 2335 | mlog_errno(ret); | ||
| 2336 | } | ||
| 2337 | |||
| 2338 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
| 2339 | |||
| 2340 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); | ||
| 2341 | |||
| 2342 | ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, | ||
| 2343 | &data_ac, &meta_ac); | ||
| 2344 | if (ret) { | ||
| 2345 | mlog_errno(ret); | ||
| 2346 | goto unlock; | ||
| 2347 | } | ||
| 2348 | |||
| 2349 | credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list); | ||
| 2350 | |||
| 2351 | handle = ocfs2_start_trans(osb, credits); | ||
| 2352 | if (IS_ERR(handle)) { | ||
| 2353 | ret = PTR_ERR(handle); | ||
| 2354 | mlog_errno(ret); | ||
| 2355 | goto unlock; | ||
| 2356 | } | ||
| 2357 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | ||
| 2358 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 2359 | if (ret) { | ||
| 2360 | mlog_errno(ret); | ||
| 2361 | goto commit; | ||
| 2362 | } | ||
| 2363 | |||
| 2364 | list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { | ||
| 2365 | ret = ocfs2_mark_extent_written(inode, &et, handle, | ||
| 2366 | ue->ue_cpos, 1, | ||
| 2367 | ue->ue_phys, | ||
| 2368 | meta_ac, &dealloc); | ||
| 2369 | if (ret < 0) { | ||
| 2370 | mlog_errno(ret); | ||
| 2371 | break; | ||
| 2372 | } | ||
| 2373 | } | ||
| 2374 | |||
| 2375 | if (end > i_size_read(inode)) { | ||
| 2376 | ret = ocfs2_set_inode_size(handle, inode, di_bh, end); | ||
| 2377 | if (ret < 0) | ||
| 2378 | mlog_errno(ret); | ||
| 2379 | } | ||
| 2380 | commit: | ||
| 2381 | ocfs2_commit_trans(osb, handle); | ||
| 2382 | unlock: | ||
| 2383 | up_write(&oi->ip_alloc_sem); | ||
| 2384 | ocfs2_inode_unlock(inode, 1); | ||
| 2385 | brelse(di_bh); | ||
| 2386 | out: | ||
| 2387 | if (data_ac) | ||
| 2388 | ocfs2_free_alloc_context(data_ac); | ||
| 2389 | if (meta_ac) | ||
| 2390 | ocfs2_free_alloc_context(meta_ac); | ||
| 2391 | ocfs2_run_deallocs(osb, &dealloc); | ||
| 2392 | if (locked) | ||
| 2393 | mutex_unlock(&inode->i_mutex); | ||
| 2394 | ocfs2_dio_free_write_ctx(inode, dwc); | ||
| 2395 | } | ||
| 2396 | |||
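ocfs2_dio_end_io_write() follows the canonical ocfs2 journaling sequence also visible in ocfs2_write_end_nolock() above: reserve allocators and credits, start a transaction, declare write access to the dinode buffer, modify it, mark it dirty, and commit. Stripped of error handling, the skeleton is (a condensed restatement of calls already shown in this patch, not new API):

    handle = ocfs2_start_trans(osb, credits);
    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
                                  OCFS2_JOURNAL_ACCESS_WRITE);
    /* ... ocfs2_mark_extent_written() for each recorded extent,
     * ocfs2_set_inode_size() if the write extended the file ... */
    ocfs2_journal_dirty(handle, di_bh);
    ocfs2_commit_trans(osb, handle);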
| 2397 | /* | ||
| 2398 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | ||
| 2399 | * particularly interested in the aio/dio case. We use the rw_lock DLM lock | ||
| 2400 | * to protect io on one node from truncation on another. | ||
| 2401 | */ | ||
| 2402 | static int ocfs2_dio_end_io(struct kiocb *iocb, | ||
| 2403 | loff_t offset, | ||
| 2404 | ssize_t bytes, | ||
| 2405 | void *private) | ||
| 2406 | { | ||
| 2407 | struct inode *inode = file_inode(iocb->ki_filp); | ||
| 2408 | int level; | ||
| 2409 | |||
| 2410 | if (bytes <= 0) | ||
| 2411 | return 0; | ||
| 2412 | |||
| 2413 | /* this io's submitter should not have unlocked this before we could */ | ||
| 2414 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | ||
| 2415 | |||
| 2416 | if (private) | ||
| 2417 | ocfs2_dio_end_io_write(inode, private, offset, bytes); | ||
| 2418 | |||
| 2419 | ocfs2_iocb_clear_rw_locked(iocb); | ||
| 2420 | |||
| 2421 | level = ocfs2_iocb_rw_locked_level(iocb); | ||
| 2422 | ocfs2_rw_unlock(inode, level); | ||
| 2423 | return 0; | ||
| 2424 | } | ||
| 2425 | |||
| 2426 | static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | ||
| 2427 | loff_t offset) | ||
| 2428 | { | ||
| 2429 | struct file *file = iocb->ki_filp; | ||
| 2430 | struct inode *inode = file_inode(file)->i_mapping->host; | ||
| 2431 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 2432 | loff_t end = offset + iter->count; | ||
| 2433 | get_block_t *get_block; | ||
| 2434 | |||
| 2435 | /* | ||
| 2436 | * Fall back to buffered I/O if we see an inode without | ||
| 2437 | * extents. | ||
| 2438 | */ | ||
| 2439 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | ||
| 2440 | return 0; | ||
| 2441 | |||
| 2442 | /* Fall back to buffered I/O if we do not support append dio. */ | ||
| 2443 | if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb)) | ||
| 2444 | return 0; | ||
| 2445 | |||
| 2446 | if (iov_iter_rw(iter) == READ) | ||
| 2447 | get_block = ocfs2_get_block; | ||
| 2448 | else | ||
| 2449 | get_block = ocfs2_dio_get_block; | ||
| 2450 | |||
| 2451 | return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, | ||
| 2452 | iter, offset, get_block, | ||
| 2453 | ocfs2_dio_end_io, NULL, 0); | ||
| 2454 | } | ||
| 2455 | |||
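Returning 0 from ->direct_IO is the conventional "fall back to buffered" signal: the generic write path sees that nothing was transferred directly and continues through the page cache. On the caller side this looks roughly like (paraphrasing the generic direct-write path of this kernel era; a sketch, not exact mainline code):

    written = mapping->a_ops->direct_IO(iocb, iter, pos);
    if (written == 0 && iov_iter_count(iter))
            /* nothing went out directly; write through the page cache */
            written = generic_perform_write(file, iter, pos);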
| 2480 | const struct address_space_operations ocfs2_aops = { | 2456 | const struct address_space_operations ocfs2_aops = { |
| 2481 | .readpage = ocfs2_readpage, | 2457 | .readpage = ocfs2_readpage, |
| 2482 | .readpages = ocfs2_readpages, | 2458 | .readpages = ocfs2_readpages, |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 24e496d6bdcd..b1c9f28a57b1 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
| @@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
| 47 | loff_t pos, unsigned len, unsigned copied, | 47 | loff_t pos, unsigned len, unsigned copied, |
| 48 | struct page *page, void *fsdata); | 48 | struct page *page, void *fsdata); |
| 49 | 49 | ||
| 50 | int ocfs2_write_begin_nolock(struct file *filp, | 50 | typedef enum { |
| 51 | struct address_space *mapping, | 51 | OCFS2_WRITE_BUFFER = 0, |
| 52 | loff_t pos, unsigned len, unsigned flags, | 52 | OCFS2_WRITE_DIRECT, |
| 53 | OCFS2_WRITE_MMAP, | ||
| 54 | } ocfs2_write_type_t; | ||
| 55 | |||
| 56 | int ocfs2_write_begin_nolock(struct address_space *mapping, | ||
| 57 | loff_t pos, unsigned len, ocfs2_write_type_t type, | ||
| 53 | struct page **pagep, void **fsdata, | 58 | struct page **pagep, void **fsdata, |
| 54 | struct buffer_head *di_bh, struct page *mmap_page); | 59 | struct buffer_head *di_bh, struct page *mmap_page); |
| 55 | 60 | ||
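With the enum in place, each caller states its intent directly instead of threading a file pointer and write flags through the stack. The call sites in this series then read roughly as follows (the buffered and direct calls appear in the aops.c hunks above; the mmap one is inferred from the enum and shown here as an assumption):

    /* buffered write, from ocfs2_write_begin() */
    ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
                             pagep, fsdata, di_bh, NULL);

    /* page fault, from the page_mkwrite path (assumed call site) */
    ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
                             pagep, fsdata, di_bh, mmap_page);

    /* direct io, from ocfs2_dio_get_block() */
    ocfs2_write_begin_nolock(inode->i_mapping, pos, len, OCFS2_WRITE_DIRECT,
                             NULL, (void **)&wc, di_bh, NULL);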
| @@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) | |||
| 79 | enum ocfs2_iocb_lock_bits { | 84 | enum ocfs2_iocb_lock_bits { |
| 80 | OCFS2_IOCB_RW_LOCK = 0, | 85 | OCFS2_IOCB_RW_LOCK = 0, |
| 81 | OCFS2_IOCB_RW_LOCK_LEVEL, | 86 | OCFS2_IOCB_RW_LOCK_LEVEL, |
| 82 | OCFS2_IOCB_UNALIGNED_IO, | ||
| 83 | OCFS2_IOCB_NUM_LOCKS | 87 | OCFS2_IOCB_NUM_LOCKS |
| 84 | }; | 88 | }; |
| 85 | 89 | ||
| @@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits { | |||
| 88 | #define ocfs2_iocb_rw_locked_level(iocb) \ | 92 | #define ocfs2_iocb_rw_locked_level(iocb) \ |
| 89 | test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) | 93 | test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) |
| 90 | 94 | ||
| 91 | #define ocfs2_iocb_set_unaligned_aio(iocb) \ | ||
| 92 | set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
| 93 | #define ocfs2_iocb_clear_unaligned_aio(iocb) \ | ||
| 94 | clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
| 95 | #define ocfs2_iocb_is_unaligned_aio(iocb) \ | ||
| 96 | test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
| 97 | |||
| 98 | #endif /* OCFS2_FILE_H */ | 95 | #endif /* OCFS2_FILE_H */ |
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ef6a2ec494de..bd15929b5f92 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
| @@ -1444,8 +1444,8 @@ static void o2hb_region_release(struct config_item *item) | |||
| 1444 | debugfs_remove(reg->hr_debug_dir); | 1444 | debugfs_remove(reg->hr_debug_dir); |
| 1445 | kfree(reg->hr_db_livenodes); | 1445 | kfree(reg->hr_db_livenodes); |
| 1446 | kfree(reg->hr_db_regnum); | 1446 | kfree(reg->hr_db_regnum); |
| 1447 | kfree(reg->hr_debug_elapsed_time); | 1447 | kfree(reg->hr_db_elapsed_time); |
| 1448 | kfree(reg->hr_debug_pinned); | 1448 | kfree(reg->hr_db_pinned); |
| 1449 | 1449 | ||
| 1450 | spin_lock(&o2hb_live_lock); | 1450 | spin_lock(&o2hb_live_lock); |
| 1451 | list_del(®->hr_all_item); | 1451 | list_del(®->hr_all_item); |
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index e36d63ff1783..cdeafb4e7ed6 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c | |||
| @@ -212,6 +212,12 @@ grant: | |||
| 212 | if (lock->lksb->flags & DLM_LKSB_PUT_LVB) | 212 | if (lock->lksb->flags & DLM_LKSB_PUT_LVB) |
| 213 | memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); | 213 | memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); |
| 214 | 214 | ||
| 215 | /* | ||
| 216 | * Move the lock to the tail because it may be the only lock which has | ||
| 217 | * an invalid lvb. | ||
| 218 | */ | ||
| 219 | list_move_tail(&lock->list, &res->granted); | ||
| 220 | |||
| 215 | status = DLM_NORMAL; | 221 | status = DLM_NORMAL; |
| 216 | *call_ast = 1; | 222 | *call_ast = 1; |
| 217 | goto unlock_exit; | 223 | goto unlock_exit; |
| @@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, | |||
| 262 | struct dlm_lock *lock, int flags, int type) | 268 | struct dlm_lock *lock, int flags, int type) |
| 263 | { | 269 | { |
| 264 | enum dlm_status status; | 270 | enum dlm_status status; |
| 271 | u8 old_owner = res->owner; | ||
| 265 | 272 | ||
| 266 | mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, | 273 | mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, |
| 267 | lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); | 274 | lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); |
| @@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, | |||
| 287 | status = DLM_DENIED; | 294 | status = DLM_DENIED; |
| 288 | goto bail; | 295 | goto bail; |
| 289 | } | 296 | } |
| 297 | |||
| 298 | if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) { | ||
| 299 | mlog(0, "last convert request returned DLM_RECOVERING, but " | ||
| 300 | "owner has already queued and sent ast to me. res %.*s, " | ||
| 301 | "(cookie=%u:%llu, type=%d, conv=%d)\n", | ||
| 302 | res->lockname.len, res->lockname.name, | ||
| 303 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
| 304 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
| 305 | lock->ml.type, lock->ml.convert_type); | ||
| 306 | status = DLM_NORMAL; | ||
| 307 | goto bail; | ||
| 308 | } | ||
| 309 | |||
| 290 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | 310 | res->state |= DLM_LOCK_RES_IN_PROGRESS; |
| 291 | /* move lock to local convert queue */ | 311 | /* move lock to local convert queue */ |
| 292 | /* do not alter lock refcount. switching lists. */ | 312 | /* do not alter lock refcount. switching lists. */ |
| @@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, | |||
| 316 | spin_lock(&res->spinlock); | 336 | spin_lock(&res->spinlock); |
| 317 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | 337 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; |
| 318 | lock->convert_pending = 0; | 338 | lock->convert_pending = 0; |
| 319 | /* if it failed, move it back to granted queue */ | 339 | /* if it failed, move it back to granted queue. |
| 340 | * if the master returned DLM_NORMAL and then went down before sending | ||
| 341 | * the ast, the lock may already have been moved to the granted queue; | ||
| 342 | * reset the status to DLM_RECOVERING and retry the convert */ | ||
| 320 | if (status != DLM_NORMAL) { | 343 | if (status != DLM_NORMAL) { |
| 321 | if (status != DLM_NOTQUEUED) | 344 | if (status != DLM_NOTQUEUED) |
| 322 | dlm_error(status); | 345 | dlm_error(status); |
| 323 | dlm_revert_pending_convert(res, lock); | 346 | dlm_revert_pending_convert(res, lock); |
| 347 | } else if ((res->state & DLM_LOCK_RES_RECOVERING) || | ||
| 348 | (old_owner != res->owner)) { | ||
| 349 | mlog(0, "res %.*s is in recovering or has been recovered.\n", | ||
| 350 | res->lockname.len, res->lockname.name); | ||
| 351 | status = DLM_RECOVERING; | ||
| 324 | } | 352 | } |
| 325 | bail: | 353 | bail: |
| 326 | spin_unlock(&res->spinlock); | 354 | spin_unlock(&res->spinlock); |
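The new DLM_RECOVERING return tells the caller not to trust a grant that raced with master recovery and to retry the convert once recovery settles. On the calling side the retry takes roughly this shape (an assumed pattern modeled on the dlm's existing dlm_wait_for_recovery() usage, not a quote of the actual caller):

    for (;;) {
            status = dlmconvert_remote(dlm, res, lock, flags, type);
            if (status != DLM_RECOVERING)
                    break;
            /* block until recovery of the lock resource completes */
            dlm_wait_for_recovery(dlm);
    }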
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index cd38488a10fc..f6b313898763 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
| @@ -2083,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | |||
| 2083 | dlm_lock_get(lock); | 2083 | dlm_lock_get(lock); |
| 2084 | if (lock->convert_pending) { | 2084 | if (lock->convert_pending) { |
| 2085 | /* move converting lock back to granted */ | 2085 | /* move converting lock back to granted */ |
| 2086 | BUG_ON(i != DLM_CONVERTING_LIST); | ||
| 2087 | mlog(0, "node died with convert pending " | 2086 | mlog(0, "node died with convert pending " |
| 2088 | "on %.*s. move back to granted list.\n", | 2087 | "on %.*s. move back to granted list.\n", |
| 2089 | res->lockname.len, res->lockname.name); | 2088 | res->lockname.len, res->lockname.name); |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7cb38fdca229..c18ab45f8d21 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
| @@ -1381,44 +1381,6 @@ out: | |||
| 1381 | return ret; | 1381 | return ret; |
| 1382 | } | 1382 | } |
| 1383 | 1383 | ||
| 1384 | /* | ||
| 1385 | * Will look for holes and unwritten extents in the range starting at | ||
| 1386 | * pos for count bytes (inclusive). | ||
| 1387 | */ | ||
| 1388 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | ||
| 1389 | size_t count) | ||
| 1390 | { | ||
| 1391 | int ret = 0; | ||
| 1392 | unsigned int extent_flags; | ||
| 1393 | u32 cpos, clusters, extent_len, phys_cpos; | ||
| 1394 | struct super_block *sb = inode->i_sb; | ||
| 1395 | |||
| 1396 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | ||
| 1397 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | ||
| 1398 | |||
| 1399 | while (clusters) { | ||
| 1400 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | ||
| 1401 | &extent_flags); | ||
| 1402 | if (ret < 0) { | ||
| 1403 | mlog_errno(ret); | ||
| 1404 | goto out; | ||
| 1405 | } | ||
| 1406 | |||
| 1407 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | ||
| 1408 | ret = 1; | ||
| 1409 | break; | ||
| 1410 | } | ||
| 1411 | |||
| 1412 | if (extent_len > clusters) | ||
| 1413 | extent_len = clusters; | ||
| 1414 | |||
| 1415 | clusters -= extent_len; | ||
| 1416 | cpos += extent_len; | ||
| 1417 | } | ||
| 1418 | out: | ||
| 1419 | return ret; | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | static int ocfs2_write_remove_suid(struct inode *inode) | 1384 | static int ocfs2_write_remove_suid(struct inode *inode) |
| 1423 | { | 1385 | { |
| 1424 | int ret; | 1386 | int ret; |
| @@ -2129,18 +2091,12 @@ out: | |||
| 2129 | 2091 | ||
| 2130 | static int ocfs2_prepare_inode_for_write(struct file *file, | 2092 | static int ocfs2_prepare_inode_for_write(struct file *file, |
| 2131 | loff_t pos, | 2093 | loff_t pos, |
| 2132 | size_t count, | 2094 | size_t count) |
| 2133 | int appending, | ||
| 2134 | int *direct_io, | ||
| 2135 | int *has_refcount) | ||
| 2136 | { | 2095 | { |
| 2137 | int ret = 0, meta_level = 0; | 2096 | int ret = 0, meta_level = 0; |
| 2138 | struct dentry *dentry = file->f_path.dentry; | 2097 | struct dentry *dentry = file->f_path.dentry; |
| 2139 | struct inode *inode = d_inode(dentry); | 2098 | struct inode *inode = d_inode(dentry); |
| 2140 | loff_t end; | 2099 | loff_t end; |
| 2141 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 2142 | int full_coherency = !(osb->s_mount_opt & | ||
| 2143 | OCFS2_MOUNT_COHERENCY_BUFFERED); | ||
| 2144 | 2100 | ||
| 2145 | /* | 2101 | /* |
| 2146 | * We start with a read level meta lock and only jump to an ex | 2102 | * We start with a read level meta lock and only jump to an ex |
| @@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
| 2189 | pos, | 2145 | pos, |
| 2190 | count, | 2146 | count, |
| 2191 | &meta_level); | 2147 | &meta_level); |
| 2192 | if (has_refcount) | ||
| 2193 | *has_refcount = 1; | ||
| 2194 | if (direct_io) | ||
| 2195 | *direct_io = 0; | ||
| 2196 | } | 2148 | } |
| 2197 | 2149 | ||
| 2198 | if (ret < 0) { | 2150 | if (ret < 0) { |
| @@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
| 2200 | goto out_unlock; | 2152 | goto out_unlock; |
| 2201 | } | 2153 | } |
| 2202 | 2154 | ||
| 2203 | /* | ||
| 2204 | * Skip the O_DIRECT checks if we don't need | ||
| 2205 | * them. | ||
| 2206 | */ | ||
| 2207 | if (!direct_io || !(*direct_io)) | ||
| 2208 | break; | ||
| 2209 | |||
| 2210 | /* | ||
| 2211 | * There's no sane way to do direct writes to an inode | ||
| 2212 | * with inline data. | ||
| 2213 | */ | ||
| 2214 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | ||
| 2215 | *direct_io = 0; | ||
| 2216 | break; | ||
| 2217 | } | ||
| 2218 | |||
| 2219 | /* | ||
| 2220 | * Allowing concurrent direct writes means | ||
| 2221 | * i_size changes wouldn't be synchronized, so | ||
| 2222 | * one node could wind up truncating another | ||
| 2223 | * nodes writes. | ||
| 2224 | */ | ||
| 2225 | if (end > i_size_read(inode) && !full_coherency) { | ||
| 2226 | *direct_io = 0; | ||
| 2227 | break; | ||
| 2228 | } | ||
| 2229 | |||
| 2230 | /* | ||
| 2231 | * Fallback to old way if the feature bit is not set. | ||
| 2232 | */ | ||
| 2233 | if (end > i_size_read(inode) && | ||
| 2234 | !ocfs2_supports_append_dio(osb)) { | ||
| 2235 | *direct_io = 0; | ||
| 2236 | break; | ||
| 2237 | } | ||
| 2238 | |||
| 2239 | /* | ||
| 2240 | * We don't fill holes during direct io, so | ||
| 2241 | * check for them here. If any are found, the | ||
| 2242 | * caller will have to retake some cluster | ||
| 2243 | * locks and initiate the io as buffered. | ||
| 2244 | */ | ||
| 2245 | ret = ocfs2_check_range_for_holes(inode, pos, count); | ||
| 2246 | if (ret == 1) { | ||
| 2247 | /* | ||
| 2248 | * Fallback to old way if the feature bit is not set. | ||
| 2249 | * Otherwise try dio first and then complete the rest | ||
| 2250 | * request through buffer io. | ||
| 2251 | */ | ||
| 2252 | if (!ocfs2_supports_append_dio(osb)) | ||
| 2253 | *direct_io = 0; | ||
| 2254 | ret = 0; | ||
| 2255 | } else if (ret < 0) | ||
| 2256 | mlog_errno(ret); | ||
| 2257 | break; | 2155 | break; |
| 2258 | } | 2156 | } |
| 2259 | 2157 | ||
| 2260 | out_unlock: | 2158 | out_unlock: |
| 2261 | trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, | 2159 | trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, |
| 2262 | pos, appending, count, | 2160 | pos, count); |
| 2263 | direct_io, has_refcount); | ||
| 2264 | 2161 | ||
| 2265 | if (meta_level >= 0) | 2162 | if (meta_level >= 0) |
| 2266 | ocfs2_inode_unlock(inode, meta_level); | 2163 | ocfs2_inode_unlock(inode, meta_level); |
| @@ -2272,18 +2169,16 @@ out: | |||
| 2272 | static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, | 2169 | static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, |
| 2273 | struct iov_iter *from) | 2170 | struct iov_iter *from) |
| 2274 | { | 2171 | { |
| 2275 | int direct_io, appending, rw_level; | 2172 | int direct_io, rw_level; |
| 2276 | int can_do_direct, has_refcount = 0; | ||
| 2277 | ssize_t written = 0; | 2173 | ssize_t written = 0; |
| 2278 | ssize_t ret; | 2174 | ssize_t ret; |
| 2279 | size_t count = iov_iter_count(from), orig_count; | 2175 | size_t count = iov_iter_count(from); |
| 2280 | struct file *file = iocb->ki_filp; | 2176 | struct file *file = iocb->ki_filp; |
| 2281 | struct inode *inode = file_inode(file); | 2177 | struct inode *inode = file_inode(file); |
| 2282 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2178 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 2283 | int full_coherency = !(osb->s_mount_opt & | 2179 | int full_coherency = !(osb->s_mount_opt & |
| 2284 | OCFS2_MOUNT_COHERENCY_BUFFERED); | 2180 | OCFS2_MOUNT_COHERENCY_BUFFERED); |
| 2285 | int unaligned_dio = 0; | 2181 | void *saved_ki_complete = NULL; |
| 2286 | int dropped_dio = 0; | ||
| 2287 | int append_write = ((iocb->ki_pos + count) >= | 2182 | int append_write = ((iocb->ki_pos + count) >= |
| 2288 | i_size_read(inode) ? 1 : 0); | 2183 | i_size_read(inode) ? 1 : 0); |
| 2289 | 2184 | ||
| @@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, | |||
| 2296 | if (count == 0) | 2191 | if (count == 0) |
| 2297 | return 0; | 2192 | return 0; |
| 2298 | 2193 | ||
| 2299 | appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; | ||
| 2300 | direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; | 2194 | direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; |
| 2301 | 2195 | ||
| 2302 | inode_lock(inode); | 2196 | inode_lock(inode); |
| 2303 | 2197 | ||
| 2304 | relock: | ||
| 2305 | /* | 2198 | /* |
| 2306 | * Concurrent O_DIRECT writes are allowed with | 2199 | * Concurrent O_DIRECT writes are allowed with |
| 2307 | * mount_option "coherency=buffered". | 2200 | * mount_option "coherency=buffered". |
| @@ -2334,7 +2227,6 @@ relock: | |||
| 2334 | ocfs2_inode_unlock(inode, 1); | 2227 | ocfs2_inode_unlock(inode, 1); |
| 2335 | } | 2228 | } |
| 2336 | 2229 | ||
| 2337 | orig_count = iov_iter_count(from); | ||
| 2338 | ret = generic_write_checks(iocb, from); | 2230 | ret = generic_write_checks(iocb, from); |
| 2339 | if (ret <= 0) { | 2231 | if (ret <= 0) { |
| 2340 | if (ret) | 2232 | if (ret) |
| @@ -2343,41 +2235,18 @@ relock: | |||
| 2343 | } | 2235 | } |
| 2344 | count = ret; | 2236 | count = ret; |
| 2345 | 2237 | ||
| 2346 | can_do_direct = direct_io; | 2238 | ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); |
| 2347 | ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending, | ||
| 2348 | &can_do_direct, &has_refcount); | ||
| 2349 | if (ret < 0) { | 2239 | if (ret < 0) { |
| 2350 | mlog_errno(ret); | 2240 | mlog_errno(ret); |
| 2351 | goto out; | 2241 | goto out; |
| 2352 | } | 2242 | } |
| 2353 | 2243 | ||
| 2354 | if (direct_io && !is_sync_kiocb(iocb)) | 2244 | if (direct_io && !is_sync_kiocb(iocb) && |
| 2355 | unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); | 2245 | ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) { |
| 2356 | |||
| 2357 | /* | ||
| 2358 | * We can't complete the direct I/O as requested, fall back to | ||
| 2359 | * buffered I/O. | ||
| 2360 | */ | ||
| 2361 | if (direct_io && !can_do_direct) { | ||
| 2362 | ocfs2_rw_unlock(inode, rw_level); | ||
| 2363 | |||
| 2364 | rw_level = -1; | ||
| 2365 | |||
| 2366 | direct_io = 0; | ||
| 2367 | iocb->ki_flags &= ~IOCB_DIRECT; | ||
| 2368 | iov_iter_reexpand(from, orig_count); | ||
| 2369 | dropped_dio = 1; | ||
| 2370 | goto relock; | ||
| 2371 | } | ||
| 2372 | |||
| 2373 | if (unaligned_dio) { | ||
| 2374 | /* | 2246 | /* |
| 2375 | * Wait on previous unaligned aio to complete before | 2247 | * Make it a sync io if it's an unaligned aio. |
| 2376 | * proceeding. | ||
| 2377 | */ | 2248 | */ |
| 2378 | mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); | 2249 | saved_ki_complete = xchg(&iocb->ki_complete, NULL); |
| 2379 | /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ | ||
| 2380 | ocfs2_iocb_set_unaligned_aio(iocb); | ||
| 2381 | } | 2250 | } |
| 2382 | 2251 | ||
| 2383 | /* communicate with ocfs2_dio_end_io */ | 2252 | /* communicate with ocfs2_dio_end_io */ |
| @@ -2398,14 +2267,13 @@ relock: | |||
| 2398 | */ | 2267 | */ |
| 2399 | if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { | 2268 | if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { |
| 2400 | rw_level = -1; | 2269 | rw_level = -1; |
| 2401 | unaligned_dio = 0; | ||
| 2402 | } | 2270 | } |
| 2403 | 2271 | ||
| 2404 | if (unlikely(written <= 0)) | 2272 | if (unlikely(written <= 0)) |
| 2405 | goto no_sync; | 2273 | goto out; |
| 2406 | 2274 | ||
| 2407 | if (((file->f_flags & O_DSYNC) && !direct_io) || | 2275 | if (((file->f_flags & O_DSYNC) && !direct_io) || |
| 2408 | IS_SYNC(inode) || dropped_dio) { | 2276 | IS_SYNC(inode)) { |
| 2409 | ret = filemap_fdatawrite_range(file->f_mapping, | 2277 | ret = filemap_fdatawrite_range(file->f_mapping, |
| 2410 | iocb->ki_pos - written, | 2278 | iocb->ki_pos - written, |
| 2411 | iocb->ki_pos - 1); | 2279 | iocb->ki_pos - 1); |
| @@ -2424,13 +2292,10 @@ relock: | |||
| 2424 | iocb->ki_pos - 1); | 2292 | iocb->ki_pos - 1); |
| 2425 | } | 2293 | } |
| 2426 | 2294 | ||
| 2427 | no_sync: | ||
| 2428 | if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { | ||
| 2429 | ocfs2_iocb_clear_unaligned_aio(iocb); | ||
| 2430 | mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); | ||
| 2431 | } | ||
| 2432 | |||
| 2433 | out: | 2295 | out: |
| 2296 | if (saved_ki_complete) | ||
| 2297 | xchg(&iocb->ki_complete, saved_ki_complete); | ||
| 2298 | |||
| 2434 | if (rw_level != -1) | 2299 | if (rw_level != -1) |
| 2435 | ocfs2_rw_unlock(inode, rw_level); | 2300 | ocfs2_rw_unlock(inode, rw_level); |
| 2436 | 2301 | ||
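The ki_complete swap is what turns an unaligned aio into a synchronous io: is_sync_kiocb() is defined as !kiocb->ki_complete, so clearing the pointer makes the dio core wait for completion instead of returning -EIOCBQUEUED, and restoring it afterwards re-arms the async path. In miniature (do_dio_write is a hypothetical stand-in for the write path above):

    void *saved = xchg(&iocb->ki_complete, NULL);   /* force sync dio */
    written = do_dio_write(iocb, from);             /* hypothetical helper */
    if (saved)
            xchg(&iocb->ki_complete, saved);        /* restore async hook */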
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index ba495beff1c2..12f4a9e9800f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
| @@ -1170,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode) | |||
| 1170 | mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), | 1170 | mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), |
| 1171 | "Clear inode of %llu, inode has io markers\n", | 1171 | "Clear inode of %llu, inode has io markers\n", |
| 1172 | (unsigned long long)oi->ip_blkno); | 1172 | (unsigned long long)oi->ip_blkno); |
| 1173 | mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), | ||
| 1174 | "Clear inode of %llu, inode has unwritten extents\n", | ||
| 1175 | (unsigned long long)oi->ip_blkno); | ||
| 1173 | 1176 | ||
| 1174 | ocfs2_extent_map_trunc(inode, 0); | 1177 | ocfs2_extent_map_trunc(inode, 0); |
| 1175 | 1178 | ||
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 01635e016b3e..d8f3fc8d2551 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
| @@ -43,9 +43,6 @@ struct ocfs2_inode_info | |||
| 43 | /* protects extended attribute changes on this inode */ | 43 | /* protects extended attribute changes on this inode */ |
| 44 | struct rw_semaphore ip_xattr_sem; | 44 | struct rw_semaphore ip_xattr_sem; |
| 45 | 45 | ||
| 46 | /* Number of outstanding AIO's which are not page aligned */ | ||
| 47 | struct mutex ip_unaligned_aio; | ||
| 48 | |||
| 49 | /* These fields are protected by ip_lock */ | 46 | /* These fields are protected by ip_lock */ |
| 50 | spinlock_t ip_lock; | 47 | spinlock_t ip_lock; |
| 51 | u32 ip_open_count; | 48 | u32 ip_open_count; |
| @@ -57,6 +54,9 @@ struct ocfs2_inode_info | |||
| 57 | u32 ip_flags; /* see below */ | 54 | u32 ip_flags; /* see below */ |
| 58 | u32 ip_attr; /* inode attributes */ | 55 | u32 ip_attr; /* inode attributes */ |
| 59 | 56 | ||
| 57 | /* Record unwritten extents during direct io. */ | ||
| 58 | struct list_head ip_unwritten_list; | ||
| 59 | |||
| 60 | /* protected by recovery_lock. */ | 60 | /* protected by recovery_lock. */ |
| 61 | struct inode *ip_next_orphan; | 61 | struct inode *ip_next_orphan; |
| 62 | 62 | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 61b833b721d8..e607419cdfa4 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
| @@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb) | |||
| 231 | /* At this point, we know that no more recovery threads can be | 231 | /* At this point, we know that no more recovery threads can be |
| 232 | * launched, so wait for any recovery completion work to | 232 | * launched, so wait for any recovery completion work to |
| 233 | * complete. */ | 233 | * complete. */ |
| 234 | flush_workqueue(ocfs2_wq); | 234 | flush_workqueue(osb->ocfs2_wq); |
| 235 | 235 | ||
| 236 | /* | 236 | /* |
| 237 | * Now that recovery is shut down, and the osb is about to be | 237 | * Now that recovery is shut down, and the osb is about to be |
| @@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, | |||
| 1326 | 1326 | ||
| 1327 | spin_lock(&journal->j_lock); | 1327 | spin_lock(&journal->j_lock); |
| 1328 | list_add_tail(&item->lri_list, &journal->j_la_cleanups); | 1328 | list_add_tail(&item->lri_list, &journal->j_la_cleanups); |
| 1329 | queue_work(ocfs2_wq, &journal->j_recovery_work); | 1329 | queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work); |
| 1330 | spin_unlock(&journal->j_lock); | 1330 | spin_unlock(&journal->j_lock); |
| 1331 | } | 1331 | } |
| 1332 | 1332 | ||
| @@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work) | |||
| 1968 | mutex_lock(&os->os_lock); | 1968 | mutex_lock(&os->os_lock); |
| 1969 | ocfs2_queue_orphan_scan(osb); | 1969 | ocfs2_queue_orphan_scan(osb); |
| 1970 | if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) | 1970 | if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) |
| 1971 | queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, | 1971 | queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, |
| 1972 | ocfs2_orphan_scan_timeout()); | 1972 | ocfs2_orphan_scan_timeout()); |
| 1973 | mutex_unlock(&os->os_lock); | 1973 | mutex_unlock(&os->os_lock); |
| 1974 | } | 1974 | } |
| @@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) | |||
| 2008 | atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); | 2008 | atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); |
| 2009 | else { | 2009 | else { |
| 2010 | atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); | 2010 | atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); |
| 2011 | queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, | 2011 | queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, |
| 2012 | ocfs2_orphan_scan_timeout()); | 2012 | ocfs2_orphan_scan_timeout()); |
| 2013 | } | 2013 | } |
| 2014 | } | 2014 | } |
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7d62c43a2c3e..fe0d1f9571bb 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c | |||
| @@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) | |||
| 386 | struct ocfs2_dinode *alloc = NULL; | 386 | struct ocfs2_dinode *alloc = NULL; |
| 387 | 387 | ||
| 388 | cancel_delayed_work(&osb->la_enable_wq); | 388 | cancel_delayed_work(&osb->la_enable_wq); |
| 389 | flush_workqueue(ocfs2_wq); | 389 | flush_workqueue(osb->ocfs2_wq); |
| 390 | 390 | ||
| 391 | if (osb->local_alloc_state == OCFS2_LA_UNUSED) | 391 | if (osb->local_alloc_state == OCFS2_LA_UNUSED) |
| 392 | goto out; | 392 | goto out; |
| @@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb, | |||
| 1085 | } else { | 1085 | } else { |
| 1086 | osb->local_alloc_state = OCFS2_LA_DISABLED; | 1086 | osb->local_alloc_state = OCFS2_LA_DISABLED; |
| 1087 | } | 1087 | } |
| 1088 | queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, | 1088 | queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq, |
| 1089 | OCFS2_LA_ENABLE_INTERVAL); | 1089 | OCFS2_LA_ENABLE_INTERVAL); |
| 1090 | goto out_unlock; | 1090 | goto out_unlock; |
| 1091 | } | 1091 | } |
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 77ebc2bc1cca..9ea081f4e6e4 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
| @@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, | |||
| 104 | if (page->index == last_index) | 104 | if (page->index == last_index) |
| 105 | len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; | 105 | len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; |
| 106 | 106 | ||
| 107 | ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, | 107 | ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, |
| 108 | &fsdata, di_bh, page); | 108 | &locked_page, &fsdata, di_bh, page); |
| 109 | if (ret) { | 109 | if (ret) { |
| 110 | if (ret != -ENOSPC) | 110 | if (ret != -ENOSPC) |
| 111 | mlog_errno(ret); | 111 | mlog_errno(ret); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7a0126267847..6cf6538a0651 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
| @@ -464,6 +464,14 @@ struct ocfs2_super | |||
| 464 | struct ocfs2_refcount_tree *osb_ref_tree_lru; | 464 | struct ocfs2_refcount_tree *osb_ref_tree_lru; |
| 465 | 465 | ||
| 466 | struct mutex system_file_mutex; | 466 | struct mutex system_file_mutex; |
| 467 | |||
| 468 | /* | ||
| 469 | * OCFS2 needs to schedule several different types of work which | ||
| 470 | * require cluster locking, disk I/O, recovery waits, etc. Since these | ||
| 471 | * types of work tend to be heavy we avoid using the kernel events | ||
| 472 | * workqueue and schedule on our own. | ||
| 473 | */ | ||
| 474 | struct workqueue_struct *ocfs2_wq; | ||
| 467 | }; | 475 | }; |
| 468 | 476 | ||
| 469 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) | 477 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) |
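The ocfs2_wq member added above replaces the former global workqueue, so each mount owns, flushes, and destroys its own queue. A sketch of the resulting lifecycle, condensed from the super.c, journal.c, and localalloc.c hunks in this series:

    /* Mount (ocfs2_initialize_super): one queue per super block. */
    osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
    if (!osb->ocfs2_wq)
        return -ENOMEM;

    /* Work is queued against the owning mount, never globally. */
    queue_work(osb->ocfs2_wq, &journal->j_recovery_work);

    /* Unmount (ocfs2_delete_osb): drain only this mount's work. */
    flush_workqueue(osb->ocfs2_wq);
    destroy_workqueue(osb->ocfs2_wq);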
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 24b7e7f591dc..f8f5fc5e6c05 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h | |||
| @@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); | |||
| 1450 | 1450 | ||
| 1451 | TRACE_EVENT(ocfs2_prepare_inode_for_write, | 1451 | TRACE_EVENT(ocfs2_prepare_inode_for_write, |
| 1452 | TP_PROTO(unsigned long long ino, unsigned long long saved_pos, | 1452 | TP_PROTO(unsigned long long ino, unsigned long long saved_pos, |
| 1453 | int appending, unsigned long count, | 1453 | unsigned long count), |
| 1454 | int *direct_io, int *has_refcount), | 1454 | TP_ARGS(ino, saved_pos, count), |
| 1455 | TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), | ||
| 1456 | TP_STRUCT__entry( | 1455 | TP_STRUCT__entry( |
| 1457 | __field(unsigned long long, ino) | 1456 | __field(unsigned long long, ino) |
| 1458 | __field(unsigned long long, saved_pos) | 1457 | __field(unsigned long long, saved_pos) |
| 1459 | __field(int, appending) | ||
| 1460 | __field(unsigned long, count) | 1458 | __field(unsigned long, count) |
| 1461 | __field(int, direct_io) | ||
| 1462 | __field(int, has_refcount) | ||
| 1463 | ), | 1459 | ), |
| 1464 | TP_fast_assign( | 1460 | TP_fast_assign( |
| 1465 | __entry->ino = ino; | 1461 | __entry->ino = ino; |
| 1466 | __entry->saved_pos = saved_pos; | 1462 | __entry->saved_pos = saved_pos; |
| 1467 | __entry->appending = appending; | ||
| 1468 | __entry->count = count; | 1463 | __entry->count = count; |
| 1469 | __entry->direct_io = direct_io ? *direct_io : -1; | ||
| 1470 | __entry->has_refcount = has_refcount ? *has_refcount : -1; | ||
| 1471 | ), | 1464 | ), |
| 1472 | TP_printk("%llu %llu %d %lu %d %d", __entry->ino, | 1465 | TP_printk("%llu %llu %lu", __entry->ino, |
| 1473 | __entry->saved_pos, __entry->appending, __entry->count, | 1466 | __entry->saved_pos, __entry->count) |
| 1474 | __entry->direct_io, __entry->has_refcount) | ||
| 1475 | ); | 1467 | ); |
| 1476 | 1468 | ||
| 1477 | DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); | 1469 | DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); |
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 91bc674203ed..3892f3c079ca 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c | |||
| @@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot) | |||
| 726 | dqgrab(dquot); | 726 | dqgrab(dquot); |
| 727 | /* First entry on list -> queue work */ | 727 | /* First entry on list -> queue work */ |
| 728 | if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) | 728 | if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) |
| 729 | queue_work(ocfs2_wq, &osb->dquot_drop_work); | 729 | queue_work(osb->ocfs2_wq, &osb->dquot_drop_work); |
| 730 | goto out; | 730 | goto out; |
| 731 | } | 731 | } |
| 732 | status = ocfs2_lock_global_qf(oinfo, 1); | 732 | status = ocfs2_lock_global_qf(oinfo, 1); |
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 576b9a04873f..18451e0fab81 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c | |||
| @@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) | |||
| 196 | for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { | 196 | for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { |
| 197 | blkno = ocfs2_backup_super_blkno(inode->i_sb, i); | 197 | blkno = ocfs2_backup_super_blkno(inode->i_sb, i); |
| 198 | cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); | 198 | cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); |
| 199 | if (cluster > clusters) | 199 | if (cluster >= clusters) |
| 200 | break; | 200 | break; |
| 201 | 201 | ||
| 202 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); | 202 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ccc9386c42c5..7db631e1c8b0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
| @@ -80,12 +80,6 @@ static struct kmem_cache *ocfs2_inode_cachep; | |||
| 80 | struct kmem_cache *ocfs2_dquot_cachep; | 80 | struct kmem_cache *ocfs2_dquot_cachep; |
| 81 | struct kmem_cache *ocfs2_qf_chunk_cachep; | 81 | struct kmem_cache *ocfs2_qf_chunk_cachep; |
| 82 | 82 | ||
| 83 | /* OCFS2 needs to schedule several different types of work which | ||
| 84 | * require cluster locking, disk I/O, recovery waits, etc. Since these | ||
| 85 | * types of work tend to be heavy we avoid using the kernel events | ||
| 86 | * workqueue and schedule on our own. */ | ||
| 87 | struct workqueue_struct *ocfs2_wq = NULL; | ||
| 88 | |||
| 89 | static struct dentry *ocfs2_debugfs_root; | 83 | static struct dentry *ocfs2_debugfs_root; |
| 90 | 84 | ||
| 91 | MODULE_AUTHOR("Oracle"); | 85 | MODULE_AUTHOR("Oracle"); |
| @@ -1613,33 +1607,25 @@ static int __init ocfs2_init(void) | |||
| 1613 | if (status < 0) | 1607 | if (status < 0) |
| 1614 | goto out2; | 1608 | goto out2; |
| 1615 | 1609 | ||
| 1616 | ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); | ||
| 1617 | if (!ocfs2_wq) { | ||
| 1618 | status = -ENOMEM; | ||
| 1619 | goto out3; | ||
| 1620 | } | ||
| 1621 | |||
| 1622 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); | 1610 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); |
| 1623 | if (!ocfs2_debugfs_root) { | 1611 | if (!ocfs2_debugfs_root) { |
| 1624 | status = -ENOMEM; | 1612 | status = -ENOMEM; |
| 1625 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | 1613 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); |
| 1626 | goto out4; | 1614 | goto out3; |
| 1627 | } | 1615 | } |
| 1628 | 1616 | ||
| 1629 | ocfs2_set_locking_protocol(); | 1617 | ocfs2_set_locking_protocol(); |
| 1630 | 1618 | ||
| 1631 | status = register_quota_format(&ocfs2_quota_format); | 1619 | status = register_quota_format(&ocfs2_quota_format); |
| 1632 | if (status < 0) | 1620 | if (status < 0) |
| 1633 | goto out4; | 1621 | goto out3; |
| 1634 | status = register_filesystem(&ocfs2_fs_type); | 1622 | status = register_filesystem(&ocfs2_fs_type); |
| 1635 | if (!status) | 1623 | if (!status) |
| 1636 | return 0; | 1624 | return 0; |
| 1637 | 1625 | ||
| 1638 | unregister_quota_format(&ocfs2_quota_format); | 1626 | unregister_quota_format(&ocfs2_quota_format); |
| 1639 | out4: | ||
| 1640 | destroy_workqueue(ocfs2_wq); | ||
| 1641 | debugfs_remove(ocfs2_debugfs_root); | ||
| 1642 | out3: | 1627 | out3: |
| 1628 | debugfs_remove(ocfs2_debugfs_root); | ||
| 1643 | ocfs2_free_mem_caches(); | 1629 | ocfs2_free_mem_caches(); |
| 1644 | out2: | 1630 | out2: |
| 1645 | exit_ocfs2_uptodate_cache(); | 1631 | exit_ocfs2_uptodate_cache(); |
| @@ -1650,11 +1636,6 @@ out1: | |||
| 1650 | 1636 | ||
| 1651 | static void __exit ocfs2_exit(void) | 1637 | static void __exit ocfs2_exit(void) |
| 1652 | { | 1638 | { |
| 1653 | if (ocfs2_wq) { | ||
| 1654 | flush_workqueue(ocfs2_wq); | ||
| 1655 | destroy_workqueue(ocfs2_wq); | ||
| 1656 | } | ||
| 1657 | |||
| 1658 | unregister_quota_format(&ocfs2_quota_format); | 1639 | unregister_quota_format(&ocfs2_quota_format); |
| 1659 | 1640 | ||
| 1660 | debugfs_remove(ocfs2_debugfs_root); | 1641 | debugfs_remove(ocfs2_debugfs_root); |
| @@ -1745,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data) | |||
| 1745 | spin_lock_init(&oi->ip_lock); | 1726 | spin_lock_init(&oi->ip_lock); |
| 1746 | ocfs2_extent_map_init(&oi->vfs_inode); | 1727 | ocfs2_extent_map_init(&oi->vfs_inode); |
| 1747 | INIT_LIST_HEAD(&oi->ip_io_markers); | 1728 | INIT_LIST_HEAD(&oi->ip_io_markers); |
| 1729 | INIT_LIST_HEAD(&oi->ip_unwritten_list); | ||
| 1748 | oi->ip_dir_start_lookup = 0; | 1730 | oi->ip_dir_start_lookup = 0; |
| 1749 | mutex_init(&oi->ip_unaligned_aio); | ||
| 1750 | init_rwsem(&oi->ip_alloc_sem); | 1731 | init_rwsem(&oi->ip_alloc_sem); |
| 1751 | init_rwsem(&oi->ip_xattr_sem); | 1732 | init_rwsem(&oi->ip_xattr_sem); |
| 1752 | mutex_init(&oi->ip_io_mutex); | 1733 | mutex_init(&oi->ip_io_mutex); |
| @@ -2349,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 2349 | } | 2330 | } |
| 2350 | cleancache_init_shared_fs(sb); | 2331 | cleancache_init_shared_fs(sb); |
| 2351 | 2332 | ||
| 2333 | osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); | ||
| 2334 | if (!osb->ocfs2_wq) { | ||
| 2335 | status = -ENOMEM; | ||
| 2336 | mlog_errno(status); | ||
| 2337 | } | ||
| 2338 | |||
| 2352 | bail: | 2339 | bail: |
| 2353 | return status; | 2340 | return status; |
| 2354 | } | 2341 | } |
| @@ -2536,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) | |||
| 2536 | { | 2523 | { |
| 2537 | /* This function assumes that the caller has the main osb resource */ | 2524 | /* This function assumes that the caller has the main osb resource */ |
| 2538 | 2525 | ||
| 2526 | /* ocfs2_initialize_super() may have already created this workqueue */ | ||
| 2527 | if (osb->ocfs2_wq) { | ||
| 2528 | flush_workqueue(osb->ocfs2_wq); | ||
| 2529 | destroy_workqueue(osb->ocfs2_wq); | ||
| 2530 | } | ||
| 2531 | |||
| 2539 | ocfs2_free_slot_info(osb); | 2532 | ocfs2_free_slot_info(osb); |
| 2540 | 2533 | ||
| 2541 | kfree(osb->osb_orphan_wipes); | 2534 | kfree(osb->osb_orphan_wipes); |
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index b477d0b1c7b6..b023e4f3d740 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h | |||
| @@ -26,8 +26,6 @@ | |||
| 26 | #ifndef OCFS2_SUPER_H | 26 | #ifndef OCFS2_SUPER_H |
| 27 | #define OCFS2_SUPER_H | 27 | #define OCFS2_SUPER_H |
| 28 | 28 | ||
| 29 | extern struct workqueue_struct *ocfs2_wq; | ||
| 30 | |||
| 31 | int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, | 29 | int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, |
| 32 | int node_num); | 30 | int node_num); |
| 33 | 31 | ||
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8f5a12ab2f2b..339125bb4d2c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h | |||
| @@ -456,7 +456,7 @@ | |||
| 456 | *(.entry.text) \ | 456 | *(.entry.text) \ |
| 457 | VMLINUX_SYMBOL(__entry_text_end) = .; | 457 | VMLINUX_SYMBOL(__entry_text_end) = .; |
| 458 | 458 | ||
| 459 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 459 | #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) |
| 460 | #define IRQENTRY_TEXT \ | 460 | #define IRQENTRY_TEXT \ |
| 461 | ALIGN_FUNCTION(); \ | 461 | ALIGN_FUNCTION(); \ |
| 462 | VMLINUX_SYMBOL(__irqentry_text_start) = .; \ | 462 | VMLINUX_SYMBOL(__irqentry_text_start) = .; \ |
| @@ -466,6 +466,16 @@ | |||
| 466 | #define IRQENTRY_TEXT | 466 | #define IRQENTRY_TEXT |
| 467 | #endif | 467 | #endif |
| 468 | 468 | ||
| 469 | #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) | ||
| 470 | #define SOFTIRQENTRY_TEXT \ | ||
| 471 | ALIGN_FUNCTION(); \ | ||
| 472 | VMLINUX_SYMBOL(__softirqentry_text_start) = .; \ | ||
| 473 | *(.softirqentry.text) \ | ||
| 474 | VMLINUX_SYMBOL(__softirqentry_text_end) = .; | ||
| 475 | #else | ||
| 476 | #define SOFTIRQENTRY_TEXT | ||
| 477 | #endif | ||
| 478 | |||
| 469 | /* Section used for early init (in .S files) */ | 479 | /* Section used for early init (in .S files) */ |
| 470 | #define HEAD_TEXT *(.head.text) | 480 | #define HEAD_TEXT *(.head.text) |
| 471 | 481 | ||
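Architectures that want the bounding symbols are expected to place the new macro in their linker script next to IRQENTRY_TEXT; a hedged sketch of such an arch vmlinux.lds.S fragment (not part of this diff):

    .text : {
        ...
        IRQENTRY_TEXT
        SOFTIRQENTRY_TEXT
        ...
    }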
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6d9df3f7e334..dea12a6e413b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h | |||
| @@ -811,16 +811,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
| 811 | */ | 811 | */ |
| 812 | #define __notrace_funcgraph notrace | 812 | #define __notrace_funcgraph notrace |
| 813 | 813 | ||
| 814 | /* | ||
| 815 | * We want to which function is an entrypoint of a hardirq. | ||
| 816 | * That will help us to put a signal on output. | ||
| 817 | */ | ||
| 818 | #define __irq_entry __attribute__((__section__(".irqentry.text"))) | ||
| 819 | |||
| 820 | /* Limits of hardirq entrypoints */ | ||
| 821 | extern char __irqentry_text_start[]; | ||
| 822 | extern char __irqentry_text_end[]; | ||
| 823 | |||
| 824 | #define FTRACE_NOTRACE_DEPTH 65536 | 814 | #define FTRACE_NOTRACE_DEPTH 65536 |
| 825 | #define FTRACE_RETFUNC_DEPTH 50 | 815 | #define FTRACE_RETFUNC_DEPTH 50 |
| 826 | #define FTRACE_RETSTACK_ALLOC_SIZE 32 | 816 | #define FTRACE_RETSTACK_ALLOC_SIZE 32 |
| @@ -857,7 +847,6 @@ static inline void unpause_graph_tracing(void) | |||
| 857 | #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ | 847 | #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ |
| 858 | 848 | ||
| 859 | #define __notrace_funcgraph | 849 | #define __notrace_funcgraph |
| 860 | #define __irq_entry | ||
| 861 | #define INIT_FTRACE_GRAPH | 850 | #define INIT_FTRACE_GRAPH |
| 862 | 851 | ||
| 863 | static inline void ftrace_graph_init_task(struct task_struct *t) { } | 852 | static inline void ftrace_graph_init_task(struct task_struct *t) { } |
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 358076eda364..9fcabeb07787 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h | |||
| @@ -683,4 +683,24 @@ extern int early_irq_init(void); | |||
| 683 | extern int arch_probe_nr_irqs(void); | 683 | extern int arch_probe_nr_irqs(void); |
| 684 | extern int arch_early_irq_init(void); | 684 | extern int arch_early_irq_init(void); |
| 685 | 685 | ||
| 686 | #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) | ||
| 687 | /* | ||
| 688 | * We want to know which function is an entrypoint of a hardirq or a softirq. | ||
| 689 | */ | ||
| 690 | #define __irq_entry __attribute__((__section__(".irqentry.text"))) | ||
| 691 | #define __softirq_entry \ | ||
| 692 | __attribute__((__section__(".softirqentry.text"))) | ||
| 693 | |||
| 694 | /* Limits of hardirq entrypoints */ | ||
| 695 | extern char __irqentry_text_start[]; | ||
| 696 | extern char __irqentry_text_end[]; | ||
| 697 | /* Limits of softirq entrypoints */ | ||
| 698 | extern char __softirqentry_text_start[]; | ||
| 699 | extern char __softirqentry_text_end[]; | ||
| 700 | |||
| 701 | #else | ||
| 702 | #define __irq_entry | ||
| 703 | #define __softirq_entry | ||
| 704 | #endif | ||
| 705 | |||
| 686 | #endif | 706 | #endif |
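With __softirqentry_text_start/_end exported, a consumer such as KASAN can classify a return address as softirq-entry code with a plain range check; a minimal sketch (the helper name is illustrative, not from this patch):

    #include <linux/interrupt.h>

    static bool addr_in_softirqentry_text(unsigned long addr)
    {
        /* The section symbols bound .softirqentry.text. */
        return addr >= (unsigned long)__softirqentry_text_start &&
               addr <  (unsigned long)__softirqentry_text_end;
    }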
diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 0fdc798e3ff7..737371b56044 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h | |||
| @@ -48,19 +48,28 @@ void kasan_unpoison_task_stack(struct task_struct *task); | |||
| 48 | void kasan_alloc_pages(struct page *page, unsigned int order); | 48 | void kasan_alloc_pages(struct page *page, unsigned int order); |
| 49 | void kasan_free_pages(struct page *page, unsigned int order); | 49 | void kasan_free_pages(struct page *page, unsigned int order); |
| 50 | 50 | ||
| 51 | void kasan_cache_create(struct kmem_cache *cache, size_t *size, | ||
| 52 | unsigned long *flags); | ||
| 53 | |||
| 51 | void kasan_poison_slab(struct page *page); | 54 | void kasan_poison_slab(struct page *page); |
| 52 | void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); | 55 | void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); |
| 53 | void kasan_poison_object_data(struct kmem_cache *cache, void *object); | 56 | void kasan_poison_object_data(struct kmem_cache *cache, void *object); |
| 54 | 57 | ||
| 55 | void kasan_kmalloc_large(const void *ptr, size_t size); | 58 | void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); |
| 56 | void kasan_kfree_large(const void *ptr); | 59 | void kasan_kfree_large(const void *ptr); |
| 57 | void kasan_kfree(void *ptr); | 60 | void kasan_kfree(void *ptr); |
| 58 | void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); | 61 | void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, |
| 59 | void kasan_krealloc(const void *object, size_t new_size); | 62 | gfp_t flags); |
| 63 | void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); | ||
| 60 | 64 | ||
| 61 | void kasan_slab_alloc(struct kmem_cache *s, void *object); | 65 | void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); |
| 62 | void kasan_slab_free(struct kmem_cache *s, void *object); | 66 | void kasan_slab_free(struct kmem_cache *s, void *object); |
| 63 | 67 | ||
| 68 | struct kasan_cache { | ||
| 69 | int alloc_meta_offset; | ||
| 70 | int free_meta_offset; | ||
| 71 | }; | ||
| 72 | |||
| 64 | int kasan_module_alloc(void *addr, size_t size); | 73 | int kasan_module_alloc(void *addr, size_t size); |
| 65 | void kasan_free_shadow(const struct vm_struct *vm); | 74 | void kasan_free_shadow(const struct vm_struct *vm); |
| 66 | 75 | ||
| @@ -76,20 +85,26 @@ static inline void kasan_disable_current(void) {} | |||
| 76 | static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} | 85 | static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} |
| 77 | static inline void kasan_free_pages(struct page *page, unsigned int order) {} | 86 | static inline void kasan_free_pages(struct page *page, unsigned int order) {} |
| 78 | 87 | ||
| 88 | static inline void kasan_cache_create(struct kmem_cache *cache, | ||
| 89 | size_t *size, | ||
| 90 | unsigned long *flags) {} | ||
| 91 | |||
| 79 | static inline void kasan_poison_slab(struct page *page) {} | 92 | static inline void kasan_poison_slab(struct page *page) {} |
| 80 | static inline void kasan_unpoison_object_data(struct kmem_cache *cache, | 93 | static inline void kasan_unpoison_object_data(struct kmem_cache *cache, |
| 81 | void *object) {} | 94 | void *object) {} |
| 82 | static inline void kasan_poison_object_data(struct kmem_cache *cache, | 95 | static inline void kasan_poison_object_data(struct kmem_cache *cache, |
| 83 | void *object) {} | 96 | void *object) {} |
| 84 | 97 | ||
| 85 | static inline void kasan_kmalloc_large(void *ptr, size_t size) {} | 98 | static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {} |
| 86 | static inline void kasan_kfree_large(const void *ptr) {} | 99 | static inline void kasan_kfree_large(const void *ptr) {} |
| 87 | static inline void kasan_kfree(void *ptr) {} | 100 | static inline void kasan_kfree(void *ptr) {} |
| 88 | static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, | 101 | static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, |
| 89 | size_t size) {} | 102 | size_t size, gfp_t flags) {} |
| 90 | static inline void kasan_krealloc(const void *object, size_t new_size) {} | 103 | static inline void kasan_krealloc(const void *object, size_t new_size, |
| 104 | gfp_t flags) {} | ||
| 91 | 105 | ||
| 92 | static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} | 106 | static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, |
| 107 | gfp_t flags) {} | ||
| 93 | static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} | 108 | static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} |
| 94 | 109 | ||
| 95 | static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } | 110 | static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } |
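Every allocation-side KASAN hook now carries the caller's gfp mask, so KASAN can allocate its own metadata (such as stack-depot slabs) with flags that fit the allocation context. A hedged sketch of the resulting call shape inside an allocator, with illustrative names:

    static inline void *alloc_and_instrument(struct kmem_cache *s,
                                             size_t size, gfp_t flags)
    {
        void *object = kmem_cache_alloc(s, flags);

        /* Forward the gfp mask; KASAN may allocate depot storage. */
        kasan_kmalloc(s, object, size, flags);
        return object;
    }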
diff --git a/include/linux/mm.h b/include/linux/mm.h index 450fc977ed02..ed6407d1b7b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -1132,6 +1132,8 @@ struct zap_details { | |||
| 1132 | struct address_space *check_mapping; /* Check page->mapping if set */ | 1132 | struct address_space *check_mapping; /* Check page->mapping if set */ |
| 1133 | pgoff_t first_index; /* Lowest page->index to unmap */ | 1133 | pgoff_t first_index; /* Lowest page->index to unmap */ |
| 1134 | pgoff_t last_index; /* Highest page->index to unmap */ | 1134 | pgoff_t last_index; /* Highest page->index to unmap */ |
| 1135 | bool ignore_dirty; /* Ignore dirty pages */ | ||
| 1136 | bool check_swap_entries; /* Check also swap entries */ | ||
| 1135 | }; | 1137 | }; |
| 1136 | 1138 | ||
| 1137 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | 1139 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, |
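The two new zap_details flags let a caller tear down mappings while skipping dirty-page writeout and still visiting swap entries; a hedged sketch of a caller such as the OOM reaper, using the unmap_page_range() declaration this series adds to mm/internal.h:

    struct zap_details details = {
        .check_swap_entries = true, /* also drop swap entries */
        .ignore_dirty = true,       /* victim's dirty data is lost anyway */
    };

    unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, &details);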
diff --git a/include/linux/oom.h b/include/linux/oom.h index 03e6257321f0..628a43242a34 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
| @@ -76,8 +76,6 @@ extern unsigned long oom_badness(struct task_struct *p, | |||
| 76 | struct mem_cgroup *memcg, const nodemask_t *nodemask, | 76 | struct mem_cgroup *memcg, const nodemask_t *nodemask, |
| 77 | unsigned long totalpages); | 77 | unsigned long totalpages); |
| 78 | 78 | ||
| 79 | extern int oom_kills_count(void); | ||
| 80 | extern void note_oom_kill(void); | ||
| 81 | extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, | 79 | extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, |
| 82 | unsigned int points, unsigned long totalpages, | 80 | unsigned int points, unsigned long totalpages, |
| 83 | struct mem_cgroup *memcg, const char *message); | 81 | struct mem_cgroup *memcg, const char *message); |
| @@ -91,7 +89,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, | |||
| 91 | 89 | ||
| 92 | extern bool out_of_memory(struct oom_control *oc); | 90 | extern bool out_of_memory(struct oom_control *oc); |
| 93 | 91 | ||
| 94 | extern void exit_oom_victim(void); | 92 | extern void exit_oom_victim(struct task_struct *tsk); |
| 95 | 93 | ||
| 96 | extern int register_oom_notifier(struct notifier_block *nb); | 94 | extern int register_oom_notifier(struct notifier_block *nb); |
| 97 | extern int unregister_oom_notifier(struct notifier_block *nb); | 95 | extern int unregister_oom_notifier(struct notifier_block *nb); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 589c4780b077..60bba7e032dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -426,6 +426,7 @@ extern signed long schedule_timeout(signed long timeout); | |||
| 426 | extern signed long schedule_timeout_interruptible(signed long timeout); | 426 | extern signed long schedule_timeout_interruptible(signed long timeout); |
| 427 | extern signed long schedule_timeout_killable(signed long timeout); | 427 | extern signed long schedule_timeout_killable(signed long timeout); |
| 428 | extern signed long schedule_timeout_uninterruptible(signed long timeout); | 428 | extern signed long schedule_timeout_uninterruptible(signed long timeout); |
| 429 | extern signed long schedule_timeout_idle(signed long timeout); | ||
| 429 | asmlinkage void schedule(void); | 430 | asmlinkage void schedule(void); |
| 430 | extern void schedule_preempt_disabled(void); | 431 | extern void schedule_preempt_disabled(void); |
| 431 | 432 | ||
| @@ -1848,6 +1849,9 @@ struct task_struct { | |||
| 1848 | unsigned long task_state_change; | 1849 | unsigned long task_state_change; |
| 1849 | #endif | 1850 | #endif |
| 1850 | int pagefault_disabled; | 1851 | int pagefault_disabled; |
| 1852 | #ifdef CONFIG_MMU | ||
| 1853 | struct task_struct *oom_reaper_list; | ||
| 1854 | #endif | ||
| 1851 | /* CPU-specific state of this task */ | 1855 | /* CPU-specific state of this task */ |
| 1852 | struct thread_struct thread; | 1856 | struct thread_struct thread; |
| 1853 | /* | 1857 | /* |
diff --git a/include/linux/slab.h b/include/linux/slab.h index e4b568738ca3..508bd827e6dc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
| @@ -92,6 +92,12 @@ | |||
| 92 | # define SLAB_ACCOUNT 0x00000000UL | 92 | # define SLAB_ACCOUNT 0x00000000UL |
| 93 | #endif | 93 | #endif |
| 94 | 94 | ||
| 95 | #ifdef CONFIG_KASAN | ||
| 96 | #define SLAB_KASAN 0x08000000UL | ||
| 97 | #else | ||
| 98 | #define SLAB_KASAN 0x00000000UL | ||
| 99 | #endif | ||
| 100 | |||
| 95 | /* The following flags affect the page allocator grouping pages by mobility */ | 101 | /* The following flags affect the page allocator grouping pages by mobility */ |
| 96 | #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ | 102 | #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ |
| 97 | #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ | 103 | #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ |
| @@ -370,7 +376,7 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, | |||
| 370 | { | 376 | { |
| 371 | void *ret = kmem_cache_alloc(s, flags); | 377 | void *ret = kmem_cache_alloc(s, flags); |
| 372 | 378 | ||
| 373 | kasan_kmalloc(s, ret, size); | 379 | kasan_kmalloc(s, ret, size, flags); |
| 374 | return ret; | 380 | return ret; |
| 375 | } | 381 | } |
| 376 | 382 | ||
| @@ -381,7 +387,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
| 381 | { | 387 | { |
| 382 | void *ret = kmem_cache_alloc_node(s, gfpflags, node); | 388 | void *ret = kmem_cache_alloc_node(s, gfpflags, node); |
| 383 | 389 | ||
| 384 | kasan_kmalloc(s, ret, size); | 390 | kasan_kmalloc(s, ret, size, gfpflags); |
| 385 | return ret; | 391 | return ret; |
| 386 | } | 392 | } |
| 387 | #endif /* CONFIG_TRACING */ | 393 | #endif /* CONFIG_TRACING */ |
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index e878ba35ae91..9edbbf352340 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h | |||
| @@ -76,8 +76,22 @@ struct kmem_cache { | |||
| 76 | #ifdef CONFIG_MEMCG | 76 | #ifdef CONFIG_MEMCG |
| 77 | struct memcg_cache_params memcg_params; | 77 | struct memcg_cache_params memcg_params; |
| 78 | #endif | 78 | #endif |
| 79 | #ifdef CONFIG_KASAN | ||
| 80 | struct kasan_cache kasan_info; | ||
| 81 | #endif | ||
| 79 | 82 | ||
| 80 | struct kmem_cache_node *node[MAX_NUMNODES]; | 83 | struct kmem_cache_node *node[MAX_NUMNODES]; |
| 81 | }; | 84 | }; |
| 82 | 85 | ||
| 86 | static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, | ||
| 87 | void *x) { | ||
| 88 | void *object = x - (x - page->s_mem) % cache->size; | ||
| 89 | void *last_object = page->s_mem + (cache->num - 1) * cache->size; | ||
| 90 | |||
| 91 | if (unlikely(object > last_object)) | ||
| 92 | return last_object; | ||
| 93 | else | ||
| 94 | return object; | ||
| 95 | } | ||
| 96 | |||
| 83 | #endif /* _LINUX_SLAB_DEF_H */ | 97 | #endif /* _LINUX_SLAB_DEF_H */ |
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index ac5143f95ee6..665cd0cd18b8 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h | |||
| @@ -130,4 +130,15 @@ static inline void *virt_to_obj(struct kmem_cache *s, | |||
| 130 | void object_err(struct kmem_cache *s, struct page *page, | 130 | void object_err(struct kmem_cache *s, struct page *page, |
| 131 | u8 *object, char *reason); | 131 | u8 *object, char *reason); |
| 132 | 132 | ||
| 133 | static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, | ||
| 134 | void *x) { | ||
| 135 | void *object = x - (x - page_address(page)) % cache->size; | ||
| 136 | void *last_object = page_address(page) + | ||
| 137 | (page->objects - 1) * cache->size; | ||
| 138 | if (unlikely(object > last_object)) | ||
| 139 | return last_object; | ||
| 140 | else | ||
| 141 | return object; | ||
| 142 | } | ||
| 143 | |||
| 133 | #endif /* _LINUX_SLUB_DEF_H */ | 144 | #endif /* _LINUX_SLUB_DEF_H */ |
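nearest_obj() rounds an arbitrary pointer down to the base of the slab object containing it, clamping to the last object on the page. A worked example under assumed numbers:

    /* Assume page_address(page) == 0x1000, cache->size == 0x40,
     * page->objects == 8, and x == 0x10b0:
     *
     *   object      = 0x10b0 - (0x10b0 - 0x1000) % 0x40
     *               = 0x10b0 - 0x30 = 0x1080     (third object's base)
     *   last_object = 0x1000 + 7 * 0x40 = 0x11c0
     *
     * 0x1080 <= 0x11c0, so 0x1080 is returned.
     */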
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h new file mode 100644 index 000000000000..7978b3e2c1e1 --- /dev/null +++ b/include/linux/stackdepot.h | |||
| @@ -0,0 +1,32 @@ | |||
| 1 | /* | ||
| 2 | * A generic stack depot implementation | ||
| 3 | * | ||
| 4 | * Author: Alexander Potapenko <glider@google.com> | ||
| 5 | * Copyright (C) 2016 Google, Inc. | ||
| 6 | * | ||
| 7 | * Based on code by Dmitry Chernenkov. | ||
| 8 | * | ||
| 9 | * This program is free software; you can redistribute it and/or modify | ||
| 10 | * it under the terms of the GNU General Public License as published by | ||
| 11 | * the Free Software Foundation; either version 2 of the License, or | ||
| 12 | * (at your option) any later version. | ||
| 13 | * | ||
| 14 | * This program is distributed in the hope that it will be useful, | ||
| 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 17 | * GNU General Public License for more details. | ||
| 18 | * | ||
| 19 | */ | ||
| 20 | |||
| 21 | #ifndef _LINUX_STACKDEPOT_H | ||
| 22 | #define _LINUX_STACKDEPOT_H | ||
| 23 | |||
| 24 | typedef u32 depot_stack_handle_t; | ||
| 25 | |||
| 26 | struct stack_trace; | ||
| 27 | |||
| 28 | depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags); | ||
| 29 | |||
| 30 | void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace); | ||
| 31 | |||
| 32 | #endif | ||
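A hedged usage sketch of the two entry points above: save the current stack once, keep only the 32-bit handle, and expand it again when reporting. The helper names are illustrative; save_stack_trace() and print_stack_trace() are the existing <linux/stacktrace.h> API:

    #include <linux/stackdepot.h>
    #include <linux/stacktrace.h>

    static depot_stack_handle_t save_current_stack(gfp_t flags)
    {
        unsigned long entries[16];
        struct stack_trace trace = {
            .entries     = entries,
            .max_entries = ARRAY_SIZE(entries),
            .skip        = 2,    /* drop this helper and its caller */
        };

        save_stack_trace(&trace);
        return depot_save_stack(&trace, flags);  /* 0 on failure */
    }

    static void print_saved_stack(depot_stack_handle_t handle)
    {
        struct stack_trace trace;

        depot_fetch_stack(handle, &trace);
        print_stack_trace(&trace, 0);
    }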
diff --git a/kernel/exit.c b/kernel/exit.c index 953d1a1c0387..fd90195667e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) | |||
| 435 | mm_update_next_owner(mm); | 435 | mm_update_next_owner(mm); |
| 436 | mmput(mm); | 436 | mmput(mm); |
| 437 | if (test_thread_flag(TIF_MEMDIE)) | 437 | if (test_thread_flag(TIF_MEMDIE)) |
| 438 | exit_oom_victim(); | 438 | exit_oom_victim(tsk); |
| 439 | } | 439 | } |
| 440 | 440 | ||
| 441 | static struct task_struct *find_alive_thread(struct task_struct *p) | 441 | static struct task_struct *find_alive_thread(struct task_struct *p) |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 8aae49dd7da8..17caf4b63342 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; } | |||
| 227 | static inline void lockdep_softirq_end(bool in_hardirq) { } | 227 | static inline void lockdep_softirq_end(bool in_hardirq) { } |
| 228 | #endif | 228 | #endif |
| 229 | 229 | ||
| 230 | asmlinkage __visible void __do_softirq(void) | 230 | asmlinkage __visible void __softirq_entry __do_softirq(void) |
| 231 | { | 231 | { |
| 232 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; | 232 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
| 233 | unsigned long old_flags = current->flags; | 233 | unsigned long old_flags = current->flags; |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d1798fa0c743..73164c3aa56b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
| 1566 | } | 1566 | } |
| 1567 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1567 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
| 1568 | 1568 | ||
| 1569 | /* | ||
| 1570 | * Like schedule_timeout_uninterruptible(), except that this task will not | ||
| 1571 | * contribute to the load average. | ||
| 1572 | */ | ||
| 1573 | signed long __sched schedule_timeout_idle(signed long timeout) | ||
| 1574 | { | ||
| 1575 | __set_current_state(TASK_IDLE); | ||
| 1576 | return schedule_timeout(timeout); | ||
| 1577 | } | ||
| 1578 | EXPORT_SYMBOL(schedule_timeout_idle); | ||
| 1579 | |||
| 1569 | #ifdef CONFIG_HOTPLUG_CPU | 1580 | #ifdef CONFIG_HOTPLUG_CPU |
| 1570 | static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) | 1581 | static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) |
| 1571 | { | 1582 | { |
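A hedged usage sketch of the new helper: a background kthread that sleeps in TASK_IDLE, so its wait is uninterruptible yet does not inflate the load average (do_background_work() is hypothetical):

    #include <linux/kthread.h>
    #include <linux/sched.h>

    static int housekeeping_thread(void *unused)
    {
        while (!kthread_should_stop()) {
            do_background_work();
            /* Sleep one second without counting toward loadavg. */
            schedule_timeout_idle(HZ);
        }
        return 0;
    }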
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 91d6a63a2ea7..3a0244ff7ea8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | */ | 8 | */ |
| 9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
| 10 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
| 11 | #include <linux/interrupt.h> | ||
| 11 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
| 12 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
| 13 | 14 | ||
diff --git a/lib/Kconfig b/lib/Kconfig index 133ebc0c1773..3cca1222578e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig | |||
| @@ -536,4 +536,8 @@ config ARCH_HAS_PMEM_API | |||
| 536 | config ARCH_HAS_MMIO_FLUSH | 536 | config ARCH_HAS_MMIO_FLUSH |
| 537 | bool | 537 | bool |
| 538 | 538 | ||
| 539 | config STACKDEPOT | ||
| 540 | bool | ||
| 541 | select STACKTRACE | ||
| 542 | |||
| 539 | endmenu | 543 | endmenu |
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 0fee5acd5aa0..67d8c6838ba9 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan | |||
| @@ -5,8 +5,9 @@ if HAVE_ARCH_KASAN | |||
| 5 | 5 | ||
| 6 | config KASAN | 6 | config KASAN |
| 7 | bool "KASan: runtime memory debugger" | 7 | bool "KASan: runtime memory debugger" |
| 8 | depends on SLUB_DEBUG | 8 | depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) |
| 9 | select CONSTRUCTORS | 9 | select CONSTRUCTORS |
| 10 | select STACKDEPOT if SLAB | ||
| 10 | help | 11 | help |
| 11 | Enables kernel address sanitizer - runtime memory debugger, | 12 | Enables kernel address sanitizer - runtime memory debugger, |
| 12 | designed to find out-of-bounds accesses and use-after-free bugs. | 13 | designed to find out-of-bounds accesses and use-after-free bugs. |
| @@ -16,6 +17,8 @@ config KASAN | |||
| 16 | This feature consumes about 1/8 of available memory and brings about | 17 | This feature consumes about 1/8 of available memory and brings about |
| 17 | ~x3 performance slowdown. | 18 | ~x3 performance slowdown. |
| 18 | For better error detection enable CONFIG_STACKTRACE. | 19 | For better error detection enable CONFIG_STACKTRACE. |
| 20 | Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB | ||
| 21 | (the resulting kernel does not boot). | ||
| 19 | 22 | ||
| 20 | choice | 23 | choice |
| 21 | prompt "Instrumentation type" | 24 | prompt "Instrumentation type" |
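With the dependency relaxed, a SLAB-based KASAN build becomes legal; a hedged .config fragment for such a setup (STACKDEPOT is selected automatically when SLAB is chosen):

    CONFIG_SLAB=y
    # CONFIG_DEBUG_SLAB is not set (KASAN does not yet boot with it)
    CONFIG_KASAN=y
    CONFIG_KASAN_OUTLINE=y
    CONFIG_STACKTRACE=y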
diff --git a/lib/Makefile b/lib/Makefile index a1de5b61ff40..7bd6fd436c97 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
| @@ -181,6 +181,9 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o | |||
| 181 | obj-$(CONFIG_STMP_DEVICE) += stmp_device.o | 181 | obj-$(CONFIG_STMP_DEVICE) += stmp_device.o |
| 182 | obj-$(CONFIG_IRQ_POLL) += irq_poll.o | 182 | obj-$(CONFIG_IRQ_POLL) += irq_poll.o |
| 183 | 183 | ||
| 184 | obj-$(CONFIG_STACKDEPOT) += stackdepot.o | ||
| 185 | KASAN_SANITIZE_stackdepot.o := n | ||
| 186 | |||
| 184 | libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ | 187 | libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ |
| 185 | fdt_empty_tree.o | 188 | fdt_empty_tree.o |
| 186 | $(foreach file, $(libfdt_files), \ | 189 | $(foreach file, $(libfdt_files), \ |
diff --git a/lib/stackdepot.c b/lib/stackdepot.c new file mode 100644 index 000000000000..654c9d87e83a --- /dev/null +++ b/lib/stackdepot.c | |||
| @@ -0,0 +1,284 @@ | |||
| 1 | /* | ||
| 2 | * Generic stack depot for storing stack traces. | ||
| 3 | * | ||
| 4 | * Some debugging tools need to save stack traces of certain events which can | ||
| 5 | * be later presented to the user. For example, KASAN needs to save alloc and | ||
| 6 | * free stacks for each object, but storing two stack traces per object | ||
| 7 | * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for | ||
| 8 | * that). | ||
| 9 | * | ||
| 10 | * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc | ||
| 11 | * and free stacks repeat a lot, we save about 100x space. | ||
| 12 | * Stacks are never removed from depot, so we store them contiguously one after | ||
| 13 | * another in a contiguous memory allocation. | ||
| 14 | * | ||
| 15 | * Author: Alexander Potapenko <glider@google.com> | ||
| 16 | * Copyright (C) 2016 Google, Inc. | ||
| 17 | * | ||
| 18 | * Based on code by Dmitry Chernenkov. | ||
| 19 | * | ||
| 20 | * This program is free software; you can redistribute it and/or | ||
| 21 | * modify it under the terms of the GNU General Public License | ||
| 22 | * version 2 as published by the Free Software Foundation. | ||
| 23 | * | ||
| 24 | * This program is distributed in the hope that it will be useful, but | ||
| 25 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 26 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 27 | * General Public License for more details. | ||
| 28 | * | ||
| 29 | */ | ||
| 30 | |||
| 31 | #include <linux/gfp.h> | ||
| 32 | #include <linux/jhash.h> | ||
| 33 | #include <linux/kernel.h> | ||
| 34 | #include <linux/mm.h> | ||
| 35 | #include <linux/percpu.h> | ||
| 36 | #include <linux/printk.h> | ||
| 37 | #include <linux/slab.h> | ||
| 38 | #include <linux/stacktrace.h> | ||
| 39 | #include <linux/stackdepot.h> | ||
| 40 | #include <linux/string.h> | ||
| 41 | #include <linux/types.h> | ||
| 42 | |||
| 43 | #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) | ||
| 44 | |||
| 45 | #define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */ | ||
| 46 | #define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) | ||
| 47 | #define STACK_ALLOC_ALIGN 4 | ||
| 48 | #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ | ||
| 49 | STACK_ALLOC_ALIGN) | ||
| 50 | #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - STACK_ALLOC_OFFSET_BITS) | ||
| 51 | #define STACK_ALLOC_SLABS_CAP 1024 | ||
| 52 | #define STACK_ALLOC_MAX_SLABS \ | ||
| 53 | (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ | ||
| 54 | (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) | ||
| 55 | |||
| 56 | /* The compact structure to store the reference to stacks. */ | ||
| 57 | union handle_parts { | ||
| 58 | depot_stack_handle_t handle; | ||
| 59 | struct { | ||
| 60 | u32 slabindex : STACK_ALLOC_INDEX_BITS; | ||
| 61 | u32 offset : STACK_ALLOC_OFFSET_BITS; | ||
| 62 | }; | ||
| 63 | }; | ||
| 64 | |||
| 65 | struct stack_record { | ||
| 66 | struct stack_record *next; /* Link in the hashtable */ | ||
| 67 | u32 hash; /* Hash in the hashtable */ | ||
| 68 | u32 size; /* Number of frames in the stack */ | ||
| 69 | union handle_parts handle; | ||
| 70 | unsigned long entries[1]; /* Variable-sized array of entries. */ | ||
| 71 | }; | ||
| 72 | |||
| 73 | static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; | ||
| 74 | |||
| 75 | static int depot_index; | ||
| 76 | static int next_slab_inited; | ||
| 77 | static size_t depot_offset; | ||
| 78 | static DEFINE_SPINLOCK(depot_lock); | ||
| 79 | |||
| 80 | static bool init_stack_slab(void **prealloc) | ||
| 81 | { | ||
| 82 | if (!*prealloc) | ||
| 83 | return false; | ||
| 84 | /* | ||
| 85 | * This smp_load_acquire() pairs with smp_store_release() to | ||
| 86 | * |next_slab_inited| below and in depot_alloc_stack(). | ||
| 87 | */ | ||
| 88 | if (smp_load_acquire(&next_slab_inited)) | ||
| 89 | return true; | ||
| 90 | if (stack_slabs[depot_index] == NULL) { | ||
| 91 | stack_slabs[depot_index] = *prealloc; | ||
| 92 | } else { | ||
| 93 | stack_slabs[depot_index + 1] = *prealloc; | ||
| 94 | /* | ||
| 95 | * This smp_store_release pairs with smp_load_acquire() from | ||
| 96 | * |next_slab_inited| above and in depot_save_stack(). | ||
| 97 | */ | ||
| 98 | smp_store_release(&next_slab_inited, 1); | ||
| 99 | } | ||
| 100 | *prealloc = NULL; | ||
| 101 | return true; | ||
| 102 | } | ||
| 103 | |||
| 104 | /* Allocation of a new stack in raw storage */ | ||
| 105 | static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, | ||
| 106 | u32 hash, void **prealloc, gfp_t alloc_flags) | ||
| 107 | { | ||
| 108 | int required_size = offsetof(struct stack_record, entries) + | ||
| 109 | sizeof(unsigned long) * size; | ||
| 110 | struct stack_record *stack; | ||
| 111 | |||
| 112 | required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); | ||
| 113 | |||
| 114 | if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { | ||
| 115 | if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { | ||
| 116 | WARN_ONCE(1, "Stack depot reached limit capacity"); | ||
| 117 | return NULL; | ||
| 118 | } | ||
| 119 | depot_index++; | ||
| 120 | depot_offset = 0; | ||
| 121 | /* | ||
| 122 | * smp_store_release() here pairs with smp_load_acquire() from | ||
| 123 | * |next_slab_inited| in depot_save_stack() and | ||
| 124 | * init_stack_slab(). | ||
| 125 | */ | ||
| 126 | if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) | ||
| 127 | smp_store_release(&next_slab_inited, 0); | ||
| 128 | } | ||
| 129 | init_stack_slab(prealloc); | ||
| 130 | if (stack_slabs[depot_index] == NULL) | ||
| 131 | return NULL; | ||
| 132 | |||
| 133 | stack = stack_slabs[depot_index] + depot_offset; | ||
| 134 | |||
| 135 | stack->hash = hash; | ||
| 136 | stack->size = size; | ||
| 137 | stack->handle.slabindex = depot_index; | ||
| 138 | stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; | ||
| 139 | memcpy(stack->entries, entries, size * sizeof(unsigned long)); | ||
| 140 | depot_offset += required_size; | ||
| 141 | |||
| 142 | return stack; | ||
| 143 | } | ||
| 144 | |||
| 145 | #define STACK_HASH_ORDER 20 | ||
| 146 | #define STACK_HASH_SIZE (1L << STACK_HASH_ORDER) | ||
| 147 | #define STACK_HASH_MASK (STACK_HASH_SIZE - 1) | ||
| 148 | #define STACK_HASH_SEED 0x9747b28c | ||
| 149 | |||
| 150 | static struct stack_record *stack_table[STACK_HASH_SIZE] = { | ||
| 151 | [0 ... STACK_HASH_SIZE - 1] = NULL | ||
| 152 | }; | ||
| 153 | |||
| 154 | /* Calculate hash for a stack */ | ||
| 155 | static inline u32 hash_stack(unsigned long *entries, unsigned int size) | ||
| 156 | { | ||
| 157 | return jhash2((u32 *)entries, | ||
| 158 | size * sizeof(unsigned long) / sizeof(u32), | ||
| 159 | STACK_HASH_SEED); | ||
| 160 | } | ||
| 161 | |||
| 162 | /* Find a stack that is equal to the one stored in entries in the hash */ | ||
| 163 | static inline struct stack_record *find_stack(struct stack_record *bucket, | ||
| 164 | unsigned long *entries, int size, | ||
| 165 | u32 hash) | ||
| 166 | { | ||
| 167 | struct stack_record *found; | ||
| 168 | |||
| 169 | for (found = bucket; found; found = found->next) { | ||
| 170 | if (found->hash == hash && | ||
| 171 | found->size == size && | ||
| 172 | !memcmp(entries, found->entries, | ||
| 173 | size * sizeof(unsigned long))) { | ||
| 174 | return found; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | return NULL; | ||
| 178 | } | ||
| 179 | |||
| 180 | void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace) | ||
| 181 | { | ||
| 182 | union handle_parts parts = { .handle = handle }; | ||
| 183 | void *slab = stack_slabs[parts.slabindex]; | ||
| 184 | size_t offset = parts.offset << STACK_ALLOC_ALIGN; | ||
| 185 | struct stack_record *stack = slab + offset; | ||
| 186 | |||
| 187 | trace->nr_entries = trace->max_entries = stack->size; | ||
| 188 | trace->entries = stack->entries; | ||
| 189 | trace->skip = 0; | ||
| 190 | } | ||
| 191 | |||
| 192 | /** | ||
| 193 | * depot_save_stack - save stack in a stack depot. | ||
| 194 | * @trace - the stacktrace to save. | ||
| 195 | * @alloc_flags - flags for allocating additional memory if required. | ||
| 196 | * | ||
| 197 | * Returns the handle of the stack struct stored in depot. | ||
| 198 | */ | ||
| 199 | depot_stack_handle_t depot_save_stack(struct stack_trace *trace, | ||
| 200 | gfp_t alloc_flags) | ||
| 201 | { | ||
| 202 | u32 hash; | ||
| 203 | depot_stack_handle_t retval = 0; | ||
| 204 | struct stack_record *found = NULL, **bucket; | ||
| 205 | unsigned long flags; | ||
| 206 | struct page *page = NULL; | ||
| 207 | void *prealloc = NULL; | ||
| 208 | |||
| 209 | if (unlikely(trace->nr_entries == 0)) | ||
| 210 | goto fast_exit; | ||
| 211 | |||
| 212 | hash = hash_stack(trace->entries, trace->nr_entries); | ||
| 213 | /* Bad luck, we won't store this stack. */ | ||
| 214 | if (hash == 0) | ||
| 215 | goto exit; | ||
| 216 | |||
| 217 | bucket = &stack_table[hash & STACK_HASH_MASK]; | ||
| 218 | |||
| 219 | /* | ||
| 220 | * Fast path: look the stack trace up without locking. | ||
| 221 | * The smp_load_acquire() here pairs with smp_store_release() to | ||
| 222 | * |bucket| below. | ||
| 223 | */ | ||
| 224 | found = find_stack(smp_load_acquire(bucket), trace->entries, | ||
| 225 | trace->nr_entries, hash); | ||
| 226 | if (found) | ||
| 227 | goto exit; | ||
| 228 | |||
| 229 | /* | ||
| 230 | * Check if the current or the next stack slab need to be initialized. | ||
| 231 | * If so, allocate the memory - we won't be able to do that under the | ||
| 232 | * lock. | ||
| 233 | * | ||
| 234 | * The smp_load_acquire() here pairs with smp_store_release() to | ||
| 235 | * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). | ||
| 236 | */ | ||
| 237 | if (unlikely(!smp_load_acquire(&next_slab_inited))) { | ||
| 238 | /* | ||
| 239 | * Zero out zone modifiers, as we don't have specific zone | ||
| 240 | * requirements. Keep the flags related to allocation in atomic | ||
| 241 | * contexts and I/O. | ||
| 242 | */ | ||
| 243 | alloc_flags &= ~GFP_ZONEMASK; | ||
| 244 | alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); | ||
| 245 | page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); | ||
| 246 | if (page) | ||
| 247 | prealloc = page_address(page); | ||
| 248 | } | ||
| 249 | |||
| 250 | spin_lock_irqsave(&depot_lock, flags); | ||
| 251 | |||
| 252 | found = find_stack(*bucket, trace->entries, trace->nr_entries, hash); | ||
| 253 | if (!found) { | ||
| 254 | struct stack_record *new = | ||
| 255 | depot_alloc_stack(trace->entries, trace->nr_entries, | ||
| 256 | hash, &prealloc, alloc_flags); | ||
| 257 | if (new) { | ||
| 258 | new->next = *bucket; | ||
| 259 | /* | ||
| 260 | * This smp_store_release() pairs with | ||
| 261 | * smp_load_acquire() from |bucket| above. | ||
| 262 | */ | ||
| 263 | smp_store_release(bucket, new); | ||
| 264 | found = new; | ||
| 265 | } | ||
| 266 | } else if (prealloc) { | ||
| 267 | /* | ||
| 268 | * We didn't need to store this stack trace, but let's keep | ||
| 269 | * the preallocated memory for the future. | ||
| 270 | */ | ||
| 271 | WARN_ON(!init_stack_slab(&prealloc)); | ||
| 272 | } | ||
| 273 | |||
| 274 | spin_unlock_irqrestore(&depot_lock, flags); | ||
| 275 | exit: | ||
| 276 | if (prealloc) { | ||
| 277 | /* Nobody used this memory, ok to free it. */ | ||
| 278 | free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); | ||
| 279 | } | ||
| 280 | if (found) | ||
| 281 | retval = found->handle.handle; | ||
| 282 | fast_exit: | ||
| 283 | return retval; | ||
| 284 | } | ||
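Worked numbers for the handle packing above, assuming 4 KB pages (PAGE_SHIFT == 12):

    /* DEPOT_STACK_BITS        = 32 (depot_stack_handle_t is u32)
     * STACK_ALLOC_OFFSET_BITS = 2 + 12 - 4 = 10  (16-byte units)
     * STACK_ALLOC_INDEX_BITS  = 32 - 10    = 22
     * STACK_ALLOC_MAX_SLABS   = min(1 << 22, 1024) = 1024
     *
     * A handle can thus address 1024 slabs of 16 KB each, i.e. 16 MB
     * of deduplicated stack storage at 16-byte granularity.
     */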
diff --git a/lib/test_kasan.c b/lib/test_kasan.c index c32f3b0048dc..82169fbf2453 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c | |||
| @@ -65,11 +65,34 @@ static noinline void __init kmalloc_node_oob_right(void) | |||
| 65 | kfree(ptr); | 65 | kfree(ptr); |
| 66 | } | 66 | } |
| 67 | 67 | ||
| 68 | static noinline void __init kmalloc_large_oob_right(void) | 68 | #ifdef CONFIG_SLUB |
| 69 | static noinline void __init kmalloc_pagealloc_oob_right(void) | ||
| 69 | { | 70 | { |
| 70 | char *ptr; | 71 | char *ptr; |
| 71 | size_t size = KMALLOC_MAX_CACHE_SIZE + 10; | 72 | size_t size = KMALLOC_MAX_CACHE_SIZE + 10; |
| 72 | 73 | ||
| 74 | /* Allocate a chunk that does not fit into a SLUB cache to trigger | ||
| 75 | * the page allocator fallback. | ||
| 76 | */ | ||
| 77 | pr_info("kmalloc pagealloc allocation: out-of-bounds to right\n"); | ||
| 78 | ptr = kmalloc(size, GFP_KERNEL); | ||
| 79 | if (!ptr) { | ||
| 80 | pr_err("Allocation failed\n"); | ||
| 81 | return; | ||
| 82 | } | ||
| 83 | |||
| 84 | ptr[size] = 0; | ||
| 85 | kfree(ptr); | ||
| 86 | } | ||
| 87 | #endif | ||
| 88 | |||
| 89 | static noinline void __init kmalloc_large_oob_right(void) | ||
| 90 | { | ||
| 91 | char *ptr; | ||
| 92 | size_t size = KMALLOC_MAX_CACHE_SIZE - 256; | ||
| 93 | /* Allocate a chunk that is large enough, but still fits into a slab | ||
| 94 | * and does not trigger the page allocator fallback in SLUB. | ||
| 95 | */ | ||
| 73 | pr_info("kmalloc large allocation: out-of-bounds to right\n"); | 96 | pr_info("kmalloc large allocation: out-of-bounds to right\n"); |
| 74 | ptr = kmalloc(size, GFP_KERNEL); | 97 | ptr = kmalloc(size, GFP_KERNEL); |
| 75 | if (!ptr) { | 98 | if (!ptr) { |
| @@ -271,6 +294,8 @@ static noinline void __init kmalloc_uaf2(void) | |||
| 271 | } | 294 | } |
| 272 | 295 | ||
| 273 | ptr1[40] = 'x'; | 296 | ptr1[40] = 'x'; |
| 297 | if (ptr1 == ptr2) | ||
| 298 | pr_err("Could not detect use-after-free: ptr1 == ptr2\n"); | ||
| 274 | kfree(ptr2); | 299 | kfree(ptr2); |
| 275 | } | 300 | } |
| 276 | 301 | ||
| @@ -324,6 +349,9 @@ static int __init kmalloc_tests_init(void) | |||
| 324 | kmalloc_oob_right(); | 349 | kmalloc_oob_right(); |
| 325 | kmalloc_oob_left(); | 350 | kmalloc_oob_left(); |
| 326 | kmalloc_node_oob_right(); | 351 | kmalloc_node_oob_right(); |
| 352 | #ifdef CONFIG_SLUB | ||
| 353 | kmalloc_pagealloc_oob_right(); | ||
| 354 | #endif | ||
| 327 | kmalloc_large_oob_right(); | 355 | kmalloc_large_oob_right(); |
| 328 | kmalloc_oob_krealloc_more(); | 356 | kmalloc_oob_krealloc_more(); |
| 329 | kmalloc_oob_krealloc_less(); | 357 | kmalloc_oob_krealloc_less(); |
diff --git a/mm/Makefile b/mm/Makefile index f5e797cbd128..deb467edca2d 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
| @@ -3,6 +3,7 @@ | |||
| 3 | # | 3 | # |
| 4 | 4 | ||
| 5 | KASAN_SANITIZE_slab_common.o := n | 5 | KASAN_SANITIZE_slab_common.o := n |
| 6 | KASAN_SANITIZE_slab.o := n | ||
| 6 | KASAN_SANITIZE_slub.o := n | 7 | KASAN_SANITIZE_slub.o := n |
| 7 | 8 | ||
| 8 | # These files are disabled because they produce non-interesting and/or | 9 | # These files are disabled because they produce non-interesting and/or |
diff --git a/mm/filemap.c b/mm/filemap.c index 7c00f105845e..a8c69c8c0a90 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -1840,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |||
| 1840 | ssize_t retval = 0; | 1840 | ssize_t retval = 0; |
| 1841 | loff_t *ppos = &iocb->ki_pos; | 1841 | loff_t *ppos = &iocb->ki_pos; |
| 1842 | loff_t pos = *ppos; | 1842 | loff_t pos = *ppos; |
| 1843 | size_t count = iov_iter_count(iter); | ||
| 1844 | |||
| 1845 | if (!count) | ||
| 1846 | goto out; /* skip atime */ | ||
| 1843 | 1847 | ||
| 1844 | if (iocb->ki_flags & IOCB_DIRECT) { | 1848 | if (iocb->ki_flags & IOCB_DIRECT) { |
| 1845 | struct address_space *mapping = file->f_mapping; | 1849 | struct address_space *mapping = file->f_mapping; |
| 1846 | struct inode *inode = mapping->host; | 1850 | struct inode *inode = mapping->host; |
| 1847 | size_t count = iov_iter_count(iter); | ||
| 1848 | loff_t size; | 1851 | loff_t size; |
| 1849 | 1852 | ||
| 1850 | if (!count) | ||
| 1851 | goto out; /* skip atime */ | ||
| 1852 | size = i_size_read(inode); | 1853 | size = i_size_read(inode); |
| 1853 | retval = filemap_write_and_wait_range(mapping, pos, | 1854 | retval = filemap_write_and_wait_range(mapping, pos, |
| 1854 | pos + count - 1); | 1855 | pos + count - 1); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fbfb1b8d6726..86f9f8b82f8e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -2578,7 +2578,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2578 | } | 2578 | } |
| 2579 | khugepaged_node_load[node]++; | 2579 | khugepaged_node_load[node]++; |
| 2580 | if (!PageLRU(page)) { | 2580 | if (!PageLRU(page)) { |
| 2581 | result = SCAN_SCAN_ABORT; | 2581 | result = SCAN_PAGE_LRU; |
| 2582 | goto out_unmap; | 2582 | goto out_unmap; |
| 2583 | } | 2583 | } |
| 2584 | if (PageLocked(page)) { | 2584 | if (PageLocked(page)) { |
diff --git a/mm/internal.h b/mm/internal.h index 7449392c6faa..b79abb6721cf 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -38,6 +38,11 @@ | |||
| 38 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 38 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
| 39 | unsigned long floor, unsigned long ceiling); | 39 | unsigned long floor, unsigned long ceiling); |
| 40 | 40 | ||
| 41 | void unmap_page_range(struct mmu_gather *tlb, | ||
| 42 | struct vm_area_struct *vma, | ||
| 43 | unsigned long addr, unsigned long end, | ||
| 44 | struct zap_details *details); | ||
| 45 | |||
| 41 | extern int __do_page_cache_readahead(struct address_space *mapping, | 46 | extern int __do_page_cache_readahead(struct address_space *mapping, |
| 42 | struct file *filp, pgoff_t offset, unsigned long nr_to_read, | 47 | struct file *filp, pgoff_t offset, unsigned long nr_to_read, |
| 43 | unsigned long lookahead_size); | 48 | unsigned long lookahead_size); |
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1ad20ade8c91..acb3b6c4dd89 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
| @@ -17,7 +17,9 @@ | |||
| 17 | #define DISABLE_BRANCH_PROFILING | 17 | #define DISABLE_BRANCH_PROFILING |
| 18 | 18 | ||
| 19 | #include <linux/export.h> | 19 | #include <linux/export.h> |
| 20 | #include <linux/interrupt.h> | ||
| 20 | #include <linux/init.h> | 21 | #include <linux/init.h> |
| 22 | #include <linux/kasan.h> | ||
| 21 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
| 22 | #include <linux/kmemleak.h> | 24 | #include <linux/kmemleak.h> |
| 23 | #include <linux/linkage.h> | 25 | #include <linux/linkage.h> |
| @@ -32,7 +34,6 @@ | |||
| 32 | #include <linux/string.h> | 34 | #include <linux/string.h> |
| 33 | #include <linux/types.h> | 35 | #include <linux/types.h> |
| 34 | #include <linux/vmalloc.h> | 36 | #include <linux/vmalloc.h> |
| 35 | #include <linux/kasan.h> | ||
| 36 | 37 | ||
| 37 | #include "kasan.h" | 38 | #include "kasan.h" |
| 38 | #include "../slab.h" | 39 | #include "../slab.h" |
| @@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order) | |||
| 334 | KASAN_FREE_PAGE); | 335 | KASAN_FREE_PAGE); |
| 335 | } | 336 | } |
| 336 | 337 | ||
| 338 | #ifdef CONFIG_SLAB | ||
| 339 | /* | ||
| 340 | * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. | ||
| 341 | * For larger allocations larger redzones are used. | ||
| 342 | */ | ||
| 343 | static size_t optimal_redzone(size_t object_size) | ||
| 344 | { | ||
| 345 | int rz = | ||
| 346 | object_size <= 64 - 16 ? 16 : | ||
| 347 | object_size <= 128 - 32 ? 32 : | ||
| 348 | object_size <= 512 - 64 ? 64 : | ||
| 349 | object_size <= 4096 - 128 ? 128 : | ||
| 350 | object_size <= (1 << 14) - 256 ? 256 : | ||
| 351 | object_size <= (1 << 15) - 512 ? 512 : | ||
| 352 | object_size <= (1 << 16) - 1024 ? 1024 : 2048; | ||
| 353 | return rz; | ||
| 354 | } | ||
| 355 | |||
| 356 | void kasan_cache_create(struct kmem_cache *cache, size_t *size, | ||
| 357 | unsigned long *flags) | ||
| 358 | { | ||
| 359 | int redzone_adjust; | ||
| 360 | /* Make sure the adjusted size is still less than | ||
| 361 | * KMALLOC_MAX_CACHE_SIZE. | ||
| 362 | * TODO: this check is only needed for SLAB; it will have to be | ||
| 363 | * skipped once SLUB starts using kasan_cache_create(). | ||
| 364 | */ | ||
| 365 | if (*size > KMALLOC_MAX_CACHE_SIZE - | ||
| 366 | sizeof(struct kasan_alloc_meta) - | ||
| 367 | sizeof(struct kasan_free_meta)) | ||
| 368 | return; | ||
| 369 | *flags |= SLAB_KASAN; | ||
| 370 | /* Add alloc meta. */ | ||
| 371 | cache->kasan_info.alloc_meta_offset = *size; | ||
| 372 | *size += sizeof(struct kasan_alloc_meta); | ||
| 373 | |||
| 374 | /* Add free meta. */ | ||
| 375 | if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || | ||
| 376 | cache->object_size < sizeof(struct kasan_free_meta)) { | ||
| 377 | cache->kasan_info.free_meta_offset = *size; | ||
| 378 | *size += sizeof(struct kasan_free_meta); | ||
| 379 | } | ||
| 380 | redzone_adjust = optimal_redzone(cache->object_size) - | ||
| 381 | (*size - cache->object_size); | ||
| 382 | if (redzone_adjust > 0) | ||
| 383 | *size += redzone_adjust; | ||
| 384 | *size = min(KMALLOC_MAX_CACHE_SIZE, | ||
| 385 | max(*size, | ||
| 386 | cache->object_size + | ||
| 387 | optimal_redzone(cache->object_size))); | ||
| 388 | } | ||
| 389 | #endif | ||
| 390 | |||
| 337 | void kasan_poison_slab(struct page *page) | 391 | void kasan_poison_slab(struct page *page) |
| 338 | { | 392 | { |
| 339 | kasan_poison_shadow(page_address(page), | 393 | kasan_poison_shadow(page_address(page), |
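Note (not part of the patch): worked examples for optimal_redzone() above. Each threshold is a target size minus the redzone paired with it, so object plus redzone never outgrows that target:

	optimal_redzone(40)   == 16;	/*   40 <= 64 - 16              */
	optimal_redzone(100)  == 64;	/*  100 > 128 - 32, <= 512 - 64 */
	optimal_redzone(1000) == 128;	/* 1000 <= 4096 - 128           */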
| @@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) | |||
| 351 | kasan_poison_shadow(object, | 405 | kasan_poison_shadow(object, |
| 352 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), | 406 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), |
| 353 | KASAN_KMALLOC_REDZONE); | 407 | KASAN_KMALLOC_REDZONE); |
| 408 | #ifdef CONFIG_SLAB | ||
| 409 | if (cache->flags & SLAB_KASAN) { | ||
| 410 | struct kasan_alloc_meta *alloc_info = | ||
| 411 | get_alloc_info(cache, object); | ||
| 412 | alloc_info->state = KASAN_STATE_INIT; | ||
| 413 | } | ||
| 414 | #endif | ||
| 354 | } | 415 | } |
| 355 | 416 | ||
| 356 | void kasan_slab_alloc(struct kmem_cache *cache, void *object) | 417 | #ifdef CONFIG_SLAB |
| 418 | static inline int in_irqentry_text(unsigned long ptr) | ||
| 357 | { | 419 | { |
| 358 | kasan_kmalloc(cache, object, cache->object_size); | 420 | return (ptr >= (unsigned long)&__irqentry_text_start && |
| 421 | ptr < (unsigned long)&__irqentry_text_end) || | ||
| 422 | (ptr >= (unsigned long)&__softirqentry_text_start && | ||
| 423 | ptr < (unsigned long)&__softirqentry_text_end); | ||
| 424 | } | ||
| 425 | |||
| 426 | static inline void filter_irq_stacks(struct stack_trace *trace) | ||
| 427 | { | ||
| 428 | int i; | ||
| 429 | |||
| 430 | if (!trace->nr_entries) | ||
| 431 | return; | ||
| 432 | for (i = 0; i < trace->nr_entries; i++) | ||
| 433 | if (in_irqentry_text(trace->entries[i])) { | ||
| 434 | /* Include the irqentry function into the stack. */ | ||
| 435 | trace->nr_entries = i + 1; | ||
| 436 | break; | ||
| 437 | } | ||
| 438 | } | ||
| 439 | |||
| 440 | static inline depot_stack_handle_t save_stack(gfp_t flags) | ||
| 441 | { | ||
| 442 | unsigned long entries[KASAN_STACK_DEPTH]; | ||
| 443 | struct stack_trace trace = { | ||
| 444 | .nr_entries = 0, | ||
| 445 | .entries = entries, | ||
| 446 | .max_entries = KASAN_STACK_DEPTH, | ||
| 447 | .skip = 0 | ||
| 448 | }; | ||
| 449 | |||
| 450 | save_stack_trace(&trace); | ||
| 451 | filter_irq_stacks(&trace); | ||
| 452 | if (trace.nr_entries != 0 && | ||
| 453 | trace.entries[trace.nr_entries-1] == ULONG_MAX) | ||
| 454 | trace.nr_entries--; | ||
| 455 | |||
| 456 | return depot_save_stack(&trace, flags); | ||
| 457 | } | ||
| 458 | |||
| 459 | static inline void set_track(struct kasan_track *track, gfp_t flags) | ||
| 460 | { | ||
| 461 | track->pid = current->pid; | ||
| 462 | track->stack = save_stack(flags); | ||
| 463 | } | ||
| 464 | |||
| 465 | struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, | ||
| 466 | const void *object) | ||
| 467 | { | ||
| 468 | BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); | ||
| 469 | return (void *)object + cache->kasan_info.alloc_meta_offset; | ||
| 470 | } | ||
| 471 | |||
| 472 | struct kasan_free_meta *get_free_info(struct kmem_cache *cache, | ||
| 473 | const void *object) | ||
| 474 | { | ||
| 475 | BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); | ||
| 476 | return (void *)object + cache->kasan_info.free_meta_offset; | ||
| 477 | } | ||
| 478 | #endif | ||
| 479 | |||
| 480 | void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) | ||
| 481 | { | ||
| 482 | kasan_kmalloc(cache, object, cache->object_size, flags); | ||
| 359 | } | 483 | } |
| 360 | 484 | ||
| 361 | void kasan_slab_free(struct kmem_cache *cache, void *object) | 485 | void kasan_slab_free(struct kmem_cache *cache, void *object) |
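Note (not part of the patch): a sketch of the stack-depot round-trip behind set_track() above. depot_save_stack() deduplicates identical traces and returns a compact handle; depot_fetch_stack() expands it again at report time:

	struct stack_trace trace;
	depot_stack_handle_t handle = save_stack(GFP_NOWAIT);

	if (handle) {
		depot_fetch_stack(handle, &trace);	/* rebuilds entries/nr_entries */
		print_stack_trace(&trace, 0);
	}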
| @@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) | |||
| 367 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | 491 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) |
| 368 | return; | 492 | return; |
| 369 | 493 | ||
| 494 | #ifdef CONFIG_SLAB | ||
| 495 | if (cache->flags & SLAB_KASAN) { | ||
| 496 | struct kasan_free_meta *free_info = | ||
| 497 | get_free_info(cache, object); | ||
| 498 | struct kasan_alloc_meta *alloc_info = | ||
| 499 | get_alloc_info(cache, object); | ||
| 500 | alloc_info->state = KASAN_STATE_FREE; | ||
| 501 | set_track(&free_info->track, GFP_NOWAIT); | ||
| 502 | } | ||
| 503 | #endif | ||
| 504 | |||
| 370 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); | 505 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); |
| 371 | } | 506 | } |
| 372 | 507 | ||
| 373 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) | 508 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, |
| 509 | gfp_t flags) | ||
| 374 | { | 510 | { |
| 375 | unsigned long redzone_start; | 511 | unsigned long redzone_start; |
| 376 | unsigned long redzone_end; | 512 | unsigned long redzone_end; |
| @@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) | |||
| 386 | kasan_unpoison_shadow(object, size); | 522 | kasan_unpoison_shadow(object, size); |
| 387 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | 523 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, |
| 388 | KASAN_KMALLOC_REDZONE); | 524 | KASAN_KMALLOC_REDZONE); |
| 525 | #ifdef CONFIG_SLAB | ||
| 526 | if (cache->flags & SLAB_KASAN) { | ||
| 527 | struct kasan_alloc_meta *alloc_info = | ||
| 528 | get_alloc_info(cache, object); | ||
| 529 | |||
| 530 | alloc_info->state = KASAN_STATE_ALLOC; | ||
| 531 | alloc_info->alloc_size = size; | ||
| 532 | set_track(&alloc_info->track, flags); | ||
| 533 | } | ||
| 534 | #endif | ||
| 389 | } | 535 | } |
| 390 | EXPORT_SYMBOL(kasan_kmalloc); | 536 | EXPORT_SYMBOL(kasan_kmalloc); |
| 391 | 537 | ||
| 392 | void kasan_kmalloc_large(const void *ptr, size_t size) | 538 | void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) |
| 393 | { | 539 | { |
| 394 | struct page *page; | 540 | struct page *page; |
| 395 | unsigned long redzone_start; | 541 | unsigned long redzone_start; |
| @@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size) | |||
| 408 | KASAN_PAGE_REDZONE); | 554 | KASAN_PAGE_REDZONE); |
| 409 | } | 555 | } |
| 410 | 556 | ||
| 411 | void kasan_krealloc(const void *object, size_t size) | 557 | void kasan_krealloc(const void *object, size_t size, gfp_t flags) |
| 412 | { | 558 | { |
| 413 | struct page *page; | 559 | struct page *page; |
| 414 | 560 | ||
| @@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size) | |||
| 418 | page = virt_to_head_page(object); | 564 | page = virt_to_head_page(object); |
| 419 | 565 | ||
| 420 | if (unlikely(!PageSlab(page))) | 566 | if (unlikely(!PageSlab(page))) |
| 421 | kasan_kmalloc_large(object, size); | 567 | kasan_kmalloc_large(object, size, flags); |
| 422 | else | 568 | else |
| 423 | kasan_kmalloc(page->slab_cache, object, size); | 569 | kasan_kmalloc(page->slab_cache, object, size, flags); |
| 424 | } | 570 | } |
| 425 | 571 | ||
| 426 | void kasan_kfree(void *ptr) | 572 | void kasan_kfree(void *ptr) |
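Note (not part of the patch): the resulting SLAB object layout under SLAB_KASAN, as arranged by kasan_cache_create() above (sketch):

	/* [ payload: cache->object_size bytes                          ]
	 * [ struct kasan_alloc_meta, at kasan_info.alloc_meta_offset   ]
	 * [ struct kasan_free_meta, present only when the object itself
	 *   cannot hold it: RCU caches, caches with ctors, or objects
	 *   smaller than the struct                                    ]
	 * [ redzone padding, up to optimal_redzone(object_size)        ] */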
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4f6c62e5c21e..30a2f0ba0e09 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | #define __MM_KASAN_KASAN_H | 2 | #define __MM_KASAN_KASAN_H |
| 3 | 3 | ||
| 4 | #include <linux/kasan.h> | 4 | #include <linux/kasan.h> |
| 5 | #include <linux/stackdepot.h> | ||
| 5 | 6 | ||
| 6 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) | 7 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) |
| 7 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) | 8 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) |
| @@ -54,6 +55,42 @@ struct kasan_global { | |||
| 54 | #endif | 55 | #endif |
| 55 | }; | 56 | }; |
| 56 | 57 | ||
| 58 | /* | ||
| 59 | * Structures to keep alloc and free tracks. | ||
| 60 | */ | ||
| 61 | |||
| 62 | enum kasan_state { | ||
| 63 | KASAN_STATE_INIT, | ||
| 64 | KASAN_STATE_ALLOC, | ||
| 65 | KASAN_STATE_FREE | ||
| 66 | }; | ||
| 67 | |||
| 68 | #define KASAN_STACK_DEPTH 64 | ||
| 69 | |||
| 70 | struct kasan_track { | ||
| 71 | u32 pid; | ||
| 72 | depot_stack_handle_t stack; | ||
| 73 | }; | ||
| 74 | |||
| 75 | struct kasan_alloc_meta { | ||
| 76 | struct kasan_track track; | ||
| 77 | u32 state : 2; /* enum kasan_state */ | ||
| 78 | u32 alloc_size : 30; | ||
| 79 | u32 reserved; | ||
| 80 | }; | ||
| 81 | |||
| 82 | struct kasan_free_meta { | ||
| 83 | /* Allocator freelist pointer, unused by KASAN. */ | ||
| 84 | void **freelist; | ||
| 85 | struct kasan_track track; | ||
| 86 | }; | ||
| 87 | |||
| 88 | struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, | ||
| 89 | const void *object); | ||
| 90 | struct kasan_free_meta *get_free_info(struct kmem_cache *cache, | ||
| 91 | const void *object); | ||
| 92 | |||
| 93 | |||
| 57 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | 94 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) |
| 58 | { | 95 | { |
| 59 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) | 96 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) |
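Note (not part of the patch): the BUILD_BUG_ON(... > 32) checks in get_alloc_info()/get_free_info() hold with room to spare; on a 64-bit build (assumption) the sizes work out to:

	sizeof(struct kasan_track)      ==  8	/* u32 pid + u32 depot handle   */
	sizeof(struct kasan_alloc_meta) == 16	/* track + 2:30 bitfields + u32 */
	sizeof(struct kasan_free_meta)  == 16	/* void **freelist + track      */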
diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 745aa8f36028..60869a5a0124 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/printk.h> | 18 | #include <linux/printk.h> |
| 19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
| 20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
| 21 | #include <linux/stackdepot.h> | ||
| 21 | #include <linux/stacktrace.h> | 22 | #include <linux/stacktrace.h> |
| 22 | #include <linux/string.h> | 23 | #include <linux/string.h> |
| 23 | #include <linux/types.h> | 24 | #include <linux/types.h> |
| @@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr) | |||
| 115 | sizeof(init_thread_union.stack)); | 116 | sizeof(init_thread_union.stack)); |
| 116 | } | 117 | } |
| 117 | 118 | ||
| 119 | #ifdef CONFIG_SLAB | ||
| 120 | static void print_track(struct kasan_track *track) | ||
| 121 | { | ||
| 122 | pr_err("PID = %u\n", track->pid); | ||
| 123 | if (track->stack) { | ||
| 124 | struct stack_trace trace; | ||
| 125 | |||
| 126 | depot_fetch_stack(track->stack, &trace); | ||
| 127 | print_stack_trace(&trace, 0); | ||
| 128 | } else { | ||
| 129 | pr_err("(stack is not available)\n"); | ||
| 130 | } | ||
| 131 | } | ||
| 132 | |||
| 133 | static void object_err(struct kmem_cache *cache, struct page *page, | ||
| 134 | void *object, char *unused_reason) | ||
| 135 | { | ||
| 136 | struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); | ||
| 137 | struct kasan_free_meta *free_info; | ||
| 138 | |||
| 139 | dump_stack(); | ||
| 140 | pr_err("Object at %p, in cache %s\n", object, cache->name); | ||
| 141 | if (!(cache->flags & SLAB_KASAN)) | ||
| 142 | return; | ||
| 143 | switch (alloc_info->state) { | ||
| 144 | case KASAN_STATE_INIT: | ||
| 145 | pr_err("Object not allocated yet\n"); | ||
| 146 | break; | ||
| 147 | case KASAN_STATE_ALLOC: | ||
| 148 | pr_err("Object allocated with size %u bytes.\n", | ||
| 149 | alloc_info->alloc_size); | ||
| 150 | pr_err("Allocation:\n"); | ||
| 151 | print_track(&alloc_info->track); | ||
| 152 | break; | ||
| 153 | case KASAN_STATE_FREE: | ||
| 154 | pr_err("Object freed, allocated with size %u bytes\n", | ||
| 155 | alloc_info->alloc_size); | ||
| 156 | free_info = get_free_info(cache, object); | ||
| 157 | pr_err("Allocation:\n"); | ||
| 158 | print_track(&alloc_info->track); | ||
| 159 | pr_err("Deallocation:\n"); | ||
| 160 | print_track(&free_info->track); | ||
| 161 | break; | ||
| 162 | } | ||
| 163 | } | ||
| 164 | #endif | ||
| 165 | |||
| 118 | static void print_address_description(struct kasan_access_info *info) | 166 | static void print_address_description(struct kasan_access_info *info) |
| 119 | { | 167 | { |
| 120 | const void *addr = info->access_addr; | 168 | const void *addr = info->access_addr; |
| @@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info) | |||
| 126 | if (PageSlab(page)) { | 174 | if (PageSlab(page)) { |
| 127 | void *object; | 175 | void *object; |
| 128 | struct kmem_cache *cache = page->slab_cache; | 176 | struct kmem_cache *cache = page->slab_cache; |
| 129 | void *last_object; | 177 | object = nearest_obj(cache, page, |
| 130 | 178 | (void *)info->access_addr); | |
| 131 | object = virt_to_obj(cache, page_address(page), addr); | ||
| 132 | last_object = page_address(page) + | ||
| 133 | page->objects * cache->size; | ||
| 134 | |||
| 135 | if (unlikely(object > last_object)) | ||
| 136 | object = last_object; /* we hit into padding */ | ||
| 137 | |||
| 138 | object_err(cache, page, object, | 179 | object_err(cache, page, object, |
| 139 | "kasan: bad access detected"); | 180 | "kasan: bad access detected"); |
| 140 | return; | 181 | return; |
| 141 | } | 182 | } |
| 142 | dump_page(page, "kasan: bad access detected"); | 183 | dump_page(page, "kasan: bad access detected"); |
| @@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info) | |||
| 146 | if (!init_task_stack_addr(addr)) | 187 | if (!init_task_stack_addr(addr)) |
| 147 | pr_err("Address belongs to variable %pS\n", addr); | 188 | pr_err("Address belongs to variable %pS\n", addr); |
| 148 | } | 189 | } |
| 149 | |||
| 150 | dump_stack(); | 190 | dump_stack(); |
| 151 | } | 191 | } |
| 152 | 192 | ||
diff --git a/mm/memory.c b/mm/memory.c index 81dca0083fcd..098f00d05461 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -1102,6 +1102,12 @@ again: | |||
| 1102 | 1102 | ||
| 1103 | if (!PageAnon(page)) { | 1103 | if (!PageAnon(page)) { |
| 1104 | if (pte_dirty(ptent)) { | 1104 | if (pte_dirty(ptent)) { |
| 1105 | /* | ||
| 1106 | * oom_reaper cannot tear down dirty | ||
| 1107 | * pages | ||
| 1108 | */ | ||
| 1109 | if (unlikely(details && details->ignore_dirty)) | ||
| 1110 | continue; | ||
| 1105 | force_flush = 1; | 1111 | force_flush = 1; |
| 1106 | set_page_dirty(page); | 1112 | set_page_dirty(page); |
| 1107 | } | 1113 | } |
| @@ -1120,8 +1126,8 @@ again: | |||
| 1120 | } | 1126 | } |
| 1121 | continue; | 1127 | continue; |
| 1122 | } | 1128 | } |
| 1123 | /* If details->check_mapping, we leave swap entries. */ | 1129 | /* only check swap_entries if explicitly asked for in details */ |
| 1124 | if (unlikely(details)) | 1130 | if (unlikely(details && !details->check_swap_entries)) |
| 1125 | continue; | 1131 | continue; |
| 1126 | 1132 | ||
| 1127 | entry = pte_to_swp_entry(ptent); | 1133 | entry = pte_to_swp_entry(ptent); |
| @@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | |||
| 1226 | return addr; | 1232 | return addr; |
| 1227 | } | 1233 | } |
| 1228 | 1234 | ||
| 1229 | static void unmap_page_range(struct mmu_gather *tlb, | 1235 | void unmap_page_range(struct mmu_gather *tlb, |
| 1230 | struct vm_area_struct *vma, | 1236 | struct vm_area_struct *vma, |
| 1231 | unsigned long addr, unsigned long end, | 1237 | unsigned long addr, unsigned long end, |
| 1232 | struct zap_details *details) | 1238 | struct zap_details *details) |
| @@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb, | |||
| 1234 | pgd_t *pgd; | 1240 | pgd_t *pgd; |
| 1235 | unsigned long next; | 1241 | unsigned long next; |
| 1236 | 1242 | ||
| 1237 | if (details && !details->check_mapping) | ||
| 1238 | details = NULL; | ||
| 1239 | |||
| 1240 | BUG_ON(addr >= end); | 1243 | BUG_ON(addr >= end); |
| 1241 | tlb_start_vma(tlb, vma); | 1244 | tlb_start_vma(tlb, vma); |
| 1242 | pgd = pgd_offset(vma->vm_mm, addr); | 1245 | pgd = pgd_offset(vma->vm_mm, addr); |
| @@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, | |||
| 2432 | void unmap_mapping_range(struct address_space *mapping, | 2435 | void unmap_mapping_range(struct address_space *mapping, |
| 2433 | loff_t const holebegin, loff_t const holelen, int even_cows) | 2436 | loff_t const holebegin, loff_t const holelen, int even_cows) |
| 2434 | { | 2437 | { |
| 2435 | struct zap_details details; | 2438 | struct zap_details details = { }; |
| 2436 | pgoff_t hba = holebegin >> PAGE_SHIFT; | 2439 | pgoff_t hba = holebegin >> PAGE_SHIFT; |
| 2437 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2440 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 2438 | 2441 | ||
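Note (not part of the patch): the two zap_details fields used here are added to include/linux/mm.h elsewhere in this series; together they replace the old "details != NULL implies check_mapping" convention with explicit opt-ins, as the oom_reaper does below:

	struct zap_details details = {
		.check_swap_entries = true,	/* also zap swap entries */
		.ignore_dirty       = true,	/* skip dirty file pages */
	};

	unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, &details);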
diff --git a/mm/mempool.c b/mm/mempool.c index 07c383ddbbab..9b7a14a791cc 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
| @@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element) | |||
| 112 | kasan_free_pages(element, (unsigned long)pool->pool_data); | 112 | kasan_free_pages(element, (unsigned long)pool->pool_data); |
| 113 | } | 113 | } |
| 114 | 114 | ||
| 115 | static void kasan_unpoison_element(mempool_t *pool, void *element) | 115 | static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) |
| 116 | { | 116 | { |
| 117 | if (pool->alloc == mempool_alloc_slab) | 117 | if (pool->alloc == mempool_alloc_slab) |
| 118 | kasan_slab_alloc(pool->pool_data, element); | 118 | kasan_slab_alloc(pool->pool_data, element, flags); |
| 119 | if (pool->alloc == mempool_kmalloc) | 119 | if (pool->alloc == mempool_kmalloc) |
| 120 | kasan_krealloc(element, (size_t)pool->pool_data); | 120 | kasan_krealloc(element, (size_t)pool->pool_data, flags); |
| 121 | if (pool->alloc == mempool_alloc_pages) | 121 | if (pool->alloc == mempool_alloc_pages) |
| 122 | kasan_alloc_pages(element, (unsigned long)pool->pool_data); | 122 | kasan_alloc_pages(element, (unsigned long)pool->pool_data); |
| 123 | } | 123 | } |
| @@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element) | |||
| 130 | pool->elements[pool->curr_nr++] = element; | 130 | pool->elements[pool->curr_nr++] = element; |
| 131 | } | 131 | } |
| 132 | 132 | ||
| 133 | static void *remove_element(mempool_t *pool) | 133 | static void *remove_element(mempool_t *pool, gfp_t flags) |
| 134 | { | 134 | { |
| 135 | void *element = pool->elements[--pool->curr_nr]; | 135 | void *element = pool->elements[--pool->curr_nr]; |
| 136 | 136 | ||
| 137 | BUG_ON(pool->curr_nr < 0); | 137 | BUG_ON(pool->curr_nr < 0); |
| 138 | kasan_unpoison_element(pool, element); | 138 | kasan_unpoison_element(pool, element, flags); |
| 139 | check_element(pool, element); | 139 | check_element(pool, element); |
| 140 | return element; | 140 | return element; |
| 141 | } | 141 | } |
| @@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool) | |||
| 154 | return; | 154 | return; |
| 155 | 155 | ||
| 156 | while (pool->curr_nr) { | 156 | while (pool->curr_nr) { |
| 157 | void *element = remove_element(pool); | 157 | void *element = remove_element(pool, GFP_KERNEL); |
| 158 | pool->free(element, pool->pool_data); | 158 | pool->free(element, pool->pool_data); |
| 159 | } | 159 | } |
| 160 | kfree(pool->elements); | 160 | kfree(pool->elements); |
| @@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) | |||
| 250 | spin_lock_irqsave(&pool->lock, flags); | 250 | spin_lock_irqsave(&pool->lock, flags); |
| 251 | if (new_min_nr <= pool->min_nr) { | 251 | if (new_min_nr <= pool->min_nr) { |
| 252 | while (new_min_nr < pool->curr_nr) { | 252 | while (new_min_nr < pool->curr_nr) { |
| 253 | element = remove_element(pool); | 253 | element = remove_element(pool, GFP_KERNEL); |
| 254 | spin_unlock_irqrestore(&pool->lock, flags); | 254 | spin_unlock_irqrestore(&pool->lock, flags); |
| 255 | pool->free(element, pool->pool_data); | 255 | pool->free(element, pool->pool_data); |
| 256 | spin_lock_irqsave(&pool->lock, flags); | 256 | spin_lock_irqsave(&pool->lock, flags); |
| @@ -347,7 +347,7 @@ repeat_alloc: | |||
| 347 | 347 | ||
| 348 | spin_lock_irqsave(&pool->lock, flags); | 348 | spin_lock_irqsave(&pool->lock, flags); |
| 349 | if (likely(pool->curr_nr)) { | 349 | if (likely(pool->curr_nr)) { |
| 350 | element = remove_element(pool); | 350 | element = remove_element(pool, gfp_temp); |
| 351 | spin_unlock_irqrestore(&pool->lock, flags); | 351 | spin_unlock_irqrestore(&pool->lock, flags); |
| 352 | /* paired with rmb in mempool_free(), read comment there */ | 352 | /* paired with rmb in mempool_free(), read comment there */ |
| 353 | smp_wmb(); | 353 | smp_wmb(); |
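Note (not part of the patch, inferred intent): the gfp mask is threaded through so that KASAN can allocate stack-depot storage for the allocation track under the caller's constraints. The chain on the allocation path:

	mempool_alloc(pool, gfp_mask)
	  -> remove_element(pool, gfp_temp)
	    -> kasan_unpoison_element(pool, element, gfp_temp)
	      -> kasan_slab_alloc() / kasan_krealloc()
	        -> set_track() -> depot_save_stack(&trace, gfp_temp)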
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 06f7e1707847..b34d279a7ee6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -35,6 +35,11 @@ | |||
| 35 | #include <linux/freezer.h> | 35 | #include <linux/freezer.h> |
| 36 | #include <linux/ftrace.h> | 36 | #include <linux/ftrace.h> |
| 37 | #include <linux/ratelimit.h> | 37 | #include <linux/ratelimit.h> |
| 38 | #include <linux/kthread.h> | ||
| 39 | #include <linux/init.h> | ||
| 40 | |||
| 41 | #include <asm/tlb.h> | ||
| 42 | #include "internal.h" | ||
| 38 | 43 | ||
| 39 | #define CREATE_TRACE_POINTS | 44 | #define CREATE_TRACE_POINTS |
| 40 | #include <trace/events/oom.h> | 45 | #include <trace/events/oom.h> |
| @@ -405,6 +410,172 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); | |||
| 405 | 410 | ||
| 406 | bool oom_killer_disabled __read_mostly; | 411 | bool oom_killer_disabled __read_mostly; |
| 407 | 412 | ||
| 413 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
| 414 | |||
| 415 | #ifdef CONFIG_MMU | ||
| 416 | /* | ||
| 417 | * OOM Reaper kernel thread which tries to reap the memory used by the OOM | ||
| 418 | * victim (if that is possible) to help the OOM killer to move on. | ||
| 419 | */ | ||
| 420 | static struct task_struct *oom_reaper_th; | ||
| 421 | static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); | ||
| 422 | static struct task_struct *oom_reaper_list; | ||
| 423 | static DEFINE_SPINLOCK(oom_reaper_lock); | ||
| 424 | |||
| 425 | |||
| 426 | static bool __oom_reap_task(struct task_struct *tsk) | ||
| 427 | { | ||
| 428 | struct mmu_gather tlb; | ||
| 429 | struct vm_area_struct *vma; | ||
| 430 | struct mm_struct *mm; | ||
| 431 | struct task_struct *p; | ||
| 432 | struct zap_details details = {.check_swap_entries = true, | ||
| 433 | .ignore_dirty = true}; | ||
| 434 | bool ret = true; | ||
| 435 | |||
| 436 | /* | ||
| 437 | * Make sure we find the associated mm_struct even when the particular | ||
| 438 | * thread has already terminated and cleared its mm. | ||
| 439 | * We might race with the exit path, so consider our work done | ||
| 440 | * if there is no mm. | ||
| 441 | */ | ||
| 442 | p = find_lock_task_mm(tsk); | ||
| 443 | if (!p) | ||
| 444 | return true; | ||
| 445 | |||
| 446 | mm = p->mm; | ||
| 447 | if (!atomic_inc_not_zero(&mm->mm_users)) { | ||
| 448 | task_unlock(p); | ||
| 449 | return true; | ||
| 450 | } | ||
| 451 | |||
| 452 | task_unlock(p); | ||
| 453 | |||
| 454 | if (!down_read_trylock(&mm->mmap_sem)) { | ||
| 455 | ret = false; | ||
| 456 | goto out; | ||
| 457 | } | ||
| 458 | |||
| 459 | tlb_gather_mmu(&tlb, mm, 0, -1); | ||
| 460 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 461 | if (is_vm_hugetlb_page(vma)) | ||
| 462 | continue; | ||
| 463 | |||
| 464 | /* | ||
| 465 | * mlocked VMAs require explicit munlocking before unmap. | ||
| 466 | * Let's keep it simple here and skip such VMAs. | ||
| 467 | */ | ||
| 468 | if (vma->vm_flags & VM_LOCKED) | ||
| 469 | continue; | ||
| 470 | |||
| 471 | /* | ||
| 472 | * Only anonymous pages have a good chance to be dropped | ||
| 473 | * without additional steps which we cannot afford as we | ||
| 474 | * are OOM already. | ||
| 475 | * | ||
| 476 | * We do not even care about fs-backed pages because all | ||
| 477 | * reclaimable ones have already been reclaimed, and we do not | ||
| 478 | * want to block exit_mmap by keeping the mm refcount elevated | ||
| 479 | * without a good reason. | ||
| 480 | */ | ||
| 481 | if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) | ||
| 482 | unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, | ||
| 483 | &details); | ||
| 484 | } | ||
| 485 | tlb_finish_mmu(&tlb, 0, -1); | ||
| 486 | pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", | ||
| 487 | task_pid_nr(tsk), tsk->comm, | ||
| 488 | K(get_mm_counter(mm, MM_ANONPAGES)), | ||
| 489 | K(get_mm_counter(mm, MM_FILEPAGES)), | ||
| 490 | K(get_mm_counter(mm, MM_SHMEMPAGES))); | ||
| 491 | up_read(&mm->mmap_sem); | ||
| 492 | |||
| 493 | /* | ||
| 494 | * Clear TIF_MEMDIE because the task shouldn't be sitting on | ||
| 495 | * reasonably reclaimable memory anymore. The OOM killer can | ||
| 496 | * continue by selecting another victim if unmapping hasn't led | ||
| 497 | * to any improvement; it also means there is no point in | ||
| 498 | * selecting this task again. | ||
| 499 | */ | ||
| 500 | tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; | ||
| 501 | exit_oom_victim(tsk); | ||
| 502 | out: | ||
| 503 | mmput(mm); | ||
| 504 | return ret; | ||
| 505 | } | ||
| 506 | |||
| 507 | #define MAX_OOM_REAP_RETRIES 10 | ||
| 508 | static void oom_reap_task(struct task_struct *tsk) | ||
| 509 | { | ||
| 510 | int attempts = 0; | ||
| 511 | |||
| 512 | /* Retry the down_read_trylock(mmap_sem) a few times */ | ||
| 513 | while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) | ||
| 514 | schedule_timeout_idle(HZ/10); | ||
| 515 | |||
| 516 | if (attempts > MAX_OOM_REAP_RETRIES) { | ||
| 517 | pr_info("oom_reaper: unable to reap pid:%d (%s)\n", | ||
| 518 | task_pid_nr(tsk), tsk->comm); | ||
| 519 | debug_show_all_locks(); | ||
| 520 | } | ||
| 521 | |||
| 522 | /* Drop a reference taken by wake_oom_reaper */ | ||
| 523 | put_task_struct(tsk); | ||
| 524 | } | ||
| 525 | |||
| 526 | static int oom_reaper(void *unused) | ||
| 527 | { | ||
| 528 | set_freezable(); | ||
| 529 | |||
| 530 | while (true) { | ||
| 531 | struct task_struct *tsk = NULL; | ||
| 532 | |||
| 533 | wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); | ||
| 534 | spin_lock(&oom_reaper_lock); | ||
| 535 | if (oom_reaper_list != NULL) { | ||
| 536 | tsk = oom_reaper_list; | ||
| 537 | oom_reaper_list = tsk->oom_reaper_list; | ||
| 538 | } | ||
| 539 | spin_unlock(&oom_reaper_lock); | ||
| 540 | |||
| 541 | if (tsk) | ||
| 542 | oom_reap_task(tsk); | ||
| 543 | } | ||
| 544 | |||
| 545 | return 0; | ||
| 546 | } | ||
| 547 | |||
| 548 | static void wake_oom_reaper(struct task_struct *tsk) | ||
| 549 | { | ||
| 550 | if (!oom_reaper_th || tsk->oom_reaper_list) | ||
| 551 | return; | ||
| 552 | |||
| 553 | get_task_struct(tsk); | ||
| 554 | |||
| 555 | spin_lock(&oom_reaper_lock); | ||
| 556 | tsk->oom_reaper_list = oom_reaper_list; | ||
| 557 | oom_reaper_list = tsk; | ||
| 558 | spin_unlock(&oom_reaper_lock); | ||
| 559 | wake_up(&oom_reaper_wait); | ||
| 560 | } | ||
| 561 | |||
| 562 | static int __init oom_init(void) | ||
| 563 | { | ||
| 564 | oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); | ||
| 565 | if (IS_ERR(oom_reaper_th)) { | ||
| 566 | pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", | ||
| 567 | PTR_ERR(oom_reaper_th)); | ||
| 568 | oom_reaper_th = NULL; | ||
| 569 | } | ||
| 570 | return 0; | ||
| 571 | } | ||
| 572 | subsys_initcall(oom_init) | ||
| 573 | #else | ||
| 574 | static void wake_oom_reaper(struct task_struct *tsk) | ||
| 575 | { | ||
| 576 | } | ||
| 577 | #endif | ||
| 578 | |||
| 408 | /** | 579 | /** |
| 409 | * mark_oom_victim - mark the given task as OOM victim | 580 | * mark_oom_victim - mark the given task as OOM victim |
| 410 | * @tsk: task to mark | 581 | * @tsk: task to mark |
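Note (not part of the patch): the hand-off between killer and reaper, both sides of which appear in the hunk above:

	/* producer: oom_kill_process()     consumer: oom_reaper() kthread
	 *   get_task_struct(victim)          wait_event_freezable(...)
	 *   lock; push on oom_reaper_list;   lock; pop one task; unlock
	 *   unlock; wake_up(...)             oom_reap_task(tsk), which ends
	 *                                    with put_task_struct(tsk)     */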
| @@ -431,9 +602,10 @@ void mark_oom_victim(struct task_struct *tsk) | |||
| 431 | /** | 602 | /** |
| 432 | * exit_oom_victim - note the exit of an OOM victim | 603 | * exit_oom_victim - note the exit of an OOM victim |
| 433 | */ | 604 | */ |
| 434 | void exit_oom_victim(void) | 605 | void exit_oom_victim(struct task_struct *tsk) |
| 435 | { | 606 | { |
| 436 | clear_thread_flag(TIF_MEMDIE); | 607 | if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) |
| 608 | return; | ||
| 437 | 609 | ||
| 438 | if (!atomic_dec_return(&oom_victims)) | 610 | if (!atomic_dec_return(&oom_victims)) |
| 439 | wake_up_all(&oom_victims_wait); | 611 | wake_up_all(&oom_victims_wait); |
| @@ -494,7 +666,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) | |||
| 494 | return false; | 666 | return false; |
| 495 | } | 667 | } |
| 496 | 668 | ||
| 497 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
| 498 | /* | 669 | /* |
| 499 | * Must be called while holding a reference to p, which will be released upon | 670 | * Must be called while holding a reference to p, which will be released upon |
| 500 | * returning. | 671 | * returning. |
| @@ -510,6 +681,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
| 510 | unsigned int victim_points = 0; | 681 | unsigned int victim_points = 0; |
| 511 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 682 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
| 512 | DEFAULT_RATELIMIT_BURST); | 683 | DEFAULT_RATELIMIT_BURST); |
| 684 | bool can_oom_reap = true; | ||
| 513 | 685 | ||
| 514 | /* | 686 | /* |
| 515 | * If the task is already exiting, don't alarm the sysadmin or kill | 687 | * If the task is already exiting, don't alarm the sysadmin or kill |
| @@ -600,17 +772,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
| 600 | continue; | 772 | continue; |
| 601 | if (same_thread_group(p, victim)) | 773 | if (same_thread_group(p, victim)) |
| 602 | continue; | 774 | continue; |
| 603 | if (unlikely(p->flags & PF_KTHREAD)) | 775 | if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || |
| 604 | continue; | 776 | p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { |
| 605 | if (is_global_init(p)) | 777 | /* |
| 606 | continue; | 778 | * We cannot use oom_reaper for the mm shared by this |
| 607 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | 779 | * process because it wouldn't get killed and so the |
| 780 | * memory might be still used. | ||
| 781 | */ | ||
| 782 | can_oom_reap = false; | ||
| 608 | continue; | 783 | continue; |
| 609 | 784 | } | |
| 610 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 785 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
| 611 | } | 786 | } |
| 612 | rcu_read_unlock(); | 787 | rcu_read_unlock(); |
| 613 | 788 | ||
| 789 | if (can_oom_reap) | ||
| 790 | wake_oom_reaper(victim); | ||
| 791 | |||
| 614 | mmdrop(mm); | 792 | mmdrop(mm); |
| 615 | put_task_struct(victim); | 793 | put_task_struct(victim); |
| 616 | } | 794 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a762be57e46e..59de90d5d3a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -692,34 +692,28 @@ static inline void __free_one_page(struct page *page, | |||
| 692 | unsigned long combined_idx; | 692 | unsigned long combined_idx; |
| 693 | unsigned long uninitialized_var(buddy_idx); | 693 | unsigned long uninitialized_var(buddy_idx); |
| 694 | struct page *buddy; | 694 | struct page *buddy; |
| 695 | unsigned int max_order = MAX_ORDER; | 695 | unsigned int max_order; |
| 696 | |||
| 697 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | ||
| 696 | 698 | ||
| 697 | VM_BUG_ON(!zone_is_initialized(zone)); | 699 | VM_BUG_ON(!zone_is_initialized(zone)); |
| 698 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | 700 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); |
| 699 | 701 | ||
| 700 | VM_BUG_ON(migratetype == -1); | 702 | VM_BUG_ON(migratetype == -1); |
| 701 | if (is_migrate_isolate(migratetype)) { | 703 | if (likely(!is_migrate_isolate(migratetype))) |
| 702 | /* | ||
| 703 | * We restrict max order of merging to prevent merge | ||
| 704 | * between freepages on isolate pageblock and normal | ||
| 705 | * pageblock. Without this, pageblock isolation | ||
| 706 | * could cause incorrect freepage accounting. | ||
| 707 | */ | ||
| 708 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | ||
| 709 | } else { | ||
| 710 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 704 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
| 711 | } | ||
| 712 | 705 | ||
| 713 | page_idx = pfn & ((1 << max_order) - 1); | 706 | page_idx = pfn & ((1 << MAX_ORDER) - 1); |
| 714 | 707 | ||
| 715 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); | 708 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
| 716 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 709 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
| 717 | 710 | ||
| 711 | continue_merging: | ||
| 718 | while (order < max_order - 1) { | 712 | while (order < max_order - 1) { |
| 719 | buddy_idx = __find_buddy_index(page_idx, order); | 713 | buddy_idx = __find_buddy_index(page_idx, order); |
| 720 | buddy = page + (buddy_idx - page_idx); | 714 | buddy = page + (buddy_idx - page_idx); |
| 721 | if (!page_is_buddy(page, buddy, order)) | 715 | if (!page_is_buddy(page, buddy, order)) |
| 722 | break; | 716 | goto done_merging; |
| 723 | /* | 717 | /* |
| 724 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, | 718 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
| 725 | * merge with it and move up one order. | 719 | * merge with it and move up one order. |
| @@ -736,6 +730,32 @@ static inline void __free_one_page(struct page *page, | |||
| 736 | page_idx = combined_idx; | 730 | page_idx = combined_idx; |
| 737 | order++; | 731 | order++; |
| 738 | } | 732 | } |
| 733 | if (max_order < MAX_ORDER) { | ||
| 734 | /* If we are here, it means order is >= pageblock_order. | ||
| 735 | * We want to prevent merge between freepages on isolate | ||
| 736 | * pageblock and normal pageblock. Without this, pageblock | ||
| 737 | * isolation could cause incorrect freepage or CMA accounting. | ||
| 738 | * | ||
| 739 | * We don't want to hit this code for the more frequent | ||
| 740 | * low-order merging. | ||
| 741 | */ | ||
| 742 | if (unlikely(has_isolate_pageblock(zone))) { | ||
| 743 | int buddy_mt; | ||
| 744 | |||
| 745 | buddy_idx = __find_buddy_index(page_idx, order); | ||
| 746 | buddy = page + (buddy_idx - page_idx); | ||
| 747 | buddy_mt = get_pageblock_migratetype(buddy); | ||
| 748 | |||
| 749 | if (migratetype != buddy_mt | ||
| 750 | && (is_migrate_isolate(migratetype) || | ||
| 751 | is_migrate_isolate(buddy_mt))) | ||
| 752 | goto done_merging; | ||
| 753 | } | ||
| 754 | max_order++; | ||
| 755 | goto continue_merging; | ||
| 756 | } | ||
| 757 | |||
| 758 | done_merging: | ||
| 739 | set_page_order(page, order); | 759 | set_page_order(page, order); |
| 740 | 760 | ||
| 741 | /* | 761 | /* |
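Note (not part of the patch): a worked pass through the restructured merge loop, assuming x86 defaults (pageblock_order == 9, MAX_ORDER == 11):

	/* max_order starts at min(11, 9 + 1) == 10, so the fast loop merges
	 * buddies through order 8 with no migratetype checks, yielding at
	 * most an order-9 (pageblock-sized) page. Only then, and only if
	 * the zone has an isolated pageblock, is the order-9 buddy's
	 * migratetype inspected: the merge stops if exactly one side is
	 * isolated; otherwise max_order is bumped to 11 and merging can
	 * continue up to order 10. */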
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -2086,6 +2086,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
| 2086 | } | 2086 | } |
| 2087 | #endif | 2087 | #endif |
| 2088 | 2088 | ||
| 2089 | kasan_cache_create(cachep, &size, &flags); | ||
| 2090 | |||
| 2089 | size = ALIGN(size, cachep->align); | 2091 | size = ALIGN(size, cachep->align); |
| 2090 | /* | 2092 | /* |
| 2091 | * We should restrict the number of objects in a slab to implement | 2093 | * We should restrict the number of objects in a slab to implement |
| @@ -2387,8 +2389,13 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) | |||
| 2387 | * cache which they are a constructor for. Otherwise, deadlock. | 2389 | * cache which they are a constructor for. Otherwise, deadlock. |
| 2388 | * They must also be threaded. | 2390 | * They must also be threaded. |
| 2389 | */ | 2391 | */ |
| 2390 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2392 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { |
| 2393 | kasan_unpoison_object_data(cachep, | ||
| 2394 | objp + obj_offset(cachep)); | ||
| 2391 | cachep->ctor(objp + obj_offset(cachep)); | 2395 | cachep->ctor(objp + obj_offset(cachep)); |
| 2396 | kasan_poison_object_data( | ||
| 2397 | cachep, objp + obj_offset(cachep)); | ||
| 2398 | } | ||
| 2392 | 2399 | ||
| 2393 | if (cachep->flags & SLAB_RED_ZONE) { | 2400 | if (cachep->flags & SLAB_RED_ZONE) { |
| 2394 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2401 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
| @@ -2409,6 +2416,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
| 2409 | struct page *page) | 2416 | struct page *page) |
| 2410 | { | 2417 | { |
| 2411 | int i; | 2418 | int i; |
| 2419 | void *objp; | ||
| 2412 | 2420 | ||
| 2413 | cache_init_objs_debug(cachep, page); | 2421 | cache_init_objs_debug(cachep, page); |
| 2414 | 2422 | ||
| @@ -2419,8 +2427,12 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
| 2419 | 2427 | ||
| 2420 | for (i = 0; i < cachep->num; i++) { | 2428 | for (i = 0; i < cachep->num; i++) { |
| 2421 | /* constructor could break poison info */ | 2429 | /* constructor could break poison info */ |
| 2422 | if (DEBUG == 0 && cachep->ctor) | 2430 | if (DEBUG == 0 && cachep->ctor) { |
| 2423 | cachep->ctor(index_to_obj(cachep, page, i)); | 2431 | objp = index_to_obj(cachep, page, i); |
| 2432 | kasan_unpoison_object_data(cachep, objp); | ||
| 2433 | cachep->ctor(objp); | ||
| 2434 | kasan_poison_object_data(cachep, objp); | ||
| 2435 | } | ||
| 2424 | 2436 | ||
| 2425 | set_free_obj(page, i, i); | 2437 | set_free_obj(page, i, i); |
| 2426 | } | 2438 | } |
| @@ -2550,6 +2562,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
| 2550 | 2562 | ||
| 2551 | slab_map_pages(cachep, page, freelist); | 2563 | slab_map_pages(cachep, page, freelist); |
| 2552 | 2564 | ||
| 2565 | kasan_poison_slab(page); | ||
| 2553 | cache_init_objs(cachep, page); | 2566 | cache_init_objs(cachep, page); |
| 2554 | 2567 | ||
| 2555 | if (gfpflags_allow_blocking(local_flags)) | 2568 | if (gfpflags_allow_blocking(local_flags)) |
| @@ -3316,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
| 3316 | { | 3329 | { |
| 3317 | struct array_cache *ac = cpu_cache_get(cachep); | 3330 | struct array_cache *ac = cpu_cache_get(cachep); |
| 3318 | 3331 | ||
| 3332 | kasan_slab_free(cachep, objp); | ||
| 3333 | |||
| 3319 | check_irq_off(); | 3334 | check_irq_off(); |
| 3320 | kmemleak_free_recursive(objp, cachep->flags); | 3335 | kmemleak_free_recursive(objp, cachep->flags); |
| 3321 | objp = cache_free_debugcheck(cachep, objp, caller); | 3336 | objp = cache_free_debugcheck(cachep, objp, caller); |
| @@ -3363,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3363 | { | 3378 | { |
| 3364 | void *ret = slab_alloc(cachep, flags, _RET_IP_); | 3379 | void *ret = slab_alloc(cachep, flags, _RET_IP_); |
| 3365 | 3380 | ||
| 3381 | kasan_slab_alloc(cachep, ret, flags); | ||
| 3366 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3382 | trace_kmem_cache_alloc(_RET_IP_, ret, |
| 3367 | cachep->object_size, cachep->size, flags); | 3383 | cachep->object_size, cachep->size, flags); |
| 3368 | 3384 | ||
| @@ -3428,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) | |||
| 3428 | 3444 | ||
| 3429 | ret = slab_alloc(cachep, flags, _RET_IP_); | 3445 | ret = slab_alloc(cachep, flags, _RET_IP_); |
| 3430 | 3446 | ||
| 3447 | kasan_kmalloc(cachep, ret, size, flags); | ||
| 3431 | trace_kmalloc(_RET_IP_, ret, | 3448 | trace_kmalloc(_RET_IP_, ret, |
| 3432 | size, cachep->size, flags); | 3449 | size, cachep->size, flags); |
| 3433 | return ret; | 3450 | return ret; |
| @@ -3451,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 3451 | { | 3468 | { |
| 3452 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3469 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
| 3453 | 3470 | ||
| 3471 | kasan_slab_alloc(cachep, ret, flags); | ||
| 3454 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3472 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
| 3455 | cachep->object_size, cachep->size, | 3473 | cachep->object_size, cachep->size, |
| 3456 | flags, nodeid); | 3474 | flags, nodeid); |
| @@ -3469,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, | |||
| 3469 | 3487 | ||
| 3470 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3488 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
| 3471 | 3489 | ||
| 3490 | kasan_kmalloc(cachep, ret, size, flags); | ||
| 3472 | trace_kmalloc_node(_RET_IP_, ret, | 3491 | trace_kmalloc_node(_RET_IP_, ret, |
| 3473 | size, cachep->size, | 3492 | size, cachep->size, |
| 3474 | flags, nodeid); | 3493 | flags, nodeid); |
| @@ -3481,11 +3500,15 @@ static __always_inline void * | |||
| 3481 | __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) | 3500 | __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) |
| 3482 | { | 3501 | { |
| 3483 | struct kmem_cache *cachep; | 3502 | struct kmem_cache *cachep; |
| 3503 | void *ret; | ||
| 3484 | 3504 | ||
| 3485 | cachep = kmalloc_slab(size, flags); | 3505 | cachep = kmalloc_slab(size, flags); |
| 3486 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3506 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
| 3487 | return cachep; | 3507 | return cachep; |
| 3488 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); | 3508 | ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); |
| 3509 | kasan_kmalloc(cachep, ret, size, flags); | ||
| 3510 | |||
| 3511 | return ret; | ||
| 3489 | } | 3512 | } |
| 3490 | 3513 | ||
| 3491 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3514 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
| @@ -3519,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
| 3519 | return cachep; | 3542 | return cachep; |
| 3520 | ret = slab_alloc(cachep, flags, caller); | 3543 | ret = slab_alloc(cachep, flags, caller); |
| 3521 | 3544 | ||
| 3545 | kasan_kmalloc(cachep, ret, size, flags); | ||
| 3522 | trace_kmalloc(caller, ret, | 3546 | trace_kmalloc(caller, ret, |
| 3523 | size, cachep->size, flags); | 3547 | size, cachep->size, flags); |
| 3524 | 3548 | ||
| @@ -4290,10 +4314,18 @@ module_init(slab_proc_init); | |||
| 4290 | */ | 4314 | */ |
| 4291 | size_t ksize(const void *objp) | 4315 | size_t ksize(const void *objp) |
| 4292 | { | 4316 | { |
| 4317 | size_t size; | ||
| 4318 | |||
| 4293 | BUG_ON(!objp); | 4319 | BUG_ON(!objp); |
| 4294 | if (unlikely(objp == ZERO_SIZE_PTR)) | 4320 | if (unlikely(objp == ZERO_SIZE_PTR)) |
| 4295 | return 0; | 4321 | return 0; |
| 4296 | 4322 | ||
| 4297 | return virt_to_cache(objp)->object_size; | 4323 | size = virt_to_cache(objp)->object_size; |
| 4324 | /* We assume that ksize callers could use the whole allocated area, | ||
| 4325 | * so we need to unpoison this area. | ||
| 4326 | */ | ||
| 4327 | kasan_krealloc(objp, size, GFP_NOWAIT); | ||
| 4328 | |||
| 4329 | return size; | ||
| 4298 | } | 4330 | } |
| 4299 | EXPORT_SYMBOL(ksize); | 4331 | EXPORT_SYMBOL(ksize); |
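Note (not part of the patch): why ksize() must unpoison. A caller may legitimately use the whole object, not just the size it asked for:

	char *p = kmalloc(5, GFP_KERNEL);
	size_t n = ksize(p);	/* e.g. 32: the real object size;
				 * KASAN unpoisons all n bytes */

	memset(p, 0, n);	/* legal, no out-of-bounds report */
	kfree(p);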
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
| @@ -405,7 +405,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | |||
| 405 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 405 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
| 406 | kmemleak_alloc_recursive(object, s->object_size, 1, | 406 | kmemleak_alloc_recursive(object, s->object_size, 1, |
| 407 | s->flags, flags); | 407 | s->flags, flags); |
| 408 | kasan_slab_alloc(s, object); | 408 | kasan_slab_alloc(s, object, flags); |
| 409 | } | 409 | } |
| 410 | memcg_kmem_put_cache(s); | 410 | memcg_kmem_put_cache(s); |
| 411 | } | 411 | } |
diff --git a/mm/slab_common.c b/mm/slab_common.c index b2e379639a5b..3239bfd758e6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
| @@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache; | |||
| 35 | */ | 35 | */ |
| 36 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 36 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
| 37 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ | 37 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ |
| 38 | SLAB_FAILSLAB) | 38 | SLAB_FAILSLAB | SLAB_KASAN) |
| 39 | 39 | ||
| 40 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ | 40 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ |
| 41 | SLAB_NOTRACK | SLAB_ACCOUNT) | 41 | SLAB_NOTRACK | SLAB_ACCOUNT) |
| @@ -1013,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | |||
| 1013 | page = alloc_kmem_pages(flags, order); | 1013 | page = alloc_kmem_pages(flags, order); |
| 1014 | ret = page ? page_address(page) : NULL; | 1014 | ret = page ? page_address(page) : NULL; |
| 1015 | kmemleak_alloc(ret, size, 1, flags); | 1015 | kmemleak_alloc(ret, size, 1, flags); |
| 1016 | kasan_kmalloc_large(ret, size); | 1016 | kasan_kmalloc_large(ret, size, flags); |
| 1017 | return ret; | 1017 | return ret; |
| 1018 | } | 1018 | } |
| 1019 | EXPORT_SYMBOL(kmalloc_order); | 1019 | EXPORT_SYMBOL(kmalloc_order); |
| @@ -1192,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, | |||
| 1192 | ks = ksize(p); | 1192 | ks = ksize(p); |
| 1193 | 1193 | ||
| 1194 | if (ks >= new_size) { | 1194 | if (ks >= new_size) { |
| 1195 | kasan_krealloc((void *)p, new_size); | 1195 | kasan_krealloc((void *)p, new_size, flags); |
| 1196 | return (void *)p; | 1196 | return (void *)p; |
| 1197 | } | 1197 | } |
| 1198 | 1198 | ||
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -1313,7 +1313,7 @@ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | |||
| 1313 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | 1313 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) |
| 1314 | { | 1314 | { |
| 1315 | kmemleak_alloc(ptr, size, 1, flags); | 1315 | kmemleak_alloc(ptr, size, 1, flags); |
| 1316 | kasan_kmalloc_large(ptr, size); | 1316 | kasan_kmalloc_large(ptr, size, flags); |
| 1317 | } | 1317 | } |
| 1318 | 1318 | ||
| 1319 | static inline void kfree_hook(const void *x) | 1319 | static inline void kfree_hook(const void *x) |
| @@ -2596,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) | |||
| 2596 | { | 2596 | { |
| 2597 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); | 2597 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
| 2598 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | 2598 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); |
| 2599 | kasan_kmalloc(s, ret, size); | 2599 | kasan_kmalloc(s, ret, size, gfpflags); |
| 2600 | return ret; | 2600 | return ret; |
| 2601 | } | 2601 | } |
| 2602 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 2602 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
| @@ -2624,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
| 2624 | trace_kmalloc_node(_RET_IP_, ret, | 2624 | trace_kmalloc_node(_RET_IP_, ret, |
| 2625 | size, s->size, gfpflags, node); | 2625 | size, s->size, gfpflags, node); |
| 2626 | 2626 | ||
| 2627 | kasan_kmalloc(s, ret, size); | 2627 | kasan_kmalloc(s, ret, size, gfpflags); |
| 2628 | return ret; | 2628 | return ret; |
| 2629 | } | 2629 | } |
| 2630 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | 2630 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
| @@ -3182,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node) | |||
| 3182 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 3182 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
| 3183 | init_tracking(kmem_cache_node, n); | 3183 | init_tracking(kmem_cache_node, n); |
| 3184 | #endif | 3184 | #endif |
| 3185 | kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); | 3185 | kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), |
| 3186 | GFP_KERNEL); | ||
| 3186 | init_kmem_cache_node(n); | 3187 | init_kmem_cache_node(n); |
| 3187 | inc_slabs_node(kmem_cache_node, node, page->objects); | 3188 | inc_slabs_node(kmem_cache_node, node, page->objects); |
| 3188 | 3189 | ||
| @@ -3561,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
| 3561 | 3562 | ||
| 3562 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | 3563 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); |
| 3563 | 3564 | ||
| 3564 | kasan_kmalloc(s, ret, size); | 3565 | kasan_kmalloc(s, ret, size, flags); |
| 3565 | 3566 | ||
| 3566 | return ret; | 3567 | return ret; |
| 3567 | } | 3568 | } |
| @@ -3606,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
| 3606 | 3607 | ||
| 3607 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | 3608 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); |
| 3608 | 3609 | ||
| 3609 | kasan_kmalloc(s, ret, size); | 3610 | kasan_kmalloc(s, ret, size, flags); |
| 3610 | 3611 | ||
| 3611 | return ret; | 3612 | return ret; |
| 3612 | } | 3613 | } |
| @@ -3635,7 +3636,7 @@ size_t ksize(const void *object) | |||
| 3635 | size_t size = __ksize(object); | 3636 | size_t size = __ksize(object); |
| 3636 | /* We assume that ksize callers could use whole allocated area, | 3637 | /* We assume that ksize callers could use whole allocated area, |
| 3637 | so we need unpoison this area. */ | 3638 | so we need unpoison this area. */ |
| 3638 | kasan_krealloc(object, size); | 3639 | kasan_krealloc(object, size, GFP_NOWAIT); |
| 3639 | return size; | 3640 | return size; |
| 3640 | } | 3641 | } |
| 3641 | EXPORT_SYMBOL(ksize); | 3642 | EXPORT_SYMBOL(ksize); |
