author    Linus Torvalds <torvalds@linux-foundation.org>    2016-03-25 19:59:11 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2016-03-25 19:59:11 -0400
commit    606c61a0579669c292dc5f5e1cf898edecfc0d53
tree      569aa7e9b99571890bfccd7278bbc303cfa0a919
parent    15dbc136dff62ebefb03353cfb7d308d49b275f3
parent    0fda2788b03c1868e2f20b3b7995b8cc2adf4715
Merge branch 'akpm' (patches from Andrew)
Merge fourth patch-bomb from Andrew Morton:
 "A lot more stuff than expected, sorry. A bunch of ocfs2 reviewing was
  finished off.

   - mhocko's oom-reaper out-of-memory-handler changes

   - ocfs2 fixes and features

   - KASAN feature work

   - various fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (42 commits)
  thp: fix typo in khugepaged_scan_pmd()
  MAINTAINERS: fill entries for KASAN
  mm/filemap: generic_file_read_iter(): check for zero reads unconditionally
  kasan: test fix: warn if the UAF could not be detected in kmalloc_uaf2
  mm, kasan: stackdepot implementation. Enable stackdepot for SLAB
  arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections
  mm, kasan: add GFP flags to KASAN API
  mm, kasan: SLAB support
  kasan: modify kmalloc_large_oob_right(), add kmalloc_pagealloc_oob_right()
  include/linux/oom.h: remove undefined oom_kills_count()/note_oom_kill()
  mm/page_alloc: prevent merging between isolated and other pageblocks
  drivers/memstick/host/r592.c: avoid gcc-6 warning
  ocfs2: extend enough credits for freeing one truncate record while replaying truncate records
  ocfs2: extend transaction for ocfs2_remove_rightmost_path() and ocfs2_update_edge_lengths() before to avoid inconsistency between inode and et
  ocfs2/dlm: move lock to the tail of grant queue while doing in-place convert
  ocfs2: solve a problem of crossing the boundary in updating backups
  ocfs2: fix occurring deadlock by changing ocfs2_wq from global to local
  ocfs2/dlm: fix BUG in dlm_move_lockres_to_recovery_list
  ocfs2/dlm: fix race between convert and recovery
  ocfs2: fix a deadlock issue in ocfs2_dio_end_io_write()
  ...
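Several of the commits above extend KASAN's self-tests in lib/test_kasan.c (kmalloc_uaf2, kmalloc_pagealloc_oob_right and friends). As a rough illustration of the bug class those tests exercise, here is a minimal sketch in the style of test_kasan.c; the function names and sizes are invented for illustration and are not the code added by this merge.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

/*
 * Illustrative only: a use-after-free that a CONFIG_KASAN=y kernel
 * should report, similar in spirit to the kmalloc_uaf* tests in
 * lib/test_kasan.c.  Not part of this merge.
 */
static noinline void __init kasan_uaf_example(void)
{
	char *ptr;
	size_t size = 10;

	ptr = kmalloc(size, GFP_KERNEL);
	if (!ptr) {
		pr_err("allocation failed\n");
		return;
	}

	kfree(ptr);
	*(ptr + 8) = 'x';	/* invalid write into freed memory */
}

static int __init kasan_uaf_example_init(void)
{
	kasan_uaf_example();
	return -EAGAIN;	/* fail the load on purpose, as test_kasan.c does */
}
module_init(kasan_uaf_example_init);
MODULE_LICENSE("GPL");

Loading such a module on a KASAN-enabled kernel should produce a use-after-free report pointing at the invalid write, with allocation and free stacks attached.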
-rw-r--r--Documentation/kasan.txt5
-rw-r--r--MAINTAINERS14
-rw-r--r--arch/arm/include/asm/exception.h2
-rw-r--r--arch/arm/kernel/vmlinux.lds.S1
-rw-r--r--arch/arm64/include/asm/exception.h2
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S1
-rw-r--r--arch/blackfin/kernel/vmlinux.lds.S1
-rw-r--r--arch/c6x/kernel/vmlinux.lds.S1
-rw-r--r--arch/metag/kernel/vmlinux.lds.S1
-rw-r--r--arch/microblaze/kernel/vmlinux.lds.S1
-rw-r--r--arch/mips/kernel/vmlinux.lds.S1
-rw-r--r--arch/nios2/kernel/vmlinux.lds.S1
-rw-r--r--arch/openrisc/kernel/vmlinux.lds.S1
-rw-r--r--arch/parisc/kernel/vmlinux.lds.S1
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S1
-rw-r--r--arch/s390/kernel/vmlinux.lds.S1
-rw-r--r--arch/sh/kernel/vmlinux.lds.S1
-rw-r--r--arch/sparc/kernel/vmlinux.lds.S1
-rw-r--r--arch/tile/kernel/vmlinux.lds.S1
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
-rw-r--r--drivers/input/input-compat.c6
-rw-r--r--drivers/input/input-compat.h4
-rw-r--r--drivers/input/input.c2
-rw-r--r--drivers/input/misc/uinput.c4
-rw-r--r--drivers/memstick/host/r592.c3
-rw-r--r--fs/ocfs2/alloc.c105
-rw-r--r--fs/ocfs2/aops.c1146
-rw-r--r--fs/ocfs2/aops.h19
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c30
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/file.c165
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/ocfs2/inode.h6
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/ocfs2.h8
-rw-r--r--fs/ocfs2/ocfs2_trace.h16
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/resize.c2
-rw-r--r--fs/ocfs2/super.c39
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--include/asm-generic/vmlinux.lds.h12
-rw-r--r--include/linux/ftrace.h11
-rw-r--r--include/linux/interrupt.h20
-rw-r--r--include/linux/kasan.h31
-rw-r--r--include/linux/mm.h2
-rw-r--r--include/linux/oom.h4
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/slab.h10
-rw-r--r--include/linux/slab_def.h14
-rw-r--r--include/linux/slub_def.h11
-rw-r--r--include/linux/stackdepot.h32
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/time/timer.c11
-rw-r--r--kernel/trace/trace_functions_graph.c1
-rw-r--r--lib/Kconfig4
-rw-r--r--lib/Kconfig.kasan5
-rw-r--r--lib/Makefile3
-rw-r--r--lib/stackdepot.c284
-rw-r--r--lib/test_kasan.c30
-rw-r--r--mm/Makefile1
-rw-r--r--mm/filemap.c7
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/internal.h5
-rw-r--r--mm/kasan/kasan.c162
-rw-r--r--mm/kasan/kasan.h37
-rw-r--r--mm/kasan/report.c62
-rw-r--r--mm/memory.c17
-rw-r--r--mm/mempool.c16
-rw-r--r--mm/oom_kill.c196
-rw-r--r--mm/page_alloc.c46
-rw-r--r--mm/slab.c42
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slab_common.c6
-rw-r--r--mm/slub.c15
79 files changed, 1770 insertions, 962 deletions
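Two of the files in the stat above are new: include/linux/stackdepot.h and lib/stackdepot.c, which add a deduplicating store for stack traces that the KASAN SLAB work uses to record allocation and free stacks cheaply. Below is a hedged sketch of how a caller would use that interface; the signatures are recalled from the 4.6-era header and may not match this merge exactly.

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>

/* Save the current stack once and get back a small handle. */
static depot_stack_handle_t save_current_stack(gfp_t flags)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.skip		= 2,	/* drop this helper and its caller */
	};

	save_stack_trace(&trace);
	/* Identical traces hash to the same handle, so callers only need
	 * to store a u32 per object instead of a full trace. */
	return depot_save_stack(&trace, flags);
}

/* Later, e.g. from an error report, expand the handle back into frames. */
static void print_saved_stack(depot_stack_handle_t handle)
{
	struct stack_trace trace;

	depot_fetch_stack(handle, &trace);
	print_stack_trace(&trace, 0);
}

The gfp_t argument is also why "mm, kasan: add GFP flags to KASAN API" appears in this series: the allocation context has to be threaded down so the depot can honour it when it grows.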
diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt
index aa1e0c91e368..7dd95b35cd7c 100644
--- a/Documentation/kasan.txt
+++ b/Documentation/kasan.txt
@@ -12,8 +12,7 @@ KASAN uses compile-time instrumentation for checking every memory access,
12therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is 12therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
13required for detection of out-of-bounds accesses to stack or global variables. 13required for detection of out-of-bounds accesses to stack or global variables.
14 14
15Currently KASAN is supported only for x86_64 architecture and requires the 15Currently KASAN is supported only for x86_64 architecture.
16kernel to be built with the SLUB allocator.
17 16
181. Usage 171. Usage
19======== 18========
@@ -27,7 +26,7 @@ inline are compiler instrumentation types. The former produces smaller binary
27the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC 26the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC
28version 5.0 or later. 27version 5.0 or later.
29 28
30Currently KASAN works only with the SLUB memory allocator. 29KASAN works with both SLUB and SLAB memory allocators.
31For better bug detection and nicer reporting, enable CONFIG_STACKTRACE. 30For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
32 31
33To disable instrumentation for specific files or directories, add a line 32To disable instrumentation for specific files or directories, add a line
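The documentation hunk above stops at the sentence about disabling instrumentation for specific files or directories; the mechanism it refers to is the KASAN_SANITIZE kbuild variable. A minimal sketch, with the object name chosen purely as an example:

# In the Makefile that builds the object, skip instrumentation for one file:
KASAN_SANITIZE_main.o := n

# Or for everything built from that Makefile:
KASAN_SANITIZE := n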
diff --git a/MAINTAINERS b/MAINTAINERS
index f07a174bbc81..df8cf6b924c6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6165,6 +6165,20 @@ S: Maintained
6165F: Documentation/hwmon/k8temp 6165F: Documentation/hwmon/k8temp
6166F: drivers/hwmon/k8temp.c 6166F: drivers/hwmon/k8temp.c
6167 6167
6168KASAN
6169M: Andrey Ryabinin <aryabinin@virtuozzo.com>
6170R: Alexander Potapenko <glider@google.com>
6171R: Dmitry Vyukov <dvyukov@google.com>
6172L: kasan-dev@googlegroups.com
6173S: Maintained
6174F: arch/*/include/asm/kasan.h
6175F: arch/*/mm/kasan_init*
6176F: Documentation/kasan.txt
6177F: include/linux/kasan.h
6178F: lib/test_kasan.c
6179F: mm/kasan/
6180F: scripts/Makefile.kasan
6181
6168KCONFIG 6182KCONFIG
6169M: "Yann E. MORIN" <yann.morin.1998@free.fr> 6183M: "Yann E. MORIN" <yann.morin.1998@free.fr>
6170L: linux-kbuild@vger.kernel.org 6184L: linux-kbuild@vger.kernel.org
diff --git a/arch/arm/include/asm/exception.h b/arch/arm/include/asm/exception.h
index 5abaf5bbd985..bf1991263d2d 100644
--- a/arch/arm/include/asm/exception.h
+++ b/arch/arm/include/asm/exception.h
@@ -7,7 +7,7 @@
7#ifndef __ASM_ARM_EXCEPTION_H 7#ifndef __ASM_ARM_EXCEPTION_H
8#define __ASM_ARM_EXCEPTION_H 8#define __ASM_ARM_EXCEPTION_H
9 9
10#include <linux/ftrace.h> 10#include <linux/interrupt.h>
11 11
12#define __exception __attribute__((section(".exception.text"))) 12#define __exception __attribute__((section(".exception.text")))
13#ifdef CONFIG_FUNCTION_GRAPH_TRACER 13#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 1fab979daeaf..e2c6da096cef 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -108,6 +108,7 @@ SECTIONS
108 *(.exception.text) 108 *(.exception.text)
109 __exception_text_end = .; 109 __exception_text_end = .;
110 IRQENTRY_TEXT 110 IRQENTRY_TEXT
111 SOFTIRQENTRY_TEXT
111 TEXT_TEXT 112 TEXT_TEXT
112 SCHED_TEXT 113 SCHED_TEXT
113 LOCK_TEXT 114 LOCK_TEXT
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 6cb7e1a6bc02..0c2eec490abf 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -18,7 +18,7 @@
18#ifndef __ASM_EXCEPTION_H 18#ifndef __ASM_EXCEPTION_H
19#define __ASM_EXCEPTION_H 19#define __ASM_EXCEPTION_H
20 20
21#include <linux/ftrace.h> 21#include <linux/interrupt.h>
22 22
23#define __exception __attribute__((section(".exception.text"))) 23#define __exception __attribute__((section(".exception.text")))
24#ifdef CONFIG_FUNCTION_GRAPH_TRACER 24#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 37f624df68fa..5a1939a74ff3 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -103,6 +103,7 @@ SECTIONS
103 *(.exception.text) 103 *(.exception.text)
104 __exception_text_end = .; 104 __exception_text_end = .;
105 IRQENTRY_TEXT 105 IRQENTRY_TEXT
106 SOFTIRQENTRY_TEXT
106 TEXT_TEXT 107 TEXT_TEXT
107 SCHED_TEXT 108 SCHED_TEXT
108 LOCK_TEXT 109 LOCK_TEXT
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index c9eec84aa258..d920b959ff3a 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -35,6 +35,7 @@ SECTIONS
35#endif 35#endif
36 LOCK_TEXT 36 LOCK_TEXT
37 IRQENTRY_TEXT 37 IRQENTRY_TEXT
38 SOFTIRQENTRY_TEXT
38 KPROBES_TEXT 39 KPROBES_TEXT
39#ifdef CONFIG_ROMKERNEL 40#ifdef CONFIG_ROMKERNEL
40 __sinittext = .; 41 __sinittext = .;
diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S
index 5a6e141d1641..50bc10f97bcb 100644
--- a/arch/c6x/kernel/vmlinux.lds.S
+++ b/arch/c6x/kernel/vmlinux.lds.S
@@ -72,6 +72,7 @@ SECTIONS
72 SCHED_TEXT 72 SCHED_TEXT
73 LOCK_TEXT 73 LOCK_TEXT
74 IRQENTRY_TEXT 74 IRQENTRY_TEXT
75 SOFTIRQENTRY_TEXT
75 KPROBES_TEXT 76 KPROBES_TEXT
76 *(.fixup) 77 *(.fixup)
77 *(.gnu.warning) 78 *(.gnu.warning)
diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S
index e12055e88bfe..150ace92c7ad 100644
--- a/arch/metag/kernel/vmlinux.lds.S
+++ b/arch/metag/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@ SECTIONS
24 LOCK_TEXT 24 LOCK_TEXT
25 KPROBES_TEXT 25 KPROBES_TEXT
26 IRQENTRY_TEXT 26 IRQENTRY_TEXT
27 SOFTIRQENTRY_TEXT
27 *(.text.*) 28 *(.text.*)
28 *(.gnu.warning) 29 *(.gnu.warning)
29 } 30 }
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index be9488d69734..0a47f0410554 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -36,6 +36,7 @@ SECTIONS {
36 LOCK_TEXT 36 LOCK_TEXT
37 KPROBES_TEXT 37 KPROBES_TEXT
38 IRQENTRY_TEXT 38 IRQENTRY_TEXT
39 SOFTIRQENTRY_TEXT
39 . = ALIGN (4) ; 40 . = ALIGN (4) ;
40 _etext = . ; 41 _etext = . ;
41 } 42 }
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 0a93e83cd014..54d653ee17e1 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -58,6 +58,7 @@ SECTIONS
58 LOCK_TEXT 58 LOCK_TEXT
59 KPROBES_TEXT 59 KPROBES_TEXT
60 IRQENTRY_TEXT 60 IRQENTRY_TEXT
61 SOFTIRQENTRY_TEXT
61 *(.text.*) 62 *(.text.*)
62 *(.fixup) 63 *(.fixup)
63 *(.gnu.warning) 64 *(.gnu.warning)
diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S
index 326fab40a9de..e23e89539967 100644
--- a/arch/nios2/kernel/vmlinux.lds.S
+++ b/arch/nios2/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
39 SCHED_TEXT 39 SCHED_TEXT
40 LOCK_TEXT 40 LOCK_TEXT
41 IRQENTRY_TEXT 41 IRQENTRY_TEXT
42 SOFTIRQENTRY_TEXT
42 KPROBES_TEXT 43 KPROBES_TEXT
43 } =0 44 } =0
44 _etext = .; 45 _etext = .;
diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S
index 2d69a853b742..d936de4c07ca 100644
--- a/arch/openrisc/kernel/vmlinux.lds.S
+++ b/arch/openrisc/kernel/vmlinux.lds.S
@@ -50,6 +50,7 @@ SECTIONS
50 LOCK_TEXT 50 LOCK_TEXT
51 KPROBES_TEXT 51 KPROBES_TEXT
52 IRQENTRY_TEXT 52 IRQENTRY_TEXT
53 SOFTIRQENTRY_TEXT
53 *(.fixup) 54 *(.fixup)
54 *(.text.__*) 55 *(.text.__*)
55 _etext = .; 56 _etext = .;
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 308f29081d46..f3ead0b6ce46 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -72,6 +72,7 @@ SECTIONS
72 LOCK_TEXT 72 LOCK_TEXT
73 KPROBES_TEXT 73 KPROBES_TEXT
74 IRQENTRY_TEXT 74 IRQENTRY_TEXT
75 SOFTIRQENTRY_TEXT
75 *(.text.do_softirq) 76 *(.text.do_softirq)
76 *(.text.sys_exit) 77 *(.text.sys_exit)
77 *(.text.do_sigaltstack) 78 *(.text.do_sigaltstack)
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index d41fd0af8980..2dd91f79de05 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -55,6 +55,7 @@ SECTIONS
55 LOCK_TEXT 55 LOCK_TEXT
56 KPROBES_TEXT 56 KPROBES_TEXT
57 IRQENTRY_TEXT 57 IRQENTRY_TEXT
58 SOFTIRQENTRY_TEXT
58 59
59#ifdef CONFIG_PPC32 60#ifdef CONFIG_PPC32
60 *(.got1) 61 *(.got1)
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 445657fe658c..0f41a8286378 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -28,6 +28,7 @@ SECTIONS
28 LOCK_TEXT 28 LOCK_TEXT
29 KPROBES_TEXT 29 KPROBES_TEXT
30 IRQENTRY_TEXT 30 IRQENTRY_TEXT
31 SOFTIRQENTRY_TEXT
31 *(.fixup) 32 *(.fixup)
32 *(.gnu.warning) 33 *(.gnu.warning)
33 } :text = 0x0700 34 } :text = 0x0700
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index db88cbf9eafd..235a4101999f 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
39 LOCK_TEXT 39 LOCK_TEXT
40 KPROBES_TEXT 40 KPROBES_TEXT
41 IRQENTRY_TEXT 41 IRQENTRY_TEXT
42 SOFTIRQENTRY_TEXT
42 *(.fixup) 43 *(.fixup)
43 *(.gnu.warning) 44 *(.gnu.warning)
44 _etext = .; /* End of text section */ 45 _etext = .; /* End of text section */
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index f1a2f688b28a..aadd321aa05d 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -48,6 +48,7 @@ SECTIONS
48 LOCK_TEXT 48 LOCK_TEXT
49 KPROBES_TEXT 49 KPROBES_TEXT
50 IRQENTRY_TEXT 50 IRQENTRY_TEXT
51 SOFTIRQENTRY_TEXT
51 *(.gnu.warning) 52 *(.gnu.warning)
52 } = 0 53 } = 0
53 _etext = .; 54 _etext = .;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index 0e059a0101ea..378f5d8d1ec8 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -45,6 +45,7 @@ SECTIONS
45 LOCK_TEXT 45 LOCK_TEXT
46 KPROBES_TEXT 46 KPROBES_TEXT
47 IRQENTRY_TEXT 47 IRQENTRY_TEXT
48 SOFTIRQENTRY_TEXT
48 __fix_text_end = .; /* tile-cpack won't rearrange before this */ 49 __fix_text_end = .; /* tile-cpack won't rearrange before this */
49 ALIGN_FUNCTION(); 50 ALIGN_FUNCTION();
50 *(.hottext*) 51 *(.hottext*)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index adaae2c781c1..616ebd22ef9a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,6 +19,7 @@ endif
19KASAN_SANITIZE_head$(BITS).o := n 19KASAN_SANITIZE_head$(BITS).o := n
20KASAN_SANITIZE_dumpstack.o := n 20KASAN_SANITIZE_dumpstack.o := n
21KASAN_SANITIZE_dumpstack_$(BITS).o := n 21KASAN_SANITIZE_dumpstack_$(BITS).o := n
22KASAN_SANITIZE_stacktrace.o := n
22 23
23OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y 24OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
24OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y 25OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d239639e0c1d..4c941f88d405 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -101,6 +101,7 @@ SECTIONS
101 KPROBES_TEXT 101 KPROBES_TEXT
102 ENTRY_TEXT 102 ENTRY_TEXT
103 IRQENTRY_TEXT 103 IRQENTRY_TEXT
104 SOFTIRQENTRY_TEXT
104 *(.fixup) 105 *(.fixup)
105 *(.gnu.warning) 106 *(.gnu.warning)
106 /* End of text section */ 107 /* End of text section */
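Each of the linker-script hunks above adds SOFTIRQENTRY_TEXT next to IRQENTRY_TEXT so that softirq entry code is emitted into its own section; the series then lets KASAN recognize those section boundaries and filter hard/soft IRQ entry frames out of saved stack traces. A paraphrased sketch of how the pieces fit together, assuming the conventional definitions in include/linux/interrupt.h, kernel/softirq.c and include/asm-generic/vmlinux.lds.h rather than quoting the patch verbatim:

/* include/linux/interrupt.h (paraphrased): tag softirq entry points so
 * the compiler emits them into a dedicated .softirqentry.text section,
 * mirroring the existing __irq_entry annotation. */
#define __softirq_entry	__attribute__((__section__(".softirqentry.text")))

/* kernel/softirq.c: the main softirq handler picks up the annotation. */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
	/* ... handle pending softirqs ... */
}

/* include/asm-generic/vmlinux.lds.h (sketch): collect that section and
 * bracket it with start/end symbols so code can test whether an address
 * belongs to softirq entry code. */
#define SOFTIRQENTRY_TEXT					\
		ALIGN_FUNCTION();				\
		VMLINUX_SYMBOL(__softirqentry_text_start) = .;	\
		*(.softirqentry.text)				\
		VMLINUX_SYMBOL(__softirqentry_text_end) = .;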
diff --git a/drivers/input/input-compat.c b/drivers/input/input-compat.c
index 64ca7113ff28..d84d20b9cec0 100644
--- a/drivers/input/input-compat.c
+++ b/drivers/input/input-compat.c
@@ -17,7 +17,7 @@
17int input_event_from_user(const char __user *buffer, 17int input_event_from_user(const char __user *buffer,
18 struct input_event *event) 18 struct input_event *event)
19{ 19{
20 if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) { 20 if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
21 struct input_event_compat compat_event; 21 struct input_event_compat compat_event;
22 22
23 if (copy_from_user(&compat_event, buffer, 23 if (copy_from_user(&compat_event, buffer,
@@ -41,7 +41,7 @@ int input_event_from_user(const char __user *buffer,
41int input_event_to_user(char __user *buffer, 41int input_event_to_user(char __user *buffer,
42 const struct input_event *event) 42 const struct input_event *event)
43{ 43{
44 if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) { 44 if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
45 struct input_event_compat compat_event; 45 struct input_event_compat compat_event;
46 46
47 compat_event.time.tv_sec = event->time.tv_sec; 47 compat_event.time.tv_sec = event->time.tv_sec;
@@ -65,7 +65,7 @@ int input_event_to_user(char __user *buffer,
65int input_ff_effect_from_user(const char __user *buffer, size_t size, 65int input_ff_effect_from_user(const char __user *buffer, size_t size,
66 struct ff_effect *effect) 66 struct ff_effect *effect)
67{ 67{
68 if (INPUT_COMPAT_TEST) { 68 if (in_compat_syscall()) {
69 struct ff_effect_compat *compat_effect; 69 struct ff_effect_compat *compat_effect;
70 70
71 if (size != sizeof(struct ff_effect_compat)) 71 if (size != sizeof(struct ff_effect_compat))
diff --git a/drivers/input/input-compat.h b/drivers/input/input-compat.h
index 0f25878d5fa2..1563160a7af3 100644
--- a/drivers/input/input-compat.h
+++ b/drivers/input/input-compat.h
@@ -17,8 +17,6 @@
17 17
18#ifdef CONFIG_COMPAT 18#ifdef CONFIG_COMPAT
19 19
20#define INPUT_COMPAT_TEST in_compat_syscall()
21
22struct input_event_compat { 20struct input_event_compat {
23 struct compat_timeval time; 21 struct compat_timeval time;
24 __u16 type; 22 __u16 type;
@@ -57,7 +55,7 @@ struct ff_effect_compat {
57 55
58static inline size_t input_event_size(void) 56static inline size_t input_event_size(void)
59{ 57{
60 return (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) ? 58 return (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) ?
61 sizeof(struct input_event_compat) : sizeof(struct input_event); 59 sizeof(struct input_event_compat) : sizeof(struct input_event);
62} 60}
63 61
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 880605959aa6..b87ffbd4547d 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1015,7 +1015,7 @@ static int input_bits_to_string(char *buf, int buf_size,
1015{ 1015{
1016 int len = 0; 1016 int len = 0;
1017 1017
1018 if (INPUT_COMPAT_TEST) { 1018 if (in_compat_syscall()) {
1019 u32 dword = bits >> 32; 1019 u32 dword = bits >> 32;
1020 if (dword || !skip_empty) 1020 if (dword || !skip_empty)
1021 len += snprintf(buf, buf_size, "%x ", dword); 1021 len += snprintf(buf, buf_size, "%x ", dword);
diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index 4eb9e4d94f46..abe1a927b332 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -664,7 +664,7 @@ struct uinput_ff_upload_compat {
664static int uinput_ff_upload_to_user(char __user *buffer, 664static int uinput_ff_upload_to_user(char __user *buffer,
665 const struct uinput_ff_upload *ff_up) 665 const struct uinput_ff_upload *ff_up)
666{ 666{
667 if (INPUT_COMPAT_TEST) { 667 if (in_compat_syscall()) {
668 struct uinput_ff_upload_compat ff_up_compat; 668 struct uinput_ff_upload_compat ff_up_compat;
669 669
670 ff_up_compat.request_id = ff_up->request_id; 670 ff_up_compat.request_id = ff_up->request_id;
@@ -695,7 +695,7 @@ static int uinput_ff_upload_to_user(char __user *buffer,
695static int uinput_ff_upload_from_user(const char __user *buffer, 695static int uinput_ff_upload_from_user(const char __user *buffer,
696 struct uinput_ff_upload *ff_up) 696 struct uinput_ff_upload *ff_up)
697{ 697{
698 if (INPUT_COMPAT_TEST) { 698 if (in_compat_syscall()) {
699 struct uinput_ff_upload_compat ff_up_compat; 699 struct uinput_ff_upload_compat ff_up_compat;
700 700
701 if (copy_from_user(&ff_up_compat, buffer, 701 if (copy_from_user(&ff_up_compat, buffer,
diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c
index ef09ba0289d7..d5cfb503b9d6 100644
--- a/drivers/memstick/host/r592.c
+++ b/drivers/memstick/host/r592.c
@@ -298,8 +298,7 @@ static int r592_transfer_fifo_dma(struct r592_device *dev)
298 sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ? 298 sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ?
299 PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); 299 PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE);
300 300
301 if (sg_count != 1 || 301 if (sg_count != 1 || sg_dma_len(&dev->req->sg) < R592_LFIFO_SIZE) {
302 (sg_dma_len(&dev->req->sg) < dev->req->sg.length)) {
303 message("problem in dma_map_sg"); 302 message("problem in dma_map_sg");
304 return -EIO; 303 return -EIO;
305 } 304 }
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d002579c6f2b..70907d638b60 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2516 struct ocfs2_extent_block *eb; 2516 struct ocfs2_extent_block *eb;
2517 u32 range; 2517 u32 range;
2518 2518
2519 /*
2520 * In normal tree rotation process, we will never touch the
2521 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2522 * doesn't reserve the credits for them either.
2523 *
2524 * But we do have a special case here which will update the rightmost
2525 * records for all the bh in the path.
2526 * So we have to allocate extra credits and access them.
2527 */
2528 ret = ocfs2_extend_trans(handle, subtree_index);
2529 if (ret) {
2530 mlog_errno(ret);
2531 goto out;
2532 }
2533
2534 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 2519 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2535 if (ret) { 2520 if (ret) {
2536 mlog_errno(ret); 2521 mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
2956 right_path->p_node[subtree_root].bh->b_blocknr, 2941 right_path->p_node[subtree_root].bh->b_blocknr,
2957 right_path->p_tree_depth); 2942 right_path->p_tree_depth);
2958 2943
2959 ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 2944 ret = ocfs2_extend_rotate_transaction(handle, 0,
2960 orig_credits, left_path); 2945 orig_credits, left_path);
2961 if (ret) { 2946 if (ret) {
2962 mlog_errno(ret); 2947 mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
3029 struct ocfs2_extent_block *eb; 3014 struct ocfs2_extent_block *eb;
3030 struct ocfs2_extent_list *el; 3015 struct ocfs2_extent_list *el;
3031 3016
3032
3033 ret = ocfs2_et_sanity_check(et); 3017 ret = ocfs2_et_sanity_check(et);
3034 if (ret) 3018 if (ret)
3035 goto out; 3019 goto out;
3036 /*
3037 * There's two ways we handle this depending on
3038 * whether path is the only existing one.
3039 */
3040 ret = ocfs2_extend_rotate_transaction(handle, 0,
3041 handle->h_buffer_credits,
3042 path);
3043 if (ret) {
3044 mlog_errno(ret);
3045 goto out;
3046 }
3047 3020
3048 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 3021 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3049 if (ret) { 3022 if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3641 */ 3614 */
3642 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3615 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3643 le16_to_cpu(el->l_next_free_rec) == 1) { 3616 le16_to_cpu(el->l_next_free_rec) == 1) {
3617 /* extend credit for ocfs2_remove_rightmost_path */
3618 ret = ocfs2_extend_rotate_transaction(handle, 0,
3619 handle->h_buffer_credits,
3620 right_path);
3621 if (ret) {
3622 mlog_errno(ret);
3623 goto out;
3624 }
3644 3625
3645 ret = ocfs2_remove_rightmost_path(handle, et, 3626 ret = ocfs2_remove_rightmost_path(handle, et,
3646 right_path, 3627 right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3679 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3660 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3680 3661
3681 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3662 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3663 /* extend credit for ocfs2_remove_rightmost_path */
3664 ret = ocfs2_extend_rotate_transaction(handle, 0,
3665 handle->h_buffer_credits,
3666 path);
3667 if (ret) {
3668 mlog_errno(ret);
3669 goto out;
3670 }
3682 /* 3671 /*
3683 * The merge code will need to create an empty 3672 * The merge code will need to create an empty
3684 * extent to take the place of the newly 3673 * extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3727 */ 3716 */
3728 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3717 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3729 3718
3719 /* extend credit for ocfs2_remove_rightmost_path */
3720 ret = ocfs2_extend_rotate_transaction(handle, 0,
3721 handle->h_buffer_credits,
3722 path);
3723 if (ret) {
3724 mlog_errno(ret);
3725 goto out;
3726 }
3727
3730 /* The merge left us with an empty extent, remove it. */ 3728 /* The merge left us with an empty extent, remove it. */
3731 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3729 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3732 if (ret) { 3730 if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3748 goto out; 3746 goto out;
3749 } 3747 }
3750 3748
3749 /* extend credit for ocfs2_remove_rightmost_path */
3750 ret = ocfs2_extend_rotate_transaction(handle, 0,
3751 handle->h_buffer_credits,
3752 path);
3753 if (ret) {
3754 mlog_errno(ret);
3755 goto out;
3756 }
3757
3751 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3758 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3752 /* 3759 /*
3753 * Error from this last rotate is not critical, so 3760 * Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3783 } 3790 }
3784 3791
3785 if (ctxt->c_split_covers_rec) { 3792 if (ctxt->c_split_covers_rec) {
3793 /* extend credit for ocfs2_remove_rightmost_path */
3794 ret = ocfs2_extend_rotate_transaction(handle, 0,
3795 handle->h_buffer_credits,
3796 path);
3797 if (ret) {
3798 mlog_errno(ret);
3799 ret = 0;
3800 goto out;
3801 }
3802
3786 /* 3803 /*
3787 * The merge may have left an empty extent in 3804 * The merge may have left an empty extent in
3788 * our leaf. Try to rotate it away. 3805 * our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
5342 struct ocfs2_extent_block *eb; 5359 struct ocfs2_extent_block *eb;
5343 5360
5344 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5361 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5362 /* extend credit for ocfs2_remove_rightmost_path */
5363 ret = ocfs2_extend_rotate_transaction(handle, 0,
5364 handle->h_buffer_credits,
5365 path);
5366 if (ret) {
5367 mlog_errno(ret);
5368 goto out;
5369 }
5370
5345 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 5371 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5346 if (ret) { 5372 if (ret) {
5347 mlog_errno(ret); 5373 mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5928 5954
5929 ocfs2_journal_dirty(handle, tl_bh); 5955 ocfs2_journal_dirty(handle, tl_bh);
5930 5956
5931 /* TODO: Perhaps we can calculate the bulk of the
5932 * credits up front rather than extending like
5933 * this. */
5934 status = ocfs2_extend_trans(handle,
5935 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5936 if (status < 0) {
5937 mlog_errno(status);
5938 goto bail;
5939 }
5940
5941 rec = tl->tl_recs[i]; 5957 rec = tl->tl_recs[i];
5942 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5958 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5943 le32_to_cpu(rec.t_start)); 5959 le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5958 goto bail; 5974 goto bail;
5959 } 5975 }
5960 } 5976 }
5977
5978 status = ocfs2_extend_trans(handle,
5979 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5980 if (status < 0) {
5981 mlog_errno(status);
5982 goto bail;
5983 }
5961 i--; 5984 i--;
5962 } 5985 }
5963 5986
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6016 goto out_mutex; 6039 goto out_mutex;
6017 } 6040 }
6018 6041
6019 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 6042 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
6020 if (IS_ERR(handle)) { 6043 if (IS_ERR(handle)) {
6021 status = PTR_ERR(handle); 6044 status = PTR_ERR(handle);
6022 mlog_errno(status); 6045 mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6079 if (cancel) 6102 if (cancel)
6080 cancel_delayed_work(&osb->osb_truncate_log_wq); 6103 cancel_delayed_work(&osb->osb_truncate_log_wq);
6081 6104
6082 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 6105 queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
6083 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 6106 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6084 } 6107 }
6085} 6108}
@@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6253 6276
6254 if (tl_inode) { 6277 if (tl_inode) {
6255 cancel_delayed_work(&osb->osb_truncate_log_wq); 6278 cancel_delayed_work(&osb->osb_truncate_log_wq);
6256 flush_workqueue(ocfs2_wq); 6279 flush_workqueue(osb->ocfs2_wq);
6257 6280
6258 status = ocfs2_flush_truncate_log(osb); 6281 status = ocfs2_flush_truncate_log(osb);
6259 if (status < 0) 6282 if (status < 0)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 043110e5212d..1581240a7ca0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,158 +499,6 @@ bail:
499 return status; 499 return status;
500} 500}
501 501
502/*
503 * TODO: Make this into a generic get_blocks function.
504 *
505 * From do_direct_io in direct-io.c:
506 * "So what we do is to permit the ->get_blocks function to populate
507 * bh.b_size with the size of IO which is permitted at this offset and
508 * this i_blkbits."
509 *
510 * This function is called directly from get_more_blocks in direct-io.c.
511 *
512 * called like this: dio->get_blocks(dio->inode, fs_startblk,
513 * fs_count, map_bh, dio->rw == WRITE);
514 */
515static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
516 struct buffer_head *bh_result, int create)
517{
518 int ret;
519 u32 cpos = 0;
520 int alloc_locked = 0;
521 u64 p_blkno, inode_blocks, contig_blocks;
522 unsigned int ext_flags;
523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
525 unsigned long len = bh_result->b_size;
526 unsigned int clusters_to_alloc = 0, contig_clusters = 0;
527
528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
529
530 /* This function won't even be called if the request isn't all
531 * nicely aligned and of the right size, so there's no need
532 * for us to check any of that. */
533
534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
535
536 down_read(&OCFS2_I(inode)->ip_alloc_sem);
537
538 /* This figures out the size of the next contiguous block, and
539 * our logical offset */
540 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
541 &contig_blocks, &ext_flags);
542 up_read(&OCFS2_I(inode)->ip_alloc_sem);
543
544 if (ret) {
545 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
546 (unsigned long long)iblock);
547 ret = -EIO;
548 goto bail;
549 }
550
551 /* We should already CoW the refcounted extent in case of create. */
552 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
553
554 /* allocate blocks if no p_blkno is found, and create == 1 */
555 if (!p_blkno && create) {
556 ret = ocfs2_inode_lock(inode, NULL, 1);
557 if (ret < 0) {
558 mlog_errno(ret);
559 goto bail;
560 }
561
562 alloc_locked = 1;
563
564 down_write(&OCFS2_I(inode)->ip_alloc_sem);
565
566 /* fill hole, allocate blocks can't be larger than the size
567 * of the hole */
568 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
569 contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
570 contig_blocks);
571 if (clusters_to_alloc > contig_clusters)
572 clusters_to_alloc = contig_clusters;
573
574 /* allocate extent and insert them into the extent tree */
575 ret = ocfs2_extend_allocation(inode, cpos,
576 clusters_to_alloc, 0);
577 if (ret < 0) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 mlog_errno(ret);
580 goto bail;
581 }
582
583 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
584 &contig_blocks, &ext_flags);
585 if (ret < 0) {
586 up_write(&OCFS2_I(inode)->ip_alloc_sem);
587 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
588 (unsigned long long)iblock);
589 ret = -EIO;
590 goto bail;
591 }
592 set_buffer_new(bh_result);
593 up_write(&OCFS2_I(inode)->ip_alloc_sem);
594 }
595
596 /*
597 * get_more_blocks() expects us to describe a hole by clearing
598 * the mapped bit on bh_result().
599 *
600 * Consider an unwritten extent as a hole.
601 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno);
604 else
605 clear_buffer_mapped(bh_result);
606
607 /* make sure we don't map more than max_blocks blocks here as
608 that's all the kernel will handle at this point. */
609 if (max_blocks < contig_blocks)
610 contig_blocks = max_blocks;
611 bh_result->b_size = contig_blocks << blocksize_bits;
612bail:
613 if (alloc_locked)
614 ocfs2_inode_unlock(inode, 1);
615 return ret;
616}
617
618/*
619 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
620 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
621 * to protect io on one node from truncation on another.
622 */
623static int ocfs2_dio_end_io(struct kiocb *iocb,
624 loff_t offset,
625 ssize_t bytes,
626 void *private)
627{
628 struct inode *inode = file_inode(iocb->ki_filp);
629 int level;
630
631 if (bytes <= 0)
632 return 0;
633
634 /* this io's submitter should not have unlocked this before we could */
635 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
636
637 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
638 ocfs2_iocb_clear_unaligned_aio(iocb);
639
640 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
641 }
642
643 /* Let rw unlock to be done later to protect append direct io write */
644 if (offset + bytes <= i_size_read(inode)) {
645 ocfs2_iocb_clear_rw_locked(iocb);
646
647 level = ocfs2_iocb_rw_locked_level(iocb);
648 ocfs2_rw_unlock(inode, level);
649 }
650
651 return 0;
652}
653
654static int ocfs2_releasepage(struct page *page, gfp_t wait) 502static int ocfs2_releasepage(struct page *page, gfp_t wait)
655{ 503{
656 if (!page_has_buffers(page)) 504 if (!page_has_buffers(page))
@@ -658,363 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
658 return try_to_free_buffers(page); 506 return try_to_free_buffers(page);
659} 507}
660 508
661static int ocfs2_is_overwrite(struct ocfs2_super *osb,
662 struct inode *inode, loff_t offset)
663{
664 int ret = 0;
665 u32 v_cpos = 0;
666 u32 p_cpos = 0;
667 unsigned int num_clusters = 0;
668 unsigned int ext_flags = 0;
669
670 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
671 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
672 &num_clusters, &ext_flags);
673 if (ret < 0) {
674 mlog_errno(ret);
675 return ret;
676 }
677
678 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
679 return 1;
680
681 return 0;
682}
683
684static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
685 struct inode *inode, loff_t offset,
686 u64 zero_len, int cluster_align)
687{
688 u32 p_cpos = 0;
689 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
690 unsigned int num_clusters = 0;
691 unsigned int ext_flags = 0;
692 int ret = 0;
693
694 if (offset <= i_size_read(inode) || cluster_align)
695 return 0;
696
697 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
698 &ext_flags);
699 if (ret < 0) {
700 mlog_errno(ret);
701 return ret;
702 }
703
704 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
705 u64 s = i_size_read(inode);
706 sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
707 (do_div(s, osb->s_clustersize) >> 9);
708
709 ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
710 zero_len >> 9, GFP_NOFS, false);
711 if (ret < 0)
712 mlog_errno(ret);
713 }
714
715 return ret;
716}
717
718static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
719 struct inode *inode, loff_t offset)
720{
721 u64 zero_start, zero_len, total_zero_len;
722 u32 p_cpos = 0, clusters_to_add;
723 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
724 unsigned int num_clusters = 0;
725 unsigned int ext_flags = 0;
726 u32 size_div, offset_div;
727 int ret = 0;
728
729 {
730 u64 o = offset;
731 u64 s = i_size_read(inode);
732
733 offset_div = do_div(o, osb->s_clustersize);
734 size_div = do_div(s, osb->s_clustersize);
735 }
736
737 if (offset <= i_size_read(inode))
738 return 0;
739
740 clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
741 ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
742 total_zero_len = offset - i_size_read(inode);
743 if (clusters_to_add)
744 total_zero_len -= offset_div;
745
746 /* Allocate clusters to fill out holes, and this is only needed
747 * when we add more than one clusters. Otherwise the cluster will
748 * be allocated during direct IO */
749 if (clusters_to_add > 1) {
750 ret = ocfs2_extend_allocation(inode,
751 OCFS2_I(inode)->ip_clusters,
752 clusters_to_add - 1, 0);
753 if (ret) {
754 mlog_errno(ret);
755 goto out;
756 }
757 }
758
759 while (total_zero_len) {
760 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
761 &ext_flags);
762 if (ret < 0) {
763 mlog_errno(ret);
764 goto out;
765 }
766
767 zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
768 size_div;
769 zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
770 size_div;
771 zero_len = min(total_zero_len, zero_len);
772
773 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
774 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
775 zero_start >> 9, zero_len >> 9,
776 GFP_NOFS, false);
777 if (ret < 0) {
778 mlog_errno(ret);
779 goto out;
780 }
781 }
782
783 total_zero_len -= zero_len;
784 v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
785
786 /* Only at first iteration can be cluster not aligned.
787 * So set size_div to 0 for the rest */
788 size_div = 0;
789 }
790
791out:
792 return ret;
793}
794
795static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
796 struct iov_iter *iter,
797 loff_t offset)
798{
799 ssize_t ret = 0;
800 ssize_t written = 0;
801 bool orphaned = false;
802 int is_overwrite = 0;
803 struct file *file = iocb->ki_filp;
804 struct inode *inode = file_inode(file)->i_mapping->host;
805 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
806 struct buffer_head *di_bh = NULL;
807 size_t count = iter->count;
808 journal_t *journal = osb->journal->j_journal;
809 u64 zero_len_head, zero_len_tail;
810 int cluster_align_head, cluster_align_tail;
811 loff_t final_size = offset + count;
812 int append_write = offset >= i_size_read(inode) ? 1 : 0;
813 unsigned int num_clusters = 0;
814 unsigned int ext_flags = 0;
815
816 {
817 u64 o = offset;
818 u64 s = i_size_read(inode);
819
820 zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
821 cluster_align_head = !zero_len_head;
822
823 zero_len_tail = osb->s_clustersize -
824 do_div(s, osb->s_clustersize);
825 if ((offset - i_size_read(inode)) < zero_len_tail)
826 zero_len_tail = offset - i_size_read(inode);
827 cluster_align_tail = !zero_len_tail;
828 }
829
830 /*
831 * when final_size > inode->i_size, inode->i_size will be
832 * updated after direct write, so add the inode to orphan
833 * dir first.
834 */
835 if (final_size > i_size_read(inode)) {
836 ret = ocfs2_add_inode_to_orphan(osb, inode);
837 if (ret < 0) {
838 mlog_errno(ret);
839 goto out;
840 }
841 orphaned = true;
842 }
843
844 if (append_write) {
845 ret = ocfs2_inode_lock(inode, NULL, 1);
846 if (ret < 0) {
847 mlog_errno(ret);
848 goto clean_orphan;
849 }
850
851 /* zeroing out the previously allocated cluster tail
852 * that but not zeroed */
853 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
854 down_read(&OCFS2_I(inode)->ip_alloc_sem);
855 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
856 zero_len_tail, cluster_align_tail);
857 up_read(&OCFS2_I(inode)->ip_alloc_sem);
858 } else {
859 down_write(&OCFS2_I(inode)->ip_alloc_sem);
860 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
861 offset);
862 up_write(&OCFS2_I(inode)->ip_alloc_sem);
863 }
864 if (ret < 0) {
865 mlog_errno(ret);
866 ocfs2_inode_unlock(inode, 1);
867 goto clean_orphan;
868 }
869
870 is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
871 if (is_overwrite < 0) {
872 mlog_errno(is_overwrite);
873 ret = is_overwrite;
874 ocfs2_inode_unlock(inode, 1);
875 goto clean_orphan;
876 }
877
878 ocfs2_inode_unlock(inode, 1);
879 }
880
881 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
882 offset, ocfs2_direct_IO_get_blocks,
883 ocfs2_dio_end_io, NULL, 0);
884 /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
885 if ((written < 0) && (written != -EIOCBQUEUED)) {
886 loff_t i_size = i_size_read(inode);
887
888 if (offset + count > i_size) {
889 ret = ocfs2_inode_lock(inode, &di_bh, 1);
890 if (ret < 0) {
891 mlog_errno(ret);
892 goto clean_orphan;
893 }
894
895 if (i_size == i_size_read(inode)) {
896 ret = ocfs2_truncate_file(inode, di_bh,
897 i_size);
898 if (ret < 0) {
899 if (ret != -ENOSPC)
900 mlog_errno(ret);
901
902 ocfs2_inode_unlock(inode, 1);
903 brelse(di_bh);
904 di_bh = NULL;
905 goto clean_orphan;
906 }
907 }
908
909 ocfs2_inode_unlock(inode, 1);
910 brelse(di_bh);
911 di_bh = NULL;
912
913 ret = jbd2_journal_force_commit(journal);
914 if (ret < 0)
915 mlog_errno(ret);
916 }
917 } else if (written > 0 && append_write && !is_overwrite &&
918 !cluster_align_head) {
919 /* zeroing out the allocated cluster head */
920 u32 p_cpos = 0;
921 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
922
923 ret = ocfs2_inode_lock(inode, NULL, 0);
924 if (ret < 0) {
925 mlog_errno(ret);
926 goto clean_orphan;
927 }
928
929 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
930 &num_clusters, &ext_flags);
931 if (ret < 0) {
932 mlog_errno(ret);
933 ocfs2_inode_unlock(inode, 0);
934 goto clean_orphan;
935 }
936
937 BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
938
939 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
940 (u64)p_cpos << (osb->s_clustersize_bits - 9),
941 zero_len_head >> 9, GFP_NOFS, false);
942 if (ret < 0)
943 mlog_errno(ret);
944
945 ocfs2_inode_unlock(inode, 0);
946 }
947
948clean_orphan:
949 if (orphaned) {
950 int tmp_ret;
951 int update_isize = written > 0 ? 1 : 0;
952 loff_t end = update_isize ? offset + written : 0;
953
954 tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
955 if (tmp_ret < 0) {
956 ret = tmp_ret;
957 mlog_errno(ret);
958 goto out;
959 }
960
961 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
962 update_isize, end);
963 if (tmp_ret < 0) {
964 ocfs2_inode_unlock(inode, 1);
965 ret = tmp_ret;
966 mlog_errno(ret);
967 brelse(di_bh);
968 goto out;
969 }
970
971 ocfs2_inode_unlock(inode, 1);
972 brelse(di_bh);
973
974 tmp_ret = jbd2_journal_force_commit(journal);
975 if (tmp_ret < 0) {
976 ret = tmp_ret;
977 mlog_errno(tmp_ret);
978 }
979 }
980
981out:
982 if (ret >= 0)
983 ret = written;
984 return ret;
985}
986
987static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
988 loff_t offset)
989{
990 struct file *file = iocb->ki_filp;
991 struct inode *inode = file_inode(file)->i_mapping->host;
992 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
993 int full_coherency = !(osb->s_mount_opt &
994 OCFS2_MOUNT_COHERENCY_BUFFERED);
995
996 /*
997 * Fallback to buffered I/O if we see an inode without
998 * extents.
999 */
1000 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1001 return 0;
1002
1003 /* Fallback to buffered I/O if we are appending and
1004 * concurrent O_DIRECT writes are allowed.
1005 */
1006 if (i_size_read(inode) <= offset && !full_coherency)
1007 return 0;
1008
1009 if (iov_iter_rw(iter) == READ)
1010 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
1011 iter, offset,
1012 ocfs2_direct_IO_get_blocks,
1013 ocfs2_dio_end_io, NULL, 0);
1014 else
1015 return ocfs2_direct_IO_write(iocb, iter, offset);
1016}
1017
1018static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 509static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
1019 u32 cpos, 510 u32 cpos,
1020 unsigned int *start, 511 unsigned int *start,
@@ -1201,6 +692,13 @@ next_bh:
1201 692
1202#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) 693#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
1203 694
695struct ocfs2_unwritten_extent {
696 struct list_head ue_node;
697 struct list_head ue_ip_node;
698 u32 ue_cpos;
699 u32 ue_phys;
700};
701
1204/* 702/*
1205 * Describe the state of a single cluster to be written to. 703 * Describe the state of a single cluster to be written to.
1206 */ 704 */
@@ -1212,7 +710,7 @@ struct ocfs2_write_cluster_desc {
1212 * filled. 710 * filled.
1213 */ 711 */
1214 unsigned c_new; 712 unsigned c_new;
1215 unsigned c_unwritten; 713 unsigned c_clear_unwritten;
1216 unsigned c_needs_zero; 714 unsigned c_needs_zero;
1217}; 715};
1218 716
@@ -1224,6 +722,9 @@ struct ocfs2_write_ctxt {
1224 /* First cluster allocated in a nonsparse extend */ 722 /* First cluster allocated in a nonsparse extend */
1225 u32 w_first_new_cpos; 723 u32 w_first_new_cpos;
1226 724
725 /* Type of caller. Must be one of buffer, mmap, direct. */
726 ocfs2_write_type_t w_type;
727
1227 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; 728 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
1228 729
1229 /* 730 /*
@@ -1272,6 +773,8 @@ struct ocfs2_write_ctxt {
1272 struct buffer_head *w_di_bh; 773 struct buffer_head *w_di_bh;
1273 774
1274 struct ocfs2_cached_dealloc_ctxt w_dealloc; 775 struct ocfs2_cached_dealloc_ctxt w_dealloc;
776
777 struct list_head w_unwritten_list;
1275}; 778};
1276 779
1277void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) 780void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1310,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
1310 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 813 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
1311} 814}
1312 815
1313static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 816static void ocfs2_free_unwritten_list(struct inode *inode,
817 struct list_head *head)
1314{ 818{
819 struct ocfs2_inode_info *oi = OCFS2_I(inode);
820 struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
821
822 list_for_each_entry_safe(ue, tmp, head, ue_node) {
823 list_del(&ue->ue_node);
824 spin_lock(&oi->ip_lock);
825 list_del(&ue->ue_ip_node);
826 spin_unlock(&oi->ip_lock);
827 kfree(ue);
828 }
829}
830
831static void ocfs2_free_write_ctxt(struct inode *inode,
832 struct ocfs2_write_ctxt *wc)
833{
834 ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
1315 ocfs2_unlock_pages(wc); 835 ocfs2_unlock_pages(wc);
1316 brelse(wc->w_di_bh); 836 brelse(wc->w_di_bh);
1317 kfree(wc); 837 kfree(wc);
@@ -1319,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
1319 839
1320static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, 840static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1321 struct ocfs2_super *osb, loff_t pos, 841 struct ocfs2_super *osb, loff_t pos,
1322 unsigned len, struct buffer_head *di_bh) 842 unsigned len, ocfs2_write_type_t type,
843 struct buffer_head *di_bh)
1323{ 844{
1324 u32 cend; 845 u32 cend;
1325 struct ocfs2_write_ctxt *wc; 846 struct ocfs2_write_ctxt *wc;
@@ -1334,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1334 wc->w_clen = cend - wc->w_cpos + 1; 855 wc->w_clen = cend - wc->w_cpos + 1;
1335 get_bh(di_bh); 856 get_bh(di_bh);
1336 wc->w_di_bh = di_bh; 857 wc->w_di_bh = di_bh;
858 wc->w_type = type;
1337 859
1338 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 860 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1339 wc->w_large_pages = 1; 861 wc->w_large_pages = 1;
@@ -1341,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1341 wc->w_large_pages = 0; 863 wc->w_large_pages = 0;
1342 864
1343 ocfs2_init_dealloc_ctxt(&wc->w_dealloc); 865 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
866 INIT_LIST_HEAD(&wc->w_unwritten_list);
1344 867
1345 *wcp = wc; 868 *wcp = wc;
1346 869
@@ -1401,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
1401 to = user_pos + user_len; 924 to = user_pos + user_len;
1402 struct page *tmppage; 925 struct page *tmppage;
1403 926
1404 ocfs2_zero_new_buffers(wc->w_target_page, from, to); 927 if (wc->w_target_page)
928 ocfs2_zero_new_buffers(wc->w_target_page, from, to);
1405 929
1406 for(i = 0; i < wc->w_num_pages; i++) { 930 for(i = 0; i < wc->w_num_pages; i++) {
1407 tmppage = wc->w_pages[i]; 931 tmppage = wc->w_pages[i];
1408 932
1409 if (page_has_buffers(tmppage)) { 933 if (tmppage && page_has_buffers(tmppage)) {
1410 if (ocfs2_should_order_data(inode)) 934 if (ocfs2_should_order_data(inode))
1411 ocfs2_jbd2_file_inode(wc->w_handle, inode); 935 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1412 936
@@ -1536,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1536 wc->w_num_pages = 1; 1060 wc->w_num_pages = 1;
1537 start = target_index; 1061 start = target_index;
1538 } 1062 }
1063 end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
1539 1064
1540 for(i = 0; i < wc->w_num_pages; i++) { 1065 for(i = 0; i < wc->w_num_pages; i++) {
1541 index = start + i; 1066 index = start + i;
1542 1067
1543 if (index == target_index && mmap_page) { 1068 if (index >= target_index && index <= end_index &&
1069 wc->w_type == OCFS2_WRITE_MMAP) {
1544 /* 1070 /*
1545 * ocfs2_pagemkwrite() is a little different 1071 * ocfs2_pagemkwrite() is a little different
1546 * and wants us to directly use the page 1072 * and wants us to directly use the page
@@ -1559,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1559 page_cache_get(mmap_page); 1085 page_cache_get(mmap_page);
1560 wc->w_pages[i] = mmap_page; 1086 wc->w_pages[i] = mmap_page;
1561 wc->w_target_locked = true; 1087 wc->w_target_locked = true;
1088 } else if (index >= target_index && index <= end_index &&
1089 wc->w_type == OCFS2_WRITE_DIRECT) {
1090 /* Direct write has no mapping page. */
1091 wc->w_pages[i] = NULL;
1092 continue;
1562 } else { 1093 } else {
1563 wc->w_pages[i] = find_or_create_page(mapping, index, 1094 wc->w_pages[i] = find_or_create_page(mapping, index,
1564 GFP_NOFS); 1095 GFP_NOFS);
@@ -1583,19 +1114,20 @@ out:
1583 * Prepare a single cluster for write one cluster into the file. 1114 * Prepare a single cluster for write one cluster into the file.
1584 */ 1115 */
1585static int ocfs2_write_cluster(struct address_space *mapping, 1116static int ocfs2_write_cluster(struct address_space *mapping,
1586 u32 phys, unsigned int unwritten, 1117 u32 *phys, unsigned int new,
1118 unsigned int clear_unwritten,
1587 unsigned int should_zero, 1119 unsigned int should_zero,
1588 struct ocfs2_alloc_context *data_ac, 1120 struct ocfs2_alloc_context *data_ac,
1589 struct ocfs2_alloc_context *meta_ac, 1121 struct ocfs2_alloc_context *meta_ac,
1590 struct ocfs2_write_ctxt *wc, u32 cpos, 1122 struct ocfs2_write_ctxt *wc, u32 cpos,
1591 loff_t user_pos, unsigned user_len) 1123 loff_t user_pos, unsigned user_len)
1592{ 1124{
1593 int ret, i, new; 1125 int ret, i;
1594 u64 v_blkno, p_blkno; 1126 u64 p_blkno;
1595 struct inode *inode = mapping->host; 1127 struct inode *inode = mapping->host;
1596 struct ocfs2_extent_tree et; 1128 struct ocfs2_extent_tree et;
1129 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
1597 1130
1598 new = phys == 0 ? 1 : 0;
1599 if (new) { 1131 if (new) {
1600 u32 tmp_pos; 1132 u32 tmp_pos;
1601 1133
@@ -1605,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1605 */ 1137 */
1606 tmp_pos = cpos; 1138 tmp_pos = cpos;
1607 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, 1139 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1608 &tmp_pos, 1, 0, wc->w_di_bh, 1140 &tmp_pos, 1, !clear_unwritten,
1609 wc->w_handle, data_ac, 1141 wc->w_di_bh, wc->w_handle,
1610 meta_ac, NULL); 1142 data_ac, meta_ac, NULL);
1611 /* 1143 /*
1612 * This shouldn't happen because we must have already 1144 * This shouldn't happen because we must have already
1613 * calculated the correct meta data allocation required. The 1145 * calculated the correct meta data allocation required. The
@@ -1624,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1624 mlog_errno(ret); 1156 mlog_errno(ret);
1625 goto out; 1157 goto out;
1626 } 1158 }
1627 } else if (unwritten) { 1159 } else if (clear_unwritten) {
1628 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), 1160 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1629 wc->w_di_bh); 1161 wc->w_di_bh);
1630 ret = ocfs2_mark_extent_written(inode, &et, 1162 ret = ocfs2_mark_extent_written(inode, &et,
1631 wc->w_handle, cpos, 1, phys, 1163 wc->w_handle, cpos, 1, *phys,
1632 meta_ac, &wc->w_dealloc); 1164 meta_ac, &wc->w_dealloc);
1633 if (ret < 0) { 1165 if (ret < 0) {
1634 mlog_errno(ret); 1166 mlog_errno(ret);
@@ -1636,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1636 } 1168 }
1637 } 1169 }
1638 1170
1639 if (should_zero)
1640 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1641 else
1642 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1643
1644 /* 1171 /*
1645 * The only reason this should fail is due to an inability to 1172 * The only reason this should fail is due to an inability to
1646 * find the extent added. 1173 * find the extent added.
1647 */ 1174 */
1648 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1175 ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
1649 NULL);
1650 if (ret < 0) { 1176 if (ret < 0) {
1651 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " 1177 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
1652 "at logical block %llu", 1178 "at logical cluster %u",
1653 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1179 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
1654 (unsigned long long)v_blkno);
1655 goto out; 1180 goto out;
1656 } 1181 }
1657 1182
1658 BUG_ON(p_blkno == 0); 1183 BUG_ON(*phys == 0);
1184
1185 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
1186 if (!should_zero)
1187 p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
1659 1188
1660 for(i = 0; i < wc->w_num_pages; i++) { 1189 for(i = 0; i < wc->w_num_pages; i++) {
1661 int tmpret; 1190 int tmpret;
1662 1191
1192 /* This is the direct io target page. */
1193 if (wc->w_pages[i] == NULL) {
1194 p_blkno++;
1195 continue;
1196 }
1197
1663 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, 1198 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1664 wc->w_pages[i], cpos, 1199 wc->w_pages[i], cpos,
1665 user_pos, user_len, 1200 user_pos, user_len,
@@ -1706,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1706 if ((cluster_off + local_len) > osb->s_clustersize) 1241 if ((cluster_off + local_len) > osb->s_clustersize)
1707 local_len = osb->s_clustersize - cluster_off; 1242 local_len = osb->s_clustersize - cluster_off;
1708 1243
1709 ret = ocfs2_write_cluster(mapping, desc->c_phys, 1244 ret = ocfs2_write_cluster(mapping, &desc->c_phys,
1710 desc->c_unwritten, 1245 desc->c_new,
1246 desc->c_clear_unwritten,
1711 desc->c_needs_zero, 1247 desc->c_needs_zero,
1712 data_ac, meta_ac, 1248 data_ac, meta_ac,
1713 wc, desc->c_cpos, pos, local_len); 1249 wc, desc->c_cpos, pos, local_len);
@@ -1778,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1778} 1314}
1779 1315
1780/* 1316/*
1317 * Check if this extent is marked UNWRITTEN by direct io. If so, we need not
1318 * do the zero work, and should not clear UNWRITTEN since it will be cleared
1319 * by the direct io procedure.
1320 * If this is a new extent allocated by direct io, we should mark it in
1321 * the ip_unwritten_list.
1322 */
1323static int ocfs2_unwritten_check(struct inode *inode,
1324 struct ocfs2_write_ctxt *wc,
1325 struct ocfs2_write_cluster_desc *desc)
1326{
1327 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1328 struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
1329 int ret = 0;
1330
1331 if (!desc->c_needs_zero)
1332 return 0;
1333
1334retry:
1335 spin_lock(&oi->ip_lock);
1336	/* No need to zero, whether this is buffered or direct io. The writer
1337	 * already zeroing this cluster will clear unwritten after all of the
1338	 * cluster io has finished. */
1339 list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
1340 if (desc->c_cpos == ue->ue_cpos) {
1341 BUG_ON(desc->c_new);
1342 desc->c_needs_zero = 0;
1343 desc->c_clear_unwritten = 0;
1344 goto unlock;
1345 }
1346 }
1347
1348 if (wc->w_type != OCFS2_WRITE_DIRECT)
1349 goto unlock;
1350
1351 if (new == NULL) {
1352 spin_unlock(&oi->ip_lock);
1353 new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
1354 GFP_NOFS);
1355 if (new == NULL) {
1356 ret = -ENOMEM;
1357 goto out;
1358 }
1359 goto retry;
1360 }
1361	/* This direct write will do the zeroing. */
1362 new->ue_cpos = desc->c_cpos;
1363 new->ue_phys = desc->c_phys;
1364 desc->c_clear_unwritten = 0;
1365 list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
1366 list_add_tail(&new->ue_node, &wc->w_unwritten_list);
1367 new = NULL;
1368unlock:
1369 spin_unlock(&oi->ip_lock);
1370out:
1371 if (new)
1372 kfree(new);
1373 return ret;
1374}
1375
1376/*
1781 * Populate each single-cluster write descriptor in the write context 1377 * Populate each single-cluster write descriptor in the write context
1782 * with information about the i/o to be done. 1378 * with information about the i/o to be done.
1783 * 1379 *
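The ocfs2_unwritten_check() helper added above follows a standard pattern: ip_lock is a spinlock, so the tracking structure cannot be allocated while it is held; the lock is dropped, the allocation is done with GFP_NOFS, and the list walk is retried from the top in case another writer raced in while the lock was released. A minimal user-space sketch of that drop-lock/allocate/retry idiom, with a pthread mutex standing in for ip_lock and all names invented for illustration:

#include <pthread.h>
#include <stdlib.h>

struct tracked_extent {
	unsigned int cpos;
	struct tracked_extent *next;
};

static pthread_mutex_t ip_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tracked_extent *unwritten_list;	/* protected by ip_lock */

/* Return 0 on success, -1 on allocation failure. */
static int track_unwritten(unsigned int cpos)
{
	struct tracked_extent *ue, *new = NULL;

retry:
	pthread_mutex_lock(&ip_lock);
	for (ue = unwritten_list; ue; ue = ue->next) {
		if (ue->cpos == cpos)
			goto unlock;	/* someone else already tracks this cluster */
	}
	if (!new) {
		/* Cannot allocate under the lock: drop it, allocate, retry. */
		pthread_mutex_unlock(&ip_lock);
		new = malloc(sizeof(*new));
		if (!new)
			return -1;
		goto retry;
	}
	new->cpos = cpos;
	new->next = unwritten_list;
	unwritten_list = new;
	new = NULL;		/* now owned by the list */
unlock:
	pthread_mutex_unlock(&ip_lock);
	free(new);		/* free(NULL) is a no-op, like kfree() */
	return 0;
}

As in the kernel function, an allocation that turns out to be unneeded after the retry is simply freed on the way out.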
@@ -1852,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
1852 if (phys == 0) { 1448 if (phys == 0) {
1853 desc->c_new = 1; 1449 desc->c_new = 1;
1854 desc->c_needs_zero = 1; 1450 desc->c_needs_zero = 1;
1451 desc->c_clear_unwritten = 1;
1855 *clusters_to_alloc = *clusters_to_alloc + 1; 1452 *clusters_to_alloc = *clusters_to_alloc + 1;
1856 } 1453 }
1857 1454
1858 if (ext_flags & OCFS2_EXT_UNWRITTEN) { 1455 if (ext_flags & OCFS2_EXT_UNWRITTEN) {
1859 desc->c_unwritten = 1; 1456 desc->c_clear_unwritten = 1;
1860 desc->c_needs_zero = 1; 1457 desc->c_needs_zero = 1;
1861 } 1458 }
1862 1459
1460 ret = ocfs2_unwritten_check(inode, wc, desc);
1461 if (ret) {
1462 mlog_errno(ret);
1463 goto out;
1464 }
1465
1863 num_clusters--; 1466 num_clusters--;
1864 } 1467 }
1865 1468
@@ -2022,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
2022 if (ret) 1625 if (ret)
2023 mlog_errno(ret); 1626 mlog_errno(ret);
2024 1627
2025	wc->w_first_new_cpos = 1628	/* There is no wc if this is called from direct io. */
2026 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); 1629 if (wc)
1630 wc->w_first_new_cpos =
1631 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
2027 1632
2028 return ret; 1633 return ret;
2029} 1634}
@@ -2077,9 +1682,8 @@ out:
2077 return ret; 1682 return ret;
2078} 1683}
2079 1684
2080int ocfs2_write_begin_nolock(struct file *filp, 1685int ocfs2_write_begin_nolock(struct address_space *mapping,
2081 struct address_space *mapping, 1686 loff_t pos, unsigned len, ocfs2_write_type_t type,
2082 loff_t pos, unsigned len, unsigned flags,
2083 struct page **pagep, void **fsdata, 1687 struct page **pagep, void **fsdata,
2084 struct buffer_head *di_bh, struct page *mmap_page) 1688 struct buffer_head *di_bh, struct page *mmap_page)
2085{ 1689{
@@ -2096,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
2096 int try_free = 1, ret1; 1700 int try_free = 1, ret1;
2097 1701
2098try_again: 1702try_again:
2099 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1703 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
2100 if (ret) { 1704 if (ret) {
2101 mlog_errno(ret); 1705 mlog_errno(ret);
2102 return ret; 1706 return ret;
@@ -2115,14 +1719,17 @@ try_again:
2115 } 1719 }
2116 } 1720 }
2117 1721
2118	if (ocfs2_sparse_alloc(osb)) 1722	/* Direct io changes i_size late, so do not zero the tail here. */
2119 ret = ocfs2_zero_tail(inode, di_bh, pos); 1723 if (type != OCFS2_WRITE_DIRECT) {
2120 else 1724 if (ocfs2_sparse_alloc(osb))
2121 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, 1725 ret = ocfs2_zero_tail(inode, di_bh, pos);
2122 wc); 1726 else
2123 if (ret) { 1727 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2124 mlog_errno(ret); 1728 len, wc);
2125 goto out; 1729 if (ret) {
1730 mlog_errno(ret);
1731 goto out;
1732 }
2126 } 1733 }
2127 1734
2128 ret = ocfs2_check_range_for_refcount(inode, pos, len); 1735 ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2153,7 +1760,7 @@ try_again:
2153 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1760 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2154 (long long)i_size_read(inode), 1761 (long long)i_size_read(inode),
2155 le32_to_cpu(di->i_clusters), 1762 le32_to_cpu(di->i_clusters),
2156 pos, len, flags, mmap_page, 1763 pos, len, type, mmap_page,
2157 clusters_to_alloc, extents_to_split); 1764 clusters_to_alloc, extents_to_split);
2158 1765
2159 /* 1766 /*
@@ -2183,17 +1790,17 @@ try_again:
2183 1790
2184 credits = ocfs2_calc_extend_credits(inode->i_sb, 1791 credits = ocfs2_calc_extend_credits(inode->i_sb,
2185 &di->id2.i_list); 1792 &di->id2.i_list);
2186 1793 } else if (type == OCFS2_WRITE_DIRECT)
2187	} 1794		/* A direct write need not start a trans if no extents were allocated. */
1795 goto success;
2188 1796
2189 /* 1797 /*
2190 * We have to zero sparse allocated clusters, unwritten extent clusters, 1798 * We have to zero sparse allocated clusters, unwritten extent clusters,
2191 * and non-sparse clusters we just extended. For non-sparse writes, 1799 * and non-sparse clusters we just extended. For non-sparse writes,
2192 * we know zeros will only be needed in the first and/or last cluster. 1800 * we know zeros will only be needed in the first and/or last cluster.
2193 */ 1801 */
2194 if (clusters_to_alloc || extents_to_split || 1802 if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
2195 (wc->w_clen && (wc->w_desc[0].c_needs_zero || 1803 wc->w_desc[wc->w_clen - 1].c_needs_zero))
2196 wc->w_desc[wc->w_clen - 1].c_needs_zero)))
2197 cluster_of_pages = 1; 1804 cluster_of_pages = 1;
2198 else 1805 else
2199 cluster_of_pages = 0; 1806 cluster_of_pages = 0;
@@ -2260,7 +1867,8 @@ try_again:
2260 ocfs2_free_alloc_context(meta_ac); 1867 ocfs2_free_alloc_context(meta_ac);
2261 1868
2262success: 1869success:
2263 *pagep = wc->w_target_page; 1870 if (pagep)
1871 *pagep = wc->w_target_page;
2264 *fsdata = wc; 1872 *fsdata = wc;
2265 return 0; 1873 return 0;
2266out_quota: 1874out_quota:
@@ -2271,7 +1879,7 @@ out_commit:
2271 ocfs2_commit_trans(osb, handle); 1879 ocfs2_commit_trans(osb, handle);
2272 1880
2273out: 1881out:
2274 ocfs2_free_write_ctxt(wc); 1882 ocfs2_free_write_ctxt(inode, wc);
2275 1883
2276 if (data_ac) { 1884 if (data_ac) {
2277 ocfs2_free_alloc_context(data_ac); 1885 ocfs2_free_alloc_context(data_ac);
@@ -2323,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
2323 */ 1931 */
2324 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1932 down_write(&OCFS2_I(inode)->ip_alloc_sem);
2325 1933
2326 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, 1934 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
2327 fsdata, di_bh, NULL); 1935 pagep, fsdata, di_bh, NULL);
2328 if (ret) { 1936 if (ret) {
2329 mlog_errno(ret); 1937 mlog_errno(ret);
2330 goto out_fail; 1938 goto out_fail;
@@ -2381,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2381 handle_t *handle = wc->w_handle; 1989 handle_t *handle = wc->w_handle;
2382 struct page *tmppage; 1990 struct page *tmppage;
2383 1991
2384 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 1992 BUG_ON(!list_empty(&wc->w_unwritten_list));
2385 OCFS2_JOURNAL_ACCESS_WRITE); 1993
2386 if (ret) { 1994 if (handle) {
2387 copied = ret; 1995 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
2388 mlog_errno(ret); 1996 wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2389 goto out; 1997 if (ret) {
1998 copied = ret;
1999 mlog_errno(ret);
2000 goto out;
2001 }
2390 } 2002 }
2391 2003
2392 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2004 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2394,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2394 goto out_write_size; 2006 goto out_write_size;
2395 } 2007 }
2396 2008
2397 if (unlikely(copied < len)) { 2009 if (unlikely(copied < len) && wc->w_target_page) {
2398 if (!PageUptodate(wc->w_target_page)) 2010 if (!PageUptodate(wc->w_target_page))
2399 copied = 0; 2011 copied = 0;
2400 2012
2401 ocfs2_zero_new_buffers(wc->w_target_page, start+copied, 2013 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
2402 start+len); 2014 start+len);
2403 } 2015 }
2404 flush_dcache_page(wc->w_target_page); 2016 if (wc->w_target_page)
2017 flush_dcache_page(wc->w_target_page);
2405 2018
2406 for(i = 0; i < wc->w_num_pages; i++) { 2019 for(i = 0; i < wc->w_num_pages; i++) {
2407 tmppage = wc->w_pages[i]; 2020 tmppage = wc->w_pages[i];
2408 2021
2022 /* This is the direct io target page. */
2023 if (tmppage == NULL)
2024 continue;
2025
2409 if (tmppage == wc->w_target_page) { 2026 if (tmppage == wc->w_target_page) {
2410 from = wc->w_target_from; 2027 from = wc->w_target_from;
2411 to = wc->w_target_to; 2028 to = wc->w_target_to;
@@ -2424,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2424 } 2041 }
2425 2042
2426 if (page_has_buffers(tmppage)) { 2043 if (page_has_buffers(tmppage)) {
2427 if (ocfs2_should_order_data(inode)) 2044 if (handle && ocfs2_should_order_data(inode))
2428 ocfs2_jbd2_file_inode(wc->w_handle, inode); 2045 ocfs2_jbd2_file_inode(handle, inode);
2429 block_commit_write(tmppage, from, to); 2046 block_commit_write(tmppage, from, to);
2430 } 2047 }
2431 } 2048 }
2432 2049
2433out_write_size: 2050out_write_size:
2434	pos += copied; 2051	/* Direct io does not update i_size here. */
2435 if (pos > i_size_read(inode)) { 2052 if (wc->w_type != OCFS2_WRITE_DIRECT) {
2436 i_size_write(inode, pos); 2053 pos += copied;
2437 mark_inode_dirty(inode); 2054 if (pos > i_size_read(inode)) {
2438 } 2055 i_size_write(inode, pos);
2439 inode->i_blocks = ocfs2_inode_sector_count(inode); 2056 mark_inode_dirty(inode);
2440 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 2057 }
2441 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2058 inode->i_blocks = ocfs2_inode_sector_count(inode);
2442 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2059 di->i_size = cpu_to_le64((u64)i_size_read(inode));
2443 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2060 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2444 ocfs2_update_inode_fsync_trans(handle, inode, 1); 2061 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2445 ocfs2_journal_dirty(handle, wc->w_di_bh); 2062 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2063 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2064 }
2065 if (handle)
2066 ocfs2_journal_dirty(handle, wc->w_di_bh);
2446 2067
2447out: 2068out:
2448 /* unlock pages before dealloc since it needs acquiring j_trans_barrier 2069 /* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2452,7 +2073,8 @@ out:
2452 */ 2073 */
2453 ocfs2_unlock_pages(wc); 2074 ocfs2_unlock_pages(wc);
2454 2075
2455 ocfs2_commit_trans(osb, handle); 2076 if (handle)
2077 ocfs2_commit_trans(osb, handle);
2456 2078
2457 ocfs2_run_deallocs(osb, &wc->w_dealloc); 2079 ocfs2_run_deallocs(osb, &wc->w_dealloc);
2458 2080
@@ -2477,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
2477 return ret; 2099 return ret;
2478} 2100}
2479 2101
2102struct ocfs2_dio_write_ctxt {
2103 struct list_head dw_zero_list;
2104 unsigned dw_zero_count;
2105 int dw_orphaned;
2106 pid_t dw_writer_pid;
2107};
2108
2109static struct ocfs2_dio_write_ctxt *
2110ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
2111{
2112 struct ocfs2_dio_write_ctxt *dwc = NULL;
2113
2114 if (bh->b_private)
2115 return bh->b_private;
2116
2117 dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
2118 if (dwc == NULL)
2119 return NULL;
2120 INIT_LIST_HEAD(&dwc->dw_zero_list);
2121 dwc->dw_zero_count = 0;
2122 dwc->dw_orphaned = 0;
2123 dwc->dw_writer_pid = task_pid_nr(current);
2124 bh->b_private = dwc;
2125 *alloc = 1;
2126
2127 return dwc;
2128}
2129
2130static void ocfs2_dio_free_write_ctx(struct inode *inode,
2131 struct ocfs2_dio_write_ctxt *dwc)
2132{
2133 ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
2134 kfree(dwc);
2135}
2136
2137/*
2138 * TODO: Make this into a generic get_blocks function.
2139 *
2140 * From do_direct_io in direct-io.c:
2141 * "So what we do is to permit the ->get_blocks function to populate
2142 * bh.b_size with the size of IO which is permitted at this offset and
2143 * this i_blkbits."
2144 *
2145 * This function is called directly from get_more_blocks in direct-io.c.
2146 *
2147 * called like this: dio->get_blocks(dio->inode, fs_startblk,
2148 * fs_count, map_bh, dio->rw == WRITE);
2149 */
2150static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
2151 struct buffer_head *bh_result, int create)
2152{
2153 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2154 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2155 struct ocfs2_write_ctxt *wc;
2156 struct ocfs2_write_cluster_desc *desc = NULL;
2157 struct ocfs2_dio_write_ctxt *dwc = NULL;
2158 struct buffer_head *di_bh = NULL;
2159 u64 p_blkno;
2160 loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
2161 unsigned len, total_len = bh_result->b_size;
2162 int ret = 0, first_get_block = 0;
2163
2164 len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
2165 len = min(total_len, len);
2166
2167 mlog(0, "get block of %lu at %llu:%u req %u\n",
2168 inode->i_ino, pos, len, total_len);
2169
2170 /*
2171	 * Because we need to change the file size in ocfs2_dio_end_io_write(), or
2172	 * may need to add the inode to the orphan dir, we cannot take the fast
2173	 * path when the file size will be changed.
2174 */
2175 if (pos + total_len <= i_size_read(inode)) {
2176 down_read(&oi->ip_alloc_sem);
2177 /* This is the fast path for re-write. */
2178 ret = ocfs2_get_block(inode, iblock, bh_result, create);
2179
2180 up_read(&oi->ip_alloc_sem);
2181
2182 if (buffer_mapped(bh_result) &&
2183 !buffer_new(bh_result) &&
2184 ret == 0)
2185 goto out;
2186
2187 /* Clear state set by ocfs2_get_block. */
2188 bh_result->b_state = 0;
2189 }
2190
2191 dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
2192 if (unlikely(dwc == NULL)) {
2193 ret = -ENOMEM;
2194 mlog_errno(ret);
2195 goto out;
2196 }
2197
2198 if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
2199 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
2200 !dwc->dw_orphaned) {
2201 /*
2202		 * When we are going to allocate extents beyond the file size, add
2203		 * the inode to the orphan dir first, so that the space can be
2204		 * reclaimed if the system crashes during the write.
2205 */
2206 ret = ocfs2_add_inode_to_orphan(osb, inode);
2207 if (ret < 0) {
2208 mlog_errno(ret);
2209 goto out;
2210 }
2211 dwc->dw_orphaned = 1;
2212 }
2213
2214 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2215 if (ret) {
2216 mlog_errno(ret);
2217 goto out;
2218 }
2219
2220 down_write(&oi->ip_alloc_sem);
2221
2222 if (first_get_block) {
2223 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
2224 ret = ocfs2_zero_tail(inode, di_bh, pos);
2225 else
2226 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2227 total_len, NULL);
2228 if (ret < 0) {
2229 mlog_errno(ret);
2230 goto unlock;
2231 }
2232 }
2233
2234 ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
2235 OCFS2_WRITE_DIRECT, NULL,
2236 (void **)&wc, di_bh, NULL);
2237 if (ret) {
2238 mlog_errno(ret);
2239 goto unlock;
2240 }
2241
2242 desc = &wc->w_desc[0];
2243
2244 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
2245 BUG_ON(p_blkno == 0);
2246 p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
2247
2248 map_bh(bh_result, inode->i_sb, p_blkno);
2249 bh_result->b_size = len;
2250 if (desc->c_needs_zero)
2251 set_buffer_new(bh_result);
2252
2253	/* The end_io handler may sleep, which must not happen in irq
2254	 * context, so defer it to the dio work queue. */
2255 set_buffer_defer_completion(bh_result);
2256
2257 if (!list_empty(&wc->w_unwritten_list)) {
2258 struct ocfs2_unwritten_extent *ue = NULL;
2259
2260 ue = list_first_entry(&wc->w_unwritten_list,
2261 struct ocfs2_unwritten_extent,
2262 ue_node);
2263 BUG_ON(ue->ue_cpos != desc->c_cpos);
2264 /* The physical address may be 0, fill it. */
2265 ue->ue_phys = desc->c_phys;
2266
2267 list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
2268 dwc->dw_zero_count++;
2269 }
2270
2271 ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
2272 BUG_ON(ret != len);
2273 ret = 0;
2274unlock:
2275 up_write(&oi->ip_alloc_sem);
2276 ocfs2_inode_unlock(inode, 1);
2277 brelse(di_bh);
2278out:
2279 if (ret < 0)
2280 ret = -EIO;
2281 return ret;
2282}
2283
2284static void ocfs2_dio_end_io_write(struct inode *inode,
2285 struct ocfs2_dio_write_ctxt *dwc,
2286 loff_t offset,
2287 ssize_t bytes)
2288{
2289 struct ocfs2_cached_dealloc_ctxt dealloc;
2290 struct ocfs2_extent_tree et;
2291 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2292 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2293 struct ocfs2_unwritten_extent *ue = NULL;
2294 struct buffer_head *di_bh = NULL;
2295 struct ocfs2_dinode *di;
2296 struct ocfs2_alloc_context *data_ac = NULL;
2297 struct ocfs2_alloc_context *meta_ac = NULL;
2298 handle_t *handle = NULL;
2299 loff_t end = offset + bytes;
2300 int ret = 0, credits = 0, locked = 0;
2301
2302 ocfs2_init_dealloc_ctxt(&dealloc);
2303
2304	/* We clear unwritten, delete the orphan, and change i_size here. If
2305	 * none of these is needed, we can skip all of this. */
2306 if (list_empty(&dwc->dw_zero_list) &&
2307 end <= i_size_read(inode) &&
2308 !dwc->dw_orphaned)
2309 goto out;
2310
2311 /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
2312 * are in that context. */
2313 if (dwc->dw_writer_pid != task_pid_nr(current)) {
2314 mutex_lock(&inode->i_mutex);
2315 locked = 1;
2316 }
2317
2318 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2319 if (ret < 0) {
2320 mlog_errno(ret);
2321 goto out;
2322 }
2323
2324 down_write(&oi->ip_alloc_sem);
2325
2326	/* Delete the orphan before acquiring i_mutex. */
2327 if (dwc->dw_orphaned) {
2328 BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
2329
2330 end = end > i_size_read(inode) ? end : 0;
2331
2332 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
2333 !!end, end);
2334 if (ret < 0)
2335 mlog_errno(ret);
2336 }
2337
2338 di = (struct ocfs2_dinode *)di_bh;
2339
2340 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
2341
2342 ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
2343 &data_ac, &meta_ac);
2344 if (ret) {
2345 mlog_errno(ret);
2346 goto unlock;
2347 }
2348
2349 credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
2350
2351 handle = ocfs2_start_trans(osb, credits);
2352 if (IS_ERR(handle)) {
2353 ret = PTR_ERR(handle);
2354 mlog_errno(ret);
2355 goto unlock;
2356 }
2357 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2358 OCFS2_JOURNAL_ACCESS_WRITE);
2359 if (ret) {
2360 mlog_errno(ret);
2361 goto commit;
2362 }
2363
2364 list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
2365 ret = ocfs2_mark_extent_written(inode, &et, handle,
2366 ue->ue_cpos, 1,
2367 ue->ue_phys,
2368 meta_ac, &dealloc);
2369 if (ret < 0) {
2370 mlog_errno(ret);
2371 break;
2372 }
2373 }
2374
2375 if (end > i_size_read(inode)) {
2376 ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
2377 if (ret < 0)
2378 mlog_errno(ret);
2379 }
2380commit:
2381 ocfs2_commit_trans(osb, handle);
2382unlock:
2383 up_write(&oi->ip_alloc_sem);
2384 ocfs2_inode_unlock(inode, 1);
2385 brelse(di_bh);
2386out:
2387 if (data_ac)
2388 ocfs2_free_alloc_context(data_ac);
2389 if (meta_ac)
2390 ocfs2_free_alloc_context(meta_ac);
2391 ocfs2_run_deallocs(osb, &dealloc);
2392 if (locked)
2393 mutex_unlock(&inode->i_mutex);
2394 ocfs2_dio_free_write_ctx(inode, dwc);
2395}
2396
2397/*
2398 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
2399 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
2400 * to protect io on one node from truncation on another.
2401 */
2402static int ocfs2_dio_end_io(struct kiocb *iocb,
2403 loff_t offset,
2404 ssize_t bytes,
2405 void *private)
2406{
2407 struct inode *inode = file_inode(iocb->ki_filp);
2408 int level;
2409
2410 if (bytes <= 0)
2411 return 0;
2412
2413 /* this io's submitter should not have unlocked this before we could */
2414 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
2415
2416 if (private)
2417 ocfs2_dio_end_io_write(inode, private, offset, bytes);
2418
2419 ocfs2_iocb_clear_rw_locked(iocb);
2420
2421 level = ocfs2_iocb_rw_locked_level(iocb);
2422 ocfs2_rw_unlock(inode, level);
2423 return 0;
2424}
2425
2426static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
2427 loff_t offset)
2428{
2429 struct file *file = iocb->ki_filp;
2430 struct inode *inode = file_inode(file)->i_mapping->host;
2431 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2432 loff_t end = offset + iter->count;
2433 get_block_t *get_block;
2434
2435 /*
2436 * Fallback to buffered I/O if we see an inode without
2437 * extents.
2438 */
2439 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2440 return 0;
2441
2442 /* Fallback to buffered I/O if we do not support append dio. */
2443 if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
2444 return 0;
2445
2446 if (iov_iter_rw(iter) == READ)
2447 get_block = ocfs2_get_block;
2448 else
2449 get_block = ocfs2_dio_get_block;
2450
2451 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
2452 iter, offset, get_block,
2453 ocfs2_dio_end_io, NULL, 0);
2454}
2455
2480const struct address_space_operations ocfs2_aops = { 2456const struct address_space_operations ocfs2_aops = {
2481 .readpage = ocfs2_readpage, 2457 .readpage = ocfs2_readpage,
2482 .readpages = ocfs2_readpages, 2458 .readpages = ocfs2_readpages,
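A note on how the new direct-io pieces above fit together: ocfs2_dio_get_block() runs once per cluster-sized chunk of a direct write, but the ocfs2_dio_write_ctxt it relies on is shared across all of those calls, so ocfs2_dio_alloc_write_ctx() allocates it lazily on the first call and stashes it in bh_result->b_private, from where it comes back to ocfs2_dio_end_io() as the private argument. A rough, self-contained sketch of that allocate-on-first-use, free-at-completion pattern (the struct and field names here are made up, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

struct io_request {
	void *private;		/* carried from submission to completion */
};

struct dio_write_ctx {
	int zero_count;
};

/* Called once per chunk; allocates the shared context on first use. */
static struct dio_write_ctx *get_ctx(struct io_request *req)
{
	struct dio_write_ctx *ctx = req->private;

	if (ctx)
		return ctx;
	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;
	req->private = ctx;
	return ctx;
}

/* Completion side: everything deferred during submission is finished here. */
static void end_io(struct io_request *req)
{
	struct dio_write_ctx *ctx = req->private;

	if (!ctx)
		return;
	printf("finishing %d deferred zero regions\n", ctx->zero_count);
	free(ctx);
	req->private = NULL;
}

int main(void)
{
	struct io_request req = { 0 };
	int chunk;

	for (chunk = 0; chunk < 4; chunk++) {	/* one get_block call per chunk */
		struct dio_write_ctx *ctx = get_ctx(&req);
		if (ctx)
			ctx->zero_count++;
	}
	end_io(&req);
	return 0;
}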
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..b1c9f28a57b1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
47 loff_t pos, unsigned len, unsigned copied, 47 loff_t pos, unsigned len, unsigned copied,
48 struct page *page, void *fsdata); 48 struct page *page, void *fsdata);
49 49
50int ocfs2_write_begin_nolock(struct file *filp, 50typedef enum {
51 struct address_space *mapping, 51 OCFS2_WRITE_BUFFER = 0,
52 loff_t pos, unsigned len, unsigned flags, 52 OCFS2_WRITE_DIRECT,
53 OCFS2_WRITE_MMAP,
54} ocfs2_write_type_t;
55
56int ocfs2_write_begin_nolock(struct address_space *mapping,
57 loff_t pos, unsigned len, ocfs2_write_type_t type,
53 struct page **pagep, void **fsdata, 58 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 59 struct buffer_head *di_bh, struct page *mmap_page);
55 60
@@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
79enum ocfs2_iocb_lock_bits { 84enum ocfs2_iocb_lock_bits {
80 OCFS2_IOCB_RW_LOCK = 0, 85 OCFS2_IOCB_RW_LOCK = 0,
81 OCFS2_IOCB_RW_LOCK_LEVEL, 86 OCFS2_IOCB_RW_LOCK_LEVEL,
82 OCFS2_IOCB_UNALIGNED_IO,
83 OCFS2_IOCB_NUM_LOCKS 87 OCFS2_IOCB_NUM_LOCKS
84}; 88};
85 89
@@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits {
88#define ocfs2_iocb_rw_locked_level(iocb) \ 92#define ocfs2_iocb_rw_locked_level(iocb) \
89 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) 93 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
90 94
91#define ocfs2_iocb_set_unaligned_aio(iocb) \
92 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
93#define ocfs2_iocb_clear_unaligned_aio(iocb) \
94 clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
95#define ocfs2_iocb_is_unaligned_aio(iocb) \
96 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
97
98#endif /* OCFS2_FILE_H */ 95#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ef6a2ec494de..bd15929b5f92 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1444,8 +1444,8 @@ static void o2hb_region_release(struct config_item *item)
1444 debugfs_remove(reg->hr_debug_dir); 1444 debugfs_remove(reg->hr_debug_dir);
1445 kfree(reg->hr_db_livenodes); 1445 kfree(reg->hr_db_livenodes);
1446 kfree(reg->hr_db_regnum); 1446 kfree(reg->hr_db_regnum);
1447 kfree(reg->hr_debug_elapsed_time); 1447 kfree(reg->hr_db_elapsed_time);
1448 kfree(reg->hr_debug_pinned); 1448 kfree(reg->hr_db_pinned);
1449 1449
1450 spin_lock(&o2hb_live_lock); 1450 spin_lock(&o2hb_live_lock);
1451 list_del(&reg->hr_all_item); 1451 list_del(&reg->hr_all_item);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..cdeafb4e7ed6 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -212,6 +212,12 @@ grant:
212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB) 212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); 213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
214 214
215 /*
216 * Move the lock to the tail because it may be the only lock which has
217 * an invalid lvb.
218 */
219 list_move_tail(&lock->list, &res->granted);
220
215 status = DLM_NORMAL; 221 status = DLM_NORMAL;
216 *call_ast = 1; 222 *call_ast = 1;
217 goto unlock_exit; 223 goto unlock_exit;
@@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
262 struct dlm_lock *lock, int flags, int type) 268 struct dlm_lock *lock, int flags, int type)
263{ 269{
264 enum dlm_status status; 270 enum dlm_status status;
271 u8 old_owner = res->owner;
265 272
266 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, 273 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
267 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); 274 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
287 status = DLM_DENIED; 294 status = DLM_DENIED;
288 goto bail; 295 goto bail;
289 } 296 }
297
298 if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
299 mlog(0, "last convert request returned DLM_RECOVERING, but "
300 "owner has already queued and sent ast to me. res %.*s, "
301 "(cookie=%u:%llu, type=%d, conv=%d)\n",
302 res->lockname.len, res->lockname.name,
303 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
304 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
305 lock->ml.type, lock->ml.convert_type);
306 status = DLM_NORMAL;
307 goto bail;
308 }
309
290 res->state |= DLM_LOCK_RES_IN_PROGRESS; 310 res->state |= DLM_LOCK_RES_IN_PROGRESS;
291 /* move lock to local convert queue */ 311 /* move lock to local convert queue */
292 /* do not alter lock refcount. switching lists. */ 312 /* do not alter lock refcount. switching lists. */
@@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
316 spin_lock(&res->spinlock); 336 spin_lock(&res->spinlock);
317 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 337 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
318 lock->convert_pending = 0; 338 lock->convert_pending = 0;
319 /* if it failed, move it back to granted queue */ 339 /* if it failed, move it back to granted queue.
340	 * if the master returned DLM_NORMAL and then went down before sending
341	 * the ast, the lock may already have been moved to the granted queue;
342	 * reset the status to DLM_RECOVERING and retry the convert */
320 if (status != DLM_NORMAL) { 343 if (status != DLM_NORMAL) {
321 if (status != DLM_NOTQUEUED) 344 if (status != DLM_NOTQUEUED)
322 dlm_error(status); 345 dlm_error(status);
323 dlm_revert_pending_convert(res, lock); 346 dlm_revert_pending_convert(res, lock);
347 } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
348 (old_owner != res->owner)) {
349 mlog(0, "res %.*s is in recovering or has been recovered.\n",
350 res->lockname.len, res->lockname.name);
351 status = DLM_RECOVERING;
324 } 352 }
325bail: 353bail:
326 spin_unlock(&res->spinlock); 354 spin_unlock(&res->spinlock);
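The in-place convert fix above leans on list_move_tail(): per the new comment, the lock converted in place may be the only one on the granted queue with an invalid LVB, so it is moved to the tail rather than left wherever it sat. For readers unfamiliar with the primitive, list_move_tail() is just a delete followed by an add-before-head on a circular doubly linked list; a compact user-space re-implementation (not the kernel's list.h, names are illustrative) behaves like this:

#include <stddef.h>
#include <stdio.h>

/* Minimal circular doubly linked list in the style of the kernel's list.h. */
struct list_head {
	struct list_head *next, *prev;
};

static void list_init(struct list_head *h)
{
	h->next = h;
	h->prev = h;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

/* Unlink the entry and re-insert it just before head, i.e. at the tail. */
static void list_move_tail(struct list_head *entry, struct list_head *head)
{
	list_del(entry);
	list_add_tail(entry, head);
}

struct lock {
	int id;
	struct list_head list;
};

int main(void)
{
	struct list_head granted;
	struct lock a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct list_head *pos;

	list_init(&granted);
	list_add_tail(&a.list, &granted);
	list_add_tail(&b.list, &granted);
	list_add_tail(&c.list, &granted);

	list_move_tail(&b.list, &granted);	/* b now sits behind a and c */

	for (pos = granted.next; pos != &granted; pos = pos->next) {
		struct lock *l = (struct lock *)((char *)pos -
					offsetof(struct lock, list));
		printf("%d ", l->id);		/* prints: 1 3 2 */
	}
	printf("\n");
	return 0;
}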
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index cd38488a10fc..f6b313898763 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2083,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
2083 dlm_lock_get(lock); 2083 dlm_lock_get(lock);
2084 if (lock->convert_pending) { 2084 if (lock->convert_pending) {
2085 /* move converting lock back to granted */ 2085 /* move converting lock back to granted */
2086 BUG_ON(i != DLM_CONVERTING_LIST);
2087 mlog(0, "node died with convert pending " 2086 mlog(0, "node died with convert pending "
2088 "on %.*s. move back to granted list.\n", 2087 "on %.*s. move back to granted list.\n",
2089 res->lockname.len, res->lockname.name); 2088 res->lockname.len, res->lockname.name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7cb38fdca229..c18ab45f8d21 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1381,44 +1381,6 @@ out:
1381 return ret; 1381 return ret;
1382} 1382}
1383 1383
1384/*
1385 * Will look for holes and unwritten extents in the range starting at
1386 * pos for count bytes (inclusive).
1387 */
1388static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1389 size_t count)
1390{
1391 int ret = 0;
1392 unsigned int extent_flags;
1393 u32 cpos, clusters, extent_len, phys_cpos;
1394 struct super_block *sb = inode->i_sb;
1395
1396 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1397 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1398
1399 while (clusters) {
1400 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1401 &extent_flags);
1402 if (ret < 0) {
1403 mlog_errno(ret);
1404 goto out;
1405 }
1406
1407 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1408 ret = 1;
1409 break;
1410 }
1411
1412 if (extent_len > clusters)
1413 extent_len = clusters;
1414
1415 clusters -= extent_len;
1416 cpos += extent_len;
1417 }
1418out:
1419 return ret;
1420}
1421
1422static int ocfs2_write_remove_suid(struct inode *inode) 1384static int ocfs2_write_remove_suid(struct inode *inode)
1423{ 1385{
1424 int ret; 1386 int ret;
@@ -2129,18 +2091,12 @@ out:
2129 2091
2130static int ocfs2_prepare_inode_for_write(struct file *file, 2092static int ocfs2_prepare_inode_for_write(struct file *file,
2131 loff_t pos, 2093 loff_t pos,
2132 size_t count, 2094 size_t count)
2133 int appending,
2134 int *direct_io,
2135 int *has_refcount)
2136{ 2095{
2137 int ret = 0, meta_level = 0; 2096 int ret = 0, meta_level = 0;
2138 struct dentry *dentry = file->f_path.dentry; 2097 struct dentry *dentry = file->f_path.dentry;
2139 struct inode *inode = d_inode(dentry); 2098 struct inode *inode = d_inode(dentry);
2140 loff_t end; 2099 loff_t end;
2141 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2142 int full_coherency = !(osb->s_mount_opt &
2143 OCFS2_MOUNT_COHERENCY_BUFFERED);
2144 2100
2145 /* 2101 /*
2146 * We start with a read level meta lock and only jump to an ex 2102 * We start with a read level meta lock and only jump to an ex
@@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2189 pos, 2145 pos,
2190 count, 2146 count,
2191 &meta_level); 2147 &meta_level);
2192 if (has_refcount)
2193 *has_refcount = 1;
2194 if (direct_io)
2195 *direct_io = 0;
2196 } 2148 }
2197 2149
2198 if (ret < 0) { 2150 if (ret < 0) {
@@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2200 goto out_unlock; 2152 goto out_unlock;
2201 } 2153 }
2202 2154
2203 /*
2204 * Skip the O_DIRECT checks if we don't need
2205 * them.
2206 */
2207 if (!direct_io || !(*direct_io))
2208 break;
2209
2210 /*
2211 * There's no sane way to do direct writes to an inode
2212 * with inline data.
2213 */
2214 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2215 *direct_io = 0;
2216 break;
2217 }
2218
2219 /*
2220 * Allowing concurrent direct writes means
2221 * i_size changes wouldn't be synchronized, so
2222 * one node could wind up truncating another
2223 * nodes writes.
2224 */
2225 if (end > i_size_read(inode) && !full_coherency) {
2226 *direct_io = 0;
2227 break;
2228 }
2229
2230 /*
2231 * Fallback to old way if the feature bit is not set.
2232 */
2233 if (end > i_size_read(inode) &&
2234 !ocfs2_supports_append_dio(osb)) {
2235 *direct_io = 0;
2236 break;
2237 }
2238
2239 /*
2240 * We don't fill holes during direct io, so
2241 * check for them here. If any are found, the
2242 * caller will have to retake some cluster
2243 * locks and initiate the io as buffered.
2244 */
2245 ret = ocfs2_check_range_for_holes(inode, pos, count);
2246 if (ret == 1) {
2247 /*
2248 * Fallback to old way if the feature bit is not set.
2249 * Otherwise try dio first and then complete the rest
2250 * request through buffer io.
2251 */
2252 if (!ocfs2_supports_append_dio(osb))
2253 *direct_io = 0;
2254 ret = 0;
2255 } else if (ret < 0)
2256 mlog_errno(ret);
2257 break; 2155 break;
2258 } 2156 }
2259 2157
2260out_unlock: 2158out_unlock:
2261 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2159 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2262 pos, appending, count, 2160 pos, count);
2263 direct_io, has_refcount);
2264 2161
2265 if (meta_level >= 0) 2162 if (meta_level >= 0)
2266 ocfs2_inode_unlock(inode, meta_level); 2163 ocfs2_inode_unlock(inode, meta_level);
@@ -2272,18 +2169,16 @@ out:
2272static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, 2169static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2273 struct iov_iter *from) 2170 struct iov_iter *from)
2274{ 2171{
2275 int direct_io, appending, rw_level; 2172 int direct_io, rw_level;
2276 int can_do_direct, has_refcount = 0;
2277 ssize_t written = 0; 2173 ssize_t written = 0;
2278 ssize_t ret; 2174 ssize_t ret;
2279 size_t count = iov_iter_count(from), orig_count; 2175 size_t count = iov_iter_count(from);
2280 struct file *file = iocb->ki_filp; 2176 struct file *file = iocb->ki_filp;
2281 struct inode *inode = file_inode(file); 2177 struct inode *inode = file_inode(file);
2282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2283 int full_coherency = !(osb->s_mount_opt & 2179 int full_coherency = !(osb->s_mount_opt &
2284 OCFS2_MOUNT_COHERENCY_BUFFERED); 2180 OCFS2_MOUNT_COHERENCY_BUFFERED);
2285 int unaligned_dio = 0; 2181 void *saved_ki_complete = NULL;
2286 int dropped_dio = 0;
2287 int append_write = ((iocb->ki_pos + count) >= 2182 int append_write = ((iocb->ki_pos + count) >=
2288 i_size_read(inode) ? 1 : 0); 2183 i_size_read(inode) ? 1 : 0);
2289 2184
@@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2296 if (count == 0) 2191 if (count == 0)
2297 return 0; 2192 return 0;
2298 2193
2299 appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
2300 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; 2194 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2301 2195
2302 inode_lock(inode); 2196 inode_lock(inode);
2303 2197
2304relock:
2305 /* 2198 /*
2306 * Concurrent O_DIRECT writes are allowed with 2199 * Concurrent O_DIRECT writes are allowed with
2307 * mount_option "coherency=buffered". 2200 * mount_option "coherency=buffered".
@@ -2334,7 +2227,6 @@ relock:
2334 ocfs2_inode_unlock(inode, 1); 2227 ocfs2_inode_unlock(inode, 1);
2335 } 2228 }
2336 2229
2337 orig_count = iov_iter_count(from);
2338 ret = generic_write_checks(iocb, from); 2230 ret = generic_write_checks(iocb, from);
2339 if (ret <= 0) { 2231 if (ret <= 0) {
2340 if (ret) 2232 if (ret)
@@ -2343,41 +2235,18 @@ relock:
2343 } 2235 }
2344 count = ret; 2236 count = ret;
2345 2237
2346 can_do_direct = direct_io; 2238 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
2347 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
2348 &can_do_direct, &has_refcount);
2349 if (ret < 0) { 2239 if (ret < 0) {
2350 mlog_errno(ret); 2240 mlog_errno(ret);
2351 goto out; 2241 goto out;
2352 } 2242 }
2353 2243
2354 if (direct_io && !is_sync_kiocb(iocb)) 2244 if (direct_io && !is_sync_kiocb(iocb) &&
2355 unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); 2245 ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2356
2357 /*
2358 * We can't complete the direct I/O as requested, fall back to
2359 * buffered I/O.
2360 */
2361 if (direct_io && !can_do_direct) {
2362 ocfs2_rw_unlock(inode, rw_level);
2363
2364 rw_level = -1;
2365
2366 direct_io = 0;
2367 iocb->ki_flags &= ~IOCB_DIRECT;
2368 iov_iter_reexpand(from, orig_count);
2369 dropped_dio = 1;
2370 goto relock;
2371 }
2372
2373 if (unaligned_dio) {
2374 /* 2246 /*
2375 * Wait on previous unaligned aio to complete before 2247 * Make it a sync io if it's an unaligned aio.
2376 * proceeding.
2377 */ 2248 */
2378 mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); 2249 saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2379 /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
2380 ocfs2_iocb_set_unaligned_aio(iocb);
2381 } 2250 }
2382 2251
2383 /* communicate with ocfs2_dio_end_io */ 2252 /* communicate with ocfs2_dio_end_io */
@@ -2398,14 +2267,13 @@ relock:
2398 */ 2267 */
2399 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2268 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2400 rw_level = -1; 2269 rw_level = -1;
2401 unaligned_dio = 0;
2402 } 2270 }
2403 2271
2404 if (unlikely(written <= 0)) 2272 if (unlikely(written <= 0))
2405 goto no_sync; 2273 goto out;
2406 2274
2407 if (((file->f_flags & O_DSYNC) && !direct_io) || 2275 if (((file->f_flags & O_DSYNC) && !direct_io) ||
2408 IS_SYNC(inode) || dropped_dio) { 2276 IS_SYNC(inode)) {
2409 ret = filemap_fdatawrite_range(file->f_mapping, 2277 ret = filemap_fdatawrite_range(file->f_mapping,
2410 iocb->ki_pos - written, 2278 iocb->ki_pos - written,
2411 iocb->ki_pos - 1); 2279 iocb->ki_pos - 1);
@@ -2424,13 +2292,10 @@ relock:
2424 iocb->ki_pos - 1); 2292 iocb->ki_pos - 1);
2425 } 2293 }
2426 2294
2427no_sync:
2428 if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
2429 ocfs2_iocb_clear_unaligned_aio(iocb);
2430 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
2431 }
2432
2433out: 2295out:
2296 if (saved_ki_complete)
2297 xchg(&iocb->ki_complete, saved_ki_complete);
2298
2434 if (rw_level != -1) 2299 if (rw_level != -1)
2435 ocfs2_rw_unlock(inode, rw_level); 2300 ocfs2_rw_unlock(inode, rw_level);
2436 2301
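With ip_unaligned_aio gone, the rewritten write path above handles unaligned AIO by simply making it synchronous: iocb->ki_complete is swapped out with xchg() before the I/O is issued and restored afterwards, so the generic code waits for the write instead of completing it asynchronously. A small stand-alone sketch of that save/clear/restore pattern, using C11 atomics and invented names in place of the kiocb machinery:

#include <stdatomic.h>
#include <stdio.h>

typedef void (*complete_fn)(long result);

static void my_completion(long result)
{
	printf("async completion: %ld\n", result);
}

struct fake_iocb {
	_Atomic(complete_fn) ki_complete;	/* NULL means "complete synchronously" */
};

static long do_write(struct fake_iocb *iocb, long bytes)
{
	/* A NULL ki_complete tells the (pretend) I/O core to wait inline. */
	if (!iocb->ki_complete)
		return bytes;		/* completed synchronously */
	iocb->ki_complete(bytes);	/* would normally fire later, from a worker */
	return -1;			/* stand-in for -EIOCBQUEUED */
}

int main(void)
{
	struct fake_iocb iocb = { .ki_complete = my_completion };
	complete_fn saved;
	long ret;

	/* Unaligned case: force this write to complete synchronously. */
	saved = atomic_exchange(&iocb.ki_complete, NULL);

	ret = do_write(&iocb, 4096);

	/* Put the async completion back for subsequent writes. */
	if (saved)
		atomic_exchange(&iocb.ki_complete, saved);

	printf("wrote %ld bytes synchronously\n", ret);
	return 0;
}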
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ba495beff1c2..12f4a9e9800f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1170,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode)
1170 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 1170 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
1171 "Clear inode of %llu, inode has io markers\n", 1171 "Clear inode of %llu, inode has io markers\n",
1172 (unsigned long long)oi->ip_blkno); 1172 (unsigned long long)oi->ip_blkno);
1173 mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
1174 "Clear inode of %llu, inode has unwritten extents\n",
1175 (unsigned long long)oi->ip_blkno);
1173 1176
1174 ocfs2_extent_map_trunc(inode, 0); 1177 ocfs2_extent_map_trunc(inode, 0);
1175 1178
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 01635e016b3e..d8f3fc8d2551 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,9 +43,6 @@ struct ocfs2_inode_info
43 /* protects extended attribute changes on this inode */ 43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */
47 struct mutex ip_unaligned_aio;
48
49 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
50 spinlock_t ip_lock; 47 spinlock_t ip_lock;
51 u32 ip_open_count; 48 u32 ip_open_count;
@@ -57,6 +54,9 @@ struct ocfs2_inode_info
57 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
58 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
59 56
57 /* Record unwritten extents during direct io. */
58 struct list_head ip_unwritten_list;
59
60 /* protected by recovery_lock. */ 60 /* protected by recovery_lock. */
61 struct inode *ip_next_orphan; 61 struct inode *ip_next_orphan;
62 62
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 61b833b721d8..e607419cdfa4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
231 /* At this point, we know that no more recovery threads can be 231 /* At this point, we know that no more recovery threads can be
232 * launched, so wait for any recovery completion work to 232 * launched, so wait for any recovery completion work to
233 * complete. */ 233 * complete. */
234 flush_workqueue(ocfs2_wq); 234 flush_workqueue(osb->ocfs2_wq);
235 235
236 /* 236 /*
237 * Now that recovery is shut down, and the osb is about to be 237 * Now that recovery is shut down, and the osb is about to be
@@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1326 1326
1327 spin_lock(&journal->j_lock); 1327 spin_lock(&journal->j_lock);
1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
1329 queue_work(ocfs2_wq, &journal->j_recovery_work); 1329 queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
1330 spin_unlock(&journal->j_lock); 1330 spin_unlock(&journal->j_lock);
1331} 1331}
1332 1332
@@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
1968 mutex_lock(&os->os_lock); 1968 mutex_lock(&os->os_lock);
1969 ocfs2_queue_orphan_scan(osb); 1969 ocfs2_queue_orphan_scan(osb);
1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) 1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1971 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 1971 queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
1972 ocfs2_orphan_scan_timeout()); 1972 ocfs2_orphan_scan_timeout());
1973 mutex_unlock(&os->os_lock); 1973 mutex_unlock(&os->os_lock);
1974} 1974}
@@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
2009 else { 2009 else {
2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); 2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
2011 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 2011 queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
2012 ocfs2_orphan_scan_timeout()); 2012 ocfs2_orphan_scan_timeout());
2013 } 2013 }
2014} 2014}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7d62c43a2c3e..fe0d1f9571bb 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
386 struct ocfs2_dinode *alloc = NULL; 386 struct ocfs2_dinode *alloc = NULL;
387 387
388 cancel_delayed_work(&osb->la_enable_wq); 388 cancel_delayed_work(&osb->la_enable_wq);
389 flush_workqueue(ocfs2_wq); 389 flush_workqueue(osb->ocfs2_wq);
390 390
391 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 391 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
392 goto out; 392 goto out;
@@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
1085 } else { 1085 } else {
1086 osb->local_alloc_state = OCFS2_LA_DISABLED; 1086 osb->local_alloc_state = OCFS2_LA_DISABLED;
1087 } 1087 }
1088 queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, 1088 queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
1089 OCFS2_LA_ENABLE_INTERVAL); 1089 OCFS2_LA_ENABLE_INTERVAL);
1090 goto out_unlock; 1090 goto out_unlock;
1091 } 1091 }
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 77ebc2bc1cca..9ea081f4e6e4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
104 if (page->index == last_index) 104 if (page->index == last_index)
105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
106 106
107 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, 107 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
108 &fsdata, di_bh, page); 108 &locked_page, &fsdata, di_bh, page);
109 if (ret) { 109 if (ret) {
110 if (ret != -ENOSPC) 110 if (ret != -ENOSPC)
111 mlog_errno(ret); 111 mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..6cf6538a0651 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
464 struct ocfs2_refcount_tree *osb_ref_tree_lru; 464 struct ocfs2_refcount_tree *osb_ref_tree_lru;
465 465
466 struct mutex system_file_mutex; 466 struct mutex system_file_mutex;
467
468 /*
469 * OCFS2 needs to schedule several different types of work which
470 * require cluster locking, disk I/O, recovery waits, etc. Since these
471 * types of work tend to be heavy we avoid using the kernel events
472 * workqueue and schedule on our own.
473 */
474 struct workqueue_struct *ocfs2_wq;
467}; 475};
468 476
469#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 477#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 24b7e7f591dc..f8f5fc5e6c05 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
1450 1450
1451TRACE_EVENT(ocfs2_prepare_inode_for_write, 1451TRACE_EVENT(ocfs2_prepare_inode_for_write,
1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos, 1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
1453 int appending, unsigned long count, 1453 unsigned long count),
1454 int *direct_io, int *has_refcount), 1454 TP_ARGS(ino, saved_pos, count),
1455 TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
1456 TP_STRUCT__entry( 1455 TP_STRUCT__entry(
1457 __field(unsigned long long, ino) 1456 __field(unsigned long long, ino)
1458 __field(unsigned long long, saved_pos) 1457 __field(unsigned long long, saved_pos)
1459 __field(int, appending)
1460 __field(unsigned long, count) 1458 __field(unsigned long, count)
1461 __field(int, direct_io)
1462 __field(int, has_refcount)
1463 ), 1459 ),
1464 TP_fast_assign( 1460 TP_fast_assign(
1465 __entry->ino = ino; 1461 __entry->ino = ino;
1466 __entry->saved_pos = saved_pos; 1462 __entry->saved_pos = saved_pos;
1467 __entry->appending = appending;
1468 __entry->count = count; 1463 __entry->count = count;
1469 __entry->direct_io = direct_io ? *direct_io : -1;
1470 __entry->has_refcount = has_refcount ? *has_refcount : -1;
1471 ), 1464 ),
1472 TP_printk("%llu %llu %d %lu %d %d", __entry->ino, 1465 TP_printk("%llu %llu %lu", __entry->ino,
1473 __entry->saved_pos, __entry->appending, __entry->count, 1466 __entry->saved_pos, __entry->count)
1474 __entry->direct_io, __entry->has_refcount)
1475); 1467);
1476 1468
1477DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); 1469DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 91bc674203ed..3892f3c079ca 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
726 dqgrab(dquot); 726 dqgrab(dquot);
727 /* First entry on list -> queue work */ 727 /* First entry on list -> queue work */
728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) 728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
729 queue_work(ocfs2_wq, &osb->dquot_drop_work); 729 queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
730 goto out; 730 goto out;
731 } 731 }
732 status = ocfs2_lock_global_qf(oinfo, 1); 732 status = ocfs2_lock_global_qf(oinfo, 1);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 576b9a04873f..18451e0fab81 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { 196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i); 197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
199 if (cluster > clusters) 199 if (cluster >= clusters)
200 break; 200 break;
201 201
202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); 202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ccc9386c42c5..7db631e1c8b0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -80,12 +80,6 @@ static struct kmem_cache *ocfs2_inode_cachep;
80struct kmem_cache *ocfs2_dquot_cachep; 80struct kmem_cache *ocfs2_dquot_cachep;
81struct kmem_cache *ocfs2_qf_chunk_cachep; 81struct kmem_cache *ocfs2_qf_chunk_cachep;
82 82
83/* OCFS2 needs to schedule several different types of work which
84 * require cluster locking, disk I/O, recovery waits, etc. Since these
85 * types of work tend to be heavy we avoid using the kernel events
86 * workqueue and schedule on our own. */
87struct workqueue_struct *ocfs2_wq = NULL;
88
89static struct dentry *ocfs2_debugfs_root; 83static struct dentry *ocfs2_debugfs_root;
90 84
91MODULE_AUTHOR("Oracle"); 85MODULE_AUTHOR("Oracle");
@@ -1613,33 +1607,25 @@ static int __init ocfs2_init(void)
1613 if (status < 0) 1607 if (status < 0)
1614 goto out2; 1608 goto out2;
1615 1609
1616 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
1617 if (!ocfs2_wq) {
1618 status = -ENOMEM;
1619 goto out3;
1620 }
1621
1622 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1610 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1623 if (!ocfs2_debugfs_root) { 1611 if (!ocfs2_debugfs_root) {
1624 status = -ENOMEM; 1612 status = -ENOMEM;
1625 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1613 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1626 goto out4; 1614 goto out3;
1627 } 1615 }
1628 1616
1629 ocfs2_set_locking_protocol(); 1617 ocfs2_set_locking_protocol();
1630 1618
1631 status = register_quota_format(&ocfs2_quota_format); 1619 status = register_quota_format(&ocfs2_quota_format);
1632 if (status < 0) 1620 if (status < 0)
1633 goto out4; 1621 goto out3;
1634 status = register_filesystem(&ocfs2_fs_type); 1622 status = register_filesystem(&ocfs2_fs_type);
1635 if (!status) 1623 if (!status)
1636 return 0; 1624 return 0;
1637 1625
1638 unregister_quota_format(&ocfs2_quota_format); 1626 unregister_quota_format(&ocfs2_quota_format);
1639out4:
1640 destroy_workqueue(ocfs2_wq);
1641 debugfs_remove(ocfs2_debugfs_root);
1642out3: 1627out3:
1628 debugfs_remove(ocfs2_debugfs_root);
1643 ocfs2_free_mem_caches(); 1629 ocfs2_free_mem_caches();
1644out2: 1630out2:
1645 exit_ocfs2_uptodate_cache(); 1631 exit_ocfs2_uptodate_cache();
@@ -1650,11 +1636,6 @@ out1:
1650 1636
1651static void __exit ocfs2_exit(void) 1637static void __exit ocfs2_exit(void)
1652{ 1638{
1653 if (ocfs2_wq) {
1654 flush_workqueue(ocfs2_wq);
1655 destroy_workqueue(ocfs2_wq);
1656 }
1657
1658 unregister_quota_format(&ocfs2_quota_format); 1639 unregister_quota_format(&ocfs2_quota_format);
1659 1640
1660 debugfs_remove(ocfs2_debugfs_root); 1641 debugfs_remove(ocfs2_debugfs_root);
@@ -1745,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data)
1745 spin_lock_init(&oi->ip_lock); 1726 spin_lock_init(&oi->ip_lock);
1746 ocfs2_extent_map_init(&oi->vfs_inode); 1727 ocfs2_extent_map_init(&oi->vfs_inode);
1747 INIT_LIST_HEAD(&oi->ip_io_markers); 1728 INIT_LIST_HEAD(&oi->ip_io_markers);
1729 INIT_LIST_HEAD(&oi->ip_unwritten_list);
1748 oi->ip_dir_start_lookup = 0; 1730 oi->ip_dir_start_lookup = 0;
1749 mutex_init(&oi->ip_unaligned_aio);
1750 init_rwsem(&oi->ip_alloc_sem); 1731 init_rwsem(&oi->ip_alloc_sem);
1751 init_rwsem(&oi->ip_xattr_sem); 1732 init_rwsem(&oi->ip_xattr_sem);
1752 mutex_init(&oi->ip_io_mutex); 1733 mutex_init(&oi->ip_io_mutex);
@@ -2349,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2349 } 2330 }
2350 cleancache_init_shared_fs(sb); 2331 cleancache_init_shared_fs(sb);
2351 2332
2333 osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
2334 if (!osb->ocfs2_wq) {
2335 status = -ENOMEM;
2336 mlog_errno(status);
2337 }
2338
2352bail: 2339bail:
2353 return status; 2340 return status;
2354} 2341}
@@ -2536,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2536{ 2523{
2537 /* This function assumes that the caller has the main osb resource */ 2524 /* This function assumes that the caller has the main osb resource */
2538 2525
2526	/* ocfs2_initialize_super() has already created this workqueue */
2527 if (osb->ocfs2_wq) {
2528 flush_workqueue(osb->ocfs2_wq);
2529 destroy_workqueue(osb->ocfs2_wq);
2530 }
2531
2539 ocfs2_free_slot_info(osb); 2532 ocfs2_free_slot_info(osb);
2540 2533
2541 kfree(osb->osb_orphan_wipes); 2534 kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
26#ifndef OCFS2_SUPER_H 26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H 27#define OCFS2_SUPER_H
28 28
29extern struct workqueue_struct *ocfs2_wq;
30
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, 29int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num); 30 int node_num);
33 31
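The workqueue hunks in journal.c, localalloc.c, quota_global.c, super.c and super.h are all one refactor: the global ocfs2_wq created at module init becomes a per-mount osb->ocfs2_wq, created in ocfs2_initialize_super() and flushed and destroyed in ocfs2_delete_osb(), so queued work for one mount no longer shares a single thread with every other mount. Reduced to just the ownership change, and with deliberately invented stand-in types (this is not the kernel workqueue API), the shape is roughly:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for a workqueue; only the ownership pattern matters here. */
struct workqueue {
	char name[32];
};

struct mount {			/* analogue of struct ocfs2_super */
	struct workqueue *wq;	/* was: one global queue shared by all mounts */
};

static struct workqueue *create_workqueue(const char *name)
{
	struct workqueue *wq = calloc(1, sizeof(*wq));

	if (wq)
		snprintf(wq->name, sizeof(wq->name), "%s", name);
	return wq;
}

static int mount_init(struct mount *m)
{
	m->wq = create_workqueue("ocfs2_wq");	/* one queue per mount */
	return m->wq ? 0 : -1;
}

static void mount_destroy(struct mount *m)
{
	/* a flush of any pending work would happen here, then teardown */
	free(m->wq);
	m->wq = NULL;
}

int main(void)
{
	struct mount a, b;

	if (mount_init(&a) || mount_init(&b))
		return 1;
	printf("each mount owns its own queue: %p vs %p\n",
	       (void *)a.wq, (void *)b.wq);
	mount_destroy(&a);
	mount_destroy(&b);
	return 0;
}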
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8f5a12ab2f2b..339125bb4d2c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -456,7 +456,7 @@
456 *(.entry.text) \ 456 *(.entry.text) \
457 VMLINUX_SYMBOL(__entry_text_end) = .; 457 VMLINUX_SYMBOL(__entry_text_end) = .;
458 458
459#ifdef CONFIG_FUNCTION_GRAPH_TRACER 459#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
460#define IRQENTRY_TEXT \ 460#define IRQENTRY_TEXT \
461 ALIGN_FUNCTION(); \ 461 ALIGN_FUNCTION(); \
462 VMLINUX_SYMBOL(__irqentry_text_start) = .; \ 462 VMLINUX_SYMBOL(__irqentry_text_start) = .; \
@@ -466,6 +466,16 @@
466#define IRQENTRY_TEXT 466#define IRQENTRY_TEXT
467#endif 467#endif
468 468
469#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
470#define SOFTIRQENTRY_TEXT \
471 ALIGN_FUNCTION(); \
472 VMLINUX_SYMBOL(__softirqentry_text_start) = .; \
473 *(.softirqentry.text) \
474 VMLINUX_SYMBOL(__softirqentry_text_end) = .;
475#else
476#define SOFTIRQENTRY_TEXT
477#endif
478
469/* Section used for early init (in .S files) */ 479/* Section used for early init (in .S files) */
470#define HEAD_TEXT *(.head.text) 480#define HEAD_TEXT *(.head.text)
471 481
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 6d9df3f7e334..dea12a6e413b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -811,16 +811,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
811 */ 811 */
812#define __notrace_funcgraph notrace 812#define __notrace_funcgraph notrace
813 813
814/*
815 * We want to which function is an entrypoint of a hardirq.
816 * That will help us to put a signal on output.
817 */
818#define __irq_entry __attribute__((__section__(".irqentry.text")))
819
820/* Limits of hardirq entrypoints */
821extern char __irqentry_text_start[];
822extern char __irqentry_text_end[];
823
824#define FTRACE_NOTRACE_DEPTH 65536 814#define FTRACE_NOTRACE_DEPTH 65536
825#define FTRACE_RETFUNC_DEPTH 50 815#define FTRACE_RETFUNC_DEPTH 50
826#define FTRACE_RETSTACK_ALLOC_SIZE 32 816#define FTRACE_RETSTACK_ALLOC_SIZE 32
@@ -857,7 +847,6 @@ static inline void unpause_graph_tracing(void)
857#else /* !CONFIG_FUNCTION_GRAPH_TRACER */ 847#else /* !CONFIG_FUNCTION_GRAPH_TRACER */
858 848
859#define __notrace_funcgraph 849#define __notrace_funcgraph
860#define __irq_entry
861#define INIT_FTRACE_GRAPH 850#define INIT_FTRACE_GRAPH
862 851
863static inline void ftrace_graph_init_task(struct task_struct *t) { } 852static inline void ftrace_graph_init_task(struct task_struct *t) { }
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 358076eda364..9fcabeb07787 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -683,4 +683,24 @@ extern int early_irq_init(void);
683extern int arch_probe_nr_irqs(void); 683extern int arch_probe_nr_irqs(void);
684extern int arch_early_irq_init(void); 684extern int arch_early_irq_init(void);
685 685
686#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
687/*
688 * We want to know which function is an entrypoint of a hardirq or a softirq.
689 */
690#define __irq_entry __attribute__((__section__(".irqentry.text")))
691#define __softirq_entry \
692 __attribute__((__section__(".softirqentry.text")))
693
694/* Limits of hardirq entrypoints */
695extern char __irqentry_text_start[];
696extern char __irqentry_text_end[];
697/* Limits of softirq entrypoints */
698extern char __softirqentry_text_start[];
699extern char __softirqentry_text_end[];
700
701#else
702#define __irq_entry
703#define __softirq_entry
704#endif
705
686#endif 706#endif
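
The hard/soft IRQ entry markers above exist so that code placed in those sections can be recognized by address alone. A minimal sketch of the bounds check this enables, mirroring in_irqentry_text() added to mm/kasan/kasan.c later in this patch (the helper name here is illustrative):

#include <linux/interrupt.h>
#include <linux/types.h>

/* Sketch: does ptr point into hardirq or softirq entry code? */
static inline bool in_irq_entry_sections(unsigned long ptr)
{
        return (ptr >= (unsigned long)&__irqentry_text_start &&
                ptr < (unsigned long)&__irqentry_text_end) ||
               (ptr >= (unsigned long)&__softirqentry_text_start &&
                ptr < (unsigned long)&__softirqentry_text_end);
}
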
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 0fdc798e3ff7..737371b56044 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -48,19 +48,28 @@ void kasan_unpoison_task_stack(struct task_struct *task);
48void kasan_alloc_pages(struct page *page, unsigned int order); 48void kasan_alloc_pages(struct page *page, unsigned int order);
49void kasan_free_pages(struct page *page, unsigned int order); 49void kasan_free_pages(struct page *page, unsigned int order);
50 50
51void kasan_cache_create(struct kmem_cache *cache, size_t *size,
52 unsigned long *flags);
53
51void kasan_poison_slab(struct page *page); 54void kasan_poison_slab(struct page *page);
52void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); 55void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
53void kasan_poison_object_data(struct kmem_cache *cache, void *object); 56void kasan_poison_object_data(struct kmem_cache *cache, void *object);
54 57
55void kasan_kmalloc_large(const void *ptr, size_t size); 58void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags);
56void kasan_kfree_large(const void *ptr); 59void kasan_kfree_large(const void *ptr);
57void kasan_kfree(void *ptr); 60void kasan_kfree(void *ptr);
58void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); 61void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size,
59void kasan_krealloc(const void *object, size_t new_size); 62 gfp_t flags);
63void kasan_krealloc(const void *object, size_t new_size, gfp_t flags);
60 64
61void kasan_slab_alloc(struct kmem_cache *s, void *object); 65void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
62void kasan_slab_free(struct kmem_cache *s, void *object); 66void kasan_slab_free(struct kmem_cache *s, void *object);
63 67
68struct kasan_cache {
69 int alloc_meta_offset;
70 int free_meta_offset;
71};
72
64int kasan_module_alloc(void *addr, size_t size); 73int kasan_module_alloc(void *addr, size_t size);
65void kasan_free_shadow(const struct vm_struct *vm); 74void kasan_free_shadow(const struct vm_struct *vm);
66 75
@@ -76,20 +85,26 @@ static inline void kasan_disable_current(void) {}
76static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} 85static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
77static inline void kasan_free_pages(struct page *page, unsigned int order) {} 86static inline void kasan_free_pages(struct page *page, unsigned int order) {}
78 87
88static inline void kasan_cache_create(struct kmem_cache *cache,
89 size_t *size,
90 unsigned long *flags) {}
91
79static inline void kasan_poison_slab(struct page *page) {} 92static inline void kasan_poison_slab(struct page *page) {}
80static inline void kasan_unpoison_object_data(struct kmem_cache *cache, 93static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
81 void *object) {} 94 void *object) {}
82static inline void kasan_poison_object_data(struct kmem_cache *cache, 95static inline void kasan_poison_object_data(struct kmem_cache *cache,
83 void *object) {} 96 void *object) {}
84 97
85static inline void kasan_kmalloc_large(void *ptr, size_t size) {} 98static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {}
86static inline void kasan_kfree_large(const void *ptr) {} 99static inline void kasan_kfree_large(const void *ptr) {}
87static inline void kasan_kfree(void *ptr) {} 100static inline void kasan_kfree(void *ptr) {}
88static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, 101static inline void kasan_kmalloc(struct kmem_cache *s, const void *object,
89 size_t size) {} 102 size_t size, gfp_t flags) {}
90static inline void kasan_krealloc(const void *object, size_t new_size) {} 103static inline void kasan_krealloc(const void *object, size_t new_size,
104 gfp_t flags) {}
91 105
92static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} 106static inline void kasan_slab_alloc(struct kmem_cache *s, void *object,
107 gfp_t flags) {}
93static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} 108static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
94 109
95static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } 110static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
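
The signature changes above mean every allocation hook now forwards the caller's GFP mask, so KASAN can allocate its stack-depot storage with compatible flags. A hedged caller-side sketch mirroring the kmem_cache_alloc_trace() change further down (the wrapper name is illustrative):

#include <linux/kasan.h>
#include <linux/slab.h>

static inline void *tracked_cache_alloc(struct kmem_cache *s, size_t size,
                                        gfp_t flags)
{
        void *ret = kmem_cache_alloc(s, flags);

        /* Forward the caller's flags so KASAN metadata allocation obeys them. */
        kasan_kmalloc(s, ret, size, flags);
        return ret;
}
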
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 450fc977ed02..ed6407d1b7b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1132,6 +1132,8 @@ struct zap_details {
1132 struct address_space *check_mapping; /* Check page->mapping if set */ 1132 struct address_space *check_mapping; /* Check page->mapping if set */
1133 pgoff_t first_index; /* Lowest page->index to unmap */ 1133 pgoff_t first_index; /* Lowest page->index to unmap */
1134 pgoff_t last_index; /* Highest page->index to unmap */ 1134 pgoff_t last_index; /* Highest page->index to unmap */
1135 bool ignore_dirty; /* Ignore dirty pages */
1136 bool check_swap_entries; /* Check also swap entries */
1135}; 1137};
1136 1138
1137struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 1139struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 03e6257321f0..628a43242a34 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -76,8 +76,6 @@ extern unsigned long oom_badness(struct task_struct *p,
76 struct mem_cgroup *memcg, const nodemask_t *nodemask, 76 struct mem_cgroup *memcg, const nodemask_t *nodemask,
77 unsigned long totalpages); 77 unsigned long totalpages);
78 78
79extern int oom_kills_count(void);
80extern void note_oom_kill(void);
81extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, 79extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
82 unsigned int points, unsigned long totalpages, 80 unsigned int points, unsigned long totalpages,
83 struct mem_cgroup *memcg, const char *message); 81 struct mem_cgroup *memcg, const char *message);
@@ -91,7 +89,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
91 89
92extern bool out_of_memory(struct oom_control *oc); 90extern bool out_of_memory(struct oom_control *oc);
93 91
94extern void exit_oom_victim(void); 92extern void exit_oom_victim(struct task_struct *tsk);
95 93
96extern int register_oom_notifier(struct notifier_block *nb); 94extern int register_oom_notifier(struct notifier_block *nb);
97extern int unregister_oom_notifier(struct notifier_block *nb); 95extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 589c4780b077..60bba7e032dc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -426,6 +426,7 @@ extern signed long schedule_timeout(signed long timeout);
426extern signed long schedule_timeout_interruptible(signed long timeout); 426extern signed long schedule_timeout_interruptible(signed long timeout);
427extern signed long schedule_timeout_killable(signed long timeout); 427extern signed long schedule_timeout_killable(signed long timeout);
428extern signed long schedule_timeout_uninterruptible(signed long timeout); 428extern signed long schedule_timeout_uninterruptible(signed long timeout);
429extern signed long schedule_timeout_idle(signed long timeout);
429asmlinkage void schedule(void); 430asmlinkage void schedule(void);
430extern void schedule_preempt_disabled(void); 431extern void schedule_preempt_disabled(void);
431 432
@@ -1848,6 +1849,9 @@ struct task_struct {
1848 unsigned long task_state_change; 1849 unsigned long task_state_change;
1849#endif 1850#endif
1850 int pagefault_disabled; 1851 int pagefault_disabled;
1852#ifdef CONFIG_MMU
1853 struct task_struct *oom_reaper_list;
1854#endif
1851/* CPU-specific state of this task */ 1855/* CPU-specific state of this task */
1852 struct thread_struct thread; 1856 struct thread_struct thread;
1853/* 1857/*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index e4b568738ca3..508bd827e6dc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -92,6 +92,12 @@
92# define SLAB_ACCOUNT 0x00000000UL 92# define SLAB_ACCOUNT 0x00000000UL
93#endif 93#endif
94 94
95#ifdef CONFIG_KASAN
96#define SLAB_KASAN 0x08000000UL
97#else
98#define SLAB_KASAN 0x00000000UL
99#endif
100
95/* The following flags affect the page allocator grouping pages by mobility */ 101/* The following flags affect the page allocator grouping pages by mobility */
96#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 102#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
97#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 103#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
@@ -370,7 +376,7 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
370{ 376{
371 void *ret = kmem_cache_alloc(s, flags); 377 void *ret = kmem_cache_alloc(s, flags);
372 378
373 kasan_kmalloc(s, ret, size); 379 kasan_kmalloc(s, ret, size, flags);
374 return ret; 380 return ret;
375} 381}
376 382
@@ -381,7 +387,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
381{ 387{
382 void *ret = kmem_cache_alloc_node(s, gfpflags, node); 388 void *ret = kmem_cache_alloc_node(s, gfpflags, node);
383 389
384 kasan_kmalloc(s, ret, size); 390 kasan_kmalloc(s, ret, size, gfpflags);
385 return ret; 391 return ret;
386} 392}
387#endif /* CONFIG_TRACING */ 393#endif /* CONFIG_TRACING */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index e878ba35ae91..9edbbf352340 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -76,8 +76,22 @@ struct kmem_cache {
76#ifdef CONFIG_MEMCG 76#ifdef CONFIG_MEMCG
77 struct memcg_cache_params memcg_params; 77 struct memcg_cache_params memcg_params;
78#endif 78#endif
79#ifdef CONFIG_KASAN
80 struct kasan_cache kasan_info;
81#endif
79 82
80 struct kmem_cache_node *node[MAX_NUMNODES]; 83 struct kmem_cache_node *node[MAX_NUMNODES];
81}; 84};
82 85
86static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
87 void *x) {
88 void *object = x - (x - page->s_mem) % cache->size;
89 void *last_object = page->s_mem + (cache->num - 1) * cache->size;
90
91 if (unlikely(object > last_object))
92 return last_object;
93 else
94 return object;
95}
96
83#endif /* _LINUX_SLAB_DEF_H */ 97#endif /* _LINUX_SLAB_DEF_H */
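
nearest_obj() above maps an arbitrary pointer into a SLAB page back to the start of the object containing it, clamping into the last object when the pointer lands in trailing padding; the SLUB variant in the next hunk does the same using page_address(). A standalone illustration of the arithmetic with made-up numbers (plain userspace C, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uintptr_t s_mem = 0x1000;   /* hypothetical start of objects in the page */
        size_t obj_size = 192;      /* hypothetical cache->size */
        unsigned num = 21;          /* hypothetical cache->num */

        uintptr_t x = s_mem + 2 * obj_size + 57;          /* pointer into the 3rd object */
        uintptr_t object = x - (x - s_mem) % obj_size;    /* round down to object start */
        uintptr_t last_object = s_mem + (num - 1) * obj_size;

        if (object > last_object)   /* access hit the slab's trailing padding */
                object = last_object;

        printf("object start = %#lx\n", (unsigned long)object); /* prints 0x1180 */
        return 0;
}
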
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index ac5143f95ee6..665cd0cd18b8 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -130,4 +130,15 @@ static inline void *virt_to_obj(struct kmem_cache *s,
130void object_err(struct kmem_cache *s, struct page *page, 130void object_err(struct kmem_cache *s, struct page *page,
131 u8 *object, char *reason); 131 u8 *object, char *reason);
132 132
133static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
134 void *x) {
135 void *object = x - (x - page_address(page)) % cache->size;
136 void *last_object = page_address(page) +
137 (page->objects - 1) * cache->size;
138 if (unlikely(object > last_object))
139 return last_object;
140 else
141 return object;
142}
143
133#endif /* _LINUX_SLUB_DEF_H */ 144#endif /* _LINUX_SLUB_DEF_H */
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
new file mode 100644
index 000000000000..7978b3e2c1e1
--- /dev/null
+++ b/include/linux/stackdepot.h
@@ -0,0 +1,32 @@
1/*
2 * A generic stack depot implementation
3 *
4 * Author: Alexander Potapenko <glider@google.com>
5 * Copyright (C) 2016 Google, Inc.
6 *
7 * Based on code by Dmitry Chernenkov.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 */
20
21#ifndef _LINUX_STACKDEPOT_H
22#define _LINUX_STACKDEPOT_H
23
24typedef u32 depot_stack_handle_t;
25
26struct stack_trace;
27
28depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags);
29
30void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace);
31
32#endif
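
The intended usage is to capture a stack_trace once, deduplicate and persist it with depot_save_stack(), and rehydrate it later with depot_fetch_stack(). A hedged sketch mirroring save_stack() and print_track() added to KASAN later in this series (helper names are illustrative):

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

static depot_stack_handle_t record_current_stack(gfp_t flags)
{
        unsigned long entries[16];
        struct stack_trace trace = {
                .entries = entries,
                .max_entries = ARRAY_SIZE(entries),
        };

        save_stack_trace(&trace);                 /* capture the current stack */
        return depot_save_stack(&trace, flags);   /* dedup + store, 0 on failure */
}

static void print_recorded_stack(depot_stack_handle_t handle)
{
        struct stack_trace trace;

        depot_fetch_stack(handle, &trace);        /* points into depot storage */
        print_stack_trace(&trace, 0);
}
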
diff --git a/kernel/exit.c b/kernel/exit.c
index 953d1a1c0387..fd90195667e1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk)
435 mm_update_next_owner(mm); 435 mm_update_next_owner(mm);
436 mmput(mm); 436 mmput(mm);
437 if (test_thread_flag(TIF_MEMDIE)) 437 if (test_thread_flag(TIF_MEMDIE))
438 exit_oom_victim(); 438 exit_oom_victim(tsk);
439} 439}
440 440
441static struct task_struct *find_alive_thread(struct task_struct *p) 441static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8aae49dd7da8..17caf4b63342 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
227static inline void lockdep_softirq_end(bool in_hardirq) { } 227static inline void lockdep_softirq_end(bool in_hardirq) { }
228#endif 228#endif
229 229
230asmlinkage __visible void __do_softirq(void) 230asmlinkage __visible void __softirq_entry __do_softirq(void)
231{ 231{
232 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 232 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
233 unsigned long old_flags = current->flags; 233 unsigned long old_flags = current->flags;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d1798fa0c743..73164c3aa56b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1566} 1566}
1567EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1567EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1568 1568
1569/*
1570 * Like schedule_timeout_uninterruptible(), except this task will not contribute
1571 * to load average.
1572 */
1573signed long __sched schedule_timeout_idle(signed long timeout)
1574{
1575 __set_current_state(TASK_IDLE);
1576 return schedule_timeout(timeout);
1577}
1578EXPORT_SYMBOL(schedule_timeout_idle);
1579
1569#ifdef CONFIG_HOTPLUG_CPU 1580#ifdef CONFIG_HOTPLUG_CPU
1570static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) 1581static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
1571{ 1582{
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 91d6a63a2ea7..3a0244ff7ea8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/interrupt.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
diff --git a/lib/Kconfig b/lib/Kconfig
index 133ebc0c1773..3cca1222578e 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -536,4 +536,8 @@ config ARCH_HAS_PMEM_API
536config ARCH_HAS_MMIO_FLUSH 536config ARCH_HAS_MMIO_FLUSH
537 bool 537 bool
538 538
539config STACKDEPOT
540 bool
541 select STACKTRACE
542
539endmenu 543endmenu
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 0fee5acd5aa0..67d8c6838ba9 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -5,8 +5,9 @@ if HAVE_ARCH_KASAN
5 5
6config KASAN 6config KASAN
7 bool "KASan: runtime memory debugger" 7 bool "KASan: runtime memory debugger"
8 depends on SLUB_DEBUG 8 depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB)
9 select CONSTRUCTORS 9 select CONSTRUCTORS
10 select STACKDEPOT if SLAB
10 help 11 help
11 Enables kernel address sanitizer - runtime memory debugger, 12 Enables kernel address sanitizer - runtime memory debugger,
12 designed to find out-of-bounds accesses and use-after-free bugs. 13 designed to find out-of-bounds accesses and use-after-free bugs.
@@ -16,6 +17,8 @@ config KASAN
16 This feature consumes about 1/8 of available memory and brings about 17 This feature consumes about 1/8 of available memory and brings about
17 ~x3 performance slowdown. 18 ~x3 performance slowdown.
18 For better error detection enable CONFIG_STACKTRACE. 19 For better error detection enable CONFIG_STACKTRACE.
20 Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB
21 (the resulting kernel does not boot).
19 22
20choice 23choice
21 prompt "Instrumentation type" 24 prompt "Instrumentation type"
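
With the dependency relaxed, KASAN can now be built on top of SLAB. An illustrative .config fragment (not part of this series) for exercising the new combination; STACKDEPOT and STACKTRACE are pulled in by the new select, and DEBUG_SLAB must remain off:

# Illustrative fragment only
CONFIG_SLAB=y
# CONFIG_DEBUG_SLAB is not set
CONFIG_KASAN=y
CONFIG_KASAN_OUTLINE=y
CONFIG_TEST_KASAN=m
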
diff --git a/lib/Makefile b/lib/Makefile
index a1de5b61ff40..7bd6fd436c97 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -181,6 +181,9 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o
181obj-$(CONFIG_STMP_DEVICE) += stmp_device.o 181obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
182obj-$(CONFIG_IRQ_POLL) += irq_poll.o 182obj-$(CONFIG_IRQ_POLL) += irq_poll.o
183 183
184obj-$(CONFIG_STACKDEPOT) += stackdepot.o
185KASAN_SANITIZE_stackdepot.o := n
186
184libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ 187libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
185 fdt_empty_tree.o 188 fdt_empty_tree.o
186$(foreach file, $(libfdt_files), \ 189$(foreach file, $(libfdt_files), \
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
new file mode 100644
index 000000000000..654c9d87e83a
--- /dev/null
+++ b/lib/stackdepot.c
@@ -0,0 +1,284 @@
1/*
2 * Generic stack depot for storing stack traces.
3 *
4 * Some debugging tools need to save stack traces of certain events which can
 5 * be later presented to the user. For example, KASAN needs to save alloc and
6 * free stacks for each object, but storing two stack traces per object
7 * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for
8 * that).
9 *
10 * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc
11 * and free stacks repeat a lot, we save about 100x space.
12 * Stacks are never removed from depot, so we store them contiguously one after
 13 * another in a contiguous memory allocation.
14 *
15 * Author: Alexander Potapenko <glider@google.com>
16 * Copyright (C) 2016 Google, Inc.
17 *
18 * Based on code by Dmitry Chernenkov.
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * version 2 as published by the Free Software Foundation.
23 *
24 * This program is distributed in the hope that it will be useful, but
25 * WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 * General Public License for more details.
28 *
29 */
30
31#include <linux/gfp.h>
32#include <linux/jhash.h>
33#include <linux/kernel.h>
34#include <linux/mm.h>
35#include <linux/percpu.h>
36#include <linux/printk.h>
37#include <linux/slab.h>
38#include <linux/stacktrace.h>
39#include <linux/stackdepot.h>
40#include <linux/string.h>
41#include <linux/types.h>
42
43#define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
44
45#define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */
46#define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER))
47#define STACK_ALLOC_ALIGN 4
48#define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \
49 STACK_ALLOC_ALIGN)
50#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - STACK_ALLOC_OFFSET_BITS)
51#define STACK_ALLOC_SLABS_CAP 1024
52#define STACK_ALLOC_MAX_SLABS \
53 (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \
54 (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP)
55
56/* The compact structure to store the reference to stacks. */
57union handle_parts {
58 depot_stack_handle_t handle;
59 struct {
60 u32 slabindex : STACK_ALLOC_INDEX_BITS;
61 u32 offset : STACK_ALLOC_OFFSET_BITS;
62 };
63};
64
65struct stack_record {
66 struct stack_record *next; /* Link in the hashtable */
 67 u32 hash; /* Hash in the hashtable */
68 u32 size; /* Number of frames in the stack */
69 union handle_parts handle;
70 unsigned long entries[1]; /* Variable-sized array of entries. */
71};
72
73static void *stack_slabs[STACK_ALLOC_MAX_SLABS];
74
75static int depot_index;
76static int next_slab_inited;
77static size_t depot_offset;
78static DEFINE_SPINLOCK(depot_lock);
79
80static bool init_stack_slab(void **prealloc)
81{
82 if (!*prealloc)
83 return false;
84 /*
85 * This smp_load_acquire() pairs with smp_store_release() to
86 * |next_slab_inited| below and in depot_alloc_stack().
87 */
88 if (smp_load_acquire(&next_slab_inited))
89 return true;
90 if (stack_slabs[depot_index] == NULL) {
91 stack_slabs[depot_index] = *prealloc;
92 } else {
93 stack_slabs[depot_index + 1] = *prealloc;
94 /*
95 * This smp_store_release pairs with smp_load_acquire() from
96 * |next_slab_inited| above and in depot_save_stack().
97 */
98 smp_store_release(&next_slab_inited, 1);
99 }
100 *prealloc = NULL;
101 return true;
102}
103
104/* Allocation of a new stack in raw storage */
105static struct stack_record *depot_alloc_stack(unsigned long *entries, int size,
106 u32 hash, void **prealloc, gfp_t alloc_flags)
107{
108 int required_size = offsetof(struct stack_record, entries) +
109 sizeof(unsigned long) * size;
110 struct stack_record *stack;
111
112 required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN);
113
114 if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) {
115 if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) {
116 WARN_ONCE(1, "Stack depot reached limit capacity");
117 return NULL;
118 }
119 depot_index++;
120 depot_offset = 0;
121 /*
122 * smp_store_release() here pairs with smp_load_acquire() from
123 * |next_slab_inited| in depot_save_stack() and
124 * init_stack_slab().
125 */
126 if (depot_index + 1 < STACK_ALLOC_MAX_SLABS)
127 smp_store_release(&next_slab_inited, 0);
128 }
129 init_stack_slab(prealloc);
130 if (stack_slabs[depot_index] == NULL)
131 return NULL;
132
133 stack = stack_slabs[depot_index] + depot_offset;
134
135 stack->hash = hash;
136 stack->size = size;
137 stack->handle.slabindex = depot_index;
138 stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN;
139 memcpy(stack->entries, entries, size * sizeof(unsigned long));
140 depot_offset += required_size;
141
142 return stack;
143}
144
145#define STACK_HASH_ORDER 20
146#define STACK_HASH_SIZE (1L << STACK_HASH_ORDER)
147#define STACK_HASH_MASK (STACK_HASH_SIZE - 1)
148#define STACK_HASH_SEED 0x9747b28c
149
150static struct stack_record *stack_table[STACK_HASH_SIZE] = {
151 [0 ... STACK_HASH_SIZE - 1] = NULL
152};
153
154/* Calculate hash for a stack */
155static inline u32 hash_stack(unsigned long *entries, unsigned int size)
156{
157 return jhash2((u32 *)entries,
158 size * sizeof(unsigned long) / sizeof(u32),
159 STACK_HASH_SEED);
160}
161
162/* Find a stack that is equal to the one stored in entries in the hash */
163static inline struct stack_record *find_stack(struct stack_record *bucket,
164 unsigned long *entries, int size,
165 u32 hash)
166{
167 struct stack_record *found;
168
169 for (found = bucket; found; found = found->next) {
170 if (found->hash == hash &&
171 found->size == size &&
172 !memcmp(entries, found->entries,
173 size * sizeof(unsigned long))) {
174 return found;
175 }
176 }
177 return NULL;
178}
179
180void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace)
181{
182 union handle_parts parts = { .handle = handle };
183 void *slab = stack_slabs[parts.slabindex];
184 size_t offset = parts.offset << STACK_ALLOC_ALIGN;
185 struct stack_record *stack = slab + offset;
186
187 trace->nr_entries = trace->max_entries = stack->size;
188 trace->entries = stack->entries;
189 trace->skip = 0;
190}
191
192/**
193 * depot_save_stack - save stack in a stack depot.
194 * @trace - the stacktrace to save.
195 * @alloc_flags - flags for allocating additional memory if required.
196 *
197 * Returns the handle of the stack struct stored in depot.
198 */
199depot_stack_handle_t depot_save_stack(struct stack_trace *trace,
200 gfp_t alloc_flags)
201{
202 u32 hash;
203 depot_stack_handle_t retval = 0;
204 struct stack_record *found = NULL, **bucket;
205 unsigned long flags;
206 struct page *page = NULL;
207 void *prealloc = NULL;
208
209 if (unlikely(trace->nr_entries == 0))
210 goto fast_exit;
211
212 hash = hash_stack(trace->entries, trace->nr_entries);
213 /* Bad luck, we won't store this stack. */
214 if (hash == 0)
215 goto exit;
216
217 bucket = &stack_table[hash & STACK_HASH_MASK];
218
219 /*
220 * Fast path: look the stack trace up without locking.
221 * The smp_load_acquire() here pairs with smp_store_release() to
222 * |bucket| below.
223 */
224 found = find_stack(smp_load_acquire(bucket), trace->entries,
225 trace->nr_entries, hash);
226 if (found)
227 goto exit;
228
229 /*
 230 * Check if the current or the next stack slab needs to be initialized.
231 * If so, allocate the memory - we won't be able to do that under the
232 * lock.
233 *
234 * The smp_load_acquire() here pairs with smp_store_release() to
235 * |next_slab_inited| in depot_alloc_stack() and init_stack_slab().
236 */
237 if (unlikely(!smp_load_acquire(&next_slab_inited))) {
238 /*
239 * Zero out zone modifiers, as we don't have specific zone
240 * requirements. Keep the flags related to allocation in atomic
241 * contexts and I/O.
242 */
243 alloc_flags &= ~GFP_ZONEMASK;
244 alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
245 page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER);
246 if (page)
247 prealloc = page_address(page);
248 }
249
250 spin_lock_irqsave(&depot_lock, flags);
251
252 found = find_stack(*bucket, trace->entries, trace->nr_entries, hash);
253 if (!found) {
254 struct stack_record *new =
255 depot_alloc_stack(trace->entries, trace->nr_entries,
256 hash, &prealloc, alloc_flags);
257 if (new) {
258 new->next = *bucket;
259 /*
260 * This smp_store_release() pairs with
261 * smp_load_acquire() from |bucket| above.
262 */
263 smp_store_release(bucket, new);
264 found = new;
265 }
266 } else if (prealloc) {
267 /*
268 * We didn't need to store this stack trace, but let's keep
269 * the preallocated memory for the future.
270 */
271 WARN_ON(!init_stack_slab(&prealloc));
272 }
273
274 spin_unlock_irqrestore(&depot_lock, flags);
275exit:
276 if (prealloc) {
277 /* Nobody used this memory, ok to free it. */
278 free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER);
279 }
280 if (found)
281 retval = found->handle.handle;
282fast_exit:
283 return retval;
284}
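
The 32-bit handle returned by depot_save_stack() is simply a packed (slab index, 16-byte-aligned offset) pair, so a record can be located again without storing a pointer. A standalone illustration of the packing, assuming 4 KiB pages (PAGE_SHIFT == 12); all values are made up:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define STACK_ALLOC_ORDER 2                     /* 4 pages per depot slab */
#define STACK_ALLOC_ALIGN 4                     /* records aligned to 16 bytes */
#define OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - STACK_ALLOC_ALIGN)  /* 10 */
#define INDEX_BITS  (32 - OFFSET_BITS)                                    /* 22 */

union handle_parts {
        uint32_t handle;
        struct {
                uint32_t slabindex : INDEX_BITS;
                uint32_t offset    : OFFSET_BITS;
        };
};

int main(void)
{
        union handle_parts h = { 0 };

        h.slabindex = 3;      /* record lives in the 4th 16 KiB slab */
        h.offset = 0x40;      /* record starts at byte 0x40 << 4 == 0x400 */

        printf("handle = %#x -> slab %u, byte offset %#x\n",
               h.handle, h.slabindex, h.offset << STACK_ALLOC_ALIGN);
        return 0;
}
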
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index c32f3b0048dc..82169fbf2453 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -65,11 +65,34 @@ static noinline void __init kmalloc_node_oob_right(void)
65 kfree(ptr); 65 kfree(ptr);
66} 66}
67 67
68static noinline void __init kmalloc_large_oob_right(void) 68#ifdef CONFIG_SLUB
69static noinline void __init kmalloc_pagealloc_oob_right(void)
69{ 70{
70 char *ptr; 71 char *ptr;
71 size_t size = KMALLOC_MAX_CACHE_SIZE + 10; 72 size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
72 73
74 /* Allocate a chunk that does not fit into a SLUB cache to trigger
75 * the page allocator fallback.
76 */
77 pr_info("kmalloc pagealloc allocation: out-of-bounds to right\n");
78 ptr = kmalloc(size, GFP_KERNEL);
79 if (!ptr) {
80 pr_err("Allocation failed\n");
81 return;
82 }
83
84 ptr[size] = 0;
85 kfree(ptr);
86}
87#endif
88
89static noinline void __init kmalloc_large_oob_right(void)
90{
91 char *ptr;
92 size_t size = KMALLOC_MAX_CACHE_SIZE - 256;
93 /* Allocate a chunk that is large enough, but still fits into a slab
94 * and does not trigger the page allocator fallback in SLUB.
95 */
73 pr_info("kmalloc large allocation: out-of-bounds to right\n"); 96 pr_info("kmalloc large allocation: out-of-bounds to right\n");
74 ptr = kmalloc(size, GFP_KERNEL); 97 ptr = kmalloc(size, GFP_KERNEL);
75 if (!ptr) { 98 if (!ptr) {
@@ -271,6 +294,8 @@ static noinline void __init kmalloc_uaf2(void)
271 } 294 }
272 295
273 ptr1[40] = 'x'; 296 ptr1[40] = 'x';
297 if (ptr1 == ptr2)
298 pr_err("Could not detect use-after-free: ptr1 == ptr2\n");
274 kfree(ptr2); 299 kfree(ptr2);
275} 300}
276 301
@@ -324,6 +349,9 @@ static int __init kmalloc_tests_init(void)
324 kmalloc_oob_right(); 349 kmalloc_oob_right();
325 kmalloc_oob_left(); 350 kmalloc_oob_left();
326 kmalloc_node_oob_right(); 351 kmalloc_node_oob_right();
352#ifdef CONFIG_SLUB
353 kmalloc_pagealloc_oob_right();
354#endif
327 kmalloc_large_oob_right(); 355 kmalloc_large_oob_right();
328 kmalloc_oob_krealloc_more(); 356 kmalloc_oob_krealloc_more();
329 kmalloc_oob_krealloc_less(); 357 kmalloc_oob_krealloc_less();
diff --git a/mm/Makefile b/mm/Makefile
index f5e797cbd128..deb467edca2d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,6 +3,7 @@
3# 3#
4 4
5KASAN_SANITIZE_slab_common.o := n 5KASAN_SANITIZE_slab_common.o := n
6KASAN_SANITIZE_slab.o := n
6KASAN_SANITIZE_slub.o := n 7KASAN_SANITIZE_slub.o := n
7 8
8# These files are disabled because they produce non-interesting and/or 9# These files are disabled because they produce non-interesting and/or
diff --git a/mm/filemap.c b/mm/filemap.c
index 7c00f105845e..a8c69c8c0a90 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1840,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1840 ssize_t retval = 0; 1840 ssize_t retval = 0;
1841 loff_t *ppos = &iocb->ki_pos; 1841 loff_t *ppos = &iocb->ki_pos;
1842 loff_t pos = *ppos; 1842 loff_t pos = *ppos;
1843 size_t count = iov_iter_count(iter);
1844
1845 if (!count)
1846 goto out; /* skip atime */
1843 1847
1844 if (iocb->ki_flags & IOCB_DIRECT) { 1848 if (iocb->ki_flags & IOCB_DIRECT) {
1845 struct address_space *mapping = file->f_mapping; 1849 struct address_space *mapping = file->f_mapping;
1846 struct inode *inode = mapping->host; 1850 struct inode *inode = mapping->host;
1847 size_t count = iov_iter_count(iter);
1848 loff_t size; 1851 loff_t size;
1849 1852
1850 if (!count)
1851 goto out; /* skip atime */
1852 size = i_size_read(inode); 1853 size = i_size_read(inode);
1853 retval = filemap_write_and_wait_range(mapping, pos, 1854 retval = filemap_write_and_wait_range(mapping, pos,
1854 pos + count - 1); 1855 pos + count - 1);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fbfb1b8d6726..86f9f8b82f8e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2578,7 +2578,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2578 } 2578 }
2579 khugepaged_node_load[node]++; 2579 khugepaged_node_load[node]++;
2580 if (!PageLRU(page)) { 2580 if (!PageLRU(page)) {
2581 result = SCAN_SCAN_ABORT; 2581 result = SCAN_PAGE_LRU;
2582 goto out_unmap; 2582 goto out_unmap;
2583 } 2583 }
2584 if (PageLocked(page)) { 2584 if (PageLocked(page)) {
diff --git a/mm/internal.h b/mm/internal.h
index 7449392c6faa..b79abb6721cf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,6 +38,11 @@
38void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 38void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
39 unsigned long floor, unsigned long ceiling); 39 unsigned long floor, unsigned long ceiling);
40 40
41void unmap_page_range(struct mmu_gather *tlb,
42 struct vm_area_struct *vma,
43 unsigned long addr, unsigned long end,
44 struct zap_details *details);
45
41extern int __do_page_cache_readahead(struct address_space *mapping, 46extern int __do_page_cache_readahead(struct address_space *mapping,
42 struct file *filp, pgoff_t offset, unsigned long nr_to_read, 47 struct file *filp, pgoff_t offset, unsigned long nr_to_read,
43 unsigned long lookahead_size); 48 unsigned long lookahead_size);
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 1ad20ade8c91..acb3b6c4dd89 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -17,7 +17,9 @@
17#define DISABLE_BRANCH_PROFILING 17#define DISABLE_BRANCH_PROFILING
18 18
19#include <linux/export.h> 19#include <linux/export.h>
20#include <linux/interrupt.h>
20#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kasan.h>
21#include <linux/kernel.h> 23#include <linux/kernel.h>
22#include <linux/kmemleak.h> 24#include <linux/kmemleak.h>
23#include <linux/linkage.h> 25#include <linux/linkage.h>
@@ -32,7 +34,6 @@
32#include <linux/string.h> 34#include <linux/string.h>
33#include <linux/types.h> 35#include <linux/types.h>
34#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
35#include <linux/kasan.h>
36 37
37#include "kasan.h" 38#include "kasan.h"
38#include "../slab.h" 39#include "../slab.h"
@@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order)
334 KASAN_FREE_PAGE); 335 KASAN_FREE_PAGE);
335} 336}
336 337
338#ifdef CONFIG_SLAB
339/*
340 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
341 * For larger allocations larger redzones are used.
342 */
343static size_t optimal_redzone(size_t object_size)
344{
345 int rz =
346 object_size <= 64 - 16 ? 16 :
347 object_size <= 128 - 32 ? 32 :
348 object_size <= 512 - 64 ? 64 :
349 object_size <= 4096 - 128 ? 128 :
350 object_size <= (1 << 14) - 256 ? 256 :
351 object_size <= (1 << 15) - 512 ? 512 :
352 object_size <= (1 << 16) - 1024 ? 1024 : 2048;
353 return rz;
354}
355
356void kasan_cache_create(struct kmem_cache *cache, size_t *size,
357 unsigned long *flags)
358{
359 int redzone_adjust;
360 /* Make sure the adjusted size is still less than
361 * KMALLOC_MAX_CACHE_SIZE.
362 * TODO: this check is only useful for SLAB, but not SLUB. We'll need
363 * to skip it for SLUB when it starts using kasan_cache_create().
364 */
365 if (*size > KMALLOC_MAX_CACHE_SIZE -
366 sizeof(struct kasan_alloc_meta) -
367 sizeof(struct kasan_free_meta))
368 return;
369 *flags |= SLAB_KASAN;
370 /* Add alloc meta. */
371 cache->kasan_info.alloc_meta_offset = *size;
372 *size += sizeof(struct kasan_alloc_meta);
373
374 /* Add free meta. */
375 if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor ||
376 cache->object_size < sizeof(struct kasan_free_meta)) {
377 cache->kasan_info.free_meta_offset = *size;
378 *size += sizeof(struct kasan_free_meta);
379 }
380 redzone_adjust = optimal_redzone(cache->object_size) -
381 (*size - cache->object_size);
382 if (redzone_adjust > 0)
383 *size += redzone_adjust;
384 *size = min(KMALLOC_MAX_CACHE_SIZE,
385 max(*size,
386 cache->object_size +
387 optimal_redzone(cache->object_size)));
388}
389#endif
390
337void kasan_poison_slab(struct page *page) 391void kasan_poison_slab(struct page *page)
338{ 392{
339 kasan_poison_shadow(page_address(page), 393 kasan_poison_shadow(page_address(page),
@@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
351 kasan_poison_shadow(object, 405 kasan_poison_shadow(object,
352 round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), 406 round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
353 KASAN_KMALLOC_REDZONE); 407 KASAN_KMALLOC_REDZONE);
408#ifdef CONFIG_SLAB
409 if (cache->flags & SLAB_KASAN) {
410 struct kasan_alloc_meta *alloc_info =
411 get_alloc_info(cache, object);
412 alloc_info->state = KASAN_STATE_INIT;
413 }
414#endif
354} 415}
355 416
356void kasan_slab_alloc(struct kmem_cache *cache, void *object) 417#ifdef CONFIG_SLAB
418static inline int in_irqentry_text(unsigned long ptr)
357{ 419{
358 kasan_kmalloc(cache, object, cache->object_size); 420 return (ptr >= (unsigned long)&__irqentry_text_start &&
421 ptr < (unsigned long)&__irqentry_text_end) ||
422 (ptr >= (unsigned long)&__softirqentry_text_start &&
423 ptr < (unsigned long)&__softirqentry_text_end);
424}
425
426static inline void filter_irq_stacks(struct stack_trace *trace)
427{
428 int i;
429
430 if (!trace->nr_entries)
431 return;
432 for (i = 0; i < trace->nr_entries; i++)
433 if (in_irqentry_text(trace->entries[i])) {
434 /* Include the irqentry function into the stack. */
435 trace->nr_entries = i + 1;
436 break;
437 }
438}
439
440static inline depot_stack_handle_t save_stack(gfp_t flags)
441{
442 unsigned long entries[KASAN_STACK_DEPTH];
443 struct stack_trace trace = {
444 .nr_entries = 0,
445 .entries = entries,
446 .max_entries = KASAN_STACK_DEPTH,
447 .skip = 0
448 };
449
450 save_stack_trace(&trace);
451 filter_irq_stacks(&trace);
452 if (trace.nr_entries != 0 &&
453 trace.entries[trace.nr_entries-1] == ULONG_MAX)
454 trace.nr_entries--;
455
456 return depot_save_stack(&trace, flags);
457}
458
459static inline void set_track(struct kasan_track *track, gfp_t flags)
460{
461 track->pid = current->pid;
462 track->stack = save_stack(flags);
463}
464
465struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
466 const void *object)
467{
468 BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
469 return (void *)object + cache->kasan_info.alloc_meta_offset;
470}
471
472struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
473 const void *object)
474{
475 BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
476 return (void *)object + cache->kasan_info.free_meta_offset;
477}
478#endif
479
480void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
481{
482 kasan_kmalloc(cache, object, cache->object_size, flags);
359} 483}
360 484
361void kasan_slab_free(struct kmem_cache *cache, void *object) 485void kasan_slab_free(struct kmem_cache *cache, void *object)
@@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
367 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) 491 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
368 return; 492 return;
369 493
494#ifdef CONFIG_SLAB
495 if (cache->flags & SLAB_KASAN) {
496 struct kasan_free_meta *free_info =
497 get_free_info(cache, object);
498 struct kasan_alloc_meta *alloc_info =
499 get_alloc_info(cache, object);
500 alloc_info->state = KASAN_STATE_FREE;
 501 set_track(&free_info->track, GFP_NOWAIT);
502 }
503#endif
504
370 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); 505 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
371} 506}
372 507
373void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) 508void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
509 gfp_t flags)
374{ 510{
375 unsigned long redzone_start; 511 unsigned long redzone_start;
376 unsigned long redzone_end; 512 unsigned long redzone_end;
@@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
386 kasan_unpoison_shadow(object, size); 522 kasan_unpoison_shadow(object, size);
387 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 523 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
388 KASAN_KMALLOC_REDZONE); 524 KASAN_KMALLOC_REDZONE);
525#ifdef CONFIG_SLAB
526 if (cache->flags & SLAB_KASAN) {
527 struct kasan_alloc_meta *alloc_info =
528 get_alloc_info(cache, object);
529
530 alloc_info->state = KASAN_STATE_ALLOC;
531 alloc_info->alloc_size = size;
532 set_track(&alloc_info->track, flags);
533 }
534#endif
389} 535}
390EXPORT_SYMBOL(kasan_kmalloc); 536EXPORT_SYMBOL(kasan_kmalloc);
391 537
392void kasan_kmalloc_large(const void *ptr, size_t size) 538void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
393{ 539{
394 struct page *page; 540 struct page *page;
395 unsigned long redzone_start; 541 unsigned long redzone_start;
@@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size)
408 KASAN_PAGE_REDZONE); 554 KASAN_PAGE_REDZONE);
409} 555}
410 556
411void kasan_krealloc(const void *object, size_t size) 557void kasan_krealloc(const void *object, size_t size, gfp_t flags)
412{ 558{
413 struct page *page; 559 struct page *page;
414 560
@@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size)
418 page = virt_to_head_page(object); 564 page = virt_to_head_page(object);
419 565
420 if (unlikely(!PageSlab(page))) 566 if (unlikely(!PageSlab(page)))
421 kasan_kmalloc_large(object, size); 567 kasan_kmalloc_large(object, size, flags);
422 else 568 else
423 kasan_kmalloc(page->slab_cache, object, size); 569 kasan_kmalloc(page->slab_cache, object, size, flags);
424} 570}
425 571
426void kasan_kfree(void *ptr) 572void kasan_kfree(void *ptr)
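
kasan_cache_create() above grows each object to make room for allocation/free metadata plus an adaptive redzone. A standalone walk-through of the size computation for a hypothetical cache, assuming 16-byte metadata structs on a 64-bit build and ignoring the RCU/ctor cases and the final KMALLOC_MAX_CACHE_SIZE clamp:

#include <stdio.h>
#include <stddef.h>

/* Same table as the patch: bigger objects get bigger redzones. */
static size_t optimal_redzone(size_t object_size)
{
        return object_size <= 64 - 16    ? 16 :
               object_size <= 128 - 32   ? 32 :
               object_size <= 512 - 64   ? 64 :
               object_size <= 4096 - 128 ? 128 :
               object_size <= (1 << 14) - 256  ? 256 :
               object_size <= (1 << 15) - 512  ? 512 :
               object_size <= (1 << 16) - 1024 ? 1024 : 2048;
}

int main(void)
{
        size_t object_size = 100;               /* hypothetical cache->object_size */
        size_t size = 128;                      /* hypothetical incoming *size */
        size_t alloc_meta = 16, free_meta = 16; /* assumed struct sizes */
        size_t alloc_meta_offset;
        long redzone_adjust;

        alloc_meta_offset = size;               /* alloc metadata is appended first */
        size += alloc_meta;

        /* Free metadata only when it cannot live inside the freed object. */
        if (object_size < free_meta)
                size += free_meta;

        redzone_adjust = (long)optimal_redzone(object_size) -
                         (long)(size - object_size);
        if (redzone_adjust > 0)
                size += redzone_adjust;

        /* Prints: alloc_meta_offset=128, final size=164 (redzone=64) */
        printf("alloc_meta_offset=%zu, final size=%zu (redzone=%zu)\n",
               alloc_meta_offset, size, optimal_redzone(object_size));
        return 0;
}
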
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 4f6c62e5c21e..30a2f0ba0e09 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -2,6 +2,7 @@
2#define __MM_KASAN_KASAN_H 2#define __MM_KASAN_KASAN_H
3 3
4#include <linux/kasan.h> 4#include <linux/kasan.h>
5#include <linux/stackdepot.h>
5 6
6#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) 7#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
7#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) 8#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
@@ -54,6 +55,42 @@ struct kasan_global {
54#endif 55#endif
55}; 56};
56 57
58/**
 59 * Structures to keep alloc and free tracks
60 */
61
62enum kasan_state {
63 KASAN_STATE_INIT,
64 KASAN_STATE_ALLOC,
65 KASAN_STATE_FREE
66};
67
68#define KASAN_STACK_DEPTH 64
69
70struct kasan_track {
71 u32 pid;
72 depot_stack_handle_t stack;
73};
74
75struct kasan_alloc_meta {
76 struct kasan_track track;
77 u32 state : 2; /* enum kasan_state */
78 u32 alloc_size : 30;
79 u32 reserved;
80};
81
82struct kasan_free_meta {
83 /* Allocator freelist pointer, unused by KASAN. */
84 void **freelist;
85 struct kasan_track track;
86};
87
88struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
89 const void *object);
90struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
91 const void *object);
92
93
57static inline const void *kasan_shadow_to_mem(const void *shadow_addr) 94static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
58{ 95{
59 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) 96 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 745aa8f36028..60869a5a0124 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -18,6 +18,7 @@
18#include <linux/printk.h> 18#include <linux/printk.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/stackdepot.h>
21#include <linux/stacktrace.h> 22#include <linux/stacktrace.h>
22#include <linux/string.h> 23#include <linux/string.h>
23#include <linux/types.h> 24#include <linux/types.h>
@@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr)
115 sizeof(init_thread_union.stack)); 116 sizeof(init_thread_union.stack));
116} 117}
117 118
119#ifdef CONFIG_SLAB
120static void print_track(struct kasan_track *track)
121{
122 pr_err("PID = %u\n", track->pid);
123 if (track->stack) {
124 struct stack_trace trace;
125
126 depot_fetch_stack(track->stack, &trace);
127 print_stack_trace(&trace, 0);
128 } else {
129 pr_err("(stack is not available)\n");
130 }
131}
132
133static void object_err(struct kmem_cache *cache, struct page *page,
134 void *object, char *unused_reason)
135{
136 struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
137 struct kasan_free_meta *free_info;
138
139 dump_stack();
140 pr_err("Object at %p, in cache %s\n", object, cache->name);
141 if (!(cache->flags & SLAB_KASAN))
142 return;
143 switch (alloc_info->state) {
144 case KASAN_STATE_INIT:
145 pr_err("Object not allocated yet\n");
146 break;
147 case KASAN_STATE_ALLOC:
148 pr_err("Object allocated with size %u bytes.\n",
149 alloc_info->alloc_size);
150 pr_err("Allocation:\n");
151 print_track(&alloc_info->track);
152 break;
153 case KASAN_STATE_FREE:
154 pr_err("Object freed, allocated with size %u bytes\n",
155 alloc_info->alloc_size);
156 free_info = get_free_info(cache, object);
157 pr_err("Allocation:\n");
158 print_track(&alloc_info->track);
159 pr_err("Deallocation:\n");
160 print_track(&free_info->track);
161 break;
162 }
163}
164#endif
165
118static void print_address_description(struct kasan_access_info *info) 166static void print_address_description(struct kasan_access_info *info)
119{ 167{
120 const void *addr = info->access_addr; 168 const void *addr = info->access_addr;
@@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info)
126 if (PageSlab(page)) { 174 if (PageSlab(page)) {
127 void *object; 175 void *object;
128 struct kmem_cache *cache = page->slab_cache; 176 struct kmem_cache *cache = page->slab_cache;
129 void *last_object; 177 object = nearest_obj(cache, page,
130 178 (void *)info->access_addr);
131 object = virt_to_obj(cache, page_address(page), addr);
132 last_object = page_address(page) +
133 page->objects * cache->size;
134
135 if (unlikely(object > last_object))
136 object = last_object; /* we hit into padding */
137
138 object_err(cache, page, object, 179 object_err(cache, page, object,
139 "kasan: bad access detected"); 180 "kasan: bad access detected");
140 return; 181 return;
141 } 182 }
142 dump_page(page, "kasan: bad access detected"); 183 dump_page(page, "kasan: bad access detected");
@@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info)
146 if (!init_task_stack_addr(addr)) 187 if (!init_task_stack_addr(addr))
147 pr_err("Address belongs to variable %pS\n", addr); 188 pr_err("Address belongs to variable %pS\n", addr);
148 } 189 }
149
150 dump_stack(); 190 dump_stack();
151} 191}
152 192
diff --git a/mm/memory.c b/mm/memory.c
index 81dca0083fcd..098f00d05461 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1102,6 +1102,12 @@ again:
1102 1102
1103 if (!PageAnon(page)) { 1103 if (!PageAnon(page)) {
1104 if (pte_dirty(ptent)) { 1104 if (pte_dirty(ptent)) {
1105 /*
1106 * oom_reaper cannot tear down dirty
1107 * pages
1108 */
1109 if (unlikely(details && details->ignore_dirty))
1110 continue;
1105 force_flush = 1; 1111 force_flush = 1;
1106 set_page_dirty(page); 1112 set_page_dirty(page);
1107 } 1113 }
@@ -1120,8 +1126,8 @@ again:
1120 } 1126 }
1121 continue; 1127 continue;
1122 } 1128 }
1123 /* If details->check_mapping, we leave swap entries. */ 1129 /* only check swap_entries if explicitly asked for in details */
1124 if (unlikely(details)) 1130 if (unlikely(details && !details->check_swap_entries))
1125 continue; 1131 continue;
1126 1132
1127 entry = pte_to_swp_entry(ptent); 1133 entry = pte_to_swp_entry(ptent);
@@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1226 return addr; 1232 return addr;
1227} 1233}
1228 1234
1229static void unmap_page_range(struct mmu_gather *tlb, 1235void unmap_page_range(struct mmu_gather *tlb,
1230 struct vm_area_struct *vma, 1236 struct vm_area_struct *vma,
1231 unsigned long addr, unsigned long end, 1237 unsigned long addr, unsigned long end,
1232 struct zap_details *details) 1238 struct zap_details *details)
@@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
1234 pgd_t *pgd; 1240 pgd_t *pgd;
1235 unsigned long next; 1241 unsigned long next;
1236 1242
1237 if (details && !details->check_mapping)
1238 details = NULL;
1239
1240 BUG_ON(addr >= end); 1243 BUG_ON(addr >= end);
1241 tlb_start_vma(tlb, vma); 1244 tlb_start_vma(tlb, vma);
1242 pgd = pgd_offset(vma->vm_mm, addr); 1245 pgd = pgd_offset(vma->vm_mm, addr);
@@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
2432void unmap_mapping_range(struct address_space *mapping, 2435void unmap_mapping_range(struct address_space *mapping,
2433 loff_t const holebegin, loff_t const holelen, int even_cows) 2436 loff_t const holebegin, loff_t const holelen, int even_cows)
2434{ 2437{
2435 struct zap_details details; 2438 struct zap_details details = { };
2436 pgoff_t hba = holebegin >> PAGE_SHIFT; 2439 pgoff_t hba = holebegin >> PAGE_SHIFT;
2437 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2440 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2438 2441
diff --git a/mm/mempool.c b/mm/mempool.c
index 07c383ddbbab..9b7a14a791cc 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element)
112 kasan_free_pages(element, (unsigned long)pool->pool_data); 112 kasan_free_pages(element, (unsigned long)pool->pool_data);
113} 113}
114 114
115static void kasan_unpoison_element(mempool_t *pool, void *element) 115static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
116{ 116{
117 if (pool->alloc == mempool_alloc_slab) 117 if (pool->alloc == mempool_alloc_slab)
118 kasan_slab_alloc(pool->pool_data, element); 118 kasan_slab_alloc(pool->pool_data, element, flags);
119 if (pool->alloc == mempool_kmalloc) 119 if (pool->alloc == mempool_kmalloc)
120 kasan_krealloc(element, (size_t)pool->pool_data); 120 kasan_krealloc(element, (size_t)pool->pool_data, flags);
121 if (pool->alloc == mempool_alloc_pages) 121 if (pool->alloc == mempool_alloc_pages)
122 kasan_alloc_pages(element, (unsigned long)pool->pool_data); 122 kasan_alloc_pages(element, (unsigned long)pool->pool_data);
123} 123}
@@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element)
130 pool->elements[pool->curr_nr++] = element; 130 pool->elements[pool->curr_nr++] = element;
131} 131}
132 132
133static void *remove_element(mempool_t *pool) 133static void *remove_element(mempool_t *pool, gfp_t flags)
134{ 134{
135 void *element = pool->elements[--pool->curr_nr]; 135 void *element = pool->elements[--pool->curr_nr];
136 136
137 BUG_ON(pool->curr_nr < 0); 137 BUG_ON(pool->curr_nr < 0);
138 kasan_unpoison_element(pool, element); 138 kasan_unpoison_element(pool, element, flags);
139 check_element(pool, element); 139 check_element(pool, element);
140 return element; 140 return element;
141} 141}
@@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool)
154 return; 154 return;
155 155
156 while (pool->curr_nr) { 156 while (pool->curr_nr) {
157 void *element = remove_element(pool); 157 void *element = remove_element(pool, GFP_KERNEL);
158 pool->free(element, pool->pool_data); 158 pool->free(element, pool->pool_data);
159 } 159 }
160 kfree(pool->elements); 160 kfree(pool->elements);
@@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
250 spin_lock_irqsave(&pool->lock, flags); 250 spin_lock_irqsave(&pool->lock, flags);
251 if (new_min_nr <= pool->min_nr) { 251 if (new_min_nr <= pool->min_nr) {
252 while (new_min_nr < pool->curr_nr) { 252 while (new_min_nr < pool->curr_nr) {
253 element = remove_element(pool); 253 element = remove_element(pool, GFP_KERNEL);
254 spin_unlock_irqrestore(&pool->lock, flags); 254 spin_unlock_irqrestore(&pool->lock, flags);
255 pool->free(element, pool->pool_data); 255 pool->free(element, pool->pool_data);
256 spin_lock_irqsave(&pool->lock, flags); 256 spin_lock_irqsave(&pool->lock, flags);
@@ -347,7 +347,7 @@ repeat_alloc:
347 347
348 spin_lock_irqsave(&pool->lock, flags); 348 spin_lock_irqsave(&pool->lock, flags);
349 if (likely(pool->curr_nr)) { 349 if (likely(pool->curr_nr)) {
350 element = remove_element(pool); 350 element = remove_element(pool, gfp_temp);
351 spin_unlock_irqrestore(&pool->lock, flags); 351 spin_unlock_irqrestore(&pool->lock, flags);
352 /* paired with rmb in mempool_free(), read comment there */ 352 /* paired with rmb in mempool_free(), read comment there */
353 smp_wmb(); 353 smp_wmb();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 06f7e1707847..b34d279a7ee6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/ftrace.h> 36#include <linux/ftrace.h>
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kthread.h>
39#include <linux/init.h>
40
41#include <asm/tlb.h>
42#include "internal.h"
38 43
39#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
40#include <trace/events/oom.h> 45#include <trace/events/oom.h>
@@ -405,6 +410,172 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
405 410
406bool oom_killer_disabled __read_mostly; 411bool oom_killer_disabled __read_mostly;
407 412
413#define K(x) ((x) << (PAGE_SHIFT-10))
414
415#ifdef CONFIG_MMU
416/*
417 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
418 * victim (if that is possible) to help the OOM killer to move on.
419 */
420static struct task_struct *oom_reaper_th;
421static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
422static struct task_struct *oom_reaper_list;
423static DEFINE_SPINLOCK(oom_reaper_lock);
424
425
426static bool __oom_reap_task(struct task_struct *tsk)
427{
428 struct mmu_gather tlb;
429 struct vm_area_struct *vma;
430 struct mm_struct *mm;
431 struct task_struct *p;
432 struct zap_details details = {.check_swap_entries = true,
433 .ignore_dirty = true};
434 bool ret = true;
435
436 /*
437 * Make sure we find the associated mm_struct even when the particular
438 * thread has already terminated and cleared its mm.
 439 * We might race with the exit path, so consider our work done if there
440 * is no mm.
441 */
442 p = find_lock_task_mm(tsk);
443 if (!p)
444 return true;
445
446 mm = p->mm;
447 if (!atomic_inc_not_zero(&mm->mm_users)) {
448 task_unlock(p);
449 return true;
450 }
451
452 task_unlock(p);
453
454 if (!down_read_trylock(&mm->mmap_sem)) {
455 ret = false;
456 goto out;
457 }
458
459 tlb_gather_mmu(&tlb, mm, 0, -1);
460 for (vma = mm->mmap ; vma; vma = vma->vm_next) {
461 if (is_vm_hugetlb_page(vma))
462 continue;
463
464 /*
465 * mlocked VMAs require explicit munlocking before unmap.
466 * Let's keep it simple here and skip such VMAs.
467 */
468 if (vma->vm_flags & VM_LOCKED)
469 continue;
470
471 /*
472 * Only anonymous pages have a good chance to be dropped
473 * without additional steps which we cannot afford as we
474 * are OOM already.
475 *
476 * We do not even care about fs backed pages because all
477 * which are reclaimable have already been reclaimed and
478 * we do not want to block exit_mmap by keeping mm ref
479 * count elevated without a good reason.
480 */
481 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
482 unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
483 &details);
484 }
485 tlb_finish_mmu(&tlb, 0, -1);
486 pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
487 task_pid_nr(tsk), tsk->comm,
488 K(get_mm_counter(mm, MM_ANONPAGES)),
489 K(get_mm_counter(mm, MM_FILEPAGES)),
490 K(get_mm_counter(mm, MM_SHMEMPAGES)));
491 up_read(&mm->mmap_sem);
492
493 /*
 494 * Clear TIF_MEMDIE because the task shouldn't be sitting on
495 * reasonably reclaimable memory anymore. OOM killer can continue
496 * by selecting other victim if unmapping hasn't led to any
497 * improvements. This also means that selecting this task doesn't
498 * make any sense.
499 */
500 tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
501 exit_oom_victim(tsk);
502out:
503 mmput(mm);
504 return ret;
505}
506
507#define MAX_OOM_REAP_RETRIES 10
508static void oom_reap_task(struct task_struct *tsk)
509{
510 int attempts = 0;
511
512 /* Retry the down_read_trylock(mmap_sem) a few times */
513 while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
514 schedule_timeout_idle(HZ/10);
515
516 if (attempts > MAX_OOM_REAP_RETRIES) {
517 pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
518 task_pid_nr(tsk), tsk->comm);
519 debug_show_all_locks();
520 }
521
522 /* Drop a reference taken by wake_oom_reaper */
523 put_task_struct(tsk);
524}
525
526static int oom_reaper(void *unused)
527{
528 set_freezable();
529
530 while (true) {
531 struct task_struct *tsk = NULL;
532
533 wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
534 spin_lock(&oom_reaper_lock);
535 if (oom_reaper_list != NULL) {
536 tsk = oom_reaper_list;
537 oom_reaper_list = tsk->oom_reaper_list;
538 }
539 spin_unlock(&oom_reaper_lock);
540
541 if (tsk)
542 oom_reap_task(tsk);
543 }
544
545 return 0;
546}
547
548static void wake_oom_reaper(struct task_struct *tsk)
549{
550 if (!oom_reaper_th || tsk->oom_reaper_list)
551 return;
552
553 get_task_struct(tsk);
554
555 spin_lock(&oom_reaper_lock);
556 tsk->oom_reaper_list = oom_reaper_list;
557 oom_reaper_list = tsk;
558 spin_unlock(&oom_reaper_lock);
559 wake_up(&oom_reaper_wait);
560}
561
562static int __init oom_init(void)
563{
564 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
565 if (IS_ERR(oom_reaper_th)) {
566 pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
567 PTR_ERR(oom_reaper_th));
568 oom_reaper_th = NULL;
569 }
570 return 0;
571}
572subsys_initcall(oom_init)
573#else
574static void wake_oom_reaper(struct task_struct *tsk)
575{
576}
577#endif
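The reaper handoff above is a plain intrusive producer/consumer queue:
wake_oom_reaper() pushes the victim onto a singly-linked list under
oom_reaper_lock and wakes the kthread, which pops one entry at a time; the
tsk->oom_reaper_list check keeps a task from being queued twice, and
get_task_struct()/put_task_struct() pin the task across the handoff. A
minimal userspace sketch of the same pattern, with pthreads standing in for
the kernel wait queue (names are illustrative, not kernel API):

#include <pthread.h>
#include <stddef.h>

struct victim {
	struct victim *next;		/* plays the role of ->oom_reaper_list */
	int pid;
};

static struct victim *reap_list;
static pthread_mutex_t reap_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reap_wait = PTHREAD_COND_INITIALIZER;

/* producer: one call per victim, like wake_oom_reaper() */
static void queue_victim(struct victim *v)
{
	pthread_mutex_lock(&reap_lock);
	if (!v->next && v != reap_list) {	/* don't queue the same victim twice */
		v->next = reap_list;
		reap_list = v;
	}
	pthread_mutex_unlock(&reap_lock);
	pthread_cond_signal(&reap_wait);
}

/* consumer loop: like the oom_reaper kthread */
static void *reaper(void *unused)
{
	(void)unused;
	for (;;) {
		struct victim *v;

		pthread_mutex_lock(&reap_lock);
		while (!reap_list)
			pthread_cond_wait(&reap_wait, &reap_lock);
		v = reap_list;
		reap_list = v->next;
		v->next = NULL;
		pthread_mutex_unlock(&reap_lock);

		/* reap v->pid here, retrying a few times as oom_reap_task() does */
	}
	return NULL;
}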
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
@@ -431,9 +602,10 @@ void mark_oom_victim(struct task_struct *tsk)
 /**
  * exit_oom_victim - note the exit of an OOM victim
  */
-void exit_oom_victim(void)
+void exit_oom_victim(struct task_struct *tsk)
 {
-	clear_thread_flag(TIF_MEMDIE);
+	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
+		return;
 
 	if (!atomic_dec_return(&oom_victims))
 		wake_up_all(&oom_victims_wait);
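With the reaper in the picture, exit_oom_victim() can be reached twice for the
same task - once from __oom_reap_task() and once from the normal exit path -
so the oom_victims counter must be dropped by whichever caller actually clears
TIF_MEMDIE, and only by that one. A small userspace sketch of the idempotence
this buys, using C11 atomics (illustrative names, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool tif_memdie = true;	/* set when the task became a victim */
static atomic_int oom_victims = 1;

static void exit_victim(void)
{
	/* only the caller that clears the flag may drop the count */
	if (!atomic_exchange(&tif_memdie, false))
		return;
	if (atomic_fetch_sub(&oom_victims, 1) == 1) {
		/* last victim gone: oom_killer_disable() waiters may proceed */
	}
}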
@@ -494,7 +666,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
-#define K(x) ((x) << (PAGE_SHIFT-10))
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
@@ -510,6 +681,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
+	bool can_oom_reap = true;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -600,17 +772,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			continue;
 		if (same_thread_group(p, victim))
 			continue;
-		if (unlikely(p->flags & PF_KTHREAD))
-			continue;
-		if (is_global_init(p))
-			continue;
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+		    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+			/*
+			 * We cannot use oom_reaper for the mm shared by this
+			 * process because it wouldn't get killed and so the
+			 * memory might still be used.
+			 */
+			can_oom_reap = false;
 			continue;
-
+		}
 		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
 	}
 	rcu_read_unlock();
 
+	if (can_oom_reap)
+		wake_oom_reaper(victim);
+
 	mmdrop(mm);
 	put_task_struct(victim);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a762be57e46e..59de90d5d3a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -692,34 +692,28 @@ static inline void __free_one_page(struct page *page,
 	unsigned long combined_idx;
 	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
-	unsigned int max_order = MAX_ORDER;
+	unsigned int max_order;
+
+	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
 
 	VM_BUG_ON(!zone_is_initialized(zone));
 	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
 
 	VM_BUG_ON(migratetype == -1);
-	if (is_migrate_isolate(migratetype)) {
-		/*
-		 * We restrict max order of merging to prevent merge
-		 * between freepages on isolate pageblock and normal
-		 * pageblock. Without this, pageblock isolation
-		 * could cause incorrect freepage accounting.
-		 */
-		max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
-	} else {
+	if (likely(!is_migrate_isolate(migratetype)))
 		__mod_zone_freepage_state(zone, 1 << order, migratetype);
-	}
 
-	page_idx = pfn & ((1 << max_order) - 1);
+	page_idx = pfn & ((1 << MAX_ORDER) - 1);
 
 	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
+continue_merging:
 	while (order < max_order - 1) {
 		buddy_idx = __find_buddy_index(page_idx, order);
 		buddy = page + (buddy_idx - page_idx);
 		if (!page_is_buddy(page, buddy, order))
-			break;
+			goto done_merging;
 		/*
 		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
 		 * merge with it and move up one order.
@@ -736,6 +730,32 @@ static inline void __free_one_page(struct page *page,
 		page_idx = combined_idx;
 		order++;
 	}
+	if (max_order < MAX_ORDER) {
+		/* If we are here, it means order is >= pageblock_order.
+		 * We want to prevent merge between freepages on isolate
+		 * pageblock and normal pageblock. Without this, pageblock
+		 * isolation could cause incorrect freepage or CMA accounting.
+		 *
+		 * We don't want to hit this code for the more frequent
+		 * low-order merging.
+		 */
+		if (unlikely(has_isolate_pageblock(zone))) {
+			int buddy_mt;
+
+			buddy_idx = __find_buddy_index(page_idx, order);
+			buddy = page + (buddy_idx - page_idx);
+			buddy_mt = get_pageblock_migratetype(buddy);
+
+			if (migratetype != buddy_mt
+					&& (is_migrate_isolate(migratetype) ||
+					is_migrate_isolate(buddy_mt)))
+				goto done_merging;
+		}
+		max_order++;
+		goto continue_merging;
+	}
+
+done_merging:
 	set_page_order(page, order);
 
 	/*
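For reference, the merging loop relies on the usual power-of-two buddy
arithmetic: the buddy of a block at page_idx for a given order is found by
flipping bit 'order' of the index, and the merged block starts at the lower
of the two indices. A standalone sketch of that arithmetic, mirroring what
__find_buddy_index() computes (the concrete indices are just examples):

#include <assert.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);	/* flip the order-th bit */
}

int main(void)
{
	/* order-0 pages 8 and 9 are buddies; they merge into an order-1 block at 8 */
	assert(find_buddy_index(8, 0) == 9);
	assert((find_buddy_index(8, 0) & 8UL) == 8);	/* combined_idx */

	/* order-3 blocks at 8 and 0 are buddies; merging yields order-4 at 0 */
	assert(find_buddy_index(8, 3) == 0);

	/*
	 * Capping max_order at pageblock_order + 1, or bailing to done_merging
	 * when the buddy sits in an isolated pageblock, stops exactly these
	 * cross-pageblock merges and keeps per-migratetype freepage accounting
	 * correct.
	 */
	return 0;
}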
diff --git a/mm/slab.c b/mm/slab.c
index e719a5cb3396..17e2848979c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2086,6 +2086,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 	}
 #endif
 
+	kasan_cache_create(cachep, &size, &flags);
+
 	size = ALIGN(size, cachep->align);
 	/*
 	 * We should restrict the number of objects in a slab to implement
@@ -2387,8 +2389,13 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
 	 * cache which they are a constructor for.  Otherwise, deadlock.
 	 * They must also be threaded.
 	 */
-	if (cachep->ctor && !(cachep->flags & SLAB_POISON))
+	if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
+		kasan_unpoison_object_data(cachep,
+					   objp + obj_offset(cachep));
 		cachep->ctor(objp + obj_offset(cachep));
+		kasan_poison_object_data(
+			cachep, objp + obj_offset(cachep));
+	}
 
 	if (cachep->flags & SLAB_RED_ZONE) {
 		if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2409,6 +2416,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 			    struct page *page)
 {
 	int i;
+	void *objp;
 
 	cache_init_objs_debug(cachep, page);
 
@@ -2419,8 +2427,12 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 	for (i = 0; i < cachep->num; i++) {
 		/* constructor could break poison info */
-		if (DEBUG == 0 && cachep->ctor)
-			cachep->ctor(index_to_obj(cachep, page, i));
+		if (DEBUG == 0 && cachep->ctor) {
+			objp = index_to_obj(cachep, page, i);
+			kasan_unpoison_object_data(cachep, objp);
+			cachep->ctor(objp);
+			kasan_poison_object_data(cachep, objp);
+		}
 
 		set_free_obj(page, i, i);
 	}
@@ -2550,6 +2562,7 @@ static int cache_grow(struct kmem_cache *cachep,
 
 	slab_map_pages(cachep, page, freelist);
 
+	kasan_poison_slab(page);
 	cache_init_objs(cachep, page);
 
 	if (gfpflags_allow_blocking(local_flags))
@@ -3316,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
 {
 	struct array_cache *ac = cpu_cache_get(cachep);
 
+	kasan_slab_free(cachep, objp);
+
 	check_irq_off();
 	kmemleak_free_recursive(objp, cachep->flags);
 	objp = cache_free_debugcheck(cachep, objp, caller);
@@ -3363,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *ret = slab_alloc(cachep, flags, _RET_IP_);
 
+	kasan_slab_alloc(cachep, ret, flags);
 	trace_kmem_cache_alloc(_RET_IP_, ret,
 			       cachep->object_size, cachep->size, flags);
 
@@ -3428,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 
 	ret = slab_alloc(cachep, flags, _RET_IP_);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(_RET_IP_, ret,
 		      size, cachep->size, flags);
 	return ret;
@@ -3451,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
 	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
+	kasan_slab_alloc(cachep, ret, flags);
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    cachep->object_size, cachep->size,
 				    flags, nodeid);
@@ -3469,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 
 	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, cachep->size,
 			   flags, nodeid);
@@ -3481,11 +3500,15 @@ static __always_inline void *
 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 {
 	struct kmem_cache *cachep;
+	void *ret;
 
 	cachep = kmalloc_slab(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	return kmem_cache_alloc_node_trace(cachep, flags, node, size);
+	ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
+	kasan_kmalloc(cachep, ret, size, flags);
+
+	return ret;
 }
 
 void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3519,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 		return cachep;
 	ret = slab_alloc(cachep, flags, caller);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(caller, ret,
 		      size, cachep->size, flags);
 
@@ -4290,10 +4314,18 @@ module_init(slab_proc_init);
  */
 size_t ksize(const void *objp)
 {
+	size_t size;
+
 	BUG_ON(!objp);
 	if (unlikely(objp == ZERO_SIZE_PTR))
 		return 0;
 
-	return virt_to_cache(objp)->object_size;
+	size = virt_to_cache(objp)->object_size;
+	/* We assume that ksize callers could use the whole allocated area,
+	 * so we need to unpoison this area.
+	 */
+	kasan_krealloc(objp, size, GFP_NOWAIT);
+
+	return size;
 }
 EXPORT_SYMBOL(ksize);
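Taken together, the slab.c hooks give every SLAB object the standard KASAN
lifecycle: the whole slab is poisoned when it is grown, an object is
unpoisoned around its constructor and on allocation, poisoned again on free,
and ksize() re-unpoisons the full object because callers may legitimately use
everything it reports. A toy userspace model of the shadow-based idea behind
those hooks (deliberately simplified: byte-granular shadow, invented names,
not the real KASAN implementation):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define POOL_SIZE 64
static unsigned char shadow[POOL_SIZE];	/* nonzero = poisoned, access is a bug */

static void poison(size_t start, size_t len)   { memset(shadow + start, 0xff, len); }
static void unpoison(size_t start, size_t len) { memset(shadow + start, 0x00, len); }

/* what a compiler-instrumented load/store check boils down to */
static bool access_ok(size_t idx)
{
	return idx < POOL_SIZE && shadow[idx] == 0;
}

int main(void)
{
	poison(0, POOL_SIZE);		/* cache_grow(): kasan_poison_slab() */
	unpoison(16, 32);		/* alloc of a 32-byte object at offset 16 */
	assert(access_ok(16) && access_ok(47));
	assert(!access_ok(48));		/* out of bounds: redzone stays poisoned */
	poison(16, 32);			/* free: kasan_slab_free() */
	assert(!access_ok(16));		/* a use-after-free would now be caught */
	return 0;
}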
diff --git a/mm/slab.h b/mm/slab.h
index ff39a8fc3b3f..5969769fbee6 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -405,7 +405,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
 		kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 		kmemleak_alloc_recursive(object, s->object_size, 1,
 					 s->flags, flags);
-		kasan_slab_alloc(s, object);
+		kasan_slab_alloc(s, object, flags);
 	}
 	memcg_kmem_put_cache(s);
 }
diff --git a/mm/slab_common.c b/mm/slab_common.c
index b2e379639a5b..3239bfd758e6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache;
  */
 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
-		SLAB_FAILSLAB)
+		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
 		SLAB_NOTRACK | SLAB_ACCOUNT)
@@ -1013,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	page = alloc_kmem_pages(flags, order);
 	ret = page ? page_address(page) : NULL;
 	kmemleak_alloc(ret, size, 1, flags);
-	kasan_kmalloc_large(ret, size);
+	kasan_kmalloc_large(ret, size, flags);
 	return ret;
 }
 EXPORT_SYMBOL(kmalloc_order);
@@ -1192,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
 	ks = ksize(p);
 
 	if (ks >= new_size) {
-		kasan_krealloc((void *)p, new_size);
+		kasan_krealloc((void *)p, new_size, flags);
 		return (void *)p;
 	}
 
diff --git a/mm/slub.c b/mm/slub.c
index 7277413ebc8b..4dbb109eb8cd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1313,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
 static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
 {
 	kmemleak_alloc(ptr, size, 1, flags);
-	kasan_kmalloc_large(ptr, size);
+	kasan_kmalloc_large(ptr, size, flags);
 }
 
 static inline void kfree_hook(const void *x)
@@ -2596,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
 	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
 	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
-	kasan_kmalloc(s, ret, size);
+	kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2624,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, s->size, gfpflags, node);
 
-	kasan_kmalloc(s, ret, size);
+	kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -3182,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node)
 	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
 	init_tracking(kmem_cache_node, n);
 #endif
-	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
+	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+		      GFP_KERNEL);
 	init_kmem_cache_node(n);
 	inc_slabs_node(kmem_cache_node, node, page->objects);
 
@@ -3561,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 
 	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
-	kasan_kmalloc(s, ret, size);
+	kasan_kmalloc(s, ret, size, flags);
 
 	return ret;
 }
@@ -3606,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 
 	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
-	kasan_kmalloc(s, ret, size);
+	kasan_kmalloc(s, ret, size, flags);
 
 	return ret;
 }
@@ -3635,7 +3636,7 @@ size_t ksize(const void *object)
 	size_t size = __ksize(object);
 	/* We assume that ksize callers could use whole allocated area,
 	   so we need unpoison this area. */
-	kasan_krealloc(object, size);
+	kasan_krealloc(object, size, GFP_NOWAIT);
 	return size;
 }
 EXPORT_SYMBOL(ksize);
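The two ksize() hunks encode the same caller-visible contract: kmalloc()
rounds a request up to its size class, and a caller may use every byte that
ksize() reports, so KASAN has to unpoison the whole object rather than just
the requested length; GFP_NOWAIT is passed because ksize() has no allocation
context of its own to forward. A minimal userspace illustration of that
contract (the 128-byte size class is an assumption for the example):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* stand-in for a kmalloc size class: 100-byte requests come from a 128-byte cache */
static size_t size_class(size_t req)
{
	return req <= 128 ? 128 : req;
}

int main(void)
{
	size_t req = 100;
	size_t usable = size_class(req);	/* what ksize() would report */
	char *p = malloc(usable);

	assert(p && usable == 128);
	memset(p, 0, usable);	/* touches bytes 100..127 too: must not be flagged */
	free(p);
	return 0;
}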