author    Linus Torvalds <torvalds@linux-foundation.org>    2016-03-25 19:59:11 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2016-03-25 19:59:11 -0400
commit    606c61a0579669c292dc5f5e1cf898edecfc0d53
tree      569aa7e9b99571890bfccd7278bbc303cfa0a919
parent    15dbc136dff62ebefb03353cfb7d308d49b275f3
parent    0fda2788b03c1868e2f20b3b7995b8cc2adf4715
Merge branch 'akpm' (patches from Andrew)
Merge fourth patch-bomb from Andrew Morton:
 "A lot more stuff than expected, sorry. A bunch of ocfs2 reviewing was
  finished off.

   - mhocko's oom-reaper out-of-memory-handler changes

   - ocfs2 fixes and features

   - KASAN feature work

   - various fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (42 commits)
  thp: fix typo in khugepaged_scan_pmd()
  MAINTAINERS: fill entries for KASAN
  mm/filemap: generic_file_read_iter(): check for zero reads unconditionally
  kasan: test fix: warn if the UAF could not be detected in kmalloc_uaf2
  mm, kasan: stackdepot implementation. Enable stackdepot for SLAB
  arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections
  mm, kasan: add GFP flags to KASAN API
  mm, kasan: SLAB support
  kasan: modify kmalloc_large_oob_right(), add kmalloc_pagealloc_oob_right()
  include/linux/oom.h: remove undefined oom_kills_count()/note_oom_kill()
  mm/page_alloc: prevent merging between isolated and other pageblocks
  drivers/memstick/host/r592.c: avoid gcc-6 warning
  ocfs2: extend enough credits for freeing one truncate record while replaying truncate records
  ocfs2: extend transaction for ocfs2_remove_rightmost_path() and ocfs2_update_edge_lengths() before to avoid inconsistency between inode and et
  ocfs2/dlm: move lock to the tail of grant queue while doing in-place convert
  ocfs2: solve a problem of crossing the boundary in updating backups
  ocfs2: fix occurring deadlock by changing ocfs2_wq from global to local
  ocfs2/dlm: fix BUG in dlm_move_lockres_to_recovery_list
  ocfs2/dlm: fix race between convert and recovery
  ocfs2: fix a deadlock issue in ocfs2_dio_end_io_write()
  ...
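Several of the commits above extend KASAN's self-tests in lib/test_kasan.c (kmalloc_uaf2, kmalloc_pagealloc_oob_right and friends). As a rough illustration of the bug class those tests exercise, here is a minimal sketch in the style of test_kasan.c; the function names and sizes are invented for illustration and are not the code added by this merge.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

/*
 * Illustrative only: a use-after-free that a CONFIG_KASAN=y kernel
 * should report, similar in spirit to the kmalloc_uaf* tests in
 * lib/test_kasan.c.  Not part of this merge.
 */
static noinline void __init kasan_uaf_example(void)
{
	char *ptr;
	size_t size = 10;

	ptr = kmalloc(size, GFP_KERNEL);
	if (!ptr) {
		pr_err("allocation failed\n");
		return;
	}

	kfree(ptr);
	*(ptr + 8) = 'x';	/* invalid write into freed memory */
}

static int __init kasan_uaf_example_init(void)
{
	kasan_uaf_example();
	return -EAGAIN;	/* fail the load on purpose, as test_kasan.c does */
}
module_init(kasan_uaf_example_init);
MODULE_LICENSE("GPL");

Loading such a module on a KASAN-enabled kernel should produce a use-after-free report pointing at the invalid write, with allocation and free stacks attached.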
-rw-r--r--Documentation/kasan.txt5
-rw-r--r--MAINTAINERS14
-rw-r--r--arch/arm/include/asm/exception.h2
-rw-r--r--arch/arm/kernel/vmlinux.lds.S1
-rw-r--r--arch/arm64/include/asm/exception.h2
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S1
-rw-r--r--arch/blackfin/kernel/vmlinux.lds.S1
-rw-r--r--arch/c6x/kernel/vmlinux.lds.S1
-rw-r--r--arch/metag/kernel/vmlinux.lds.S1
-rw-r--r--arch/microblaze/kernel/vmlinux.lds.S1
-rw-r--r--arch/mips/kernel/vmlinux.lds.S1
-rw-r--r--arch/nios2/kernel/vmlinux.lds.S1
-rw-r--r--arch/openrisc/kernel/vmlinux.lds.S1
-rw-r--r--arch/parisc/kernel/vmlinux.lds.S1
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S1
-rw-r--r--arch/s390/kernel/vmlinux.lds.S1
-rw-r--r--arch/sh/kernel/vmlinux.lds.S1
-rw-r--r--arch/sparc/kernel/vmlinux.lds.S1
-rw-r--r--arch/tile/kernel/vmlinux.lds.S1
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
-rw-r--r--drivers/input/input-compat.c6
-rw-r--r--drivers/input/input-compat.h4
-rw-r--r--drivers/input/input.c2
-rw-r--r--drivers/input/misc/uinput.c4
-rw-r--r--drivers/memstick/host/r592.c3
-rw-r--r--fs/ocfs2/alloc.c105
-rw-r--r--fs/ocfs2/aops.c1146
-rw-r--r--fs/ocfs2/aops.h19
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c30
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/file.c165
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/ocfs2/inode.h6
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/ocfs2.h8
-rw-r--r--fs/ocfs2/ocfs2_trace.h16
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/resize.c2
-rw-r--r--fs/ocfs2/super.c39
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--include/asm-generic/vmlinux.lds.h12
-rw-r--r--include/linux/ftrace.h11
-rw-r--r--include/linux/interrupt.h20
-rw-r--r--include/linux/kasan.h31
-rw-r--r--include/linux/mm.h2
-rw-r--r--include/linux/oom.h4
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/slab.h10
-rw-r--r--include/linux/slab_def.h14
-rw-r--r--include/linux/slub_def.h11
-rw-r--r--include/linux/stackdepot.h32
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/time/timer.c11
-rw-r--r--kernel/trace/trace_functions_graph.c1
-rw-r--r--lib/Kconfig4
-rw-r--r--lib/Kconfig.kasan5
-rw-r--r--lib/Makefile3
-rw-r--r--lib/stackdepot.c284
-rw-r--r--lib/test_kasan.c30
-rw-r--r--mm/Makefile1
-rw-r--r--mm/filemap.c7
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/internal.h5
-rw-r--r--mm/kasan/kasan.c162
-rw-r--r--mm/kasan/kasan.h37
-rw-r--r--mm/kasan/report.c62
-rw-r--r--mm/memory.c17
-rw-r--r--mm/mempool.c16
-rw-r--r--mm/oom_kill.c196
-rw-r--r--mm/page_alloc.c46
-rw-r--r--mm/slab.c42
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slab_common.c6
-rw-r--r--mm/slub.c15
79 files changed, 1770 insertions, 962 deletions
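Two of the files in the stat above are new: include/linux/stackdepot.h and lib/stackdepot.c, which add a deduplicating store for stack traces that the KASAN SLAB work uses to record allocation and free stacks cheaply. Below is a hedged sketch of how a caller would use that interface; the signatures are recalled from the 4.6-era header and may not match this merge exactly.

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>

/* Save the current stack once and get back a small handle. */
static depot_stack_handle_t save_current_stack(gfp_t flags)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.skip		= 2,	/* drop this helper and its caller */
	};

	save_stack_trace(&trace);
	/* Identical traces hash to the same handle, so callers only need
	 * to store a u32 per object instead of a full trace. */
	return depot_save_stack(&trace, flags);
}

/* Later, e.g. from an error report, expand the handle back into frames. */
static void print_saved_stack(depot_stack_handle_t handle)
{
	struct stack_trace trace;

	depot_fetch_stack(handle, &trace);
	print_stack_trace(&trace, 0);
}

The gfp_t argument is also why "mm, kasan: add GFP flags to KASAN API" appears in this series: the allocation context has to be threaded down so the depot can honour it when it grows.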
diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt
index aa1e0c91e368..7dd95b35cd7c 100644
--- a/Documentation/kasan.txt
+++ b/Documentation/kasan.txt
@@ -12,8 +12,7 @@ KASAN uses compile-time instrumentation for checking every memory access,
12therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is 12therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
13required for detection of out-of-bounds accesses to stack or global variables. 13required for detection of out-of-bounds accesses to stack or global variables.
14 14
15Currently KASAN is supported only for x86_64 architecture and requires the 15Currently KASAN is supported only for x86_64 architecture.
16kernel to be built with the SLUB allocator.
17 16
181. Usage 171. Usage
19======== 18========
@@ -27,7 +26,7 @@ inline are compiler instrumentation types. The former produces smaller binary
27the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC 26the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC
28version 5.0 or later. 27version 5.0 or later.
29 28
30Currently KASAN works only with the SLUB memory allocator. 29KASAN works with both SLUB and SLAB memory allocators.
31For better bug detection and nicer reporting, enable CONFIG_STACKTRACE. 30For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
32 31
33To disable instrumentation for specific files or directories, add a line 32To disable instrumentation for specific files or directories, add a line
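The documentation hunk above stops at the sentence about disabling instrumentation for specific files or directories; the mechanism it refers to is the KASAN_SANITIZE kbuild variable. A minimal sketch, with the object name chosen purely as an example:

# In the Makefile that builds the object, skip instrumentation for one file:
KASAN_SANITIZE_main.o := n

# Or for everything built from that Makefile:
KASAN_SANITIZE := n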
diff --git a/MAINTAINERS b/MAINTAINERS
index f07a174bbc81..df8cf6b924c6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6165,6 +6165,20 @@ S: Maintained
6165F: Documentation/hwmon/k8temp 6165F: Documentation/hwmon/k8temp
6166F: drivers/hwmon/k8temp.c 6166F: drivers/hwmon/k8temp.c
6167 6167
6168KASAN
6169M: Andrey Ryabinin <aryabinin@virtuozzo.com>
6170R: Alexander Potapenko <glider@google.com>
6171R: Dmitry Vyukov <dvyukov@google.com>
6172L: kasan-dev@googlegroups.com
6173S: Maintained
6174F: arch/*/include/asm/kasan.h
6175F: arch/*/mm/kasan_init*
6176F: Documentation/kasan.txt
6177F: include/linux/kasan.h
6178F: lib/test_kasan.c
6179F: mm/kasan/
6180F: scripts/Makefile.kasan
6181
6168KCONFIG 6182KCONFIG
6169M: "Yann E. MORIN" <yann.morin.1998@free.fr> 6183M: "Yann E. MORIN" <yann.morin.1998@free.fr>
6170L: linux-kbuild@vger.kernel.org 6184L: linux-kbuild@vger.kernel.org
diff --git a/arch/arm/include/asm/exception.h b/arch/arm/include/asm/exception.h
index 5abaf5bbd985..bf1991263d2d 100644
--- a/arch/arm/include/asm/exception.h
+++ b/arch/arm/include/asm/exception.h
@@ -7,7 +7,7 @@
7#ifndef __ASM_ARM_EXCEPTION_H 7#ifndef __ASM_ARM_EXCEPTION_H
8#define __ASM_ARM_EXCEPTION_H 8#define __ASM_ARM_EXCEPTION_H
9 9
10#include <linux/ftrace.h> 10#include <linux/interrupt.h>
11 11
12#define __exception __attribute__((section(".exception.text"))) 12#define __exception __attribute__((section(".exception.text")))
13#ifdef CONFIG_FUNCTION_GRAPH_TRACER 13#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 1fab979daeaf..e2c6da096cef 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -108,6 +108,7 @@ SECTIONS
108 *(.exception.text) 108 *(.exception.text)
109 __exception_text_end = .; 109 __exception_text_end = .;
110 IRQENTRY_TEXT 110 IRQENTRY_TEXT
111 SOFTIRQENTRY_TEXT
111 TEXT_TEXT 112 TEXT_TEXT
112 SCHED_TEXT 113 SCHED_TEXT
113 LOCK_TEXT 114 LOCK_TEXT
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 6cb7e1a6bc02..0c2eec490abf 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -18,7 +18,7 @@
18#ifndef __ASM_EXCEPTION_H 18#ifndef __ASM_EXCEPTION_H
19#define __ASM_EXCEPTION_H 19#define __ASM_EXCEPTION_H
20 20
21#include <linux/ftrace.h> 21#include <linux/interrupt.h>
22 22
23#define __exception __attribute__((section(".exception.text"))) 23#define __exception __attribute__((section(".exception.text")))
24#ifdef CONFIG_FUNCTION_GRAPH_TRACER 24#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 37f624df68fa..5a1939a74ff3 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -103,6 +103,7 @@ SECTIONS
103 *(.exception.text) 103 *(.exception.text)
104 __exception_text_end = .; 104 __exception_text_end = .;
105 IRQENTRY_TEXT 105 IRQENTRY_TEXT
106 SOFTIRQENTRY_TEXT
106 TEXT_TEXT 107 TEXT_TEXT
107 SCHED_TEXT 108 SCHED_TEXT
108 LOCK_TEXT 109 LOCK_TEXT
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index c9eec84aa258..d920b959ff3a 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -35,6 +35,7 @@ SECTIONS
35#endif 35#endif
36 LOCK_TEXT 36 LOCK_TEXT
37 IRQENTRY_TEXT 37 IRQENTRY_TEXT
38 SOFTIRQENTRY_TEXT
38 KPROBES_TEXT 39 KPROBES_TEXT
39#ifdef CONFIG_ROMKERNEL 40#ifdef CONFIG_ROMKERNEL
40 __sinittext = .; 41 __sinittext = .;
diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S
index 5a6e141d1641..50bc10f97bcb 100644
--- a/arch/c6x/kernel/vmlinux.lds.S
+++ b/arch/c6x/kernel/vmlinux.lds.S
@@ -72,6 +72,7 @@ SECTIONS
72 SCHED_TEXT 72 SCHED_TEXT
73 LOCK_TEXT 73 LOCK_TEXT
74 IRQENTRY_TEXT 74 IRQENTRY_TEXT
75 SOFTIRQENTRY_TEXT
75 KPROBES_TEXT 76 KPROBES_TEXT
76 *(.fixup) 77 *(.fixup)
77 *(.gnu.warning) 78 *(.gnu.warning)
diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S
index e12055e88bfe..150ace92c7ad 100644
--- a/arch/metag/kernel/vmlinux.lds.S
+++ b/arch/metag/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@ SECTIONS
24 LOCK_TEXT 24 LOCK_TEXT
25 KPROBES_TEXT 25 KPROBES_TEXT
26 IRQENTRY_TEXT 26 IRQENTRY_TEXT
27 SOFTIRQENTRY_TEXT
27 *(.text.*) 28 *(.text.*)
28 *(.gnu.warning) 29 *(.gnu.warning)
29 } 30 }
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index be9488d69734..0a47f0410554 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -36,6 +36,7 @@ SECTIONS {
36 LOCK_TEXT 36 LOCK_TEXT
37 KPROBES_TEXT 37 KPROBES_TEXT
38 IRQENTRY_TEXT 38 IRQENTRY_TEXT
39 SOFTIRQENTRY_TEXT
39 . = ALIGN (4) ; 40 . = ALIGN (4) ;
40 _etext = . ; 41 _etext = . ;
41 } 42 }
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 0a93e83cd014..54d653ee17e1 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -58,6 +58,7 @@ SECTIONS
58 LOCK_TEXT 58 LOCK_TEXT
59 KPROBES_TEXT 59 KPROBES_TEXT
60 IRQENTRY_TEXT 60 IRQENTRY_TEXT
61 SOFTIRQENTRY_TEXT
61 *(.text.*) 62 *(.text.*)
62 *(.fixup) 63 *(.fixup)
63 *(.gnu.warning) 64 *(.gnu.warning)
diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S
index 326fab40a9de..e23e89539967 100644
--- a/arch/nios2/kernel/vmlinux.lds.S
+++ b/arch/nios2/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
39 SCHED_TEXT 39 SCHED_TEXT
40 LOCK_TEXT 40 LOCK_TEXT
41 IRQENTRY_TEXT 41 IRQENTRY_TEXT
42 SOFTIRQENTRY_TEXT
42 KPROBES_TEXT 43 KPROBES_TEXT
43 } =0 44 } =0
44 _etext = .; 45 _etext = .;
diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S
index 2d69a853b742..d936de4c07ca 100644
--- a/arch/openrisc/kernel/vmlinux.lds.S
+++ b/arch/openrisc/kernel/vmlinux.lds.S
@@ -50,6 +50,7 @@ SECTIONS
50 LOCK_TEXT 50 LOCK_TEXT
51 KPROBES_TEXT 51 KPROBES_TEXT
52 IRQENTRY_TEXT 52 IRQENTRY_TEXT
53 SOFTIRQENTRY_TEXT
53 *(.fixup) 54 *(.fixup)
54 *(.text.__*) 55 *(.text.__*)
55 _etext = .; 56 _etext = .;
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 308f29081d46..f3ead0b6ce46 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -72,6 +72,7 @@ SECTIONS
72 LOCK_TEXT 72 LOCK_TEXT
73 KPROBES_TEXT 73 KPROBES_TEXT
74 IRQENTRY_TEXT 74 IRQENTRY_TEXT
75 SOFTIRQENTRY_TEXT
75 *(.text.do_softirq) 76 *(.text.do_softirq)
76 *(.text.sys_exit) 77 *(.text.sys_exit)
77 *(.text.do_sigaltstack) 78 *(.text.do_sigaltstack)
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index d41fd0af8980..2dd91f79de05 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -55,6 +55,7 @@ SECTIONS
55 LOCK_TEXT 55 LOCK_TEXT
56 KPROBES_TEXT 56 KPROBES_TEXT
57 IRQENTRY_TEXT 57 IRQENTRY_TEXT
58 SOFTIRQENTRY_TEXT
58 59
59#ifdef CONFIG_PPC32 60#ifdef CONFIG_PPC32
60 *(.got1) 61 *(.got1)
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 445657fe658c..0f41a8286378 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -28,6 +28,7 @@ SECTIONS
28 LOCK_TEXT 28 LOCK_TEXT
29 KPROBES_TEXT 29 KPROBES_TEXT
30 IRQENTRY_TEXT 30 IRQENTRY_TEXT
31 SOFTIRQENTRY_TEXT
31 *(.fixup) 32 *(.fixup)
32 *(.gnu.warning) 33 *(.gnu.warning)
33 } :text = 0x0700 34 } :text = 0x0700
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index db88cbf9eafd..235a4101999f 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -39,6 +39,7 @@ SECTIONS
39 LOCK_TEXT 39 LOCK_TEXT
40 KPROBES_TEXT 40 KPROBES_TEXT
41 IRQENTRY_TEXT 41 IRQENTRY_TEXT
42 SOFTIRQENTRY_TEXT
42 *(.fixup) 43 *(.fixup)
43 *(.gnu.warning) 44 *(.gnu.warning)
44 _etext = .; /* End of text section */ 45 _etext = .; /* End of text section */
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index f1a2f688b28a..aadd321aa05d 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -48,6 +48,7 @@ SECTIONS
48 LOCK_TEXT 48 LOCK_TEXT
49 KPROBES_TEXT 49 KPROBES_TEXT
50 IRQENTRY_TEXT 50 IRQENTRY_TEXT
51 SOFTIRQENTRY_TEXT
51 *(.gnu.warning) 52 *(.gnu.warning)
52 } = 0 53 } = 0
53 _etext = .; 54 _etext = .;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index 0e059a0101ea..378f5d8d1ec8 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -45,6 +45,7 @@ SECTIONS
45 LOCK_TEXT 45 LOCK_TEXT
46 KPROBES_TEXT 46 KPROBES_TEXT
47 IRQENTRY_TEXT 47 IRQENTRY_TEXT
48 SOFTIRQENTRY_TEXT
48 __fix_text_end = .; /* tile-cpack won't rearrange before this */ 49 __fix_text_end = .; /* tile-cpack won't rearrange before this */
49 ALIGN_FUNCTION(); 50 ALIGN_FUNCTION();
50 *(.hottext*) 51 *(.hottext*)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index adaae2c781c1..616ebd22ef9a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,6 +19,7 @@ endif
19KASAN_SANITIZE_head$(BITS).o := n 19KASAN_SANITIZE_head$(BITS).o := n
20KASAN_SANITIZE_dumpstack.o := n 20KASAN_SANITIZE_dumpstack.o := n
21KASAN_SANITIZE_dumpstack_$(BITS).o := n 21KASAN_SANITIZE_dumpstack_$(BITS).o := n
22KASAN_SANITIZE_stacktrace.o := n
22 23
23OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y 24OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
24OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y 25OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d239639e0c1d..4c941f88d405 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -101,6 +101,7 @@ SECTIONS
101 KPROBES_TEXT 101 KPROBES_TEXT
102 ENTRY_TEXT 102 ENTRY_TEXT
103 IRQENTRY_TEXT 103 IRQENTRY_TEXT
104 SOFTIRQENTRY_TEXT
104 *(.fixup) 105 *(.fixup)
105 *(.gnu.warning) 106 *(.gnu.warning)
106 /* End of text section */ 107 /* End of text section */
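Each of the linker-script hunks above adds SOFTIRQENTRY_TEXT next to IRQENTRY_TEXT so that softirq entry code is emitted into its own section; the series then lets KASAN recognize those section boundaries and filter hard/soft IRQ entry frames out of saved stack traces. A paraphrased sketch of how the pieces fit together, assuming the conventional definitions in include/linux/interrupt.h, kernel/softirq.c and include/asm-generic/vmlinux.lds.h rather than quoting the patch verbatim:

/* include/linux/interrupt.h (paraphrased): tag softirq entry points so
 * the compiler emits them into a dedicated .softirqentry.text section,
 * mirroring the existing __irq_entry annotation. */
#define __softirq_entry	__attribute__((__section__(".softirqentry.text")))

/* kernel/softirq.c: the main softirq handler picks up the annotation. */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
	/* ... handle pending softirqs ... */
}

/* include/asm-generic/vmlinux.lds.h (sketch): collect that section and
 * bracket it with start/end symbols so code can test whether an address
 * belongs to softirq entry code. */
#define SOFTIRQENTRY_TEXT					\
		ALIGN_FUNCTION();				\
		VMLINUX_SYMBOL(__softirqentry_text_start) = .;	\
		*(.softirqentry.text)				\
		VMLINUX_SYMBOL(__softirqentry_text_end) = .;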
diff --git a/drivers/input/input-compat.c b/drivers/input/input-compat.c
index 64ca7113ff28..d84d20b9cec0 100644
--- a/drivers/input/input-compat.c
+++ b/drivers/input/input-compat.c
@@ -17,7 +17,7 @@
17int input_event_from_user(const char __user *buffer, 17int input_event_from_user(const char __user *buffer,
18 struct input_event *event) 18 struct input_event *event)
19{ 19{
20 if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) { 20 if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
21 struct input_event_compat compat_event; 21 struct input_event_compat compat_event;
22 22
23 if (copy_from_user(&compat_event, buffer, 23 if (copy_from_user(&compat_event, buffer,
@@ -41,7 +41,7 @@ int input_event_from_user(const char __user *buffer,
41int input_event_to_user(char __user *buffer, 41int input_event_to_user(char __user *buffer,
42 const struct input_event *event) 42 const struct input_event *event)
43{ 43{
44 if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) { 44 if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
45 struct input_event_compat compat_event; 45 struct input_event_compat compat_event;
46 46
47 compat_event.time.tv_sec = event->time.tv_sec; 47 compat_event.time.tv_sec = event->time.tv_sec;
@@ -65,7 +65,7 @@ int input_event_to_user(char __user *buffer,
65int input_ff_effect_from_user(const char __user *buffer, size_t size, 65int input_ff_effect_from_user(const char __user *buffer, size_t size,
66 struct ff_effect *effect) 66 struct ff_effect *effect)
67{ 67{
68 if (INPUT_COMPAT_TEST) { 68 if (in_compat_syscall()) {
69 struct ff_effect_compat *compat_effect; 69 struct ff_effect_compat *compat_effect;
70 70
71 if (size != sizeof(struct ff_effect_compat)) 71 if (size != sizeof(struct ff_effect_compat))
diff --git a/drivers/input/input-compat.h b/drivers/input/input-compat.h
index 0f25878d5fa2..1563160a7af3 100644
--- a/drivers/input/input-compat.h
+++ b/drivers/input/input-compat.h
@@ -17,8 +17,6 @@
17 17
18#ifdef CONFIG_COMPAT 18#ifdef CONFIG_COMPAT
19 19
20#define INPUT_COMPAT_TEST in_compat_syscall()
21
22struct input_event_compat { 20struct input_event_compat {
23 struct compat_timeval time; 21 struct compat_timeval time;
24 __u16 type; 22 __u16 type;
@@ -57,7 +55,7 @@ struct ff_effect_compat {
57 55
58static inline size_t input_event_size(void) 56static inline size_t input_event_size(void)
59{ 57{
60 return (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) ? 58 return (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) ?
61 sizeof(struct input_event_compat) : sizeof(struct input_event); 59 sizeof(struct input_event_compat) : sizeof(struct input_event);
62} 60}
63 61
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 880605959aa6..b87ffbd4547d 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1015,7 +1015,7 @@ static int input_bits_to_string(char *buf, int buf_size,
1015{ 1015{
1016 int len = 0; 1016 int len = 0;
1017 1017
1018 if (INPUT_COMPAT_TEST) { 1018 if (in_compat_syscall()) {
1019 u32 dword = bits >> 32; 1019 u32 dword = bits >> 32;
1020 if (dword || !skip_empty) 1020 if (dword || !skip_empty)
1021 len += snprintf(buf, buf_size, "%x ", dword); 1021 len += snprintf(buf, buf_size, "%x ", dword);
diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index 4eb9e4d94f46..abe1a927b332 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -664,7 +664,7 @@ struct uinput_ff_upload_compat {
664static int uinput_ff_upload_to_user(char __user *buffer, 664static int uinput_ff_upload_to_user(char __user *buffer,
665 const struct uinput_ff_upload *ff_up) 665 const struct uinput_ff_upload *ff_up)
666{ 666{
667 if (INPUT_COMPAT_TEST) { 667 if (in_compat_syscall()) {
668 struct uinput_ff_upload_compat ff_up_compat; 668 struct uinput_ff_upload_compat ff_up_compat;
669 669
670 ff_up_compat.request_id = ff_up->request_id; 670 ff_up_compat.request_id = ff_up->request_id;
@@ -695,7 +695,7 @@ static int uinput_ff_upload_to_user(char __user *buffer,
695static int uinput_ff_upload_from_user(const char __user *buffer, 695static int uinput_ff_upload_from_user(const char __user *buffer,
696 struct uinput_ff_upload *ff_up) 696 struct uinput_ff_upload *ff_up)
697{ 697{
698 if (INPUT_COMPAT_TEST) { 698 if (in_compat_syscall()) {
699 struct uinput_ff_upload_compat ff_up_compat; 699 struct uinput_ff_upload_compat ff_up_compat;
700 700
701 if (copy_from_user(&ff_up_compat, buffer, 701 if (copy_from_user(&ff_up_compat, buffer,
diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c
index ef09ba0289d7..d5cfb503b9d6 100644
--- a/drivers/memstick/host/r592.c
+++ b/drivers/memstick/host/r592.c
@@ -298,8 +298,7 @@ static int r592_transfer_fifo_dma(struct r592_device *dev)
298 sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ? 298 sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ?
299 PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); 299 PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE);
300 300
301 if (sg_count != 1 || 301 if (sg_count != 1 || sg_dma_len(&dev->req->sg) < R592_LFIFO_SIZE) {
302 (sg_dma_len(&dev->req->sg) < dev->req->sg.length)) {
303 message("problem in dma_map_sg"); 302 message("problem in dma_map_sg");
304 return -EIO; 303 return -EIO;
305 } 304 }
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d002579c6f2b..70907d638b60 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2516 struct ocfs2_extent_block *eb; 2516 struct ocfs2_extent_block *eb;
2517 u32 range; 2517 u32 range;
2518 2518
2519 /*
2520 * In normal tree rotation process, we will never touch the
2521 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2522 * doesn't reserve the credits for them either.
2523 *
2524 * But we do have a special case here which will update the rightmost
2525 * records for all the bh in the path.
2526 * So we have to allocate extra credits and access them.
2527 */
2528 ret = ocfs2_extend_trans(handle, subtree_index);
2529 if (ret) {
2530 mlog_errno(ret);
2531 goto out;
2532 }
2533
2534 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 2519 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2535 if (ret) { 2520 if (ret) {
2536 mlog_errno(ret); 2521 mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
2956 right_path->p_node[subtree_root].bh->b_blocknr, 2941 right_path->p_node[subtree_root].bh->b_blocknr,
2957 right_path->p_tree_depth); 2942 right_path->p_tree_depth);
2958 2943
2959 ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 2944 ret = ocfs2_extend_rotate_transaction(handle, 0,
2960 orig_credits, left_path); 2945 orig_credits, left_path);
2961 if (ret) { 2946 if (ret) {
2962 mlog_errno(ret); 2947 mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
3029 struct ocfs2_extent_block *eb; 3014 struct ocfs2_extent_block *eb;
3030 struct ocfs2_extent_list *el; 3015 struct ocfs2_extent_list *el;
3031 3016
3032
3033 ret = ocfs2_et_sanity_check(et); 3017 ret = ocfs2_et_sanity_check(et);
3034 if (ret) 3018 if (ret)
3035 goto out; 3019 goto out;
3036 /*
3037 * There's two ways we handle this depending on
3038 * whether path is the only existing one.
3039 */
3040 ret = ocfs2_extend_rotate_transaction(handle, 0,
3041 handle->h_buffer_credits,
3042 path);
3043 if (ret) {
3044 mlog_errno(ret);
3045 goto out;
3046 }
3047 3020
3048 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 3021 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3049 if (ret) { 3022 if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3641 */ 3614 */
3642 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3615 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3643 le16_to_cpu(el->l_next_free_rec) == 1) { 3616 le16_to_cpu(el->l_next_free_rec) == 1) {
3617 /* extend credit for ocfs2_remove_rightmost_path */
3618 ret = ocfs2_extend_rotate_transaction(handle, 0,
3619 handle->h_buffer_credits,
3620 right_path);
3621 if (ret) {
3622 mlog_errno(ret);
3623 goto out;
3624 }
3644 3625
3645 ret = ocfs2_remove_rightmost_path(handle, et, 3626 ret = ocfs2_remove_rightmost_path(handle, et,
3646 right_path, 3627 right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3679 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3660 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3680 3661
3681 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3662 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3663 /* extend credit for ocfs2_remove_rightmost_path */
3664 ret = ocfs2_extend_rotate_transaction(handle, 0,
3665 handle->h_buffer_credits,
3666 path);
3667 if (ret) {
3668 mlog_errno(ret);
3669 goto out;
3670 }
3682 /* 3671 /*
3683 * The merge code will need to create an empty 3672 * The merge code will need to create an empty
3684 * extent to take the place of the newly 3673 * extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3727 */ 3716 */
3728 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3717 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3729 3718
3719 /* extend credit for ocfs2_remove_rightmost_path */
3720 ret = ocfs2_extend_rotate_transaction(handle, 0,
3721 handle->h_buffer_credits,
3722 path);
3723 if (ret) {
3724 mlog_errno(ret);
3725 goto out;
3726 }
3727
3730 /* The merge left us with an empty extent, remove it. */ 3728 /* The merge left us with an empty extent, remove it. */
3731 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3729 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3732 if (ret) { 3730 if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3748 goto out; 3746 goto out;
3749 } 3747 }
3750 3748
3749 /* extend credit for ocfs2_remove_rightmost_path */
3750 ret = ocfs2_extend_rotate_transaction(handle, 0,
3751 handle->h_buffer_credits,
3752 path);
3753 if (ret) {
3754 mlog_errno(ret);
3755 goto out;
3756 }
3757
3751 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3758 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3752 /* 3759 /*
3753 * Error from this last rotate is not critical, so 3760 * Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3783 } 3790 }
3784 3791
3785 if (ctxt->c_split_covers_rec) { 3792 if (ctxt->c_split_covers_rec) {
3793 /* extend credit for ocfs2_remove_rightmost_path */
3794 ret = ocfs2_extend_rotate_transaction(handle, 0,
3795 handle->h_buffer_credits,
3796 path);
3797 if (ret) {
3798 mlog_errno(ret);
3799 ret = 0;
3800 goto out;
3801 }
3802
3786 /* 3803 /*
3787 * The merge may have left an empty extent in 3804 * The merge may have left an empty extent in
3788 * our leaf. Try to rotate it away. 3805 * our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
5342 struct ocfs2_extent_block *eb; 5359 struct ocfs2_extent_block *eb;
5343 5360
5344 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5361 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5362 /* extend credit for ocfs2_remove_rightmost_path */
5363 ret = ocfs2_extend_rotate_transaction(handle, 0,
5364 handle->h_buffer_credits,
5365 path);
5366 if (ret) {
5367 mlog_errno(ret);
5368 goto out;
5369 }
5370
5345 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 5371 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5346 if (ret) { 5372 if (ret) {
5347 mlog_errno(ret); 5373 mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5928 5954
5929 ocfs2_journal_dirty(handle, tl_bh); 5955 ocfs2_journal_dirty(handle, tl_bh);
5930 5956
5931 /* TODO: Perhaps we can calculate the bulk of the
5932 * credits up front rather than extending like
5933 * this. */
5934 status = ocfs2_extend_trans(handle,
5935 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5936 if (status < 0) {
5937 mlog_errno(status);
5938 goto bail;
5939 }
5940
5941 rec = tl->tl_recs[i]; 5957 rec = tl->tl_recs[i];
5942 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5958 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5943 le32_to_cpu(rec.t_start)); 5959 le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5958 goto bail; 5974 goto bail;
5959 } 5975 }
5960 } 5976 }
5977
5978 status = ocfs2_extend_trans(handle,
5979 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5980 if (status < 0) {
5981 mlog_errno(status);
5982 goto bail;
5983 }
5961 i--; 5984 i--;
5962 } 5985 }
5963 5986
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6016 goto out_mutex; 6039 goto out_mutex;
6017 } 6040 }
6018 6041
6019 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 6042 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
6020 if (IS_ERR(handle)) { 6043 if (IS_ERR(handle)) {
6021 status = PTR_ERR(handle); 6044 status = PTR_ERR(handle);
6022 mlog_errno(status); 6045 mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6079 if (cancel) 6102 if (cancel)
6080 cancel_delayed_work(&osb->osb_truncate_log_wq); 6103 cancel_delayed_work(&osb->osb_truncate_log_wq);
6081 6104
6082 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 6105 queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
6083 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 6106 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6084 } 6107 }
6085} 6108}
@@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6253 6276
6254 if (tl_inode) { 6277 if (tl_inode) {
6255 cancel_delayed_work(&osb->osb_truncate_log_wq); 6278 cancel_delayed_work(&osb->osb_truncate_log_wq);
6256 flush_workqueue(ocfs2_wq); 6279 flush_workqueue(osb->ocfs2_wq);
6257 6280
6258 status = ocfs2_flush_truncate_log(osb); 6281 status = ocfs2_flush_truncate_log(osb);
6259 if (status < 0) 6282 if (status < 0)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 043110e5212d..1581240a7ca0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,158 +499,6 @@ bail:
499 return status; 499 return status;
500} 500}
501 501
502/*
503 * TODO: Make this into a generic get_blocks function.
504 *
505 * From do_direct_io in direct-io.c:
506 * "So what we do is to permit the ->get_blocks function to populate
507 * bh.b_size with the size of IO which is permitted at this offset and
508 * this i_blkbits."
509 *
510 * This function is called directly from get_more_blocks in direct-io.c.
511 *
512 * called like this: dio->get_blocks(dio->inode, fs_startblk,
513 * fs_count, map_bh, dio->rw == WRITE);
514 */
515static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
516 struct buffer_head *bh_result, int create)
517{
518 int ret;
519 u32 cpos = 0;
520 int alloc_locked = 0;
521 u64 p_blkno, inode_blocks, contig_blocks;
522 unsigned int ext_flags;
523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
525 unsigned long len = bh_result->b_size;
526 unsigned int clusters_to_alloc = 0, contig_clusters = 0;
527
528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
529
530 /* This function won't even be called if the request isn't all
531 * nicely aligned and of the right size, so there's no need
532 * for us to check any of that. */
533
534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
535
536 down_read(&OCFS2_I(inode)->ip_alloc_sem);
537
538 /* This figures out the size of the next contiguous block, and
539 * our logical offset */
540 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
541 &contig_blocks, &ext_flags);
542 up_read(&OCFS2_I(inode)->ip_alloc_sem);
543
544 if (ret) {
545 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
546 (unsigned long long)iblock);
547 ret = -EIO;
548 goto bail;
549 }
550
551 /* We should already CoW the refcounted extent in case of create. */
552 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
553
554 /* allocate blocks if no p_blkno is found, and create == 1 */
555 if (!p_blkno && create) {
556 ret = ocfs2_inode_lock(inode, NULL, 1);
557 if (ret < 0) {
558 mlog_errno(ret);
559 goto bail;
560 }
561
562 alloc_locked = 1;
563
564 down_write(&OCFS2_I(inode)->ip_alloc_sem);
565
566 /* fill hole, allocate blocks can't be larger than the size
567 * of the hole */
568 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
569 contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
570 contig_blocks);
571 if (clusters_to_alloc > contig_clusters)
572 clusters_to_alloc = contig_clusters;
573
574 /* allocate extent and insert them into the extent tree */
575 ret = ocfs2_extend_allocation(inode, cpos,
576 clusters_to_alloc, 0);
577 if (ret < 0) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 mlog_errno(ret);
580 goto bail;
581 }
582
583 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
584 &contig_blocks, &ext_flags);
585 if (ret < 0) {
586 up_write(&OCFS2_I(inode)->ip_alloc_sem);
587 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
588 (unsigned long long)iblock);
589 ret = -EIO;
590 goto bail;
591 }
592 set_buffer_new(bh_result);
593 up_write(&OCFS2_I(inode)->ip_alloc_sem);
594 }
595
596 /*
597 * get_more_blocks() expects us to describe a hole by clearing
598 * the mapped bit on bh_result().
599 *
600 * Consider an unwritten extent as a hole.
601 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno);
604 else
605 clear_buffer_mapped(bh_result);
606
607 /* make sure we don't map more than max_blocks blocks here as
608 that's all the kernel will handle at this point. */
609 if (max_blocks < contig_blocks)
610 contig_blocks = max_blocks;
611 bh_result->b_size = contig_blocks << blocksize_bits;
612bail:
613 if (alloc_locked)
614 ocfs2_inode_unlock(inode, 1);
615 return ret;
616}
617
618/*
619 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
620 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
621 * to protect io on one node from truncation on another.
622 */
623static int ocfs2_dio_end_io(struct kiocb *iocb,
624 loff_t offset,
625 ssize_t bytes,
626 void *private)
627{
628 struct inode *inode = file_inode(iocb->ki_filp);
629 int level;
630
631 if (bytes <= 0)
632 return 0;
633
634 /* this io's submitter should not have unlocked this before we could */
635 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
636
637 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
638 ocfs2_iocb_clear_unaligned_aio(iocb);
639
640 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
641 }
642
643 /* Let rw unlock to be done later to protect append direct io write */
644 if (offset + bytes <= i_size_read(inode)) {
645 ocfs2_iocb_clear_rw_locked(iocb);
646
647 level = ocfs2_iocb_rw_locked_level(iocb);
648 ocfs2_rw_unlock(inode, level);
649 }
650
651 return 0;
652}
653
654static int ocfs2_releasepage(struct page *page, gfp_t wait) 502static int ocfs2_releasepage(struct page *page, gfp_t wait)
655{ 503{
656 if (!page_has_buffers(page)) 504 if (!page_has_buffers(page))
@@ -658,363 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
658 return try_to_free_buffers(page); 506 return try_to_free_buffers(page);
659} 507}
660 508
661static int ocfs2_is_overwrite(struct ocfs2_super *osb,
662 struct inode *inode, loff_t offset)
663{
664 int ret = 0;
665 u32 v_cpos = 0;
666 u32 p_cpos = 0;
667 unsigned int num_clusters = 0;
668 unsigned int ext_flags = 0;
669
670 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
671 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
672 &num_clusters, &ext_flags);
673 if (ret < 0) {
674 mlog_errno(ret);
675 return ret;
676 }
677
678 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
679 return 1;
680
681 return 0;
682}
683
684static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
685 struct inode *inode, loff_t offset,
686 u64 zero_len, int cluster_align)
687{
688 u32 p_cpos = 0;
689 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
690 unsigned int num_clusters = 0;
691 unsigned int ext_flags = 0;
692 int ret = 0;
693
694 if (offset <= i_size_read(inode) || cluster_align)
695 return 0;
696
697 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
698 &ext_flags);
699 if (ret < 0) {
700 mlog_errno(ret);
701 return ret;
702 }
703
704 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
705 u64 s = i_size_read(inode);
706 sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
707 (do_div(s, osb->s_clustersize) >> 9);
708
709 ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
710 zero_len >> 9, GFP_NOFS, false);
711 if (ret < 0)
712 mlog_errno(ret);
713 }
714
715 return ret;
716}
717
718static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
719 struct inode *inode, loff_t offset)
720{
721 u64 zero_start, zero_len, total_zero_len;
722 u32 p_cpos = 0, clusters_to_add;
723 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
724 unsigned int num_clusters = 0;
725 unsigned int ext_flags = 0;
726 u32 size_div, offset_div;
727 int ret = 0;
728
729 {
730 u64 o = offset;
731 u64 s = i_size_read(inode);
732
733 offset_div = do_div(o, osb->s_clustersize);
734 size_div = do_div(s, osb->s_clustersize);
735 }
736
737 if (offset <= i_size_read(inode))
738 return 0;
739
740 clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
741 ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
742 total_zero_len = offset - i_size_read(inode);
743 if (clusters_to_add)
744 total_zero_len -= offset_div;
745
746 /* Allocate clusters to fill out holes, and this is only needed
747 * when we add more than one clusters. Otherwise the cluster will
748 * be allocated during direct IO */
749 if (clusters_to_add > 1) {
750 ret = ocfs2_extend_allocation(inode,
751 OCFS2_I(inode)->ip_clusters,
752 clusters_to_add - 1, 0);
753 if (ret) {
754 mlog_errno(ret);
755 goto out;
756 }
757 }
758
759 while (total_zero_len) {
760 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
761 &ext_flags);
762 if (ret < 0) {
763 mlog_errno(ret);
764 goto out;
765 }
766
767 zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
768 size_div;
769 zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
770 size_div;
771 zero_len = min(total_zero_len, zero_len);
772
773 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
774 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
775 zero_start >> 9, zero_len >> 9,
776 GFP_NOFS, false);
777 if (ret < 0) {
778 mlog_errno(ret);
779 goto out;
780 }
781 }
782
783 total_zero_len -= zero_len;
784 v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
785
786 /* Only at first iteration can be cluster not aligned.
787 * So set size_div to 0 for the rest */
788 size_div = 0;
789 }
790
791out:
792 return ret;
793}
794
795static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
796 struct iov_iter *iter,
797 loff_t offset)
798{
799 ssize_t ret = 0;
800 ssize_t written = 0;
801 bool orphaned = false;
802 int is_overwrite = 0;
803 struct file *file = iocb->ki_filp;
804 struct inode *inode = file_inode(file)->i_mapping->host;
805 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
806 struct buffer_head *di_bh = NULL;
807 size_t count = iter->count;
808 journal_t *journal = osb->journal->j_journal;
809 u64 zero_len_head, zero_len_tail;
810 int cluster_align_head, cluster_align_tail;
811 loff_t final_size = offset + count;
812 int append_write = offset >= i_size_read(inode) ? 1 : 0;
813 unsigned int num_clusters = 0;
814 unsigned int ext_flags = 0;
815
816 {
817 u64 o = offset;
818 u64 s = i_size_read(inode);
819
820 zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
821 cluster_align_head = !zero_len_head;
822
823 zero_len_tail = osb->s_clustersize -
824 do_div(s, osb->s_clustersize);
825 if ((offset - i_size_read(inode)) < zero_len_tail)
826 zero_len_tail = offset - i_size_read(inode);
827 cluster_align_tail = !zero_len_tail;
828 }
829
830 /*
831 * when final_size > inode->i_size, inode->i_size will be
832 * updated after direct write, so add the inode to orphan
833 * dir first.
834 */
835 if (final_size > i_size_read(inode)) {
836 ret = ocfs2_add_inode_to_orphan(osb, inode);
837 if (ret < 0) {
838 mlog_errno(ret);
839 goto out;
840 }
841 orphaned = true;
842 }
843
844 if (append_write) {
845 ret = ocfs2_inode_lock(inode, NULL, 1);
846 if (ret < 0) {
847 mlog_errno(ret);
848 goto clean_orphan;
849 }
850
851 /* zeroing out the previously allocated cluster tail
852 * that but not zeroed */
853 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
854 down_read(&OCFS2_I(inode)->ip_alloc_sem);
855 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
856 zero_len_tail, cluster_align_tail);
857 up_read(&OCFS2_I(inode)->ip_alloc_sem);
858 } else {
859 down_write(&OCFS2_I(inode)->ip_alloc_sem);
860 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
861 offset);
862 up_write(&OCFS2_I(inode)->ip_alloc_sem);
863 }
864 if (ret < 0) {
865 mlog_errno(ret);
866 ocfs2_inode_unlock(inode, 1);
867 goto clean_orphan;
868 }
869
870 is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
871 if (is_overwrite < 0) {
872 mlog_errno(is_overwrite);
873 ret = is_overwrite;
874 ocfs2_inode_unlock(inode, 1);
875 goto clean_orphan;
876 }
877
878 ocfs2_inode_unlock(inode, 1);
879 }
880
881 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
882 offset, ocfs2_direct_IO_get_blocks,
883 ocfs2_dio_end_io, NULL, 0);
884 /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
885 if ((written < 0) && (written != -EIOCBQUEUED)) {
886 loff_t i_size = i_size_read(inode);
887
888 if (offset + count > i_size) {
889 ret = ocfs2_inode_lock(inode, &di_bh, 1);
890 if (ret < 0) {
891 mlog_errno(ret);
892 goto clean_orphan;
893 }
894
895 if (i_size == i_size_read(inode)) {
896 ret = ocfs2_truncate_file(inode, di_bh,
897 i_size);
898 if (ret < 0) {
899 if (ret != -ENOSPC)
900 mlog_errno(ret);
901
902 ocfs2_inode_unlock(inode, 1);
903 brelse(di_bh);
904 di_bh = NULL;
905 goto clean_orphan;
906 }
907 }
908
909 ocfs2_inode_unlock(inode, 1);
910 brelse(di_bh);
911 di_bh = NULL;
912
913 ret = jbd2_journal_force_commit(journal);
914 if (ret < 0)
915 mlog_errno(ret);
916 }
917 } else if (written > 0 && append_write && !is_overwrite &&
918 !cluster_align_head) {
919 /* zeroing out the allocated cluster head */
920 u32 p_cpos = 0;
921 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
922
923 ret = ocfs2_inode_lock(inode, NULL, 0);
924 if (ret < 0) {
925 mlog_errno(ret);
926 goto clean_orphan;
927 }
928
929 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
930 &num_clusters, &ext_flags);
931 if (ret < 0) {
932 mlog_errno(ret);
933 ocfs2_inode_unlock(inode, 0);
934 goto clean_orphan;
935 }
936
937 BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
938
939 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
940 (u64)p_cpos << (osb->s_clustersize_bits - 9),
941 zero_len_head >> 9, GFP_NOFS, false);
942 if (ret < 0)
943 mlog_errno(ret);
944
945 ocfs2_inode_unlock(inode, 0);
946 }
947
948clean_orphan:
949 if (orphaned) {
950 int tmp_ret;
951 int update_isize = written > 0 ? 1 : 0;
952 loff_t end = update_isize ? offset + written : 0;
953
954 tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
955 if (tmp_ret < 0) {
956 ret = tmp_ret;
957 mlog_errno(ret);
958 goto out;
959 }
960
961 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
962 update_isize, end);
963 if (tmp_ret < 0) {
964 ocfs2_inode_unlock(inode, 1);
965 ret = tmp_ret;
966 mlog_errno(ret);
967 brelse(di_bh);
968 goto out;
969 }
970
971 ocfs2_inode_unlock(inode, 1);
972 brelse(di_bh);
973
974 tmp_ret = jbd2_journal_force_commit(journal);
975 if (tmp_ret < 0) {
976 ret = tmp_ret;
977 mlog_errno(tmp_ret);
978 }
979 }
980
981out:
982 if (ret >= 0)
983 ret = written;
984 return ret;
985}
986
987static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
988 loff_t offset)
989{
990 struct file *file = iocb->ki_filp;
991 struct inode *inode = file_inode(file)->i_mapping->host;
992 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
993 int full_coherency = !(osb->s_mount_opt &
994 OCFS2_MOUNT_COHERENCY_BUFFERED);
995
996 /*
997 * Fallback to buffered I/O if we see an inode without
998 * extents.
999 */
1000 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1001 return 0;
1002
1003 /* Fallback to buffered I/O if we are appending and
1004 * concurrent O_DIRECT writes are allowed.
1005 */
1006 if (i_size_read(inode) <= offset && !full_coherency)
1007 return 0;
1008
1009 if (iov_iter_rw(iter) == READ)
1010 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
1011 iter, offset,
1012 ocfs2_direct_IO_get_blocks,
1013 ocfs2_dio_end_io, NULL, 0);
1014 else
1015 return ocfs2_direct_IO_write(iocb, iter, offset);
1016}
1017
1018static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 509static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
1019 u32 cpos, 510 u32 cpos,
1020 unsigned int *start, 511 unsigned int *start,
@@ -1201,6 +692,13 @@ next_bh:
1201 692
1202#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) 693#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
1203 694
695struct ocfs2_unwritten_extent {
696 struct list_head ue_node;
697 struct list_head ue_ip_node;
698 u32 ue_cpos;
699 u32 ue_phys;
700};
701
1204/* 702/*
1205 * Describe the state of a single cluster to be written to. 703 * Describe the state of a single cluster to be written to.
1206 */ 704 */
@@ -1212,7 +710,7 @@ struct ocfs2_write_cluster_desc {
1212 * filled. 710 * filled.
1213 */ 711 */
1214 unsigned c_new; 712 unsigned c_new;
1215 unsigned c_unwritten; 713 unsigned c_clear_unwritten;
1216 unsigned c_needs_zero; 714 unsigned c_needs_zero;
1217}; 715};
1218 716
@@ -1224,6 +722,9 @@ struct ocfs2_write_ctxt {
1224 /* First cluster allocated in a nonsparse extend */ 722 /* First cluster allocated in a nonsparse extend */
1225 u32 w_first_new_cpos; 723 u32 w_first_new_cpos;
1226 724
725 /* Type of caller. Must be one of buffer, mmap, direct. */
726 ocfs2_write_type_t w_type;
727
1227 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; 728 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
1228 729
1229 /* 730 /*
@@ -1272,6 +773,8 @@ struct ocfs2_write_ctxt {
1272 struct buffer_head *w_di_bh; 773 struct buffer_head *w_di_bh;
1273 774
1274 struct ocfs2_cached_dealloc_ctxt w_dealloc; 775 struct ocfs2_cached_dealloc_ctxt w_dealloc;
776
777 struct list_head w_unwritten_list;
1275}; 778};
1276 779
1277void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) 780void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1310,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
1310 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 813 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
1311} 814}
1312 815
1313static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 816static void ocfs2_free_unwritten_list(struct inode *inode,
817 struct list_head *head)
1314{ 818{
819 struct ocfs2_inode_info *oi = OCFS2_I(inode);
820 struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
821
822 list_for_each_entry_safe(ue, tmp, head, ue_node) {
823 list_del(&ue->ue_node);
824 spin_lock(&oi->ip_lock);
825 list_del(&ue->ue_ip_node);
826 spin_unlock(&oi->ip_lock);
827 kfree(ue);
828 }
829}
830
831static void ocfs2_free_write_ctxt(struct inode *inode,
832 struct ocfs2_write_ctxt *wc)
833{
834 ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
1315 ocfs2_unlock_pages(wc); 835 ocfs2_unlock_pages(wc);
1316 brelse(wc->w_di_bh); 836 brelse(wc->w_di_bh);
1317 kfree(wc); 837 kfree(wc);
@@ -1319,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
1319 839
1320static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, 840static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1321 struct ocfs2_super *osb, loff_t pos, 841 struct ocfs2_super *osb, loff_t pos,
1322 unsigned len, struct buffer_head *di_bh) 842 unsigned len, ocfs2_write_type_t type,
843 struct buffer_head *di_bh)
1323{ 844{
1324 u32 cend; 845 u32 cend;
1325 struct ocfs2_write_ctxt *wc; 846 struct ocfs2_write_ctxt *wc;
@@ -1334,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1334 wc->w_clen = cend - wc->w_cpos + 1; 855 wc->w_clen = cend - wc->w_cpos + 1;
1335 get_bh(di_bh); 856 get_bh(di_bh);
1336 wc->w_di_bh = di_bh; 857 wc->w_di_bh = di_bh;
858 wc->w_type = type;
1337 859
1338 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 860 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1339 wc->w_large_pages = 1; 861 wc->w_large_pages = 1;
@@ -1341,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1341 wc->w_large_pages = 0; 863 wc->w_large_pages = 0;
1342 864
1343 ocfs2_init_dealloc_ctxt(&wc->w_dealloc); 865 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
866 INIT_LIST_HEAD(&wc->w_unwritten_list);
1344 867
1345 *wcp = wc; 868 *wcp = wc;
1346 869
@@ -1401,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
1401 to = user_pos + user_len; 924 to = user_pos + user_len;
1402 struct page *tmppage; 925 struct page *tmppage;
1403 926
1404 ocfs2_zero_new_buffers(wc->w_target_page, from, to); 927 if (wc->w_target_page)
928 ocfs2_zero_new_buffers(wc->w_target_page, from, to);
1405 929
1406 for(i = 0; i < wc->w_num_pages; i++) { 930 for(i = 0; i < wc->w_num_pages; i++) {
1407 tmppage = wc->w_pages[i]; 931 tmppage = wc->w_pages[i];
1408 932
1409 if (page_has_buffers(tmppage)) { 933 if (tmppage && page_has_buffers(tmppage)) {
1410 if (ocfs2_should_order_data(inode)) 934 if (ocfs2_should_order_data(inode))
1411 ocfs2_jbd2_file_inode(wc->w_handle, inode); 935 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1412 936
@@ -1536,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1536 wc->w_num_pages = 1; 1060 wc->w_num_pages = 1;
1537 start = target_index; 1061 start = target_index;
1538 } 1062 }
1063 end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
1539 1064
1540 for(i = 0; i < wc->w_num_pages; i++) { 1065 for(i = 0; i < wc->w_num_pages; i++) {
1541 index = start + i; 1066 index = start + i;
1542 1067
1543 if (index == target_index && mmap_page) { 1068 if (index >= target_index && index <= end_index &&
1069 wc->w_type == OCFS2_WRITE_MMAP) {
1544 /* 1070 /*
1545 * ocfs2_pagemkwrite() is a little different 1071 * ocfs2_pagemkwrite() is a little different
1546 * and wants us to directly use the page 1072 * and wants us to directly use the page
@@ -1559,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1559 page_cache_get(mmap_page); 1085 page_cache_get(mmap_page);
1560 wc->w_pages[i] = mmap_page; 1086 wc->w_pages[i] = mmap_page;
1561 wc->w_target_locked = true; 1087 wc->w_target_locked = true;
1088 } else if (index >= target_index && index <= end_index &&
1089 wc->w_type == OCFS2_WRITE_DIRECT) {
1090 /* Direct write has no mapping page. */
1091 wc->w_pages[i] = NULL;
1092 continue;
1562 } else { 1093 } else {
1563 wc->w_pages[i] = find_or_create_page(mapping, index, 1094 wc->w_pages[i] = find_or_create_page(mapping, index,
1564 GFP_NOFS); 1095 GFP_NOFS);
@@ -1583,19 +1114,20 @@ out:
1583 * Prepare a single cluster for write one cluster into the file. 1114 * Prepare a single cluster for write one cluster into the file.
1584 */ 1115 */
1585static int ocfs2_write_cluster(struct address_space *mapping, 1116static int ocfs2_write_cluster(struct address_space *mapping,
1586 u32 phys, unsigned int unwritten, 1117 u32 *phys, unsigned int new,
1118 unsigned int clear_unwritten,
1587 unsigned int should_zero, 1119 unsigned int should_zero,
1588 struct ocfs2_alloc_context *data_ac, 1120 struct ocfs2_alloc_context *data_ac,
1589 struct ocfs2_alloc_context *meta_ac, 1121 struct ocfs2_alloc_context *meta_ac,
1590 struct ocfs2_write_ctxt *wc, u32 cpos, 1122 struct ocfs2_write_ctxt *wc, u32 cpos,
1591 loff_t user_pos, unsigned user_len) 1123 loff_t user_pos, unsigned user_len)
1592{ 1124{
1593 int ret, i, new; 1125 int ret, i;
1594 u64 v_blkno, p_blkno; 1126 u64 p_blkno;
1595 struct inode *inode = mapping->host; 1127 struct inode *inode = mapping->host;
1596 struct ocfs2_extent_tree et; 1128 struct ocfs2_extent_tree et;
1129 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
1597 1130
1598 new = phys == 0 ? 1 : 0;
1599 if (new) { 1131 if (new) {
1600 u32 tmp_pos; 1132 u32 tmp_pos;
1601 1133
@@ -1605,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1605 */ 1137 */
1606 tmp_pos = cpos; 1138 tmp_pos = cpos;
1607 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, 1139 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1608 &tmp_pos, 1, 0, wc->w_di_bh, 1140 &tmp_pos, 1, !clear_unwritten,
1609 wc->w_handle, data_ac, 1141 wc->w_di_bh, wc->w_handle,
1610 meta_ac, NULL); 1142 data_ac, meta_ac, NULL);
1611 /* 1143 /*
1612 * This shouldn't happen because we must have already 1144 * This shouldn't happen because we must have already
1613 * calculated the correct meta data allocation required. The 1145 * calculated the correct meta data allocation required. The
@@ -1624,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1624 mlog_errno(ret); 1156 mlog_errno(ret);
1625 goto out; 1157 goto out;
1626 } 1158 }
1627 } else if (unwritten) { 1159 } else if (clear_unwritten) {
1628 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), 1160 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1629 wc->w_di_bh); 1161 wc->w_di_bh);
1630 ret = ocfs2_mark_extent_written(inode, &et, 1162 ret = ocfs2_mark_extent_written(inode, &et,
1631 wc->w_handle, cpos, 1, phys, 1163 wc->w_handle, cpos, 1, *phys,
1632 meta_ac, &wc->w_dealloc); 1164 meta_ac, &wc->w_dealloc);
1633 if (ret < 0) { 1165 if (ret < 0) {
1634 mlog_errno(ret); 1166 mlog_errno(ret);
@@ -1636,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1636 } 1168 }
1637 } 1169 }
1638 1170
1639 if (should_zero)
1640 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1641 else
1642 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1643
1644 /* 1171 /*
1645 * The only reason this should fail is due to an inability to 1172 * The only reason this should fail is due to an inability to
1646 * find the extent added. 1173 * find the extent added.
1647 */ 1174 */
1648 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1175 ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
1649 NULL);
1650 if (ret < 0) { 1176 if (ret < 0) {
1651 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " 1177 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
1652 "at logical block %llu", 1178 "at logical cluster %u",
1653 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1179 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
1654 (unsigned long long)v_blkno);
1655 goto out; 1180 goto out;
1656 } 1181 }
1657 1182
1658 BUG_ON(p_blkno == 0); 1183 BUG_ON(*phys == 0);
1184
1185 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
1186 if (!should_zero)
1187 p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
1659 1188
1660 for(i = 0; i < wc->w_num_pages; i++) { 1189 for(i = 0; i < wc->w_num_pages; i++) {
1661 int tmpret; 1190 int tmpret;
1662 1191
1192 /* This is the direct io target page. */
1193 if (wc->w_pages[i] == NULL) {
1194 p_blkno++;
1195 continue;
1196 }
1197
1663 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, 1198 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1664 wc->w_pages[i], cpos, 1199 wc->w_pages[i], cpos,
1665 user_pos, user_len, 1200 user_pos, user_len,
@@ -1706,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1706 if ((cluster_off + local_len) > osb->s_clustersize) 1241 if ((cluster_off + local_len) > osb->s_clustersize)
1707 local_len = osb->s_clustersize - cluster_off; 1242 local_len = osb->s_clustersize - cluster_off;
1708 1243
1709 ret = ocfs2_write_cluster(mapping, desc->c_phys, 1244 ret = ocfs2_write_cluster(mapping, &desc->c_phys,
1710 desc->c_unwritten, 1245 desc->c_new,
1246 desc->c_clear_unwritten,
1711 desc->c_needs_zero, 1247 desc->c_needs_zero,
1712 data_ac, meta_ac, 1248 data_ac, meta_ac,
1713 wc, desc->c_cpos, pos, local_len); 1249 wc, desc->c_cpos, pos, local_len);
@@ -1778,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1778} 1314}
1779 1315
1780/* 1316/*
1317 * Check if this extent is marked UNWRITTEN by direct io. If so, we need not
1318 * do the zero work, and should not clear UNWRITTEN since it will be cleared
1319 * by the direct io procedure.
1320 * If this is a new extent allocated by direct io, we should mark it in
1321 * the ip_unwritten_list.
1322 */
1323static int ocfs2_unwritten_check(struct inode *inode,
1324 struct ocfs2_write_ctxt *wc,
1325 struct ocfs2_write_cluster_desc *desc)
1326{
1327 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1328 struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
1329 int ret = 0;
1330
1331 if (!desc->c_needs_zero)
1332 return 0;
1333
1334retry:
1335 spin_lock(&oi->ip_lock);
1336	/* No need to zero, whether this is buffered or direct io. The writer
1337	 * already zeroing this cluster will clear unwritten after all of the
1338	 * cluster io has finished. */
1339 list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
1340 if (desc->c_cpos == ue->ue_cpos) {
1341 BUG_ON(desc->c_new);
1342 desc->c_needs_zero = 0;
1343 desc->c_clear_unwritten = 0;
1344 goto unlock;
1345 }
1346 }
1347
1348 if (wc->w_type != OCFS2_WRITE_DIRECT)
1349 goto unlock;
1350
1351 if (new == NULL) {
1352 spin_unlock(&oi->ip_lock);
1353 new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
1354 GFP_NOFS);
1355 if (new == NULL) {
1356 ret = -ENOMEM;
1357 goto out;
1358 }
1359 goto retry;
1360 }
1361	/* This direct write will do the zeroing. */
1362 new->ue_cpos = desc->c_cpos;
1363 new->ue_phys = desc->c_phys;
1364 desc->c_clear_unwritten = 0;
1365 list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
1366 list_add_tail(&new->ue_node, &wc->w_unwritten_list);
1367 new = NULL;
1368unlock:
1369 spin_unlock(&oi->ip_lock);
1370out:
1371 if (new)
1372 kfree(new);
1373 return ret;
1374}
1375
1376/*
1781 * Populate each single-cluster write descriptor in the write context 1377 * Populate each single-cluster write descriptor in the write context
1782 * with information about the i/o to be done. 1378 * with information about the i/o to be done.
1783 * 1379 *
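The ocfs2_unwritten_check() helper added above follows a standard pattern: ip_lock is a spinlock, so the tracking structure cannot be allocated while it is held; the lock is dropped, the allocation is done with GFP_NOFS, and the list walk is retried from the top in case another writer raced in while the lock was released. A minimal user-space sketch of that drop-lock/allocate/retry idiom, with a pthread mutex standing in for ip_lock and all names invented for illustration:

#include <pthread.h>
#include <stdlib.h>

struct tracked_extent {
	unsigned int cpos;
	struct tracked_extent *next;
};

static pthread_mutex_t ip_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tracked_extent *unwritten_list;	/* protected by ip_lock */

/* Return 0 on success, -1 on allocation failure. */
static int track_unwritten(unsigned int cpos)
{
	struct tracked_extent *ue, *new = NULL;

retry:
	pthread_mutex_lock(&ip_lock);
	for (ue = unwritten_list; ue; ue = ue->next) {
		if (ue->cpos == cpos)
			goto unlock;	/* someone else already tracks this cluster */
	}
	if (!new) {
		/* Cannot allocate under the lock: drop it, allocate, retry. */
		pthread_mutex_unlock(&ip_lock);
		new = malloc(sizeof(*new));
		if (!new)
			return -1;
		goto retry;
	}
	new->cpos = cpos;
	new->next = unwritten_list;
	unwritten_list = new;
	new = NULL;		/* now owned by the list */
unlock:
	pthread_mutex_unlock(&ip_lock);
	free(new);		/* free(NULL) is a no-op, like kfree() */
	return 0;
}

As in the kernel function, an allocation that turns out to be unneeded after the retry is simply freed on the way out.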
@@ -1852,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
1852 if (phys == 0) { 1448 if (phys == 0) {
1853 desc->c_new = 1; 1449 desc->c_new = 1;
1854 desc->c_needs_zero = 1; 1450 desc->c_needs_zero = 1;
1451 desc->c_clear_unwritten = 1;
1855 *clusters_to_alloc = *clusters_to_alloc + 1; 1452 *clusters_to_alloc = *clusters_to_alloc + 1;
1856 } 1453 }
1857 1454
1858 if (ext_flags & OCFS2_EXT_UNWRITTEN) { 1455 if (ext_flags & OCFS2_EXT_UNWRITTEN) {
1859 desc->c_unwritten = 1; 1456 desc->c_clear_unwritten = 1;
1860 desc->c_needs_zero = 1; 1457 desc->c_needs_zero = 1;
1861 } 1458 }
1862 1459
1460 ret = ocfs2_unwritten_check(inode, wc, desc);
1461 if (ret) {
1462 mlog_errno(ret);
1463 goto out;
1464 }
1465
1863 num_clusters--; 1466 num_clusters--;
1864 } 1467 }
1865 1468
@@ -2022,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
2022 if (ret) 1625 if (ret)
2023 mlog_errno(ret); 1626 mlog_errno(ret);
2024 1627
2025	wc->w_first_new_cpos = 1628	/* There is no wc if this is called from direct io. */
2026 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); 1629 if (wc)
1630 wc->w_first_new_cpos =
1631 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
2027 1632
2028 return ret; 1633 return ret;
2029} 1634}
@@ -2077,9 +1682,8 @@ out:
2077 return ret; 1682 return ret;
2078} 1683}
2079 1684
2080int ocfs2_write_begin_nolock(struct file *filp, 1685int ocfs2_write_begin_nolock(struct address_space *mapping,
2081 struct address_space *mapping, 1686 loff_t pos, unsigned len, ocfs2_write_type_t type,
2082 loff_t pos, unsigned len, unsigned flags,
2083 struct page **pagep, void **fsdata, 1687 struct page **pagep, void **fsdata,
2084 struct buffer_head *di_bh, struct page *mmap_page) 1688 struct buffer_head *di_bh, struct page *mmap_page)
2085{ 1689{
@@ -2096,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
2096 int try_free = 1, ret1; 1700 int try_free = 1, ret1;
2097 1701
2098try_again: 1702try_again:
2099 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1703 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
2100 if (ret) { 1704 if (ret) {
2101 mlog_errno(ret); 1705 mlog_errno(ret);
2102 return ret; 1706 return ret;
@@ -2115,14 +1719,17 @@ try_again:
2115 } 1719 }
2116 } 1720 }
2117 1721
2118	if (ocfs2_sparse_alloc(osb)) 1722	/* Direct io changes i_size late, so do not zero the tail here. */
2119 ret = ocfs2_zero_tail(inode, di_bh, pos); 1723 if (type != OCFS2_WRITE_DIRECT) {
2120 else 1724 if (ocfs2_sparse_alloc(osb))
2121 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, 1725 ret = ocfs2_zero_tail(inode, di_bh, pos);
2122 wc); 1726 else
2123 if (ret) { 1727 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2124 mlog_errno(ret); 1728 len, wc);
2125 goto out; 1729 if (ret) {
1730 mlog_errno(ret);
1731 goto out;
1732 }
2126 } 1733 }
2127 1734
2128 ret = ocfs2_check_range_for_refcount(inode, pos, len); 1735 ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2153,7 +1760,7 @@ try_again:
2153 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1760 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2154 (long long)i_size_read(inode), 1761 (long long)i_size_read(inode),
2155 le32_to_cpu(di->i_clusters), 1762 le32_to_cpu(di->i_clusters),
2156 pos, len, flags, mmap_page, 1763 pos, len, type, mmap_page,
2157 clusters_to_alloc, extents_to_split); 1764 clusters_to_alloc, extents_to_split);
2158 1765
2159 /* 1766 /*
@@ -2183,17 +1790,17 @@ try_again:
2183 1790
2184 credits = ocfs2_calc_extend_credits(inode->i_sb, 1791 credits = ocfs2_calc_extend_credits(inode->i_sb,
2185 &di->id2.i_list); 1792 &di->id2.i_list);
2186 1793 } else if (type == OCFS2_WRITE_DIRECT)
2187	} 1794		/* A direct write need not start a trans if no extents were allocated. */
1795 goto success;
2188 1796
2189 /* 1797 /*
2190 * We have to zero sparse allocated clusters, unwritten extent clusters, 1798 * We have to zero sparse allocated clusters, unwritten extent clusters,
2191 * and non-sparse clusters we just extended. For non-sparse writes, 1799 * and non-sparse clusters we just extended. For non-sparse writes,
2192 * we know zeros will only be needed in the first and/or last cluster. 1800 * we know zeros will only be needed in the first and/or last cluster.
2193 */ 1801 */
2194 if (clusters_to_alloc || extents_to_split || 1802 if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
2195 (wc->w_clen && (wc->w_desc[0].c_needs_zero || 1803 wc->w_desc[wc->w_clen - 1].c_needs_zero))
2196 wc->w_desc[wc->w_clen - 1].c_needs_zero)))
2197 cluster_of_pages = 1; 1804 cluster_of_pages = 1;
2198 else 1805 else
2199 cluster_of_pages = 0; 1806 cluster_of_pages = 0;
@@ -2260,7 +1867,8 @@ try_again:
2260 ocfs2_free_alloc_context(meta_ac); 1867 ocfs2_free_alloc_context(meta_ac);
2261 1868
2262success: 1869success:
2263 *pagep = wc->w_target_page; 1870 if (pagep)
1871 *pagep = wc->w_target_page;
2264 *fsdata = wc; 1872 *fsdata = wc;
2265 return 0; 1873 return 0;
2266out_quota: 1874out_quota:
@@ -2271,7 +1879,7 @@ out_commit:
2271 ocfs2_commit_trans(osb, handle); 1879 ocfs2_commit_trans(osb, handle);
2272 1880
2273out: 1881out:
2274 ocfs2_free_write_ctxt(wc); 1882 ocfs2_free_write_ctxt(inode, wc);
2275 1883
2276 if (data_ac) { 1884 if (data_ac) {
2277 ocfs2_free_alloc_context(data_ac); 1885 ocfs2_free_alloc_context(data_ac);
@@ -2323,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
2323 */ 1931 */
2324 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1932 down_write(&OCFS2_I(inode)->ip_alloc_sem);
2325 1933
2326 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, 1934 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
2327 fsdata, di_bh, NULL); 1935 pagep, fsdata, di_bh, NULL);
2328 if (ret) { 1936 if (ret) {
2329 mlog_errno(ret); 1937 mlog_errno(ret);
2330 goto out_fail; 1938 goto out_fail;
@@ -2381,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2381 handle_t *handle = wc->w_handle; 1989 handle_t *handle = wc->w_handle;
2382 struct page *tmppage; 1990 struct page *tmppage;
2383 1991
2384 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 1992 BUG_ON(!list_empty(&wc->w_unwritten_list));
2385 OCFS2_JOURNAL_ACCESS_WRITE); 1993
2386 if (ret) { 1994 if (handle) {
2387 copied = ret; 1995 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
2388 mlog_errno(ret); 1996 wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2389 goto out; 1997 if (ret) {
1998 copied = ret;
1999 mlog_errno(ret);
2000 goto out;
2001 }
2390 } 2002 }
2391 2003
2392 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2004 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2394,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2394 goto out_write_size; 2006 goto out_write_size;
2395 } 2007 }
2396 2008
2397 if (unlikely(copied < len)) { 2009 if (unlikely(copied < len) && wc->w_target_page) {
2398 if (!PageUptodate(wc->w_target_page)) 2010 if (!PageUptodate(wc->w_target_page))
2399 copied = 0; 2011 copied = 0;
2400 2012
2401 ocfs2_zero_new_buffers(wc->w_target_page, start+copied, 2013 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
2402 start+len); 2014 start+len);
2403 } 2015 }
2404 flush_dcache_page(wc->w_target_page); 2016 if (wc->w_target_page)
2017 flush_dcache_page(wc->w_target_page);
2405 2018
2406 for(i = 0; i < wc->w_num_pages; i++) { 2019 for(i = 0; i < wc->w_num_pages; i++) {
2407 tmppage = wc->w_pages[i]; 2020 tmppage = wc->w_pages[i];
2408 2021
2022 /* This is the direct io target page. */
2023 if (tmppage == NULL)
2024 continue;
2025
2409 if (tmppage == wc->w_target_page) { 2026 if (tmppage == wc->w_target_page) {
2410 from = wc->w_target_from; 2027 from = wc->w_target_from;
2411 to = wc->w_target_to; 2028 to = wc->w_target_to;
@@ -2424,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2424 } 2041 }
2425 2042
2426 if (page_has_buffers(tmppage)) { 2043 if (page_has_buffers(tmppage)) {
2427 if (ocfs2_should_order_data(inode)) 2044 if (handle && ocfs2_should_order_data(inode))
2428 ocfs2_jbd2_file_inode(wc->w_handle, inode); 2045 ocfs2_jbd2_file_inode(handle, inode);
2429 block_commit_write(tmppage, from, to); 2046 block_commit_write(tmppage, from, to);
2430 } 2047 }
2431 } 2048 }
2432 2049
2433out_write_size: 2050out_write_size:
2434	pos += copied; 2051	/* Direct io does not update i_size here. */
2435 if (pos > i_size_read(inode)) { 2052 if (wc->w_type != OCFS2_WRITE_DIRECT) {
2436 i_size_write(inode, pos); 2053 pos += copied;
2437 mark_inode_dirty(inode); 2054 if (pos > i_size_read(inode)) {
2438 } 2055 i_size_write(inode, pos);
2439 inode->i_blocks = ocfs2_inode_sector_count(inode); 2056 mark_inode_dirty(inode);
2440 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 2057 }
2441 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2058 inode->i_blocks = ocfs2_inode_sector_count(inode);
2442 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2059 di->i_size = cpu_to_le64((u64)i_size_read(inode));
2443 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2060 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2444 ocfs2_update_inode_fsync_trans(handle, inode, 1); 2061 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2445 ocfs2_journal_dirty(handle, wc->w_di_bh); 2062 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2063 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2064 }
2065 if (handle)
2066 ocfs2_journal_dirty(handle, wc->w_di_bh);
2446 2067
2447out: 2068out:
2448 /* unlock pages before dealloc since it needs acquiring j_trans_barrier 2069 /* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2452,7 +2073,8 @@ out:
2452 */ 2073 */
2453 ocfs2_unlock_pages(wc); 2074 ocfs2_unlock_pages(wc);
2454 2075
2455 ocfs2_commit_trans(osb, handle); 2076 if (handle)
2077 ocfs2_commit_trans(osb, handle);
2456 2078
2457 ocfs2_run_deallocs(osb, &wc->w_dealloc); 2079 ocfs2_run_deallocs(osb, &wc->w_dealloc);
2458 2080
@@ -2477,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
2477 return ret; 2099 return ret;
2478} 2100}
2479 2101
2102struct ocfs2_dio_write_ctxt {
2103 struct list_head dw_zero_list;
2104 unsigned dw_zero_count;
2105 int dw_orphaned;
2106 pid_t dw_writer_pid;
2107};
2108
2109static struct ocfs2_dio_write_ctxt *
2110ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
2111{
2112 struct ocfs2_dio_write_ctxt *dwc = NULL;
2113
2114 if (bh->b_private)
2115 return bh->b_private;
2116
2117 dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
2118 if (dwc == NULL)
2119 return NULL;
2120 INIT_LIST_HEAD(&dwc->dw_zero_list);
2121 dwc->dw_zero_count = 0;
2122 dwc->dw_orphaned = 0;
2123 dwc->dw_writer_pid = task_pid_nr(current);
2124 bh->b_private = dwc;
2125 *alloc = 1;
2126
2127 return dwc;
2128}
2129
2130static void ocfs2_dio_free_write_ctx(struct inode *inode,
2131 struct ocfs2_dio_write_ctxt *dwc)
2132{
2133 ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
2134 kfree(dwc);
2135}
2136
2137/*
2138 * TODO: Make this into a generic get_blocks function.
2139 *
2140 * From do_direct_io in direct-io.c:
2141 * "So what we do is to permit the ->get_blocks function to populate
2142 * bh.b_size with the size of IO which is permitted at this offset and
2143 * this i_blkbits."
2144 *
2145 * This function is called directly from get_more_blocks in direct-io.c.
2146 *
2147 * called like this: dio->get_blocks(dio->inode, fs_startblk,
2148 * fs_count, map_bh, dio->rw == WRITE);
2149 */
2150static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
2151 struct buffer_head *bh_result, int create)
2152{
2153 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2154 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2155 struct ocfs2_write_ctxt *wc;
2156 struct ocfs2_write_cluster_desc *desc = NULL;
2157 struct ocfs2_dio_write_ctxt *dwc = NULL;
2158 struct buffer_head *di_bh = NULL;
2159 u64 p_blkno;
2160 loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
2161 unsigned len, total_len = bh_result->b_size;
2162 int ret = 0, first_get_block = 0;
2163
2164 len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
2165 len = min(total_len, len);
2166
2167 mlog(0, "get block of %lu at %llu:%u req %u\n",
2168 inode->i_ino, pos, len, total_len);
2169
2170 /*
2171	 * Because we need to change the file size in ocfs2_dio_end_io_write(), or
2172	 * may need to add the inode to the orphan dir, we cannot take the fast
2173	 * path when the file size will be changed.
2174 */
2175 if (pos + total_len <= i_size_read(inode)) {
2176 down_read(&oi->ip_alloc_sem);
2177 /* This is the fast path for re-write. */
2178 ret = ocfs2_get_block(inode, iblock, bh_result, create);
2179
2180 up_read(&oi->ip_alloc_sem);
2181
2182 if (buffer_mapped(bh_result) &&
2183 !buffer_new(bh_result) &&
2184 ret == 0)
2185 goto out;
2186
2187 /* Clear state set by ocfs2_get_block. */
2188 bh_result->b_state = 0;
2189 }
2190
2191 dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
2192 if (unlikely(dwc == NULL)) {
2193 ret = -ENOMEM;
2194 mlog_errno(ret);
2195 goto out;
2196 }
2197
2198 if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
2199 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
2200 !dwc->dw_orphaned) {
2201 /*
2202		 * When we are going to allocate extents beyond the file size, add
2203		 * the inode to the orphan dir first, so that the space can be
2204		 * reclaimed if the system crashes during the write.
2205 */
2206 ret = ocfs2_add_inode_to_orphan(osb, inode);
2207 if (ret < 0) {
2208 mlog_errno(ret);
2209 goto out;
2210 }
2211 dwc->dw_orphaned = 1;
2212 }
2213
2214 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2215 if (ret) {
2216 mlog_errno(ret);
2217 goto out;
2218 }
2219
2220 down_write(&oi->ip_alloc_sem);
2221
2222 if (first_get_block) {
2223 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
2224 ret = ocfs2_zero_tail(inode, di_bh, pos);
2225 else
2226 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2227 total_len, NULL);
2228 if (ret < 0) {
2229 mlog_errno(ret);
2230 goto unlock;
2231 }
2232 }
2233
2234 ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
2235 OCFS2_WRITE_DIRECT, NULL,
2236 (void **)&wc, di_bh, NULL);
2237 if (ret) {
2238 mlog_errno(ret);
2239 goto unlock;
2240 }
2241
2242 desc = &wc->w_desc[0];
2243
2244 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
2245 BUG_ON(p_blkno == 0);
2246 p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
2247
2248 map_bh(bh_result, inode->i_sb, p_blkno);
2249 bh_result->b_size = len;
2250 if (desc->c_needs_zero)
2251 set_buffer_new(bh_result);
2252
2253	/* The end_io handler may sleep, which must not happen in irq
2254	 * context, so defer it to the dio work queue. */
2255 set_buffer_defer_completion(bh_result);
2256
2257 if (!list_empty(&wc->w_unwritten_list)) {
2258 struct ocfs2_unwritten_extent *ue = NULL;
2259
2260 ue = list_first_entry(&wc->w_unwritten_list,
2261 struct ocfs2_unwritten_extent,
2262 ue_node);
2263 BUG_ON(ue->ue_cpos != desc->c_cpos);
2264 /* The physical address may be 0, fill it. */
2265 ue->ue_phys = desc->c_phys;
2266
2267 list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
2268 dwc->dw_zero_count++;
2269 }
2270
2271 ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
2272 BUG_ON(ret != len);
2273 ret = 0;
2274unlock:
2275 up_write(&oi->ip_alloc_sem);
2276 ocfs2_inode_unlock(inode, 1);
2277 brelse(di_bh);
2278out:
2279 if (ret < 0)
2280 ret = -EIO;
2281 return ret;
2282}
2283
2284static void ocfs2_dio_end_io_write(struct inode *inode,
2285 struct ocfs2_dio_write_ctxt *dwc,
2286 loff_t offset,
2287 ssize_t bytes)
2288{
2289 struct ocfs2_cached_dealloc_ctxt dealloc;
2290 struct ocfs2_extent_tree et;
2291 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2292 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2293 struct ocfs2_unwritten_extent *ue = NULL;
2294 struct buffer_head *di_bh = NULL;
2295 struct ocfs2_dinode *di;
2296 struct ocfs2_alloc_context *data_ac = NULL;
2297 struct ocfs2_alloc_context *meta_ac = NULL;
2298 handle_t *handle = NULL;
2299 loff_t end = offset + bytes;
2300 int ret = 0, credits = 0, locked = 0;
2301
2302 ocfs2_init_dealloc_ctxt(&dealloc);
2303
2304	/* We clear unwritten, delete the orphan, and change i_size here. If
2305	 * none of these is needed, we can skip all of this. */
2306 if (list_empty(&dwc->dw_zero_list) &&
2307 end <= i_size_read(inode) &&
2308 !dwc->dw_orphaned)
2309 goto out;
2310
2311 /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
2312 * are in that context. */
2313 if (dwc->dw_writer_pid != task_pid_nr(current)) {
2314 mutex_lock(&inode->i_mutex);
2315 locked = 1;
2316 }
2317
2318 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2319 if (ret < 0) {
2320 mlog_errno(ret);
2321 goto out;
2322 }
2323
2324 down_write(&oi->ip_alloc_sem);
2325
2326	/* Delete the orphan before acquiring i_mutex. */
2327 if (dwc->dw_orphaned) {
2328 BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
2329
2330 end = end > i_size_read(inode) ? end : 0;
2331
2332 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
2333 !!end, end);
2334 if (ret < 0)
2335 mlog_errno(ret);
2336 }
2337
2338 di = (struct ocfs2_dinode *)di_bh;
2339
2340 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
2341
2342 ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
2343 &data_ac, &meta_ac);
2344 if (ret) {
2345 mlog_errno(ret);
2346 goto unlock;
2347 }
2348
2349 credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
2350
2351 handle = ocfs2_start_trans(osb, credits);
2352 if (IS_ERR(handle)) {
2353 ret = PTR_ERR(handle);
2354 mlog_errno(ret);
2355 goto unlock;
2356 }
2357 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2358 OCFS2_JOURNAL_ACCESS_WRITE);
2359 if (ret) {
2360 mlog_errno(ret);
2361 goto commit;
2362 }
2363
2364 list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
2365 ret = ocfs2_mark_extent_written(inode, &et, handle,
2366 ue->ue_cpos, 1,
2367 ue->ue_phys,
2368 meta_ac, &dealloc);
2369 if (ret < 0) {
2370 mlog_errno(ret);
2371 break;
2372 }
2373 }
2374
2375 if (end > i_size_read(inode)) {
2376 ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
2377 if (ret < 0)
2378 mlog_errno(ret);
2379 }
2380commit:
2381 ocfs2_commit_trans(osb, handle);
2382unlock:
2383 up_write(&oi->ip_alloc_sem);
2384 ocfs2_inode_unlock(inode, 1);
2385 brelse(di_bh);
2386out:
2387 if (data_ac)
2388 ocfs2_free_alloc_context(data_ac);
2389 if (meta_ac)
2390 ocfs2_free_alloc_context(meta_ac);
2391 ocfs2_run_deallocs(osb, &dealloc);
2392 if (locked)
2393 mutex_unlock(&inode->i_mutex);
2394 ocfs2_dio_free_write_ctx(inode, dwc);
2395}
2396
2397/*
2398 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
2399 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
2400 * to protect io on one node from truncation on another.
2401 */
2402static int ocfs2_dio_end_io(struct kiocb *iocb,
2403 loff_t offset,
2404 ssize_t bytes,
2405 void *private)
2406{
2407 struct inode *inode = file_inode(iocb->ki_filp);
2408 int level;
2409
2410 if (bytes <= 0)
2411 return 0;
2412
2413 /* this io's submitter should not have unlocked this before we could */
2414 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
2415
2416 if (private)
2417 ocfs2_dio_end_io_write(inode, private, offset, bytes);
2418
2419 ocfs2_iocb_clear_rw_locked(iocb);
2420
2421 level = ocfs2_iocb_rw_locked_level(iocb);
2422 ocfs2_rw_unlock(inode, level);
2423 return 0;
2424}
2425
2426static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
2427 loff_t offset)
2428{
2429 struct file *file = iocb->ki_filp;
2430 struct inode *inode = file_inode(file)->i_mapping->host;
2431 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2432 loff_t end = offset + iter->count;
2433 get_block_t *get_block;
2434
2435 /*
2436 * Fallback to buffered I/O if we see an inode without
2437 * extents.
2438 */
2439 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2440 return 0;
2441
2442 /* Fallback to buffered I/O if we do not support append dio. */
2443 if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
2444 return 0;
2445
2446 if (iov_iter_rw(iter) == READ)
2447 get_block = ocfs2_get_block;
2448 else
2449 get_block = ocfs2_dio_get_block;
2450
2451 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
2452 iter, offset, get_block,
2453 ocfs2_dio_end_io, NULL, 0);
2454}
2455
2480const struct address_space_operations ocfs2_aops = { 2456const struct address_space_operations ocfs2_aops = {
2481 .readpage = ocfs2_readpage, 2457 .readpage = ocfs2_readpage,
2482 .readpages = ocfs2_readpages, 2458 .readpages = ocfs2_readpages,
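A note on how the new direct-io pieces above fit together: ocfs2_dio_get_block() runs once per cluster-sized chunk of a direct write, but the ocfs2_dio_write_ctxt it relies on is shared across all of those calls, so ocfs2_dio_alloc_write_ctx() allocates it lazily on the first call and stashes it in bh_result->b_private, from where it comes back to ocfs2_dio_end_io() as the private argument. A rough, self-contained sketch of that allocate-on-first-use, free-at-completion pattern (the struct and field names here are made up, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

struct io_request {
	void *private;		/* carried from submission to completion */
};

struct dio_write_ctx {
	int zero_count;
};

/* Called once per chunk; allocates the shared context on first use. */
static struct dio_write_ctx *get_ctx(struct io_request *req)
{
	struct dio_write_ctx *ctx = req->private;

	if (ctx)
		return ctx;
	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;
	req->private = ctx;
	return ctx;
}

/* Completion side: everything deferred during submission is finished here. */
static void end_io(struct io_request *req)
{
	struct dio_write_ctx *ctx = req->private;

	if (!ctx)
		return;
	printf("finishing %d deferred zero regions\n", ctx->zero_count);
	free(ctx);
	req->private = NULL;
}

int main(void)
{
	struct io_request req = { 0 };
	int chunk;

	for (chunk = 0; chunk < 4; chunk++) {	/* one get_block call per chunk */
		struct dio_write_ctx *ctx = get_ctx(&req);
		if (ctx)
			ctx->zero_count++;
	}
	end_io(&req);
	return 0;
}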
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..b1c9f28a57b1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
47 loff_t pos, unsigned len, unsigned copied, 47 loff_t pos, unsigned len, unsigned copied,
48 struct page *page, void *fsdata); 48 struct page *page, void *fsdata);
49 49
50int ocfs2_write_begin_nolock(struct file *filp, 50typedef enum {
51 struct address_space *mapping, 51 OCFS2_WRITE_BUFFER = 0,
52 loff_t pos, unsigned len, unsigned flags, 52 OCFS2_WRITE_DIRECT,
53 OCFS2_WRITE_MMAP,
54} ocfs2_write_type_t;
55
56int ocfs2_write_begin_nolock(struct address_space *mapping,
57 loff_t pos, unsigned len, ocfs2_write_type_t type,
53 struct page **pagep, void **fsdata, 58 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 59 struct buffer_head *di_bh, struct page *mmap_page);
55 60
@@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
79enum ocfs2_iocb_lock_bits { 84enum ocfs2_iocb_lock_bits {
80 OCFS2_IOCB_RW_LOCK = 0, 85 OCFS2_IOCB_RW_LOCK = 0,
81 OCFS2_IOCB_RW_LOCK_LEVEL, 86 OCFS2_IOCB_RW_LOCK_LEVEL,
82 OCFS2_IOCB_UNALIGNED_IO,
83 OCFS2_IOCB_NUM_LOCKS 87 OCFS2_IOCB_NUM_LOCKS
84}; 88};
85 89
@@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits {
88#define ocfs2_iocb_rw_locked_level(iocb) \ 92#define ocfs2_iocb_rw_locked_level(iocb) \
89 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) 93 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
90 94
91#define ocfs2_iocb_set_unaligned_aio(iocb) \
92 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
93#define ocfs2_iocb_clear_unaligned_aio(iocb) \
94 clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
95#define ocfs2_iocb_is_unaligned_aio(iocb) \
96 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
97
98#endif /* OCFS2_FILE_H */ 95#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ef6a2ec494de..bd15929b5f92 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1444,8 +1444,8 @@ static void o2hb_region_release(struct config_item *item)
1444 debugfs_remove(reg->hr_debug_dir); 1444 debugfs_remove(reg->hr_debug_dir);
1445 kfree(reg->hr_db_livenodes); 1445 kfree(reg->hr_db_livenodes);
1446 kfree(reg->hr_db_regnum); 1446 kfree(reg->hr_db_regnum);
1447 kfree(reg->hr_debug_elapsed_time); 1447 kfree(reg->hr_db_elapsed_time);
1448 kfree(reg->hr_debug_pinned); 1448 kfree(reg->hr_db_pinned);
1449 1449
1450 spin_lock(&o2hb_live_lock); 1450 spin_lock(&o2hb_live_lock);
1451 list_del(&reg->hr_all_item); 1451 list_del(&reg->hr_all_item);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..cdeafb4e7ed6 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -212,6 +212,12 @@ grant:
212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB) 212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); 213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
214 214
215 /*
216 * Move the lock to the tail because it may be the only lock which has
217 * an invalid lvb.
218 */
219 list_move_tail(&lock->list, &res->granted);
220
215 status = DLM_NORMAL; 221 status = DLM_NORMAL;
216 *call_ast = 1; 222 *call_ast = 1;
217 goto unlock_exit; 223 goto unlock_exit;
@@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
262 struct dlm_lock *lock, int flags, int type) 268 struct dlm_lock *lock, int flags, int type)
263{ 269{
264 enum dlm_status status; 270 enum dlm_status status;
271 u8 old_owner = res->owner;
265 272
266 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, 273 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
267 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); 274 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
287 status = DLM_DENIED; 294 status = DLM_DENIED;
288 goto bail; 295 goto bail;
289 } 296 }
297
298 if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
299 mlog(0, "last convert request returned DLM_RECOVERING, but "
300 "owner has already queued and sent ast to me. res %.*s, "
301 "(cookie=%u:%llu, type=%d, conv=%d)\n",
302 res->lockname.len, res->lockname.name,
303 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
304 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
305 lock->ml.type, lock->ml.convert_type);
306 status = DLM_NORMAL;
307 goto bail;
308 }
309
290 res->state |= DLM_LOCK_RES_IN_PROGRESS; 310 res->state |= DLM_LOCK_RES_IN_PROGRESS;
291 /* move lock to local convert queue */ 311 /* move lock to local convert queue */
292 /* do not alter lock refcount. switching lists. */ 312 /* do not alter lock refcount. switching lists. */
@@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
316 spin_lock(&res->spinlock); 336 spin_lock(&res->spinlock);
317 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 337 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
318 lock->convert_pending = 0; 338 lock->convert_pending = 0;
319 /* if it failed, move it back to granted queue */ 339 /* if it failed, move it back to granted queue.
340	 * if the master returned DLM_NORMAL and then went down before sending
341	 * the ast, the lock may already have been moved to the granted queue;
342	 * reset the status to DLM_RECOVERING and retry the convert */
320 if (status != DLM_NORMAL) { 343 if (status != DLM_NORMAL) {
321 if (status != DLM_NOTQUEUED) 344 if (status != DLM_NOTQUEUED)
322 dlm_error(status); 345 dlm_error(status);
323 dlm_revert_pending_convert(res, lock); 346 dlm_revert_pending_convert(res, lock);
347 } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
348 (old_owner != res->owner)) {
349 mlog(0, "res %.*s is in recovering or has been recovered.\n",
350 res->lockname.len, res->lockname.name);
351 status = DLM_RECOVERING;
324 } 352 }
325bail: 353bail:
326 spin_unlock(&res->spinlock); 354 spin_unlock(&res->spinlock);
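The in-place convert fix above leans on list_move_tail(): per the new comment, the lock converted in place may be the only one on the granted queue with an invalid LVB, so it is moved to the tail rather than left wherever it sat. For readers unfamiliar with the primitive, list_move_tail() is just a delete followed by an add-before-head on a circular doubly linked list; a compact user-space re-implementation (not the kernel's list.h, names are illustrative) behaves like this:

#include <stddef.h>
#include <stdio.h>

/* Minimal circular doubly linked list in the style of the kernel's list.h. */
struct list_head {
	struct list_head *next, *prev;
};

static void list_init(struct list_head *h)
{
	h->next = h;
	h->prev = h;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

/* Unlink the entry and re-insert it just before head, i.e. at the tail. */
static void list_move_tail(struct list_head *entry, struct list_head *head)
{
	list_del(entry);
	list_add_tail(entry, head);
}

struct lock {
	int id;
	struct list_head list;
};

int main(void)
{
	struct list_head granted;
	struct lock a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct list_head *pos;

	list_init(&granted);
	list_add_tail(&a.list, &granted);
	list_add_tail(&b.list, &granted);
	list_add_tail(&c.list, &granted);

	list_move_tail(&b.list, &granted);	/* b now sits behind a and c */

	for (pos = granted.next; pos != &granted; pos = pos->next) {
		struct lock *l = (struct lock *)((char *)pos -
					offsetof(struct lock, list));
		printf("%d ", l->id);		/* prints: 1 3 2 */
	}
	printf("\n");
	return 0;
}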
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index cd38488a10fc..f6b313898763 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2083,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
2083 dlm_lock_get(lock); 2083 dlm_lock_get(lock);
2084 if (lock->convert_pending) { 2084 if (lock->convert_pending) {
2085 /* move converting lock back to granted */ 2085 /* move converting lock back to granted */
2086 BUG_ON(i != DLM_CONVERTING_LIST);
2087 mlog(0, "node died with convert pending " 2086 mlog(0, "node died with convert pending "
2088 "on %.*s. move back to granted list.\n", 2087 "on %.*s. move back to granted list.\n",
2089 res->lockname.len, res->lockname.name); 2088 res->lockname.len, res->lockname.name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7cb38fdca229..c18ab45f8d21 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1381,44 +1381,6 @@ out:
1381 return ret; 1381 return ret;
1382} 1382}
1383 1383
1384/*
1385 * Will look for holes and unwritten extents in the range starting at
1386 * pos for count bytes (inclusive).
1387 */
1388static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1389 size_t count)
1390{
1391 int ret = 0;
1392 unsigned int extent_flags;
1393 u32 cpos, clusters, extent_len, phys_cpos;
1394 struct super_block *sb = inode->i_sb;
1395
1396 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1397 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1398
1399 while (clusters) {
1400 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1401 &extent_flags);
1402 if (ret < 0) {
1403 mlog_errno(ret);
1404 goto out;
1405 }
1406
1407 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1408 ret = 1;
1409 break;
1410 }
1411
1412 if (extent_len > clusters)
1413 extent_len = clusters;
1414
1415 clusters -= extent_len;
1416 cpos += extent_len;
1417 }
1418out:
1419 return ret;
1420}
1421
1422static int ocfs2_write_remove_suid(struct inode *inode) 1384static int ocfs2_write_remove_suid(struct inode *inode)
1423{ 1385{
1424 int ret; 1386 int ret;
@@ -2129,18 +2091,12 @@ out:
2129 2091
2130static int ocfs2_prepare_inode_for_write(struct file *file, 2092static int ocfs2_prepare_inode_for_write(struct file *file,
2131 loff_t pos, 2093 loff_t pos,
2132 size_t count, 2094 size_t count)
2133 int appending,
2134 int *direct_io,
2135 int *has_refcount)
2136{ 2095{
2137 int ret = 0, meta_level = 0; 2096 int ret = 0, meta_level = 0;
2138 struct dentry *dentry = file->f_path.dentry; 2097 struct dentry *dentry = file->f_path.dentry;
2139 struct inode *inode = d_inode(dentry); 2098 struct inode *inode = d_inode(dentry);
2140 loff_t end; 2099 loff_t end;
2141 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2142 int full_coherency = !(osb->s_mount_opt &
2143 OCFS2_MOUNT_COHERENCY_BUFFERED);
2144 2100
2145 /* 2101 /*
2146 * We start with a read level meta lock and only jump to an ex 2102 * We start with a read level meta lock and only jump to an ex
@@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2189 pos, 2145 pos,
2190 count, 2146 count,
2191 &meta_level); 2147 &meta_level);
2192 if (has_refcount)
2193 *has_refcount = 1;
2194 if (direct_io)
2195 *direct_io = 0;
2196 } 2148 }
2197 2149
2198 if (ret < 0) { 2150 if (ret < 0) {
@@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2200 goto out_unlock; 2152 goto out_unlock;
2201 } 2153 }
2202 2154
2203 /*
2204 * Skip the O_DIRECT checks if we don't need
2205 * them.
2206 */
2207 if (!direct_io || !(*direct_io))
2208 break;
2209
2210 /*
2211 * There's no sane way to do direct writes to an inode
2212 * with inline data.
2213 */
2214 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2215 *direct_io = 0;
2216 break;
2217 }
2218
2219 /*
2220 * Allowing concurrent direct writes means
2221 * i_size changes wouldn't be synchronized, so
2222 * one node could wind up truncating another
2223 * nodes writes.
2224 */
2225 if (end > i_size_read(inode) && !full_coherency) {
2226 *direct_io = 0;
2227 break;
2228 }
2229
2230 /*
2231 * Fallback to old way if the feature bit is not set.
2232 */
2233 if (end > i_size_read(inode) &&
2234 !ocfs2_supports_append_dio(osb)) {
2235 *direct_io = 0;
2236 break;
2237 }
2238
2239 /*
2240 * We don't fill holes during direct io, so
2241 * check for them here. If any are found, the
2242 * caller will have to retake some cluster
2243 * locks and initiate the io as buffered.
2244 */
2245 ret = ocfs2_check_range_for_holes(inode, pos, count);
2246 if (ret == 1) {
2247 /*
2248 * Fallback to old way if the feature bit is not set.
2249 * Otherwise try dio first and then complete the rest
2250 * request through buffer io.
2251 */
2252 if (!ocfs2_supports_append_dio(osb))
2253 *direct_io = 0;
2254 ret = 0;
2255 } else if (ret < 0)
2256 mlog_errno(ret);
2257 break; 2155 break;
2258 } 2156 }
2259 2157
2260out_unlock: 2158out_unlock:
2261 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2159 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2262 pos, appending, count, 2160 pos, count);
2263 direct_io, has_refcount);
2264 2161
2265 if (meta_level >= 0) 2162 if (meta_level >= 0)
2266 ocfs2_inode_unlock(inode, meta_level); 2163 ocfs2_inode_unlock(inode, meta_level);
@@ -2272,18 +2169,16 @@ out:
2272static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, 2169static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2273 struct iov_iter *from) 2170 struct iov_iter *from)
2274{ 2171{
2275 int direct_io, appending, rw_level; 2172 int direct_io, rw_level;
2276 int can_do_direct, has_refcount = 0;
2277 ssize_t written = 0; 2173 ssize_t written = 0;
2278 ssize_t ret; 2174 ssize_t ret;
2279 size_t count = iov_iter_count(from), orig_count; 2175 size_t count = iov_iter_count(from);
2280 struct file *file = iocb->ki_filp; 2176 struct file *file = iocb->ki_filp;
2281 struct inode *inode = file_inode(file); 2177 struct inode *inode = file_inode(file);
2282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2283 int full_coherency = !(osb->s_mount_opt & 2179 int full_coherency = !(osb->s_mount_opt &
2284 OCFS2_MOUNT_COHERENCY_BUFFERED); 2180 OCFS2_MOUNT_COHERENCY_BUFFERED);
2285 int unaligned_dio = 0; 2181 void *saved_ki_complete = NULL;
2286 int dropped_dio = 0;
2287 int append_write = ((iocb->ki_pos + count) >= 2182 int append_write = ((iocb->ki_pos + count) >=
2288 i_size_read(inode) ? 1 : 0); 2183 i_size_read(inode) ? 1 : 0);
2289 2184
@@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2296 if (count == 0) 2191 if (count == 0)
2297 return 0; 2192 return 0;
2298 2193
2299 appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
2300 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; 2194 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2301 2195
2302 inode_lock(inode); 2196 inode_lock(inode);
2303 2197
2304relock:
2305 /* 2198 /*
2306 * Concurrent O_DIRECT writes are allowed with 2199 * Concurrent O_DIRECT writes are allowed with
2307 * mount_option "coherency=buffered". 2200 * mount_option "coherency=buffered".
@@ -2334,7 +2227,6 @@ relock:
2334 ocfs2_inode_unlock(inode, 1); 2227 ocfs2_inode_unlock(inode, 1);
2335 } 2228 }
2336 2229
2337 orig_count = iov_iter_count(from);
2338 ret = generic_write_checks(iocb, from); 2230 ret = generic_write_checks(iocb, from);
2339 if (ret <= 0) { 2231 if (ret <= 0) {
2340 if (ret) 2232 if (ret)
@@ -2343,41 +2235,18 @@ relock:
2343 } 2235 }
2344 count = ret; 2236 count = ret;
2345 2237
2346 can_do_direct = direct_io; 2238 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
2347 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
2348 &can_do_direct, &has_refcount);
2349 if (ret < 0) { 2239 if (ret < 0) {
2350 mlog_errno(ret); 2240 mlog_errno(ret);
2351 goto out; 2241 goto out;
2352 } 2242 }
2353 2243
2354 if (direct_io && !is_sync_kiocb(iocb)) 2244 if (direct_io && !is_sync_kiocb(iocb) &&
2355 unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); 2245 ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2356
2357 /*
2358 * We can't complete the direct I/O as requested, fall back to
2359 * buffered I/O.
2360 */
2361 if (direct_io && !can_do_direct) {
2362 ocfs2_rw_unlock(inode, rw_level);
2363
2364 rw_level = -1;
2365
2366 direct_io = 0;
2367 iocb->ki_flags &= ~IOCB_DIRECT;
2368 iov_iter_reexpand(from, orig_count);
2369 dropped_dio = 1;
2370 goto relock;
2371 }
2372
2373 if (unaligned_dio) {
2374 /* 2246 /*
2375 * Wait on previous unaligned aio to complete before 2247 * Make it a sync io if it's an unaligned aio.
2376 * proceeding.
2377 */ 2248 */
2378 mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); 2249 saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2379 /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
2380 ocfs2_iocb_set_unaligned_aio(iocb);
2381 } 2250 }
2382 2251
2383 /* communicate with ocfs2_dio_end_io */ 2252 /* communicate with ocfs2_dio_end_io */
@@ -2398,14 +2267,13 @@ relock:
2398 */ 2267 */
2399 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2268 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2400 rw_level = -1; 2269 rw_level = -1;
2401 unaligned_dio = 0;
2402 } 2270 }
2403 2271
2404 if (unlikely(written <= 0)) 2272 if (unlikely(written <= 0))
2405 goto no_sync; 2273 goto out;
2406 2274
2407 if (((file->f_flags & O_DSYNC) && !direct_io) || 2275 if (((file->f_flags & O_DSYNC) && !direct_io) ||
2408 IS_SYNC(inode) || dropped_dio) { 2276 IS_SYNC(inode)) {
2409 ret = filemap_fdatawrite_range(file->f_mapping, 2277 ret = filemap_fdatawrite_range(file->f_mapping,
2410 iocb->ki_pos - written, 2278 iocb->ki_pos - written,
2411 iocb->ki_pos - 1); 2279 iocb->ki_pos - 1);
@@ -2424,13 +2292,10 @@ relock:
2424 iocb->ki_pos - 1); 2292 iocb->ki_pos - 1);
2425 } 2293 }
2426 2294
2427no_sync:
2428 if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
2429 ocfs2_iocb_clear_unaligned_aio(iocb);
2430 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
2431 }
2432
2433out: 2295out:
2296 if (saved_ki_complete)
2297 xchg(&iocb->ki_complete, saved_ki_complete);
2298
2434 if (rw_level != -1) 2299 if (rw_level != -1)
2435 ocfs2_rw_unlock(inode, rw_level); 2300 ocfs2_rw_unlock(inode, rw_level);
2436 2301
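With ip_unaligned_aio gone, the rewritten write path above handles unaligned AIO by simply making it synchronous: iocb->ki_complete is swapped out with xchg() before the I/O is issued and restored afterwards, so the generic code waits for the write instead of completing it asynchronously. A small stand-alone sketch of that save/clear/restore pattern, using C11 atomics and invented names in place of the kiocb machinery:

#include <stdatomic.h>
#include <stdio.h>

typedef void (*complete_fn)(long result);

static void my_completion(long result)
{
	printf("async completion: %ld\n", result);
}

struct fake_iocb {
	_Atomic(complete_fn) ki_complete;	/* NULL means "complete synchronously" */
};

static long do_write(struct fake_iocb *iocb, long bytes)
{
	/* A NULL ki_complete tells the (pretend) I/O core to wait inline. */
	if (!iocb->ki_complete)
		return bytes;		/* completed synchronously */
	iocb->ki_complete(bytes);	/* would normally fire later, from a worker */
	return -1;			/* stand-in for -EIOCBQUEUED */
}

int main(void)
{
	struct fake_iocb iocb = { .ki_complete = my_completion };
	complete_fn saved;
	long ret;

	/* Unaligned case: force this write to complete synchronously. */
	saved = atomic_exchange(&iocb.ki_complete, NULL);

	ret = do_write(&iocb, 4096);

	/* Put the async completion back for subsequent writes. */
	if (saved)
		atomic_exchange(&iocb.ki_complete, saved);

	printf("wrote %ld bytes synchronously\n", ret);
	return 0;
}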
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ba495beff1c2..12f4a9e9800f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1170,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode)
1170 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 1170 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
1171 "Clear inode of %llu, inode has io markers\n", 1171 "Clear inode of %llu, inode has io markers\n",
1172 (unsigned long long)oi->ip_blkno); 1172 (unsigned long long)oi->ip_blkno);
1173 mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
1174 "Clear inode of %llu, inode has unwritten extents\n",
1175 (unsigned long long)oi->ip_blkno);
1173 1176
1174 ocfs2_extent_map_trunc(inode, 0); 1177 ocfs2_extent_map_trunc(inode, 0);
1175 1178
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 01635e016b3e..d8f3fc8d2551 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,9 +43,6 @@ struct ocfs2_inode_info
43 /* protects extended attribute changes on this inode */ 43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */
47 struct mutex ip_unaligned_aio;
48
49 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
50 spinlock_t ip_lock; 47 spinlock_t ip_lock;
51 u32 ip_open_count; 48 u32 ip_open_count;
@@ -57,6 +54,9 @@ struct ocfs2_inode_info
57 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
58 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
59 56
57 /* Record unwritten extents during direct io. */
58 struct list_head ip_unwritten_list;
59
60 /* protected by recovery_lock. */ 60 /* protected by recovery_lock. */
61 struct inode *ip_next_orphan; 61 struct inode *ip_next_orphan;
62 62
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 61b833b721d8..e607419cdfa4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
231 /* At this point, we know that no more recovery threads can be 231 /* At this point, we know that no more recovery threads can be
232 * launched, so wait for any recovery completion work to 232 * launched, so wait for any recovery completion work to
233 * complete. */ 233 * complete. */
234 flush_workqueue(ocfs2_wq); 234 flush_workqueue(osb->ocfs2_wq);
235 235
236 /* 236 /*
237 * Now that recovery is shut down, and the osb is about to be 237 * Now that recovery is shut down, and the osb is about to be
@@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1326 1326
1327 spin_lock(&journal->j_lock); 1327 spin_lock(&journal->j_lock);
1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
1329 queue_work(ocfs2_wq, &journal->j_recovery_work); 1329 queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
1330 spin_unlock(&journal->j_lock); 1330 spin_unlock(&journal->j_lock);
1331} 1331}
1332 1332
@@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
1968 mutex_lock(&os->os_lock); 1968 mutex_lock(&os->os_lock);
1969 ocfs2_queue_orphan_scan(osb); 1969 ocfs2_queue_orphan_scan(osb);
1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) 1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1971 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 1971 queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
1972 ocfs2_orphan_scan_timeout()); 1972 ocfs2_orphan_scan_timeout());
1973 mutex_unlock(&os->os_lock); 1973 mutex_unlock(&os->os_lock);
1974} 1974}
@@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
2009 else { 2009 else {
2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); 2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
2011 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 2011 queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
2012 ocfs2_orphan_scan_timeout()); 2012 ocfs2_orphan_scan_timeout());
2013 } 2013 }
2014} 2014}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7d62c43a2c3e..fe0d1f9571bb 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
386 struct ocfs2_dinode *alloc = NULL; 386 struct ocfs2_dinode *alloc = NULL;
387 387
388 cancel_delayed_work(&osb->la_enable_wq); 388 cancel_delayed_work(&osb->la_enable_wq);
389 flush_workqueue(ocfs2_wq); 389 flush_workqueue(osb->ocfs2_wq);
390 390
391 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 391 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
392 goto out; 392 goto out;
@@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
1085 } else { 1085 } else {
1086 osb->local_alloc_state = OCFS2_LA_DISABLED; 1086 osb->local_alloc_state = OCFS2_LA_DISABLED;
1087 } 1087 }
1088 queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, 1088 queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
1089 OCFS2_LA_ENABLE_INTERVAL); 1089 OCFS2_LA_ENABLE_INTERVAL);
1090 goto out_unlock; 1090 goto out_unlock;
1091 } 1091 }
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 77ebc2bc1cca..9ea081f4e6e4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
104 if (page->index == last_index) 104 if (page->index == last_index)
105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
106 106
107 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, 107 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
108 &fsdata, di_bh, page); 108 &locked_page, &fsdata, di_bh, page);
109 if (ret) { 109 if (ret) {
110 if (ret != -ENOSPC) 110 if (ret != -ENOSPC)
111 mlog_errno(ret); 111 mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..6cf6538a0651 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
464 struct ocfs2_refcount_tree *osb_ref_tree_lru; 464 struct ocfs2_refcount_tree *osb_ref_tree_lru;
465 465
466 struct mutex system_file_mutex; 466 struct mutex system_file_mutex;
467
468 /*
469 * OCFS2 needs to schedule several different types of work which
470 * require cluster locking, disk I/O, recovery waits, etc. Since these
471 * types of work tend to be heavy we avoid using the kernel events
472 * workqueue and schedule on our own.
473 */
474 struct workqueue_struct *ocfs2_wq;
467}; 475};
468 476
469#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 477#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 24b7e7f591dc..f8f5fc5e6c05 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
1450 1450
1451TRACE_EVENT(ocfs2_prepare_inode_for_write, 1451TRACE_EVENT(ocfs2_prepare_inode_for_write,
1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos, 1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
1453 int appending, unsigned long count, 1453 unsigned long count),
1454 int *direct_io, int *has_refcount), 1454 TP_ARGS(ino, saved_pos, count),
1455 TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
1456 TP_STRUCT__entry( 1455 TP_STRUCT__entry(
1457 __field(unsigned long long, ino) 1456 __field(unsigned long long, ino)
1458 __field(unsigned long long, saved_pos) 1457 __field(unsigned long long, saved_pos)
1459 __field(int, appending)
1460 __field(unsigned long, count) 1458 __field(unsigned long, count)
1461 __field(int, direct_io)
1462 __field(int, has_refcount)
1463 ), 1459 ),
1464 TP_fast_assign( 1460 TP_fast_assign(
1465 __entry->ino = ino; 1461 __entry->ino = ino;
1466 __entry->saved_pos = saved_pos; 1462 __entry->saved_pos = saved_pos;
1467 __entry->appending = appending;
1468 __entry->count = count; 1463 __entry->count = count;
1469 __entry->direct_io = direct_io ? *direct_io : -1;
1470 __entry->has_refcount = has_refcount ? *has_refcount : -1;
1471 ), 1464 ),
1472 TP_printk("%llu %llu %d %lu %d %d", __entry->ino, 1465 TP_printk("%llu %llu %lu", __entry->ino,
1473 __entry->saved_pos, __entry->appending, __entry->count, 1466 __entry->saved_pos, __entry->count)
1474 __entry->direct_io, __entry->has_refcount)
1475); 1467);
1476 1468
1477DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); 1469DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 91bc674203ed..3892f3c079ca 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
726 dqgrab(dquot); 726 dqgrab(dquot);
727 /* First entry on list -> queue work */ 727 /* First entry on list -> queue work */
728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) 728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
729 queue_work(ocfs2_wq, &osb->dquot_drop_work); 729 queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
730 goto out; 730 goto out;
731 } 731 }
732 status = ocfs2_lock_global_qf(oinfo, 1); 732 status = ocfs2_lock_global_qf(oinfo, 1);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 576b9a04873f..18451e0fab81 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { 196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i); 197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
199 if (cluster > clusters) 199 if (cluster >= clusters)
200 break; 200 break;
201 201
202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); 202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ccc9386c42c5..7db631e1c8b0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -80,12 +80,6 @@ static struct kmem_cache *ocfs2_inode_cachep;
80struct kmem_cache *ocfs2_dquot_cachep; 80struct kmem_cache *ocfs2_dquot_cachep;
81struct kmem_cache *ocfs2_qf_chunk_cachep; 81struct kmem_cache *ocfs2_qf_chunk_cachep;
82 82
83/* OCFS2 needs to schedule several different types of work which
84 * require cluster locking, disk I/O, recovery waits, etc. Since these
85 * types of work tend to be heavy we avoid using the kernel events
86 * workqueue and schedule on our own. */
87struct workqueue_struct *ocfs2_wq = NULL;
88
89static struct dentry *ocfs2_debugfs_root; 83static struct dentry *ocfs2_debugfs_root;
90 84
91MODULE_AUTHOR("Oracle"); 85MODULE_AUTHOR("Oracle");
@@ -1613,33 +1607,25 @@ static int __init ocfs2_init(void)
1613 if (status < 0) 1607 if (status < 0)
1614 goto out2; 1608 goto out2;
1615 1609
1616 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
1617 if (!ocfs2_wq) {
1618 status = -ENOMEM;
1619 goto out3;
1620 }
1621
1622 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1610 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1623 if (!ocfs2_debugfs_root) { 1611 if (!ocfs2_debugfs_root) {
1624 status = -ENOMEM; 1612 status = -ENOMEM;
1625 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1613 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1626 goto out4; 1614 goto out3;
1627 } 1615 }
1628 1616
1629 ocfs2_set_locking_protocol(); 1617 ocfs2_set_locking_protocol();
1630 1618
1631 status = register_quota_format(&ocfs2_quota_format); 1619 status = register_quota_format(&ocfs2_quota_format);
1632 if (status < 0) 1620 if (status < 0)
1633 goto out4; 1621 goto out3;
1634 status = register_filesystem(&ocfs2_fs_type); 1622 status = register_filesystem(&ocfs2_fs_type);
1635 if (!status) 1623 if (!status)
1636 return 0; 1624 return 0;
1637 1625
1638 unregister_quota_format(&ocfs2_quota_format); 1626 unregister_quota_format(&ocfs2_quota_format);
1639out4:
1640 destroy_workqueue(ocfs2_wq);
1641 debugfs_remove(ocfs2_debugfs_root);
1642out3: 1627out3:
1628 debugfs_remove(ocfs2_debugfs_root);
1643 ocfs2_free_mem_caches(); 1629 ocfs2_free_mem_caches();
1644out2: 1630out2:
1645 exit_ocfs2_uptodate_cache(); 1631 exit_ocfs2_uptodate_cache();
@@ -1650,11 +1636,6 @@ out1:
1650 1636
1651static void __exit ocfs2_exit(void) 1637static void __exit ocfs2_exit(void)
1652{ 1638{
1653 if (ocfs2_wq) {
1654 flush_workqueue(ocfs2_wq);
1655 destroy_workqueue(ocfs2_wq);
1656 }
1657
1658 unregister_quota_format(&ocfs2_quota_format); 1639 unregister_quota_format(&ocfs2_quota_format);
1659 1640
1660 debugfs_remove(ocfs2_debugfs_root); 1641 debugfs_remove(ocfs2_debugfs_root);
@@ -1745,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data)
1745 spin_lock_init(&oi->ip_lock); 1726 spin_lock_init(&oi->ip_lock);
1746 ocfs2_extent_map_init(&oi->vfs_inode); 1727 ocfs2_extent_map_init(&oi->vfs_inode);
1747 INIT_LIST_HEAD(&oi->ip_io_markers); 1728 INIT_LIST_HEAD(&oi->ip_io_markers);
1729 INIT_LIST_HEAD(&oi->ip_unwritten_list);
1748 oi->ip_dir_start_lookup = 0; 1730 oi->ip_dir_start_lookup = 0;
1749 mutex_init(&oi->ip_unaligned_aio);
1750 init_rwsem(&oi->ip_alloc_sem); 1731 init_rwsem(&oi->ip_alloc_sem);
1751 init_rwsem(&oi->ip_xattr_sem); 1732 init_rwsem(&oi->ip_xattr_sem);
1752 mutex_init(&oi->ip_io_mutex); 1733 mutex_init(&oi->ip_io_mutex);
@@ -2349,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2349 } 2330 }
2350 cleancache_init_shared_fs(sb); 2331 cleancache_init_shared_fs(sb);
2351 2332
2333 osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
2334 if (!osb->ocfs2_wq) {
2335 status = -ENOMEM;
2336 mlog_errno(status);
2337 }
2338
2352bail: 2339bail:
2353 return status; 2340 return status;
2354} 2341}
@@ -2536,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2536{ 2523{
2537 /* This function assumes that the caller has the main osb resource */ 2524 /* This function assumes that the caller has the main osb resource */
2538 2525
2526	/* ocfs2_initialize_super() has already created this workqueue */
2527 if (osb->ocfs2_wq) {
2528 flush_workqueue(osb->ocfs2_wq);
2529 destroy_workqueue(osb->ocfs2_wq);
2530 }
2531
2539 ocfs2_free_slot_info(osb); 2532 ocfs2_free_slot_info(osb);
2540 2533
2541 kfree(osb->osb_orphan_wipes); 2534 kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
26#ifndef OCFS2_SUPER_H 26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H 27#define OCFS2_SUPER_H
28 28
29extern struct workqueue_struct *ocfs2_wq;
30
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, 29int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num); 30 int node_num);
33 31
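The workqueue hunks in journal.c, localalloc.c, quota_global.c, super.c and super.h are all one refactor: the global ocfs2_wq created at module init becomes a per-mount osb->ocfs2_wq, created in ocfs2_initialize_super() and flushed and destroyed in ocfs2_delete_osb(), so queued work for one mount no longer shares a single thread with every other mount. Reduced to just the ownership change, and with deliberately invented stand-in types (this is not the kernel workqueue API), the shape is roughly:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for a workqueue; only the ownership pattern matters here. */
struct workqueue {
	char name[32];
};

struct mount {			/* analogue of struct ocfs2_super */
	struct workqueue *wq;	/* was: one global queue shared by all mounts */
};

static struct workqueue *create_workqueue(const char *name)
{
	struct workqueue *wq = calloc(1, sizeof(*wq));

	if (wq)
		snprintf(wq->name, sizeof(wq->name), "%s", name);
	return wq;
}

static int mount_init(struct mount *m)
{
	m->wq = create_workqueue("ocfs2_wq");	/* one queue per mount */
	return m->wq ? 0 : -1;
}

static void mount_destroy(struct mount *m)
{
	/* a flush of any pending work would happen here, then teardown */
	free(m->wq);
	m->wq = NULL;
}

int main(void)
{
	struct mount a, b;

	if (mount_init(&a) || mount_init(&b))
		return 1;
	printf("each mount owns its own queue: %p vs %p\n",
	       (void *)a.wq, (void *)b.wq);
	mount_destroy(&a);
	mount_destroy(&b);
	return 0;
}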
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8f5a12ab2f2b..339125bb4d2c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -456,7 +456,7 @@
456 *(.entry.text) \ 456 *(.entry.text) \
457 VMLINUX_SYMBOL(__entry_text_end) = .; 457 VMLINUX_SYMBOL(__entry_text_end) = .;
458 458
459#ifdef CONFIG_FUNCTION_GRAPH_TRACER 459#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
460#define IRQENTRY_TEXT \ 460#define IRQENTRY_TEXT \
461 ALIGN_FUNCTION(); \ 461 ALIGN_FUNCTION(); \
462 VMLINUX_SYMBOL(__irqentry_text_start) = .; \ 462 VMLINUX_SYMBOL(__irqentry_text_start) = .; \
@@ -466,6 +466,16 @@
466#define IRQENTRY_TEXT 466#define IRQENTRY_TEXT
467#endif 467#endif
468 468
469#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
470#define SOFTIRQENTRY_TEXT \
471 ALIGN_FUNCTION(); \
472 VMLINUX_SYMBOL(__softirqentry_text_start) = .; \
473 *(.softirqentry.text) \
474 VMLINUX_SYMBOL(__softirqentry_text_end) = .;
475#else
476#define SOFTIRQENTRY_TEXT
477#endif
478
469/* Section used for early init (in .S files) */ 479/* Section used for early init (in .S files) */
470#define HEAD_TEXT *(.head.text) 480#define HEAD_TEXT *(.head.text)
471 481
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 6d9df3f7e334..dea12a6e413b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -811,16 +811,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
811 */ 811 */
812#define __notrace_funcgraph notrace 812#define __notrace_funcgraph notrace
813 813
814/*
815 * We want to which function is an entrypoint of a hardirq.
816 * That will help us to put a signal on output.
817 */
818#define __irq_entry __attribute__((__section__(".irqentry.text")))
819
820/* Limits of hardirq entrypoints */
821extern char __irqentry_text_start[];
822extern char __irqentry_text_end[];
823
824#define FTRACE_NOTRACE_DEPTH 65536 814#define FTRACE_NOTRACE_DEPTH 65536
825#define FTRACE_RETFUNC_DEPTH 50 815#define FTRACE_RETFUNC_DEPTH 50
826#define FTRACE_RETSTACK_ALLOC_SIZE 32 816#define FTRACE_RETSTACK_ALLOC_SIZE 32
@@ -857,7 +847,6 @@ static inline void unpause_graph_tracing(void)
857#else /* !CONFIG_FUNCTION_GRAPH_TRACER */ 847#else /* !CONFIG_FUNCTION_GRAPH_TRACER */
858 848
859#define __notrace_funcgraph 849#define __notrace_funcgraph
860#define __irq_entry
861#define INIT_FTRACE_GRAPH 850#define INIT_FTRACE_GRAPH
862 851
863static inline void ftrace_graph_init_task(struct task_struct *t) { } 852static inline void ftrace_graph_init_task(struct task_struct *t) { }
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 358076eda364..9fcabeb07787 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -683,4 +683,24 @@ extern int early_irq_init(void);
683extern int arch_probe_nr_irqs(void); 683extern int arch_probe_nr_irqs(void);
684extern int arch_early_irq_init(void); 684extern int arch_early_irq_init(void);
685 685
686#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
687/*
688 * We want to know which function is an entrypoint of a hardirq or a softirq.
689 */
690#define __irq_entry __attribute__((__section__(".irqentry.text")))
691#define __softirq_entry \
692 __attribute__((__section__(".softirqentry.text")))
693
694/* Limits of hardirq entrypoints */
695extern char __irqentry_text_start[];
696extern char __irqentry_text_end[];
697/* Limits of softirq entrypoints */
698extern char __softirqentry_text_start[];
699extern char __softirqentry_text_end[];
700
701#else
702#define __irq_entry
703#define __softirq_entry
704#endif
705
686#endif 706#endif
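
The hard/soft IRQ entry markers above exist so that code placed in those sections can be recognized by address alone. A minimal sketch of the bounds check this enables, mirroring in_irqentry_text() added to mm/kasan/kasan.c later in this patch (the helper name here is illustrative):

#include <linux/interrupt.h>
#include <linux/types.h>

/* Sketch: does ptr point into hardirq or softirq entry code? */
static inline bool in_irq_entry_sections(unsigned long ptr)
{
        return (ptr >= (unsigned long)&__irqentry_text_start &&
                ptr < (unsigned long)&__irqentry_text_end) ||
               (ptr >= (unsigned long)&__softirqentry_text_start &&
                ptr < (unsigned long)&__softirqentry_text_end);
}
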
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 0fdc798e3ff7..737371b56044 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -48,19 +48,28 @@ void kasan_unpoison_task_stack(struct task_struct *task);
48void kasan_alloc_pages(struct page *page, unsigned int order); 48void kasan_alloc_pages(struct page *page, unsigned int order);
49void kasan_free_pages(struct page *page, unsigned int order); 49void kasan_free_pages(struct page *page, unsigned int order);
50 50
51void kasan_cache_create(struct kmem_cache *cache, size_t *size,
52 unsigned long *flags);
53
51void kasan_poison_slab(struct page *page); 54void kasan_poison_slab(struct page *page);
52void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); 55void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
53void kasan_poison_object_data(struct kmem_cache *cache, void *object); 56void kasan_poison_object_data(struct kmem_cache *cache, void *object);
54 57
55void kasan_kmalloc_large(const void *ptr, size_t size); 58void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags);
56void kasan_kfree_large(const void *ptr); 59void kasan_kfree_large(const void *ptr);
57void kasan_kfree(void *ptr); 60void kasan_kfree(void *ptr);
58void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); 61void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size,
59void kasan_krealloc(const void *object, size_t new_size); 62 gfp_t flags);
63void kasan_krealloc(const void *object, size_t new_size, gfp_t flags);
60 64
61void kasan_slab_alloc(struct kmem_cache *s, void *object); 65void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
62void kasan_slab_free(struct kmem_cache *s, void *object); 66void kasan_slab_free(struct kmem_cache *s, void *object);
63 67
68struct kasan_cache {
69 int alloc_meta_offset;
70 int free_meta_offset;
71};
72
64int kasan_module_alloc(void *addr, size_t size); 73int kasan_module_alloc(void *addr, size_t size);
65void kasan_free_shadow(const struct vm_struct *vm); 74void kasan_free_shadow(const struct vm_struct *vm);
66 75
@@ -76,20 +85,26 @@ static inline void kasan_disable_current(void) {}
76static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} 85static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
77static inline void kasan_free_pages(struct page *page, unsigned int order) {} 86static inline void kasan_free_pages(struct page *page, unsigned int order) {}
78 87
88static inline void kasan_cache_create(struct kmem_cache *cache,
89 size_t *size,
90 unsigned long *flags) {}
91
79static inline void kasan_poison_slab(struct page *page) {} 92static inline void kasan_poison_slab(struct page *page) {}
80static inline void kasan_unpoison_object_data(struct kmem_cache *cache, 93static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
81 void *object) {} 94 void *object) {}
82static inline void kasan_poison_object_data(struct kmem_cache *cache, 95static inline void kasan_poison_object_data(struct kmem_cache *cache,
83 void *object) {} 96 void *object) {}
84 97
85static inline void kasan_kmalloc_large(void *ptr, size_t size) {} 98static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {}
86static inline void kasan_kfree_large(const void *ptr) {} 99static inline void kasan_kfree_large(const void *ptr) {}
87static inline void kasan_kfree(void *ptr) {} 100static inline void kasan_kfree(void *ptr) {}
88static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, 101static inline void kasan_kmalloc(struct kmem_cache *s, const void *object,
89 size_t size) {} 102 size_t size, gfp_t flags) {}
90static inline void kasan_krealloc(const void *object, size_t new_size) {} 103static inline void kasan_krealloc(const void *object, size_t new_size,
104 gfp_t flags) {}
91 105
92static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} 106static inline void kasan_slab_alloc(struct kmem_cache *s, void *object,
107 gfp_t flags) {}
93static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} 108static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
94 109
95static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } 110static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
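
The signature changes above mean every allocation hook now forwards the caller's GFP mask, so KASAN can allocate its stack-depot storage with compatible flags. A hedged caller-side sketch mirroring the kmem_cache_alloc_trace() change further down (the wrapper name is illustrative):

#include <linux/kasan.h>
#include <linux/slab.h>

static inline void *tracked_cache_alloc(struct kmem_cache *s, size_t size,
                                        gfp_t flags)
{
        void *ret = kmem_cache_alloc(s, flags);

        /* Forward the caller's flags so KASAN metadata allocation obeys them. */
        kasan_kmalloc(s, ret, size, flags);
        return ret;
}
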
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 450fc977ed02..ed6407d1b7b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1132,6 +1132,8 @@ struct zap_details {
1132 struct address_space *check_mapping; /* Check page->mapping if set */ 1132 struct address_space *check_mapping; /* Check page->mapping if set */
1133 pgoff_t first_index; /* Lowest page->index to unmap */ 1133 pgoff_t first_index; /* Lowest page->index to unmap */
1134 pgoff_t last_index; /* Highest page->index to unmap */ 1134 pgoff_t last_index; /* Highest page->index to unmap */
1135 bool ignore_dirty; /* Ignore dirty pages */
1136 bool check_swap_entries; /* Check also swap entries */
1135}; 1137};
1136 1138
1137struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 1139struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 03e6257321f0..628a43242a34 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -76,8 +76,6 @@ extern unsigned long oom_badness(struct task_struct *p,
76 struct mem_cgroup *memcg, const nodemask_t *nodemask, 76 struct mem_cgroup *memcg, const nodemask_t *nodemask,
77 unsigned long totalpages); 77 unsigned long totalpages);
78 78
79extern int oom_kills_count(void);
80extern void note_oom_kill(void);
81extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, 79extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
82 unsigned int points, unsigned long totalpages, 80 unsigned int points, unsigned long totalpages,
83 struct mem_cgroup *memcg, const char *message); 81 struct mem_cgroup *memcg, const char *message);
@@ -91,7 +89,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
91 89
92extern bool out_of_memory(struct oom_control *oc); 90extern bool out_of_memory(struct oom_control *oc);
93 91
94extern void exit_oom_victim(void); 92extern void exit_oom_victim(struct task_struct *tsk);
95 93
96extern int register_oom_notifier(struct notifier_block *nb); 94extern int register_oom_notifier(struct notifier_block *nb);
97extern int unregister_oom_notifier(struct notifier_block *nb); 95extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 589c4780b077..60bba7e032dc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -426,6 +426,7 @@ extern signed long schedule_timeout(signed long timeout);
426extern signed long schedule_timeout_interruptible(signed long timeout); 426extern signed long schedule_timeout_interruptible(signed long timeout);
427extern signed long schedule_timeout_killable(signed long timeout); 427extern signed long schedule_timeout_killable(signed long timeout);
428extern signed long schedule_timeout_uninterruptible(signed long timeout); 428extern signed long schedule_timeout_uninterruptible(signed long timeout);
429extern signed long schedule_timeout_idle(signed long timeout);
429asmlinkage void schedule(void); 430asmlinkage void schedule(void);
430extern void schedule_preempt_disabled(void); 431extern void schedule_preempt_disabled(void);
431 432
@@ -1848,6 +1849,9 @@ struct task_struct {
1848 unsigned long task_state_change; 1849 unsigned long task_state_change;
1849#endif 1850#endif
1850 int pagefault_disabled; 1851 int pagefault_disabled;
1852#ifdef CONFIG_MMU
1853 struct task_struct *oom_reaper_list;
1854#endif
1851/* CPU-specific state of this task */ 1855/* CPU-specific state of this task */
1852 struct thread_struct thread; 1856 struct thread_struct thread;
1853/* 1857/*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index e4b568738ca3..508bd827e6dc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -92,6 +92,12 @@
92# define SLAB_ACCOUNT 0x00000000UL 92# define SLAB_ACCOUNT 0x00000000UL
93#endif 93#endif
94 94
95#ifdef CONFIG_KASAN
96#define SLAB_KASAN 0x08000000UL
97#else
98#define SLAB_KASAN 0x00000000UL
99#endif
100
95/* The following flags affect the page allocator grouping pages by mobility */ 101/* The following flags affect the page allocator grouping pages by mobility */
96#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 102#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
97#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 103#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
@@ -370,7 +376,7 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
370{ 376{
371 void *ret = kmem_cache_alloc(s, flags); 377 void *ret = kmem_cache_alloc(s, flags);
372 378
373 kasan_kmalloc(s, ret, size); 379 kasan_kmalloc(s, ret, size, flags);
374 return ret; 380 return ret;
375} 381}
376 382
@@ -381,7 +387,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
381{ 387{
382 void *ret = kmem_cache_alloc_node(s, gfpflags, node); 388 void *ret = kmem_cache_alloc_node(s, gfpflags, node);
383 389
384 kasan_kmalloc(s, ret, size); 390 kasan_kmalloc(s, ret, size, gfpflags);
385 return ret; 391 return ret;
386} 392}
387#endif /* CONFIG_TRACING */ 393#endif /* CONFIG_TRACING */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index e878ba35ae91..9edbbf352340 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -76,8 +76,22 @@ struct kmem_cache {
76#ifdef CONFIG_MEMCG 76#ifdef CONFIG_MEMCG
77 struct memcg_cache_params memcg_params; 77 struct memcg_cache_params memcg_params;
78#endif 78#endif
79#ifdef CONFIG_KASAN
80 struct kasan_cache kasan_info;
81#endif
79 82
80 struct kmem_cache_node *node[MAX_NUMNODES]; 83 struct kmem_cache_node *node[MAX_NUMNODES];
81}; 84};
82 85
86static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
87 void *x) {
88 void *object = x - (x - page->s_mem) % cache->size;
89 void *last_object = page->s_mem + (cache->num - 1) * cache->size;
90
91 if (unlikely(object > last_object))
92 return last_object;
93 else
94 return object;
95}
96
83#endif /* _LINUX_SLAB_DEF_H */ 97#endif /* _LINUX_SLAB_DEF_H */
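
nearest_obj() above maps an arbitrary pointer into a SLAB page back to the start of the object containing it, clamping into the last object when the pointer lands in trailing padding; the SLUB variant in the next hunk does the same using page_address(). A standalone illustration of the arithmetic with made-up numbers (plain userspace C, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uintptr_t s_mem = 0x1000;   /* hypothetical start of objects in the page */
        size_t obj_size = 192;      /* hypothetical cache->size */
        unsigned num = 21;          /* hypothetical cache->num */

        uintptr_t x = s_mem + 2 * obj_size + 57;          /* pointer into the 3rd object */
        uintptr_t object = x - (x - s_mem) % obj_size;    /* round down to object start */
        uintptr_t last_object = s_mem + (num - 1) * obj_size;

        if (object > last_object)   /* access hit the slab's trailing padding */
                object = last_object;

        printf("object start = %#lx\n", (unsigned long)object); /* prints 0x1180 */
        return 0;
}
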
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index ac5143f95ee6..665cd0cd18b8 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -130,4 +130,15 @@ static inline void *virt_to_obj(struct kmem_cache *s,
130void object_err(struct kmem_cache *s, struct page *page, 130void object_err(struct kmem_cache *s, struct page *page,
131 u8 *object, char *reason); 131 u8 *object, char *reason);
132 132
133static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
134 void *x) {
135 void *object = x - (x - page_address(page)) % cache->size;
136 void *last_object = page_address(page) +
137 (page->objects - 1) * cache->size;
138 if (unlikely(object > last_object))
139 return last_object;
140 else
141 return object;
142}
143
133#endif /* _LINUX_SLUB_DEF_H */ 144#endif /* _LINUX_SLUB_DEF_H */
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
new file mode 100644
index 000000000000..7978b3e2c1e1
--- /dev/null
+++ b/include/linux/stackdepot.h
@@ -0,0 +1,32 @@
1/*
2 * A generic stack depot implementation
3 *
4 * Author: Alexander Potapenko <glider@google.com>
5 * Copyright (C) 2016 Google, Inc.
6 *
7 * Based on code by Dmitry Chernenkov.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 */
20
21#ifndef _LINUX_STACKDEPOT_H
22#define _LINUX_STACKDEPOT_H
23
24typedef u32 depot_stack_handle_t;
25
26struct stack_trace;
27
28depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags);
29
30void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace);
31
32#endif
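
The intended usage is to capture a stack_trace once, deduplicate and persist it with depot_save_stack(), and rehydrate it later with depot_fetch_stack(). A hedged sketch mirroring save_stack() and print_track() added to KASAN later in this series (helper names are illustrative):

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

static depot_stack_handle_t record_current_stack(gfp_t flags)
{
        unsigned long entries[16];
        struct stack_trace trace = {
                .entries = entries,
                .max_entries = ARRAY_SIZE(entries),
        };

        save_stack_trace(&trace);                 /* capture the current stack */
        return depot_save_stack(&trace, flags);   /* dedup + store, 0 on failure */
}

static void print_recorded_stack(depot_stack_handle_t handle)
{
        struct stack_trace trace;

        depot_fetch_stack(handle, &trace);        /* points into depot storage */
        print_stack_trace(&trace, 0);
}
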
diff --git a/kernel/exit.c b/kernel/exit.c
index 953d1a1c0387..fd90195667e1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk)
435 mm_update_next_owner(mm); 435 mm_update_next_owner(mm);
436 mmput(mm); 436 mmput(mm);
437 if (test_thread_flag(TIF_MEMDIE)) 437 if (test_thread_flag(TIF_MEMDIE))
438 exit_oom_victim(); 438 exit_oom_victim(tsk);
439} 439}
440 440
441static struct task_struct *find_alive_thread(struct task_struct *p) 441static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8aae49dd7da8..17caf4b63342 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
227static inline void lockdep_softirq_end(bool in_hardirq) { } 227static inline void lockdep_softirq_end(bool in_hardirq) { }
228#endif 228#endif
229 229
230asmlinkage __visible void __do_softirq(void) 230asmlinkage __visible void __softirq_entry __do_softirq(void)
231{ 231{
232 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 232 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
233 unsigned long old_flags = current->flags; 233 unsigned long old_flags = current->flags;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d1798fa0c743..73164c3aa56b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1566} 1566}
1567EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1567EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1568 1568
1569/*
1570 * Like schedule_timeout_uninterruptible(), except this task will not contribute
1571 * to load average.
1572 */
1573signed long __sched schedule_timeout_idle(signed long timeout)
1574{
1575 __set_current_state(TASK_IDLE);
1576 return schedule_timeout(timeout);
1577}
1578EXPORT_SYMBOL(schedule_timeout_idle);
1579
1569#ifdef CONFIG_HOTPLUG_CPU 1580#ifdef CONFIG_HOTPLUG_CPU
1570static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) 1581static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
1571{ 1582{
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 91d6a63a2ea7..3a0244ff7ea8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/interrupt.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
diff --git a/lib/Kconfig b/lib/Kconfig
index 133ebc0c1773..3cca1222578e 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -536,4 +536,8 @@ config ARCH_HAS_PMEM_API
536config ARCH_HAS_MMIO_FLUSH 536config ARCH_HAS_MMIO_FLUSH
537 bool 537 bool
538 538
539config STACKDEPOT
540 bool
541 select STACKTRACE
542
539endmenu 543endmenu
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 0fee5acd5aa0..67d8c6838ba9 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -5,8 +5,9 @@ if HAVE_ARCH_KASAN
5 5
6config KASAN 6config KASAN
7 bool "KASan: runtime memory debugger" 7 bool "KASan: runtime memory debugger"
8 depends on SLUB_DEBUG 8 depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB)
9 select CONSTRUCTORS 9 select CONSTRUCTORS
10 select STACKDEPOT if SLAB
10 help 11 help
11 Enables kernel address sanitizer - runtime memory debugger, 12 Enables kernel address sanitizer - runtime memory debugger,
12 designed to find out-of-bounds accesses and use-after-free bugs. 13 designed to find out-of-bounds accesses and use-after-free bugs.
@@ -16,6 +17,8 @@ config KASAN
16 This feature consumes about 1/8 of available memory and brings about 17 This feature consumes about 1/8 of available memory and brings about
17 ~x3 performance slowdown. 18 ~x3 performance slowdown.
18 For better error detection enable CONFIG_STACKTRACE. 19 For better error detection enable CONFIG_STACKTRACE.
20 Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB
21 (the resulting kernel does not boot).
19 22
20choice 23choice
21 prompt "Instrumentation type" 24 prompt "Instrumentation type"
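
With the dependency relaxed, KASAN can now be built on top of SLAB. An illustrative .config fragment (not part of this series) for exercising the new combination; STACKDEPOT and STACKTRACE are pulled in by the new select, and DEBUG_SLAB must remain off:

# Illustrative fragment only
CONFIG_SLAB=y
# CONFIG_DEBUG_SLAB is not set
CONFIG_KASAN=y
CONFIG_KASAN_OUTLINE=y
CONFIG_TEST_KASAN=m
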
diff --git a/lib/Makefile b/lib/Makefile
index a1de5b61ff40..7bd6fd436c97 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -181,6 +181,9 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o
181obj-$(CONFIG_STMP_DEVICE) += stmp_device.o 181obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
182obj-$(CONFIG_IRQ_POLL) += irq_poll.o 182obj-$(CONFIG_IRQ_POLL) += irq_poll.o
183 183
184obj-$(CONFIG_STACKDEPOT) += stackdepot.o
185KASAN_SANITIZE_stackdepot.o := n
186
184libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ 187libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
185 fdt_empty_tree.o 188 fdt_empty_tree.o
186$(foreach file, $(libfdt_files), \ 189$(foreach file, $(libfdt_files), \
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
new file mode 100644
index 000000000000..654c9d87e83a
--- /dev/null
+++ b/lib/stackdepot.c
@@ -0,0 +1,284 @@
1/*
2 * Generic stack depot for storing stack traces.
3 *
4 * Some debugging tools need to save stack traces of certain events which can
 5 * be later presented to the user. For example, KASAN needs to save alloc and
6 * free stacks for each object, but storing two stack traces per object
7 * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for
8 * that).
9 *
10 * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc
11 * and free stacks repeat a lot, we save about 100x space.
12 * Stacks are never removed from depot, so we store them contiguously one after
 13 * another in a contiguous memory allocation.
14 *
15 * Author: Alexander Potapenko <glider@google.com>
16 * Copyright (C) 2016 Google, Inc.
17 *
18 * Based on code by Dmitry Chernenkov.
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * version 2 as published by the Free Software Foundation.
23 *
24 * This program is distributed in the hope that it will be useful, but
25 * WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 * General Public License for more details.
28 *
29 */
30
31#include <linux/gfp.h>
32#include <linux/jhash.h>
33#include <linux/kernel.h>
34#include <linux/mm.h>
35#include <linux/percpu.h>
36#include <linux/printk.h>
37#include <linux/slab.h>
38#include <linux/stacktrace.h>
39#include <linux/stackdepot.h>
40#include <linux/string.h>
41#include <linux/types.h>
42
43#define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
44
45#define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */
46#define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER))
47#define STACK_ALLOC_ALIGN 4
48#define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \
49 STACK_ALLOC_ALIGN)
50#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - STACK_ALLOC_OFFSET_BITS)
51#define STACK_ALLOC_SLABS_CAP 1024
52#define STACK_ALLOC_MAX_SLABS \
53 (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \
54 (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP)
55
56/* The compact structure to store the reference to stacks. */
57union handle_parts {
58 depot_stack_handle_t handle;
59 struct {
60 u32 slabindex : STACK_ALLOC_INDEX_BITS;
61 u32 offset : STACK_ALLOC_OFFSET_BITS;
62 };
63};
64
65struct stack_record {
66 struct stack_record *next; /* Link in the hashtable */
 67 u32 hash; /* Hash in the hashtable */
68 u32 size; /* Number of frames in the stack */
69 union handle_parts handle;
70 unsigned long entries[1]; /* Variable-sized array of entries. */
71};
72
73static void *stack_slabs[STACK_ALLOC_MAX_SLABS];
74
75static int depot_index;
76static int next_slab_inited;
77static size_t depot_offset;
78static DEFINE_SPINLOCK(depot_lock);
79
80static bool init_stack_slab(void **prealloc)
81{
82 if (!*prealloc)
83 return false;
84 /*
85 * This smp_load_acquire() pairs with smp_store_release() to
86 * |next_slab_inited| below and in depot_alloc_stack().
87 */
88 if (smp_load_acquire(&next_slab_inited))
89 return true;
90 if (stack_slabs[depot_index] == NULL) {
91 stack_slabs[depot_index] = *prealloc;
92 } else {
93 stack_slabs[depot_index + 1] = *prealloc;
94 /*
95 * This smp_store_release pairs with smp_load_acquire() from
96 * |next_slab_inited| above and in depot_save_stack().
97 */
98 smp_store_release(&next_slab_inited, 1);
99 }
100 *prealloc = NULL;
101 return true;
102}
103
104/* Allocation of a new stack in raw storage */
105static struct stack_record *depot_alloc_stack(unsigned long *entries, int size,
106 u32 hash, void **prealloc, gfp_t alloc_flags)
107{
108 int required_size = offsetof(struct stack_record, entries) +
109 sizeof(unsigned long) * size;
110 struct stack_record *stack;
111
112 required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN);
113
114 if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) {
115 if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) {
116 WARN_ONCE(1, "Stack depot reached limit capacity");
117 return NULL;
118 }
119 depot_index++;
120 depot_offset = 0;
121 /*
122 * smp_store_release() here pairs with smp_load_acquire() from
123 * |next_slab_inited| in depot_save_stack() and
124 * init_stack_slab().
125 */
126 if (depot_index + 1 < STACK_ALLOC_MAX_SLABS)
127 smp_store_release(&next_slab_inited, 0);
128 }
129 init_stack_slab(prealloc);
130 if (stack_slabs[depot_index] == NULL)
131 return NULL;
132
133 stack = stack_slabs[depot_index] + depot_offset;
134
135 stack->hash = hash;
136 stack->size = size;
137 stack->handle.slabindex = depot_index;
138 stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN;
139 memcpy(stack->entries, entries, size * sizeof(unsigned long));
140 depot_offset += required_size;
141
142 return stack;
143}
144
145#define STACK_HASH_ORDER 20
146#define STACK_HASH_SIZE (1L << STACK_HASH_ORDER)
147#define STACK_HASH_MASK (STACK_HASH_SIZE - 1)
148#define STACK_HASH_SEED 0x9747b28c
149
150static struct stack_record *stack_table[STACK_HASH_SIZE] = {
151 [0 ... STACK_HASH_SIZE - 1] = NULL
152};
153
154/* Calculate hash for a stack */
155static inline u32 hash_stack(unsigned long *entries, unsigned int size)
156{
157 return jhash2((u32 *)entries,
158 size * sizeof(unsigned long) / sizeof(u32),
159 STACK_HASH_SEED);
160}
161
162/* Find a stack that is equal to the one stored in entries in the hash */
163static inline struct stack_record *find_stack(struct stack_record *bucket,
164 unsigned long *entries, int size,
165 u32 hash)
166{
167 struct stack_record *found;
168
169 for (found = bucket; found; found = found->next) {
170 if (found->hash == hash &&
171 found->size == size &&
172 !memcmp(entries, found->entries,
173 size * sizeof(unsigned long))) {
174 return found;
175 }
176 }
177 return NULL;
178}
179
180void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace)
181{
182 union handle_parts parts = { .handle = handle };
183 void *slab = stack_slabs[parts.slabindex];
184 size_t offset = parts.offset << STACK_ALLOC_ALIGN;
185 struct stack_record *stack = slab + offset;
186
187 trace->nr_entries = trace->max_entries = stack->size;
188 trace->entries = stack->entries;
189 trace->skip = 0;
190}
191
192/**
193 * depot_save_stack - save stack in a stack depot.
194 * @trace - the stacktrace to save.
195 * @alloc_flags - flags for allocating additional memory if required.
196 *
197 * Returns the handle of the stack struct stored in depot.
198 */
199depot_stack_handle_t depot_save_stack(struct stack_trace *trace,
200 gfp_t alloc_flags)
201{
202 u32 hash;
203 depot_stack_handle_t retval = 0;
204 struct stack_record *found = NULL, **bucket;
205 unsigned long flags;
206 struct page *page = NULL;
207 void *prealloc = NULL;
208
209 if (unlikely(trace->nr_entries == 0))
210 goto fast_exit;
211
212 hash = hash_stack(trace->entries, trace->nr_entries);
213 /* Bad luck, we won't store this stack. */
214 if (hash == 0)
215 goto exit;
216
217 bucket = &stack_table[hash & STACK_HASH_MASK];
218
219 /*
220 * Fast path: look the stack trace up without locking.
221 * The smp_load_acquire() here pairs with smp_store_release() to
222 * |bucket| below.
223 */
224 found = find_stack(smp_load_acquire(bucket), trace->entries,
225 trace->nr_entries, hash);
226 if (found)
227 goto exit;
228
229 /*
 230 * Check if the current or the next stack slab needs to be initialized.
231 * If so, allocate the memory - we won't be able to do that under the
232 * lock.
233 *
234 * The smp_load_acquire() here pairs with smp_store_release() to
235 * |next_slab_inited| in depot_alloc_stack() and init_stack_slab().
236 */
237 if (unlikely(!smp_load_acquire(&next_slab_inited))) {
238 /*
239 * Zero out zone modifiers, as we don't have specific zone
240 * requirements. Keep the flags related to allocation in atomic
241 * contexts and I/O.
242 */
243 alloc_flags &= ~GFP_ZONEMASK;
244 alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
245 page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER);
246 if (page)
247 prealloc = page_address(page);
248 }
249
250 spin_lock_irqsave(&depot_lock, flags);
251
252 found = find_stack(*bucket, trace->entries, trace->nr_entries, hash);
253 if (!found) {
254 struct stack_record *new =
255 depot_alloc_stack(trace->entries, trace->nr_entries,
256 hash, &prealloc, alloc_flags);
257 if (new) {
258 new->next = *bucket;
259 /*
260 * This smp_store_release() pairs with
261 * smp_load_acquire() from |bucket| above.
262 */
263 smp_store_release(bucket, new);
264 found = new;
265 }
266 } else if (prealloc) {
267 /*
268 * We didn't need to store this stack trace, but let's keep
269 * the preallocated memory for the future.
270 */
271 WARN_ON(!init_stack_slab(&prealloc));
272 }
273
274 spin_unlock_irqrestore(&depot_lock, flags);
275exit:
276 if (prealloc) {
277 /* Nobody used this memory, ok to free it. */
278 free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER);
279 }
280 if (found)
281 retval = found->handle.handle;
282fast_exit:
283 return retval;
284}
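
The 32-bit handle returned by depot_save_stack() is simply a packed (slab index, 16-byte-aligned offset) pair, so a record can be located again without storing a pointer. A standalone illustration of the packing, assuming 4 KiB pages (PAGE_SHIFT == 12); all values are made up:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define STACK_ALLOC_ORDER 2                     /* 4 pages per depot slab */
#define STACK_ALLOC_ALIGN 4                     /* records aligned to 16 bytes */
#define OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - STACK_ALLOC_ALIGN)  /* 10 */
#define INDEX_BITS  (32 - OFFSET_BITS)                                    /* 22 */

union handle_parts {
        uint32_t handle;
        struct {
                uint32_t slabindex : INDEX_BITS;
                uint32_t offset    : OFFSET_BITS;
        };
};

int main(void)
{
        union handle_parts h = { 0 };

        h.slabindex = 3;      /* record lives in the 4th 16 KiB slab */
        h.offset = 0x40;      /* record starts at byte 0x40 << 4 == 0x400 */

        printf("handle = %#x -> slab %u, byte offset %#x\n",
               h.handle, h.slabindex, h.offset << STACK_ALLOC_ALIGN);
        return 0;
}
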
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index c32f3b0048dc..82169fbf2453 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -65,11 +65,34 @@ static noinline void __init kmalloc_node_oob_right(void)
65 kfree(ptr); 65 kfree(ptr);
66} 66}
67 67
68static noinline void __init kmalloc_large_oob_right(void) 68#ifdef CONFIG_SLUB
69static noinline void __init kmalloc_pagealloc_oob_right(void)
69{ 70{
70 char *ptr; 71 char *ptr;
71 size_t size = KMALLOC_MAX_CACHE_SIZE + 10; 72 size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
72 73
74 /* Allocate a chunk that does not fit into a SLUB cache to trigger
75 * the page allocator fallback.
76 */
77 pr_info("kmalloc pagealloc allocation: out-of-bounds to right\n");
78 ptr = kmalloc(size, GFP_KERNEL);
79 if (!ptr) {
80 pr_err("Allocation failed\n");
81 return;
82 }
83
84 ptr[size] = 0;
85 kfree(ptr);
86}
87#endif
88
89static noinline void __init kmalloc_large_oob_right(void)
90{
91 char *ptr;
92 size_t size = KMALLOC_MAX_CACHE_SIZE - 256;
93 /* Allocate a chunk that is large enough, but still fits into a slab
94 * and does not trigger the page allocator fallback in SLUB.
95 */
73 pr_info("kmalloc large allocation: out-of-bounds to right\n"); 96 pr_info("kmalloc large allocation: out-of-bounds to right\n");
74 ptr = kmalloc(size, GFP_KERNEL); 97 ptr = kmalloc(size, GFP_KERNEL);
75 if (!ptr) { 98 if (!ptr) {
@@ -271,6 +294,8 @@ static noinline void __init kmalloc_uaf2(void)
271 } 294 }
272 295
273 ptr1[40] = 'x'; 296 ptr1[40] = 'x';
297 if (ptr1 == ptr2)
298 pr_err("Could not detect use-after-free: ptr1 == ptr2\n");
274 kfree(ptr2); 299 kfree(ptr2);
275} 300}
276 301
@@ -324,6 +349,9 @@ static int __init kmalloc_tests_init(void)
324 kmalloc_oob_right(); 349 kmalloc_oob_right();
325 kmalloc_oob_left(); 350 kmalloc_oob_left();
326 kmalloc_node_oob_right(); 351 kmalloc_node_oob_right();
352#ifdef CONFIG_SLUB
353 kmalloc_pagealloc_oob_right();
354#endif
327 kmalloc_large_oob_right(); 355 kmalloc_large_oob_right();
328 kmalloc_oob_krealloc_more(); 356 kmalloc_oob_krealloc_more();
329 kmalloc_oob_krealloc_less(); 357 kmalloc_oob_krealloc_less();
diff --git a/mm/Makefile b/mm/Makefile
index f5e797cbd128..deb467edca2d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,6 +3,7 @@
3# 3#
4 4
5KASAN_SANITIZE_slab_common.o := n 5KASAN_SANITIZE_slab_common.o := n
6KASAN_SANITIZE_slab.o := n
6KASAN_SANITIZE_slub.o := n 7KASAN_SANITIZE_slub.o := n
7 8
8# These files are disabled because they produce non-interesting and/or 9# These files are disabled because they produce non-interesting and/or
diff --git a/mm/filemap.c b/mm/filemap.c
index 7c00f105845e..a8c69c8c0a90 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1840,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1840 ssize_t retval = 0; 1840 ssize_t retval = 0;
1841 loff_t *ppos = &iocb->ki_pos; 1841 loff_t *ppos = &iocb->ki_pos;
1842 loff_t pos = *ppos; 1842 loff_t pos = *ppos;
1843 size_t count = iov_iter_count(iter);
1844
1845 if (!count)
1846 goto out; /* skip atime */
1843 1847
1844 if (iocb->ki_flags & IOCB_DIRECT) { 1848 if (iocb->ki_flags & IOCB_DIRECT) {
1845 struct address_space *mapping = file->f_mapping; 1849 struct address_space *mapping = file->f_mapping;
1846 struct inode *inode = mapping->host; 1850 struct inode *inode = mapping->host;
1847 size_t count = iov_iter_count(iter);
1848 loff_t size; 1851 loff_t size;
1849 1852
1850 if (!count)
1851 goto out; /* skip atime */
1852 size = i_size_read(inode); 1853 size = i_size_read(inode);
1853 retval = filemap_write_and_wait_range(mapping, pos, 1854 retval = filemap_write_and_wait_range(mapping, pos,
1854 pos + count - 1); 1855 pos + count - 1);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fbfb1b8d6726..86f9f8b82f8e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2578,7 +2578,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2578 } 2578 }
2579 khugepaged_node_load[node]++; 2579 khugepaged_node_load[node]++;
2580 if (!PageLRU(page)) { 2580 if (!PageLRU(page)) {
2581 result = SCAN_SCAN_ABORT; 2581 result = SCAN_PAGE_LRU;
2582 goto out_unmap; 2582 goto out_unmap;
2583 } 2583 }
2584 if (PageLocked(page)) { 2584 if (PageLocked(page)) {
diff --git a/mm/internal.h b/mm/internal.h
index 7449392c6faa..b79abb6721cf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,6 +38,11 @@
38void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 38void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
39 unsigned long floor, unsigned long ceiling); 39 unsigned long floor, unsigned long ceiling);
40 40
41void unmap_page_range(struct mmu_gather *tlb,
42 struct vm_area_struct *vma,
43 unsigned long addr, unsigned long end,
44 struct zap_details *details);
45
41extern int __do_page_cache_readahead(struct address_space *mapping, 46extern int __do_page_cache_readahead(struct address_space *mapping,
42 struct file *filp, pgoff_t offset, unsigned long nr_to_read, 47 struct file *filp, pgoff_t offset, unsigned long nr_to_read,
43 unsigned long lookahead_size); 48 unsigned long lookahead_size);
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 1ad20ade8c91..acb3b6c4dd89 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -17,7 +17,9 @@
17#define DISABLE_BRANCH_PROFILING 17#define DISABLE_BRANCH_PROFILING
18 18
19#include <linux/export.h> 19#include <linux/export.h>
20#include <linux/interrupt.h>
20#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kasan.h>
21#include <linux/kernel.h> 23#include <linux/kernel.h>
22#include <linux/kmemleak.h> 24#include <linux/kmemleak.h>
23#include <linux/linkage.h> 25#include <linux/linkage.h>
@@ -32,7 +34,6 @@
32#include <linux/string.h> 34#include <linux/string.h>
33#include <linux/types.h> 35#include <linux/types.h>
34#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
35#include <linux/kasan.h>
36 37
37#include "kasan.h" 38#include "kasan.h"
38#include "../slab.h" 39#include "../slab.h"
@@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order)
334 KASAN_FREE_PAGE); 335 KASAN_FREE_PAGE);
335} 336}
336 337
338#ifdef CONFIG_SLAB
339/*
340 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
341 * For larger allocations larger redzones are used.
342 */
343static size_t optimal_redzone(size_t object_size)
344{
345 int rz =
346 object_size <= 64 - 16 ? 16 :
347 object_size <= 128 - 32 ? 32 :
348 object_size <= 512 - 64 ? 64 :
349 object_size <= 4096 - 128 ? 128 :
350 object_size <= (1 << 14) - 256 ? 256 :
351 object_size <= (1 << 15) - 512 ? 512 :
352 object_size <= (1 << 16) - 1024 ? 1024 : 2048;
353 return rz;
354}
355
356void kasan_cache_create(struct kmem_cache *cache, size_t *size,
357 unsigned long *flags)
358{
359 int redzone_adjust;
360 /* Make sure the adjusted size is still less than
361 * KMALLOC_MAX_CACHE_SIZE.
362 * TODO: this check is only useful for SLAB, but not SLUB. We'll need
363 * to skip it for SLUB when it starts using kasan_cache_create().
364 */
365 if (*size > KMALLOC_MAX_CACHE_SIZE -
366 sizeof(struct kasan_alloc_meta) -
367 sizeof(struct kasan_free_meta))
368 return;
369 *flags |= SLAB_KASAN;
370 /* Add alloc meta. */
371 cache->kasan_info.alloc_meta_offset = *size;
372 *size += sizeof(struct kasan_alloc_meta);
373
374 /* Add free meta. */
375 if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor ||
376 cache->object_size < sizeof(struct kasan_free_meta)) {
377 cache->kasan_info.free_meta_offset = *size;
378 *size += sizeof(struct kasan_free_meta);
379 }
380 redzone_adjust = optimal_redzone(cache->object_size) -
381 (*size - cache->object_size);
382 if (redzone_adjust > 0)
383 *size += redzone_adjust;
384 *size = min(KMALLOC_MAX_CACHE_SIZE,
385 max(*size,
386 cache->object_size +
387 optimal_redzone(cache->object_size)));
388}
389#endif
390
337void kasan_poison_slab(struct page *page) 391void kasan_poison_slab(struct page *page)
338{ 392{
339 kasan_poison_shadow(page_address(page), 393 kasan_poison_shadow(page_address(page),
@@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
351 kasan_poison_shadow(object, 405 kasan_poison_shadow(object,
352 round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), 406 round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
353 KASAN_KMALLOC_REDZONE); 407 KASAN_KMALLOC_REDZONE);
408#ifdef CONFIG_SLAB
409 if (cache->flags & SLAB_KASAN) {
410 struct kasan_alloc_meta *alloc_info =
411 get_alloc_info(cache, object);
412 alloc_info->state = KASAN_STATE_INIT;
413 }
414#endif
354} 415}
355 416
356void kasan_slab_alloc(struct kmem_cache *cache, void *object) 417#ifdef CONFIG_SLAB
418static inline int in_irqentry_text(unsigned long ptr)
357{ 419{
358 kasan_kmalloc(cache, object, cache->object_size); 420 return (ptr >= (unsigned long)&__irqentry_text_start &&
421 ptr < (unsigned long)&__irqentry_text_end) ||
422 (ptr >= (unsigned long)&__softirqentry_text_start &&
423 ptr < (unsigned long)&__softirqentry_text_end);
424}
425
426static inline void filter_irq_stacks(struct stack_trace *trace)
427{
428 int i;
429
430 if (!trace->nr_entries)
431 return;
432 for (i = 0; i < trace->nr_entries; i++)
433 if (in_irqentry_text(trace->entries[i])) {
434 /* Include the irqentry function into the stack. */
435 trace->nr_entries = i + 1;
436 break;
437 }
438}
439
440static inline depot_stack_handle_t save_stack(gfp_t flags)
441{
442 unsigned long entries[KASAN_STACK_DEPTH];
443 struct stack_trace trace = {
444 .nr_entries = 0,
445 .entries = entries,
446 .max_entries = KASAN_STACK_DEPTH,
447 .skip = 0
448 };
449
450 save_stack_trace(&trace);
451 filter_irq_stacks(&trace);
452 if (trace.nr_entries != 0 &&
453 trace.entries[trace.nr_entries-1] == ULONG_MAX)
454 trace.nr_entries--;
455
456 return depot_save_stack(&trace, flags);
457}
458
459static inline void set_track(struct kasan_track *track, gfp_t flags)
460{
461 track->pid = current->pid;
462 track->stack = save_stack(flags);
463}
464
465struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
466 const void *object)
467{
468 BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
469 return (void *)object + cache->kasan_info.alloc_meta_offset;
470}
471
472struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
473 const void *object)
474{
475 BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
476 return (void *)object + cache->kasan_info.free_meta_offset;
477}
478#endif
479
480void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
481{
482 kasan_kmalloc(cache, object, cache->object_size, flags);
359} 483}
360 484
361void kasan_slab_free(struct kmem_cache *cache, void *object) 485void kasan_slab_free(struct kmem_cache *cache, void *object)
@@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
367 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) 491 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
368 return; 492 return;
369 493
494#ifdef CONFIG_SLAB
495 if (cache->flags & SLAB_KASAN) {
496 struct kasan_free_meta *free_info =
497 get_free_info(cache, object);
498 struct kasan_alloc_meta *alloc_info =
499 get_alloc_info(cache, object);
500 alloc_info->state = KASAN_STATE_FREE;
 501 set_track(&free_info->track, GFP_NOWAIT);
502 }
503#endif
504
370 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); 505 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
371} 506}
372 507
373void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) 508void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
509 gfp_t flags)
374{ 510{
375 unsigned long redzone_start; 511 unsigned long redzone_start;
376 unsigned long redzone_end; 512 unsigned long redzone_end;
@@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
386 kasan_unpoison_shadow(object, size); 522 kasan_unpoison_shadow(object, size);
387 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 523 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
388 KASAN_KMALLOC_REDZONE); 524 KASAN_KMALLOC_REDZONE);
525#ifdef CONFIG_SLAB
526 if (cache->flags & SLAB_KASAN) {
527 struct kasan_alloc_meta *alloc_info =
528 get_alloc_info(cache, object);
529
530 alloc_info->state = KASAN_STATE_ALLOC;
531 alloc_info->alloc_size = size;
532 set_track(&alloc_info->track, flags);
533 }
534#endif
389} 535}
390EXPORT_SYMBOL(kasan_kmalloc); 536EXPORT_SYMBOL(kasan_kmalloc);
391 537
392void kasan_kmalloc_large(const void *ptr, size_t size) 538void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
393{ 539{
394 struct page *page; 540 struct page *page;
395 unsigned long redzone_start; 541 unsigned long redzone_start;
@@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size)
408 KASAN_PAGE_REDZONE); 554 KASAN_PAGE_REDZONE);
409} 555}
410 556
411void kasan_krealloc(const void *object, size_t size) 557void kasan_krealloc(const void *object, size_t size, gfp_t flags)
412{ 558{
413 struct page *page; 559 struct page *page;
414 560
@@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size)
418 page = virt_to_head_page(object); 564 page = virt_to_head_page(object);
419 565
420 if (unlikely(!PageSlab(page))) 566 if (unlikely(!PageSlab(page)))
421 kasan_kmalloc_large(object, size); 567 kasan_kmalloc_large(object, size, flags);
422 else 568 else
423 kasan_kmalloc(page->slab_cache, object, size); 569 kasan_kmalloc(page->slab_cache, object, size, flags);
424} 570}
425 571
426void kasan_kfree(void *ptr) 572void kasan_kfree(void *ptr)
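
kasan_cache_create() above grows each object to make room for allocation/free metadata plus an adaptive redzone. A standalone walk-through of the size computation for a hypothetical cache, assuming 16-byte metadata structs on a 64-bit build and ignoring the RCU/ctor cases and the final KMALLOC_MAX_CACHE_SIZE clamp:

#include <stdio.h>
#include <stddef.h>

/* Same table as the patch: bigger objects get bigger redzones. */
static size_t optimal_redzone(size_t object_size)
{
        return object_size <= 64 - 16    ? 16 :
               object_size <= 128 - 32   ? 32 :
               object_size <= 512 - 64   ? 64 :
               object_size <= 4096 - 128 ? 128 :
               object_size <= (1 << 14) - 256  ? 256 :
               object_size <= (1 << 15) - 512  ? 512 :
               object_size <= (1 << 16) - 1024 ? 1024 : 2048;
}

int main(void)
{
        size_t object_size = 100;               /* hypothetical cache->object_size */
        size_t size = 128;                      /* hypothetical incoming *size */
        size_t alloc_meta = 16, free_meta = 16; /* assumed struct sizes */
        size_t alloc_meta_offset;
        long redzone_adjust;

        alloc_meta_offset = size;               /* alloc metadata is appended first */
        size += alloc_meta;

        /* Free metadata only when it cannot live inside the freed object. */
        if (object_size < free_meta)
                size += free_meta;

        redzone_adjust = (long)optimal_redzone(object_size) -
                         (long)(size - object_size);
        if (redzone_adjust > 0)
                size += redzone_adjust;

        /* Prints: alloc_meta_offset=128, final size=164 (redzone=64) */
        printf("alloc_meta_offset=%zu, final size=%zu (redzone=%zu)\n",
               alloc_meta_offset, size, optimal_redzone(object_size));
        return 0;
}
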
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 4f6c62e5c21e..30a2f0ba0e09 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -2,6 +2,7 @@
2#define __MM_KASAN_KASAN_H 2#define __MM_KASAN_KASAN_H
3 3
4#include <linux/kasan.h> 4#include <linux/kasan.h>
5#include <linux/stackdepot.h>
5 6
6#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) 7#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
7#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) 8#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
@@ -54,6 +55,42 @@ struct kasan_global {
54#endif 55#endif
55}; 56};
56 57
58/**
 59 * Structures to keep alloc and free tracks
60 */
61
62enum kasan_state {
63 KASAN_STATE_INIT,
64 KASAN_STATE_ALLOC,
65 KASAN_STATE_FREE
66};
67
68#define KASAN_STACK_DEPTH 64
69
70struct kasan_track {
71 u32 pid;
72 depot_stack_handle_t stack;
73};
74
75struct kasan_alloc_meta {
76 struct kasan_track track;
77 u32 state : 2; /* enum kasan_state */
78 u32 alloc_size : 30;
79 u32 reserved;
80};
81
82struct kasan_free_meta {
83 /* Allocator freelist pointer, unused by KASAN. */
84 void **freelist;
85 struct kasan_track track;
86};
87
88struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
89 const void *object);
90struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
91 const void *object);
92
93
57static inline const void *kasan_shadow_to_mem(const void *shadow_addr) 94static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
58{ 95{
59 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) 96 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 745aa8f36028..60869a5a0124 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -18,6 +18,7 @@
18#include <linux/printk.h> 18#include <linux/printk.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/stackdepot.h>
21#include <linux/stacktrace.h> 22#include <linux/stacktrace.h>
22#include <linux/string.h> 23#include <linux/string.h>
23#include <linux/types.h> 24#include <linux/types.h>
@@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr)
115 sizeof(init_thread_union.stack)); 116 sizeof(init_thread_union.stack));
116} 117}
117 118
119#ifdef CONFIG_SLAB
120static void print_track(struct kasan_track *track)
121{
122 pr_err("PID = %u\n", track->pid);
123 if (track->stack) {
124 struct stack_trace trace;
125
126 depot_fetch_stack(track->stack, &trace);
127 print_stack_trace(&trace, 0);
128 } else {
129 pr_err("(stack is not available)\n");
130 }
131}
132
133static void object_err(struct kmem_cache *cache, struct page *page,
134 void *object, char *unused_reason)
135{
136 struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
137 struct kasan_free_meta *free_info;
138
139 dump_stack();
140 pr_err("Object at %p, in cache %s\n", object, cache->name);
141 if (!(cache->flags & SLAB_KASAN))
142 return;
143 switch (alloc_info->state) {
144 case KASAN_STATE_INIT:
145 pr_err("Object not allocated yet\n");
146 break;
147 case KASAN_STATE_ALLOC:
148 pr_err("Object allocated with size %u bytes.\n",
149 alloc_info->alloc_size);
150 pr_err("Allocation:\n");
151 print_track(&alloc_info->track);
152 break;
153 case KASAN_STATE_FREE:
154 pr_err("Object freed, allocated with size %u bytes\n",
155 alloc_info->alloc_size);
156 free_info = get_free_info(cache, object);
157 pr_err("Allocation:\n");
158 print_track(&alloc_info->track);
159 pr_err("Deallocation:\n");
160 print_track(&free_info->track);
161 break;
162 }
163}
164#endif
165
118static void print_address_description(struct kasan_access_info *info) 166static void print_address_description(struct kasan_access_info *info)
119{ 167{
120 const void *addr = info->access_addr; 168 const void *addr = info->access_addr;
@@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info)
126 if (PageSlab(page)) { 174 if (PageSlab(page)) {
127 void *object; 175 void *object;
128 struct kmem_cache *cache = page->slab_cache; 176 struct kmem_cache *cache = page->slab_cache;
129 void *last_object; 177 object = nearest_obj(cache, page,
130 178 (void *)info->access_addr);
131 object = virt_to_obj(cache, page_address(page), addr);
132 last_object = page_address(page) +
133 page->objects * cache->size;
134
135 if (unlikely(object > last_object))
136 object = last_object; /* we hit into padding */
137
138 object_err(cache, page, object, 179 object_err(cache, page, object,
139 "kasan: bad access detected"); 180 "kasan: bad access detected");
140 return; 181 return;
141 } 182 }
142 dump_page(page, "kasan: bad access detected"); 183 dump_page(page, "kasan: bad access detected");
@@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info)
146 if (!init_task_stack_addr(addr)) 187 if (!init_task_stack_addr(addr))
147 pr_err("Address belongs to variable %pS\n", addr); 188 pr_err("Address belongs to variable %pS\n", addr);
148 } 189 }
149
150 dump_stack(); 190 dump_stack();
151} 191}
152 192
diff --git a/mm/memory.c b/mm/memory.c
index 81dca0083fcd..098f00d05461 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1102,6 +1102,12 @@ again:
1102 1102
1103 if (!PageAnon(page)) { 1103 if (!PageAnon(page)) {
1104 if (pte_dirty(ptent)) { 1104 if (pte_dirty(ptent)) {
1105 /*
1106 * oom_reaper cannot tear down dirty
1107 * pages
1108 */
1109 if (unlikely(details && details->ignore_dirty))
1110 continue;
1105 force_flush = 1; 1111 force_flush = 1;
1106 set_page_dirty(page); 1112 set_page_dirty(page);
1107 } 1113 }
@@ -1120,8 +1126,8 @@ again:
1120 } 1126 }
1121 continue; 1127 continue;
1122 } 1128 }
1123 /* If details->check_mapping, we leave swap entries. */ 1129 /* only check swap_entries if explicitly asked for in details */
1124 if (unlikely(details)) 1130 if (unlikely(details && !details->check_swap_entries))
1125 continue; 1131 continue;
1126 1132
1127 entry = pte_to_swp_entry(ptent); 1133 entry = pte_to_swp_entry(ptent);
@@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1226 return addr; 1232 return addr;
1227} 1233}
1228 1234
1229static void unmap_page_range(struct mmu_gather *tlb, 1235void unmap_page_range(struct mmu_gather *tlb,
1230 struct vm_area_struct *vma, 1236 struct vm_area_struct *vma,
1231 unsigned long addr, unsigned long end, 1237 unsigned long addr, unsigned long end,
1232 struct zap_details *details) 1238 struct zap_details *details)
@@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
1234 pgd_t *pgd; 1240 pgd_t *pgd;
1235 unsigned long next; 1241 unsigned long next;
1236 1242
1237 if (details && !details->check_mapping)
1238 details = NULL;
1239
1240 BUG_ON(addr >= end); 1243 BUG_ON(addr >= end);
1241 tlb_start_vma(tlb, vma); 1244 tlb_start_vma(tlb, vma);
1242 pgd = pgd_offset(vma->vm_mm, addr); 1245 pgd = pgd_offset(vma->vm_mm, addr);
@@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
2432void unmap_mapping_range(struct address_space *mapping, 2435void unmap_mapping_range(struct address_space *mapping,
2433 loff_t const holebegin, loff_t const holelen, int even_cows) 2436 loff_t const holebegin, loff_t const holelen, int even_cows)
2434{ 2437{
2435 struct zap_details details; 2438 struct zap_details details = { };
2436 pgoff_t hba = holebegin >> PAGE_SHIFT; 2439 pgoff_t hba = holebegin >> PAGE_SHIFT;
2437 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2440 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2438 2441
diff --git a/mm/mempool.c b/mm/mempool.c
index 07c383ddbbab..9b7a14a791cc 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element)
112 kasan_free_pages(element, (unsigned long)pool->pool_data); 112 kasan_free_pages(element, (unsigned long)pool->pool_data);
113} 113}
114 114
115static void kasan_unpoison_element(mempool_t *pool, void *element) 115static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
116{ 116{
117 if (pool->alloc == mempool_alloc_slab) 117 if (pool->alloc == mempool_alloc_slab)
118 kasan_slab_alloc(pool->pool_data, element); 118 kasan_slab_alloc(pool->pool_data, element, flags);
119 if (pool->alloc == mempool_kmalloc) 119 if (pool->alloc == mempool_kmalloc)
120 kasan_krealloc(element, (size_t)pool->pool_data); 120 kasan_krealloc(element, (size_t)pool->pool_data, flags);
121 if (pool->alloc == mempool_alloc_pages) 121 if (pool->alloc == mempool_alloc_pages)
122 kasan_alloc_pages(element, (unsigned long)pool->pool_data); 122 kasan_alloc_pages(element, (unsigned long)pool->pool_data);
123} 123}
@@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element)
130 pool->elements[pool->curr_nr++] = element; 130 pool->elements[pool->curr_nr++] = element;
131} 131}
132 132
133static void *remove_element(mempool_t *pool) 133static void *remove_element(mempool_t *pool, gfp_t flags)
134{ 134{
135 void *element = pool->elements[--pool->curr_nr]; 135 void *element = pool->elements[--pool->curr_nr];
136 136
137 BUG_ON(pool->curr_nr < 0); 137 BUG_ON(pool->curr_nr < 0);
138 kasan_unpoison_element(pool, element); 138 kasan_unpoison_element(pool, element, flags);
139 check_element(pool, element); 139 check_element(pool, element);
140 return element; 140 return element;
141} 141}
@@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool)
154 return; 154 return;
155 155
156 while (pool->curr_nr) { 156 while (pool->curr_nr) {
157 void *element = remove_element(pool); 157 void *element = remove_element(pool, GFP_KERNEL);
158 pool->free(element, pool->pool_data); 158 pool->free(element, pool->pool_data);
159 } 159 }
160 kfree(pool->elements); 160 kfree(pool->elements);
@@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
250 spin_lock_irqsave(&pool->lock, flags); 250 spin_lock_irqsave(&pool->lock, flags);
251 if (new_min_nr <= pool->min_nr) { 251 if (new_min_nr <= pool->min_nr) {
252 while (new_min_nr < pool->curr_nr) { 252 while (new_min_nr < pool->curr_nr) {
253 element = remove_element(pool); 253 element = remove_element(pool, GFP_KERNEL);
254 spin_unlock_irqrestore(&pool->lock, flags); 254 spin_unlock_irqrestore(&pool->lock, flags);
255 pool->free(element, pool->pool_data); 255 pool->free(element, pool->pool_data);
256 spin_lock_irqsave(&pool->lock, flags); 256 spin_lock_irqsave(&pool->lock, flags);
@@ -347,7 +347,7 @@ repeat_alloc:
347 347
348 spin_lock_irqsave(&pool->lock, flags); 348 spin_lock_irqsave(&pool->lock, flags);
349 if (likely(pool->curr_nr)) { 349 if (likely(pool->curr_nr)) {
350 element = remove_element(pool); 350 element = remove_element(pool, gfp_temp);
351 spin_unlock_irqrestore(&pool->lock, flags); 351 spin_unlock_irqrestore(&pool->lock, flags);
352 /* paired with rmb in mempool_free(), read comment there */ 352 /* paired with rmb in mempool_free(), read comment there */
353 smp_wmb(); 353 smp_wmb();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 06f7e1707847..b34d279a7ee6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/ftrace.h> 36#include <linux/ftrace.h>
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <linux/kthread.h>
39#include <linux/init.h>
40
41#include <asm/tlb.h>
42#include "internal.h"
38 43
39#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
40#include <trace/events/oom.h> 45#include <trace/events/oom.h>
@@ -405,6 +410,172 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
405 410
406bool oom_killer_disabled __read_mostly; 411bool oom_killer_disabled __read_mostly;
407 412
413#define K(x) ((x) << (PAGE_SHIFT-10))
414
415#ifdef CONFIG_MMU
416/*
417 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
418 * victim (if that is possible) to help the OOM killer to move on.
419 */
420static struct task_struct *oom_reaper_th;
421static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
422static struct task_struct *oom_reaper_list;
423static DEFINE_SPINLOCK(oom_reaper_lock);
424
425
426static bool __oom_reap_task(struct task_struct *tsk)
427{
428 struct mmu_gather tlb;
429 struct vm_area_struct *vma;
430 struct mm_struct *mm;
431 struct task_struct *p;
432 struct zap_details details = {.check_swap_entries = true,
433 .ignore_dirty = true};
434 bool ret = true;
435
436 /*
437 * Make sure we find the associated mm_struct even when the particular
438 * thread has already terminated and cleared its mm.
 439 * We might race with the exit path, so consider our work done if there
440 * is no mm.
441 */
442 p = find_lock_task_mm(tsk);
443 if (!p)
444 return true;
445
446 mm = p->mm;
447 if (!atomic_inc_not_zero(&mm->mm_users)) {
448 task_unlock(p);
449 return true;
450 }
451
452 task_unlock(p);
453
454 if (!down_read_trylock(&mm->mmap_sem)) {
455 ret = false;
456 goto out;
457 }
458
459 tlb_gather_mmu(&tlb, mm, 0, -1);
460 for (vma = mm->mmap ; vma; vma = vma->vm_next) {
461 if (is_vm_hugetlb_page(vma))
462 continue;
463
464 /*
465 * mlocked VMAs require explicit munlocking before unmap.
466 * Let's keep it simple here and skip such VMAs.
467 */
468 if (vma->vm_flags & VM_LOCKED)
469 continue;
470
471 /*
472 * Only anonymous pages have a good chance to be dropped
473 * without additional steps which we cannot afford as we
474 * are OOM already.
475 *
476 * We do not even care about fs backed pages because all
477 * which are reclaimable have already been reclaimed and
478 * we do not want to block exit_mmap by keeping mm ref
479 * count elevated without a good reason.
480 */
481 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
482 unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
483 &details);
484 }
485 tlb_finish_mmu(&tlb, 0, -1);
486 pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
487 task_pid_nr(tsk), tsk->comm,
488 K(get_mm_counter(mm, MM_ANONPAGES)),
489 K(get_mm_counter(mm, MM_FILEPAGES)),
490 K(get_mm_counter(mm, MM_SHMEMPAGES)));
491 up_read(&mm->mmap_sem);
492
493 /*
 494 * Clear TIF_MEMDIE because the task shouldn't be sitting on
495 * reasonably reclaimable memory anymore. OOM killer can continue
496 * by selecting other victim if unmapping hasn't led to any
497 * improvements. This also means that selecting this task doesn't
498 * make any sense.
499 */
500 tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
501 exit_oom_victim(tsk);
502out:
503 mmput(mm);
504 return ret;
505}
506
507#define MAX_OOM_REAP_RETRIES 10
508static void oom_reap_task(struct task_struct *tsk)
509{
510 int attempts = 0;
511
512 /* Retry the down_read_trylock(mmap_sem) a few times */
513 while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
514 schedule_timeout_idle(HZ/10);
515
516 if (attempts > MAX_OOM_REAP_RETRIES) {
517 pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
518 task_pid_nr(tsk), tsk->comm);
519 debug_show_all_locks();
520 }
521
522 /* Drop a reference taken by wake_oom_reaper */
523 put_task_struct(tsk);
524}
525
526static int oom_reaper(void *unused)
527{
528 set_freezable();
529
530 while (true) {
531 struct task_struct *tsk = NULL;
532
533 wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
534 spin_lock(&oom_reaper_lock);
535 if (oom_reaper_list != NULL) {
536 tsk = oom_reaper_list;
537 oom_reaper_list = tsk->oom_reaper_list;
538 }
539 spin_unlock(&oom_reaper_lock);
540
541 if (tsk)
542 oom_reap_task(tsk);
543 }
544
545 return 0;
546}
547
548static void wake_oom_reaper(struct task_struct *tsk)
549{
550 if (!oom_reaper_th || tsk->oom_reaper_list)
551 return;
552
553 get_task_struct(tsk);
554
555 spin_lock(&oom_reaper_lock);
556 tsk->oom_reaper_list = oom_reaper_list;
557 oom_reaper_list = tsk;
558 spin_unlock(&oom_reaper_lock);
559 wake_up(&oom_reaper_wait);
560}
561
562static int __init oom_init(void)
563{
564 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
565 if (IS_ERR(oom_reaper_th)) {
566 pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
567 PTR_ERR(oom_reaper_th));
568 oom_reaper_th = NULL;
569 }
570 return 0;
571}
572subsys_initcall(oom_init)
573#else
574static void wake_oom_reaper(struct task_struct *tsk)
575{
576}
577#endif
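The reaper handoff above is a plain intrusive producer/consumer queue:
wake_oom_reaper() pushes the victim onto a singly-linked list under
oom_reaper_lock and wakes the kthread, which pops one entry at a time; the
tsk->oom_reaper_list check keeps a task from being queued twice, and
get_task_struct()/put_task_struct() pin the task across the handoff. A
minimal userspace sketch of the same pattern, with pthreads standing in for
the kernel wait queue (names are illustrative, not kernel API):

#include <pthread.h>
#include <stddef.h>

struct victim {
	struct victim *next;		/* plays the role of ->oom_reaper_list */
	int pid;
};

static struct victim *reap_list;
static pthread_mutex_t reap_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reap_wait = PTHREAD_COND_INITIALIZER;

/* producer: one call per victim, like wake_oom_reaper() */
static void queue_victim(struct victim *v)
{
	pthread_mutex_lock(&reap_lock);
	if (!v->next && v != reap_list) {	/* don't queue the same victim twice */
		v->next = reap_list;
		reap_list = v;
	}
	pthread_mutex_unlock(&reap_lock);
	pthread_cond_signal(&reap_wait);
}

/* consumer loop: like the oom_reaper kthread */
static void *reaper(void *unused)
{
	(void)unused;
	for (;;) {
		struct victim *v;

		pthread_mutex_lock(&reap_lock);
		while (!reap_list)
			pthread_cond_wait(&reap_wait, &reap_lock);
		v = reap_list;
		reap_list = v->next;
		v->next = NULL;
		pthread_mutex_unlock(&reap_lock);

		/* reap v->pid here, retrying a few times as oom_reap_task() does */
	}
	return NULL;
}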
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
@@ -431,9 +602,10 @@ void mark_oom_victim(struct task_struct *tsk)
 /**
  * exit_oom_victim - note the exit of an OOM victim
  */
-void exit_oom_victim(void)
+void exit_oom_victim(struct task_struct *tsk)
 {
-	clear_thread_flag(TIF_MEMDIE);
+	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
+		return;
 
 	if (!atomic_dec_return(&oom_victims))
 		wake_up_all(&oom_victims_wait);
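With the reaper in the picture, exit_oom_victim() can be reached twice for the
same task - once from __oom_reap_task() and once from the normal exit path -
so the oom_victims counter must be dropped by whichever caller actually clears
TIF_MEMDIE, and only by that one. A small userspace sketch of the idempotence
this buys, using C11 atomics (illustrative names, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool tif_memdie = true;	/* set when the task became a victim */
static atomic_int oom_victims = 1;

static void exit_victim(void)
{
	/* only the caller that clears the flag may drop the count */
	if (!atomic_exchange(&tif_memdie, false))
		return;
	if (atomic_fetch_sub(&oom_victims, 1) == 1) {
		/* last victim gone: oom_killer_disable() waiters may proceed */
	}
}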
@@ -494,7 +666,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
-#define K(x) ((x) << (PAGE_SHIFT-10))
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
@@ -510,6 +681,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
+	bool can_oom_reap = true;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -600,17 +772,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			continue;
 		if (same_thread_group(p, victim))
 			continue;
-		if (unlikely(p->flags & PF_KTHREAD))
-			continue;
-		if (is_global_init(p))
-			continue;
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+		    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+			/*
+			 * We cannot use oom_reaper for the mm shared by this
+			 * process because it wouldn't get killed and so the
+			 * memory might still be used.
+			 */
+			can_oom_reap = false;
 			continue;
-
+		}
 		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
 	}
 	rcu_read_unlock();
 
+	if (can_oom_reap)
+		wake_oom_reaper(victim);
+
 	mmdrop(mm);
 	put_task_struct(victim);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a762be57e46e..59de90d5d3a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -692,34 +692,28 @@ static inline void __free_one_page(struct page *page,
 	unsigned long combined_idx;
 	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
-	unsigned int max_order = MAX_ORDER;
+	unsigned int max_order;
+
+	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
 
 	VM_BUG_ON(!zone_is_initialized(zone));
 	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
 
 	VM_BUG_ON(migratetype == -1);
-	if (is_migrate_isolate(migratetype)) {
-		/*
-		 * We restrict max order of merging to prevent merge
-		 * between freepages on isolate pageblock and normal
-		 * pageblock. Without this, pageblock isolation
-		 * could cause incorrect freepage accounting.
-		 */
-		max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
-	} else {
+	if (likely(!is_migrate_isolate(migratetype)))
 		__mod_zone_freepage_state(zone, 1 << order, migratetype);
-	}
 
-	page_idx = pfn & ((1 << max_order) - 1);
+	page_idx = pfn & ((1 << MAX_ORDER) - 1);
 
 	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
+continue_merging:
 	while (order < max_order - 1) {
 		buddy_idx = __find_buddy_index(page_idx, order);
 		buddy = page + (buddy_idx - page_idx);
 		if (!page_is_buddy(page, buddy, order))
-			break;
+			goto done_merging;
 		/*
 		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
 		 * merge with it and move up one order.
@@ -736,6 +730,32 @@ static inline void __free_one_page(struct page *page,
 		page_idx = combined_idx;
 		order++;
 	}
+	if (max_order < MAX_ORDER) {
+		/* If we are here, it means order is >= pageblock_order.
+		 * We want to prevent merge between freepages on isolate
+		 * pageblock and normal pageblock. Without this, pageblock
+		 * isolation could cause incorrect freepage or CMA accounting.
+		 *
+		 * We don't want to hit this code for the more frequent
+		 * low-order merging.
+		 */
+		if (unlikely(has_isolate_pageblock(zone))) {
+			int buddy_mt;
+
+			buddy_idx = __find_buddy_index(page_idx, order);
+			buddy = page + (buddy_idx - page_idx);
+			buddy_mt = get_pageblock_migratetype(buddy);
+
+			if (migratetype != buddy_mt
+					&& (is_migrate_isolate(migratetype) ||
+					is_migrate_isolate(buddy_mt)))
+				goto done_merging;
+		}
+		max_order++;
+		goto continue_merging;
+	}
+
+done_merging:
 	set_page_order(page, order);
 
 	/*
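For reference, the merging loop relies on the usual power-of-two buddy
arithmetic: the buddy of a block at page_idx for a given order is found by
flipping bit 'order' of the index, and the merged block starts at the lower
of the two indices. A standalone sketch of that arithmetic, mirroring what
__find_buddy_index() computes (the concrete indices are just examples):

#include <assert.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);	/* flip the order-th bit */
}

int main(void)
{
	/* order-0 pages 8 and 9 are buddies; they merge into an order-1 block at 8 */
	assert(find_buddy_index(8, 0) == 9);
	assert((find_buddy_index(8, 0) & 8UL) == 8);	/* combined_idx */

	/* order-3 blocks at 8 and 0 are buddies; merging yields order-4 at 0 */
	assert(find_buddy_index(8, 3) == 0);

	/*
	 * Capping max_order at pageblock_order + 1, or bailing to done_merging
	 * when the buddy sits in an isolated pageblock, stops exactly these
	 * cross-pageblock merges and keeps per-migratetype freepage accounting
	 * correct.
	 */
	return 0;
}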
diff --git a/mm/slab.c b/mm/slab.c
index e719a5cb3396..17e2848979c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2086,6 +2086,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 	}
 #endif
 
+	kasan_cache_create(cachep, &size, &flags);
+
 	size = ALIGN(size, cachep->align);
 	/*
 	 * We should restrict the number of objects in a slab to implement
@@ -2387,8 +2389,13 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
 	 * cache which they are a constructor for.  Otherwise, deadlock.
 	 * They must also be threaded.
 	 */
-	if (cachep->ctor && !(cachep->flags & SLAB_POISON))
+	if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
+		kasan_unpoison_object_data(cachep,
+					   objp + obj_offset(cachep));
 		cachep->ctor(objp + obj_offset(cachep));
+		kasan_poison_object_data(
+			cachep, objp + obj_offset(cachep));
+	}
 
 	if (cachep->flags & SLAB_RED_ZONE) {
 		if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2409,6 +2416,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 			    struct page *page)
 {
 	int i;
+	void *objp;
 
 	cache_init_objs_debug(cachep, page);
 
@@ -2419,8 +2427,12 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 	for (i = 0; i < cachep->num; i++) {
 		/* constructor could break poison info */
-		if (DEBUG == 0 && cachep->ctor)
-			cachep->ctor(index_to_obj(cachep, page, i));
+		if (DEBUG == 0 && cachep->ctor) {
+			objp = index_to_obj(cachep, page, i);
+			kasan_unpoison_object_data(cachep, objp);
+			cachep->ctor(objp);
+			kasan_poison_object_data(cachep, objp);
+		}
 
 		set_free_obj(page, i, i);
 	}
@@ -2550,6 +2562,7 @@ static int cache_grow(struct kmem_cache *cachep,
 
 	slab_map_pages(cachep, page, freelist);
 
+	kasan_poison_slab(page);
 	cache_init_objs(cachep, page);
 
 	if (gfpflags_allow_blocking(local_flags))
@@ -3316,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
 {
 	struct array_cache *ac = cpu_cache_get(cachep);
 
+	kasan_slab_free(cachep, objp);
+
 	check_irq_off();
 	kmemleak_free_recursive(objp, cachep->flags);
 	objp = cache_free_debugcheck(cachep, objp, caller);
@@ -3363,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *ret = slab_alloc(cachep, flags, _RET_IP_);
 
+	kasan_slab_alloc(cachep, ret, flags);
 	trace_kmem_cache_alloc(_RET_IP_, ret,
 			       cachep->object_size, cachep->size, flags);
 
@@ -3428,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 
 	ret = slab_alloc(cachep, flags, _RET_IP_);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(_RET_IP_, ret,
 		      size, cachep->size, flags);
 	return ret;
@@ -3451,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
 	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
+	kasan_slab_alloc(cachep, ret, flags);
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    cachep->object_size, cachep->size,
 				    flags, nodeid);
@@ -3469,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 
 	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, cachep->size,
 			   flags, nodeid);
@@ -3481,11 +3500,15 @@ static __always_inline void *
 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
 {
 	struct kmem_cache *cachep;
+	void *ret;
 
 	cachep = kmalloc_slab(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	return kmem_cache_alloc_node_trace(cachep, flags, node, size);
+	ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
+	kasan_kmalloc(cachep, ret, size, flags);
+
+	return ret;
 }
 
 void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3519,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 		return cachep;
 	ret = slab_alloc(cachep, flags, caller);
 
+	kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(caller, ret,
 		      size, cachep->size, flags);
 
@@ -4290,10 +4314,18 @@ module_init(slab_proc_init);
  */
 size_t ksize(const void *objp)
 {
+	size_t size;
+
 	BUG_ON(!objp);
 	if (unlikely(objp == ZERO_SIZE_PTR))
 		return 0;
 
-	return virt_to_cache(objp)->object_size;
+	size = virt_to_cache(objp)->object_size;
+	/* We assume that ksize callers could use the whole allocated area,
+	 * so we need to unpoison this area.
+	 */
+	kasan_krealloc(objp, size, GFP_NOWAIT);
+
+	return size;
 }
 EXPORT_SYMBOL(ksize);
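Taken together, the slab.c hooks give every SLAB object the standard KASAN
lifecycle: the whole slab is poisoned when it is grown, an object is
unpoisoned around its constructor and on allocation, poisoned again on free,
and ksize() re-unpoisons the full object because callers may legitimately use
everything it reports. A toy userspace model of the shadow-based idea behind
those hooks (deliberately simplified: byte-granular shadow, invented names,
not the real KASAN implementation):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define POOL_SIZE 64
static unsigned char shadow[POOL_SIZE];	/* nonzero = poisoned, access is a bug */

static void poison(size_t start, size_t len)   { memset(shadow + start, 0xff, len); }
static void unpoison(size_t start, size_t len) { memset(shadow + start, 0x00, len); }

/* what a compiler-instrumented load/store check boils down to */
static bool access_ok(size_t idx)
{
	return idx < POOL_SIZE && shadow[idx] == 0;
}

int main(void)
{
	poison(0, POOL_SIZE);		/* cache_grow(): kasan_poison_slab() */
	unpoison(16, 32);		/* alloc of a 32-byte object at offset 16 */
	assert(access_ok(16) && access_ok(47));
	assert(!access_ok(48));		/* out of bounds: redzone stays poisoned */
	poison(16, 32);			/* free: kasan_slab_free() */
	assert(!access_ok(16));		/* a use-after-free would now be caught */
	return 0;
}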
diff --git a/mm/slab.h b/mm/slab.h
index ff39a8fc3b3f..5969769fbee6 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -405,7 +405,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
 		kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 		kmemleak_alloc_recursive(object, s->object_size, 1,
 					 s->flags, flags);
-		kasan_slab_alloc(s, object);
+		kasan_slab_alloc(s, object, flags);
 	}
 	memcg_kmem_put_cache(s);
 }
diff --git a/mm/slab_common.c b/mm/slab_common.c
index b2e379639a5b..3239bfd758e6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache;
  */
 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
-		SLAB_FAILSLAB)
+		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
 		SLAB_NOTRACK | SLAB_ACCOUNT)
@@ -1013,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	page = alloc_kmem_pages(flags, order);
 	ret = page ? page_address(page) : NULL;
 	kmemleak_alloc(ret, size, 1, flags);
-	kasan_kmalloc_large(ret, size);
+	kasan_kmalloc_large(ret, size, flags);
 	return ret;
 }
 EXPORT_SYMBOL(kmalloc_order);
@@ -1192,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
 	ks = ksize(p);
 
 	if (ks >= new_size) {
-		kasan_krealloc((void *)p, new_size);
+		kasan_krealloc((void *)p, new_size, flags);
 		return (void *)p;
 	}
 
diff --git a/mm/slub.c b/mm/slub.c
index 7277413ebc8b..4dbb109eb8cd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1313,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
 static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
 {
 	kmemleak_alloc(ptr, size, 1, flags);
-	kasan_kmalloc_large(ptr, size);
+	kasan_kmalloc_large(ptr, size, flags);
 }
 
 static inline void kfree_hook(const void *x)
@@ -2596,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
 	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
 	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
-	kasan_kmalloc(s, ret, size);
+	kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2624,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, s->size, gfpflags, node);
 
-	kasan_kmalloc(s, ret, size);
+	kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -3182,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node)
 	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
 	init_tracking(kmem_cache_node, n);
 #endif
-	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
+	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+		      GFP_KERNEL);
 	init_kmem_cache_node(n);
 	inc_slabs_node(kmem_cache_node, node, page->objects);
 
@@ -3561,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 
 	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
-	kasan_kmalloc(s, ret, size);
+	kasan_kmalloc(s, ret, size, flags);
 
 	return ret;
 }
@@ -3606,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 
 	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
-	kasan_kmalloc(s, ret, size);
+	kasan_kmalloc(s, ret, size, flags);
 
 	return ret;
 }
@@ -3635,7 +3636,7 @@ size_t ksize(const void *object)
 	size_t size = __ksize(object);
 	/* We assume that ksize callers could use whole allocated area,
 	   so we need unpoison this area. */
-	kasan_krealloc(object, size);
+	kasan_krealloc(object, size, GFP_NOWAIT);
 	return size;
 }
 EXPORT_SYMBOL(ksize);
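The two ksize() hunks encode the same caller-visible contract: kmalloc()
rounds a request up to its size class, and a caller may use every byte that
ksize() reports, so KASAN has to unpoison the whole object rather than just
the requested length; GFP_NOWAIT is passed because ksize() has no allocation
context of its own to forward. A minimal userspace illustration of that
contract (the 128-byte size class is an assumption for the example):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* stand-in for a kmalloc size class: 100-byte requests come from a 128-byte cache */
static size_t size_class(size_t req)
{
	return req <= 128 ? 128 : req;
}

int main(void)
{
	size_t req = 100;
	size_t usable = size_class(req);	/* what ksize() would report */
	char *p = malloc(usable);

	assert(p && usable == 128);
	memset(p, 0, usable);	/* touches bytes 100..127 too: must not be flagged */
	free(p);
	return 0;
}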