Diffstat:
-rw-r--r--  CREDITS | 4
-rw-r--r--  Documentation/accounting/getdelays.c | 1
-rw-r--r--  Documentation/kernel-parameters.txt | 5
-rw-r--r--  Documentation/memory-hotplug.txt | 15
-rw-r--r--  Documentation/sysctl/kernel.txt | 17
-rw-r--r--  Documentation/sysctl/vm.txt | 3
-rw-r--r--  MAINTAINERS | 8
-rw-r--r--  arch/ia64/include/uapi/asm/fcntl.h | 1
-rw-r--r--  arch/sparc/include/asm/irq_64.h | 2
-rw-r--r--  arch/sparc/kernel/process_64.c | 18
-rw-r--r--  arch/x86/include/asm/irq.h | 2
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 18
-rw-r--r--  drivers/base/dma-contiguous.c | 12
-rw-r--r--  drivers/memstick/host/rtsx_pci_ms.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 57
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 13
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 18
-rw-r--r--  fs/ocfs2/namei.c | 145
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 8
-rw-r--r--  fs/ocfs2/super.c | 8
-rw-r--r--  include/linux/nmi.h | 12
-rw-r--r--  include/linux/page-flags.h | 3
-rw-r--r--  kernel/kexec.c | 1
-rw-r--r--  kernel/smp.c | 57
-rw-r--r--  kernel/sysctl.c | 14
-rw-r--r--  kernel/watchdog.c | 41
-rw-r--r--  lib/Kconfig.debug | 4
-rw-r--r--  mm/huge_memory.c | 57
-rw-r--r--  mm/hugetlb.c | 71
-rw-r--r--  mm/ksm.c | 1
-rw-r--r--  mm/mempolicy.c | 46
-rw-r--r--  mm/migrate.c | 2
-rw-r--r--  mm/nommu.c | 2
-rw-r--r--  mm/page_alloc.c | 40
-rw-r--r--  mm/rmap.c | 12
-rw-r--r--  mm/shmem.c | 59
-rw-r--r--  mm/slab.c | 90
-rwxr-xr-x  scripts/checkpatch.pl | 15
41 files changed, 703 insertions, 189 deletions
diff --git a/CREDITS b/CREDITS
index c322dcfb926d..28ee1514b9de 100644
--- a/CREDITS
+++ b/CREDITS
@@ -9,6 +9,10 @@
 Linus
 ----------
 
+M: Matt Mackal
+E: mpm@selenic.com
+D: SLOB slab allocator
+
 N: Matti Aarnio
 E: mea@nic.funet.fi
 D: Alpha systems hacking, IPv6 and other network related stuff
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index c6a06b71594d..f40578026a04 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -314,6 +314,7 @@ int main(int argc, char *argv[])
 break;
 case 'm':
 strncpy(cpumask, optarg, sizeof(cpumask));
+cpumask[sizeof(cpumask) - 1] = '\0';
 maskset = 1;
 printf("cpumask %s maskset %d\n", cpumask, maskset);
 break;
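
The one-line fix above follows the usual idiom: strncpy() leaves the destination unterminated when the source fills it, so the last byte is forced to '\0'. A minimal standalone sketch of the same idiom (the helper name is illustrative, not part of the patch):

#include <string.h>

/* Illustrative only: copy a string into a fixed-size buffer and force NUL
 * termination, since strncpy() does not terminate when src is at least
 * dst_len bytes long. */
static void copy_bounded(char *dst, size_t dst_len, const char *src)
{
	strncpy(dst, src, dst_len);
	dst[dst_len - 1] = '\0';
}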
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 884904975d0b..c1b9aa8c5a52 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3130,6 +3130,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 [KNL] Should the soft-lockup detector generate panics.
 Format: <integer>
 
+softlockup_all_cpu_backtrace=
+[KNL] Should the soft-lockup detector generate
+backtraces on all cpus.
+Format: <integer>
+
 sonypi.*= [HW] Sony Programmable I/O Control Device driver
 See Documentation/laptops/sonypi.txt
 
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index f304edb8fbe7..45134dc23854 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -209,15 +209,12 @@ If memory device is found, memory hotplug code will be called.
 
 4.2 Notify memory hot-add event by hand
 ------------
-On powerpc, the firmware does not notify a memory hotplug event to the kernel.
-Therefore, "probe" interface is supported to notify the event to the kernel.
-This interface depends on CONFIG_ARCH_MEMORY_PROBE.
-
-CONFIG_ARCH_MEMORY_PROBE is supported on powerpc only. On x86, this config
-option is disabled by default since ACPI notifies a memory hotplug event to
-the kernel, which performs its hotplug operation as the result. Please
-enable this option if you need the "probe" interface for testing purposes
-on x86.
+On some architectures, the firmware may not notify the kernel of a memory
+hotplug event. Therefore, the memory "probe" interface is supported to
+explicitly notify the kernel. This interface depends on
+CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86
+if hotplug is supported, although for x86 this should be handled by ACPI
+notification.
 
 Probe interface is located at
 /sys/devices/system/memory/probe
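
For reference, the probe file documented above takes the physical start address of the memory being hot-added; a hedged userspace sketch (the address below is a placeholder, a real one must be section-aligned and match the platform's memory map):

#include <stdio.h>

/* Sketch only: notify the kernel of hot-added memory through the probe
 * interface. Requires CONFIG_ARCH_MEMORY_PROBE and root privileges. */
int main(void)
{
	FILE *f = fopen("/sys/devices/system/memory/probe", "w");

	if (!f) {
		perror("probe");
		return 1;
	}
	fprintf(f, "0x100000000\n");	/* placeholder physical address */
	return fclose(f) ? 1 : 0;
}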
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 708bb7f1b7e0..c14374e71775 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -75,6 +75,7 @@ show up in /proc/sys/kernel:
 - shmall
 - shmmax [ sysv ipc ]
 - shmmni
+- softlockup_all_cpu_backtrace
 - stop-a [ SPARC only ]
 - sysrq ==> Documentation/sysrq.txt
 - sysctl_writes_strict
@@ -783,6 +784,22 @@ via the /proc/sys interface:
 
 ==============================================================
 
+softlockup_all_cpu_backtrace:
+
+This value controls the soft lockup detector thread's behavior
+when a soft lockup condition is detected as to whether or not
+to gather further debug information. If enabled, each cpu will
+be issued an NMI and instructed to capture stack trace.
+
+This feature is only applicable for architectures which support
+NMI.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
+
+==============================================================
+
 tainted:
 
 Non-zero if the kernel has been tainted. Numeric values, which
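
The knob described above can be flipped at runtime through procfs (or set at boot with the softlockup_all_cpu_backtrace= parameter added earlier in this series); a minimal sketch, assuming the running kernel carries this patch so the file exists:

#include <stdio.h>

/* Sketch only: enable all-CPU backtraces on soft lockup detection. */
int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/softlockup_all_cpu_backtrace", "w");

	if (!f) {
		perror("softlockup_all_cpu_backtrace");
		return 1;
	}
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}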
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index bd4b34c03738..4415aa915681 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
 set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
 
 The initial value is zero. Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.
+the high water marks for each per cpu page list. If the user writes '0' to this
+sysctl, it will revert to this default behavior.
 
 ==============================================================
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 3f2e171047b9..3cc94fff780f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8196,13 +8196,15 @@ S: Maintained
 F: drivers/usb/misc/sisusbvga/
 
 SLAB ALLOCATOR
-M: Christoph Lameter <cl@linux-foundation.org>
+M: Christoph Lameter <cl@linux.com>
 M: Pekka Enberg <penberg@kernel.org>
-M: Matt Mackall <mpm@selenic.com>
+M: David Rientjes <rientjes@google.com>
+M: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+M: Andrew Morton <akpm@linux-foundation.org>
 L: linux-mm@kvack.org
 S: Maintained
 F: include/linux/sl?b*.h
-F: mm/sl?b.c
+F: mm/sl?b*
 
 SLEEPABLE READ-COPY UPDATE (SRCU)
 M: Lai Jiangshan <laijs@cn.fujitsu.com>
diff --git a/arch/ia64/include/uapi/asm/fcntl.h b/arch/ia64/include/uapi/asm/fcntl.h
index 1dd275dc8f65..7b485876cad4 100644
--- a/arch/ia64/include/uapi/asm/fcntl.h
+++ b/arch/ia64/include/uapi/asm/fcntl.h
@@ -8,6 +8,7 @@
 #define force_o_largefile() \
 (personality(current->personality) != PER_LINUX32)
 
+#include <linux/personality.h>
 #include <asm-generic/fcntl.h>
 
 #endif /* _ASM_IA64_FCNTL_H */
diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 375cffcf7dbd..91d219381306 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -89,7 +89,7 @@ static inline unsigned long get_softint(void)
 return retval;
 }
 
-void arch_trigger_all_cpu_backtrace(void);
+void arch_trigger_all_cpu_backtrace(bool);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 
 extern void *hardirq_stack[NR_CPUS];
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index b2988f25e230..027e09986194 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp)
 }
 }
 
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
 {
 struct thread_info *tp = current_thread_info();
 struct pt_regs *regs = get_irq_regs();
@@ -251,16 +251,22 @@ void arch_trigger_all_cpu_backtrace(void)
 
 spin_lock_irqsave(&global_cpu_snapshot_lock, flags);
 
-memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot));
-
 this_cpu = raw_smp_processor_id();
 
-__global_reg_self(tp, regs, this_cpu);
+memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot));
+
+if (include_self)
+__global_reg_self(tp, regs, this_cpu);
 
 smp_fetch_global_regs();
 
 for_each_online_cpu(cpu) {
-struct global_reg_snapshot *gp = &global_cpu_snapshot[cpu].reg;
+struct global_reg_snapshot *gp;
+
+if (!include_self && cpu == this_cpu)
+continue;
+
+gp = &global_cpu_snapshot[cpu].reg;
 
 __global_reg_poll(gp);
 
@@ -292,7 +298,7 @@ void arch_trigger_all_cpu_backtrace(void)
 
 static void sysrq_handle_globreg(int key)
 {
-arch_trigger_all_cpu_backtrace();
+arch_trigger_all_cpu_backtrace(true);
 }
 
 static struct sysrq_key_op sparc_globalreg_op = {
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index cb6cfcd034cf..a80cbb88ea91 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
 extern void init_ISA_irqs(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(void);
+void arch_trigger_all_cpu_backtrace(bool);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 #endif
 
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index c3fcb5de5083..6a1e71bde323 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 /* "in progress" flag of arch_trigger_all_cpu_backtrace */
 static unsigned long backtrace_flag;
 
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
 {
 int i;
+int cpu = get_cpu();
 
-if (test_and_set_bit(0, &backtrace_flag))
+if (test_and_set_bit(0, &backtrace_flag)) {
 /*
  * If there is already a trigger_all_cpu_backtrace() in progress
  * (backtrace_flag == 1), don't output double cpu dump infos.
  */
+put_cpu();
 return;
+}
 
 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+if (!include_self)
+cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 
-printk(KERN_INFO "sending NMI to all CPUs:\n");
-apic->send_IPI_all(NMI_VECTOR);
+if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+pr_info("sending NMI to %s CPUs:\n",
+(include_self ? "all" : "other"));
+apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+}
 
 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
 for (i = 0; i < 10 * 1000; i++) {
 if (cpumask_empty(to_cpumask(backtrace_mask)))
 break;
 mdelay(1);
+touch_softlockup_watchdog();
 }
 
 clear_bit(0, &backtrace_flag);
 smp_mb__after_atomic();
+put_cpu();
 }
 
 static int
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index 83969f8c5727..6467c919c509 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -176,14 +176,24 @@ static int __init cma_activate_area(struct cma *cma)
 base_pfn = pfn;
 for (j = pageblock_nr_pages; j; --j, pfn++) {
 WARN_ON_ONCE(!pfn_valid(pfn));
+/*
+ * alloc_contig_range requires the pfn range
+ * specified to be in the same zone. Make this
+ * simple by forcing the entire CMA resv range
+ * to be in the same zone.
+ */
 if (page_zone(pfn_to_page(pfn)) != zone)
-return -EINVAL;
+goto err;
 }
 init_cma_reserved_pageblock(pfn_to_page(base_pfn));
 } while (--i);
 
 mutex_init(&cma->lock);
 return 0;
+
+err:
+kfree(cma->bitmap);
+return -EINVAL;
 }
 
 static struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c
index 2a635b6fdaf7..c880ba685754 100644
--- a/drivers/memstick/host/rtsx_pci_ms.c
+++ b/drivers/memstick/host/rtsx_pci_ms.c
@@ -601,6 +601,7 @@ static int rtsx_pci_ms_drv_remove(struct platform_device *pdev)
 pcr->slots[RTSX_MS_CARD].card_event = NULL;
 msh = host->msh;
 host->eject = true;
+cancel_work_sync(&host->handle_req);
 
 mutex_lock(&host->host_mutex);
 if (host->req) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index a106b3f2b22a..fae17c640df3 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -331,6 +331,7 @@ struct dlm_lock_resource
 u16 state;
 char lvb[DLM_LVB_LEN];
 unsigned int inflight_locks;
+unsigned int inflight_assert_workers;
 unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 };
 
@@ -910,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 struct dlm_lock_resource *res);
 
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res);
+
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3087a21d32f9..82abf0cc9a12 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 atomic_set(&res->asts_reserved, 0);
 res->migration_pending = 0;
 res->inflight_locks = 0;
+res->inflight_assert_workers = 0;
 
 res->dlm = dlm;
 
@@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 wake_up(&res->wq);
 }
 
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res)
+{
+assert_spin_locked(&res->spinlock);
+res->inflight_assert_workers++;
+mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+dlm->name, res->lockname.len, res->lockname.name,
+res->inflight_assert_workers);
+}
+
+static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res)
+{
+spin_lock(&res->spinlock);
+__dlm_lockres_grab_inflight_worker(dlm, res);
+spin_unlock(&res->spinlock);
+}
+
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res)
+{
+assert_spin_locked(&res->spinlock);
+BUG_ON(res->inflight_assert_workers == 0);
+res->inflight_assert_workers--;
+mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+dlm->name, res->lockname.len, res->lockname.name,
+res->inflight_assert_workers);
+}
+
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res)
+{
+spin_lock(&res->spinlock);
+__dlm_lockres_drop_inflight_worker(dlm, res);
+spin_unlock(&res->spinlock);
+}
+
 /*
  * lookup a lock resource by name.
  * may already exist in the hashtable.
@@ -1603,7 +1641,8 @@ send_response:
 mlog(ML_ERROR, "failed to dispatch assert master work\n");
 response = DLM_MASTER_RESP_ERROR;
 dlm_lockres_put(res);
-}
+} else
+dlm_lockres_grab_inflight_worker(dlm, res);
 } else {
 if (res)
 dlm_lockres_put(res);
@@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
 dlm_lockres_release_ast(dlm, res);
 
 put:
+dlm_lockres_drop_inflight_worker(dlm, res);
+
 dlm_lockres_put(res);
 
 mlog(0, "finished with dlm_assert_master_worker\n");
@@ -3088,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 /* remove it so that only one mle will be found */
 __dlm_unlink_mle(dlm, tmp);
 __dlm_mle_detach_hb_events(dlm, tmp);
-ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
-mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
-"telling master to get ref for cleared out mle "
-"during migration\n", dlm->name, namelen, name,
-master, new_master);
+if (tmp->type == DLM_MLE_MASTER) {
+ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+"telling master to get ref "
+"for cleared out mle during "
+"migration\n", dlm->name,
+namelen, name, master,
+new_master);
+}
 }
 spin_unlock(&tmp->spinlock);
 }
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 5de019437ea5..45067faf5695 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 mlog_errno(-ENOMEM);
 /* retry!? */
 BUG();
-}
+} else
+__dlm_lockres_grab_inflight_worker(dlm, res);
 } else /* put.. incase we are not the master */
 dlm_lockres_put(res);
 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 9db869de829d..69aac6f088ad 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
  * refs on it. */
 unused = __dlm_lockres_unused(lockres);
 if (!unused ||
-(lockres->state & DLM_LOCK_RES_MIGRATING)) {
+(lockres->state & DLM_LOCK_RES_MIGRATING) ||
+(lockres->inflight_assert_workers != 0)) {
 mlog(0, "%s: res %.*s is in use or being remastered, "
-"used %d, state %d\n", dlm->name,
-lockres->lockname.len, lockres->lockname.name,
-!unused, lockres->state);
-list_move_tail(&dlm->purge_list, &lockres->purge);
+"used %d, state %d, assert master workers %u\n",
+dlm->name, lockres->lockname.len,
+lockres->lockname.name,
+!unused, lockres->state,
+lockres->inflight_assert_workers);
+list_move_tail(&lockres->purge, &dlm->purge_list);
 spin_unlock(&lockres->spinlock);
 continue;
 }
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 5698b52cf5c9..2e3c9dbab68c 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
 } else if (status == DLM_RECOVERING ||
 status == DLM_MIGRATING ||
-status == DLM_FORWARD) {
+status == DLM_FORWARD ||
+status == DLM_NOLOCKMGR
+) {
 /* must clear the actions because this unlock
  * is about to be retried. cannot free or do
  * any list manipulation. */
@@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 res->lockname.name,
 status==DLM_RECOVERING?"recovering":
 (status==DLM_MIGRATING?"migrating":
-"forward"));
+(status == DLM_FORWARD ? "forward" :
+"nolockmanager")));
 actions = 0;
 }
 if (flags & LKM_CANCEL)
@@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
  * updated state to the recovery master. this thread
  * just needs to finish out the operation and call
  * the unlockast. */
-ret = DLM_NORMAL;
+if (dlm_is_node_dead(dlm, owner))
+ret = DLM_NORMAL;
+else
+ret = DLM_NOLOCKMGR;
 } else {
 /* something bad. this will BUG in ocfs2 */
 ret = dlm_err_to_dlm_status(tmpret);
@@ -638,7 +644,9 @@ retry:
 
 if (status == DLM_RECOVERING ||
 status == DLM_MIGRATING ||
-status == DLM_FORWARD) {
+status == DLM_FORWARD ||
+status == DLM_NOLOCKMGR) {
+
 /* We want to go away for a tiny bit to allow recovery
  * / migration to complete on this resource. I don't
  * know of any wait queue we could sleep on as this
@@ -650,7 +658,7 @@ retry:
 msleep(50);
 
 mlog(0, "retrying unlock due to pending recovery/"
-"migration/in-progress\n");
+"migration/in-progress/reconnect\n");
 goto retry;
 }
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2060fc398445..8add6f1030d7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
 return inode;
 }
 
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+struct dentry *dentry, struct inode *inode)
+{
+struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+ocfs2_lock_res_free(&dl->dl_lockres);
+BUG_ON(dl->dl_count != 1);
+spin_lock(&dentry_attach_lock);
+dentry->d_fsdata = NULL;
+spin_unlock(&dentry_attach_lock);
+kfree(dl);
+iput(inode);
+}
+
 static int ocfs2_mknod(struct inode *dir,
 struct dentry *dentry,
 umode_t mode,
@@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir,
 sigset_t oldset;
 int did_block_signals = 0;
 struct posix_acl *default_acl = NULL, *acl = NULL;
+struct ocfs2_dentry_lock *dl = NULL;
 
 trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
 (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir,
 goto leave;
 }
 
+dl = dentry->d_fsdata;
+
 status = ocfs2_add_entry(handle, dentry, inode,
 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
 &lookup);
@@ -469,6 +487,9 @@ leave:
  * ocfs2_delete_inode will mutex_lock again.
  */
 if ((status < 0) && inode) {
+if (dl)
+ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
 clear_nlink(inode);
 iput(inode);
@@ -991,6 +1012,65 @@ leave:
 return status;
 }
 
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+u64 src_inode_no, u64 dest_inode_no)
+{
+int ret = 0, i = 0;
+u64 parent_inode_no = 0;
+u64 child_inode_no = src_inode_no;
+struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+while (1) {
+child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+if (IS_ERR(child_inode)) {
+ret = PTR_ERR(child_inode);
+break;
+}
+
+ret = ocfs2_inode_lock(child_inode, NULL, 0);
+if (ret < 0) {
+iput(child_inode);
+if (ret != -ENOENT)
+mlog_errno(ret);
+break;
+}
+
+ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+&parent_inode_no);
+ocfs2_inode_unlock(child_inode, 0);
+iput(child_inode);
+if (ret < 0) {
+ret = -ENOENT;
+break;
+}
+
+if (parent_inode_no == dest_inode_no) {
+ret = 1;
+break;
+}
+
+if (parent_inode_no == osb->root_inode->i_ino) {
+ret = 0;
+break;
+}
+
+child_inode_no = parent_inode_no;
+
+if (++i >= MAX_LOOKUP_TIMES) {
+mlog(ML_NOTICE, "max lookup times reached, filesystem "
+"may have nested directories, "
+"src inode: %llu, dest inode: %llu.\n",
+(unsigned long long)src_inode_no,
+(unsigned long long)dest_inode_no);
+ret = 0;
+break;
+}
+}
+
+return ret;
+}
+
 /*
  * The only place this should be used is rename!
  * if they have the same id, then the 1st one is the only one locked.
@@ -1002,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 struct inode *inode2)
 {
 int status;
+int inode1_is_ancestor, inode2_is_ancestor;
 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
 struct buffer_head **tmpbh;
@@ -1015,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 if (*bh2)
 *bh2 = NULL;
 
-/* we always want to lock the one with the lower lockid first. */
+/* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
 if (oi1->ip_blkno != oi2->ip_blkno) {
-if (oi1->ip_blkno < oi2->ip_blkno) {
+inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+oi1->ip_blkno);
+if (inode1_is_ancestor < 0) {
+status = inode1_is_ancestor;
+goto bail;
+}
+
+inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+oi2->ip_blkno);
+if (inode2_is_ancestor < 0) {
+status = inode2_is_ancestor;
+goto bail;
+}
+
+if ((inode1_is_ancestor == 1) ||
+(oi1->ip_blkno < oi2->ip_blkno &&
+inode2_is_ancestor == 0)) {
 /* switch id1 and id2 around */
 tmpbh = bh2;
 bh2 = bh1;
@@ -1098,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir,
 struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 struct ocfs2_dir_lookup_result target_insert = { NULL, };
+bool should_add_orphan = false;
 
 /* At some point it might be nice to break this function up a
  * bit. */
@@ -1134,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir,
 goto bail;
 }
 rename_lock = 1;
+
+/* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+old_inode->i_ino);
+if (status < 0) {
+mlog_errno(status);
+goto bail;
+} else if (status == 1) {
+status = -EPERM;
+trace_ocfs2_rename_not_permitted(
+(unsigned long long)old_inode->i_ino,
+(unsigned long long)new_dir->i_ino);
+goto bail;
+}
 }
 
 /* if old and new are the same, this'll just do one lock. */
@@ -1304,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
 mlog_errno(status);
 goto bail;
 }
+should_add_orphan = true;
 }
 } else {
 BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1348,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir,
 goto bail;
 }
 
-if (S_ISDIR(new_inode->i_mode) ||
-(ocfs2_read_links_count(newfe) == 1)) {
-status = ocfs2_orphan_add(osb, handle, new_inode,
-newfe_bh, orphan_name,
-&orphan_insert, orphan_dir);
-if (status < 0) {
-mlog_errno(status);
-goto bail;
-}
-}
-
 /* change the dirent to point to the correct inode */
 status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
 old_inode);
@@ -1373,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir,
 else
 ocfs2_add_links_count(newfe, -1);
 ocfs2_journal_dirty(handle, newfe_bh);
+if (should_add_orphan) {
+status = ocfs2_orphan_add(osb, handle, new_inode,
+newfe_bh, orphan_name,
+&orphan_insert, orphan_dir);
+if (status < 0) {
+mlog_errno(status);
+goto bail;
+}
+}
 } else {
 /* if the name was not found in new_dir, add it now */
 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1642,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir,
 struct ocfs2_dir_lookup_result lookup = { NULL, };
 sigset_t oldset;
 int did_block_signals = 0;
+struct ocfs2_dentry_lock *dl = NULL;
 
 trace_ocfs2_symlink_begin(dir, dentry, symname,
 dentry->d_name.len, dentry->d_name.name);
@@ -1830,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir,
 goto bail;
 }
 
+dl = dentry->d_fsdata;
+
 status = ocfs2_add_entry(handle, dentry, inode,
 le64_to_cpu(fe->i_blkno), parent_fe_bh,
 &lookup);
@@ -1864,6 +1980,9 @@ bail:
 if (xattr_ac)
 ocfs2_free_alloc_context(xattr_ac);
 if ((status < 0) && inode) {
+if (dl)
+ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
 clear_nlink(inode);
 iput(inode);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 1b60c62aa9d6..6cb019b7c6a8 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename,
 __entry->new_len, __get_str(new_name))
 );
 
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
+
 TRACE_EVENT(ocfs2_rename_target_exists,
 TP_PROTO(int new_len, const char *new_name),
 TP_ARGS(new_len, new_name),
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 714e53b9cc66..636aab69ead5 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4288,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 goto out;
 }
 
+error = ocfs2_rw_lock(inode, 1);
+if (error) {
+mlog_errno(error);
+goto out;
+}
+
 error = ocfs2_inode_lock(inode, &old_bh, 1);
 if (error) {
 mlog_errno(error);
+ocfs2_rw_unlock(inode, 1);
 goto out;
 }
 
@@ -4302,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 up_write(&OCFS2_I(inode)->ip_xattr_sem);
 
 ocfs2_inode_unlock(inode, 1);
+ocfs2_rw_unlock(inode, 1);
 brelse(old_bh);
 
 if (error) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c7a89cea5c5d..ddb662b32447 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 ocfs2_shutdown_local_alloc(osb);
 
+ocfs2_truncate_log_shutdown(osb);
+
 /* This will disable recovery and flush any recovery work. */
 ocfs2_recovery_exit(osb);
 
-/*
- * During dismount, when it recovers another node it will call
- * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq.
- */
-ocfs2_truncate_log_shutdown(osb);
-
 ocfs2_journal_shutdown(osb);
 
 ocfs2_sync_blockdev(sb);
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 6a45fb583ff1..447775ee2c4b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -32,15 +32,24 @@ static inline void touch_nmi_watchdog(void)
 #ifdef arch_trigger_all_cpu_backtrace
 static inline bool trigger_all_cpu_backtrace(void)
 {
-arch_trigger_all_cpu_backtrace();
+arch_trigger_all_cpu_backtrace(true);
 
 return true;
 }
+static inline bool trigger_allbutself_cpu_backtrace(void)
+{
+arch_trigger_all_cpu_backtrace(false);
+return true;
+}
 #else
 static inline bool trigger_all_cpu_backtrace(void)
 {
 return false;
 }
+static inline bool trigger_allbutself_cpu_backtrace(void)
+{
+return false;
+}
 #endif
 
 #ifdef CONFIG_LOCKUP_DETECTOR
@@ -48,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
+extern int sysctl_softlockup_all_cpu_backtrace;
 struct ctl_table;
 extern int proc_dowatchdog(struct ctl_table *, int ,
 void __user *, size_t *, loff_t *);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3c545b48aeab..8304959ad336 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -360,6 +360,9 @@ static inline void ClearPageCompound(struct page *page)
 ClearPageHead(page);
 }
 #endif
+
+#define PG_head_mask ((1L << PG_head))
+
 #else
 /*
  * Reduce page flag use as much as possible by overlapping
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6748688813d0..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 #ifdef CONFIG_MEMORY_FAILURE
 VMCOREINFO_NUMBER(PG_hwpoison);
 #endif
+VMCOREINFO_NUMBER(PG_head_mask);
 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
 
 arch_crash_save_vmcoreinfo();
diff --git a/kernel/smp.c b/kernel/smp.c
index 306f8180b0d5..80c33f8de14f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
+static void flush_smp_call_function_queue(bool warn_cpu_offline);
+
 static int
 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 #ifdef CONFIG_HOTPLUG_CPU
 case CPU_UP_CANCELED:
 case CPU_UP_CANCELED_FROZEN:
+/* Fall-through to the CPU_DEAD[_FROZEN] case. */
 
 case CPU_DEAD:
 case CPU_DEAD_FROZEN:
 free_cpumask_var(cfd->cpumask);
 free_percpu(cfd->csd);
 break;
+
+case CPU_DYING:
+case CPU_DYING_FROZEN:
+/*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+flush_smp_call_function_queue(false);
+break;
 #endif
 };
 
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
 return 0;
 }
 
-/*
- * Invoked by arch to handle an IPI for call function single. Must be
- * called from the arch with interrupts disabled.
+/**
+ * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
+ *
+ * Invoked by arch to handle an IPI for call function single.
+ * Must be called with interrupts disabled.
  */
 void generic_smp_call_function_single_interrupt(void)
 {
+flush_smp_call_function_queue(true);
+}
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *
+ * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
+ * offline CPU. Skip this check if set to 'false'.
+ *
+ * Flush any pending smp-call-function callbacks queued on this CPU. This is
+ * invoked by the generic IPI handler, as well as by a CPU about to go offline,
+ * to ensure that all pending IPI callbacks are run before it goes completely
+ * offline.
+ *
+ * Loop through the call_single_queue and run all the queued callbacks.
+ * Must be called with interrupts disabled.
+ */
+static void flush_smp_call_function_queue(bool warn_cpu_offline)
+{
+struct llist_head *head;
 struct llist_node *entry;
 struct call_single_data *csd, *csd_next;
 static bool warned;
 
-entry = llist_del_all(&__get_cpu_var(call_single_queue));
+WARN_ON(!irqs_disabled());
+
+head = &__get_cpu_var(call_single_queue);
+entry = llist_del_all(head);
 entry = llist_reverse_order(entry);
 
-/*
- * Shouldn't receive this interrupt on a cpu that is not yet online.
- */
-if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
+/* There shouldn't be any pending callbacks on an offline CPU. */
+if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
+!warned && !llist_empty(head))) {
 warned = true;
 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7de6555cfea0..75b22e22a72c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
@@ -861,6 +860,17 @@ static struct ctl_table kern_table[] = {
 .extra1 = &zero,
 .extra2 = &one,
 },
+#ifdef CONFIG_SMP
+{
+.procname = "softlockup_all_cpu_backtrace",
+.data = &sysctl_softlockup_all_cpu_backtrace,
+.maxlen = sizeof(int),
+.mode = 0644,
+.proc_handler = proc_dointvec_minmax,
+.extra1 = &zero,
+.extra2 = &one,
+},
+#endif /* CONFIG_SMP */
 {
 .procname = "nmi_watchdog",
 .data = &watchdog_user_enabled,
@@ -1317,7 +1327,7 @@ static struct ctl_table vm_table[] = {
 .maxlen = sizeof(percpu_pagelist_fraction),
 .mode = 0644,
 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
-.extra1 = &min_percpu_pagelist_fract,
+.extra1 = &zero,
 },
 #ifdef CONFIG_MMU
 {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..c3319bd1b040 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
 
 int watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#endif
+
 static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
 
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
+static unsigned long soft_lockup_nmi_warn;
 
 /* boot commands */
 /*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
 }
 __setup("nosoftlockup", nosoftlockup_setup);
 /* */
+#ifdef CONFIG_SMP
+static int __init softlockup_all_cpu_backtrace_setup(char *str)
+{
+sysctl_softlockup_all_cpu_backtrace =
+!!simple_strtol(str, NULL, 0);
+return 1;
+}
+__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#endif
 
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
 struct pt_regs *regs = get_irq_regs();
 int duration;
+int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
 /* kick the hardlockup detector */
 watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 if (__this_cpu_read(soft_watchdog_warn) == true)
 return HRTIMER_RESTART;
 
+if (softlockup_all_cpu_backtrace) {
+/* Prevent multiple soft-lockup reports if one cpu is already
+ * engaged in dumping cpu back traces
+ */
+if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
+/* Someone else will report us. Let's give up */
+__this_cpu_write(soft_watchdog_warn, true);
+return HRTIMER_RESTART;
+}
+}
+
 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 smp_processor_id(), duration,
 current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 else
 dump_stack();
 
+if (softlockup_all_cpu_backtrace) {
+/* Avoid generating two back traces for current
+ * given that one is already made above
+ */
+trigger_allbutself_cpu_backtrace();
+
+clear_bit(0, &soft_lockup_nmi_warn);
+/* Barrier to sync with other cpus */
+smp_mb__after_atomic();
+}
+
 if (softlockup_panic)
 panic("softlockup: hung tasks");
 __this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
 int cpu;
 
 get_online_cpus();
-preempt_disable();
 for_each_online_cpu(cpu)
 update_timers(cpu);
-preempt_enable();
 put_online_cpus();
 }
 
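
The soft_lockup_nmi_warn handling above elects a single reporting CPU with test_and_set_bit() and releases it with clear_bit() once the backtraces are out; a reduced userspace illustration of that election pattern (names are illustrative, using C11 atomics instead of the kernel's bitops):

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag reporting = ATOMIC_FLAG_INIT;

/* Only the first caller that wins the flag does the expensive reporting;
 * later callers back off until the flag is cleared. */
static void report_once(int cpu)
{
	if (atomic_flag_test_and_set(&reporting)) {
		/* someone else is already dumping backtraces */
		return;
	}
	printf("cpu %d: dumping backtraces\n", cpu);
	atomic_flag_clear(&reporting);
}

int main(void)
{
	report_once(0);
	report_once(1);
	return 0;
}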
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7cfcc1b8e101..7a638aa3545b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -930,7 +930,7 @@ config LOCKDEP
 bool
 depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
 select STACKTRACE
-select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC
+select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE
 select KALLSYMS
 select KALLSYMS_ALL
 
@@ -1408,7 +1408,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
 depends on !X86_64
 select STACKTRACE
-select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
+select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE
 help
 Provide stacktrace filter for fault-injection capabilities
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e60837dc785c..33514d88fef9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -941,6 +941,37 @@ unlock:
 spin_unlock(ptl);
 }
 
+/*
+ * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
+ * during copy_user_huge_page()'s copy_page_rep(): in the case when
+ * the source page gets split and a tail freed before copy completes.
+ * Called under pmd_lock of checked pmd, so safe from splitting itself.
+ */
+static void get_user_huge_page(struct page *page)
+{
+if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+struct page *endpage = page + HPAGE_PMD_NR;
+
+atomic_add(HPAGE_PMD_NR, &page->_count);
+while (++page < endpage)
+get_huge_page_tail(page);
+} else {
+get_page(page);
+}
+}
+
+static void put_user_huge_page(struct page *page)
+{
+if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+struct page *endpage = page + HPAGE_PMD_NR;
+
+while (page < endpage)
+put_page(page++);
+} else {
+put_page(page);
+}
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 struct vm_area_struct *vma,
 unsigned long address,
@@ -1074,7 +1105,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 ret |= VM_FAULT_WRITE;
 goto out_unlock;
 }
-get_page(page);
+get_user_huge_page(page);
 spin_unlock(ptl);
 alloc:
 if (transparent_hugepage_enabled(vma) &&
@@ -1095,7 +1126,7 @@ alloc:
 split_huge_page(page);
 ret |= VM_FAULT_FALLBACK;
 }
-put_page(page);
+put_user_huge_page(page);
 }
 count_vm_event(THP_FAULT_FALLBACK);
 goto out;
@@ -1105,7 +1136,7 @@ alloc:
 put_page(new_page);
 if (page) {
 split_huge_page(page);
-put_page(page);
+put_user_huge_page(page);
 } else
 split_huge_page_pmd(vma, address, pmd);
 ret |= VM_FAULT_FALLBACK;
@@ -1127,7 +1158,7 @@ alloc:
 
 spin_lock(ptl);
 if (page)
-put_page(page);
+put_user_huge_page(page);
 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 spin_unlock(ptl);
 mem_cgroup_uncharge_page(new_page);
@@ -2392,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 pmd = mm_find_pmd(mm, address);
 if (!pmd)
 goto out;
-if (pmd_trans_huge(*pmd))
-goto out;
 
 anon_vma_lock_write(vma->anon_vma);
 
@@ -2492,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 pmd = mm_find_pmd(mm, address);
 if (!pmd)
 goto out;
-if (pmd_trans_huge(*pmd))
-goto out;
 
 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2846,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2846static void split_huge_page_address(struct mm_struct *mm, 2873static void split_huge_page_address(struct mm_struct *mm,
2847 unsigned long address) 2874 unsigned long address)
2848{ 2875{
2876 pgd_t *pgd;
2877 pud_t *pud;
2849 pmd_t *pmd; 2878 pmd_t *pmd;
2850 2879
2851 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2880 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2852 2881
2853 pmd = mm_find_pmd(mm, address); 2882 pgd = pgd_offset(mm, address);
2854 if (!pmd) 2883 if (!pgd_present(*pgd))
2884 return;
2885
2886 pud = pud_offset(pgd, address);
2887 if (!pud_present(*pud))
2888 return;
2889
2890 pmd = pmd_offset(pud, address);
2891 if (!pmd_present(*pmd))
2855 return; 2892 return;
2856 /* 2893 /*
2857 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2894 * Caller holds the mmap_sem write mode, so a huge pmd cannot
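split_huge_page_address() above now open-codes the pgd -> pud -> pmd walk, returning early at the first level that is not present, instead of calling mm_find_pmd() (which, per the mm/rmap.c hunk further down, now refuses to return a transparent-huge pmd that this function still needs to see). A toy sketch of that walk-and-bail shape, using a hypothetical three-level lookup table in userspace rather than real page tables:

#include <stdio.h>

/* Toy three-level translation structure; the names echo pgd/pud/pmd but
 * none of this is kernel code, only an illustration of the lookup shape. */
#define ENTRIES 8

struct toy_pmd { long value[ENTRIES]; int present[ENTRIES]; };
struct toy_pud { struct toy_pmd *pmd[ENTRIES]; };
struct toy_pgd { struct toy_pud *pud[ENTRIES]; };

/* Return a pointer to the bottom-level slot for addr, or NULL if any
 * intermediate level is unpopulated -- the same early-return pattern the
 * patched split_huge_page_address() uses. */
static long *toy_find_slot(struct toy_pgd *pgd, unsigned int addr)
{
	unsigned int i = (addr >> 6) & (ENTRIES - 1);
	unsigned int j = (addr >> 3) & (ENTRIES - 1);
	unsigned int k = addr & (ENTRIES - 1);
	struct toy_pud *pud;
	struct toy_pmd *pmd;

	pud = pgd->pud[i];
	if (!pud)
		return NULL;

	pmd = pud->pmd[j];
	if (!pmd)
		return NULL;

	if (!pmd->present[k])
		return NULL;

	return &pmd->value[k];
}

int main(void)
{
	struct toy_pgd pgd = { { 0 } };
	struct toy_pud pud = { { 0 } };
	struct toy_pmd pmd = { { 0 }, { 0 } };

	pgd.pud[1] = &pud;
	pud.pmd[2] = &pmd;
	pmd.present[3] = 1;
	pmd.value[3] = 42;

	/* addr 0b001_010_011: level indices 1, 2, 3 */
	long *slot = toy_find_slot(&pgd, (1u << 6) | (2u << 3) | 3u);
	printf("%ld\n", slot ? *slot : -1L);

	/* unmapped address: every lookup level short-circuits to NULL */
	printf("%p\n", (void *)toy_find_slot(&pgd, 0));
	return 0;
}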
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 226910cb7c9b..2024bbd573d2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2520,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2520 update_mmu_cache(vma, address, ptep); 2520 update_mmu_cache(vma, address, ptep);
2521} 2521}
2522 2522
2523static int is_hugetlb_entry_migration(pte_t pte)
2524{
2525 swp_entry_t swp;
2526
2527 if (huge_pte_none(pte) || pte_present(pte))
2528 return 0;
2529 swp = pte_to_swp_entry(pte);
2530 if (non_swap_entry(swp) && is_migration_entry(swp))
2531 return 1;
2532 else
2533 return 0;
2534}
2535
2536static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2537{
2538 swp_entry_t swp;
2539
2540 if (huge_pte_none(pte) || pte_present(pte))
2541 return 0;
2542 swp = pte_to_swp_entry(pte);
2543 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2544 return 1;
2545 else
2546 return 0;
2547}
2523 2548
2524int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 2549int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2525 struct vm_area_struct *vma) 2550 struct vm_area_struct *vma)
@@ -2559,10 +2584,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2559 dst_ptl = huge_pte_lock(h, dst, dst_pte); 2584 dst_ptl = huge_pte_lock(h, dst, dst_pte);
2560 src_ptl = huge_pte_lockptr(h, src, src_pte); 2585 src_ptl = huge_pte_lockptr(h, src, src_pte);
2561 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 2586 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
2562 if (!huge_pte_none(huge_ptep_get(src_pte))) { 2587 entry = huge_ptep_get(src_pte);
2588 if (huge_pte_none(entry)) { /* skip none entry */
2589 ;
2590 } else if (unlikely(is_hugetlb_entry_migration(entry) ||
2591 is_hugetlb_entry_hwpoisoned(entry))) {
2592 swp_entry_t swp_entry = pte_to_swp_entry(entry);
2593
2594 if (is_write_migration_entry(swp_entry) && cow) {
2595 /*
2596 * COW mappings require pages in both
2597 * parent and child to be set to read.
2598 */
2599 make_migration_entry_read(&swp_entry);
2600 entry = swp_entry_to_pte(swp_entry);
2601 set_huge_pte_at(src, addr, src_pte, entry);
2602 }
2603 set_huge_pte_at(dst, addr, dst_pte, entry);
2604 } else {
2563 if (cow) 2605 if (cow)
2564 huge_ptep_set_wrprotect(src, addr, src_pte); 2606 huge_ptep_set_wrprotect(src, addr, src_pte);
2565 entry = huge_ptep_get(src_pte);
2566 ptepage = pte_page(entry); 2607 ptepage = pte_page(entry);
2567 get_page(ptepage); 2608 get_page(ptepage);
2568 page_dup_rmap(ptepage); 2609 page_dup_rmap(ptepage);
@@ -2578,32 +2619,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2578 return ret; 2619 return ret;
2579} 2620}
2580 2621
2581static int is_hugetlb_entry_migration(pte_t pte)
2582{
2583 swp_entry_t swp;
2584
2585 if (huge_pte_none(pte) || pte_present(pte))
2586 return 0;
2587 swp = pte_to_swp_entry(pte);
2588 if (non_swap_entry(swp) && is_migration_entry(swp))
2589 return 1;
2590 else
2591 return 0;
2592}
2593
2594static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2595{
2596 swp_entry_t swp;
2597
2598 if (huge_pte_none(pte) || pte_present(pte))
2599 return 0;
2600 swp = pte_to_swp_entry(pte);
2601 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2602 return 1;
2603 else
2604 return 0;
2605}
2606
2607void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 2622void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2608 unsigned long start, unsigned long end, 2623 unsigned long start, unsigned long end,
2609 struct page *ref_page) 2624 struct page *ref_page)
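The copy_hugetlb_page_range() rework distinguishes three cases per source entry: a none entry is skipped, a migration or hwpoison swap entry is copied as an entry (downgrading a write migration entry to read when the mapping is COW so both sides refault later), and a present page takes the old path of write-protecting under COW and sharing the page; the is_hugetlb_entry_migration()/is_hugetlb_entry_hwpoisoned() helpers are only moved up so the copy loop can use them. A toy userspace sketch of that classification, with a made-up tagged-struct encoding standing in for the kernel's pte and swap-entry bits:

#include <stdio.h>

/* Made-up entry encoding for illustration only: the kernel packs these
 * states into pte bits and swp_entry_t; here a tagged struct stands in. */
enum toy_kind { TOY_NONE, TOY_PRESENT, TOY_SWP };
enum toy_swp { SWP_MIGRATION_READ, SWP_MIGRATION_WRITE, SWP_HWPOISON };

struct toy_entry {
	enum toy_kind kind;
	enum toy_swp swp_type;    /* meaningful only when kind == TOY_SWP */
	long pfn;                 /* meaningful only when kind == TOY_PRESENT */
};

static int is_migration(struct toy_entry e)
{
	return e.kind == TOY_SWP &&
	       (e.swp_type == SWP_MIGRATION_READ ||
		e.swp_type == SWP_MIGRATION_WRITE);
}

static int is_hwpoisoned(struct toy_entry e)
{
	return e.kind == TOY_SWP && e.swp_type == SWP_HWPOISON;
}

/* Mirrors the decision structure of the new copy loop, not its locking. */
static void copy_one(struct toy_entry *dst, struct toy_entry *src, int cow)
{
	struct toy_entry entry = *src;

	if (entry.kind == TOY_NONE) {
		/* skip none entry */
	} else if (is_migration(entry) || is_hwpoisoned(entry)) {
		if (cow && entry.swp_type == SWP_MIGRATION_WRITE) {
			/* COW: both parent and child must refault after the
			 * migration finishes, so make the entry read-only */
			entry.swp_type = SWP_MIGRATION_READ;
			*src = entry;
		}
		*dst = entry;
	} else {
		/* present page: under COW the real code write-protects the
		 * parent (huge_ptep_set_wrprotect) before sharing the page */
		*dst = entry;
	}
}

int main(void)
{
	struct toy_entry src = { TOY_SWP, SWP_MIGRATION_WRITE, 0 };
	struct toy_entry dst = { TOY_NONE, SWP_MIGRATION_READ, 0 };

	copy_one(&dst, &src, 1);
	printf("src=%d dst=%d\n", src.swp_type, dst.swp_type);  /* both READ */
	return 0;
}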
diff --git a/mm/ksm.c b/mm/ksm.c
index 68710e80994a..346ddc9e4c0d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
945 pmd = mm_find_pmd(mm, addr); 945 pmd = mm_find_pmd(mm, addr);
946 if (!pmd) 946 if (!pmd)
947 goto out; 947 goto out;
948 BUG_ON(pmd_trans_huge(*pmd));
949 948
950 mmun_start = addr; 949 mmun_start = addr;
951 mmun_end = addr + PAGE_SIZE; 950 mmun_end = addr + PAGE_SIZE;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 284974230459..eb58de19f815 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -656,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
656 * @nodes and @flags,) it's isolated and queued to the pagelist which is 656 * @nodes and @flags,) it's isolated and queued to the pagelist which is
657 * passed via @private.) 657 * passed via @private.)
658 */ 658 */
659static struct vm_area_struct * 659static int
660queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 660queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
661 const nodemask_t *nodes, unsigned long flags, void *private) 661 const nodemask_t *nodes, unsigned long flags, void *private)
662{ 662{
663 int err; 663 int err = 0;
664 struct vm_area_struct *first, *vma, *prev; 664 struct vm_area_struct *vma, *prev;
665
666 665
667 first = find_vma(mm, start); 666 vma = find_vma(mm, start);
668 if (!first) 667 if (!vma)
669 return ERR_PTR(-EFAULT); 668 return -EFAULT;
670 prev = NULL; 669 prev = NULL;
671 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 670 for (; vma && vma->vm_start < end; vma = vma->vm_next) {
672 unsigned long endvma = vma->vm_end; 671 unsigned long endvma = vma->vm_end;
673 672
674 if (endvma > end) 673 if (endvma > end)
@@ -678,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
678 677
679 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 678 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
680 if (!vma->vm_next && vma->vm_end < end) 679 if (!vma->vm_next && vma->vm_end < end)
681 return ERR_PTR(-EFAULT); 680 return -EFAULT;
682 if (prev && prev->vm_end < vma->vm_start) 681 if (prev && prev->vm_end < vma->vm_start)
683 return ERR_PTR(-EFAULT); 682 return -EFAULT;
684 } 683 }
685 684
686 if (flags & MPOL_MF_LAZY) { 685 if (flags & MPOL_MF_LAZY) {
@@ -694,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
694 693
695 err = queue_pages_pgd_range(vma, start, endvma, nodes, 694 err = queue_pages_pgd_range(vma, start, endvma, nodes,
696 flags, private); 695 flags, private);
697 if (err) { 696 if (err)
698 first = ERR_PTR(err);
699 break; 697 break;
700 }
701 } 698 }
702next: 699next:
703 prev = vma; 700 prev = vma;
704 } 701 }
705 return first; 702 return err;
706} 703}
707 704
708/* 705/*
@@ -1156,16 +1153,17 @@ out:
1156 1153
1157/* 1154/*
1158 * Allocate a new page for page migration based on vma policy. 1155 * Allocate a new page for page migration based on vma policy.
1159 * Start assuming that page is mapped by vma pointed to by @private. 1156 * Start by assuming the page is mapped by the same vma as contains @start.
1160 * Search forward from there, if not. N.B., this assumes that the 1157 * Search forward from there, if not. N.B., this assumes that the
1161 * list of pages handed to migrate_pages()--which is how we get here-- 1158 * list of pages handed to migrate_pages()--which is how we get here--
1162 * is in virtual address order. 1159 * is in virtual address order.
1163 */ 1160 */
1164static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1161static struct page *new_page(struct page *page, unsigned long start, int **x)
1165{ 1162{
1166 struct vm_area_struct *vma = (struct vm_area_struct *)private; 1163 struct vm_area_struct *vma;
1167 unsigned long uninitialized_var(address); 1164 unsigned long uninitialized_var(address);
1168 1165
1166 vma = find_vma(current->mm, start);
1169 while (vma) { 1167 while (vma) {
1170 address = page_address_in_vma(page, vma); 1168 address = page_address_in_vma(page, vma);
1171 if (address != -EFAULT) 1169 if (address != -EFAULT)
@@ -1195,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1195 return -ENOSYS; 1193 return -ENOSYS;
1196} 1194}
1197 1195
1198static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1196static struct page *new_page(struct page *page, unsigned long start, int **x)
1199{ 1197{
1200 return NULL; 1198 return NULL;
1201} 1199}
@@ -1205,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len,
1205 unsigned short mode, unsigned short mode_flags, 1203 unsigned short mode, unsigned short mode_flags,
1206 nodemask_t *nmask, unsigned long flags) 1204 nodemask_t *nmask, unsigned long flags)
1207{ 1205{
1208 struct vm_area_struct *vma;
1209 struct mm_struct *mm = current->mm; 1206 struct mm_struct *mm = current->mm;
1210 struct mempolicy *new; 1207 struct mempolicy *new;
1211 unsigned long end; 1208 unsigned long end;
@@ -1271,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1271 if (err) 1268 if (err)
1272 goto mpol_out; 1269 goto mpol_out;
1273 1270
1274 vma = queue_pages_range(mm, start, end, nmask, 1271 err = queue_pages_range(mm, start, end, nmask,
1275 flags | MPOL_MF_INVERT, &pagelist); 1272 flags | MPOL_MF_INVERT, &pagelist);
1276 1273 if (!err)
1277 err = PTR_ERR(vma); /* maybe ... */
1278 if (!IS_ERR(vma))
1279 err = mbind_range(mm, start, end, new); 1274 err = mbind_range(mm, start, end, new);
1280 1275
1281 if (!err) { 1276 if (!err) {
@@ -1283,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1283 1278
1284 if (!list_empty(&pagelist)) { 1279 if (!list_empty(&pagelist)) {
1285 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1280 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1286 nr_failed = migrate_pages(&pagelist, new_vma_page, 1281 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1287 NULL, (unsigned long)vma, 1282 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1288 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1289 if (nr_failed) 1283 if (nr_failed)
1290 putback_movable_pages(&pagelist); 1284 putback_movable_pages(&pagelist);
1291 } 1285 }
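The mempolicy refactor converts queue_pages_range() from returning a vma pointer with errors encoded via ERR_PTR() into returning 0 or a negative errno, and new_page() (formerly new_vma_page()) now receives the original start address and calls find_vma() itself, so do_mbind() no longer needs the returned pointer at all. A small userspace sketch contrasting the two calling conventions; the ERR_PTR()/IS_ERR()/PTR_ERR() helpers below are simplified re-implementations for the demo, not the kernel headers:

#include <stdio.h>
#include <errno.h>

/* Simplified versions of the kernel's pointer-error helpers: a small range
 * of "pointer" values just below the top of the address space encodes an
 * errno.  Good enough for this demo. */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct vma { int id; };

static struct vma valid = { 1 };

/* Old convention: return a useful pointer, or an ERR_PTR-encoded errno. */
static struct vma *old_style(int fail)
{
	return fail ? ERR_PTR(-EFAULT) : &valid;
}

/* New convention: the caller does not need the pointer, so just return
 * 0 or a negative errno -- the shape of the queue_pages_range() change. */
static int new_style(int fail)
{
	return fail ? -EFAULT : 0;
}

int main(void)
{
	struct vma *v = old_style(1);
	int err;

	if (IS_ERR(v))
		printf("old style error: %ld\n", PTR_ERR(v));

	err = new_style(1);
	if (err)
		printf("new style error: %d\n", err);
	return 0;
}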
diff --git a/mm/migrate.c b/mm/migrate.c
index 63f0cd559999..9e0beaa91845 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 pmd = mm_find_pmd(mm, addr); 120 pmd = mm_find_pmd(mm, addr);
121 if (!pmd) 121 if (!pmd)
122 goto out; 122 goto out;
123 if (pmd_trans_huge(*pmd))
124 goto out;
125 123
126 ptep = pte_offset_map(pmd, addr); 124 ptep = pte_offset_map(pmd, addr);
127 125
diff --git a/mm/nommu.c b/mm/nommu.c
index b78e3a8f5ee7..4a852f6c5709 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -786,7 +786,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
786 for (i = 0; i < VMACACHE_SIZE; i++) { 786 for (i = 0; i < VMACACHE_SIZE; i++) {
787 /* if the vma is cached, invalidate the entire cache */ 787 /* if the vma is cached, invalidate the entire cache */
788 if (curr->vmacache[i] == vma) { 788 if (curr->vmacache[i] == vma) {
789 vmacache_invalidate(curr->mm); 789 vmacache_invalidate(mm);
790 break; 790 break;
791 } 791 }
792 } 792 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f59fa29eda8..20d17f8266fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
69 69
70/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71static DEFINE_MUTEX(pcp_batch_high_lock); 71static DEFINE_MUTEX(pcp_batch_high_lock);
72#define MIN_PERCPU_PAGELIST_FRACTION (8)
72 73
73#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
74DEFINE_PER_CPU(int, numa_node); 75DEFINE_PER_CPU(int, numa_node);
@@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
4145 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4146 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4146#endif 4147#endif
4147 4148
4148static int __meminit zone_batchsize(struct zone *zone) 4149static int zone_batchsize(struct zone *zone)
4149{ 4150{
4150#ifdef CONFIG_MMU 4151#ifdef CONFIG_MMU
4151 int batch; 4152 int batch;
@@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
4261 pageset_update(&p->pcp, high, batch); 4262 pageset_update(&p->pcp, high, batch);
4262} 4263}
4263 4264
4264static void __meminit pageset_set_high_and_batch(struct zone *zone, 4265static void pageset_set_high_and_batch(struct zone *zone,
4265 struct per_cpu_pageset *pcp) 4266 struct per_cpu_pageset *pcp)
4266{ 4267{
4267 if (percpu_pagelist_fraction) 4268 if (percpu_pagelist_fraction)
4268 pageset_set_high(pcp, 4269 pageset_set_high(pcp,
@@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
5881 void __user *buffer, size_t *length, loff_t *ppos) 5882 void __user *buffer, size_t *length, loff_t *ppos)
5882{ 5883{
5883 struct zone *zone; 5884 struct zone *zone;
5884 unsigned int cpu; 5885 int old_percpu_pagelist_fraction;
5885 int ret; 5886 int ret;
5886 5887
5888 mutex_lock(&pcp_batch_high_lock);
5889 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5890
5887 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5891 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5888 if (!write || (ret < 0)) 5892 if (!write || ret < 0)
5889 return ret; 5893 goto out;
5894
5895 /* Sanity checking to avoid pcp imbalance */
5896 if (percpu_pagelist_fraction &&
5897 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5898 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5899 ret = -EINVAL;
5900 goto out;
5901 }
5902
5903 /* No change? */
5904 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5905 goto out;
5890 5906
5891 mutex_lock(&pcp_batch_high_lock);
5892 for_each_populated_zone(zone) { 5907 for_each_populated_zone(zone) {
5893 unsigned long high; 5908 unsigned int cpu;
5894 high = zone->managed_pages / percpu_pagelist_fraction; 5909
5895 for_each_possible_cpu(cpu) 5910 for_each_possible_cpu(cpu)
5896 pageset_set_high(per_cpu_ptr(zone->pageset, cpu), 5911 pageset_set_high_and_batch(zone,
5897 high); 5912 per_cpu_ptr(zone->pageset, cpu));
5898 } 5913 }
5914out:
5899 mutex_unlock(&pcp_batch_high_lock); 5915 mutex_unlock(&pcp_batch_high_lock);
5900 return 0; 5916 return ret;
5901} 5917}
5902 5918
5903int hashdist = HASHDIST_DEFAULT; 5919int hashdist = HASHDIST_DEFAULT;
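The rewritten percpu_pagelist_fraction_sysctl_handler() holds pcp_batch_high_lock across the whole update, rejects nonzero values below MIN_PERCPU_PAGELIST_FRACTION (8) and restores the previous setting in that case, skips the per-zone work when the value did not change, recomputes both ->high and ->batch through pageset_set_high_and_batch(), and returns the real error code instead of always returning 0. A userspace sketch of that validate-or-restore shape, assuming stand-in names and a pthread mutex in place of the kernel mutex:

#include <stdio.h>
#include <errno.h>
#include <pthread.h>

#define MIN_FRACTION 8            /* mirrors MIN_PERCPU_PAGELIST_FRACTION */

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static int fraction;              /* 0 means "use the default heuristic" */
static long high;                 /* derived value recomputed on change */
static const long managed_pages = 1 << 18;

/* Hypothetical setter mirroring the patched handler: validate under the
 * lock, restore the old value and fail on bad input, and only recompute
 * derived state when the value actually changed. */
static int set_fraction(int new_fraction)
{
	int old_fraction, ret = 0;

	pthread_mutex_lock(&update_lock);
	old_fraction = fraction;
	fraction = new_fraction;          /* proc_dointvec() writes first... */

	/* ...so bad input must be rolled back, as the patch does */
	if (fraction && fraction < MIN_FRACTION) {
		fraction = old_fraction;
		ret = -EINVAL;
		goto out;
	}

	if (fraction == old_fraction)     /* no change? skip the heavy work */
		goto out;

	high = fraction ? managed_pages / fraction : 0;
out:
	pthread_mutex_unlock(&update_lock);
	return ret;
}

int main(void)
{
	printf("%d high=%ld\n", set_fraction(4), high);   /* -EINVAL, unchanged */
	printf("%d high=%ld\n", set_fraction(32), high);  /* 0, 8192 */
	printf("%d high=%ld\n", set_fraction(32), high);  /* 0, still 8192 */
	return 0;
}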
diff --git a/mm/rmap.c b/mm/rmap.c
index bf05fc872ae8..b7e94ebbd09e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -569,6 +569,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
569 pgd_t *pgd; 569 pgd_t *pgd;
570 pud_t *pud; 570 pud_t *pud;
571 pmd_t *pmd = NULL; 571 pmd_t *pmd = NULL;
572 pmd_t pmde;
572 573
573 pgd = pgd_offset(mm, address); 574 pgd = pgd_offset(mm, address);
574 if (!pgd_present(*pgd)) 575 if (!pgd_present(*pgd))
@@ -579,7 +580,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
579 goto out; 580 goto out;
580 581
581 pmd = pmd_offset(pud, address); 582 pmd = pmd_offset(pud, address);
582 if (!pmd_present(*pmd)) 583 /*
584 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
585 * without holding anon_vma lock for write. So when looking for a
586 * genuine pmde (in which to find pte), test present and !THP together.
587 */
588 pmde = ACCESS_ONCE(*pmd);
589 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
583 pmd = NULL; 590 pmd = NULL;
584out: 591out:
585 return pmd; 592 return pmd;
@@ -615,9 +622,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
615 if (!pmd) 622 if (!pmd)
616 return NULL; 623 return NULL;
617 624
618 if (pmd_trans_huge(*pmd))
619 return NULL;
620
621 pte = pte_offset_map(pmd, address); 625 pte = pte_offset_map(pmd, address);
622 /* Make a quick check before getting the lock */ 626 /* Make a quick check before getting the lock */
623 if (!sync && !pte_present(*pte)) { 627 if (!sync && !pte_present(*pte)) {
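mm_find_pmd() now snapshots the pmd with ACCESS_ONCE() and tests present and not-transparent-huge against that single read, since THP code may do pmdp_clear_flush()/set_pmd_at() without holding the anon_vma lock for write; that is also why the callers' own pmd_trans_huge(*pmd) checks (in the huge_memory.c, ksm.c, migrate.c and rmap.c hunks above) could be dropped. A minimal sketch of the snapshot-then-test pattern on a plain userspace word, using the classic volatile-cast ACCESS_ONCE() definition:

#include <stdio.h>

/* Classic definition: force exactly one read that the compiler cannot
 * merge with, or split from, other accesses to the same variable. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

#define ENTRY_PRESENT (1UL << 0)
#define ENTRY_HUGE    (1UL << 1)

static unsigned long slot = ENTRY_PRESENT;  /* updated by an imagined writer */

static unsigned long *find_entry(void)
{
	unsigned long snap;

	/*
	 * Read the shared word once into a local, then make every decision
	 * on that snapshot -- mirroring "pmde = ACCESS_ONCE(*pmd)" followed
	 * by the pmd_present(pmde) / pmd_trans_huge(pmde) tests.  Testing
	 * *pmd twice could observe two different values if a writer races.
	 */
	snap = ACCESS_ONCE(slot);
	if (!(snap & ENTRY_PRESENT) || (snap & ENTRY_HUGE))
		return NULL;

	return &slot;
}

int main(void)
{
	printf("%s\n", find_entry() ? "usable" : "skip");

	slot |= ENTRY_HUGE;     /* a concurrent writer would do this instead */
	printf("%s\n", find_entry() ? "usable" : "skip");
	return 0;
}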
diff --git a/mm/shmem.c b/mm/shmem.c
index f484c276e994..8f419cff9e34 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
80#define SHORT_SYMLINK_LEN 128 80#define SHORT_SYMLINK_LEN 128
81 81
82/* 82/*
83 * shmem_fallocate and shmem_writepage communicate via inode->i_private 83 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
84 * (with i_mutex making sure that it has only one user at a time): 84 * inode->i_private (with i_mutex making sure that it has only one user at
85 * we would prefer not to enlarge the shmem inode just for that. 85 * a time): we would prefer not to enlarge the shmem inode just for that.
86 */ 86 */
87struct shmem_falloc { 87struct shmem_falloc {
88 int mode; /* FALLOC_FL mode currently operating */
88 pgoff_t start; /* start of range currently being fallocated */ 89 pgoff_t start; /* start of range currently being fallocated */
89 pgoff_t next; /* the next page offset to be fallocated */ 90 pgoff_t next; /* the next page offset to be fallocated */
90 pgoff_t nr_falloced; /* how many new pages have been fallocated */ 91 pgoff_t nr_falloced; /* how many new pages have been fallocated */
@@ -759,6 +760,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
759 spin_lock(&inode->i_lock); 760 spin_lock(&inode->i_lock);
760 shmem_falloc = inode->i_private; 761 shmem_falloc = inode->i_private;
761 if (shmem_falloc && 762 if (shmem_falloc &&
763 !shmem_falloc->mode &&
762 index >= shmem_falloc->start && 764 index >= shmem_falloc->start &&
763 index < shmem_falloc->next) 765 index < shmem_falloc->next)
764 shmem_falloc->nr_unswapped++; 766 shmem_falloc->nr_unswapped++;
@@ -1233,6 +1235,44 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1233 int error; 1235 int error;
1234 int ret = VM_FAULT_LOCKED; 1236 int ret = VM_FAULT_LOCKED;
1235 1237
1238 /*
1239 * Trinity finds that probing a hole which tmpfs is punching can
1240 * prevent the hole-punch from ever completing: which in turn
1241 * locks writers out with its hold on i_mutex. So refrain from
1242 * faulting pages into the hole while it's being punched, and
1243 * wait on i_mutex to be released if vmf->flags permits.
1244 */
1245 if (unlikely(inode->i_private)) {
1246 struct shmem_falloc *shmem_falloc;
1247
1248 spin_lock(&inode->i_lock);
1249 shmem_falloc = inode->i_private;
1250 if (!shmem_falloc ||
1251 shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE ||
1252 vmf->pgoff < shmem_falloc->start ||
1253 vmf->pgoff >= shmem_falloc->next)
1254 shmem_falloc = NULL;
1255 spin_unlock(&inode->i_lock);
1256 /*
1257 * i_lock has protected us from taking shmem_falloc seriously
1258 * once return from shmem_fallocate() went back up that stack.
1259 * i_lock does not serialize with i_mutex at all, but it does
1260 * not matter if sometimes we wait unnecessarily, or sometimes
1261 * miss out on waiting: we just need to make those cases rare.
1262 */
1263 if (shmem_falloc) {
1264 if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
1265 !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
1266 up_read(&vma->vm_mm->mmap_sem);
1267 mutex_lock(&inode->i_mutex);
1268 mutex_unlock(&inode->i_mutex);
1269 return VM_FAULT_RETRY;
1270 }
1271 /* cond_resched? Leave that to GUP or return to user */
1272 return VM_FAULT_NOPAGE;
1273 }
1274 }
1275
1236 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1276 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1237 if (error) 1277 if (error)
1238 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1278 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1724,20 +1764,31 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1724 pgoff_t start, index, end; 1764 pgoff_t start, index, end;
1725 int error; 1765 int error;
1726 1766
1767 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1768 return -EOPNOTSUPP;
1769
1727 mutex_lock(&inode->i_mutex); 1770 mutex_lock(&inode->i_mutex);
1728 1771
1772 shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE;
1773
1729 if (mode & FALLOC_FL_PUNCH_HOLE) { 1774 if (mode & FALLOC_FL_PUNCH_HOLE) {
1730 struct address_space *mapping = file->f_mapping; 1775 struct address_space *mapping = file->f_mapping;
1731 loff_t unmap_start = round_up(offset, PAGE_SIZE); 1776 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1732 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 1777 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1733 1778
1779 shmem_falloc.start = unmap_start >> PAGE_SHIFT;
1780 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
1781 spin_lock(&inode->i_lock);
1782 inode->i_private = &shmem_falloc;
1783 spin_unlock(&inode->i_lock);
1784
1734 if ((u64)unmap_end > (u64)unmap_start) 1785 if ((u64)unmap_end > (u64)unmap_start)
1735 unmap_mapping_range(mapping, unmap_start, 1786 unmap_mapping_range(mapping, unmap_start,
1736 1 + unmap_end - unmap_start, 0); 1787 1 + unmap_end - unmap_start, 0);
1737 shmem_truncate_range(inode, offset, offset + len - 1); 1788 shmem_truncate_range(inode, offset, offset + len - 1);
1738 /* No need to unmap again: hole-punching leaves COWed pages */ 1789 /* No need to unmap again: hole-punching leaves COWed pages */
1739 error = 0; 1790 error = 0;
1740 goto out; 1791 goto undone;
1741 } 1792 }
1742 1793
1743 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 1794 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
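The shmem changes close a livelock found by Trinity: shmem_fallocate() now publishes the range being hole-punched through inode->i_private (under i_lock), and shmem_fault() consults it before instantiating pages, either backing off with VM_FAULT_NOPAGE or, when the fault flags allow waiting, dropping mmap_sem and cycling i_mutex so it returns VM_FAULT_RETRY only after the punch has finished. The fault-side decision can be captured as a small pure function; the flag and result names below mirror the kernel's for readability, but the values are local to this sketch:

#include <stdio.h>

/* Local stand-ins; values are arbitrary for the demo. */
#define FAULT_FLAG_ALLOW_RETRY  0x1
#define FAULT_FLAG_RETRY_NOWAIT 0x2

enum fault_result { PROCEED, NOPAGE, RETRY_AFTER_WAIT };

struct falloc_range { unsigned long start, next; };   /* page offsets */

/*
 * Mirror of the decision added to shmem_fault(): if a hole-punch covering
 * this offset is in flight, never instantiate the page; wait for the punch
 * (by cycling the inode mutex in the real code) only when the fault flags
 * say retrying is allowed and the caller is willing to wait.
 */
static enum fault_result check_hole_punch(const struct falloc_range *punch,
					  unsigned long pgoff,
					  unsigned int flags)
{
	if (!punch || pgoff < punch->start || pgoff >= punch->next)
		return PROCEED;

	if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
	    !(flags & FAULT_FLAG_RETRY_NOWAIT))
		return RETRY_AFTER_WAIT;   /* drop mmap_sem, wait on i_mutex */

	return NOPAGE;                     /* back off without a page */
}

int main(void)
{
	struct falloc_range punch = { 16, 64 };

	printf("%d\n", check_hole_punch(&punch, 8, FAULT_FLAG_ALLOW_RETRY));
	printf("%d\n", check_hole_punch(&punch, 20, FAULT_FLAG_ALLOW_RETRY));
	printf("%d\n", check_hole_punch(&punch, 20,
			FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT));
	printf("%d\n", check_hole_punch(NULL, 20, 0));
	return 0;
}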
diff --git a/mm/slab.c b/mm/slab.c
index 9ca3b87edabc..3070b929a1bf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
386 386
387#endif 387#endif
388 388
389#define OBJECT_FREE (0)
390#define OBJECT_ACTIVE (1)
391
392#ifdef CONFIG_DEBUG_SLAB_LEAK
393
394static void set_obj_status(struct page *page, int idx, int val)
395{
396 int freelist_size;
397 char *status;
398 struct kmem_cache *cachep = page->slab_cache;
399
400 freelist_size = cachep->num * sizeof(freelist_idx_t);
401 status = (char *)page->freelist + freelist_size;
402 status[idx] = val;
403}
404
405static inline unsigned int get_obj_status(struct page *page, int idx)
406{
407 int freelist_size;
408 char *status;
409 struct kmem_cache *cachep = page->slab_cache;
410
411 freelist_size = cachep->num * sizeof(freelist_idx_t);
412 status = (char *)page->freelist + freelist_size;
413
414 return status[idx];
415}
416
417#else
418static inline void set_obj_status(struct page *page, int idx, int val) {}
419
420#endif
421
389/* 422/*
390 * Do not go above this order unless 0 objects fit into the slab or 423 * Do not go above this order unless 0 objects fit into the slab or
391 * overridden on the command line. 424 * overridden on the command line.
@@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
576 return cachep->array[smp_processor_id()]; 609 return cachep->array[smp_processor_id()];
577} 610}
578 611
612static size_t calculate_freelist_size(int nr_objs, size_t align)
613{
614 size_t freelist_size;
615
616 freelist_size = nr_objs * sizeof(freelist_idx_t);
617 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
618 freelist_size += nr_objs * sizeof(char);
619
620 if (align)
621 freelist_size = ALIGN(freelist_size, align);
622
623 return freelist_size;
624}
625
579static int calculate_nr_objs(size_t slab_size, size_t buffer_size, 626static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
580 size_t idx_size, size_t align) 627 size_t idx_size, size_t align)
581{ 628{
582 int nr_objs; 629 int nr_objs;
630 size_t remained_size;
583 size_t freelist_size; 631 size_t freelist_size;
632 int extra_space = 0;
584 633
634 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
635 extra_space = sizeof(char);
585 /* 636 /*
586 * Ignore padding for the initial guess. The padding 637 * Ignore padding for the initial guess. The padding
587 * is at most @align-1 bytes, and @buffer_size is at 638 * is at most @align-1 bytes, and @buffer_size is at
@@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
590 * into the memory allocation when taking the padding 641 * into the memory allocation when taking the padding
591 * into account. 642 * into account.
592 */ 643 */
593 nr_objs = slab_size / (buffer_size + idx_size); 644 nr_objs = slab_size / (buffer_size + idx_size + extra_space);
594 645
595 /* 646 /*
596 * This calculated number will be either the right 647 * This calculated number will be either the right
597 * amount, or one greater than what we want. 648 * amount, or one greater than what we want.
598 */ 649 */
599 freelist_size = slab_size - nr_objs * buffer_size; 650 remained_size = slab_size - nr_objs * buffer_size;
600 if (freelist_size < ALIGN(nr_objs * idx_size, align)) 651 freelist_size = calculate_freelist_size(nr_objs, align);
652 if (remained_size < freelist_size)
601 nr_objs--; 653 nr_objs--;
602 654
603 return nr_objs; 655 return nr_objs;
@@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
635 } else { 687 } else {
636 nr_objs = calculate_nr_objs(slab_size, buffer_size, 688 nr_objs = calculate_nr_objs(slab_size, buffer_size,
637 sizeof(freelist_idx_t), align); 689 sizeof(freelist_idx_t), align);
638 mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); 690 mgmt_size = calculate_freelist_size(nr_objs, align);
639 } 691 }
640 *num = nr_objs; 692 *num = nr_objs;
641 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 693 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2041,13 +2093,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2041 break; 2093 break;
2042 2094
2043 if (flags & CFLGS_OFF_SLAB) { 2095 if (flags & CFLGS_OFF_SLAB) {
2096 size_t freelist_size_per_obj = sizeof(freelist_idx_t);
2044 /* 2097 /*
2045 * Max number of objs-per-slab for caches which 2098 * Max number of objs-per-slab for caches which
2046 * use off-slab slabs. Needed to avoid a possible 2099 * use off-slab slabs. Needed to avoid a possible
2047 * looping condition in cache_grow(). 2100 * looping condition in cache_grow().
2048 */ 2101 */
2102 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
2103 freelist_size_per_obj += sizeof(char);
2049 offslab_limit = size; 2104 offslab_limit = size;
2050 offslab_limit /= sizeof(freelist_idx_t); 2105 offslab_limit /= freelist_size_per_obj;
2051 2106
2052 if (num > offslab_limit) 2107 if (num > offslab_limit)
2053 break; 2108 break;
@@ -2294,8 +2349,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2294 if (!cachep->num) 2349 if (!cachep->num)
2295 return -E2BIG; 2350 return -E2BIG;
2296 2351
2297 freelist_size = 2352 freelist_size = calculate_freelist_size(cachep->num, cachep->align);
2298 ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
2299 2353
2300 /* 2354 /*
2301 * If the slab has been placed off-slab, and we have enough space then 2355 * If the slab has been placed off-slab, and we have enough space then
@@ -2308,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2308 2362
2309 if (flags & CFLGS_OFF_SLAB) { 2363 if (flags & CFLGS_OFF_SLAB) {
2310 /* really off slab. No need for manual alignment */ 2364 /* really off slab. No need for manual alignment */
2311 freelist_size = cachep->num * sizeof(freelist_idx_t); 2365 freelist_size = calculate_freelist_size(cachep->num, 0);
2312 2366
2313#ifdef CONFIG_PAGE_POISONING 2367#ifdef CONFIG_PAGE_POISONING
2314 /* If we're going to use the generic kernel_map_pages() 2368 /* If we're going to use the generic kernel_map_pages()
@@ -2612,6 +2666,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2612 if (cachep->ctor) 2666 if (cachep->ctor)
2613 cachep->ctor(objp); 2667 cachep->ctor(objp);
2614#endif 2668#endif
2669 set_obj_status(page, i, OBJECT_FREE);
2615 set_free_obj(page, i, i); 2670 set_free_obj(page, i, i);
2616 } 2671 }
2617} 2672}
@@ -2820,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2820 BUG_ON(objnr >= cachep->num); 2875 BUG_ON(objnr >= cachep->num);
2821 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2876 BUG_ON(objp != index_to_obj(cachep, page, objnr));
2822 2877
2878 set_obj_status(page, objnr, OBJECT_FREE);
2823 if (cachep->flags & SLAB_POISON) { 2879 if (cachep->flags & SLAB_POISON) {
2824#ifdef CONFIG_DEBUG_PAGEALLOC 2880#ifdef CONFIG_DEBUG_PAGEALLOC
2825 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2881 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2953,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2953static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3009static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2954 gfp_t flags, void *objp, unsigned long caller) 3010 gfp_t flags, void *objp, unsigned long caller)
2955{ 3011{
3012 struct page *page;
3013
2956 if (!objp) 3014 if (!objp)
2957 return objp; 3015 return objp;
2958 if (cachep->flags & SLAB_POISON) { 3016 if (cachep->flags & SLAB_POISON) {
@@ -2983,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2983 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3041 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2984 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3042 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2985 } 3043 }
3044
3045 page = virt_to_head_page(objp);
3046 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
2986 objp += obj_offset(cachep); 3047 objp += obj_offset(cachep);
2987 if (cachep->ctor && cachep->flags & SLAB_POISON) 3048 if (cachep->ctor && cachep->flags & SLAB_POISON)
2988 cachep->ctor(objp); 3049 cachep->ctor(objp);
@@ -4219,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
4219 struct page *page) 4280 struct page *page)
4220{ 4281{
4221 void *p; 4282 void *p;
4222 int i, j; 4283 int i;
4223 4284
4224 if (n[0] == n[1]) 4285 if (n[0] == n[1])
4225 return; 4286 return;
4226 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { 4287 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
4227 bool active = true; 4288 if (get_obj_status(page, i) != OBJECT_ACTIVE)
4228
4229 for (j = page->active; j < c->num; j++) {
4230 /* Skip freed item */
4231 if (get_free_obj(page, j) == i) {
4232 active = false;
4233 break;
4234 }
4235 }
4236 if (!active)
4237 continue; 4289 continue;
4238 4290
4239 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4291 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
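With CONFIG_DEBUG_SLAB_LEAK, the slab changes store an explicit per-object status byte directly after the freelist index array, so leaks_show()'s handle_slab() no longer rescans the free list for every object, and all of the sizing logic funnels through the new calculate_freelist_size(). A standalone sketch of that arithmetic; the one-byte freelist_idx_t and the example alignment are assumptions of the demo, not values taken from a particular kernel configuration:

#include <stdio.h>
#include <stddef.h>

typedef unsigned char freelist_idx_t;   /* assumed 1-byte index for the demo */

#define LEAK_DEBUG 1                    /* stand-in for CONFIG_DEBUG_SLAB_LEAK */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

/* Same shape as the new calculate_freelist_size(): index array, plus one
 * status byte per object when leak debugging is on, padded to the cache's
 * alignment when one is given. */
static size_t freelist_size(int nr_objs, size_t align)
{
	size_t size = nr_objs * sizeof(freelist_idx_t);

	if (LEAK_DEBUG)
		size += nr_objs * sizeof(char);
	if (align)
		size = ALIGN(size, align);
	return size;
}

/* Same shape as the adjusted calculate_nr_objs(): the first guess ignores
 * padding, then backs off by one object if the leftover space cannot hold
 * the (aligned) freelist. */
static int nr_objs(size_t slab_size, size_t buffer_size, size_t align)
{
	size_t idx_size = sizeof(freelist_idx_t);
	size_t extra = LEAK_DEBUG ? sizeof(char) : 0;
	int n = slab_size / (buffer_size + idx_size + extra);

	if (slab_size - n * buffer_size < freelist_size(n, align))
		n--;
	return n;
}

int main(void)
{
	size_t slab = 4096, obj = 256, align = 8;
	int n = nr_objs(slab, obj, align);

	printf("objects per slab: %d\n", n);
	printf("freelist bytes:   %zu\n", freelist_size(n, align));
	printf("left over:        %zu\n", slab - n * obj - freelist_size(n, align));
	return 0;
}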
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 010b18ef4ea0..182be0f12407 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3476,12 +3476,17 @@ sub process {
3476 } 3476 }
3477 } 3477 }
3478 3478
3479# unnecessary return in a void function? (a single leading tab, then return;) 3479# unnecessary return in a void function
3480 if ($sline =~ /^\+\treturn\s*;\s*$/ && 3480# at end-of-function, with the previous line a single leading tab, then return;
3481 $prevline =~ /^\+/) { 3481# and the line before that not a goto label target like "out:"
3482 if ($sline =~ /^[ \+]}\s*$/ &&
3483 $prevline =~ /^\+\treturn\s*;\s*$/ &&
3484 $linenr >= 3 &&
3485 $lines[$linenr - 3] =~ /^[ +]/ &&
3486 $lines[$linenr - 3] !~ /^[ +]\s*$Ident\s*:/) {
3482 WARN("RETURN_VOID", 3487 WARN("RETURN_VOID",
3483 "void function return statements are not generally useful\n" . $herecurr); 3488 "void function return statements are not generally useful\n" . $hereprev);
3484 } 3489 }
3485 3490
3486# if statements using unnecessary parentheses - ie: if ((foo == bar)) 3491# if statements using unnecessary parentheses - ie: if ((foo == bar))
3487 if ($^V && $^V ge 5.10.0 && 3492 if ($^V && $^V ge 5.10.0 &&