Diffstat:
-rw-r--r--  CREDITS | 4
-rw-r--r--  Documentation/accounting/getdelays.c | 1
-rw-r--r--  Documentation/kernel-parameters.txt | 5
-rw-r--r--  Documentation/memory-hotplug.txt | 15
-rw-r--r--  Documentation/sysctl/kernel.txt | 17
-rw-r--r--  Documentation/sysctl/vm.txt | 3
-rw-r--r--  MAINTAINERS | 8
-rw-r--r--  arch/ia64/include/uapi/asm/fcntl.h | 1
-rw-r--r--  arch/sparc/include/asm/irq_64.h | 2
-rw-r--r--  arch/sparc/kernel/process_64.c | 18
-rw-r--r--  arch/x86/include/asm/irq.h | 2
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 18
-rw-r--r--  drivers/base/dma-contiguous.c | 12
-rw-r--r--  drivers/memstick/host/rtsx_pci_ms.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 57
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 13
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 18
-rw-r--r--  fs/ocfs2/namei.c | 145
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 8
-rw-r--r--  fs/ocfs2/super.c | 8
-rw-r--r--  include/linux/nmi.h | 12
-rw-r--r--  include/linux/page-flags.h | 3
-rw-r--r--  kernel/kexec.c | 1
-rw-r--r--  kernel/smp.c | 57
-rw-r--r--  kernel/sysctl.c | 14
-rw-r--r--  kernel/watchdog.c | 41
-rw-r--r--  lib/Kconfig.debug | 4
-rw-r--r--  mm/huge_memory.c | 57
-rw-r--r--  mm/hugetlb.c | 71
-rw-r--r--  mm/ksm.c | 1
-rw-r--r--  mm/mempolicy.c | 46
-rw-r--r--  mm/migrate.c | 2
-rw-r--r--  mm/nommu.c | 2
-rw-r--r--  mm/page_alloc.c | 40
-rw-r--r--  mm/rmap.c | 12
-rw-r--r--  mm/shmem.c | 59
-rw-r--r--  mm/slab.c | 90
-rwxr-xr-x  scripts/checkpatch.pl | 15
41 files changed, 703 insertions, 189 deletions
diff --git a/CREDITS b/CREDITS
index c322dcfb926d..28ee1514b9de 100644
--- a/CREDITS
+++ b/CREDITS
@@ -9,6 +9,10 @@
 Linus
 ----------
 
+M: Matt Mackal
+E: mpm@selenic.com
+D: SLOB slab allocator
+
 N: Matti Aarnio
 E: mea@nic.funet.fi
 D: Alpha systems hacking, IPv6 and other network related stuff
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index c6a06b71594d..f40578026a04 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -314,6 +314,7 @@ int main(int argc, char *argv[])
 break;
 case 'm':
 strncpy(cpumask, optarg, sizeof(cpumask));
+cpumask[sizeof(cpumask) - 1] = '\0';
 maskset = 1;
 printf("cpumask %s maskset %d\n", cpumask, maskset);
 break;
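
The one-line fix above follows the usual idiom: strncpy() leaves the destination unterminated when the source fills it, so the last byte is forced to '\0'. A minimal standalone sketch of the same idiom (the helper name is illustrative, not part of the patch):

#include <string.h>

/* Illustrative only: copy a string into a fixed-size buffer and force NUL
 * termination, since strncpy() does not terminate when src is at least
 * dst_len bytes long. */
static void copy_bounded(char *dst, size_t dst_len, const char *src)
{
	strncpy(dst, src, dst_len);
	dst[dst_len - 1] = '\0';
}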
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 884904975d0b..c1b9aa8c5a52 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3130,6 +3130,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 [KNL] Should the soft-lockup detector generate panics.
 Format: <integer>
 
+softlockup_all_cpu_backtrace=
+[KNL] Should the soft-lockup detector generate
+backtraces on all cpus.
+Format: <integer>
+
 sonypi.*= [HW] Sony Programmable I/O Control Device driver
 See Documentation/laptops/sonypi.txt
 
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index f304edb8fbe7..45134dc23854 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -209,15 +209,12 @@ If memory device is found, memory hotplug code will be called.
 
 4.2 Notify memory hot-add event by hand
 ------------
-On powerpc, the firmware does not notify a memory hotplug event to the kernel.
-Therefore, "probe" interface is supported to notify the event to the kernel.
-This interface depends on CONFIG_ARCH_MEMORY_PROBE.
-
-CONFIG_ARCH_MEMORY_PROBE is supported on powerpc only. On x86, this config
-option is disabled by default since ACPI notifies a memory hotplug event to
-the kernel, which performs its hotplug operation as the result. Please
-enable this option if you need the "probe" interface for testing purposes
-on x86.
+On some architectures, the firmware may not notify the kernel of a memory
+hotplug event. Therefore, the memory "probe" interface is supported to
+explicitly notify the kernel. This interface depends on
+CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86
+if hotplug is supported, although for x86 this should be handled by ACPI
+notification.
 
 Probe interface is located at
 /sys/devices/system/memory/probe
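
For reference, the probe file documented above takes the physical start address of the memory being hot-added; a hedged userspace sketch (the address below is a placeholder, a real one must be section-aligned and match the platform's memory map):

#include <stdio.h>

/* Sketch only: notify the kernel of hot-added memory through the probe
 * interface. Requires CONFIG_ARCH_MEMORY_PROBE and root privileges. */
int main(void)
{
	FILE *f = fopen("/sys/devices/system/memory/probe", "w");

	if (!f) {
		perror("probe");
		return 1;
	}
	fprintf(f, "0x100000000\n");	/* placeholder physical address */
	return fclose(f) ? 1 : 0;
}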
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 708bb7f1b7e0..c14374e71775 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -75,6 +75,7 @@ show up in /proc/sys/kernel:
 - shmall
 - shmmax [ sysv ipc ]
 - shmmni
+- softlockup_all_cpu_backtrace
 - stop-a [ SPARC only ]
 - sysrq ==> Documentation/sysrq.txt
 - sysctl_writes_strict
@@ -783,6 +784,22 @@ via the /proc/sys interface:
 
 ==============================================================
 
+softlockup_all_cpu_backtrace:
+
+This value controls the soft lockup detector thread's behavior
+when a soft lockup condition is detected as to whether or not
+to gather further debug information. If enabled, each cpu will
+be issued an NMI and instructed to capture stack trace.
+
+This feature is only applicable for architectures which support
+NMI.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
+
+==============================================================
+
 tainted:
 
 Non-zero if the kernel has been tainted. Numeric values, which
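
The knob described above can be flipped at runtime through procfs (or set at boot with the softlockup_all_cpu_backtrace= parameter added earlier in this series); a minimal sketch, assuming the running kernel carries this patch so the file exists:

#include <stdio.h>

/* Sketch only: enable all-CPU backtraces on soft lockup detection. */
int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/softlockup_all_cpu_backtrace", "w");

	if (!f) {
		perror("softlockup_all_cpu_backtrace");
		return 1;
	}
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}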
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index bd4b34c03738..4415aa915681 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
 set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
 
 The initial value is zero. Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.
+the high water marks for each per cpu page list. If the user writes '0' to this
+sysctl, it will revert to this default behavior.
 
 ==============================================================
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 3f2e171047b9..3cc94fff780f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8196,13 +8196,15 @@ S: Maintained
 F: drivers/usb/misc/sisusbvga/
 
 SLAB ALLOCATOR
-M: Christoph Lameter <cl@linux-foundation.org>
+M: Christoph Lameter <cl@linux.com>
 M: Pekka Enberg <penberg@kernel.org>
-M: Matt Mackall <mpm@selenic.com>
+M: David Rientjes <rientjes@google.com>
+M: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+M: Andrew Morton <akpm@linux-foundation.org>
 L: linux-mm@kvack.org
 S: Maintained
 F: include/linux/sl?b*.h
-F: mm/sl?b.c
+F: mm/sl?b*
 
 SLEEPABLE READ-COPY UPDATE (SRCU)
 M: Lai Jiangshan <laijs@cn.fujitsu.com>
diff --git a/arch/ia64/include/uapi/asm/fcntl.h b/arch/ia64/include/uapi/asm/fcntl.h
index 1dd275dc8f65..7b485876cad4 100644
--- a/arch/ia64/include/uapi/asm/fcntl.h
+++ b/arch/ia64/include/uapi/asm/fcntl.h
@@ -8,6 +8,7 @@
 #define force_o_largefile() \
 (personality(current->personality) != PER_LINUX32)
 
+#include <linux/personality.h>
 #include <asm-generic/fcntl.h>
 
 #endif /* _ASM_IA64_FCNTL_H */
diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 375cffcf7dbd..91d219381306 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -89,7 +89,7 @@ static inline unsigned long get_softint(void)
 return retval;
 }
 
-void arch_trigger_all_cpu_backtrace(void);
+void arch_trigger_all_cpu_backtrace(bool);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 
 extern void *hardirq_stack[NR_CPUS];
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index b2988f25e230..027e09986194 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp)
 }
 }
 
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
 {
 struct thread_info *tp = current_thread_info();
 struct pt_regs *regs = get_irq_regs();
@@ -251,16 +251,22 @@ void arch_trigger_all_cpu_backtrace(void)
 
 spin_lock_irqsave(&global_cpu_snapshot_lock, flags);
 
-memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot));
-
 this_cpu = raw_smp_processor_id();
 
-__global_reg_self(tp, regs, this_cpu);
+memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot));
+
+if (include_self)
+__global_reg_self(tp, regs, this_cpu);
 
 smp_fetch_global_regs();
 
 for_each_online_cpu(cpu) {
-struct global_reg_snapshot *gp = &global_cpu_snapshot[cpu].reg;
+struct global_reg_snapshot *gp;
+
+if (!include_self && cpu == this_cpu)
+continue;
+
+gp = &global_cpu_snapshot[cpu].reg;
 
 __global_reg_poll(gp);
 
@@ -292,7 +298,7 @@ void arch_trigger_all_cpu_backtrace(void)
 
 static void sysrq_handle_globreg(int key)
 {
-arch_trigger_all_cpu_backtrace();
+arch_trigger_all_cpu_backtrace(true);
 }
 
 static struct sysrq_key_op sparc_globalreg_op = {
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index cb6cfcd034cf..a80cbb88ea91 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
 extern void init_ISA_irqs(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(void);
+void arch_trigger_all_cpu_backtrace(bool);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 #endif
 
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index c3fcb5de5083..6a1e71bde323 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 /* "in progress" flag of arch_trigger_all_cpu_backtrace */
 static unsigned long backtrace_flag;
 
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
 {
 int i;
+int cpu = get_cpu();
 
-if (test_and_set_bit(0, &backtrace_flag))
+if (test_and_set_bit(0, &backtrace_flag)) {
 /*
  * If there is already a trigger_all_cpu_backtrace() in progress
  * (backtrace_flag == 1), don't output double cpu dump infos.
  */
+put_cpu();
 return;
+}
 
 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+if (!include_self)
+cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 
-printk(KERN_INFO "sending NMI to all CPUs:\n");
-apic->send_IPI_all(NMI_VECTOR);
+if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+pr_info("sending NMI to %s CPUs:\n",
+(include_self ? "all" : "other"));
+apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+}
 
 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
 for (i = 0; i < 10 * 1000; i++) {
 if (cpumask_empty(to_cpumask(backtrace_mask)))
 break;
 mdelay(1);
+touch_softlockup_watchdog();
 }
 
 clear_bit(0, &backtrace_flag);
 smp_mb__after_atomic();
+put_cpu();
 }
 
 static int
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index 83969f8c5727..6467c919c509 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -176,14 +176,24 @@ static int __init cma_activate_area(struct cma *cma)
 base_pfn = pfn;
 for (j = pageblock_nr_pages; j; --j, pfn++) {
 WARN_ON_ONCE(!pfn_valid(pfn));
+/*
+ * alloc_contig_range requires the pfn range
+ * specified to be in the same zone. Make this
+ * simple by forcing the entire CMA resv range
+ * to be in the same zone.
+ */
 if (page_zone(pfn_to_page(pfn)) != zone)
-return -EINVAL;
+goto err;
 }
 init_cma_reserved_pageblock(pfn_to_page(base_pfn));
 } while (--i);
 
 mutex_init(&cma->lock);
 return 0;
+
+err:
+kfree(cma->bitmap);
+return -EINVAL;
 }
 
 static struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c
index 2a635b6fdaf7..c880ba685754 100644
--- a/drivers/memstick/host/rtsx_pci_ms.c
+++ b/drivers/memstick/host/rtsx_pci_ms.c
@@ -601,6 +601,7 @@ static int rtsx_pci_ms_drv_remove(struct platform_device *pdev)
 pcr->slots[RTSX_MS_CARD].card_event = NULL;
 msh = host->msh;
 host->eject = true;
+cancel_work_sync(&host->handle_req);
 
 mutex_lock(&host->host_mutex);
 if (host->req) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index a106b3f2b22a..fae17c640df3 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -331,6 +331,7 @@ struct dlm_lock_resource
 u16 state;
 char lvb[DLM_LVB_LEN];
 unsigned int inflight_locks;
+unsigned int inflight_assert_workers;
 unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 };
 
@@ -910,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 struct dlm_lock_resource *res);
 
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res);
+
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3087a21d32f9..82abf0cc9a12 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 atomic_set(&res->asts_reserved, 0);
 res->migration_pending = 0;
 res->inflight_locks = 0;
+res->inflight_assert_workers = 0;
 
 res->dlm = dlm;
 
@@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 wake_up(&res->wq);
 }
 
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res)
+{
+assert_spin_locked(&res->spinlock);
+res->inflight_assert_workers++;
+mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+dlm->name, res->lockname.len, res->lockname.name,
+res->inflight_assert_workers);
+}
+
+static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res)
+{
+spin_lock(&res->spinlock);
+__dlm_lockres_grab_inflight_worker(dlm, res);
+spin_unlock(&res->spinlock);
+}
+
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res)
+{
+assert_spin_locked(&res->spinlock);
+BUG_ON(res->inflight_assert_workers == 0);
+res->inflight_assert_workers--;
+mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+dlm->name, res->lockname.len, res->lockname.name,
+res->inflight_assert_workers);
+}
+
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+struct dlm_lock_resource *res)
+{
+spin_lock(&res->spinlock);
+__dlm_lockres_drop_inflight_worker(dlm, res);
+spin_unlock(&res->spinlock);
+}
+
 /*
  * lookup a lock resource by name.
  * may already exist in the hashtable.
@@ -1603,7 +1641,8 @@ send_response:
 mlog(ML_ERROR, "failed to dispatch assert master work\n");
 response = DLM_MASTER_RESP_ERROR;
 dlm_lockres_put(res);
-}
+} else
+dlm_lockres_grab_inflight_worker(dlm, res);
 } else {
 if (res)
 dlm_lockres_put(res);
@@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
 dlm_lockres_release_ast(dlm, res);
 
 put:
+dlm_lockres_drop_inflight_worker(dlm, res);
+
 dlm_lockres_put(res);
 
 mlog(0, "finished with dlm_assert_master_worker\n");
@@ -3088,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 /* remove it so that only one mle will be found */
 __dlm_unlink_mle(dlm, tmp);
 __dlm_mle_detach_hb_events(dlm, tmp);
-ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
-mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
-"telling master to get ref for cleared out mle "
-"during migration\n", dlm->name, namelen, name,
-master, new_master);
+if (tmp->type == DLM_MLE_MASTER) {
+ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+"telling master to get ref "
+"for cleared out mle during "
+"migration\n", dlm->name,
+namelen, name, master,
+new_master);
+}
 }
 spin_unlock(&tmp->spinlock);
 }
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 5de019437ea5..45067faf5695 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 mlog_errno(-ENOMEM);
 /* retry!? */
 BUG();
-}
+} else
+__dlm_lockres_grab_inflight_worker(dlm, res);
 } else /* put.. incase we are not the master */
 dlm_lockres_put(res);
 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 9db869de829d..69aac6f088ad 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
  * refs on it. */
 unused = __dlm_lockres_unused(lockres);
 if (!unused ||
-(lockres->state & DLM_LOCK_RES_MIGRATING)) {
+(lockres->state & DLM_LOCK_RES_MIGRATING) ||
+(lockres->inflight_assert_workers != 0)) {
 mlog(0, "%s: res %.*s is in use or being remastered, "
-"used %d, state %d\n", dlm->name,
-lockres->lockname.len, lockres->lockname.name,
-!unused, lockres->state);
-list_move_tail(&dlm->purge_list, &lockres->purge);
+"used %d, state %d, assert master workers %u\n",
+dlm->name, lockres->lockname.len,
+lockres->lockname.name,
+!unused, lockres->state,
+lockres->inflight_assert_workers);
+list_move_tail(&lockres->purge, &dlm->purge_list);
 spin_unlock(&lockres->spinlock);
 continue;
 }
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 5698b52cf5c9..2e3c9dbab68c 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
 } else if (status == DLM_RECOVERING ||
 status == DLM_MIGRATING ||
-status == DLM_FORWARD) {
+status == DLM_FORWARD ||
+status == DLM_NOLOCKMGR
+) {
 /* must clear the actions because this unlock
  * is about to be retried. cannot free or do
  * any list manipulation. */
@@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 res->lockname.name,
 status==DLM_RECOVERING?"recovering":
 (status==DLM_MIGRATING?"migrating":
-"forward"));
+(status == DLM_FORWARD ? "forward" :
+"nolockmanager")));
 actions = 0;
 }
 if (flags & LKM_CANCEL)
@@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
  * updated state to the recovery master. this thread
  * just needs to finish out the operation and call
  * the unlockast. */
-ret = DLM_NORMAL;
+if (dlm_is_node_dead(dlm, owner))
+ret = DLM_NORMAL;
+else
+ret = DLM_NOLOCKMGR;
 } else {
 /* something bad. this will BUG in ocfs2 */
 ret = dlm_err_to_dlm_status(tmpret);
@@ -638,7 +644,9 @@ retry:
 
 if (status == DLM_RECOVERING ||
 status == DLM_MIGRATING ||
-status == DLM_FORWARD) {
+status == DLM_FORWARD ||
+status == DLM_NOLOCKMGR) {
+
 /* We want to go away for a tiny bit to allow recovery
  * / migration to complete on this resource. I don't
  * know of any wait queue we could sleep on as this
@@ -650,7 +658,7 @@ retry:
 msleep(50);
 
 mlog(0, "retrying unlock due to pending recovery/"
-"migration/in-progress\n");
+"migration/in-progress/reconnect\n");
 goto retry;
 }
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2060fc398445..8add6f1030d7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
 return inode;
 }
 
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+struct dentry *dentry, struct inode *inode)
+{
+struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+ocfs2_lock_res_free(&dl->dl_lockres);
+BUG_ON(dl->dl_count != 1);
+spin_lock(&dentry_attach_lock);
+dentry->d_fsdata = NULL;
+spin_unlock(&dentry_attach_lock);
+kfree(dl);
+iput(inode);
+}
+
 static int ocfs2_mknod(struct inode *dir,
 struct dentry *dentry,
 umode_t mode,
@@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir,
 sigset_t oldset;
 int did_block_signals = 0;
 struct posix_acl *default_acl = NULL, *acl = NULL;
+struct ocfs2_dentry_lock *dl = NULL;
 
 trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
 (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir,
 goto leave;
 }
 
+dl = dentry->d_fsdata;
+
 status = ocfs2_add_entry(handle, dentry, inode,
 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
 &lookup);
@@ -469,6 +487,9 @@ leave:
  * ocfs2_delete_inode will mutex_lock again.
  */
 if ((status < 0) && inode) {
+if (dl)
+ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
 clear_nlink(inode);
 iput(inode);
@@ -991,6 +1012,65 @@ leave:
 return status;
 }
 
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+u64 src_inode_no, u64 dest_inode_no)
+{
+int ret = 0, i = 0;
+u64 parent_inode_no = 0;
+u64 child_inode_no = src_inode_no;
+struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+while (1) {
+child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+if (IS_ERR(child_inode)) {
+ret = PTR_ERR(child_inode);
+break;
+}
+
+ret = ocfs2_inode_lock(child_inode, NULL, 0);
+if (ret < 0) {
+iput(child_inode);
+if (ret != -ENOENT)
+mlog_errno(ret);
+break;
+}
+
+ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+&parent_inode_no);
+ocfs2_inode_unlock(child_inode, 0);
+iput(child_inode);
+if (ret < 0) {
+ret = -ENOENT;
+break;
+}
+
+if (parent_inode_no == dest_inode_no) {
+ret = 1;
+break;
+}
+
+if (parent_inode_no == osb->root_inode->i_ino) {
+ret = 0;
+break;
+}
+
+child_inode_no = parent_inode_no;
+
+if (++i >= MAX_LOOKUP_TIMES) {
+mlog(ML_NOTICE, "max lookup times reached, filesystem "
+"may have nested directories, "
+"src inode: %llu, dest inode: %llu.\n",
+(unsigned long long)src_inode_no,
+(unsigned long long)dest_inode_no);
+ret = 0;
+break;
+}
+}
+
+return ret;
+}
+
 /*
  * The only place this should be used is rename!
  * if they have the same id, then the 1st one is the only one locked.
@@ -1002,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 struct inode *inode2)
 {
 int status;
+int inode1_is_ancestor, inode2_is_ancestor;
 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
 struct buffer_head **tmpbh;
@@ -1015,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 if (*bh2)
 *bh2 = NULL;
 
-/* we always want to lock the one with the lower lockid first. */
+/* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
 if (oi1->ip_blkno != oi2->ip_blkno) {
-if (oi1->ip_blkno < oi2->ip_blkno) {
+inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+oi1->ip_blkno);
+if (inode1_is_ancestor < 0) {
+status = inode1_is_ancestor;
+goto bail;
+}
+
+inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+oi2->ip_blkno);
+if (inode2_is_ancestor < 0) {
+status = inode2_is_ancestor;
+goto bail;
+}
+
+if ((inode1_is_ancestor == 1) ||
+(oi1->ip_blkno < oi2->ip_blkno &&
+inode2_is_ancestor == 0)) {
 /* switch id1 and id2 around */
 tmpbh = bh2;
 bh2 = bh1;
@@ -1098,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir,
 struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 struct ocfs2_dir_lookup_result target_insert = { NULL, };
+bool should_add_orphan = false;
 
 /* At some point it might be nice to break this function up a
  * bit. */
@@ -1134,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir,
 goto bail;
 }
 rename_lock = 1;
+
+/* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+old_inode->i_ino);
+if (status < 0) {
+mlog_errno(status);
+goto bail;
+} else if (status == 1) {
+status = -EPERM;
+trace_ocfs2_rename_not_permitted(
+(unsigned long long)old_inode->i_ino,
+(unsigned long long)new_dir->i_ino);
+goto bail;
+}
 }
 
 /* if old and new are the same, this'll just do one lock. */
@@ -1304,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
 mlog_errno(status);
 goto bail;
 }
+should_add_orphan = true;
 }
 } else {
 BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1348,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir,
 goto bail;
 }
 
-if (S_ISDIR(new_inode->i_mode) ||
-(ocfs2_read_links_count(newfe) == 1)) {
-status = ocfs2_orphan_add(osb, handle, new_inode,
-newfe_bh, orphan_name,
-&orphan_insert, orphan_dir);
-if (status < 0) {
-mlog_errno(status);
-goto bail;
-}
-}
-
 /* change the dirent to point to the correct inode */
 status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
 old_inode);
@@ -1373,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir,
 else
 ocfs2_add_links_count(newfe, -1);
 ocfs2_journal_dirty(handle, newfe_bh);
+if (should_add_orphan) {
+status = ocfs2_orphan_add(osb, handle, new_inode,
+newfe_bh, orphan_name,
+&orphan_insert, orphan_dir);
+if (status < 0) {
+mlog_errno(status);
+goto bail;
+}
+}
 } else {
 /* if the name was not found in new_dir, add it now */
 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1642,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir,
 struct ocfs2_dir_lookup_result lookup = { NULL, };
 sigset_t oldset;
 int did_block_signals = 0;
+struct ocfs2_dentry_lock *dl = NULL;
 
 trace_ocfs2_symlink_begin(dir, dentry, symname,
 dentry->d_name.len, dentry->d_name.name);
@@ -1830,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir,
 goto bail;
 }
 
+dl = dentry->d_fsdata;
+
 status = ocfs2_add_entry(handle, dentry, inode,
 le64_to_cpu(fe->i_blkno), parent_fe_bh,
 &lookup);
@@ -1864,6 +1980,9 @@ bail:
 if (xattr_ac)
 ocfs2_free_alloc_context(xattr_ac);
 if ((status < 0) && inode) {
+if (dl)
+ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
 clear_nlink(inode);
 iput(inode);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 1b60c62aa9d6..6cb019b7c6a8 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename,
 __entry->new_len, __get_str(new_name))
 );
 
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
+
 TRACE_EVENT(ocfs2_rename_target_exists,
 TP_PROTO(int new_len, const char *new_name),
 TP_ARGS(new_len, new_name),
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 714e53b9cc66..636aab69ead5 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4288,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 goto out;
 }
 
+error = ocfs2_rw_lock(inode, 1);
+if (error) {
+mlog_errno(error);
+goto out;
+}
+
 error = ocfs2_inode_lock(inode, &old_bh, 1);
 if (error) {
 mlog_errno(error);
+ocfs2_rw_unlock(inode, 1);
 goto out;
 }
 
@@ -4302,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 up_write(&OCFS2_I(inode)->ip_xattr_sem);
 
 ocfs2_inode_unlock(inode, 1);
+ocfs2_rw_unlock(inode, 1);
 brelse(old_bh);
 
 if (error) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c7a89cea5c5d..ddb662b32447 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 ocfs2_shutdown_local_alloc(osb);
 
+ocfs2_truncate_log_shutdown(osb);
+
 /* This will disable recovery and flush any recovery work. */
 ocfs2_recovery_exit(osb);
 
-/*
- * During dismount, when it recovers another node it will call
- * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq.
- */
-ocfs2_truncate_log_shutdown(osb);
-
 ocfs2_journal_shutdown(osb);
 
 ocfs2_sync_blockdev(sb);
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 6a45fb583ff1..447775ee2c4b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -32,15 +32,24 @@ static inline void touch_nmi_watchdog(void)
 #ifdef arch_trigger_all_cpu_backtrace
 static inline bool trigger_all_cpu_backtrace(void)
 {
-arch_trigger_all_cpu_backtrace();
+arch_trigger_all_cpu_backtrace(true);
 
 return true;
 }
+static inline bool trigger_allbutself_cpu_backtrace(void)
+{
+arch_trigger_all_cpu_backtrace(false);
+return true;
+}
 #else
 static inline bool trigger_all_cpu_backtrace(void)
 {
 return false;
 }
+static inline bool trigger_allbutself_cpu_backtrace(void)
+{
+return false;
+}
 #endif
 
 #ifdef CONFIG_LOCKUP_DETECTOR
@@ -48,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
+extern int sysctl_softlockup_all_cpu_backtrace;
 struct ctl_table;
 extern int proc_dowatchdog(struct ctl_table *, int ,
 void __user *, size_t *, loff_t *);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3c545b48aeab..8304959ad336 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -360,6 +360,9 @@ static inline void ClearPageCompound(struct page *page)
 ClearPageHead(page);
 }
 #endif
+
+#define PG_head_mask ((1L << PG_head))
+
 #else
 /*
  * Reduce page flag use as much as possible by overlapping
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6748688813d0..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 #ifdef CONFIG_MEMORY_FAILURE
 VMCOREINFO_NUMBER(PG_hwpoison);
 #endif
+VMCOREINFO_NUMBER(PG_head_mask);
 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
 
 arch_crash_save_vmcoreinfo();
diff --git a/kernel/smp.c b/kernel/smp.c
index 306f8180b0d5..80c33f8de14f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
+static void flush_smp_call_function_queue(bool warn_cpu_offline);
+
 static int
 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 #ifdef CONFIG_HOTPLUG_CPU
 case CPU_UP_CANCELED:
 case CPU_UP_CANCELED_FROZEN:
+/* Fall-through to the CPU_DEAD[_FROZEN] case. */
 
 case CPU_DEAD:
 case CPU_DEAD_FROZEN:
 free_cpumask_var(cfd->cpumask);
 free_percpu(cfd->csd);
 break;
+
+case CPU_DYING:
+case CPU_DYING_FROZEN:
+/*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+flush_smp_call_function_queue(false);
+break;
 #endif
 };
 
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
 return 0;
 }
 
-/*
- * Invoked by arch to handle an IPI for call function single. Must be
- * called from the arch with interrupts disabled.
+/**
+ * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
+ *
+ * Invoked by arch to handle an IPI for call function single.
+ * Must be called with interrupts disabled.
  */
 void generic_smp_call_function_single_interrupt(void)
 {
+flush_smp_call_function_queue(true);
+}
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *
+ * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
+ * offline CPU. Skip this check if set to 'false'.
+ *
+ * Flush any pending smp-call-function callbacks queued on this CPU. This is
+ * invoked by the generic IPI handler, as well as by a CPU about to go offline,
+ * to ensure that all pending IPI callbacks are run before it goes completely
+ * offline.
+ *
+ * Loop through the call_single_queue and run all the queued callbacks.
+ * Must be called with interrupts disabled.
+ */
+static void flush_smp_call_function_queue(bool warn_cpu_offline)
+{
+struct llist_head *head;
 struct llist_node *entry;
 struct call_single_data *csd, *csd_next;
 static bool warned;
 
-entry = llist_del_all(&__get_cpu_var(call_single_queue));
+WARN_ON(!irqs_disabled());
+
+head = &__get_cpu_var(call_single_queue);
+entry = llist_del_all(head);
 entry = llist_reverse_order(entry);
 
-/*
- * Shouldn't receive this interrupt on a cpu that is not yet online.
- */
-if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
+/* There shouldn't be any pending callbacks on an offline CPU. */
+if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
+!warned && !llist_empty(head))) {
 warned = true;
 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7de6555cfea0..75b22e22a72c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
@@ -861,6 +860,17 @@ static struct ctl_table kern_table[] = {
 .extra1 = &zero,
 .extra2 = &one,
 },
+#ifdef CONFIG_SMP
+{
+.procname = "softlockup_all_cpu_backtrace",
+.data = &sysctl_softlockup_all_cpu_backtrace,
+.maxlen = sizeof(int),
+.mode = 0644,
+.proc_handler = proc_dointvec_minmax,
+.extra1 = &zero,
+.extra2 = &one,
+},
+#endif /* CONFIG_SMP */
 {
 .procname = "nmi_watchdog",
 .data = &watchdog_user_enabled,
@@ -1317,7 +1327,7 @@ static struct ctl_table vm_table[] = {
 .maxlen = sizeof(percpu_pagelist_fraction),
 .mode = 0644,
 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
-.extra1 = &min_percpu_pagelist_fract,
+.extra1 = &zero,
 },
 #ifdef CONFIG_MMU
 {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..c3319bd1b040 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
 
 int watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#endif
+
 static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
 
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
+static unsigned long soft_lockup_nmi_warn;
 
 /* boot commands */
 /*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
 }
 __setup("nosoftlockup", nosoftlockup_setup);
 /* */
+#ifdef CONFIG_SMP
+static int __init softlockup_all_cpu_backtrace_setup(char *str)
+{
+sysctl_softlockup_all_cpu_backtrace =
+!!simple_strtol(str, NULL, 0);
+return 1;
+}
+__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#endif
 
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
 struct pt_regs *regs = get_irq_regs();
 int duration;
+int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
 /* kick the hardlockup detector */
 watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 if (__this_cpu_read(soft_watchdog_warn) == true)
 return HRTIMER_RESTART;
 
+if (softlockup_all_cpu_backtrace) {
+/* Prevent multiple soft-lockup reports if one cpu is already
+ * engaged in dumping cpu back traces
+ */
+if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
+/* Someone else will report us. Let's give up */
+__this_cpu_write(soft_watchdog_warn, true);
+return HRTIMER_RESTART;
+}
+}
+
 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 smp_processor_id(), duration,
 current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 else
 dump_stack();
 
+if (softlockup_all_cpu_backtrace) {
+/* Avoid generating two back traces for current
+ * given that one is already made above
+ */
+trigger_allbutself_cpu_backtrace();
+
+clear_bit(0, &soft_lockup_nmi_warn);
+/* Barrier to sync with other cpus */
+smp_mb__after_atomic();
+}
+
 if (softlockup_panic)
 panic("softlockup: hung tasks");
 __this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
 int cpu;
 
 get_online_cpus();
-preempt_disable();
 for_each_online_cpu(cpu)
 update_timers(cpu);
-preempt_enable();
 put_online_cpus();
 }
 
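
The soft_lockup_nmi_warn handling above elects a single reporting CPU with test_and_set_bit() and releases it with clear_bit() once the backtraces are out; a reduced userspace illustration of that election pattern (names are illustrative, using C11 atomics instead of the kernel's bitops):

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag reporting = ATOMIC_FLAG_INIT;

/* Only the first caller that wins the flag does the expensive reporting;
 * later callers back off until the flag is cleared. */
static void report_once(int cpu)
{
	if (atomic_flag_test_and_set(&reporting)) {
		/* someone else is already dumping backtraces */
		return;
	}
	printf("cpu %d: dumping backtraces\n", cpu);
	atomic_flag_clear(&reporting);
}

int main(void)
{
	report_once(0);
	report_once(1);
	return 0;
}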
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7cfcc1b8e101..7a638aa3545b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -930,7 +930,7 @@ config LOCKDEP
 bool
 depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
 select STACKTRACE
-select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC
+select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE
 select KALLSYMS
 select KALLSYMS_ALL
 
@@ -1408,7 +1408,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
 depends on !X86_64
 select STACKTRACE
-select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
+select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE
 help
 Provide stacktrace filter for fault-injection capabilities
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e60837dc785c..33514d88fef9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -941,6 +941,37 @@ unlock:
 spin_unlock(ptl);
 }
 
+/*
+ * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
+ * during copy_user_huge_page()'s copy_page_rep(): in the case when
+ * the source page gets split and a tail freed before copy completes.
+ * Called under pmd_lock of checked pmd, so safe from splitting itself.
+ */
+static void get_user_huge_page(struct page *page)
+{
+if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+struct page *endpage = page + HPAGE_PMD_NR;
+
+atomic_add(HPAGE_PMD_NR, &page->_count);
+while (++page < endpage)
+get_huge_page_tail(page);
+} else {
+get_page(page);
+}
+}
+
+static void put_user_huge_page(struct page *page)
+{
+if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+struct page *endpage = page + HPAGE_PMD_NR;
+
+while (page < endpage)
+put_page(page++);
+} else {
+put_page(page);
+}
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 struct vm_area_struct *vma,
 unsigned long address,
@@ -1074,7 +1105,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 ret |= VM_FAULT_WRITE;
 goto out_unlock;
 }
-get_page(page);
+get_user_huge_page(page);
 spin_unlock(ptl);
 alloc:
 if (transparent_hugepage_enabled(vma) &&
@@ -1095,7 +1126,7 @@ alloc:
 split_huge_page(page);
 ret |= VM_FAULT_FALLBACK;
 }
-put_page(page);
+put_user_huge_page(page);
 }
 count_vm_event(THP_FAULT_FALLBACK);
 goto out;
@@ -1105,7 +1136,7 @@ alloc:
 put_page(new_page);
 if (page) {
 split_huge_page(page);
-put_page(page);
+put_user_huge_page(page);
 } else
 split_huge_page_pmd(vma, address, pmd);
 ret |= VM_FAULT_FALLBACK;
@@ -1127,7 +1158,7 @@ alloc:
 
 spin_lock(ptl);
 if (page)
-put_page(page);
+put_user_huge_page(page);
 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 spin_unlock(ptl);
 mem_cgroup_uncharge_page(new_page);
@@ -2392,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 pmd = mm_find_pmd(mm, address);
 if (!pmd)
 goto out;
-if (pmd_trans_huge(*pmd))
-goto out;
 
 anon_vma_lock_write(vma->anon_vma);
 
@@ -2492,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 pmd = mm_find_pmd(mm, address);
 if (!pmd)
 goto out;
-if (pmd_trans_huge(*pmd))
-goto out;
 
 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2846,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2846static void split_huge_page_address(struct mm_struct *mm, 2873static void split_huge_page_address(struct mm_struct *mm,
2847 unsigned long address) 2874 unsigned long address)
2848{ 2875{
2876 pgd_t *pgd;
2877 pud_t *pud;
2849 pmd_t *pmd; 2878 pmd_t *pmd;
2850 2879
2851 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2880 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2852 2881
2853 pmd = mm_find_pmd(mm, address); 2882 pgd = pgd_offset(mm, address);
2854 if (!pmd) 2883 if (!pgd_present(*pgd))
2884 return;
2885
2886 pud = pud_offset(pgd, address);
2887 if (!pud_present(*pud))
2888 return;
2889
2890 pmd = pmd_offset(pud, address);
2891 if (!pmd_present(*pmd))
2855 return; 2892 return;
2856 /* 2893 /*
2857 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2894 * Caller holds the mmap_sem write mode, so a huge pmd cannot
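split_huge_page_address() above now open-codes the pgd -> pud -> pmd walk, returning early at the first level that is not present, instead of calling mm_find_pmd() (which, per the mm/rmap.c hunk further down, now refuses to return a transparent-huge pmd that this function still needs to see). A toy sketch of that walk-and-bail shape, using a hypothetical three-level lookup table in userspace rather than real page tables:

#include <stdio.h>

/* Toy three-level translation structure; the names echo pgd/pud/pmd but
 * none of this is kernel code, only an illustration of the lookup shape. */
#define ENTRIES 8

struct toy_pmd { long value[ENTRIES]; int present[ENTRIES]; };
struct toy_pud { struct toy_pmd *pmd[ENTRIES]; };
struct toy_pgd { struct toy_pud *pud[ENTRIES]; };

/* Return a pointer to the bottom-level slot for addr, or NULL if any
 * intermediate level is unpopulated -- the same early-return pattern the
 * patched split_huge_page_address() uses. */
static long *toy_find_slot(struct toy_pgd *pgd, unsigned int addr)
{
	unsigned int i = (addr >> 6) & (ENTRIES - 1);
	unsigned int j = (addr >> 3) & (ENTRIES - 1);
	unsigned int k = addr & (ENTRIES - 1);
	struct toy_pud *pud;
	struct toy_pmd *pmd;

	pud = pgd->pud[i];
	if (!pud)
		return NULL;

	pmd = pud->pmd[j];
	if (!pmd)
		return NULL;

	if (!pmd->present[k])
		return NULL;

	return &pmd->value[k];
}

int main(void)
{
	struct toy_pgd pgd = { { 0 } };
	struct toy_pud pud = { { 0 } };
	struct toy_pmd pmd = { { 0 }, { 0 } };

	pgd.pud[1] = &pud;
	pud.pmd[2] = &pmd;
	pmd.present[3] = 1;
	pmd.value[3] = 42;

	/* addr 0b001_010_011: level indices 1, 2, 3 */
	long *slot = toy_find_slot(&pgd, (1u << 6) | (2u << 3) | 3u);
	printf("%ld\n", slot ? *slot : -1L);

	/* unmapped address: every lookup level short-circuits to NULL */
	printf("%p\n", (void *)toy_find_slot(&pgd, 0));
	return 0;
}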
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 226910cb7c9b..2024bbd573d2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2520,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2520 update_mmu_cache(vma, address, ptep); 2520 update_mmu_cache(vma, address, ptep);
2521} 2521}
2522 2522
2523static int is_hugetlb_entry_migration(pte_t pte)
2524{
2525 swp_entry_t swp;
2526
2527 if (huge_pte_none(pte) || pte_present(pte))
2528 return 0;
2529 swp = pte_to_swp_entry(pte);
2530 if (non_swap_entry(swp) && is_migration_entry(swp))
2531 return 1;
2532 else
2533 return 0;
2534}
2535
2536static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2537{
2538 swp_entry_t swp;
2539
2540 if (huge_pte_none(pte) || pte_present(pte))
2541 return 0;
2542 swp = pte_to_swp_entry(pte);
2543 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2544 return 1;
2545 else
2546 return 0;
2547}
2523 2548
2524int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 2549int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2525 struct vm_area_struct *vma) 2550 struct vm_area_struct *vma)
@@ -2559,10 +2584,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2559 dst_ptl = huge_pte_lock(h, dst, dst_pte); 2584 dst_ptl = huge_pte_lock(h, dst, dst_pte);
2560 src_ptl = huge_pte_lockptr(h, src, src_pte); 2585 src_ptl = huge_pte_lockptr(h, src, src_pte);
2561 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 2586 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
2562 if (!huge_pte_none(huge_ptep_get(src_pte))) { 2587 entry = huge_ptep_get(src_pte);
2588 if (huge_pte_none(entry)) { /* skip none entry */
2589 ;
2590 } else if (unlikely(is_hugetlb_entry_migration(entry) ||
2591 is_hugetlb_entry_hwpoisoned(entry))) {
2592 swp_entry_t swp_entry = pte_to_swp_entry(entry);
2593
2594 if (is_write_migration_entry(swp_entry) && cow) {
2595 /*
2596 * COW mappings require pages in both
2597 * parent and child to be set to read.
2598 */
2599 make_migration_entry_read(&swp_entry);
2600 entry = swp_entry_to_pte(swp_entry);
2601 set_huge_pte_at(src, addr, src_pte, entry);
2602 }
2603 set_huge_pte_at(dst, addr, dst_pte, entry);
2604 } else {
2563 if (cow) 2605 if (cow)
2564 huge_ptep_set_wrprotect(src, addr, src_pte); 2606 huge_ptep_set_wrprotect(src, addr, src_pte);
2565 entry = huge_ptep_get(src_pte);
2566 ptepage = pte_page(entry); 2607 ptepage = pte_page(entry);
2567 get_page(ptepage); 2608 get_page(ptepage);
2568 page_dup_rmap(ptepage); 2609 page_dup_rmap(ptepage);
@@ -2578,32 +2619,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2578 return ret; 2619 return ret;
2579} 2620}
2580 2621
2581static int is_hugetlb_entry_migration(pte_t pte)
2582{
2583 swp_entry_t swp;
2584
2585 if (huge_pte_none(pte) || pte_present(pte))
2586 return 0;
2587 swp = pte_to_swp_entry(pte);
2588 if (non_swap_entry(swp) && is_migration_entry(swp))
2589 return 1;
2590 else
2591 return 0;
2592}
2593
2594static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2595{
2596 swp_entry_t swp;
2597
2598 if (huge_pte_none(pte) || pte_present(pte))
2599 return 0;
2600 swp = pte_to_swp_entry(pte);
2601 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2602 return 1;
2603 else
2604 return 0;
2605}
2606
2607void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 2622void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2608 unsigned long start, unsigned long end, 2623 unsigned long start, unsigned long end,
2609 struct page *ref_page) 2624 struct page *ref_page)
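The copy_hugetlb_page_range() rework distinguishes three cases per source entry: a none entry is skipped, a migration or hwpoison swap entry is copied as an entry (downgrading a write migration entry to read when the mapping is COW so both sides refault later), and a present page takes the old path of write-protecting under COW and sharing the page; the is_hugetlb_entry_migration()/is_hugetlb_entry_hwpoisoned() helpers are only moved up so the copy loop can use them. A toy userspace sketch of that classification, with a made-up tagged-struct encoding standing in for the kernel's pte and swap-entry bits:

#include <stdio.h>

/* Made-up entry encoding for illustration only: the kernel packs these
 * states into pte bits and swp_entry_t; here a tagged struct stands in. */
enum toy_kind { TOY_NONE, TOY_PRESENT, TOY_SWP };
enum toy_swp { SWP_MIGRATION_READ, SWP_MIGRATION_WRITE, SWP_HWPOISON };

struct toy_entry {
	enum toy_kind kind;
	enum toy_swp swp_type;    /* meaningful only when kind == TOY_SWP */
	long pfn;                 /* meaningful only when kind == TOY_PRESENT */
};

static int is_migration(struct toy_entry e)
{
	return e.kind == TOY_SWP &&
	       (e.swp_type == SWP_MIGRATION_READ ||
		e.swp_type == SWP_MIGRATION_WRITE);
}

static int is_hwpoisoned(struct toy_entry e)
{
	return e.kind == TOY_SWP && e.swp_type == SWP_HWPOISON;
}

/* Mirrors the decision structure of the new copy loop, not its locking. */
static void copy_one(struct toy_entry *dst, struct toy_entry *src, int cow)
{
	struct toy_entry entry = *src;

	if (entry.kind == TOY_NONE) {
		/* skip none entry */
	} else if (is_migration(entry) || is_hwpoisoned(entry)) {
		if (cow && entry.swp_type == SWP_MIGRATION_WRITE) {
			/* COW: both parent and child must refault after the
			 * migration finishes, so make the entry read-only */
			entry.swp_type = SWP_MIGRATION_READ;
			*src = entry;
		}
		*dst = entry;
	} else {
		/* present page: under COW the real code write-protects the
		 * parent (huge_ptep_set_wrprotect) before sharing the page */
		*dst = entry;
	}
}

int main(void)
{
	struct toy_entry src = { TOY_SWP, SWP_MIGRATION_WRITE, 0 };
	struct toy_entry dst = { TOY_NONE, SWP_MIGRATION_READ, 0 };

	copy_one(&dst, &src, 1);
	printf("src=%d dst=%d\n", src.swp_type, dst.swp_type);  /* both READ */
	return 0;
}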
diff --git a/mm/ksm.c b/mm/ksm.c
index 68710e80994a..346ddc9e4c0d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
945 pmd = mm_find_pmd(mm, addr); 945 pmd = mm_find_pmd(mm, addr);
946 if (!pmd) 946 if (!pmd)
947 goto out; 947 goto out;
948 BUG_ON(pmd_trans_huge(*pmd));
949 948
950 mmun_start = addr; 949 mmun_start = addr;
951 mmun_end = addr + PAGE_SIZE; 950 mmun_end = addr + PAGE_SIZE;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 284974230459..eb58de19f815 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -656,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
656 * @nodes and @flags,) it's isolated and queued to the pagelist which is 656 * @nodes and @flags,) it's isolated and queued to the pagelist which is
657 * passed via @private.) 657 * passed via @private.)
658 */ 658 */
659static struct vm_area_struct * 659static int
660queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 660queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
661 const nodemask_t *nodes, unsigned long flags, void *private) 661 const nodemask_t *nodes, unsigned long flags, void *private)
662{ 662{
663 int err; 663 int err = 0;
664 struct vm_area_struct *first, *vma, *prev; 664 struct vm_area_struct *vma, *prev;
665
666 665
667 first = find_vma(mm, start); 666 vma = find_vma(mm, start);
668 if (!first) 667 if (!vma)
669 return ERR_PTR(-EFAULT); 668 return -EFAULT;
670 prev = NULL; 669 prev = NULL;
671 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 670 for (; vma && vma->vm_start < end; vma = vma->vm_next) {
672 unsigned long endvma = vma->vm_end; 671 unsigned long endvma = vma->vm_end;
673 672
674 if (endvma > end) 673 if (endvma > end)
@@ -678,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
678 677
679 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 678 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
680 if (!vma->vm_next && vma->vm_end < end) 679 if (!vma->vm_next && vma->vm_end < end)
681 return ERR_PTR(-EFAULT); 680 return -EFAULT;
682 if (prev && prev->vm_end < vma->vm_start) 681 if (prev && prev->vm_end < vma->vm_start)
683 return ERR_PTR(-EFAULT); 682 return -EFAULT;
684 } 683 }
685 684
686 if (flags & MPOL_MF_LAZY) { 685 if (flags & MPOL_MF_LAZY) {
@@ -694,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
694 693
695 err = queue_pages_pgd_range(vma, start, endvma, nodes, 694 err = queue_pages_pgd_range(vma, start, endvma, nodes,
696 flags, private); 695 flags, private);
697 if (err) { 696 if (err)
698 first = ERR_PTR(err);
699 break; 697 break;
700 }
701 } 698 }
702next: 699next:
703 prev = vma; 700 prev = vma;
704 } 701 }
705 return first; 702 return err;
706} 703}
707 704
708/* 705/*
@@ -1156,16 +1153,17 @@ out:
1156 1153
1157/* 1154/*
1158 * Allocate a new page for page migration based on vma policy. 1155 * Allocate a new page for page migration based on vma policy.
1159 * Start assuming that page is mapped by vma pointed to by @private. 1156 * Start by assuming the page is mapped by the same vma as contains @start.
1160 * Search forward from there, if not. N.B., this assumes that the 1157 * Search forward from there, if not. N.B., this assumes that the
1161 * list of pages handed to migrate_pages()--which is how we get here-- 1158 * list of pages handed to migrate_pages()--which is how we get here--
1162 * is in virtual address order. 1159 * is in virtual address order.
1163 */ 1160 */
1164static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1161static struct page *new_page(struct page *page, unsigned long start, int **x)
1165{ 1162{
1166 struct vm_area_struct *vma = (struct vm_area_struct *)private; 1163 struct vm_area_struct *vma;
1167 unsigned long uninitialized_var(address); 1164 unsigned long uninitialized_var(address);
1168 1165
1166 vma = find_vma(current->mm, start);
1169 while (vma) { 1167 while (vma) {
1170 address = page_address_in_vma(page, vma); 1168 address = page_address_in_vma(page, vma);
1171 if (address != -EFAULT) 1169 if (address != -EFAULT)
@@ -1195,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1195 return -ENOSYS; 1193 return -ENOSYS;
1196} 1194}
1197 1195
1198static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1196static struct page *new_page(struct page *page, unsigned long start, int **x)
1199{ 1197{
1200 return NULL; 1198 return NULL;
1201} 1199}
@@ -1205,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len,
1205 unsigned short mode, unsigned short mode_flags, 1203 unsigned short mode, unsigned short mode_flags,
1206 nodemask_t *nmask, unsigned long flags) 1204 nodemask_t *nmask, unsigned long flags)
1207{ 1205{
1208 struct vm_area_struct *vma;
1209 struct mm_struct *mm = current->mm; 1206 struct mm_struct *mm = current->mm;
1210 struct mempolicy *new; 1207 struct mempolicy *new;
1211 unsigned long end; 1208 unsigned long end;
@@ -1271,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1271 if (err) 1268 if (err)
1272 goto mpol_out; 1269 goto mpol_out;
1273 1270
1274 vma = queue_pages_range(mm, start, end, nmask, 1271 err = queue_pages_range(mm, start, end, nmask,
1275 flags | MPOL_MF_INVERT, &pagelist); 1272 flags | MPOL_MF_INVERT, &pagelist);
1276 1273 if (!err)
1277 err = PTR_ERR(vma); /* maybe ... */
1278 if (!IS_ERR(vma))
1279 err = mbind_range(mm, start, end, new); 1274 err = mbind_range(mm, start, end, new);
1280 1275
1281 if (!err) { 1276 if (!err) {
@@ -1283,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1283 1278
1284 if (!list_empty(&pagelist)) { 1279 if (!list_empty(&pagelist)) {
1285 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1280 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1286 nr_failed = migrate_pages(&pagelist, new_vma_page, 1281 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1287 NULL, (unsigned long)vma, 1282 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1288 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1289 if (nr_failed) 1283 if (nr_failed)
1290 putback_movable_pages(&pagelist); 1284 putback_movable_pages(&pagelist);
1291 } 1285 }
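The mempolicy refactor converts queue_pages_range() from returning a vma pointer with errors encoded via ERR_PTR() into returning 0 or a negative errno, and new_page() (formerly new_vma_page()) now receives the original start address and calls find_vma() itself, so do_mbind() no longer needs the returned pointer at all. A small userspace sketch contrasting the two calling conventions; the ERR_PTR()/IS_ERR()/PTR_ERR() helpers below are simplified re-implementations for the demo, not the kernel headers:

#include <stdio.h>
#include <errno.h>

/* Simplified versions of the kernel's pointer-error helpers: a small range
 * of "pointer" values just below the top of the address space encodes an
 * errno.  Good enough for this demo. */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct vma { int id; };

static struct vma valid = { 1 };

/* Old convention: return a useful pointer, or an ERR_PTR-encoded errno. */
static struct vma *old_style(int fail)
{
	return fail ? ERR_PTR(-EFAULT) : &valid;
}

/* New convention: the caller does not need the pointer, so just return
 * 0 or a negative errno -- the shape of the queue_pages_range() change. */
static int new_style(int fail)
{
	return fail ? -EFAULT : 0;
}

int main(void)
{
	struct vma *v = old_style(1);
	int err;

	if (IS_ERR(v))
		printf("old style error: %ld\n", PTR_ERR(v));

	err = new_style(1);
	if (err)
		printf("new style error: %d\n", err);
	return 0;
}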
diff --git a/mm/migrate.c b/mm/migrate.c
index 63f0cd559999..9e0beaa91845 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 pmd = mm_find_pmd(mm, addr); 120 pmd = mm_find_pmd(mm, addr);
121 if (!pmd) 121 if (!pmd)
122 goto out; 122 goto out;
123 if (pmd_trans_huge(*pmd))
124 goto out;
125 123
126 ptep = pte_offset_map(pmd, addr); 124 ptep = pte_offset_map(pmd, addr);
127 125
diff --git a/mm/nommu.c b/mm/nommu.c
index b78e3a8f5ee7..4a852f6c5709 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -786,7 +786,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
786 for (i = 0; i < VMACACHE_SIZE; i++) { 786 for (i = 0; i < VMACACHE_SIZE; i++) {
787 /* if the vma is cached, invalidate the entire cache */ 787 /* if the vma is cached, invalidate the entire cache */
788 if (curr->vmacache[i] == vma) { 788 if (curr->vmacache[i] == vma) {
789 vmacache_invalidate(curr->mm); 789 vmacache_invalidate(mm);
790 break; 790 break;
791 } 791 }
792 } 792 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f59fa29eda8..20d17f8266fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
69 69
70/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71static DEFINE_MUTEX(pcp_batch_high_lock); 71static DEFINE_MUTEX(pcp_batch_high_lock);
72#define MIN_PERCPU_PAGELIST_FRACTION (8)
72 73
73#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
74DEFINE_PER_CPU(int, numa_node); 75DEFINE_PER_CPU(int, numa_node);
@@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
4145 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4146 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4146#endif 4147#endif
4147 4148
4148static int __meminit zone_batchsize(struct zone *zone) 4149static int zone_batchsize(struct zone *zone)
4149{ 4150{
4150#ifdef CONFIG_MMU 4151#ifdef CONFIG_MMU
4151 int batch; 4152 int batch;
@@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
4261 pageset_update(&p->pcp, high, batch); 4262 pageset_update(&p->pcp, high, batch);
4262} 4263}
4263 4264
4264static void __meminit pageset_set_high_and_batch(struct zone *zone, 4265static void pageset_set_high_and_batch(struct zone *zone,
4265 struct per_cpu_pageset *pcp) 4266 struct per_cpu_pageset *pcp)
4266{ 4267{
4267 if (percpu_pagelist_fraction) 4268 if (percpu_pagelist_fraction)
4268 pageset_set_high(pcp, 4269 pageset_set_high(pcp,
@@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
5881 void __user *buffer, size_t *length, loff_t *ppos) 5882 void __user *buffer, size_t *length, loff_t *ppos)
5882{ 5883{
5883 struct zone *zone; 5884 struct zone *zone;
5884 unsigned int cpu; 5885 int old_percpu_pagelist_fraction;
5885 int ret; 5886 int ret;
5886 5887
5888 mutex_lock(&pcp_batch_high_lock);
5889 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5890
5887 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5891 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5888 if (!write || (ret < 0)) 5892 if (!write || ret < 0)
5889 return ret; 5893 goto out;
5894
5895 /* Sanity checking to avoid pcp imbalance */
5896 if (percpu_pagelist_fraction &&
5897 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5898 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5899 ret = -EINVAL;
5900 goto out;
5901 }
5902
5903 /* No change? */
5904 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5905 goto out;
5890 5906
5891 mutex_lock(&pcp_batch_high_lock);
5892 for_each_populated_zone(zone) { 5907 for_each_populated_zone(zone) {
5893 unsigned long high; 5908 unsigned int cpu;
5894 high = zone->managed_pages / percpu_pagelist_fraction; 5909
5895 for_each_possible_cpu(cpu) 5910 for_each_possible_cpu(cpu)
5896 pageset_set_high(per_cpu_ptr(zone->pageset, cpu), 5911 pageset_set_high_and_batch(zone,
5897 high); 5912 per_cpu_ptr(zone->pageset, cpu));
5898 } 5913 }
5914out:
5899 mutex_unlock(&pcp_batch_high_lock); 5915 mutex_unlock(&pcp_batch_high_lock);
5900 return 0; 5916 return ret;
5901} 5917}
5902 5918
5903int hashdist = HASHDIST_DEFAULT; 5919int hashdist = HASHDIST_DEFAULT;
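The rewritten percpu_pagelist_fraction_sysctl_handler() holds pcp_batch_high_lock across the whole update, rejects nonzero values below MIN_PERCPU_PAGELIST_FRACTION (8) and restores the previous setting in that case, skips the per-zone work when the value did not change, recomputes both ->high and ->batch through pageset_set_high_and_batch(), and returns the real error code instead of always returning 0. A userspace sketch of that validate-or-restore shape, assuming stand-in names and a pthread mutex in place of the kernel mutex:

#include <stdio.h>
#include <errno.h>
#include <pthread.h>

#define MIN_FRACTION 8            /* mirrors MIN_PERCPU_PAGELIST_FRACTION */

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static int fraction;              /* 0 means "use the default heuristic" */
static long high;                 /* derived value recomputed on change */
static const long managed_pages = 1 << 18;

/* Hypothetical setter mirroring the patched handler: validate under the
 * lock, restore the old value and fail on bad input, and only recompute
 * derived state when the value actually changed. */
static int set_fraction(int new_fraction)
{
	int old_fraction, ret = 0;

	pthread_mutex_lock(&update_lock);
	old_fraction = fraction;
	fraction = new_fraction;          /* proc_dointvec() writes first... */

	/* ...so bad input must be rolled back, as the patch does */
	if (fraction && fraction < MIN_FRACTION) {
		fraction = old_fraction;
		ret = -EINVAL;
		goto out;
	}

	if (fraction == old_fraction)     /* no change? skip the heavy work */
		goto out;

	high = fraction ? managed_pages / fraction : 0;
out:
	pthread_mutex_unlock(&update_lock);
	return ret;
}

int main(void)
{
	printf("%d high=%ld\n", set_fraction(4), high);   /* -EINVAL, unchanged */
	printf("%d high=%ld\n", set_fraction(32), high);  /* 0, 8192 */
	printf("%d high=%ld\n", set_fraction(32), high);  /* 0, still 8192 */
	return 0;
}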
diff --git a/mm/rmap.c b/mm/rmap.c
index bf05fc872ae8..b7e94ebbd09e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -569,6 +569,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
569 pgd_t *pgd; 569 pgd_t *pgd;
570 pud_t *pud; 570 pud_t *pud;
571 pmd_t *pmd = NULL; 571 pmd_t *pmd = NULL;
572 pmd_t pmde;
572 573
573 pgd = pgd_offset(mm, address); 574 pgd = pgd_offset(mm, address);
574 if (!pgd_present(*pgd)) 575 if (!pgd_present(*pgd))
@@ -579,7 +580,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
579 goto out; 580 goto out;
580 581
581 pmd = pmd_offset(pud, address); 582 pmd = pmd_offset(pud, address);
582 if (!pmd_present(*pmd)) 583 /*
584 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
585 * without holding anon_vma lock for write. So when looking for a
586 * genuine pmde (in which to find pte), test present and !THP together.
587 */
588 pmde = ACCESS_ONCE(*pmd);
589 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
583 pmd = NULL; 590 pmd = NULL;
584out: 591out:
585 return pmd; 592 return pmd;
@@ -615,9 +622,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
615 if (!pmd) 622 if (!pmd)
616 return NULL; 623 return NULL;
617 624
618 if (pmd_trans_huge(*pmd))
619 return NULL;
620
621 pte = pte_offset_map(pmd, address); 625 pte = pte_offset_map(pmd, address);
622 /* Make a quick check before getting the lock */ 626 /* Make a quick check before getting the lock */
623 if (!sync && !pte_present(*pte)) { 627 if (!sync && !pte_present(*pte)) {
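mm_find_pmd() now snapshots the pmd with ACCESS_ONCE() and tests present and not-transparent-huge against that single read, since THP code may do pmdp_clear_flush()/set_pmd_at() without holding the anon_vma lock for write; that is also why the callers' own pmd_trans_huge(*pmd) checks (in the huge_memory.c, ksm.c, migrate.c and rmap.c hunks above) could be dropped. A minimal sketch of the snapshot-then-test pattern on a plain userspace word, using the classic volatile-cast ACCESS_ONCE() definition:

#include <stdio.h>

/* Classic definition: force exactly one read that the compiler cannot
 * merge with, or split from, other accesses to the same variable. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

#define ENTRY_PRESENT (1UL << 0)
#define ENTRY_HUGE    (1UL << 1)

static unsigned long slot = ENTRY_PRESENT;  /* updated by an imagined writer */

static unsigned long *find_entry(void)
{
	unsigned long snap;

	/*
	 * Read the shared word once into a local, then make every decision
	 * on that snapshot -- mirroring "pmde = ACCESS_ONCE(*pmd)" followed
	 * by the pmd_present(pmde) / pmd_trans_huge(pmde) tests.  Testing
	 * *pmd twice could observe two different values if a writer races.
	 */
	snap = ACCESS_ONCE(slot);
	if (!(snap & ENTRY_PRESENT) || (snap & ENTRY_HUGE))
		return NULL;

	return &slot;
}

int main(void)
{
	printf("%s\n", find_entry() ? "usable" : "skip");

	slot |= ENTRY_HUGE;     /* a concurrent writer would do this instead */
	printf("%s\n", find_entry() ? "usable" : "skip");
	return 0;
}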
diff --git a/mm/shmem.c b/mm/shmem.c
index f484c276e994..8f419cff9e34 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
80#define SHORT_SYMLINK_LEN 128 80#define SHORT_SYMLINK_LEN 128
81 81
82/* 82/*
83 * shmem_fallocate and shmem_writepage communicate via inode->i_private 83 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
84 * (with i_mutex making sure that it has only one user at a time): 84 * inode->i_private (with i_mutex making sure that it has only one user at
85 * we would prefer not to enlarge the shmem inode just for that. 85 * a time): we would prefer not to enlarge the shmem inode just for that.
86 */ 86 */
87struct shmem_falloc { 87struct shmem_falloc {
88 int mode; /* FALLOC_FL mode currently operating */
88 pgoff_t start; /* start of range currently being fallocated */ 89 pgoff_t start; /* start of range currently being fallocated */
89 pgoff_t next; /* the next page offset to be fallocated */ 90 pgoff_t next; /* the next page offset to be fallocated */
90 pgoff_t nr_falloced; /* how many new pages have been fallocated */ 91 pgoff_t nr_falloced; /* how many new pages have been fallocated */
@@ -759,6 +760,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
759 spin_lock(&inode->i_lock); 760 spin_lock(&inode->i_lock);
760 shmem_falloc = inode->i_private; 761 shmem_falloc = inode->i_private;
761 if (shmem_falloc && 762 if (shmem_falloc &&
763 !shmem_falloc->mode &&
762 index >= shmem_falloc->start && 764 index >= shmem_falloc->start &&
763 index < shmem_falloc->next) 765 index < shmem_falloc->next)
764 shmem_falloc->nr_unswapped++; 766 shmem_falloc->nr_unswapped++;
@@ -1233,6 +1235,44 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1233 int error; 1235 int error;
1234 int ret = VM_FAULT_LOCKED; 1236 int ret = VM_FAULT_LOCKED;
1235 1237
1238 /*
1239 * Trinity finds that probing a hole which tmpfs is punching can
1240 * prevent the hole-punch from ever completing: which in turn
1241 * locks writers out with its hold on i_mutex. So refrain from
1242 * faulting pages into the hole while it's being punched, and
1243 * wait on i_mutex to be released if vmf->flags permits.
1244 */
1245 if (unlikely(inode->i_private)) {
1246 struct shmem_falloc *shmem_falloc;
1247
1248 spin_lock(&inode->i_lock);
1249 shmem_falloc = inode->i_private;
1250 if (!shmem_falloc ||
1251 shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE ||
1252 vmf->pgoff < shmem_falloc->start ||
1253 vmf->pgoff >= shmem_falloc->next)
1254 shmem_falloc = NULL;
1255 spin_unlock(&inode->i_lock);
1256 /*
1257 * i_lock has protected us from taking shmem_falloc seriously
1258 * once return from shmem_fallocate() went back up that stack.
1259 * i_lock does not serialize with i_mutex at all, but it does
1260 * not matter if sometimes we wait unnecessarily, or sometimes
1261 * miss out on waiting: we just need to make those cases rare.
1262 */
1263 if (shmem_falloc) {
1264 if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
1265 !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
1266 up_read(&vma->vm_mm->mmap_sem);
1267 mutex_lock(&inode->i_mutex);
1268 mutex_unlock(&inode->i_mutex);
1269 return VM_FAULT_RETRY;
1270 }
1271 /* cond_resched? Leave that to GUP or return to user */
1272 return VM_FAULT_NOPAGE;
1273 }
1274 }
1275
1236 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1276 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1237 if (error) 1277 if (error)
1238 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1278 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1724,20 +1764,31 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1724 pgoff_t start, index, end; 1764 pgoff_t start, index, end;
1725 int error; 1765 int error;
1726 1766
1767 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1768 return -EOPNOTSUPP;
1769
1727 mutex_lock(&inode->i_mutex); 1770 mutex_lock(&inode->i_mutex);
1728 1771
1772 shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE;
1773
1729 if (mode & FALLOC_FL_PUNCH_HOLE) { 1774 if (mode & FALLOC_FL_PUNCH_HOLE) {
1730 struct address_space *mapping = file->f_mapping; 1775 struct address_space *mapping = file->f_mapping;
1731 loff_t unmap_start = round_up(offset, PAGE_SIZE); 1776 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1732 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 1777 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1733 1778
1779 shmem_falloc.start = unmap_start >> PAGE_SHIFT;
1780 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
1781 spin_lock(&inode->i_lock);
1782 inode->i_private = &shmem_falloc;
1783 spin_unlock(&inode->i_lock);
1784
1734 if ((u64)unmap_end > (u64)unmap_start) 1785 if ((u64)unmap_end > (u64)unmap_start)
1735 unmap_mapping_range(mapping, unmap_start, 1786 unmap_mapping_range(mapping, unmap_start,
1736 1 + unmap_end - unmap_start, 0); 1787 1 + unmap_end - unmap_start, 0);
1737 shmem_truncate_range(inode, offset, offset + len - 1); 1788 shmem_truncate_range(inode, offset, offset + len - 1);
1738 /* No need to unmap again: hole-punching leaves COWed pages */ 1789 /* No need to unmap again: hole-punching leaves COWed pages */
1739 error = 0; 1790 error = 0;
1740 goto out; 1791 goto undone;
1741 } 1792 }
1742 1793
1743 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 1794 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
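The shmem changes close a livelock found by Trinity: shmem_fallocate() now publishes the range being hole-punched through inode->i_private (under i_lock), and shmem_fault() consults it before instantiating pages, either backing off with VM_FAULT_NOPAGE or, when the fault flags allow waiting, dropping mmap_sem and cycling i_mutex so it returns VM_FAULT_RETRY only after the punch has finished. The fault-side decision can be captured as a small pure function; the flag and result names below mirror the kernel's for readability, but the values are local to this sketch:

#include <stdio.h>

/* Local stand-ins; values are arbitrary for the demo. */
#define FAULT_FLAG_ALLOW_RETRY  0x1
#define FAULT_FLAG_RETRY_NOWAIT 0x2

enum fault_result { PROCEED, NOPAGE, RETRY_AFTER_WAIT };

struct falloc_range { unsigned long start, next; };   /* page offsets */

/*
 * Mirror of the decision added to shmem_fault(): if a hole-punch covering
 * this offset is in flight, never instantiate the page; wait for the punch
 * (by cycling the inode mutex in the real code) only when the fault flags
 * say retrying is allowed and the caller is willing to wait.
 */
static enum fault_result check_hole_punch(const struct falloc_range *punch,
					  unsigned long pgoff,
					  unsigned int flags)
{
	if (!punch || pgoff < punch->start || pgoff >= punch->next)
		return PROCEED;

	if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
	    !(flags & FAULT_FLAG_RETRY_NOWAIT))
		return RETRY_AFTER_WAIT;   /* drop mmap_sem, wait on i_mutex */

	return NOPAGE;                     /* back off without a page */
}

int main(void)
{
	struct falloc_range punch = { 16, 64 };

	printf("%d\n", check_hole_punch(&punch, 8, FAULT_FLAG_ALLOW_RETRY));
	printf("%d\n", check_hole_punch(&punch, 20, FAULT_FLAG_ALLOW_RETRY));
	printf("%d\n", check_hole_punch(&punch, 20,
			FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT));
	printf("%d\n", check_hole_punch(NULL, 20, 0));
	return 0;
}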
diff --git a/mm/slab.c b/mm/slab.c
index 9ca3b87edabc..3070b929a1bf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
386 386
387#endif 387#endif
388 388
389#define OBJECT_FREE (0)
390#define OBJECT_ACTIVE (1)
391
392#ifdef CONFIG_DEBUG_SLAB_LEAK
393
394static void set_obj_status(struct page *page, int idx, int val)
395{
396 int freelist_size;
397 char *status;
398 struct kmem_cache *cachep = page->slab_cache;
399
400 freelist_size = cachep->num * sizeof(freelist_idx_t);
401 status = (char *)page->freelist + freelist_size;
402 status[idx] = val;
403}
404
405static inline unsigned int get_obj_status(struct page *page, int idx)
406{
407 int freelist_size;
408 char *status;
409 struct kmem_cache *cachep = page->slab_cache;
410
411 freelist_size = cachep->num * sizeof(freelist_idx_t);
412 status = (char *)page->freelist + freelist_size;
413
414 return status[idx];
415}
416
417#else
418static inline void set_obj_status(struct page *page, int idx, int val) {}
419
420#endif
421
389/* 422/*
390 * Do not go above this order unless 0 objects fit into the slab or 423 * Do not go above this order unless 0 objects fit into the slab or
391 * overridden on the command line. 424 * overridden on the command line.
@@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
576 return cachep->array[smp_processor_id()]; 609 return cachep->array[smp_processor_id()];
577} 610}
578 611
612static size_t calculate_freelist_size(int nr_objs, size_t align)
613{
614 size_t freelist_size;
615
616 freelist_size = nr_objs * sizeof(freelist_idx_t);
617 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
618 freelist_size += nr_objs * sizeof(char);
619
620 if (align)
621 freelist_size = ALIGN(freelist_size, align);
622
623 return freelist_size;
624}
625
579static int calculate_nr_objs(size_t slab_size, size_t buffer_size, 626static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
580 size_t idx_size, size_t align) 627 size_t idx_size, size_t align)
581{ 628{
582 int nr_objs; 629 int nr_objs;
630 size_t remained_size;
583 size_t freelist_size; 631 size_t freelist_size;
632 int extra_space = 0;
584 633
634 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
635 extra_space = sizeof(char);
585 /* 636 /*
586 * Ignore padding for the initial guess. The padding 637 * Ignore padding for the initial guess. The padding
587 * is at most @align-1 bytes, and @buffer_size is at 638 * is at most @align-1 bytes, and @buffer_size is at
@@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
590 * into the memory allocation when taking the padding 641 * into the memory allocation when taking the padding
591 * into account. 642 * into account.
592 */ 643 */
593 nr_objs = slab_size / (buffer_size + idx_size); 644 nr_objs = slab_size / (buffer_size + idx_size + extra_space);
594 645
595 /* 646 /*
596 * This calculated number will be either the right 647 * This calculated number will be either the right
597 * amount, or one greater than what we want. 648 * amount, or one greater than what we want.
598 */ 649 */
599 freelist_size = slab_size - nr_objs * buffer_size; 650 remained_size = slab_size - nr_objs * buffer_size;
600 if (freelist_size < ALIGN(nr_objs * idx_size, align)) 651 freelist_size = calculate_freelist_size(nr_objs, align);
652 if (remained_size < freelist_size)
601 nr_objs--; 653 nr_objs--;
602 654
603 return nr_objs; 655 return nr_objs;
@@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
635 } else { 687 } else {
636 nr_objs = calculate_nr_objs(slab_size, buffer_size, 688 nr_objs = calculate_nr_objs(slab_size, buffer_size,
637 sizeof(freelist_idx_t), align); 689 sizeof(freelist_idx_t), align);
638 mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); 690 mgmt_size = calculate_freelist_size(nr_objs, align);
639 } 691 }
640 *num = nr_objs; 692 *num = nr_objs;
641 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 693 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2041,13 +2093,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2041 break; 2093 break;
2042 2094
2043 if (flags & CFLGS_OFF_SLAB) { 2095 if (flags & CFLGS_OFF_SLAB) {
2096 size_t freelist_size_per_obj = sizeof(freelist_idx_t);
2044 /* 2097 /*
2045 * Max number of objs-per-slab for caches which 2098 * Max number of objs-per-slab for caches which
2046 * use off-slab slabs. Needed to avoid a possible 2099 * use off-slab slabs. Needed to avoid a possible
2047 * looping condition in cache_grow(). 2100 * looping condition in cache_grow().
2048 */ 2101 */
2102 if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
2103 freelist_size_per_obj += sizeof(char);
2049 offslab_limit = size; 2104 offslab_limit = size;
2050 offslab_limit /= sizeof(freelist_idx_t); 2105 offslab_limit /= freelist_size_per_obj;
2051 2106
2052 if (num > offslab_limit) 2107 if (num > offslab_limit)
2053 break; 2108 break;
@@ -2294,8 +2349,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2294 if (!cachep->num) 2349 if (!cachep->num)
2295 return -E2BIG; 2350 return -E2BIG;
2296 2351
2297 freelist_size = 2352 freelist_size = calculate_freelist_size(cachep->num, cachep->align);
2298 ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
2299 2353
2300 /* 2354 /*
2301 * If the slab has been placed off-slab, and we have enough space then 2355 * If the slab has been placed off-slab, and we have enough space then
@@ -2308,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2308 2362
2309 if (flags & CFLGS_OFF_SLAB) { 2363 if (flags & CFLGS_OFF_SLAB) {
2310 /* really off slab. No need for manual alignment */ 2364 /* really off slab. No need for manual alignment */
2311 freelist_size = cachep->num * sizeof(freelist_idx_t); 2365 freelist_size = calculate_freelist_size(cachep->num, 0);
2312 2366
2313#ifdef CONFIG_PAGE_POISONING 2367#ifdef CONFIG_PAGE_POISONING
2314 /* If we're going to use the generic kernel_map_pages() 2368 /* If we're going to use the generic kernel_map_pages()
@@ -2612,6 +2666,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2612 if (cachep->ctor) 2666 if (cachep->ctor)
2613 cachep->ctor(objp); 2667 cachep->ctor(objp);
2614#endif 2668#endif
2669 set_obj_status(page, i, OBJECT_FREE);
2615 set_free_obj(page, i, i); 2670 set_free_obj(page, i, i);
2616 } 2671 }
2617} 2672}
@@ -2820,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2820 BUG_ON(objnr >= cachep->num); 2875 BUG_ON(objnr >= cachep->num);
2821 BUG_ON(objp != index_to_obj(cachep, page, objnr)); 2876 BUG_ON(objp != index_to_obj(cachep, page, objnr));
2822 2877
2878 set_obj_status(page, objnr, OBJECT_FREE);
2823 if (cachep->flags & SLAB_POISON) { 2879 if (cachep->flags & SLAB_POISON) {
2824#ifdef CONFIG_DEBUG_PAGEALLOC 2880#ifdef CONFIG_DEBUG_PAGEALLOC
2825 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2881 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2953,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2953static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3009static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2954 gfp_t flags, void *objp, unsigned long caller) 3010 gfp_t flags, void *objp, unsigned long caller)
2955{ 3011{
3012 struct page *page;
3013
2956 if (!objp) 3014 if (!objp)
2957 return objp; 3015 return objp;
2958 if (cachep->flags & SLAB_POISON) { 3016 if (cachep->flags & SLAB_POISON) {
@@ -2983,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2983 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3041 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2984 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3042 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2985 } 3043 }
3044
3045 page = virt_to_head_page(objp);
3046 set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
2986 objp += obj_offset(cachep); 3047 objp += obj_offset(cachep);
2987 if (cachep->ctor && cachep->flags & SLAB_POISON) 3048 if (cachep->ctor && cachep->flags & SLAB_POISON)
2988 cachep->ctor(objp); 3049 cachep->ctor(objp);
@@ -4219,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
4219 struct page *page) 4280 struct page *page)
4220{ 4281{
4221 void *p; 4282 void *p;
4222 int i, j; 4283 int i;
4223 4284
4224 if (n[0] == n[1]) 4285 if (n[0] == n[1])
4225 return; 4286 return;
4226 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { 4287 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
4227 bool active = true; 4288 if (get_obj_status(page, i) != OBJECT_ACTIVE)
4228
4229 for (j = page->active; j < c->num; j++) {
4230 /* Skip freed item */
4231 if (get_free_obj(page, j) == i) {
4232 active = false;
4233 break;
4234 }
4235 }
4236 if (!active)
4237 continue; 4289 continue;
4238 4290
4239 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4291 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
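With CONFIG_DEBUG_SLAB_LEAK, the slab changes store an explicit per-object status byte directly after the freelist index array, so leaks_show()'s handle_slab() no longer rescans the free list for every object, and all of the sizing logic funnels through the new calculate_freelist_size(). A standalone sketch of that arithmetic; the one-byte freelist_idx_t and the example alignment are assumptions of the demo, not values taken from a particular kernel configuration:

#include <stdio.h>
#include <stddef.h>

typedef unsigned char freelist_idx_t;   /* assumed 1-byte index for the demo */

#define LEAK_DEBUG 1                    /* stand-in for CONFIG_DEBUG_SLAB_LEAK */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

/* Same shape as the new calculate_freelist_size(): index array, plus one
 * status byte per object when leak debugging is on, padded to the cache's
 * alignment when one is given. */
static size_t freelist_size(int nr_objs, size_t align)
{
	size_t size = nr_objs * sizeof(freelist_idx_t);

	if (LEAK_DEBUG)
		size += nr_objs * sizeof(char);
	if (align)
		size = ALIGN(size, align);
	return size;
}

/* Same shape as the adjusted calculate_nr_objs(): the first guess ignores
 * padding, then backs off by one object if the leftover space cannot hold
 * the (aligned) freelist. */
static int nr_objs(size_t slab_size, size_t buffer_size, size_t align)
{
	size_t idx_size = sizeof(freelist_idx_t);
	size_t extra = LEAK_DEBUG ? sizeof(char) : 0;
	int n = slab_size / (buffer_size + idx_size + extra);

	if (slab_size - n * buffer_size < freelist_size(n, align))
		n--;
	return n;
}

int main(void)
{
	size_t slab = 4096, obj = 256, align = 8;
	int n = nr_objs(slab, obj, align);

	printf("objects per slab: %d\n", n);
	printf("freelist bytes:   %zu\n", freelist_size(n, align));
	printf("left over:        %zu\n", slab - n * obj - freelist_size(n, align));
	return 0;
}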
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 010b18ef4ea0..182be0f12407 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3476,12 +3476,17 @@ sub process {
3476 } 3476 }
3477 } 3477 }
3478 3478
3479# unnecessary return in a void function? (a single leading tab, then return;) 3479# unnecessary return in a void function
3480 if ($sline =~ /^\+\treturn\s*;\s*$/ && 3480# at end-of-function, with the previous line a single leading tab, then return;
3481 $prevline =~ /^\+/) { 3481# and the line before that not a goto label target like "out:"
3482 if ($sline =~ /^[ \+]}\s*$/ &&
3483 $prevline =~ /^\+\treturn\s*;\s*$/ &&
3484 $linenr >= 3 &&
3485 $lines[$linenr - 3] =~ /^[ +]/ &&
3486 $lines[$linenr - 3] !~ /^[ +]\s*$Ident\s*:/) {
3482 WARN("RETURN_VOID", 3487 WARN("RETURN_VOID",
3483 "void function return statements are not generally useful\n" . $herecurr); 3488 "void function return statements are not generally useful\n" . $hereprev);
3484 } 3489 }
3485 3490
3486# if statements using unnecessary parentheses - ie: if ((foo == bar)) 3491# if statements using unnecessary parentheses - ie: if ((foo == bar))
3487 if ($^V && $^V ge 5.10.0 && 3492 if ($^V && $^V ge 5.10.0 &&