 Documentation/kprobes.txt               |  207
 arch/Kconfig                            |   13
 arch/x86/Kconfig                        |    1
 arch/x86/include/asm/alternative.h      |    4
 arch/x86/include/asm/kprobes.h          |   31
 arch/x86/kernel/alternative.c           |   60
 arch/x86/kernel/cpu/mtrr/main.c         |    1
 arch/x86/kernel/kprobes.c               |  609
 crypto/ahash.c                          |    1
 crypto/authenc.c                        |   27
 crypto/md5.c                            |    1
 fs/ext4/balloc.c                        |   35
 fs/ext4/block_validity.c                |    4
 fs/ext4/dir.c                           |   14
 fs/ext4/ext4.h                          |  106
 fs/ext4/ext4_jbd2.c                     |    4
 fs/ext4/ext4_jbd2.h                     |   24
 fs/ext4/extents.c                       |  260
 fs/ext4/file.c                          |    4
 fs/ext4/fsync.c                         |    2
 fs/ext4/ialloc.c                        |   32
 fs/ext4/inode.c                         |  459
 fs/ext4/ioctl.c                         |   12
 fs/ext4/mballoc.c                       |   73
 fs/ext4/mballoc.h                       |    9
 fs/ext4/migrate.c                       |   35
 fs/ext4/move_extent.c                   |   36
 fs/ext4/namei.c                         |   63
 fs/ext4/resize.c                        |  102
 fs/ext4/super.c                         |  342
 fs/ext4/xattr.c                         |   56
 fs/jbd2/checkpoint.c                    |    1
 fs/jbd2/commit.c                        |   13
 fs/jbd2/journal.c                       |  132
 fs/jbd2/transaction.c                   |   43
 fs/squashfs/Makefile                    |    2
 fs/squashfs/block.c                     |   76
 fs/squashfs/cache.c                     |    1
 fs/squashfs/decompressor.c              |   68
 fs/squashfs/decompressor.h              |   55
 fs/squashfs/dir.c                       |    1
 fs/squashfs/export.c                    |    1
 fs/squashfs/file.c                      |    1
 fs/squashfs/fragment.c                  |    1
 fs/squashfs/id.c                        |    1
 fs/squashfs/inode.c                     |    1
 fs/squashfs/namei.c                     |    1
 fs/squashfs/squashfs.h                  |    8
 fs/squashfs/squashfs_fs.h               |    6
 fs/squashfs/squashfs_fs_sb.h            |   40
 fs/squashfs/super.c                     |   49
 fs/squashfs/symlink.c                   |    1
 fs/squashfs/zlib_wrapper.c              |  150
 include/linux/jbd2.h                    |   11
 include/linux/kprobes.h                 |   44
 include/trace/events/ext4.h             |  101
 include/trace/events/jbd2.h             |   28
 kernel/kprobes.c                        |  647
 kernel/padata.c                         |    8
 kernel/sysctl.c                         |   12
 tools/perf/Documentation/perf-probe.txt |   58
 tools/perf/Makefile                     |   10
 tools/perf/builtin-probe.c              |   36
 tools/perf/util/probe-event.c           |   55
 tools/perf/util/probe-finder.c          | 1002
 tools/perf/util/probe-finder.h          |   53
 tools/perf/util/string.c                |   55
 tools/perf/util/string.h                |    1
 68 files changed, 3766 insertions(+), 1634 deletions(-)
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 053037a1fe6d..2f9115c0ae62 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -1,6 +1,7 @@
 Title	: Kernel Probes (Kprobes)
 Authors	: Jim Keniston <jkenisto@us.ibm.com>
-	: Prasanna S Panchamukhi <prasanna@in.ibm.com>
+	: Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com>
+	: Masami Hiramatsu <mhiramat@redhat.com>
 
 CONTENTS
 
@@ -15,6 +16,7 @@ CONTENTS
 9. Jprobes Example
 10. Kretprobes Example
 Appendix A: The kprobes debugfs interface
+Appendix B: The kprobes sysctl interface
 
 1. Concepts: Kprobes, Jprobes, Return Probes
 
@@ -42,13 +44,13 @@ registration/unregistration of a group of *probes. These functions
 can speed up unregistration process when you have to unregister
 a lot of probes at once.
 
-The next three subsections explain how the different types of
-probes work. They explain certain things that you'll need to
-know in order to make the best use of Kprobes -- e.g., the
-difference between a pre_handler and a post_handler, and how
-to use the maxactive and nmissed fields of a kretprobe. But
-if you're in a hurry to start using Kprobes, you can skip ahead
-to section 2.
+The next four subsections explain how the different types of
+probes work and how jump optimization works. They explain certain
+things that you'll need to know in order to make the best use of
+Kprobes -- e.g., the difference between a pre_handler and
+a post_handler, and how to use the maxactive and nmissed fields of
+a kretprobe. But if you're in a hurry to start using Kprobes, you
+can skip ahead to section 2.
 
 1.1 How Does a Kprobe Work?
 
@@ -161,13 +163,125 @@ In case probed function is entered but there is no kretprobe_instance
 object available, then in addition to incrementing the nmissed count,
 the user entry_handler invocation is also skipped.
 
+1.4 How Does Jump Optimization Work?
+
+If you configured your kernel with CONFIG_OPTPROBES=y (currently
+this option is supported on x86/x86-64 with non-preemptive kernels) and
+the "debug.kprobes_optimization" kernel parameter is set to 1 (see
+sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump
+instruction instead of a breakpoint instruction at each probepoint.
+
+1.4.1 Init a Kprobe
+
+When a probe is registered, before attempting this optimization,
+Kprobes inserts an ordinary, breakpoint-based kprobe at the specified
+address. So, even if it's not possible to optimize this particular
+probepoint, there'll be a probe there.
+
+1.4.2 Safety Check
+
+Before optimizing a probe, Kprobes performs the following safety checks:
+
+- Kprobes verifies that the region that will be replaced by the jump
+instruction (the "optimized region") lies entirely within one function.
+(A jump instruction is multiple bytes, and so may overlay multiple
+instructions.)
+
+- Kprobes analyzes the entire function and verifies that there is no
+jump into the optimized region. Specifically:
+  - the function contains no indirect jump;
+  - the function contains no instruction that causes an exception (since
+  the fixup code triggered by the exception could jump back into the
+  optimized region -- Kprobes checks the exception tables to verify this);
+  and
+  - there is no near jump to the optimized region (other than to the first
+  byte).
+
+- For each instruction in the optimized region, Kprobes verifies that
+the instruction can be executed out of line.
+
+1.4.3 Preparing Detour Buffer
+
+Next, Kprobes prepares a "detour" buffer, which contains the following
+instruction sequence:
+- code to push the CPU's registers (emulating a breakpoint trap)
+- a call to the trampoline code which calls user's probe handlers.
+- code to restore registers
+- the instructions from the optimized region
+- a jump back to the original execution path.
+
+1.4.4 Pre-optimization
+
+After preparing the detour buffer, Kprobes verifies that none of the
+following situations exist:
+- The probe has either a break_handler (i.e., it's a jprobe) or a
+post_handler.
+- Other instructions in the optimized region are probed.
+- The probe is disabled.
+In any of the above cases, Kprobes won't start optimizing the probe.
+Since these are temporary situations, Kprobes tries to start
+optimizing it again once the situation changes.
+
+If the kprobe can be optimized, Kprobes enqueues the kprobe on an
+optimizing list, and kicks the kprobe-optimizer workqueue to optimize
+it. If the to-be-optimized probepoint is hit before being optimized,
+Kprobes returns control to the original instruction path by setting
+the CPU's instruction pointer to the copied code in the detour buffer
+-- thus at least avoiding the single-step.
+
+1.4.5 Optimization
+
+The Kprobe-optimizer doesn't insert the jump instruction immediately;
+rather, it calls synchronize_sched() for safety first, because it's
+possible for a CPU to be interrupted in the middle of executing the
+optimized region(*). As you know, synchronize_sched() can ensure
+that all interruptions that were active when synchronize_sched()
+was called are done, but only if CONFIG_PREEMPT=n. So, this version
+of kprobe optimization supports only kernels with CONFIG_PREEMPT=n.(**)
+
+After that, the Kprobe-optimizer calls stop_machine() to replace
+the optimized region with a jump instruction to the detour buffer,
+using text_poke_smp().
+
+1.4.6 Unoptimization
+
+When an optimized kprobe is unregistered, disabled, or blocked by
+another kprobe, it will be unoptimized. If this happens before
+the optimization is complete, the kprobe is just dequeued from the
+optimizing list. If the optimization has been done, the jump is
+replaced with the original code (except for an int3 breakpoint in
+the first byte) by using text_poke_smp().
+
+(*)Please imagine that the 2nd instruction is interrupted and then
+the optimizer replaces the 2nd instruction with the jump *address*
+while the interrupt handler is running. When the interrupt
+returns to the original address, there is no valid instruction there,
+and it causes an unexpected result.
+
+(**)This optimization-safety checking may be replaced with the
+stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y
+kernel.
+
+NOTE for geeks:
+The jump optimization changes the kprobe's pre_handler behavior.
+Without optimization, the pre_handler can change the kernel's execution
+path by changing regs->ip and returning 1. However, when the probe
+is optimized, that modification is ignored. Thus, if you want to
+tweak the kernel's execution path, you need to suppress optimization,
+using one of the following techniques:
+- Specify an empty function for the kprobe's post_handler or break_handler,
+  or
+- Configure the kernel with CONFIG_OPTPROBES=n,
+  or
+- Execute 'sysctl -w debug.kprobes_optimization=n'.
+
 2. Architectures Supported
 
 Kprobes, jprobes, and return probes are implemented on the following
 architectures:
 
-- i386
-- x86_64 (AMD-64, EM64T)
+- i386 (Supports jump optimization)
+- x86_64 (AMD-64, EM64T) (Supports jump optimization)
 - ppc64
 - ia64 (Does not support probes on instruction slot1.)
 - sparc64 (Return probes not yet implemented.)
@@ -193,6 +307,10 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO),
 so you can use "objdump -d -l vmlinux" to see the source-to-object
 code mapping.
 
+If you want to reduce probing overhead, set "Kprobes jump optimization
+support" (CONFIG_OPTPROBES) to "y". You can find this option under the
+"Kprobes" line.
+
 4. API Reference
 
 The Kprobes API includes a "register" function and an "unregister"
@@ -389,7 +507,10 @@ the probe which has been registered.
 
 Kprobes allows multiple probes at the same address. Currently,
 however, there cannot be multiple jprobes on the same function at
-the same time.
+the same time. Also, a probepoint for which there is a jprobe or
+a post_handler cannot be optimized. So if you install a jprobe,
+or a kprobe with a post_handler, at an optimized probepoint, the
+probepoint will be unoptimized automatically.
 
 In general, you can install a probe anywhere in the kernel.
 In particular, you can probe interrupt handlers. Known exceptions
@@ -453,6 +574,38 @@ reason, Kprobes doesn't support return probes (or kprobes or jprobes)
 on the x86_64 version of __switch_to(); the registration functions
 return -EINVAL.
 
+On x86/x86-64, since the Jump Optimization of Kprobes modifies
+instructions widely, there are some limitations to optimization. To
+explain them, we introduce some terminology. Imagine a 3-instruction
+sequence consisting of two 2-byte instructions and one 3-byte
+instruction.
+
+        IA
+         |
+[-2][-1][0][1][2][3][4][5][6][7]
+        [ins1][ins2][  ins3  ]
+        [<-     DCR       ->]
+           [<- JTPR ->]
+
+ins1: 1st Instruction
+ins2: 2nd Instruction
+ins3: 3rd Instruction
+IA:   Insertion Address
+JTPR: Jump Target Prohibition Region
+DCR:  Detoured Code Region
+
+The instructions in DCR are copied to the out-of-line buffer
+of the kprobe, because the bytes in DCR are replaced by
+a 5-byte jump instruction. So there are several limitations.
+
+a) The instructions in DCR must be relocatable.
+b) The instructions in DCR must not include a call instruction.
+c) JTPR must not be targeted by any jump or call instruction.
+d) DCR must not straddle the border between functions.
+
+In any case, these limitations are checked by the in-kernel instruction
+decoder, so you don't need to worry about them.
+
 6. Probe Overhead
 
 On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0
@@ -476,6 +629,19 @@ k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07
 ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU)
 k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99
 
+6.1 Optimized Probe Overhead
+
+Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to
+process. Here are sample overhead figures (in usec) for x86 architectures.
+k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe,
+r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe.
+
+i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
+k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33
+
+x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
+k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30
+
 7. TODO
 
 a. SystemTap (http://sourceware.org/systemtap): Provides a simplified
@@ -523,7 +689,8 @@ is also specified. Following columns show probe status. If the probe is on
 a virtual address that is no longer valid (module init sections, module
 virtual addresses that correspond to modules that've been unloaded),
 such probes are marked with [GONE]. If the probe is temporarily disabled,
-such probes are marked with [DISABLED].
+such probes are marked with [DISABLED]. If the probe is optimized, it is
+marked with [OPTIMIZED].
 
 /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
 
@@ -533,3 +700,19 @@ registered probes will be disarmed, till such time a "1" is echoed to this
 file. Note that this knob just disarms and arms all kprobes and doesn't
 change each probe's disabling state. This means that disabled kprobes (marked
 [DISABLED]) will be not enabled if you turn ON all kprobes by this knob.
+
+
+Appendix B: The kprobes sysctl interface
+
+/proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF.
+
+When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides
+a knob to globally and forcibly turn jump optimization (see section
+1.4) ON or OFF. By default, jump optimization is allowed (ON).
+If you echo "0" to this file or set "debug.kprobes_optimization" to
+0 via sysctl, all optimized probes will be unoptimized, and any new
+probes registered after that will not be optimized. Note that this
+knob *changes* the optimized state. This means that optimized probes
+(marked [OPTIMIZED]) will be unoptimized (the [OPTIMIZED] tag will be
+removed). If the knob is turned back on, they will be optimized again.
+
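
The "NOTE for geeks" above is easiest to see with a concrete probe. Below
is a minimal sketch, not part of this patch: the module and handler names
are invented, and "do_fork" is just an assumed probe target. As the
documentation says, registering an (empty) post_handler is one way to keep
such a probe from being jump-optimized, so a pre_handler that rewrites
regs->ip keeps working.

#include <linux/module.h>
#include <linux/kprobes.h>

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("kprobe hit at %p\n", p->addr);
        return 0;       /* 0 = continue with the probed instruction */
}

/* Empty post_handler: its mere presence suppresses jump optimization. */
static void my_post(struct kprobe *p, struct pt_regs *regs,
                    unsigned long flags)
{
}

static struct kprobe my_probe = {
        .symbol_name    = "do_fork",    /* assumed target symbol */
        .pre_handler    = my_pre,
        .post_handler   = my_post,
};

static int __init my_init(void)
{
        return register_kprobe(&my_probe);
}

static void __exit my_exit(void)
{
        unregister_kprobe(&my_probe);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
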
diff --git a/arch/Kconfig b/arch/Kconfig
index 215e46073c45..e5eb1337a537 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -41,6 +41,17 @@ config KPROBES
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".
 
+config OPTPROBES
+	bool "Kprobes jump optimization support (EXPERIMENTAL)"
+	default y
+	depends on KPROBES
+	depends on !PREEMPT
+	depends on HAVE_OPTPROBES
+	select KALLSYMS_ALL
+	help
+	  This option allows kprobes to optimize a breakpoint into
+	  a jump, reducing probe-hit overhead.
+
 config HAVE_EFFICIENT_UNALIGNED_ACCESS
 	bool
 	help
@@ -83,6 +94,8 @@ config HAVE_KPROBES
 config HAVE_KRETPROBES
 	bool
 
+config HAVE_OPTPROBES
+	bool
 #
 # An arch should select this if it provides all these things:
 #
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 57ccdcec1469..f15f37bfbd62 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_DMA_ATTRS
 	select HAVE_KRETPROBES
+	select HAVE_OPTPROBES
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FUNCTION_TRACER
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f1e253ceba4b..b09ec55650b3 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -165,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
  * invalid instruction possible) or if the instructions are changed from a
  * consistent state to another consistent state atomically.
  * More care must be taken when modifying code in the SMP case because of
- * Intel's errata.
+ * Intel's errata. text_poke_smp() takes care of that errata, but it still
+ * doesn't support modifying code in NMI/MCE handlers.
  * On the local CPU you need to be protected again NMI or MCE handlers seeing an
  * inconsistent instruction while you patch.
  */
 extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
 
 #endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681de1e76..4ffa345a8ccb 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -32,7 +32,10 @@ struct kprobe;
 
 typedef u8 kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION	0xcc
-#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define RELATIVEJUMP_OPCODE 0xe9
+#define RELATIVEJUMP_SIZE 5
+#define RELATIVECALL_OPCODE 0xe8
+#define RELATIVE_ADDR_SIZE 4
 #define MAX_INSN_SIZE 16
 #define MAX_STACK_SIZE 64
 #define MIN_STACK_SIZE(ADDR) \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
 
 #define flush_insn_slot(p)	do { } while (0)
 
+/* optinsn template addresses */
+extern kprobe_opcode_t optprobe_template_entry;
+extern kprobe_opcode_t optprobe_template_val;
+extern kprobe_opcode_t optprobe_template_call;
+extern kprobe_opcode_t optprobe_template_end;
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
+#define MAX_OPTINSN_SIZE				\
+	(((unsigned long)&optprobe_template_end -	\
+	  (unsigned long)&optprobe_template_entry) +	\
+	 MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
+
 extern const int kretprobe_blacklist_size;
 
 void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
 	int boostable;
 };
 
+struct arch_optimized_insn {
+	/* copy of the original instructions */
+	kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
+	/* detour code buffer */
+	kprobe_opcode_t *insn;
+	/* the size of instructions copied to detour code buffer */
+	size_t size;
+};
+
+/* Return true (!0) if optinsn is prepared for optimization. */
+static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+	return optinsn->size;
+}
+
 struct prev_kprobe {
 	struct kprobe *kp;
 	unsigned long status;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e6ea0342c8f8..3a4bf35c179b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/memory.h>
+#include <linux/stop_machine.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
@@ -572,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
 	local_irq_restore(flags);
 	return addr;
 }
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from the immediate values work.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+	void *addr;
+	const void *opcode;
+	size_t len;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+	struct text_poke_params *tpp = data;
+
+	if (atomic_dec_and_test(&stop_machine_first)) {
+		text_poke(tpp->addr, tpp->opcode, tpp->len);
+		smp_wmb();	/* Make sure other cpus see that this has run */
+		wrote_text = 1;
+	} else {
+		while (!wrote_text)
+			cpu_relax();
+		smp_mb();	/* Load wrote_text before following execution */
+	}
+
+	flush_icache_range((unsigned long)tpp->addr,
+			   (unsigned long)tpp->addr + tpp->len);
+	return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify multi-byte instructions by using stop_machine() on SMP. This allows
+ * a user to poke/set multi-byte text on SMP. Only non-NMI/MCE code may be
+ * modified this way, since stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+	struct text_poke_params tpp;
+
+	tpp.addr = addr;
+	tpp.opcode = opcode;
+	tpp.len = len;
+	atomic_set(&stop_machine_first, 1);
+	wrote_text = 0;
+	stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+	return addr;
+}
+
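
As a usage sketch (not part of this patch; patch_site() is an invented
helper), a caller of the new text_poke_smp() would follow the locking
rules stated in the kernel-doc above: hold text_mutex and keep CPUs
online across the poke.

#include <linux/cpu.h>
#include <linux/memory.h>
#include <asm/alternative.h>

static void patch_site(void *addr, const u8 *opcode, size_t len)
{
        get_online_cpus();              /* pin CPU hotplug state */
        mutex_lock(&text_mutex);        /* serialize kernel-text pokes */
        text_poke_smp(addr, opcode, len);
        mutex_unlock(&text_mutex);
        put_online_cpus();
}
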
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index fe4622e8c837..79556bd9b602 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -145,6 +145,7 @@ struct set_mtrr_data {
 
 /**
  * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * @info: pointer to mtrr configuration data
  *
  * Returns nothing.
  */
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5de9f4a9c3fd..b43bbaebe2c0 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/kallsyms.h>
+#include <linux/ftrace.h>
 
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
 };
 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes set_jmp_op(void *from, void *to)
+static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
 {
-	struct __arch_jmp_op {
-		char op;
+	struct __arch_relative_insn {
+		u8 op;
 		s32 raddr;
-	} __attribute__((packed)) * jop;
-	jop = (struct __arch_jmp_op *)from;
-	jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
-	jop->op = RELATIVEJUMP_INSTRUCTION;
+	} __attribute__((packed)) *insn;
+
+	insn = (struct __arch_relative_insn *)from;
+	insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+	insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+static void __kprobes synthesize_reljump(void *from, void *to)
+{
+	__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
 }
 
 /*
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 	/*
 	 * Basically, kp->ainsn.insn has an original instruction.
 	 * However, RIP-relative instruction can not do single-stepping
-	 * at different place, fix_riprel() tweaks the displacement of
+	 * at different place, __copy_instruction() tweaks the displacement of
 	 * that instruction. In that case, we can't recover the instruction
 	 * from the kp->ainsn.insn.
 	 *
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 }
 
 /*
- * Adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
  * If it does, Return the address of the 32-bit displacement word.
  * If not, return null.
  * Only applicable to 64-bit x86.
  */
-static void __kprobes fix_riprel(struct kprobe *p)
+static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
 {
-#ifdef CONFIG_X86_64
 	struct insn insn;
-	kernel_insn_init(&insn, p->ainsn.insn);
+	int ret;
+	kprobe_opcode_t buf[MAX_INSN_SIZE];
 
+	kernel_insn_init(&insn, src);
+	if (recover) {
+		insn_get_opcode(&insn);
+		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+			ret = recover_probed_instruction(buf,
+							 (unsigned long)src);
+			if (ret)
+				return 0;
+			kernel_insn_init(&insn, buf);
+		}
+	}
+	insn_get_length(&insn);
+	memcpy(dest, insn.kaddr, insn.length);
+
+#ifdef CONFIG_X86_64
 	if (insn_rip_relative(&insn)) {
 		s64 newdisp;
 		u8 *disp;
+		kernel_insn_init(&insn, dest);
 		insn_get_displacement(&insn);
 		/*
 		 * The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
 		 * extension of the original signed 32-bit displacement would
 		 * have given.
 		 */
-		newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
-			(u8 *) p->ainsn.insn;
+		newdisp = (u8 *) src + (s64) insn.displacement.value -
+			(u8 *) dest;
 		BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
-		disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+		disp = (u8 *) dest + insn_offset_displacement(&insn);
 		*(s32 *) disp = (s32) newdisp;
 	}
 #endif
+	return insn.length;
 }
 
 static void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
-	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-
-	fix_riprel(p);
+	/*
+	 * Copy an instruction without recovering int3, because the int3
+	 * may have been put there by another subsystem.
+	 */
+	__copy_instruction(p->ainsn.insn, p->addr, 0);
 
 	if (can_boost(p->addr))
 		p->ainsn.boostable = 0;
@@ -406,18 +432,6 @@ static void __kprobes restore_btf(void)
 	update_debugctlmsr(current->thread.debugctlmsr);
 }
 
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
-	clear_btf();
-	regs->flags |= X86_EFLAGS_TF;
-	regs->flags &= ~X86_EFLAGS_IF;
-	/* single step inline if the instruction is an int3 */
-	if (p->opcode == BREAKPOINT_INSTRUCTION)
-		regs->ip = (unsigned long)p->addr;
-	else
-		regs->ip = (unsigned long)p->ainsn.insn;
-}
-
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
@@ -429,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 	*sara = (unsigned long) &kretprobe_trampoline;
 }
 
+#ifdef CONFIG_OPTPROBES
+static int __kprobes setup_detour_execution(struct kprobe *p,
+					    struct pt_regs *regs,
+					    int reenter);
+#else
+#define setup_detour_execution(p, regs, reenter) (0)
+#endif
+
 static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
-				       struct kprobe_ctlblk *kcb)
+				       struct kprobe_ctlblk *kcb, int reenter)
 {
+	if (setup_detour_execution(p, regs, reenter))
+		return;
+
 #if !defined(CONFIG_PREEMPT)
 	if (p->ainsn.boostable == 1 && !p->post_handler) {
 		/* Boost up -- we can execute copied instructions directly */
-		reset_current_kprobe();
+		if (!reenter)
+			reset_current_kprobe();
+		/*
+		 * Reentering a boosted probe doesn't reset current_kprobe,
+		 * nor set current_kprobe, because it doesn't use single
+		 * stepping.
+		 */
 		regs->ip = (unsigned long)p->ainsn.insn;
 		preempt_enable_no_resched();
 		return;
 	}
 #endif
-	prepare_singlestep(p, regs);
-	kcb->kprobe_status = KPROBE_HIT_SS;
+	if (reenter) {
+		save_previous_kprobe(kcb);
+		set_current_kprobe(p, regs, kcb);
+		kcb->kprobe_status = KPROBE_REENTER;
+	} else
+		kcb->kprobe_status = KPROBE_HIT_SS;
+	/* Prepare real single stepping */
+	clear_btf();
+	regs->flags |= X86_EFLAGS_TF;
+	regs->flags &= ~X86_EFLAGS_IF;
+	/* single step inline if the instruction is an int3 */
+	if (p->opcode == BREAKPOINT_INSTRUCTION)
+		regs->ip = (unsigned long)p->addr;
+	else
+		regs->ip = (unsigned long)p->ainsn.insn;
 }
 
 /*
@@ -456,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 	switch (kcb->kprobe_status) {
 	case KPROBE_HIT_SSDONE:
 	case KPROBE_HIT_ACTIVE:
-		save_previous_kprobe(kcb);
-		set_current_kprobe(p, regs, kcb);
 		kprobes_inc_nmissed_count(p);
-		prepare_singlestep(p, regs);
-		kcb->kprobe_status = KPROBE_REENTER;
+		setup_singlestep(p, regs, kcb, 1);
 		break;
 	case KPROBE_HIT_SS:
 		/* A probe has been hit in the codepath leading up to, or just
@@ -535,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 		 * more here.
 		 */
 		if (!p->pre_handler || !p->pre_handler(p, regs))
-			setup_singlestep(p, regs, kcb);
+			setup_singlestep(p, regs, kcb, 0);
 			return 1;
 		}
 	} else if (kprobe_running()) {
 		p = __get_cpu_var(current_kprobe);
 		if (p->break_handler && p->break_handler(p, regs)) {
-			setup_singlestep(p, regs, kcb);
+			setup_singlestep(p, regs, kcb, 0);
 			return 1;
 		}
 	} /* else: not a kprobe fault; let the kernel handle it */
@@ -550,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 	return 0;
 }
 
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING			\
+	/* Skip cs, ip, orig_ax. */		\
+	"	subq $24, %rsp\n"		\
+	"	pushq %rdi\n"			\
+	"	pushq %rsi\n"			\
+	"	pushq %rdx\n"			\
+	"	pushq %rcx\n"			\
+	"	pushq %rax\n"			\
+	"	pushq %r8\n"			\
+	"	pushq %r9\n"			\
+	"	pushq %r10\n"			\
+	"	pushq %r11\n"			\
+	"	pushq %rbx\n"			\
+	"	pushq %rbp\n"			\
+	"	pushq %r12\n"			\
+	"	pushq %r13\n"			\
+	"	pushq %r14\n"			\
+	"	pushq %r15\n"
+#define RESTORE_REGS_STRING			\
+	"	popq %r15\n"			\
+	"	popq %r14\n"			\
+	"	popq %r13\n"			\
+	"	popq %r12\n"			\
+	"	popq %rbp\n"			\
+	"	popq %rbx\n"			\
+	"	popq %r11\n"			\
+	"	popq %r10\n"			\
+	"	popq %r9\n"			\
+	"	popq %r8\n"			\
+	"	popq %rax\n"			\
+	"	popq %rcx\n"			\
+	"	popq %rdx\n"			\
+	"	popq %rsi\n"			\
+	"	popq %rdi\n"			\
+	/* Skip orig_ax, ip, cs */		\
+	"	addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING			\
+	/* Skip cs, ip, orig_ax and gs. */	\
+	"	subl $16, %esp\n"		\
+	"	pushl %fs\n"			\
+	"	pushl %ds\n"			\
+	"	pushl %es\n"			\
+	"	pushl %eax\n"			\
+	"	pushl %ebp\n"			\
+	"	pushl %edi\n"			\
+	"	pushl %esi\n"			\
+	"	pushl %edx\n"			\
+	"	pushl %ecx\n"			\
+	"	pushl %ebx\n"
+#define RESTORE_REGS_STRING			\
+	"	popl %ebx\n"			\
+	"	popl %ecx\n"			\
+	"	popl %edx\n"			\
+	"	popl %esi\n"			\
+	"	popl %edi\n"			\
+	"	popl %ebp\n"			\
+	"	popl %eax\n"			\
+	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+	"	addl $24, %esp\n"
+#endif
+
 /*
  * When a retprobed function returns, this code saves registers and
  * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -563,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
 	/* We don't bother saving the ss register */
 	"	pushq %rsp\n"
 	"	pushfq\n"
-	/*
-	 * Skip cs, ip, orig_ax.
-	 * trampoline_handler() will plug in these values
-	 */
-	"	subq $24, %rsp\n"
-	"	pushq %rdi\n"
-	"	pushq %rsi\n"
-	"	pushq %rdx\n"
-	"	pushq %rcx\n"
-	"	pushq %rax\n"
-	"	pushq %r8\n"
-	"	pushq %r9\n"
-	"	pushq %r10\n"
-	"	pushq %r11\n"
-	"	pushq %rbx\n"
-	"	pushq %rbp\n"
-	"	pushq %r12\n"
-	"	pushq %r13\n"
-	"	pushq %r14\n"
-	"	pushq %r15\n"
+	SAVE_REGS_STRING
 	"	movq %rsp, %rdi\n"
 	"	call trampoline_handler\n"
 	/* Replace saved sp with true return address. */
 	"	movq %rax, 152(%rsp)\n"
-	"	popq %r15\n"
-	"	popq %r14\n"
-	"	popq %r13\n"
-	"	popq %r12\n"
-	"	popq %rbp\n"
-	"	popq %rbx\n"
-	"	popq %r11\n"
-	"	popq %r10\n"
-	"	popq %r9\n"
-	"	popq %r8\n"
-	"	popq %rax\n"
-	"	popq %rcx\n"
-	"	popq %rdx\n"
-	"	popq %rsi\n"
-	"	popq %rdi\n"
-	/* Skip orig_ax, ip, cs */
-	"	addq $24, %rsp\n"
+	RESTORE_REGS_STRING
 	"	popfq\n"
 #else
 	"	pushf\n"
-	/*
-	 * Skip cs, ip, orig_ax and gs.
-	 * trampoline_handler() will plug in these values
-	 */
-	"	subl $16, %esp\n"
-	"	pushl %fs\n"
-	"	pushl %es\n"
-	"	pushl %ds\n"
-	"	pushl %eax\n"
-	"	pushl %ebp\n"
-	"	pushl %edi\n"
-	"	pushl %esi\n"
-	"	pushl %edx\n"
-	"	pushl %ecx\n"
-	"	pushl %ebx\n"
+	SAVE_REGS_STRING
 	"	movl %esp, %eax\n"
 	"	call trampoline_handler\n"
 	/* Move flags to cs */
@@ -629,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
629 " movl %edx, 52(%esp)\n" 684 " movl %edx, 52(%esp)\n"
630 /* Replace saved flags with true return address. */ 685 /* Replace saved flags with true return address. */
631 " movl %eax, 56(%esp)\n" 686 " movl %eax, 56(%esp)\n"
632 " popl %ebx\n" 687 RESTORE_REGS_STRING
633 " popl %ecx\n"
634 " popl %edx\n"
635 " popl %esi\n"
636 " popl %edi\n"
637 " popl %ebp\n"
638 " popl %eax\n"
639 /* Skip ds, es, fs, gs, orig_ax and ip */
640 " addl $24, %esp\n"
641 " popf\n" 688 " popf\n"
642#endif 689#endif
643 " ret\n"); 690 " ret\n");
@@ -805,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
 		 * These instructions can be executed directly if it
 		 * jumps back to correct address.
 		 */
-		set_jmp_op((void *)regs->ip,
-			   (void *)orig_ip + (regs->ip - copy_ip));
+		synthesize_reljump((void *)regs->ip,
+				   (void *)orig_ip + (regs->ip - copy_ip));
 		p->ainsn.boostable = 1;
 	} else {
 		p->ainsn.boostable = -1;
@@ -1033,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
+
+#ifdef CONFIG_OPTPROBES
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+static void __kprobes synthesize_relcall(void *from, void *to)
+{
+	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
+					  unsigned long val)
+{
+#ifdef CONFIG_X86_64
+	*addr++ = 0x48;
+	*addr++ = 0xbf;
+#else
+	*addr++ = 0xb8;
+#endif
+	*(unsigned long *)addr = val;
+}
+
+void __kprobes kprobes_optinsn_template_holder(void)
+{
+	asm volatile (
+			".global optprobe_template_entry\n"
+			"optprobe_template_entry: \n"
+#ifdef CONFIG_X86_64
+			/* We don't bother saving the ss register */
+			"	pushq %rsp\n"
+			"	pushfq\n"
+			SAVE_REGS_STRING
+			"	movq %rsp, %rsi\n"
+			".global optprobe_template_val\n"
+			"optprobe_template_val: \n"
+			ASM_NOP5
+			ASM_NOP5
+			".global optprobe_template_call\n"
+			"optprobe_template_call: \n"
+			ASM_NOP5
+			/* Move flags to rsp */
+			"	movq 144(%rsp), %rdx\n"
+			"	movq %rdx, 152(%rsp)\n"
+			RESTORE_REGS_STRING
+			/* Skip flags entry */
+			"	addq $8, %rsp\n"
+			"	popfq\n"
+#else /* CONFIG_X86_32 */
+			"	pushf\n"
+			SAVE_REGS_STRING
+			"	movl %esp, %edx\n"
+			".global optprobe_template_val\n"
+			"optprobe_template_val: \n"
+			ASM_NOP5
+			".global optprobe_template_call\n"
+			"optprobe_template_call: \n"
+			ASM_NOP5
+			RESTORE_REGS_STRING
+			"	addl $4, %esp\n"	/* skip cs */
+			"	popf\n"
+#endif
+			".global optprobe_template_end\n"
+			"optprobe_template_end: \n");
+}
+
+#define TMPL_MOVE_IDX \
+	((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+	((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+	((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe callback function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op,
+					 struct pt_regs *regs)
+{
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	preempt_disable();
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(&op->kp);
+	} else {
+		/* Save skipped registers */
+#ifdef CONFIG_X86_64
+		regs->cs = __KERNEL_CS;
+#else
+		regs->cs = __KERNEL_CS | get_kernel_rpl();
+		regs->gs = 0;
+#endif
+		regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+		regs->orig_ax = ~0UL;
+
+		__get_cpu_var(current_kprobe) = &op->kp;
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		opt_pre_handler(&op->kp, regs);
+		__get_cpu_var(current_kprobe) = NULL;
+	}
+	preempt_enable_no_resched();
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+	int len = 0, ret;
+
+	while (len < RELATIVEJUMP_SIZE) {
+		ret = __copy_instruction(dest + len, src + len, 1);
+		if (!ret || !can_boost(dest + len))
+			return -EINVAL;
+		len += ret;
+	}
+	/* Check whether the address range is reserved */
+	if (ftrace_text_reserved(src, src + len - 1) ||
+	    alternatives_text_reserved(src, src + len - 1))
+		return -EBUSY;
+
+	return len;
+}
+
+/* Check whether insn is an indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+	return ((insn->opcode.bytes[0] == 0xff &&
+		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */
+}
+
+/* Check whether insn jumps into the specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+	unsigned long target = 0;
+
+	switch (insn->opcode.bytes[0]) {
+	case 0xe0:	/* loopne */
+	case 0xe1:	/* loope */
+	case 0xe2:	/* loop */
+	case 0xe3:	/* jcxz */
+	case 0xe9:	/* near relative jump */
+	case 0xeb:	/* short relative jump */
+		break;
+	case 0x0f:
+		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+			break;
+		return 0;
+	default:
+		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+			break;
+		return 0;
+	}
+	target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+	return (start <= target && target <= start + len);
+}
+
+/* Decode the whole function to ensure no instructions jump into the target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+	int ret;
+	unsigned long addr, size = 0, offset = 0;
+	struct insn insn;
+	kprobe_opcode_t buf[MAX_INSN_SIZE];
+	/* Dummy buffers for lookup_symbol_attrs */
+	static char __dummy_buf[KSYM_NAME_LEN];
+
+	/* Lookup symbol including addr */
+	if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+		return 0;
+
+	/* Check there is enough space for a relative jump. */
+	if (size - offset < RELATIVEJUMP_SIZE)
+		return 0;
+
+	/* Decode instructions */
+	addr = paddr - offset;
+	while (addr < paddr - offset + size) { /* Decode until function end */
+		if (search_exception_tables(addr))
+			/*
+			 * Since some fixup code will jump into this function,
+			 * we can't optimize kprobes in this function.
+			 */
+			return 0;
+		kernel_insn_init(&insn, (void *)addr);
+		insn_get_opcode(&insn);
+		if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+			ret = recover_probed_instruction(buf, addr);
+			if (ret)
+				return 0;
+			kernel_insn_init(&insn, buf);
+		}
+		insn_get_length(&insn);
+		/* Recover address */
+		insn.kaddr = (void *)addr;
+		insn.next_byte = (void *)(addr + insn.length);
+		/* Check that no instructions jump into the target */
+		if (insn_is_indirect_jump(&insn) ||
+		    insn_jump_into_range(&insn, paddr + INT3_SIZE,
+					 RELATIVE_ADDR_SIZE))
+			return 0;
+		addr += insn.length;
+	}
+
+	return 1;
+}
+
+/* Check that optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+	int i;
+	struct kprobe *p;
+
+	for (i = 1; i < op->optinsn.size; i++) {
+		p = get_kprobe(op->kp.addr + i);
+		if (p && !kprobe_disabled(p))
+			return -EEXIST;
+	}
+
+	return 0;
+}
+
+/* Check whether the addr is within the optimized instructions. */
+int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
+					   unsigned long addr)
+{
+	return ((unsigned long)op->kp.addr <= addr &&
+		(unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+	if (op->optinsn.insn) {
+		free_optinsn_slot(op->optinsn.insn, dirty);
+		op->optinsn.insn = NULL;
+		op->optinsn.size = 0;
+	}
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+	__arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions.
+ * Target instructions MUST be relocatable (checked inside).
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+	u8 *buf;
+	int ret;
+	long rel;
+
+	if (!can_optimize((unsigned long)op->kp.addr))
+		return -EILSEQ;
+
+	op->optinsn.insn = get_optinsn_slot();
+	if (!op->optinsn.insn)
+		return -ENOMEM;
+
+	/*
+	 * Verify that the address gap is within the 2GB range, because
+	 * this uses a relative jump.
+	 */
+	rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+	if (abs(rel) > 0x7fffffff)
+		return -ERANGE;
+
+	buf = (u8 *)op->optinsn.insn;
+
+	/* Copy instructions into the out-of-line buffer */
+	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+	if (ret < 0) {
+		__arch_remove_optimized_kprobe(op, 0);
+		return ret;
+	}
+	op->optinsn.size = ret;
+
+	/* Copy arch-dep-instance from template */
+	memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+	/* Set probe information */
+	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+	/* Set probe function call */
+	synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+	/* Set returning jmp instruction at the tail of out-of-line buffer */
+	synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+			   (u8 *)op->kp.addr + op->optinsn.size);
+
+	flush_icache_range((unsigned long) buf,
+			   (unsigned long) buf + TMPL_END_IDX +
+			   op->optinsn.size + RELATIVEJUMP_SIZE);
+	return 0;
+}
+
+/* Replace a breakpoint (int3) with a relative jump. */
+int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+{
+	unsigned char jmp_code[RELATIVEJUMP_SIZE];
+	s32 rel = (s32)((long)op->optinsn.insn -
+			((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+	/* Backup instructions which will be replaced by jump address */
+	memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+	       RELATIVE_ADDR_SIZE);
+
+	jmp_code[0] = RELATIVEJUMP_OPCODE;
+	*(s32 *)(&jmp_code[1]) = rel;
+
+	/*
+	 * text_poke_smp doesn't support modifying NMI/MCE code.
+	 * However, since kprobes itself also doesn't support probing
+	 * NMI/MCE code, this is not a problem.
+	 */
+	text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
+	return 0;
+}
+
+/* Replace a relative jump with a breakpoint (int3). */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+	u8 buf[RELATIVEJUMP_SIZE];
+
+	/* Set int3 to first byte for kprobes */
+	buf[0] = BREAKPOINT_INSTRUCTION;
+	memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+	text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
+}
+
+static int __kprobes setup_detour_execution(struct kprobe *p,
+					    struct pt_regs *regs,
+					    int reenter)
+{
+	struct optimized_kprobe *op;
+
+	if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+		/* This kprobe is really able to run the optimized path. */
+		op = container_of(p, struct optimized_kprobe, kp);
+		/* Detour through copied instructions */
+		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+		if (!reenter)
+			reset_current_kprobe();
+		preempt_enable_no_resched();
+		return 1;
+	}
+	return 0;
+}
+#endif
+
 int __init arch_init_kprobes(void)
 {
 	return 0;
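
To make the displacement arithmetic shared by __synthesize_relative_insn()
and arch_optimize_kprobe() concrete, here is a small host-side sketch with
invented addresses; it mirrors the kernel computation but is not kernel code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t from = 0xffffffff81000100ULL; /* probepoint: 5-byte jmp goes here */
        uint64_t to   = 0xffffffff81000200ULL; /* detour buffer */

        /* raddr is relative to the end of the 5-byte instruction. */
        int32_t raddr = (int32_t)(to - (from + 5));

        printf("raddr  = %d (0x%x)\n", raddr, raddr);
        printf("target = 0x%llx\n",
               (unsigned long long)(from + 5 + raddr)); /* == to */
        return 0;
}
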
diff --git a/crypto/ahash.c b/crypto/ahash.c
index 33a4ff45f842..b8c59b889c6e 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -78,7 +78,6 @@ int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err)
 	walk->data -= walk->offset;
 
 	if (nbytes && walk->offset & alignmask && !err) {
-		walk->offset += alignmask - 1;
 		walk->offset = ALIGN(walk->offset, alignmask + 1);
 		walk->data += walk->offset;
 
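
The removed line appears to be a pre-rounding bug: bumping the offset by
alignmask - 1 before ALIGN() can overshoot by a whole alignment block for
offsets just below a boundary. A tiny host-side sketch with invented
numbers (alignmask = 3, i.e. 4-byte alignment, offset = 7) shows the
difference; this is an illustration, not kernel code.

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned int alignmask = 3, offset = 7; /* misaligned offset */

        /* old code: pre-bump, then round -- overshoots to 12 */
        printf("old: %u\n", ALIGN(offset + alignmask - 1, alignmask + 1));
        /* new code: ALIGN() alone rounds up only as far as needed -- 8 */
        printf("new: %u\n", ALIGN(offset, alignmask + 1));
        return 0;
}
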
diff --git a/crypto/authenc.c b/crypto/authenc.c
index 18870906ea06..2bb7348d8d55 100644
--- a/crypto/authenc.c
+++ b/crypto/authenc.c
@@ -386,11 +386,13 @@ static int crypto_authenc_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
 	struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
-	struct ablkcipher_request *abreq = aead_request_ctx(req);
+	struct authenc_request_ctx *areq_ctx = aead_request_ctx(req);
 	struct crypto_ablkcipher *enc = ctx->enc;
 	struct scatterlist *dst = req->dst;
 	unsigned int cryptlen = req->cryptlen;
-	u8 *iv = (u8 *)(abreq + 1) + crypto_ablkcipher_reqsize(enc);
+	struct ablkcipher_request *abreq = (void *)(areq_ctx->tail
+						    + ctx->reqoff);
+	u8 *iv = (u8 *)abreq - crypto_ablkcipher_ivsize(enc);
 	int err;
 
 	ablkcipher_request_set_tfm(abreq, enc);
@@ -454,7 +456,7 @@ static int crypto_authenc_verify(struct aead_request *req,
 	unsigned int authsize;
 
 	areq_ctx->complete = authenc_verify_ahash_done;
-	areq_ctx->complete = authenc_verify_ahash_update_done;
+	areq_ctx->update_complete = authenc_verify_ahash_update_done;
 
 	ohash = authenc_ahash_fn(req, CRYPTO_TFM_REQ_MAY_SLEEP);
 	if (IS_ERR(ohash))
@@ -546,10 +548,6 @@ static int crypto_authenc_init_tfm(struct crypto_tfm *tfm)
 	if (IS_ERR(auth))
 		return PTR_ERR(auth);
 
-	ctx->reqoff = ALIGN(2 * crypto_ahash_digestsize(auth) +
-			    crypto_ahash_alignmask(auth),
-			    crypto_ahash_alignmask(auth) + 1);
-
 	enc = crypto_spawn_skcipher(&ictx->enc);
 	err = PTR_ERR(enc);
 	if (IS_ERR(enc))
@@ -558,13 +556,18 @@ static int crypto_authenc_init_tfm(struct crypto_tfm *tfm)
 	ctx->auth = auth;
 	ctx->enc = enc;
 
-	tfm->crt_aead.reqsize = max_t(unsigned int,
-		crypto_ahash_reqsize(auth) + ctx->reqoff +
-		sizeof(struct authenc_request_ctx) +
+	ctx->reqoff = ALIGN(2 * crypto_ahash_digestsize(auth) +
+			    crypto_ahash_alignmask(auth),
+			    crypto_ahash_alignmask(auth) + 1) +
+		      crypto_ablkcipher_ivsize(enc);
+
+	tfm->crt_aead.reqsize = sizeof(struct authenc_request_ctx) +
+				ctx->reqoff +
+				max_t(unsigned int,
+				      crypto_ahash_reqsize(auth) +
 				      sizeof(struct ahash_request),
 				      sizeof(struct skcipher_givcrypt_request) +
-		crypto_ablkcipher_reqsize(enc) +
-		crypto_ablkcipher_ivsize(enc));
+				      crypto_ablkcipher_reqsize(enc));
 
 	return 0;
 
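
The reworked reqsize/reqoff computation above lays the request context out
as: the authenc_request_ctx (whose tail[] the offsets are measured from),
then reqoff bytes (two aligned hash digests plus the cipher IV), then
whichever sub-request is larger. crypto_authenc_encrypt() recovers the
sub-request and IV from exactly that layout. A host-side sketch with
invented sizes (20-byte digest, alignmask 3, 16-byte IV) mirrors the new
ctx->reqoff formula; it is an illustration, not kernel code.

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned int digestsize = 20, alignmask = 3, ivsize = 16;

        /* room for two aligned digests, then the IV that sits
         * immediately before the ablkcipher request */
        unsigned int reqoff = ALIGN(2 * digestsize + alignmask,
                                    alignmask + 1) + ivsize;

        printf("reqoff = %u\n", reqoff); /* 44 + 16 = 60 */
        return 0;
}
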
diff --git a/crypto/md5.c b/crypto/md5.c
index 9fda213a592e..30efc7dad891 100644
--- a/crypto/md5.c
+++ b/crypto/md5.c
@@ -234,6 +234,7 @@ static struct shash_alg alg = {
 	.export		=	md5_export,
 	.import		=	md5_import,
 	.descsize	=	sizeof(struct md5_state),
+	.statesize	=	sizeof(struct md5_state),
 	.base		=	{
 		.cra_name	=	"md5",
 		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 22bc7435d913..d2f37a5516c7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks used to prevent allocation
 	 * essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__,
-			   "Checksum bad for group %u", block_group);
+		ext4_error(sb, "Checksum bad for group %u",
+			   block_group);
 		ext4_free_blks_set(sb, gdp, 0);
 		ext4_free_inodes_set(sb, gdp, 0);
 		ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * to make sure we calculate the right free blocks
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
-			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
+			ext4_group_first_block_no(sb, ngroups - 1);
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * when a file system is mounted (see ext4_fill_super).
  */
 
-
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
 /**
  * ext4_get_group_desc() -- load group descriptor from disk
  * @sb:			super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= ngroups) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "block_group >= groups_count - "
-			   "block_group = %u, groups_count = %u",
-			   block_group, ngroups);
+		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+			   " groups_count = %u", block_group, ngroups);
 
 		return NULL;
 	}
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "Group descriptor not loaded - "
+		ext4_error(sb, "Group descriptor not loaded - "
 			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
 		return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
 	return 1;
 
 err_out:
-	ext4_error(sb, __func__,
-		   "Invalid block bitmap - "
-		   "block_group = %d, block = %llu",
+	ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
 		   block_group, bitmap_blk);
 	return 0;
 }
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, __func__,
-			   "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot read block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
 			   block_group, bitmap_blk);
 		return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
-		ext4_error(sb, __func__,
-			   "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot read block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
 			   block_group, bitmap_blk);
 		return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
420 in_range(block + count - 1, ext4_inode_table(sb, desc), 409 in_range(block + count - 1, ext4_inode_table(sb, desc),
421 sbi->s_itb_per_group)) { 410 sbi->s_itb_per_group)) {
422 ext4_error(sb, __func__, 411 ext4_error(sb, "Adding blocks in system zones - "
423 "Adding blocks in system zones - "
424 "Block = %llu, count = %lu", 412 "Block = %llu, count = %lu",
425 block, count); 413 block, count);
426 goto error_return; 414 goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
453 BUFFER_TRACE(bitmap_bh, "clear bit"); 441 BUFFER_TRACE(bitmap_bh, "clear bit");
454 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 442 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
455 bit + i, bitmap_bh->b_data)) { 443 bit + i, bitmap_bh->b_data)) {
456 ext4_error(sb, __func__, 444 ext4_error(sb, "bit already cleared for block %llu",
457 "bit already cleared for block %llu",
458 (ext4_fsblk_t)(block + i)); 445 (ext4_fsblk_t)(block + i));
459 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 446 BUFFER_TRACE(bitmap_bh, "bit already cleared");
460 } else { 447 } else {
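
Every balloc.c hunk above is mechanical fallout from the reworked error helpers declared later in this diff: ext4_error() becomes a macro that splices __func__ in at the call site, so callers stop passing the function name by hand. A self-contained model of that macro pattern (my_error/__my_error are hypothetical stand-ins, the superblock argument is dropped for brevity, and "args..." is the GNU named-variadic syntax the kernel itself uses):

#include <stdarg.h>
#include <stdio.h>

static void __my_error(const char *func, const char *fmt, ...)
{
        va_list args;

        fprintf(stderr, "error (%s): ", func);
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
        fputc('\n', stderr);
}

/* As in the new ext4_error(): the macro splices __func__ in at each call
 * site, and "## args" swallows the comma when no varargs are supplied. */
#define my_error(fmt, args...) __my_error(__func__, fmt, ## args)

int main(void)
{
        my_error("Checksum bad for group %u", 42u);
        return 0;
}
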
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index a60ab9aad57d..983f0e127493 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb)
205 entry = rb_entry(n, struct ext4_system_zone, node); 205 entry = rb_entry(n, struct ext4_system_zone, node);
206 kmem_cache_free(ext4_system_zone_cachep, entry); 206 kmem_cache_free(ext4_system_zone_cachep, entry);
207 if (!parent) 207 if (!parent)
208 EXT4_SB(sb)->system_blks.rb_node = NULL; 208 EXT4_SB(sb)->system_blks = RB_ROOT;
209 else if (parent->rb_left == n) 209 else if (parent->rb_left == n)
210 parent->rb_left = NULL; 210 parent->rb_left = NULL;
211 else if (parent->rb_right == n) 211 else if (parent->rb_right == n)
212 parent->rb_right = NULL; 212 parent->rb_right = NULL;
213 n = parent; 213 n = parent;
214 } 214 }
215 EXT4_SB(sb)->system_blks.rb_node = NULL; 215 EXT4_SB(sb)->system_blks = RB_ROOT;
216} 216}
217 217
218/* 218/*
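
Assigning RB_ROOT resets the whole struct rb_root by value rather than reaching into its rb_node member, which stays correct even if the struct ever grows another field. A toy model of the idiom (the two structs are abbreviated; the kernel's RB_ROOT is the same compound-literal idea):

#include <stdio.h>

struct rb_node { struct rb_node *left, *right; };
struct rb_root { struct rb_node *rb_node; };

/* Compound-literal initializer, like the kernel's
 * #define RB_ROOT (struct rb_root) { NULL, } */
#define RB_ROOT ((struct rb_root) { NULL, })

int main(void)
{
        struct rb_root root = RB_ROOT;
        /* ... tree gets populated, then torn down node by node ... */
        root = RB_ROOT;         /* reset by struct assignment */
        printf("empty: %d\n", root.rb_node == NULL);
        return 0;
}
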
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..86cb6d86a048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 ext4_error(dir->i_sb, function, 86 __ext4_error(dir->i_sb, function,
87 "bad entry in directory #%lu: %s - " 87 "bad entry in directory #%lu: %s - block=%llu"
88 "offset=%u, inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, offset, 89 dir->i_ino, error_msg,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset,
90 le32_to_cpu(de->inode), 92 le32_to_cpu(de->inode),
91 rlen, de->name_len); 93 rlen, de->name_len);
92 return error_msg == NULL ? 1 : 0; 94 return error_msg == NULL ? 1 : 0;
@@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp,
150 */ 152 */
151 if (!bh) { 153 if (!bh) {
152 if (!dir_has_error) { 154 if (!dir_has_error) {
153 ext4_error(sb, __func__, "directory #%lu " 155 ext4_error(sb, "directory #%lu "
154 "contains a hole at offset %Lu", 156 "contains a hole at offset %Lu",
155 inode->i_ino, 157 inode->i_ino,
156 (unsigned long long) filp->f_pos); 158 (unsigned long long) filp->f_pos);
@@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
303 kfree(old); 305 kfree(old);
304 } 306 }
305 if (!parent) 307 if (!parent)
306 root->rb_node = NULL; 308 *root = RB_ROOT;
307 else if (parent->rb_left == n) 309 else if (parent->rb_left == n)
308 parent->rb_left = NULL; 310 parent->rb_left = NULL;
309 else if (parent->rb_right == n) 311 else if (parent->rb_right == n)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4cedc91ec59d..6e5787a29b90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,6 +53,12 @@
53#define ext4_debug(f, a...) do {} while (0) 53#define ext4_debug(f, a...) do {} while (0)
54#endif 54#endif
55 55
56#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a);
58
59#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a);
61
56/* data type for block offset of block group */ 62/* data type for block offset of block group */
57typedef int ext4_grpblk_t; 63typedef int ext4_grpblk_t;
58 64
@@ -133,14 +139,14 @@ struct mpage_da_data {
133 int pages_written; 139 int pages_written;
134 int retval; 140 int retval;
135}; 141};
136#define DIO_AIO_UNWRITTEN 0x1 142#define EXT4_IO_UNWRITTEN 0x1
137typedef struct ext4_io_end { 143typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */ 144 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */ 145 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */ 146 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */ 147 struct page *page; /* page struct for buffer write */
142 ext4_lblk_t offset; /* offset in the file */ 148 loff_t offset; /* offset in the file */
143 size_t size; /* size of the extent */ 149 ssize_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */ 150 struct work_struct work; /* data work queue */
145} ext4_io_end_t; 151} ext4_io_end_t;
146 152
@@ -284,10 +290,12 @@ struct flex_groups {
284#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 290#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
285#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ 291#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
286#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 292#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
293#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
294#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
287#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 295#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
288 296
289#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 297#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
290#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ 298#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
291 299
292/* Flags that should be inherited by new inodes from their parent. */ 300/* Flags that should be inherited by new inodes from their parent. */
293#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ 301#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
313 return flags & EXT4_OTHER_FLMASK; 321 return flags & EXT4_OTHER_FLMASK;
314} 322}
315 323
316/*
317 * Inode dynamic state flags
318 */
319#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
320#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
321#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
322#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
323#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
324#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
325#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
326
327/* Used to pass group descriptor data when online resize is done */ 324/* Used to pass group descriptor data when online resize is done */
328struct ext4_new_group_input { 325struct ext4_new_group_input {
329 __u32 group; /* Group number for this data */ 326 __u32 group; /* Group number for this data */
@@ -364,19 +361,20 @@ struct ext4_new_group_data {
364 /* caller is from the direct IO path, request the creation of an 361 /* caller is from the direct IO path, request the creation of an
365 uninitialized extent if not allocated, split the uninitialized 362 uninitialized extent if not allocated, split the uninitialized
366 extent if blocks have been preallocated already*/ 363 extent if blocks have been preallocated already*/
367#define EXT4_GET_BLOCKS_DIO 0x0008 364#define EXT4_GET_BLOCKS_PRE_IO 0x0008
368#define EXT4_GET_BLOCKS_CONVERT 0x0010 365#define EXT4_GET_BLOCKS_CONVERT 0x0010
369#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ 366#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
367 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
368 /* Convert extent to initialized after IO complete */
369#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
370 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) 370 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
371 /* Convert extent to initialized after direct IO complete */
372#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
373 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
374 371
375/* 372/*
376 * Flags used by ext4_free_blocks 373 * Flags used by ext4_free_blocks
377 */ 374 */
378#define EXT4_FREE_BLOCKS_METADATA 0x0001 375#define EXT4_FREE_BLOCKS_METADATA 0x0001
379#define EXT4_FREE_BLOCKS_FORGET 0x0002 376#define EXT4_FREE_BLOCKS_FORGET 0x0002
377#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
380 378
381/* 379/*
382 * ioctl commands 380 * ioctl commands
@@ -630,7 +628,7 @@ struct ext4_inode_info {
630 * near to their parent directory's inode. 628 * near to their parent directory's inode.
631 */ 629 */
632 ext4_group_t i_block_group; 630 ext4_group_t i_block_group;
633 __u32 i_state; /* Dynamic state flags for ext4 */ 631 unsigned long i_state_flags; /* Dynamic state flags */
634 632
635 ext4_lblk_t i_dir_start_lookup; 633 ext4_lblk_t i_dir_start_lookup;
636#ifdef CONFIG_EXT4_FS_XATTR 634#ifdef CONFIG_EXT4_FS_XATTR
@@ -708,8 +706,9 @@ struct ext4_inode_info {
708 qsize_t i_reserved_quota; 706 qsize_t i_reserved_quota;
709#endif 707#endif
710 708
711 /* completed async DIOs that might need unwritten extents handling */ 709 /* completed IOs that might need unwritten extents handling */
712 struct list_head i_aio_dio_complete_list; 710 struct list_head i_completed_io_list;
711 spinlock_t i_completed_io_lock;
713 /* current io_end structure for async DIO write*/ 712 /* current io_end structure for async DIO write*/
714 ext4_io_end_t *cur_aio_dio; 713 ext4_io_end_t *cur_aio_dio;
715 714
@@ -760,6 +759,7 @@ struct ext4_inode_info {
760#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 759#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
761#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 760#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
762#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 761#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
762#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
763#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 763#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
764#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 764#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
765#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 765#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -1050,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1050 (ino >= EXT4_FIRST_INO(sb) && 1050 (ino >= EXT4_FIRST_INO(sb) &&
1051 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); 1051 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
1052} 1052}
1053
1054/*
1055 * Inode dynamic state flags
1056 */
1057enum {
1058 EXT4_STATE_JDATA, /* journaled data exists */
1059 EXT4_STATE_NEW, /* inode is newly created */
1060 EXT4_STATE_XATTR, /* has in-inode xattrs */
1061 EXT4_STATE_NO_EXPAND, /* No space for expansion */
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1065};
1066
1067static inline int ext4_test_inode_state(struct inode *inode, int bit)
1068{
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags);
1070}
1071
1072static inline void ext4_set_inode_state(struct inode *inode, int bit)
1073{
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags);
1075}
1076
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit)
1078{
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1053#else 1081#else
1054/* Assume that user mode programs are passing in an ext4fs superblock, not 1082/* Assume that user mode programs are passing in an ext4fs superblock, not
1055 * a kernel struct super_block. This will allow us to call the feature-test 1083 * a kernel struct super_block. This will allow us to call the feature-test
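
Turning the i_state mask into bit numbers on an unsigned long is what lets the helpers above lean on set_bit()/clear_bit()/test_bit(), which are atomic, instead of racy read-modify-write on a __u32. A non-atomic userspace model of the same interface (bit names and helpers here are illustrative only):

#include <stdio.h>

/* Bit numbers mirroring the new enum (positions, not masks). */
enum { STATE_JDATA, STATE_NEW, STATE_XATTR };

/* Non-atomic stand-ins; the kernel's set_bit()/clear_bit()/test_bit()
 * operate atomically on the unsigned long i_state_flags word. */
static void set_state(unsigned long *flags, int bit)   { *flags |=  1UL << bit; }
static void clear_state(unsigned long *flags, int bit) { *flags &= ~(1UL << bit); }
static int  test_state(unsigned long *flags, int bit)  { return (*flags >> bit) & 1; }

int main(void)
{
        unsigned long i_state_flags = 0;

        set_state(&i_state_flags, STATE_NEW);   /* was: i_state |= EXT4_STATE_NEW */
        printf("new? %d\n", test_state(&i_state_flags, STATE_NEW));
        clear_state(&i_state_flags, STATE_NEW); /* was: i_state &= ~EXT4_STATE_NEW */
        printf("new? %d\n", test_state(&i_state_flags, STATE_NEW));
        return 0;
}
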
@@ -1126,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1126#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 1154#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
1127#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 1155#define EXT4_FEATURE_INCOMPAT_MMP 0x0100
1128#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 1156#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
1157#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
1158#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
1129 1159
1130#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR 1160#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
1131#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 1161#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1439,7 +1469,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1439 struct address_space *mapping, loff_t from); 1469 struct address_space *mapping, loff_t from);
1440extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1470extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1441extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1471extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1442extern int flush_aio_dio_completed_IO(struct inode *inode); 1472extern int flush_completed_IO(struct inode *inode);
1443extern void ext4_da_update_reserve_space(struct inode *inode, 1473extern void ext4_da_update_reserve_space(struct inode *inode,
1444 int used, int quota_claim); 1474 int used, int quota_claim);
1445/* ioctl.c */ 1475/* ioctl.c */
@@ -1465,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb,
1465 ext4_fsblk_t n_blocks_count); 1495 ext4_fsblk_t n_blocks_count);
1466 1496
1467/* super.c */ 1497/* super.c */
1468extern void ext4_error(struct super_block *, const char *, const char *, ...) 1498extern void __ext4_error(struct super_block *, const char *, const char *, ...)
1499 __attribute__ ((format (printf, 3, 4)));
1500#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message)
1501extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
1502 __attribute__ ((format (printf, 3, 4)));
1503extern void ext4_error_file(const char *, struct file *, const char *, ...)
1469 __attribute__ ((format (printf, 3, 4))); 1504 __attribute__ ((format (printf, 3, 4)));
1470extern void __ext4_std_error(struct super_block *, const char *, int); 1505extern void __ext4_std_error(struct super_block *, const char *, int);
1471extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1506extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1472 __attribute__ ((format (printf, 3, 4))); 1507 __attribute__ ((format (printf, 3, 4)));
1473extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1508extern void __ext4_warning(struct super_block *, const char *,
1509 const char *, ...)
1474 __attribute__ ((format (printf, 3, 4))); 1510 __attribute__ ((format (printf, 3, 4)));
1511#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message)
1475extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1512extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1476 __attribute__ ((format (printf, 3, 4))); 1513 __attribute__ ((format (printf, 3, 4)));
1477extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1514extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
@@ -1744,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *);
1744extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1781extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1745 loff_t len); 1782 loff_t len);
1746extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1747 loff_t len); 1784 ssize_t len);
1748extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1749 sector_t block, unsigned int max_blocks, 1786 sector_t block, unsigned int max_blocks,
1750 struct buffer_head *bh, int flags); 1787 struct buffer_head *bh, int flags);
@@ -1756,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1756 __u64 len, __u64 *moved_len); 1793 __u64 len, __u64 *moved_len);
1757 1794
1758 1795
1796/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
1797enum ext4_state_bits {
1798 BH_Uninit /* blocks are allocated but uninitialized on disk */
1799 = BH_JBDPrivateStart,
1800};
1801
1802BUFFER_FNS(Uninit, uninit)
1803TAS_BUFFER_FNS(Uninit, uninit)
1804
1759/* 1805/*
1760 * Add new method to test wether block and inode bitmaps are properly 1806 * Add new method to test wether block and inode bitmaps are properly
1761 * initialized. With uninit_bg reading the block from disk is not enough 1807 * initialized. With uninit_bg reading the block from disk is not enough
@@ -1773,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
1773 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); 1819 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1774} 1820}
1775 1821
1822#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
1823
1776#endif /* __KERNEL__ */ 1824#endif /* __KERNEL__ */
1777 1825
1778#endif /* _EXT4_H */ 1826#endif /* _EXT4_H */
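
in_range() moves into ext4.h so extents.c can reuse it (two call sites appear later in this diff). Both bounds are inclusive, which is the easy thing to get wrong by one; a quick runnable check:

#include <stdio.h>

#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
        /* A 3-block run starting at block 100 covers blocks 100..102. */
        printf("%d %d %d\n",
               in_range(100, 100, 3),   /* 1: first block of the run */
               in_range(102, 100, 3),   /* 1: last block of the run */
               in_range(103, 100, 3));  /* 0: one past the end */
        return 0;
}
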
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b57e5c711b6d..53d2764d71ca 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
125 ext4_journal_abort_handle(where, __func__, bh, 125 ext4_journal_abort_handle(where, __func__, bh,
126 handle, err); 126 handle, err);
127 } else { 127 } else {
128 if (inode && bh) 128 if (inode)
129 mark_buffer_dirty_inode(bh, inode); 129 mark_buffer_dirty_inode(bh, inode);
130 else 130 else
131 mark_buffer_dirty(bh); 131 mark_buffer_dirty(bh);
132 if (inode && inode_needs_sync(inode)) { 132 if (inode && inode_needs_sync(inode)) {
133 sync_dirty_buffer(bh); 133 sync_dirty_buffer(bh);
134 if (buffer_req(bh) && !buffer_uptodate(bh)) { 134 if (buffer_req(bh) && !buffer_uptodate(bh)) {
135 ext4_error(inode->i_sb, __func__, 135 ext4_error(inode->i_sb,
136 "IO error syncing inode, " 136 "IO error syncing inode, "
137 "inode=%lu, block=%llu", 137 "inode=%lu, block=%llu",
138 inode->i_ino, 138 inode->i_ino,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d704..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
304 return 0; 304 return 0;
305} 305}
306 306
307/*
308 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is
312 * enabled, since the dioread_nolock code uses b_private to pass
313 * information back to the I/O completion handler, and this conflicts
314 * with the jbd's use of b_private.
315 */
316static inline int ext4_should_dioread_nolock(struct inode *inode)
317{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode))
323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
325 return 0;
326 if (ext4_should_journal_data(inode))
327 return 0;
328 return 1;
329}
330
307#endif /* _EXT4_JBD2_H */ 331#endif /* _EXT4_JBD2_H */
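
ext4_should_dioread_nolock() is a pure predicate over mount options and inode properties, so its logic can be lifted out and exercised directly. A userspace sketch, with each early return tied to a reason from the comment above (the function and parameter names are invented for illustration):

#include <stdio.h>

/* Userspace model of the predicate above: every condition must hold,
 * and each early return matches one reason given in the comment. */
static int should_dioread_nolock(int opt_dioread_nolock, int opt_nobh,
                                 int is_regular, int uses_extents,
                                 int journals_data)
{
        if (!opt_dioread_nolock)
                return 0;       /* mount option not set */
        if (opt_nobh)
                return 0;       /* doesn't work for nobh */
        if (!is_regular)
                return 0;       /* only regular files */
        if (!uses_extents)
                return 0;       /* only extent-mapped files */
        if (journals_data)
                return 0;       /* jbd2 also uses b_private */
        return 1;
}

int main(void)
{
        printf("%d\n", should_dioread_nolock(1, 0, 1, 1, 0));  /* 1 */
        printf("%d\n", should_dioread_nolock(1, 1, 1, 1, 0));  /* 0 */
        return 0;
}
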
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 765a4826b118..94c8ee81f5e1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
195 if (S_ISREG(inode->i_mode)) 195 if (S_ISREG(inode->i_mode))
196 block_group++; 196 block_group++;
197 } 197 }
198 bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 198 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
199 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
200 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 199 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
201 200
202 /* 201 /*
@@ -440,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
440 return 0; 439 return 0;
441 440
442corrupted: 441corrupted:
443 ext4_error(inode->i_sb, function, 442 __ext4_error(inode->i_sb, function,
444 "bad header/extent in inode #%lu: %s - magic %x, " 443 "bad header/extent in inode #%lu: %s - magic %x, "
445 "entries %u, max %u(%u), depth %u(%u)", 444 "entries %u, max %u(%u), depth %u(%u)",
446 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -703,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
703 } 702 }
704 eh = ext_block_hdr(bh); 703 eh = ext_block_hdr(bh);
705 ppos++; 704 ppos++;
706 BUG_ON(ppos > depth); 705 if (unlikely(ppos > depth)) {
706 put_bh(bh);
707 EXT4_ERROR_INODE(inode,
708 "ppos %d > depth %d", ppos, depth);
709 goto err;
710 }
707 path[ppos].p_bh = bh; 711 path[ppos].p_bh = bh;
708 path[ppos].p_hdr = eh; 712 path[ppos].p_hdr = eh;
709 i--; 713 i--;
@@ -749,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
749 if (err) 753 if (err)
750 return err; 754 return err;
751 755
752 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); 756 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
757 EXT4_ERROR_INODE(inode,
758 "logical %d == ei_block %d!",
759 logical, le32_to_cpu(curp->p_idx->ei_block));
760 return -EIO;
761 }
753 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; 762 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
754 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 763 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
755 /* insert after */ 764 /* insert after */
@@ -779,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
779 ext4_idx_store_pblock(ix, ptr); 788 ext4_idx_store_pblock(ix, ptr);
780 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 789 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
781 790
782 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) 791 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
783 > le16_to_cpu(curp->p_hdr->eh_max)); 792 > le16_to_cpu(curp->p_hdr->eh_max))) {
784 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); 793 EXT4_ERROR_INODE(inode,
794 "logical %d == ei_block %d!",
795 logical, le32_to_cpu(curp->p_idx->ei_block));
796 return -EIO;
797 }
798 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
799 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
800 return -EIO;
801 }
785 802
786 err = ext4_ext_dirty(handle, inode, curp); 803 err = ext4_ext_dirty(handle, inode, curp);
787 ext4_std_error(inode->i_sb, err); 804 ext4_std_error(inode->i_sb, err);
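
This hunk establishes the pattern repeated through the rest of extents.c in this patch: an on-disk inconsistency that used to BUG_ON(), halting the machine, now logs through EXT4_ERROR_INODE() and fails only the current operation with -EIO. A miniature of the transformation (report_corruption and insert_index are hypothetical stand-ins):

#include <stdio.h>

#define EIO 5

/* Hypothetical stand-in for EXT4_ERROR_INODE(): report and keep running. */
#define report_corruption(fmt, args...) \
        fprintf(stderr, "corrupt extent tree: " fmt "\n", ## args)

static int insert_index(int logical, int ei_block)
{
        /* Before: BUG_ON(logical == ei_block) took the whole machine down.
         * After: report the inconsistency and fail only this operation. */
        if (logical == ei_block) {
                report_corruption("logical %d == ei_block %d!", logical, ei_block);
                return -EIO;
        }
        /* ... the real insertion work would go here ... */
        return 0;
}

int main(void)
{
        printf("%d\n", insert_index(7, 7));     /* -5, not a panic */
        return 0;
}
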
@@ -819,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
819 836
820 /* if current leaf will be split, then we should use 837 /* if current leaf will be split, then we should use
821 * border from split point */ 838 * border from split point */
822 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); 839 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
840 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
841 return -EIO;
842 }
823 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 843 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
824 border = path[depth].p_ext[1].ee_block; 844 border = path[depth].p_ext[1].ee_block;
825 ext_debug("leaf will be split." 845 ext_debug("leaf will be split."
@@ -860,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
860 880
861 /* initialize new leaf */ 881 /* initialize new leaf */
862 newblock = ablocks[--a]; 882 newblock = ablocks[--a];
863 BUG_ON(newblock == 0); 883 if (unlikely(newblock == 0)) {
884 EXT4_ERROR_INODE(inode, "newblock == 0!");
885 err = -EIO;
886 goto cleanup;
887 }
864 bh = sb_getblk(inode->i_sb, newblock); 888 bh = sb_getblk(inode->i_sb, newblock);
865 if (!bh) { 889 if (!bh) {
866 err = -EIO; 890 err = -EIO;
@@ -880,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
880 ex = EXT_FIRST_EXTENT(neh); 904 ex = EXT_FIRST_EXTENT(neh);
881 905
882 /* move remainder of path[depth] to the new leaf */ 906 /* move remainder of path[depth] to the new leaf */
883 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); 907 if (unlikely(path[depth].p_hdr->eh_entries !=
908 path[depth].p_hdr->eh_max)) {
909 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
910 path[depth].p_hdr->eh_entries,
911 path[depth].p_hdr->eh_max);
912 err = -EIO;
913 goto cleanup;
914 }
884 /* start copy from next extent */ 915 /* start copy from next extent */
885 /* TODO: we could do it by single memmove */ 916 /* TODO: we could do it by single memmove */
886 m = 0; 917 m = 0;
@@ -927,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
927 958
928 /* create intermediate indexes */ 959 /* create intermediate indexes */
929 k = depth - at - 1; 960 k = depth - at - 1;
930 BUG_ON(k < 0); 961 if (unlikely(k < 0)) {
962 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
963 err = -EIO;
964 goto cleanup;
965 }
931 if (k) 966 if (k)
932 ext_debug("create %d intermediate indices\n", k); 967 ext_debug("create %d intermediate indices\n", k);
933 /* insert new index into current index block */ 968 /* insert new index into current index block */
@@ -964,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
964 999
965 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
966 EXT_MAX_INDEX(path[i].p_hdr)); 1001 EXT_MAX_INDEX(path[i].p_hdr));
967 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != 1002 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
968 EXT_LAST_INDEX(path[i].p_hdr)); 1003 EXT_LAST_INDEX(path[i].p_hdr))) {
1004 EXT4_ERROR_INODE(inode,
1005 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1006 le32_to_cpu(path[i].p_ext->ee_block));
1007 err = -EIO;
1008 goto cleanup;
1009 }
969 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1010 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
970 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1011 ext_debug("%d: move %d:%llu in new index %llu\n", i,
971 le32_to_cpu(path[i].p_idx->ei_block), 1012 le32_to_cpu(path[i].p_idx->ei_block),
@@ -1203,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1203 struct ext4_extent *ex; 1244 struct ext4_extent *ex;
1204 int depth, ee_len; 1245 int depth, ee_len;
1205 1246
1206 BUG_ON(path == NULL); 1247 if (unlikely(path == NULL)) {
1248 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1249 return -EIO;
1250 }
1207 depth = path->p_depth; 1251 depth = path->p_depth;
1208 *phys = 0; 1252 *phys = 0;
1209 1253
@@ -1217,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1217 ex = path[depth].p_ext; 1261 ex = path[depth].p_ext;
1218 ee_len = ext4_ext_get_actual_len(ex); 1262 ee_len = ext4_ext_get_actual_len(ex);
1219 if (*logical < le32_to_cpu(ex->ee_block)) { 1263 if (*logical < le32_to_cpu(ex->ee_block)) {
1220 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1264 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1265 EXT4_ERROR_INODE(inode,
1266 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1267 *logical, le32_to_cpu(ex->ee_block));
1268 return -EIO;
1269 }
1221 while (--depth >= 0) { 1270 while (--depth >= 0) {
1222 ix = path[depth].p_idx; 1271 ix = path[depth].p_idx;
1223 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1272 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1273 EXT4_ERROR_INODE(inode,
1274 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1275 ix != NULL ? ix->ei_block : 0,
1276 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1277 EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
1278 depth);
1279 return -EIO;
1280 }
1224 } 1281 }
1225 return 0; 1282 return 0;
1226 } 1283 }
1227 1284
1228 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1285 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1286 EXT4_ERROR_INODE(inode,
1287 "logical %d < ee_block %d + ee_len %d!",
1288 *logical, le32_to_cpu(ex->ee_block), ee_len);
1289 return -EIO;
1290 }
1229 1291
1230 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1292 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1231 *phys = ext_pblock(ex) + ee_len - 1; 1293 *phys = ext_pblock(ex) + ee_len - 1;
@@ -1251,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1251 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1313 int depth; /* Note, NOT eh_depth; depth from top of tree */
1252 int ee_len; 1314 int ee_len;
1253 1315
1254 BUG_ON(path == NULL); 1316 if (unlikely(path == NULL)) {
1317 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1318 return -EIO;
1319 }
1255 depth = path->p_depth; 1320 depth = path->p_depth;
1256 *phys = 0; 1321 *phys = 0;
1257 1322
@@ -1265,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1265 ex = path[depth].p_ext; 1330 ex = path[depth].p_ext;
1266 ee_len = ext4_ext_get_actual_len(ex); 1331 ee_len = ext4_ext_get_actual_len(ex);
1267 if (*logical < le32_to_cpu(ex->ee_block)) { 1332 if (*logical < le32_to_cpu(ex->ee_block)) {
1268 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1333 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1334 EXT4_ERROR_INODE(inode,
1335 "first_extent(path[%d].p_hdr) != ex",
1336 depth);
1337 return -EIO;
1338 }
1269 while (--depth >= 0) { 1339 while (--depth >= 0) {
1270 ix = path[depth].p_idx; 1340 ix = path[depth].p_idx;
1271 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1341 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1342 EXT4_ERROR_INODE(inode,
1343 "ix != EXT_FIRST_INDEX *logical %d!",
1344 *logical);
1345 return -EIO;
1346 }
1272 } 1347 }
1273 *logical = le32_to_cpu(ex->ee_block); 1348 *logical = le32_to_cpu(ex->ee_block);
1274 *phys = ext_pblock(ex); 1349 *phys = ext_pblock(ex);
1275 return 0; 1350 return 0;
1276 } 1351 }
1277 1352
1278 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1353 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1354 EXT4_ERROR_INODE(inode,
1355 "logical %d < ee_block %d + ee_len %d!",
1356 *logical, le32_to_cpu(ex->ee_block), ee_len);
1357 return -EIO;
1358 }
1279 1359
1280 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1360 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1281 /* next allocated block in this leaf */ 1361 /* next allocated block in this leaf */
@@ -1414,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1414 1494
1415 eh = path[depth].p_hdr; 1495 eh = path[depth].p_hdr;
1416 ex = path[depth].p_ext; 1496 ex = path[depth].p_ext;
1417 BUG_ON(ex == NULL); 1497
1418 BUG_ON(eh == NULL); 1498 if (unlikely(ex == NULL || eh == NULL)) {
1499 EXT4_ERROR_INODE(inode,
1500 "ex %p == NULL or eh %p == NULL", ex, eh);
1501 return -EIO;
1502 }
1419 1503
1420 if (depth == 0) { 1504 if (depth == 0) {
1421 /* there is no tree at all */ 1505 /* there is no tree at all */
@@ -1538,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1538 merge_done = 1; 1622 merge_done = 1;
1539 WARN_ON(eh->eh_entries == 0); 1623 WARN_ON(eh->eh_entries == 0);
1540 if (!eh->eh_entries) 1624 if (!eh->eh_entries)
1541 ext4_error(inode->i_sb, "ext4_ext_try_to_merge", 1625 ext4_error(inode->i_sb,
1542 "inode#%lu, eh->eh_entries = 0!", inode->i_ino); 1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1543 } 1628 }
1544 1629
1545 return merge_done; 1630 return merge_done;
@@ -1612,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1612 ext4_lblk_t next; 1697 ext4_lblk_t next;
1613 unsigned uninitialized = 0; 1698 unsigned uninitialized = 0;
1614 1699
1615 BUG_ON(ext4_ext_get_actual_len(newext) == 0); 1700 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1701 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1702 return -EIO;
1703 }
1616 depth = ext_depth(inode); 1704 depth = ext_depth(inode);
1617 ex = path[depth].p_ext; 1705 ex = path[depth].p_ext;
1618 BUG_ON(path[depth].p_hdr == NULL); 1706 if (unlikely(path[depth].p_hdr == NULL)) {
1707 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1708 return -EIO;
1709 }
1619 1710
1620 /* try to insert block into found extent and return */ 1711 /* try to insert block into found extent and return */
1621 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1712 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1622 && ext4_can_extents_be_merged(inode, ex, newext)) { 1713 && ext4_can_extents_be_merged(inode, ex, newext)) {
1623 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1714 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1624 ext4_ext_is_uninitialized(newext), 1715 ext4_ext_is_uninitialized(newext),
@@ -1739,7 +1830,7 @@ has_space:
1739 1830
1740merge: 1831merge:
1741 /* try to merge extents to the right */ 1832 /* try to merge extents to the right */
1742 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1833 if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1743 ext4_ext_try_to_merge(inode, path, nearex); 1834 ext4_ext_try_to_merge(inode, path, nearex);
1744 1835
1745 /* try to merge extents to the left */ 1836 /* try to merge extents to the left */
@@ -1787,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1787 } 1878 }
1788 1879
1789 depth = ext_depth(inode); 1880 depth = ext_depth(inode);
1790 BUG_ON(path[depth].p_hdr == NULL); 1881 if (unlikely(path[depth].p_hdr == NULL)) {
1882 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1883 err = -EIO;
1884 break;
1885 }
1791 ex = path[depth].p_ext; 1886 ex = path[depth].p_ext;
1792 next = ext4_ext_next_allocated_block(path); 1887 next = ext4_ext_next_allocated_block(path);
1793 1888
@@ -1838,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1838 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1933 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1839 } 1934 }
1840 1935
1841 BUG_ON(cbex.ec_len == 0); 1936 if (unlikely(cbex.ec_len == 0)) {
1937 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1938 err = -EIO;
1939 break;
1940 }
1842 err = func(inode, path, &cbex, ex, cbdata); 1941 err = func(inode, path, &cbex, ex, cbdata);
1843 ext4_ext_drop_refs(path); 1942 ext4_ext_drop_refs(path);
1844 1943
@@ -1952,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1952 2051
1953 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && 2052 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1954 cex->ec_type != EXT4_EXT_CACHE_EXTENT); 2053 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1955 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { 2054 if (in_range(block, cex->ec_block, cex->ec_len)) {
1956 ex->ee_block = cpu_to_le32(cex->ec_block); 2055 ex->ee_block = cpu_to_le32(cex->ec_block);
1957 ext4_ext_store_pblock(ex, cex->ec_start); 2056 ext4_ext_store_pblock(ex, cex->ec_start);
1958 ex->ee_len = cpu_to_le16(cex->ec_len); 2057 ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1981,7 +2080,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1981 /* free index block */ 2080 /* free index block */
1982 path--; 2081 path--;
1983 leaf = idx_pblock(path->p_idx); 2082 leaf = idx_pblock(path->p_idx);
1984 BUG_ON(path->p_hdr->eh_entries == 0); 2083 if (unlikely(path->p_hdr->eh_entries == 0)) {
2084 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2085 return -EIO;
2086 }
1985 err = ext4_ext_get_access(handle, inode, path); 2087 err = ext4_ext_get_access(handle, inode, path);
1986 if (err) 2088 if (err)
1987 return err; 2089 return err;
@@ -2119,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2119 if (!path[depth].p_hdr) 2221 if (!path[depth].p_hdr)
2120 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2222 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2121 eh = path[depth].p_hdr; 2223 eh = path[depth].p_hdr;
2122 BUG_ON(eh == NULL); 2224 if (unlikely(path[depth].p_hdr == NULL)) {
2123 2225 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2226 return -EIO;
2227 }
2124 /* find where to start removing */ 2228 /* find where to start removing */
2125 ex = EXT_LAST_EXTENT(eh); 2229 ex = EXT_LAST_EXTENT(eh);
2126 2230
@@ -2983,7 +3087,7 @@ fix_extent_len:
2983 ext4_ext_dirty(handle, inode, path + depth); 3087 ext4_ext_dirty(handle, inode, path + depth);
2984 return err; 3088 return err;
2985} 3089}
2986static int ext4_convert_unwritten_extents_dio(handle_t *handle, 3090static int ext4_convert_unwritten_extents_endio(handle_t *handle,
2987 struct inode *inode, 3091 struct inode *inode,
2988 struct ext4_ext_path *path) 3092 struct ext4_ext_path *path)
2989{ 3093{
@@ -3063,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3063 flags, allocated); 3167 flags, allocated);
3064 ext4_ext_show_leaf(inode, path); 3168 ext4_ext_show_leaf(inode, path);
3065 3169
3066 /* DIO get_block() before submit the IO, split the extent */ 3170 /* get_block() before submit the IO, split the extent */
3067 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3068 ret = ext4_split_unwritten_extents(handle, 3172 ret = ext4_split_unwritten_extents(handle,
3069 inode, path, iblock, 3173 inode, path, iblock,
3070 max_blocks, flags); 3174 max_blocks, flags);
@@ -3074,14 +3178,16 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3074 * completed 3178 * completed
3075 */ 3179 */
3076 if (io) 3180 if (io)
3077 io->flag = DIO_AIO_UNWRITTEN; 3181 io->flag = EXT4_IO_UNWRITTEN;
3078 else 3182 else
3079 EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; 3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result);
3080 goto out; 3186 goto out;
3081 } 3187 }
3082 /* async DIO end_io complete, convert the filled extent to written */ 3188 /* IO end_io complete, convert the filled extent to written */
3083 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { 3189 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3084 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3190 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3085 path); 3191 path);
3086 if (ret >= 0) 3192 if (ret >= 0)
3087 ext4_update_inode_fsync_trans(handle, inode, 1); 3193 ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3185,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3185{ 3291{
3186 struct ext4_ext_path *path = NULL; 3292 struct ext4_ext_path *path = NULL;
3187 struct ext4_extent_header *eh; 3293 struct ext4_extent_header *eh;
3188 struct ext4_extent newex, *ex; 3294 struct ext4_extent newex, *ex, *last_ex;
3189 ext4_fsblk_t newblock; 3295 ext4_fsblk_t newblock;
3190 int err = 0, depth, ret, cache_type; 3296 int err = 0, depth, ret, cache_type;
3191 unsigned int allocated = 0; 3297 unsigned int allocated = 0;
@@ -3237,10 +3343,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3237 * this situation is possible, though, _during_ tree modification; 3343 * this situation is possible, though, _during_ tree modification;
3238 * this is why assert can't be put in ext4_ext_find_extent() 3344 * this is why assert can't be put in ext4_ext_find_extent()
3239 */ 3345 */
3240 if (path[depth].p_ext == NULL && depth != 0) { 3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3241 ext4_error(inode->i_sb, __func__, "bad extent address " 3347 EXT4_ERROR_INODE(inode, "bad extent address "
3242 "inode: %lu, iblock: %d, depth: %d", 3348 "iblock: %d, depth: %d pblock %lld",
3243 inode->i_ino, iblock, depth); 3349 iblock, depth, path[depth].p_block);
3244 err = -EIO; 3350 err = -EIO;
3245 goto out2; 3351 goto out2;
3246 } 3352 }
@@ -3258,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3258 */ 3364 */
3259 ee_len = ext4_ext_get_actual_len(ex); 3365 ee_len = ext4_ext_get_actual_len(ex);
3260 /* if found extent covers block, simply return it */ 3366 /* if found extent covers block, simply return it */
3261 if (iblock >= ee_block && iblock < ee_block + ee_len) { 3367 if (in_range(iblock, ee_block, ee_len)) {
3262 newblock = iblock - ee_block + ee_start; 3368 newblock = iblock - ee_block + ee_start;
3263 /* number of remaining blocks in the extent */ 3369 /* number of remaining blocks in the extent */
3264 allocated = ee_len - (iblock - ee_block); 3370 allocated = ee_len - (iblock - ee_block);
@@ -3350,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3350 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3456 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3351 ext4_ext_mark_uninitialized(&newex); 3457 ext4_ext_mark_uninitialized(&newex);
3352 /* 3458 /*
3353 * io_end structure was created for every async 3459 * io_end structure was created for every IO write to an
3354 * direct IO write to the middle of the file. 3460 * uninitialized extent. To avoid unnecessary conversion,
3355 * To avoid unecessary convertion for every aio dio rewrite 3461 * here we flag the IO that really needs the conversion.
3356 * to the mid of file, here we flag the IO that is really
3357 * need the convertion.
3358 * For non asycn direct IO case, flag the inode state 3462 * For the non-async direct IO case, flag the inode state
3359 * that we need to perform convertion when IO is done. 3463 * that we need to perform conversion when IO is done.
3360 */ 3464 */
3361 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3362 if (io) 3466 if (io)
3363 io->flag = DIO_AIO_UNWRITTEN; 3467 io->flag = EXT4_IO_UNWRITTEN;
3364 else 3468 else
3365 EXT4_I(inode)->i_state |= 3469 ext4_set_inode_state(inode,
3366 EXT4_STATE_DIO_UNWRITTEN;; 3470 EXT4_STATE_DIO_UNWRITTEN);
3471 }
3472 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result);
3474 }
3475
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
3477 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d",
 3480 le32_to_cpu(ex->ee_block));
3481 err = -EIO;
3482 goto out2;
3367 } 3483 }
3484 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
3486 + ext4_ext_get_actual_len(last_ex))
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
3368 } 3488 }
3369 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3370 if (err) { 3490 if (err) {
@@ -3499,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
3499 i_size_write(inode, new_size); 3619 i_size_write(inode, new_size);
3500 if (new_size > EXT4_I(inode)->i_disksize) 3620 if (new_size > EXT4_I(inode)->i_disksize)
3501 ext4_update_i_disksize(inode, new_size); 3621 ext4_update_i_disksize(inode, new_size);
3622 } else {
3623 /*
3624 * Mark that we allocate beyond EOF so the subsequent truncate
3625 * can proceed even if the new size is the same as i_size.
3626 */
3627 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
3502 } 3629 }
3503 3630
3504} 3631}
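
EXT4_EOFBLOCKS_FL records that a KEEP_SIZE-style fallocate left allocated blocks past i_size, so a later truncate to the unchanged size still has something to trim. A sketch of just that set/don't-set decision (the helper and its arguments are invented for illustration):

#include <stdio.h>

/* Model of the decision above: after a KEEP_SIZE fallocate, allocated
 * blocks may extend past i_size, and EOFBLOCKS_FL remembers that. */
static int eofblocks_after_falloc(long long i_size, long long new_size,
                                  int keep_size)
{
        if (!keep_size)
                return 0;               /* i_size grows to cover the blocks */
        return new_size > i_size;       /* allocation went beyond EOF */
}

int main(void)
{
        /* 4 KiB file, 1 MiB fallocate with KEEP_SIZE: the flag is set. */
        printf("%d\n", eofblocks_after_falloc(4096, 1 << 20, 1));
        /* Same call without KEEP_SIZE: i_size would be updated instead. */
        printf("%d\n", eofblocks_after_falloc(4096, 1 << 20, 0));
        return 0;
}
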
@@ -3603,7 +3730,7 @@ retry:
3603 * Returns 0 on success. 3730 * Returns 0 on success.
3604 */ 3731 */
3605int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3732int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3606 loff_t len) 3733 ssize_t len)
3607{ 3734{
3608 handle_t *handle; 3735 handle_t *handle;
3609 ext4_lblk_t block; 3736 ext4_lblk_t block;
@@ -3635,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3635 map_bh.b_state = 0; 3762 map_bh.b_state = 0;
3636 ret = ext4_get_blocks(handle, inode, block, 3763 ret = ext4_get_blocks(handle, inode, block,
3637 max_blocks, &map_bh, 3764 max_blocks, &map_bh,
3638 EXT4_GET_BLOCKS_DIO_CONVERT_EXT); 3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3639 if (ret <= 0) { 3766 if (ret <= 0) {
3640 WARN_ON(ret <= 0); 3767 WARN_ON(ret <= 0);
3641 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3768 printk(KERN_ERR "%s: ext4_ext_get_blocks "
@@ -3739,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3739 int error = 0; 3866 int error = 0;
3740 3867
3741 /* in-inode? */ 3868 /* in-inode? */
3742 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 3869 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
3743 struct ext4_iloc iloc; 3870 struct ext4_iloc iloc;
3744 int offset; /* offset of xattr in inode */ 3871 int offset; /* offset of xattr in inode */
3745 3872
@@ -3767,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3767 __u64 start, __u64 len) 3894 __u64 start, __u64 len)
3768{ 3895{
3769 ext4_lblk_t start_blk; 3896 ext4_lblk_t start_blk;
3770 ext4_lblk_t len_blks;
3771 int error = 0; 3897 int error = 0;
3772 3898
3773 /* fallback to generic here if not in extents fmt */ 3899 /* fallback to generic here if not in extents fmt */
@@ -3781,8 +3907,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3781 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 3907 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3782 error = ext4_xattr_fiemap(inode, fieinfo); 3908 error = ext4_xattr_fiemap(inode, fieinfo);
3783 } else { 3909 } else {
3910 ext4_lblk_t len_blks;
3911 __u64 last_blk;
3912
3784 start_blk = start >> inode->i_sb->s_blocksize_bits; 3913 start_blk = start >> inode->i_sb->s_blocksize_bits;
3785 len_blks = len >> inode->i_sb->s_blocksize_bits; 3914 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3915 if (last_blk >= EXT_MAX_BLOCK)
3916 last_blk = EXT_MAX_BLOCK-1;
3917 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3786 3918
3787 /* 3919 /*
3788 * Walk the extent tree gathering extent information. 3920 * Walk the extent tree gathering extent information.
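
The fiemap change computes an inclusive last block and clamps it below EXT_MAX_BLOCK; the old len >> blkbits both truncated any partial tail block and could run past the 32-bit logical-block limit for huge requests. The same arithmetic worked through on a small example:

#include <stdio.h>

#define EXT_MAX_BLOCK 0xffffffffU

int main(void)
{
        unsigned int blkbits = 12;                      /* 4 KiB blocks */
        unsigned long long start = 1024, len = 5120;    /* byte range */

        unsigned int start_blk = start >> blkbits;
        unsigned long long last_blk = (start + len - 1) >> blkbits;
        if (last_blk >= EXT_MAX_BLOCK)
                last_blk = EXT_MAX_BLOCK - 1;
        unsigned int len_blks = (unsigned int)last_blk - start_blk + 1;

        /* Bytes 1024..6143 touch blocks 0 and 1, so two blocks must be
         * walked; the old len >> blkbits would have reported just one. */
        printf("start_blk=%u last_blk=%llu len_blks=%u (old: %llu)\n",
               start_blk, last_blk, len_blks, len >> blkbits);
        return 0;
}
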
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 56eee3d796c2..503a48927402 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -35,9 +35,9 @@
35 */ 35 */
36static int ext4_release_file(struct inode *inode, struct file *filp) 36static int ext4_release_file(struct inode *inode, struct file *filp)
37{ 37{
38 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { 38 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
39 ext4_alloc_da_blocks(inode); 39 ext4_alloc_da_blocks(inode);
40 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; 40 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
41 } 41 }
42 /* if we are the last writer on the inode, drop the block reservation */ 42 /* if we are the last writer on the inode, drop the block reservation */
43 if ((filp->f_mode & FMODE_WRITE) && 43 if ((filp->f_mode & FMODE_WRITE) &&
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 98bd140aad01..0d0c3239c1cd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
63 if (inode->i_sb->s_flags & MS_RDONLY) 63 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 64 return 0;
65 65
66 ret = flush_aio_dio_completed_IO(inode); 66 ret = flush_completed_IO(inode);
67 if (ret < 0) 67 if (ret < 0)
68 return ret; 68 return ret;
69 69
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..004c9da9e5c6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
76 /* If checksum is bad mark all blocks and inodes used to prevent 76 /* If checksum is bad mark all blocks and inodes used to prevent
77 * allocation, essentially implementing a per-group read-only flag. */ 77 * allocation, essentially implementing a per-group read-only flag. */
78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
79 ext4_error(sb, __func__, "Checksum bad for group %u", 79 ext4_error(sb, "Checksum bad for group %u", block_group);
80 block_group);
81 ext4_free_blks_set(sb, gdp, 0); 80 ext4_free_blks_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 81 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 82 ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 bitmap_blk = ext4_inode_bitmap(sb, desc); 110 bitmap_blk = ext4_inode_bitmap(sb, desc);
112 bh = sb_getblk(sb, bitmap_blk); 111 bh = sb_getblk(sb, bitmap_blk);
113 if (unlikely(!bh)) { 112 if (unlikely(!bh)) {
114 ext4_error(sb, __func__, 113 ext4_error(sb, "Cannot read inode bitmap - "
115 "Cannot read inode bitmap - "
116 "block_group = %u, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
117 block_group, bitmap_blk); 115 block_group, bitmap_blk);
118 return NULL; 116 return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
153 set_bitmap_uptodate(bh); 151 set_bitmap_uptodate(bh);
154 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
155 put_bh(bh); 153 put_bh(bh);
156 ext4_error(sb, __func__, 154 ext4_error(sb, "Cannot read inode bitmap - "
157 "Cannot read inode bitmap - "
158 "block_group = %u, inode_bitmap = %llu", 155 "block_group = %u, inode_bitmap = %llu",
159 block_group, bitmap_blk); 156 block_group, bitmap_blk);
160 return NULL; 157 return NULL;
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
229 226
230 es = EXT4_SB(sb)->s_es; 227 es = EXT4_SB(sb)->s_es;
231 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
232 ext4_error(sb, "ext4_free_inode", 229 ext4_error(sb, "reserved or nonexistent inode %lu", ino);
233 "reserved or nonexistent inode %lu", ino);
234 goto error_return; 230 goto error_return;
235 } 231 }
236 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 232 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
248 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
249 bit, bitmap_bh->b_data); 245 bit, bitmap_bh->b_data);
250 if (!cleared) 246 if (!cleared)
251 ext4_error(sb, "ext4_free_inode", 247 ext4_error(sb, "bit already cleared for inode %lu", ino);
252 "bit already cleared for inode %lu", ino);
253 else { 248 else {
254 gdp = ext4_get_group_desc(sb, block_group, &bh2); 249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
255 250
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
736 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 731 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
737 ino > EXT4_INODES_PER_GROUP(sb)) { 732 ino > EXT4_INODES_PER_GROUP(sb)) {
738 ext4_unlock_group(sb, group); 733 ext4_unlock_group(sb, group);
739 ext4_error(sb, __func__, 734 ext4_error(sb, "reserved inode or inode > inodes count - "
740 "reserved inode or inode > inodes count - "
741 "block_group = %u, inode=%lu", group, 735 "block_group = %u, inode=%lu", group,
742 ino + group * EXT4_INODES_PER_GROUP(sb)); 736 ino + group * EXT4_INODES_PER_GROUP(sb));
743 return 1; 737 return 1;
@@ -904,7 +898,7 @@ repeat_in_this_group:
904 BUFFER_TRACE(inode_bitmap_bh, 898 BUFFER_TRACE(inode_bitmap_bh,
905 "call ext4_handle_dirty_metadata"); 899 "call ext4_handle_dirty_metadata");
906 err = ext4_handle_dirty_metadata(handle, 900 err = ext4_handle_dirty_metadata(handle,
907 inode, 901 NULL,
908 inode_bitmap_bh); 902 inode_bitmap_bh);
909 if (err) 903 if (err)
910 goto fail; 904 goto fail;
@@ -1029,7 +1023,8 @@ got:
1029 inode->i_generation = sbi->s_next_generation++; 1023 inode->i_generation = sbi->s_next_generation++;
1030 spin_unlock(&sbi->s_next_gen_lock); 1024 spin_unlock(&sbi->s_next_gen_lock);
1031 1025
1032 ei->i_state = EXT4_STATE_NEW; 1026 ei->i_state_flags = 0;
1027 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1033 1028
1034 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1029 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1035 1030
@@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1098 1093
1099 /* Error cases - e2fsck has already cleaned up for us */ 1094 /* Error cases - e2fsck has already cleaned up for us */
1100 if (ino > max_ino) { 1095 if (ino > max_ino) {
1101 ext4_warning(sb, __func__, 1096 ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
1102 "bad orphan ino %lu! e2fsck was run?", ino);
1103 goto error; 1097 goto error;
1104 } 1098 }
1105 1099
@@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1107 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); 1101 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
1108 bitmap_bh = ext4_read_inode_bitmap(sb, block_group); 1102 bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
1109 if (!bitmap_bh) { 1103 if (!bitmap_bh) {
1110 ext4_warning(sb, __func__, 1104 ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
1111 "inode bitmap error for orphan %lu", ino);
1112 goto error; 1105 goto error;
1113 } 1106 }
1114 1107
@@ -1140,8 +1133,7 @@ iget_failed:
1140 err = PTR_ERR(inode); 1133 err = PTR_ERR(inode);
1141 inode = NULL; 1134 inode = NULL;
1142bad_orphan: 1135bad_orphan:
1143 ext4_warning(sb, __func__, 1136 ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
1144 "bad orphan inode %lu! e2fsck was run?", ino);
1145 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", 1137 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
1146 bit, (unsigned long long)bitmap_bh->b_blocknr, 1138 bit, (unsigned long long)bitmap_bh->b_blocknr,
1147 ext4_test_bit(bit, bitmap_bh->b_data)); 1139 ext4_test_bit(bit, bitmap_bh->b_data));
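
The ialloc.c hunks above are part of a tree-wide cleanup: ext4_error() and ext4_warning() stop taking an explicit function-name argument because the wrappers now capture __func__ themselves (call sites that need to report a different caller use __ext4_error(), as in the inode.c hunks below). A minimal userspace sketch of that macro pattern, with hypothetical names standing in for the kernel's:

#include <stdarg.h>
#include <stdio.h>

static void __my_error(const char *func, const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "fs error (%s): ", func);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fputc('\n', stderr);
}

/* __func__ expands to the enclosing function's name at each call site,
 * so callers no longer pass it by hand. */
#define my_error(fmt, ...) __my_error(__func__, fmt, ##__VA_ARGS__)

int main(void)
{
	my_error("bit already cleared for inode %lu", 12345UL);
	return 0;
}
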
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e11952404e02..f55df7192b95 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h>
41 42
42#include "ext4_jbd2.h" 43#include "ext4_jbd2.h"
43#include "xattr.h" 44#include "xattr.h"
@@ -194,7 +195,7 @@ void ext4_delete_inode(struct inode *inode)
194 inode->i_size = 0; 195 inode->i_size = 0;
195 err = ext4_mark_inode_dirty(handle, inode); 196 err = ext4_mark_inode_dirty(handle, inode);
196 if (err) { 197 if (err) {
197 ext4_warning(inode->i_sb, __func__, 198 ext4_warning(inode->i_sb,
198 "couldn't mark inode dirty (err %d)", err); 199 "couldn't mark inode dirty (err %d)", err);
199 goto stop_handle; 200 goto stop_handle;
200 } 201 }
@@ -212,7 +213,7 @@ void ext4_delete_inode(struct inode *inode)
212 if (err > 0) 213 if (err > 0)
213 err = ext4_journal_restart(handle, 3); 214 err = ext4_journal_restart(handle, 3);
214 if (err != 0) { 215 if (err != 0) {
215 ext4_warning(inode->i_sb, __func__, 216 ext4_warning(inode->i_sb,
216 "couldn't extend journal (err %d)", err); 217 "couldn't extend journal (err %d)", err);
217 stop_handle: 218 stop_handle:
218 ext4_journal_stop(handle); 219 ext4_journal_stop(handle);
@@ -323,8 +324,7 @@ static int ext4_block_to_path(struct inode *inode,
323 offsets[n++] = i_block & (ptrs - 1); 324 offsets[n++] = i_block & (ptrs - 1);
324 final = ptrs; 325 final = ptrs;
325 } else { 326 } else {
326 ext4_warning(inode->i_sb, "ext4_block_to_path", 327 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
327 "block %lu > max in inode %lu",
328 i_block + direct_blocks + 328 i_block + direct_blocks +
329 indirect_blocks + double_blocks, inode->i_ino); 329 indirect_blocks + double_blocks, inode->i_ino);
330 } 330 }
@@ -344,7 +344,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
344 if (blk && 344 if (blk &&
345 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 345 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
346 blk, 1))) { 346 blk, 1))) {
347 ext4_error(inode->i_sb, function, 347 __ext4_error(inode->i_sb, function,
348 "invalid block reference %u " 348 "invalid block reference %u "
349 "in inode #%lu", blk, inode->i_ino); 349 "in inode #%lu", blk, inode->i_ino);
350 return -EIO; 350 return -EIO;
@@ -607,7 +607,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
607 if (*err) 607 if (*err)
608 goto failed_out; 608 goto failed_out;
609 609
610 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); 610 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
611 EXT4_ERROR_INODE(inode,
612 "current_block %llu + count %lu > %d!",
613 current_block, count,
614 EXT4_MAX_BLOCK_FILE_PHYS);
615 *err = -EIO;
616 goto failed_out;
617 }
611 618
612 target -= count; 619 target -= count;
613 /* allocate blocks for indirect blocks */ 620 /* allocate blocks for indirect blocks */
@@ -643,7 +650,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
643 ar.flags = EXT4_MB_HINT_DATA; 650 ar.flags = EXT4_MB_HINT_DATA;
644 651
645 current_block = ext4_mb_new_blocks(handle, &ar, err); 652 current_block = ext4_mb_new_blocks(handle, &ar, err);
646 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); 653 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
654 EXT4_ERROR_INODE(inode,
655 "current_block %llu + ar.len %d > %d!",
656 current_block, ar.len,
657 EXT4_MAX_BLOCK_FILE_PHYS);
658 *err = -EIO;
659 goto failed_out;
660 }
647 661
648 if (*err && (target == blks)) { 662 if (*err && (target == blks)) {
649 /* 663 /*
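
Both hunks in ext4_alloc_blocks() demote a BUG_ON() to a logged error plus -EIO: an out-of-range block number here means on-disk corruption, and corruption should fail the one write, not panic the machine. A compilable sketch of the pattern, with a stand-in limit for EXT4_MAX_BLOCK_FILE_PHYS:

#include <errno.h>
#include <stdio.h>

#define MAX_PHYS 0xFFFFFFFFULL	/* stand-in for EXT4_MAX_BLOCK_FILE_PHYS */

static int claim_blocks(unsigned long long block, unsigned long count)
{
	if (block + count > MAX_PHYS) {
		/* log the corruption and let the caller unwind */
		fprintf(stderr, "block %llu + count %lu out of range\n",
			block, count);
		return -EIO;
	}
	/* ... normal allocation bookkeeping ... */
	return 0;
}

int main(void)
{
	return claim_blocks(MAX_PHYS - 1, 8) == -EIO ? 0 : 1;
}
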
@@ -1061,6 +1075,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
1061 int mdb_free = 0, allocated_meta_blocks = 0; 1075 int mdb_free = 0, allocated_meta_blocks = 0;
1062 1076
1063 spin_lock(&ei->i_block_reservation_lock); 1077 spin_lock(&ei->i_block_reservation_lock);
1078 trace_ext4_da_update_reserve_space(inode, used);
1064 if (unlikely(used > ei->i_reserved_data_blocks)) { 1079 if (unlikely(used > ei->i_reserved_data_blocks)) {
1065 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 1080 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1066 "with only %d reserved data blocks\n", 1081 "with only %d reserved data blocks\n",
@@ -1124,7 +1139,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
1124 sector_t logical, sector_t phys, int len) 1139 sector_t logical, sector_t phys, int len)
1125{ 1140{
1126 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1141 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1127 ext4_error(inode->i_sb, msg, 1142 __ext4_error(inode->i_sb, msg,
1128 "inode #%lu logical block %llu mapped to %llu " 1143 "inode #%lu logical block %llu mapped to %llu "
1129 "(size %d)", inode->i_ino, 1144 "(size %d)", inode->i_ino,
1130 (unsigned long long) logical, 1145 (unsigned long long) logical,
@@ -1306,7 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1306 * i_data's format changing. Force the migrate 1321 * i_data's format changing. Force the migrate
1307 * to fail by clearing migrate flags 1322 * to fail by clearing migrate flags
1308 */ 1323 */
1309 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 1324 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1310 } 1325 }
1311 1326
1312 /* 1327 /*
@@ -1534,6 +1549,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
1534 ext4_truncate(inode); 1549 ext4_truncate(inode);
1535} 1550}
1536 1551
1552static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1553 struct buffer_head *bh_result, int create);
1537static int ext4_write_begin(struct file *file, struct address_space *mapping, 1554static int ext4_write_begin(struct file *file, struct address_space *mapping,
1538 loff_t pos, unsigned len, unsigned flags, 1555 loff_t pos, unsigned len, unsigned flags,
1539 struct page **pagep, void **fsdata) 1556 struct page **pagep, void **fsdata)
@@ -1575,8 +1592,12 @@ retry:
1575 } 1592 }
1576 *pagep = page; 1593 *pagep = page;
1577 1594
1578 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1595 if (ext4_should_dioread_nolock(inode))
1579 ext4_get_block); 1596 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1597 fsdata, ext4_get_block_write);
1598 else
1599 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1600 fsdata, ext4_get_block);
1580 1601
1581 if (!ret && ext4_should_journal_data(inode)) { 1602 if (!ret && ext4_should_journal_data(inode)) {
1582 ret = walk_page_buffers(handle, page_buffers(page), 1603 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1793,7 +1814,7 @@ static int ext4_journalled_write_end(struct file *file,
1793 new_i_size = pos + copied; 1814 new_i_size = pos + copied;
1794 if (new_i_size > inode->i_size) 1815 if (new_i_size > inode->i_size)
1795 i_size_write(inode, pos+copied); 1816 i_size_write(inode, pos+copied);
1796 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1817 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1797 if (new_i_size > EXT4_I(inode)->i_disksize) { 1818 if (new_i_size > EXT4_I(inode)->i_disksize) {
1798 ext4_update_i_disksize(inode, new_i_size); 1819 ext4_update_i_disksize(inode, new_i_size);
1799 ret2 = ext4_mark_inode_dirty(handle, inode); 1820 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1846,6 +1867,7 @@ repeat:
1846 spin_lock(&ei->i_block_reservation_lock); 1867 spin_lock(&ei->i_block_reservation_lock);
1847 md_reserved = ei->i_reserved_meta_blocks; 1868 md_reserved = ei->i_reserved_meta_blocks;
1848 md_needed = ext4_calc_metadata_amount(inode, lblock); 1869 md_needed = ext4_calc_metadata_amount(inode, lblock);
1870 trace_ext4_da_reserve_space(inode, md_needed);
1849 spin_unlock(&ei->i_block_reservation_lock); 1871 spin_unlock(&ei->i_block_reservation_lock);
1850 1872
1851 /* 1873 /*
@@ -2091,6 +2113,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2091 } else if (buffer_mapped(bh)) 2113 } else if (buffer_mapped(bh))
2092 BUG_ON(bh->b_blocknr != pblock); 2114 BUG_ON(bh->b_blocknr != pblock);
2093 2115
2116 if (buffer_uninit(exbh))
2117 set_buffer_uninit(bh);
2094 cur_logical++; 2118 cur_logical++;
2095 pblock++; 2119 pblock++;
2096 } while ((bh = bh->b_this_page) != head); 2120 } while ((bh = bh->b_this_page) != head);
@@ -2133,17 +2157,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2133 break; 2157 break;
2134 for (i = 0; i < nr_pages; i++) { 2158 for (i = 0; i < nr_pages; i++) {
2135 struct page *page = pvec.pages[i]; 2159 struct page *page = pvec.pages[i];
2136 index = page->index; 2160 if (page->index > end)
2137 if (index > end)
2138 break; 2161 break;
2139 index++;
2140
2141 BUG_ON(!PageLocked(page)); 2162 BUG_ON(!PageLocked(page));
2142 BUG_ON(PageWriteback(page)); 2163 BUG_ON(PageWriteback(page));
2143 block_invalidatepage(page, 0); 2164 block_invalidatepage(page, 0);
2144 ClearPageUptodate(page); 2165 ClearPageUptodate(page);
2145 unlock_page(page); 2166 unlock_page(page);
2146 } 2167 }
2168 index = pvec.pages[nr_pages - 1]->index + 1;
2169 pagevec_release(&pvec);
2147 } 2170 }
2148 return; 2171 return;
2149} 2172}
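
The old invalidatepages loop tracked the next lookup index by hand and never released the pagevec, leaking page references; the rewrite derives the next index from the last page the lookup actually returned and releases each batch. The same batched-lookup shape, reduced to self-contained C with toy types (the lookup is a stand-in for pagevec_lookup()):

#include <stdio.h>

struct page { unsigned long index; };

#define BATCH 4
#define NPAGES 10
static struct page pool[NPAGES];

/* toy lookup: return up to n pages with index >= start */
static unsigned lookup_pages(struct page **out, unsigned long start, unsigned n)
{
	unsigned i, nr = 0;

	for (i = 0; i < NPAGES && nr < n; i++)
		if (pool[i].index >= start)
			out[nr++] = &pool[i];
	return nr;
}

static void invalidate_range(unsigned long index, unsigned long end)
{
	struct page *pages[BATCH];
	unsigned nr, i;

	while (index <= end) {
		nr = lookup_pages(pages, index, BATCH);
		if (nr == 0)
			break;
		for (i = 0; i < nr; i++) {
			if (pages[i]->index > end)
				break;
			printf("invalidate page %lu\n", pages[i]->index);
		}
		/* resume after the last page the lookup returned */
		index = pages[nr - 1]->index + 1;
		/* kernel: pagevec_release() drops the batch's references here */
	}
}

int main(void)
{
	unsigned i;

	for (i = 0; i < NPAGES; i++)
		pool[i].index = i;
	invalidate_range(2, 7);
	return 0;
}
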
@@ -2220,6 +2243,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2220 */ 2243 */
2221 new.b_state = 0; 2244 new.b_state = 0;
2222 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2245 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2246 if (ext4_should_dioread_nolock(mpd->inode))
2247 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2223 if (mpd->b_state & (1 << BH_Delay)) 2248 if (mpd->b_state & (1 << BH_Delay))
2224 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2249 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2225 2250
@@ -2630,11 +2655,14 @@ static int __ext4_journalled_writepage(struct page *page,
2630 ret = err; 2655 ret = err;
2631 2656
2632 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2657 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2633 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 2658 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2634out: 2659out:
2635 return ret; 2660 return ret;
2636} 2661}
2637 2662
2663static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2664static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2665
2638/* 2666/*
2639 * Note that we don't need to start a transaction unless we're journaling data 2667 * Note that we don't need to start a transaction unless we're journaling data
2640 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2668 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2682,7 +2710,7 @@ static int ext4_writepage(struct page *page,
2682 int ret = 0; 2710 int ret = 0;
2683 loff_t size; 2711 loff_t size;
2684 unsigned int len; 2712 unsigned int len;
2685 struct buffer_head *page_bufs; 2713 struct buffer_head *page_bufs = NULL;
2686 struct inode *inode = page->mapping->host; 2714 struct inode *inode = page->mapping->host;
2687 2715
2688 trace_ext4_writepage(inode, page); 2716 trace_ext4_writepage(inode, page);
@@ -2758,7 +2786,11 @@ static int ext4_writepage(struct page *page,
2758 2786
2759 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2787 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2760 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2788 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2761 else 2789 else if (page_bufs && buffer_uninit(page_bufs)) {
2790 ext4_set_bh_endio(page_bufs, inode);
2791 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2792 wbc, ext4_end_io_buffer_write);
2793 } else
2762 ret = block_write_full_page(page, noalloc_get_block_write, 2794 ret = block_write_full_page(page, noalloc_get_block_write,
2763 wbc); 2795 wbc);
2764 2796
@@ -3301,7 +3333,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3301 filemap_write_and_wait(mapping); 3333 filemap_write_and_wait(mapping);
3302 } 3334 }
3303 3335
3304 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 3336 if (EXT4_JOURNAL(inode) &&
3337 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3305 /* 3338 /*
3306 * This is a REALLY heavyweight approach, but the use of 3339 * This is a REALLY heavyweight approach, but the use of
3307 * bmap on dirty files is expected to be extremely rare: 3340 * bmap on dirty files is expected to be extremely rare:
@@ -3320,7 +3353,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3320 * everything they get. 3353 * everything they get.
3321 */ 3354 */
3322 3355
3323 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 3356 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3324 journal = EXT4_JOURNAL(inode); 3357 journal = EXT4_JOURNAL(inode);
3325 jbd2_journal_lock_updates(journal); 3358 jbd2_journal_lock_updates(journal);
3326 err = jbd2_journal_flush(journal); 3359 err = jbd2_journal_flush(journal);
@@ -3345,11 +3378,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3345 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3378 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3346} 3379}
3347 3380
3381static void ext4_free_io_end(ext4_io_end_t *io)
3382{
3383 BUG_ON(!io);
3384 if (io->page)
3385 put_page(io->page);
3386 iput(io->inode);
3387 kfree(io);
3388}
3389
3390static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3391{
3392 struct buffer_head *head, *bh;
3393 unsigned int curr_off = 0;
3394
3395 if (!page_has_buffers(page))
3396 return;
3397 head = bh = page_buffers(page);
3398 do {
3399 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3400 && bh->b_private) {
3401 ext4_free_io_end(bh->b_private);
3402 bh->b_private = NULL;
3403 bh->b_end_io = NULL;
3404 }
3405 curr_off = curr_off + bh->b_size;
3406 bh = bh->b_this_page;
3407 } while (bh != head);
3408}
3409
3348static void ext4_invalidatepage(struct page *page, unsigned long offset) 3410static void ext4_invalidatepage(struct page *page, unsigned long offset)
3349{ 3411{
3350 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3412 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3351 3413
3352 /* 3414 /*
3415 * free any io_end structure allocated for buffers to be discarded
3416 */
3417 if (ext4_should_dioread_nolock(page->mapping->host))
3418 ext4_invalidatepage_free_endio(page, offset);
3419 /*
3353 * If it's a full truncate we just forget about the pending dirtying 3420 * If it's a full truncate we just forget about the pending dirtying
3354 */ 3421 */
3355 if (offset == 0) 3422 if (offset == 0)
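
ext4_invalidatepage_free_endio() walks the page's buffer ring: the buffer_heads of one page are linked circularly through b_this_page, so the walk is a do/while that stops on returning to the head, freeing any io_end still parked in b_private for buffers at or past the invalidation offset. A self-contained sketch of that ring walk (toy buffer_head, no kernel headers):

#include <stdio.h>
#include <stddef.h>

struct buffer_head {
	struct buffer_head *b_this_page;	/* circular link */
	size_t b_size;
	void *b_private;			/* pending io_end, if any */
};

static void discard_private_from(struct buffer_head *head, size_t offset)
{
	struct buffer_head *bh = head;
	size_t curr_off = 0;

	do {
		if (offset <= curr_off && bh->b_private) {
			/* kernel: ext4_free_io_end(bh->b_private) */
			bh->b_private = NULL;
		}
		curr_off += bh->b_size;
		bh = bh->b_this_page;
	} while (bh != head);
}

int main(void)
{
	static int io = 1;
	struct buffer_head a, b, c;

	a = (struct buffer_head){ &b, 1024, &io };
	b = (struct buffer_head){ &c, 1024, &io };
	c = (struct buffer_head){ &a, 1024, &io };
	discard_private_from(&a, 1024);	/* clears b and c, keeps a */
	printf("%d %d %d\n", a.b_private != NULL, b.b_private != NULL,
	       c.b_private != NULL);
	return 0;
}
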
@@ -3420,7 +3487,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3420 } 3487 }
3421 3488
3422retry: 3489retry:
3423 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3490 if (rw == READ && ext4_should_dioread_nolock(inode))
3491 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
3492 inode->i_sb->s_bdev, iov,
3493 offset, nr_segs,
3494 ext4_get_block, NULL);
3495 else
3496 ret = blockdev_direct_IO(rw, iocb, inode,
3497 inode->i_sb->s_bdev, iov,
3424 offset, nr_segs, 3498 offset, nr_segs,
3425 ext4_get_block, NULL); 3499 ext4_get_block, NULL);
3426 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3500 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3436,6 +3510,9 @@ retry:
3436 * but cannot extend i_size. Bail out and pretend 3510 * but cannot extend i_size. Bail out and pretend
3437 * the write failed... */ 3511 * the write failed... */
3438 ret = PTR_ERR(handle); 3512 ret = PTR_ERR(handle);
3513 if (inode->i_nlink)
3514 ext4_orphan_del(NULL, inode);
3515
3439 goto out; 3516 goto out;
3440 } 3517 }
3441 if (inode->i_nlink) 3518 if (inode->i_nlink)
@@ -3463,75 +3540,63 @@ out:
3463 return ret; 3540 return ret;
3464} 3541}
3465 3542
3466static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, 3543static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3467 struct buffer_head *bh_result, int create) 3544 struct buffer_head *bh_result, int create)
3468{ 3545{
3469 handle_t *handle = NULL; 3546 handle_t *handle = ext4_journal_current_handle();
3470 int ret = 0; 3547 int ret = 0;
3471 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3548 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3472 int dio_credits; 3549 int dio_credits;
3550 int started = 0;
3473 3551
3474 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", 3552 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3475 inode->i_ino, create); 3553 inode->i_ino, create);
3476 /* 3554 /*
3477 * DIO VFS code passes create = 0 flag for write to 3555 * ext4_get_block in preparation for a DIO write or buffer write.
3478 * the middle of file. It does this to avoid block 3556 * We allocate an uninitialized extent if blocks haven't been allocated.
3479 * allocation for holes, to prevent expose stale data 3557 * The extent will be converted to initialized after the IO completes.
3480 * out when there is parallel buffered read (which does
3481 * not hold the i_mutex lock) while direct IO write has
3482 * not completed. DIO request on holes finally falls back
3483 * to buffered IO for this reason.
3484 *
3485 * For ext4 extent based file, since we support fallocate,
3486 * new allocated extent as uninitialized, for holes, we
3487 * could fallocate blocks for holes, thus parallel
3488 * buffered IO read will zero out the page when read on
3489 * a hole while parallel DIO write to the hole has not completed.
3490 *
3491 * when we come here, we know it's a direct IO write to
3492 * the middle of file (<i_size)
3493 * so it's safe to override the create flag from VFS.
3494 */ 3558 */
3495 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; 3559 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3496 3560
3497 if (max_blocks > DIO_MAX_BLOCKS) 3561 if (!handle) {
3498 max_blocks = DIO_MAX_BLOCKS; 3562 if (max_blocks > DIO_MAX_BLOCKS)
3499 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 3563 max_blocks = DIO_MAX_BLOCKS;
3500 handle = ext4_journal_start(inode, dio_credits); 3564 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3501 if (IS_ERR(handle)) { 3565 handle = ext4_journal_start(inode, dio_credits);
3502 ret = PTR_ERR(handle); 3566 if (IS_ERR(handle)) {
3503 goto out; 3567 ret = PTR_ERR(handle);
3568 goto out;
3569 }
3570 started = 1;
3504 } 3571 }
3572
3505 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 3573 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3506 create); 3574 create);
3507 if (ret > 0) { 3575 if (ret > 0) {
3508 bh_result->b_size = (ret << inode->i_blkbits); 3576 bh_result->b_size = (ret << inode->i_blkbits);
3509 ret = 0; 3577 ret = 0;
3510 } 3578 }
3511 ext4_journal_stop(handle); 3579 if (started)
3580 ext4_journal_stop(handle);
3512out: 3581out:
3513 return ret; 3582 return ret;
3514} 3583}
3515 3584
3516static void ext4_free_io_end(ext4_io_end_t *io) 3585static void dump_completed_IO(struct inode * inode)
3517{
3518 BUG_ON(!io);
3519 iput(io->inode);
3520 kfree(io);
3521}
3522static void dump_aio_dio_list(struct inode * inode)
3523{ 3586{
3524#ifdef EXT4_DEBUG 3587#ifdef EXT4_DEBUG
3525 struct list_head *cur, *before, *after; 3588 struct list_head *cur, *before, *after;
3526 ext4_io_end_t *io, *io0, *io1; 3589 ext4_io_end_t *io, *io0, *io1;
3590 unsigned long flags;
3527 3591
3528 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3592 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3529 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); 3593 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3530 return; 3594 return;
3531 } 3595 }
3532 3596
3533 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); 3597 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3534 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ 3598 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3599 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3535 cur = &io->list; 3600 cur = &io->list;
3536 before = cur->prev; 3601 before = cur->prev;
3537 io0 = container_of(before, ext4_io_end_t, list); 3602 io0 = container_of(before, ext4_io_end_t, list);
@@ -3541,32 +3606,31 @@ static void dump_aio_dio_list(struct inode * inode)
3541 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 3606 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3542 io, inode->i_ino, io0, io1); 3607 io, inode->i_ino, io0, io1);
3543 } 3608 }
3609 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3544#endif 3610#endif
3545} 3611}
3546 3612
3547/* 3613/*
3548 * check a range of space and convert unwritten extents to written. 3614 * check a range of space and convert unwritten extents to written.
3549 */ 3615 */
3550static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) 3616static int ext4_end_io_nolock(ext4_io_end_t *io)
3551{ 3617{
3552 struct inode *inode = io->inode; 3618 struct inode *inode = io->inode;
3553 loff_t offset = io->offset; 3619 loff_t offset = io->offset;
3554 size_t size = io->size; 3620 ssize_t size = io->size;
3555 int ret = 0; 3621 int ret = 0;
3556 3622
3557 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," 3623 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3558 "list->prev 0x%p\n", 3624 "list->prev 0x%p\n",
3559 io, inode->i_ino, io->list.next, io->list.prev); 3625 io, inode->i_ino, io->list.next, io->list.prev);
3560 3626
3561 if (list_empty(&io->list)) 3627 if (list_empty(&io->list))
3562 return ret; 3628 return ret;
3563 3629
3564 if (io->flag != DIO_AIO_UNWRITTEN) 3630 if (io->flag != EXT4_IO_UNWRITTEN)
3565 return ret; 3631 return ret;
3566 3632
3567 if (offset + size <= i_size_read(inode)) 3633 ret = ext4_convert_unwritten_extents(inode, offset, size);
3568 ret = ext4_convert_unwritten_extents(inode, offset, size);
3569
3570 if (ret < 0) { 3634 if (ret < 0) {
3571 printk(KERN_EMERG "%s: failed to convert unwritten " 3635 printk(KERN_EMERG "%s: failed to convert unwritten "
3572 "extents to written extents, error is %d" 3636 "extents to written extents, error is %d"
@@ -3579,50 +3643,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3579 io->flag = 0; 3643 io->flag = 0;
3580 return ret; 3644 return ret;
3581} 3645}
3646
3582/* 3647/*
3583 * work on completed aio dio IO, to convert unwritten extents to written extents 3648 * work on completed aio dio IO, to convert unwritten extents to written extents
3584 */ 3649 */
3585static void ext4_end_aio_dio_work(struct work_struct *work) 3650static void ext4_end_io_work(struct work_struct *work)
3586{ 3651{
3587 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 3652 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3588 struct inode *inode = io->inode; 3653 struct inode *inode = io->inode;
3589 int ret = 0; 3654 struct ext4_inode_info *ei = EXT4_I(inode);
3655 unsigned long flags;
3656 int ret;
3590 3657
3591 mutex_lock(&inode->i_mutex); 3658 mutex_lock(&inode->i_mutex);
3592 ret = ext4_end_aio_dio_nolock(io); 3659 ret = ext4_end_io_nolock(io);
3593 if (ret >= 0) { 3660 if (ret < 0) {
3594 if (!list_empty(&io->list)) 3661 mutex_unlock(&inode->i_mutex);
3595 list_del_init(&io->list); 3662 return;
3596 ext4_free_io_end(io);
3597 } 3663 }
3664
3665 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3666 if (!list_empty(&io->list))
3667 list_del_init(&io->list);
3668 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3598 mutex_unlock(&inode->i_mutex); 3669 mutex_unlock(&inode->i_mutex);
3670 ext4_free_io_end(io);
3599} 3671}
3672
3600/* 3673/*
3601 * This function is called from ext4_sync_file(). 3674 * This function is called from ext4_sync_file().
3602 * 3675 *
3603 * When AIO DIO IO is completed, the work to convert unwritten 3676 * When IO is completed, the work to convert unwritten extents to
3604 * extents to written is queued on workqueue but may not get immediately 3677 * written is queued on workqueue but may not get immediately
3605 * scheduled. When fsync is called, we need to ensure the 3678 * scheduled. When fsync is called, we need to ensure the
3606 * conversion is complete before fsync returns. 3679 * conversion is complete before fsync returns.
3607 * The inode keeps track of a list of completed AIO from DIO path 3680 * The inode keeps track of a list of pending/completed IO that
3608 * that might need to do the conversion. This function walks through 3681 * might need to do the conversion. This function walks through
3609 * the list and converts the related unwritten extents to written. 3682 * the list and converts the related unwritten extents for completed IO
3683 * to written.
3684 * The function returns 0 on success and a negative error code on failure.
3610 */ 3685 */
3611int flush_aio_dio_completed_IO(struct inode *inode) 3686int flush_completed_IO(struct inode *inode)
3612{ 3687{
3613 ext4_io_end_t *io; 3688 ext4_io_end_t *io;
3689 struct ext4_inode_info *ei = EXT4_I(inode);
3690 unsigned long flags;
3614 int ret = 0; 3691 int ret = 0;
3615 int ret2 = 0; 3692 int ret2 = 0;
3616 3693
3617 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) 3694 if (list_empty(&ei->i_completed_io_list))
3618 return ret; 3695 return ret;
3619 3696
3620 dump_aio_dio_list(inode); 3697 dump_completed_IO(inode);
3621 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3698 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3622 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, 3699 while (!list_empty(&ei->i_completed_io_list)){
3700 io = list_entry(ei->i_completed_io_list.next,
3623 ext4_io_end_t, list); 3701 ext4_io_end_t, list);
3624 /* 3702 /*
3625 * Calling ext4_end_aio_dio_nolock() to convert completed 3703 * Calling ext4_end_io_nolock() to convert completed
3626 * IO to written. 3704 * IO to written.
3627 * 3705 *
3628 * When ext4_sync_file() is called, run_queue() may already 3706 * When ext4_sync_file() is called, run_queue() may already
@@ -3635,20 +3713,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
3635 * avoid double converting from both fsync and background work 3713 * avoid double converting from both fsync and background work
3636 * queue work. 3714 * queue work.
3637 */ 3715 */
3638 ret = ext4_end_aio_dio_nolock(io); 3716 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3717 ret = ext4_end_io_nolock(io);
3718 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3639 if (ret < 0) 3719 if (ret < 0)
3640 ret2 = ret; 3720 ret2 = ret;
3641 else 3721 else
3642 list_del_init(&io->list); 3722 list_del_init(&io->list);
3643 } 3723 }
3724 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3644 return (ret2 < 0) ? ret2 : 0; 3725 return (ret2 < 0) ? ret2 : 0;
3645} 3726}
3646 3727
3647static ext4_io_end_t *ext4_init_io_end (struct inode *inode) 3728static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3648{ 3729{
3649 ext4_io_end_t *io = NULL; 3730 ext4_io_end_t *io = NULL;
3650 3731
3651 io = kmalloc(sizeof(*io), GFP_NOFS); 3732 io = kmalloc(sizeof(*io), flags);
3652 3733
3653 if (io) { 3734 if (io) {
3654 igrab(inode); 3735 igrab(inode);
@@ -3656,8 +3737,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3656 io->flag = 0; 3737 io->flag = 0;
3657 io->offset = 0; 3738 io->offset = 0;
3658 io->size = 0; 3739 io->size = 0;
3659 io->error = 0; 3740 io->page = NULL;
3660 INIT_WORK(&io->work, ext4_end_aio_dio_work); 3741 INIT_WORK(&io->work, ext4_end_io_work);
3661 INIT_LIST_HEAD(&io->list); 3742 INIT_LIST_HEAD(&io->list);
3662 } 3743 }
3663 3744
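
flush_completed_IO() now takes i_completed_io_lock around every list operation, but drops it across ext4_end_io_nolock() because the extent conversion can block, and a spinlock must not be held over sleeping code. A sketch of that drain-the-list-unlocked idiom with pthreads (a mutex stands in for spin_lock_irqsave; break-on-error is a simplification, the kernel records the error and keeps scanning):

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int payload; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

/* stand-in for ext4_end_io_nolock(): may sleep, never call it locked */
static int process(struct node *n)
{
	printf("converting item %d\n", n->payload);
	return 0;
}

static int drain(void)
{
	int err = 0;

	pthread_mutex_lock(&list_lock);
	while (head != NULL) {
		struct node *n = head;
		int ret;

		pthread_mutex_unlock(&list_lock);	/* drop across slow work */
		ret = process(n);
		pthread_mutex_lock(&list_lock);		/* retake to edit the list */
		if (ret < 0) {
			err = ret;
			break;		/* leave the entry queued for a retry */
		}
		head = n->next;				/* unlink on success */
	}
	pthread_mutex_unlock(&list_lock);
	return err;
}

int main(void)
{
	struct node b = { NULL, 2 }, a = { &b, 1 };

	head = &a;
	return drain();
}
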
@@ -3669,6 +3750,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3669{ 3750{
3670 ext4_io_end_t *io_end = iocb->private; 3751 ext4_io_end_t *io_end = iocb->private;
3671 struct workqueue_struct *wq; 3752 struct workqueue_struct *wq;
3753 unsigned long flags;
3754 struct ext4_inode_info *ei;
3672 3755
3673 /* if not async direct IO or dio with 0 bytes write, just return */ 3756 /* if not async direct IO or dio with 0 bytes write, just return */
3674 if (!io_end || !size) 3757 if (!io_end || !size)
@@ -3680,7 +3763,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3680 size); 3763 size);
3681 3764
3682 /* if not aio dio with unwritten extents, just free io and return */ 3765 /* if not aio dio with unwritten extents, just free io and return */
3683 if (io_end->flag != DIO_AIO_UNWRITTEN){ 3766 if (io_end->flag != EXT4_IO_UNWRITTEN){
3684 ext4_free_io_end(io_end); 3767 ext4_free_io_end(io_end);
3685 iocb->private = NULL; 3768 iocb->private = NULL;
3686 return; 3769 return;
@@ -3688,16 +3771,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3688 3771
3689 io_end->offset = offset; 3772 io_end->offset = offset;
3690 io_end->size = size; 3773 io_end->size = size;
3774 io_end->flag = EXT4_IO_UNWRITTEN;
3691 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3775 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3692 3776
3693 /* queue the work to convert unwritten extents to written */ 3777 /* queue the work to convert unwritten extents to written */
3694 queue_work(wq, &io_end->work); 3778 queue_work(wq, &io_end->work);
3695 3779
3696 /* Add the io_end to per-inode completed aio dio list*/ 3780 /* Add the io_end to per-inode completed aio dio list*/
3697 list_add_tail(&io_end->list, 3781 ei = EXT4_I(io_end->inode);
3698 &EXT4_I(io_end->inode)->i_aio_dio_complete_list); 3782 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3783 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3784 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3699 iocb->private = NULL; 3785 iocb->private = NULL;
3700} 3786}
3787
3788static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3789{
3790 ext4_io_end_t *io_end = bh->b_private;
3791 struct workqueue_struct *wq;
3792 struct inode *inode;
3793 unsigned long flags;
3794
3795 if (!test_clear_buffer_uninit(bh) || !io_end)
3796 goto out;
3797
3798 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3799 printk("sb umounted, discard end_io request for inode %lu\n",
3800 io_end->inode->i_ino);
3801 ext4_free_io_end(io_end);
3802 goto out;
3803 }
3804
3805 io_end->flag = EXT4_IO_UNWRITTEN;
3806 inode = io_end->inode;
3807
3808 /* Add the io_end to per-inode completed io list*/
3809 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3810 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3811 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3812
3813 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3814 /* queue the work to convert unwritten extents to written */
3815 queue_work(wq, &io_end->work);
3816out:
3817 bh->b_private = NULL;
3818 bh->b_end_io = NULL;
3819 clear_buffer_uninit(bh);
3820 end_buffer_async_write(bh, uptodate);
3821}
3822
3823static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3824{
3825 ext4_io_end_t *io_end;
3826 struct page *page = bh->b_page;
3827 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3828 size_t size = bh->b_size;
3829
3830retry:
3831 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3832 if (!io_end) {
3833 if (printk_ratelimit())
3834 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3835 schedule();
3836 goto retry;
3837 }
3838 io_end->offset = offset;
3839 io_end->size = size;
3840 /*
3841 * We need to hold a reference to the page to make sure it
3842 * doesn't get evicted before ext4_end_io_work() has a chance
3843 * to convert the extent from unwritten to written.
3844 */
3845 io_end->page = page;
3846 get_page(io_end->page);
3847
3848 bh->b_private = io_end;
3849 bh->b_end_io = ext4_end_io_buffer_write;
3850 return 0;
3851}
3852
3701/* 3853/*
3702 * For ext4 extent files, ext4 will do direct-io write to holes, 3854 * For ext4 extent files, ext4 will do direct-io write to holes,
3703 * preallocated extents, and those write extend the file, no need to 3855 * preallocated extents, and those write extend the file, no need to
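
ext4_set_bh_endio() (added above) runs where failure is not an option, so its GFP_ATOMIC allocation is retried in a loop with a schedule() between attempts, and the io_end takes a page reference so the page survives until the worker runs. The retry shape in userspace terms (sched_yield() standing in for schedule()):

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

static void *alloc_until_success(size_t size)
{
	void *p;

	while ((p = malloc(size)) == NULL) {
		fputs("allocation failed, retrying\n", stderr);
		sched_yield();	/* let the system reclaim some memory */
	}
	return p;
}

int main(void)
{
	free(alloc_until_success(64));
	return 0;
}
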
@@ -3751,7 +3903,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3751 iocb->private = NULL; 3903 iocb->private = NULL;
3752 EXT4_I(inode)->cur_aio_dio = NULL; 3904 EXT4_I(inode)->cur_aio_dio = NULL;
3753 if (!is_sync_kiocb(iocb)) { 3905 if (!is_sync_kiocb(iocb)) {
3754 iocb->private = ext4_init_io_end(inode); 3906 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3755 if (!iocb->private) 3907 if (!iocb->private)
3756 return -ENOMEM; 3908 return -ENOMEM;
3757 /* 3909 /*
@@ -3767,7 +3919,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3767 ret = blockdev_direct_IO(rw, iocb, inode, 3919 ret = blockdev_direct_IO(rw, iocb, inode,
3768 inode->i_sb->s_bdev, iov, 3920 inode->i_sb->s_bdev, iov,
3769 offset, nr_segs, 3921 offset, nr_segs,
3770 ext4_get_block_dio_write, 3922 ext4_get_block_write,
3771 ext4_end_io_dio); 3923 ext4_end_io_dio);
3772 if (iocb->private) 3924 if (iocb->private)
3773 EXT4_I(inode)->cur_aio_dio = NULL; 3925 EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3788,8 +3940,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3788 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3940 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3789 ext4_free_io_end(iocb->private); 3941 ext4_free_io_end(iocb->private);
3790 iocb->private = NULL; 3942 iocb->private = NULL;
3791 } else if (ret > 0 && (EXT4_I(inode)->i_state & 3943 } else if (ret > 0 && ext4_test_inode_state(inode,
3792 EXT4_STATE_DIO_UNWRITTEN)) { 3944 EXT4_STATE_DIO_UNWRITTEN)) {
3793 int err; 3945 int err;
3794 /* 3946 /*
3795 * for non AIO case, since the IO is already 3947 * for non AIO case, since the IO is already
@@ -3799,7 +3951,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3799 offset, ret); 3951 offset, ret);
3800 if (err < 0) 3952 if (err < 0)
3801 ret = err; 3953 ret = err;
3802 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; 3954 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3803 } 3955 }
3804 return ret; 3956 return ret;
3805 } 3957 }
@@ -4130,18 +4282,27 @@ no_top:
4130 * We release `count' blocks on disk, but (last - first) may be greater 4282 * We release `count' blocks on disk, but (last - first) may be greater
4131 * than `count' because there can be holes in there. 4283 * than `count' because there can be holes in there.
4132 */ 4284 */
4133static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 4285static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4134 struct buffer_head *bh, 4286 struct buffer_head *bh,
4135 ext4_fsblk_t block_to_free, 4287 ext4_fsblk_t block_to_free,
4136 unsigned long count, __le32 *first, 4288 unsigned long count, __le32 *first,
4137 __le32 *last) 4289 __le32 *last)
4138{ 4290{
4139 __le32 *p; 4291 __le32 *p;
4140 int flags = EXT4_FREE_BLOCKS_FORGET; 4292 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4141 4293
4142 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4294 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4143 flags |= EXT4_FREE_BLOCKS_METADATA; 4295 flags |= EXT4_FREE_BLOCKS_METADATA;
4144 4296
4297 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4298 count)) {
4299 ext4_error(inode->i_sb, "inode #%lu: "
4300 "attempt to clear blocks %llu len %lu, invalid",
4301 inode->i_ino, (unsigned long long) block_to_free,
4302 count);
4303 return 1;
4304 }
4305
4145 if (try_to_extend_transaction(handle, inode)) { 4306 if (try_to_extend_transaction(handle, inode)) {
4146 if (bh) { 4307 if (bh) {
4147 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4308 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4160,6 +4321,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4160 *p = 0; 4321 *p = 0;
4161 4322
4162 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4323 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4324 return 0;
4163} 4325}
4164 4326
4165/** 4327/**
@@ -4215,9 +4377,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4215 } else if (nr == block_to_free + count) { 4377 } else if (nr == block_to_free + count) {
4216 count++; 4378 count++;
4217 } else { 4379 } else {
4218 ext4_clear_blocks(handle, inode, this_bh, 4380 if (ext4_clear_blocks(handle, inode, this_bh,
4219 block_to_free, 4381 block_to_free, count,
4220 count, block_to_free_p, p); 4382 block_to_free_p, p))
4383 break;
4221 block_to_free = nr; 4384 block_to_free = nr;
4222 block_to_free_p = p; 4385 block_to_free_p = p;
4223 count = 1; 4386 count = 1;
@@ -4241,7 +4404,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4241 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4404 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4242 ext4_handle_dirty_metadata(handle, inode, this_bh); 4405 ext4_handle_dirty_metadata(handle, inode, this_bh);
4243 else 4406 else
4244 ext4_error(inode->i_sb, __func__, 4407 ext4_error(inode->i_sb,
4245 "circular indirect block detected, " 4408 "circular indirect block detected, "
4246 "inode=%lu, block=%llu", 4409 "inode=%lu, block=%llu",
4247 inode->i_ino, 4410 inode->i_ino,
@@ -4281,6 +4444,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4281 if (!nr) 4444 if (!nr)
4282 continue; /* A hole */ 4445 continue; /* A hole */
4283 4446
4447 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4448 nr, 1)) {
4449 ext4_error(inode->i_sb,
4450 "indirect mapped block in inode "
4451 "#%lu invalid (level %d, blk #%lu)",
4452 inode->i_ino, depth,
4453 (unsigned long) nr);
4454 break;
4455 }
4456
4284 /* Go read the buffer for the next level down */ 4457 /* Go read the buffer for the next level down */
4285 bh = sb_bread(inode->i_sb, nr); 4458 bh = sb_bread(inode->i_sb, nr);
4286 4459
@@ -4289,7 +4462,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4289 * (should be rare). 4462 * (should be rare).
4290 */ 4463 */
4291 if (!bh) { 4464 if (!bh) {
4292 ext4_error(inode->i_sb, "ext4_free_branches", 4465 ext4_error(inode->i_sb,
4293 "Read failure, inode=%lu, block=%llu", 4466 "Read failure, inode=%lu, block=%llu",
4294 inode->i_ino, nr); 4467 inode->i_ino, nr);
4295 continue; 4468 continue;
@@ -4433,8 +4606,10 @@ void ext4_truncate(struct inode *inode)
4433 if (!ext4_can_truncate(inode)) 4606 if (!ext4_can_truncate(inode))
4434 return; 4607 return;
4435 4608
4609 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
4610
4436 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4611 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4437 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4612 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4438 4613
4439 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4614 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4440 ext4_ext_truncate(inode); 4615 ext4_ext_truncate(inode);
@@ -4604,9 +4779,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4604 4779
4605 bh = sb_getblk(sb, block); 4780 bh = sb_getblk(sb, block);
4606 if (!bh) { 4781 if (!bh) {
4607 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4782 ext4_error(sb, "unable to read inode block - "
4608 "inode block - inode=%lu, block=%llu", 4783 "inode=%lu, block=%llu", inode->i_ino, block);
4609 inode->i_ino, block);
4610 return -EIO; 4784 return -EIO;
4611 } 4785 }
4612 if (!buffer_uptodate(bh)) { 4786 if (!buffer_uptodate(bh)) {
@@ -4704,9 +4878,8 @@ make_io:
4704 submit_bh(READ_META, bh); 4878 submit_bh(READ_META, bh);
4705 wait_on_buffer(bh); 4879 wait_on_buffer(bh);
4706 if (!buffer_uptodate(bh)) { 4880 if (!buffer_uptodate(bh)) {
4707 ext4_error(sb, __func__, 4881 ext4_error(sb, "unable to read inode block - inode=%lu,"
4708 "unable to read inode block - inode=%lu, " 4882 " block=%llu", inode->i_ino, block);
4709 "block=%llu", inode->i_ino, block);
4710 brelse(bh); 4883 brelse(bh);
4711 return -EIO; 4884 return -EIO;
4712 } 4885 }
@@ -4720,7 +4893,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4720{ 4893{
4721 /* We have all inode data except xattrs in memory here. */ 4894 /* We have all inode data except xattrs in memory here. */
4722 return __ext4_get_inode_loc(inode, iloc, 4895 return __ext4_get_inode_loc(inode, iloc,
4723 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4896 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4724} 4897}
4725 4898
4726void ext4_set_inode_flags(struct inode *inode) 4899void ext4_set_inode_flags(struct inode *inode)
@@ -4814,7 +4987,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4814 } 4987 }
4815 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4988 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4816 4989
4817 ei->i_state = 0; 4990 ei->i_state_flags = 0;
4818 ei->i_dir_start_lookup = 0; 4991 ei->i_dir_start_lookup = 0;
4819 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4992 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4820 /* We now have enough fields to check if the inode was active or not. 4993 /* We now have enough fields to check if the inode was active or not.
@@ -4897,7 +5070,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4897 EXT4_GOOD_OLD_INODE_SIZE + 5070 EXT4_GOOD_OLD_INODE_SIZE +
4898 ei->i_extra_isize; 5071 ei->i_extra_isize;
4899 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 5072 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4900 ei->i_state |= EXT4_STATE_XATTR; 5073 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4901 } 5074 }
4902 } else 5075 } else
4903 ei->i_extra_isize = 0; 5076 ei->i_extra_isize = 0;
@@ -4917,8 +5090,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4917 ret = 0; 5090 ret = 0;
4918 if (ei->i_file_acl && 5091 if (ei->i_file_acl &&
4919 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5092 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4920 ext4_error(sb, __func__, 5093 ext4_error(sb, "bad extended attribute block %llu inode #%lu",
4921 "bad extended attribute block %llu in inode #%lu",
4922 ei->i_file_acl, inode->i_ino); 5094 ei->i_file_acl, inode->i_ino);
4923 ret = -EIO; 5095 ret = -EIO;
4924 goto bad_inode; 5096 goto bad_inode;
@@ -4964,8 +5136,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4964 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5136 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4965 } else { 5137 } else {
4966 ret = -EIO; 5138 ret = -EIO;
4967 ext4_error(inode->i_sb, __func__, 5139 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
4968 "bogus i_mode (%o) for inode=%lu",
4969 inode->i_mode, inode->i_ino); 5140 inode->i_mode, inode->i_ino);
4970 goto bad_inode; 5141 goto bad_inode;
4971 } 5142 }
@@ -5037,7 +5208,7 @@ static int ext4_do_update_inode(handle_t *handle,
5037 5208
5038 /* For fields not tracked in the in-memory inode, 5209 /* For fields not tracked in the in-memory inode,
5039 * initialise them to zero for new inodes. */ 5210 * initialise them to zero for new inodes. */
5040 if (ei->i_state & EXT4_STATE_NEW) 5211 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5041 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5212 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5042 5213
5043 ext4_get_inode_flags(ei); 5214 ext4_get_inode_flags(ei);
@@ -5101,7 +5272,7 @@ static int ext4_do_update_inode(handle_t *handle,
5101 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5272 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5102 sb->s_dirt = 1; 5273 sb->s_dirt = 1;
5103 ext4_handle_sync(handle); 5274 ext4_handle_sync(handle);
5104 err = ext4_handle_dirty_metadata(handle, inode, 5275 err = ext4_handle_dirty_metadata(handle, NULL,
5105 EXT4_SB(sb)->s_sbh); 5276 EXT4_SB(sb)->s_sbh);
5106 } 5277 }
5107 } 5278 }
@@ -5130,10 +5301,10 @@ static int ext4_do_update_inode(handle_t *handle,
5130 } 5301 }
5131 5302
5132 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5303 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5133 rc = ext4_handle_dirty_metadata(handle, inode, bh); 5304 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5134 if (!err) 5305 if (!err)
5135 err = rc; 5306 err = rc;
5136 ei->i_state &= ~EXT4_STATE_NEW; 5307 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5137 5308
5138 ext4_update_inode_fsync_trans(handle, inode, 0); 5309 ext4_update_inode_fsync_trans(handle, inode, 0);
5139out_brelse: 5310out_brelse:
@@ -5204,10 +5375,8 @@ int ext4_write_inode(struct inode *inode, int wait)
5204 if (wait) 5375 if (wait)
5205 sync_dirty_buffer(iloc.bh); 5376 sync_dirty_buffer(iloc.bh);
5206 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5377 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5207 ext4_error(inode->i_sb, __func__, 5378 ext4_error(inode->i_sb, "IO error syncing inode, "
5208 "IO error syncing inode, " 5379 "inode=%lu, block=%llu", inode->i_ino,
5209 "inode=%lu, block=%llu",
5210 inode->i_ino,
5211 (unsigned long long)iloc.bh->b_blocknr); 5380 (unsigned long long)iloc.bh->b_blocknr);
5212 err = -EIO; 5381 err = -EIO;
5213 } 5382 }
@@ -5288,7 +5457,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5288 } 5457 }
5289 5458
5290 if (S_ISREG(inode->i_mode) && 5459 if (S_ISREG(inode->i_mode) &&
5291 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 5460 attr->ia_valid & ATTR_SIZE &&
5461 (attr->ia_size < inode->i_size ||
5462 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
5292 handle_t *handle; 5463 handle_t *handle;
5293 5464
5294 handle = ext4_journal_start(inode, 3); 5465 handle = ext4_journal_start(inode, 3);
@@ -5319,6 +5490,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5319 goto err_out; 5490 goto err_out;
5320 } 5491 }
5321 } 5492 }
5493 /* ext4_truncate will clear the flag */
5494 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
5495 ext4_truncate(inode);
5322 } 5496 }
5323 5497
5324 rc = inode_setattr(inode, attr); 5498 rc = inode_setattr(inode, attr);
@@ -5557,8 +5731,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
5557 entry = IFIRST(header); 5731 entry = IFIRST(header);
5558 5732
5559 /* No extended attributes present */ 5733 /* No extended attributes present */
5560 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5734 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5561 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5735 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5562 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5736 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5563 new_extra_isize); 5737 new_extra_isize);
5564 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5738 EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5602,7 +5776,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5602 err = ext4_reserve_inode_write(handle, inode, &iloc); 5776 err = ext4_reserve_inode_write(handle, inode, &iloc);
5603 if (ext4_handle_valid(handle) && 5777 if (ext4_handle_valid(handle) &&
5604 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5778 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5605 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5779 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5606 /* 5780 /*
5607 * We need extra buffer credits since we may write into EA block 5781 * We need extra buffer credits since we may write into EA block
5608 * with this same handle. If journal_extend fails, then it will 5782 * with this same handle. If journal_extend fails, then it will
@@ -5616,10 +5790,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5616 sbi->s_want_extra_isize, 5790 sbi->s_want_extra_isize,
5617 iloc, handle); 5791 iloc, handle);
5618 if (ret) { 5792 if (ret) {
5619 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5793 ext4_set_inode_state(inode,
5794 EXT4_STATE_NO_EXPAND);
5620 if (mnt_count != 5795 if (mnt_count !=
5621 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5796 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5622 ext4_warning(inode->i_sb, __func__, 5797 ext4_warning(inode->i_sb,
5623 "Unable to expand inode %lu. Delete" 5798 "Unable to expand inode %lu. Delete"
5624 " some EAs or run e2fsck.", 5799 " some EAs or run e2fsck.",
5625 inode->i_ino); 5800 inode->i_ino);
@@ -5683,7 +5858,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5683 err = jbd2_journal_get_write_access(handle, iloc.bh); 5858 err = jbd2_journal_get_write_access(handle, iloc.bh);
5684 if (!err) 5859 if (!err)
5685 err = ext4_handle_dirty_metadata(handle, 5860 err = ext4_handle_dirty_metadata(handle,
5686 inode, 5861 NULL,
5687 iloc.bh); 5862 iloc.bh);
5688 brelse(iloc.bh); 5863 brelse(iloc.bh);
5689 } 5864 }
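
A second theme running through these inode.c hunks: every open-coded test like EXT4_I(inode)->i_state & EXT4_STATE_JDATA becomes an accessor call (ext4_test/set/clear_inode_state over the renamed i_state_flags), so call sites no longer care how the flags word is manipulated. A plausible non-atomic shape for such accessors, as a compilable sketch (the kernel's versions wrap test_bit()/set_bit()/clear_bit()):

#include <stdio.h>

enum toy_state { STATE_NEW, STATE_JDATA, STATE_XATTR };

struct toy_inode { unsigned long state_flags; };

static void set_state(struct toy_inode *i, enum toy_state bit)
{
	i->state_flags |= 1UL << bit;
}

static void clear_state(struct toy_inode *i, enum toy_state bit)
{
	i->state_flags &= ~(1UL << bit);
}

static int test_state(const struct toy_inode *i, enum toy_state bit)
{
	return (i->state_flags >> bit) & 1;
}

int main(void)
{
	struct toy_inode ino = { 0 };

	set_state(&ino, STATE_JDATA);
	printf("jdata=%d\n", test_state(&ino, STATE_JDATA));
	clear_state(&ino, STATE_JDATA);
	printf("jdata=%d\n", test_state(&ino, STATE_JDATA));
	return 0;
}
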
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193126db..016d0249294f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
92 flags &= ~EXT4_EXTENTS_FL; 92 flags &= ~EXT4_EXTENTS_FL;
93 } 93 }
94 94
95 if (flags & EXT4_EOFBLOCKS_FL) {
96 /* we don't support adding EOFBLOCKS flag */
97 if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
98 err = -EOPNOTSUPP;
99 goto flags_out;
100 }
101 } else if (oldflags & EXT4_EOFBLOCKS_FL)
102 ext4_truncate(inode);
103
95 handle = ext4_journal_start(inode, 1); 104 handle = ext4_journal_start(inode, 1);
96 if (IS_ERR(handle)) { 105 if (IS_ERR(handle)) {
97 err = PTR_ERR(handle); 106 err = PTR_ERR(handle);
@@ -249,7 +258,8 @@ setversion_out:
249 if (me.moved_len > 0) 258 if (me.moved_len > 0)
250 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
251 260
252 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me)))
253 err = -EFAULT; 263 err = -EFAULT;
254mext_out: 264mext_out:
255 fput(donor_filp); 265 fput(donor_filp);
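
The EOFBLOCKS_FL hunk in ext4_ioctl() encodes an asymmetric rule: userspace may keep or clear the flag but never set it, and clearing it truncates the preallocated blocks past EOF. The decision table as a compilable sketch (helper name is illustrative; the flag value matches EXT4_EOFBLOCKS_FL):

#include <errno.h>

#define EOFBLOCKS_FL 0x00400000	/* kernel-managed, not user-settable */

/* returns 0 and sets *truncate, or -EOPNOTSUPP for a forbidden request */
static int check_setflags(unsigned oldflags, unsigned newflags, int *truncate)
{
	*truncate = 0;
	if (newflags & EOFBLOCKS_FL) {
		if (!(oldflags & EOFBLOCKS_FL))
			return -EOPNOTSUPP;	/* adding it is not allowed */
	} else if (oldflags & EOFBLOCKS_FL) {
		*truncate = 1;	/* dropping it trims blocks past i_size */
	}
	return 0;
}

int main(void)
{
	int t;

	return check_setflags(0, EOFBLOCKS_FL, &t) == -EOPNOTSUPP ? 0 : 1;
}
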
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d34afad3e137..abb11e328b65 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
441 for (i = 0; i < count; i++) { 441 for (i = 0; i < count; i++) {
442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
443 ext4_fsblk_t blocknr; 443 ext4_fsblk_t blocknr;
444 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 444
445 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
445 blocknr += first + i; 446 blocknr += first + i;
446 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 ext4_grp_locked_error(sb, e4b->bd_group, 447 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 448 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %u)", 449 " %lu's block %llu(bit %u in group %u)",
@@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1255 1254
1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1255 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1257 ext4_fsblk_t blocknr; 1256 ext4_fsblk_t blocknr;
1258 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1257
1258 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1259 blocknr += block; 1259 blocknr += block;
1260 blocknr +=
1261 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1262 ext4_grp_locked_error(sb, e4b->bd_group, 1260 ext4_grp_locked_error(sb, e4b->bd_group,
1263 __func__, "double-free of inode" 1261 __func__, "double-free of inode"
1264 " %lu's block %llu(bit %u in group %u)", 1262 " %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1631 int max; 1629 int max;
1632 int err; 1630 int err;
1633 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1631 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1634 struct ext4_super_block *es = sbi->s_es;
1635 struct ext4_free_extent ex; 1632 struct ext4_free_extent ex;
1636 1633
1637 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1634 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1648 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1645 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1649 ext4_fsblk_t start; 1646 ext4_fsblk_t start;
1650 1647
1651 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1648 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1652 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1649 ex.fe_start;
1653 /* use do_div to get remainder (would be 64-bit modulo) */ 1650 /* use do_div to get remainder (would be 64-bit modulo) */
1654 if (do_div(start, sbi->s_stripe) == 0) { 1651 if (do_div(start, sbi->s_stripe) == 0) {
1655 ac->ac_found++; 1652 ac->ac_found++;
@@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1803 BUG_ON(sbi->s_stripe == 0); 1800 BUG_ON(sbi->s_stripe == 0);
1804 1801
1805 /* find first stripe-aligned block in group */ 1802 /* find first stripe-aligned block in group */
1806 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1803 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1807 + le32_to_cpu(sbi->s_es->s_first_data_block); 1804
1808 a = first_group_block + sbi->s_stripe - 1; 1805 a = first_group_block + sbi->s_stripe - 1;
1809 do_div(a, sbi->s_stripe); 1806 do_div(a, sbi->s_stripe);
1810 i = (a * sbi->s_stripe) - first_group_block; 1807 i = (a * sbi->s_stripe) - first_group_block;
@@ -2256,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2256 2253
2257 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2254 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2258 init_rwsem(&meta_group_info[i]->alloc_sem); 2255 init_rwsem(&meta_group_info[i]->alloc_sem);
2259 meta_group_info[i]->bb_free_root.rb_node = NULL; 2256 meta_group_info[i]->bb_free_root = RB_ROOT;
2260 2257
2261#ifdef DOUBLE_CHECK 2258#ifdef DOUBLE_CHECK
2262 { 2259 {
@@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2560 ext4_unlock_group(sb, entry->group); 2557 ext4_unlock_group(sb, entry->group);
2561 if (test_opt(sb, DISCARD)) { 2558 if (test_opt(sb, DISCARD)) {
2562 ext4_fsblk_t discard_block; 2559 ext4_fsblk_t discard_block;
2563 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
2564 2560
2565 discard_block = (ext4_fsblk_t)entry->group * 2561 discard_block = entry->start_blk +
2566 EXT4_BLOCKS_PER_GROUP(sb) 2562 ext4_group_first_block_no(sb, entry->group);
2567 + entry->start_blk
2568 + le32_to_cpu(es->s_first_data_block);
2569 trace_ext4_discard_blocks(sb, 2563 trace_ext4_discard_blocks(sb,
2570 (unsigned long long)discard_block, 2564 (unsigned long long)discard_block,
2571 entry->count); 2565 entry->count);
@@ -2703,14 +2697,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2703 if (err) 2697 if (err)
2704 goto out_err; 2698 goto out_err;
2705 2699
2706 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2700 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2707 + ac->ac_b_ex.fe_start
2708 + le32_to_cpu(es->s_first_data_block);
2709 2701
2710 len = ac->ac_b_ex.fe_len; 2702 len = ac->ac_b_ex.fe_len;
2711 if (!ext4_data_block_valid(sbi, block, len)) { 2703 if (!ext4_data_block_valid(sbi, block, len)) {
2712 ext4_error(sb, __func__, 2704 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2713 "Allocating blocks %llu-%llu which overlap "
2714 "fs metadata\n", block, block+len); 2705 "fs metadata\n", block, block+len);
2715 /* File system mounted not to panic on error 2706 /* File system mounted not to panic on error
2716 * Fix the bitmap and repeat the block allocation 2707 * Fix the bitmap and repeat the block allocation
@@ -3161,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3161 /* The max size of hash table is PREALLOC_TB_SIZE */ 3152 /* The max size of hash table is PREALLOC_TB_SIZE */
3162 order = PREALLOC_TB_SIZE - 1; 3153 order = PREALLOC_TB_SIZE - 1;
3163 3154
3164 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + 3155 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3165 ac->ac_g_ex.fe_start +
3166 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3167 /* 3156 /*
3168 * search for the prealloc space that is having 3157 * search for the prealloc space that is having
3169 * minimal distance from the goal block. 3158 * minimal distance from the goal block.
@@ -3526,8 +3515,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3526 if (bit >= end) 3515 if (bit >= end)
3527 break; 3516 break;
3528 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3517 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3529 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3518 start = ext4_group_first_block_no(sb, group) + bit;
3530 le32_to_cpu(sbi->s_es->s_first_data_block);
3531 mb_debug(1, " free preallocated %u/%u in group %u\n", 3519 mb_debug(1, " free preallocated %u/%u in group %u\n",
3532 (unsigned) start, (unsigned) next - bit, 3520 (unsigned) start, (unsigned) next - bit,
3533 (unsigned) group); 3521 (unsigned) group);
@@ -3623,15 +3611,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3623 3611
3624 bitmap_bh = ext4_read_block_bitmap(sb, group); 3612 bitmap_bh = ext4_read_block_bitmap(sb, group);
3625 if (bitmap_bh == NULL) { 3613 if (bitmap_bh == NULL) {
3626 ext4_error(sb, __func__, "Error in reading block " 3614 ext4_error(sb, "Error reading block bitmap for %u", group);
3627 "bitmap for %u", group);
3628 return 0; 3615 return 0;
3629 } 3616 }
3630 3617
3631 err = ext4_mb_load_buddy(sb, group, &e4b); 3618 err = ext4_mb_load_buddy(sb, group, &e4b);
3632 if (err) { 3619 if (err) {
3633 ext4_error(sb, __func__, "Error in loading buddy " 3620 ext4_error(sb, "Error loading buddy information for %u", group);
3634 "information for %u", group);
3635 put_bh(bitmap_bh); 3621 put_bh(bitmap_bh);
3636 return 0; 3622 return 0;
3637 } 3623 }
@@ -3804,15 +3790,15 @@ repeat:
3804 3790
3805 err = ext4_mb_load_buddy(sb, group, &e4b); 3791 err = ext4_mb_load_buddy(sb, group, &e4b);
3806 if (err) { 3792 if (err) {
3807 ext4_error(sb, __func__, "Error in loading buddy " 3793 ext4_error(sb, "Error loading buddy information for %u",
3808 "information for %u", group); 3794 group);
3809 continue; 3795 continue;
3810 } 3796 }
3811 3797
3812 bitmap_bh = ext4_read_block_bitmap(sb, group); 3798 bitmap_bh = ext4_read_block_bitmap(sb, group);
3813 if (bitmap_bh == NULL) { 3799 if (bitmap_bh == NULL) {
3814 ext4_error(sb, __func__, "Error in reading block " 3800 ext4_error(sb, "Error reading block bitmap for %u",
3815 "bitmap for %u", group); 3801 group);
3816 ext4_mb_release_desc(&e4b); 3802 ext4_mb_release_desc(&e4b);
3817 continue; 3803 continue;
3818 } 3804 }
@@ -3938,7 +3924,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3938 3924
3939 /* don't use group allocation for large files */ 3925 /* don't use group allocation for large files */
3940 size = max(size, isize); 3926 size = max(size, isize);
3941 if (size >= sbi->s_mb_stream_request) { 3927 if (size > sbi->s_mb_stream_request) {
3942 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3928 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
3943 return; 3929 return;
3944 } 3930 }
@@ -4077,8 +4063,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4077 4063
4078 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4064 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4079 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4065 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4080 ext4_error(sb, __func__, "Error in loading buddy " 4066 ext4_error(sb, "Error loading buddy information for %u",
4081 "information for %u", group); 4067 group);
4082 continue; 4068 continue;
4083 } 4069 }
4084 ext4_lock_group(sb, group); 4070 ext4_lock_group(sb, group);
@@ -4476,10 +4462,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4476 4462
4477 sbi = EXT4_SB(sb); 4463 sbi = EXT4_SB(sb);
4478 es = EXT4_SB(sb)->s_es; 4464 es = EXT4_SB(sb)->s_es;
4479 if (!ext4_data_block_valid(sbi, block, count)) { 4465 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4480 ext4_error(sb, __func__, 4466 !ext4_data_block_valid(sbi, block, count)) {
4481 "Freeing blocks not in datazone - " 4467 ext4_error(sb, "Freeing blocks not in datazone - "
4482 "block = %llu, count = %lu", block, count); 4468 "block = %llu, count = %lu", block, count);
4483 goto error_return; 4469 goto error_return;
4484 } 4470 }
4485 4471
@@ -4547,8 +4533,7 @@ do_more:
4547 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4533 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4548 EXT4_SB(sb)->s_itb_per_group)) { 4534 EXT4_SB(sb)->s_itb_per_group)) {
4549 4535
4550 ext4_error(sb, __func__, 4536 ext4_error(sb, "Freeing blocks in system zone - "
4551 "Freeing blocks in system zone - "
4552 "Block = %llu, count = %lu", block, count); 4537 "Block = %llu, count = %lu", block, count);
4553 /* err = 0. ext4_std_error should be a no op */ 4538 /* err = 0. ext4_std_error should be a no op */
4554 goto error_return; 4539 goto error_return;
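
Every mballoc.c hunk above collapses the same open-coded arithmetic -- group number times EXT4_BLOCKS_PER_GROUP() plus s_first_data_block -- into ext4_group_first_block_no() (and, for group/offset pairs, ext4_grp_offs_to_block()). A sketch of the identity these conversions rely on; the helper body below is paraphrased, not copied from ext4.h:

	static inline ext4_fsblk_t ext4_group_first_block_no(struct super_block *sb,
							     ext4_group_t group)
	{
		return (ext4_fsblk_t)group * EXT4_BLOCKS_PER_GROUP(sb) +
			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
	}

	/* hence the old and new call sites compute the same block: */
	block = (ext4_fsblk_t)group * EXT4_BLOCKS_PER_GROUP(sb) + offset
		+ le32_to_cpu(es->s_first_data_block);			/* old */
	block = ext4_group_first_block_no(sb, group) + offset;		/* new */

Besides shrinking each call site, this lets several now-unused local `es` variables be deleted, as the hunks above show.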
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 436521cae456..b619322c76f0 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -220,16 +220,9 @@ struct ext4_buddy {
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
222 222
223#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
224
225static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 223static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
226 struct ext4_free_extent *fex) 224 struct ext4_free_extent *fex)
227{ 225{
228 ext4_fsblk_t block; 226 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
229
230 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
231 + fex->fe_start
232 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
233 return block;
234} 227}
235#endif 228#endif
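
The in_range() macro deleted above was a local duplicate; its remaining users are assumed to pick up an identical shared definition from ext4's headers:

	/* assumed shared definition, same semantics as the deleted copy: */
	#define in_range(b, first, len) \
		((b) >= (first) && (b) <= (first) + (len) - 1)

ext4_grp_offs_to_block() itself now delegates to ext4_group_first_block_no(), keeping the group-to-block arithmetic in exactly one place.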
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 81415814b00b..8b87bd0eac95 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
365 * happened after we started the migrate. We need to 365 * happened after we started the migrate. We need to
366 * fail the migrate 366 * fail the migrate
367 */ 367 */
368 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { 368 if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
369 retval = -EAGAIN; 369 retval = -EAGAIN;
370 up_write(&EXT4_I(inode)->i_data_sem); 370 up_write(&EXT4_I(inode)->i_data_sem);
371 goto err_out; 371 goto err_out;
372 } else 372 } else
373 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 373 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
374 /* 374 /*
375 * We have the extent map build with the tmp inode. 375 * We have the extent map build with the tmp inode.
376 * Now copy the i_data across 376 * Now copy the i_data across
@@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode)
503 } 503 }
504 i_size_write(tmp_inode, i_size_read(inode)); 504 i_size_write(tmp_inode, i_size_read(inode));
505 /* 505 /*
506 * We don't want the inode to be reclaimed 506 * Set the i_nlink to zero so it will be deleted later
507 * if we got interrupted in between. We have 507 * when we drop the inode reference.
508 * this tmp inode carrying reference to the
509 * data blocks of the original file. We set
510 * the i_nlink to zero at the last stage after
511 * switching the original file to extent format
512 */ 508 */
513 tmp_inode->i_nlink = 1; 509 tmp_inode->i_nlink = 0;
514 510
515 ext4_ext_tree_init(handle, tmp_inode); 511 ext4_ext_tree_init(handle, tmp_inode);
516 ext4_orphan_add(handle, tmp_inode); 512 ext4_orphan_add(handle, tmp_inode);
@@ -533,10 +529,20 @@ int ext4_ext_migrate(struct inode *inode)
533 * allocation. 529 * allocation.
534 */ 530 */
535 down_read((&EXT4_I(inode)->i_data_sem)); 531 down_read((&EXT4_I(inode)->i_data_sem));
536 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; 532 ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
537 up_read((&EXT4_I(inode)->i_data_sem)); 533 up_read((&EXT4_I(inode)->i_data_sem));
538 534
539 handle = ext4_journal_start(inode, 1); 535 handle = ext4_journal_start(inode, 1);
536 if (IS_ERR(handle)) {
537 /*
538 * It is impossible to update on-disk structures without
539 * a handle, so just roll back in-core changes and leave other
540 * work to orphan_list_cleanup()
541 */
542 ext4_orphan_del(NULL, tmp_inode);
543 retval = PTR_ERR(handle);
544 goto out;
545 }
540 546
541 ei = EXT4_I(inode); 547 ei = EXT4_I(inode);
542 i_data = ei->i_data; 548 i_data = ei->i_data;
@@ -618,15 +624,8 @@ err_out:
618 624
619 /* Reset the extent details */ 625 /* Reset the extent details */
620 ext4_ext_tree_init(handle, tmp_inode); 626 ext4_ext_tree_init(handle, tmp_inode);
621
622 /*
623 * Set the i_nlink to zero so that
624 * generic_drop_inode really deletes the
625 * inode
626 */
627 tmp_inode->i_nlink = 0;
628
629 ext4_journal_stop(handle); 627 ext4_journal_stop(handle);
628out:
630 unlock_new_inode(tmp_inode); 629 unlock_new_inode(tmp_inode);
631 iput(tmp_inode); 630 iput(tmp_inode);
632 631
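
The new migrate.c error path leans on the kernel's pointer-error convention: ext4_journal_start() returns either a usable handle or an errno encoded in the pointer itself. A minimal sketch of the pattern, with names as in the hunk above:

	#include <linux/err.h>

	handle = ext4_journal_start(inode, 1);
	if (IS_ERR(handle)) {
		/* without a handle the on-disk orphan entry cannot be
		 * written, so undo only the in-core insertion and let
		 * orphan cleanup finish the job at the next mount */
		ext4_orphan_del(NULL, tmp_inode);
		retval = PTR_ERR(handle);	/* decode back to -errno */
		goto out;
	}

Setting tmp_inode->i_nlink to 0 at creation time (instead of just before the final iput()) pairs with the orphan-list insertion: the temporary inode is reliably deleted even if the migration is interrupted, which is why the late i_nlink = 0 in the err_out path could be dropped.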
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82c415be87a4..aa5fe28d180f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
152 int ret = 0; 152 int ret = 0;
153 153
154 if (inode1 == NULL) { 154 if (inode1 == NULL) {
155 ext4_error(inode2->i_sb, function, 155 __ext4_error(inode2->i_sb, function,
156 "Both inodes should not be NULL: " 156 "Both inodes should not be NULL: "
157 "inode1 NULL inode2 %lu", inode2->i_ino); 157 "inode1 NULL inode2 %lu", inode2->i_ino);
158 ret = -EIO; 158 ret = -EIO;
159 } else if (inode2 == NULL) { 159 } else if (inode2 == NULL) {
160 ext4_error(inode1->i_sb, function, 160 __ext4_error(inode1->i_sb, function,
161 "Both inodes should not be NULL: " 161 "Both inodes should not be NULL: "
162 "inode1 %lu inode2 NULL", inode1->i_ino); 162 "inode1 %lu inode2 NULL", inode1->i_ino);
163 ret = -EIO; 163 ret = -EIO;
@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
252 } 252 }
253 253
254 o_start->ee_len = start_ext->ee_len; 254 o_start->ee_len = start_ext->ee_len;
255 eblock = le32_to_cpu(start_ext->ee_block);
255 new_flag = 1; 256 new_flag = 1;
256 257
257 } else if (start_ext->ee_len && new_ext->ee_len && 258 } else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
262 * orig |------------------------------| 263 * orig |------------------------------|
263 */ 264 */
264 o_start->ee_len = start_ext->ee_len; 265 o_start->ee_len = start_ext->ee_len;
266 eblock = le32_to_cpu(start_ext->ee_block);
265 new_flag = 1; 267 new_flag = 1;
266 268
267 } else if (!start_ext->ee_len && new_ext->ee_len && 269 } else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -475,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
475 struct ext4_extent *oext, *o_start, *o_end, *prev_ext; 477 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
476 struct ext4_extent new_ext, start_ext, end_ext; 478 struct ext4_extent new_ext, start_ext, end_ext;
477 ext4_lblk_t new_ext_end; 479 ext4_lblk_t new_ext_end;
478 ext4_fsblk_t new_phys_end;
479 int oext_alen, new_ext_alen, end_ext_alen; 480 int oext_alen, new_ext_alen, end_ext_alen;
480 int depth = ext_depth(orig_inode); 481 int depth = ext_depth(orig_inode);
481 int ret; 482 int ret;
@@ -489,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
489 new_ext.ee_len = dext->ee_len; 490 new_ext.ee_len = dext->ee_len;
490 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 491 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
491 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 492 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
492 new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
493 493
494 /* 494 /*
495 * Case: original extent is first 495 * Case: original extent is first
@@ -502,6 +502,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
502 le32_to_cpu(oext->ee_block) + oext_alen) { 502 le32_to_cpu(oext->ee_block) + oext_alen) {
503 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - 503 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
504 le32_to_cpu(oext->ee_block)); 504 le32_to_cpu(oext->ee_block));
505 start_ext.ee_block = oext->ee_block;
505 copy_extent_status(oext, &start_ext); 506 copy_extent_status(oext, &start_ext);
506 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { 507 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
507 prev_ext = oext - 1; 508 prev_ext = oext - 1;
@@ -515,6 +516,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
515 start_ext.ee_len = cpu_to_le16( 516 start_ext.ee_len = cpu_to_le16(
516 ext4_ext_get_actual_len(prev_ext) + 517 ext4_ext_get_actual_len(prev_ext) +
517 new_ext_alen); 518 new_ext_alen);
519 start_ext.ee_block = oext->ee_block;
518 copy_extent_status(prev_ext, &start_ext); 520 copy_extent_status(prev_ext, &start_ext);
519 new_ext.ee_len = 0; 521 new_ext.ee_len = 0;
520 } 522 }
@@ -526,7 +528,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
526 * new_ext |-------| 528 * new_ext |-------|
527 */ 529 */
528 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 530 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
529 ext4_error(orig_inode->i_sb, __func__, 531 ext4_error(orig_inode->i_sb,
530 "new_ext_end(%u) should be less than or equal to " 532 "new_ext_end(%u) should be less than or equal to "
531 "oext->ee_block(%u) + oext_alen(%d) - 1", 533 "oext->ee_block(%u) + oext_alen(%d) - 1",
532 new_ext_end, le32_to_cpu(oext->ee_block), 534 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -689,12 +691,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
689 while (1) { 691 while (1) {
690 /* The extent for donor must be found. */ 692 /* The extent for donor must be found. */
691 if (!dext) { 693 if (!dext) {
692 ext4_error(donor_inode->i_sb, __func__, 694 ext4_error(donor_inode->i_sb,
693 "The extent for donor must be found"); 695 "The extent for donor must be found");
694 *err = -EIO; 696 *err = -EIO;
695 goto out; 697 goto out;
696 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 698 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
697 ext4_error(donor_inode->i_sb, __func__, 699 ext4_error(donor_inode->i_sb,
698 "Donor offset(%u) and the first block of donor " 700 "Donor offset(%u) and the first block of donor "
699 "extent(%u) should be equal", 701 "extent(%u) should be equal",
700 donor_off, 702 donor_off,
@@ -928,7 +930,7 @@ out2:
928} 930}
929 931
930/** 932/**
931 * mext_check_argumants - Check whether move extent can be done 933 * mext_check_arguments - Check whether move extent can be done
932 * 934 *
933 * @orig_inode: original inode 935 * @orig_inode: original inode
934 * @donor_inode: donor inode 936 * @donor_inode: donor inode
@@ -949,14 +951,6 @@ mext_check_arguments(struct inode *orig_inode,
949 unsigned int blkbits = orig_inode->i_blkbits; 951 unsigned int blkbits = orig_inode->i_blkbits;
950 unsigned int blocksize = 1 << blkbits; 952 unsigned int blocksize = 1 << blkbits;
951 953
952 /* Regular file check */
953 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
954 ext4_debug("ext4 move extent: The argument files should be "
955 "regular file [ino:orig %lu, donor %lu]\n",
956 orig_inode->i_ino, donor_inode->i_ino);
957 return -EINVAL;
958 }
959
960 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 954 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
961 ext4_debug("ext4 move extent: suid or sgid is set" 955 ext4_debug("ext4 move extent: suid or sgid is set"
962 " to donor file [ino:orig %lu, donor %lu]\n", 956 " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1204,6 +1198,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1204 return -EINVAL; 1198 return -EINVAL;
1205 } 1199 }
1206 1200
1201 /* Regular file check */
1202 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
1203 ext4_debug("ext4 move extent: The argument files should be "
1204 "regular file [ino:orig %lu, donor %lu]\n",
1205 orig_inode->i_ino, donor_inode->i_ino);
1206 return -EINVAL;
1207 }
1208
1207 /* Protect orig and donor inodes against a truncate */ 1209 /* Protect orig and donor inodes against a truncate */
1208 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1210 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1209 if (ret1 < 0) 1211 if (ret1 < 0)
@@ -1351,7 +1353,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1351 if (ret1 < 0) 1353 if (ret1 < 0)
1352 break; 1354 break;
1353 if (*moved_len > len) { 1355 if (*moved_len > len) {
1354 ext4_error(orig_inode->i_sb, __func__, 1356 ext4_error(orig_inode->i_sb,
1355 "We replaced blocks too much! " 1357 "We replaced blocks too much! "
1356 "sum of replaced: %llu requested: %llu", 1358 "sum of replaced: %llu requested: %llu",
1357 *moved_len, len); 1359 *moved_len, len);
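
Two details in the move_extent.c hunks deserve a note. First, mext_check_null_inode() switches to the double-underscore __ext4_error() because it reports on behalf of its caller: it forwards the caller-supplied function string rather than letting a wrapper macro capture its own __func__. A hedged sketch of that pattern (check_something() is a made-up helper, not ext4 code):

	static int check_something(struct super_block *sb, struct inode *inode,
				   const char *function)
	{
		if (inode == NULL) {
			/* report under the *caller's* name, not ours */
			__ext4_error(sb, function, "inode should not be NULL");
			return -EIO;
		}
		return 0;
	}

Second, the S_ISREG() check moves from mext_check_arguments() up to ext4_move_extents(), so non-regular files are rejected before the inodes are locked against truncate rather than after.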
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 17a17e10dd60..608d21f873ec 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
383 if (root->info.hash_version != DX_HASH_TEA && 383 if (root->info.hash_version != DX_HASH_TEA &&
384 root->info.hash_version != DX_HASH_HALF_MD4 && 384 root->info.hash_version != DX_HASH_HALF_MD4 &&
385 root->info.hash_version != DX_HASH_LEGACY) { 385 root->info.hash_version != DX_HASH_LEGACY) {
386 ext4_warning(dir->i_sb, __func__, 386 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
387 "Unrecognised inode hash code %d",
388 root->info.hash_version); 387 root->info.hash_version);
389 brelse(bh); 388 brelse(bh);
390 *err = ERR_BAD_DX_DIR; 389 *err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
399 hash = hinfo->hash; 398 hash = hinfo->hash;
400 399
401 if (root->info.unused_flags & 1) { 400 if (root->info.unused_flags & 1) {
402 ext4_warning(dir->i_sb, __func__, 401 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
403 "Unimplemented inode hash flags: %#06x",
404 root->info.unused_flags); 402 root->info.unused_flags);
405 brelse(bh); 403 brelse(bh);
406 *err = ERR_BAD_DX_DIR; 404 *err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
408 } 406 }
409 407
410 if ((indirect = root->info.indirect_levels) > 1) { 408 if ((indirect = root->info.indirect_levels) > 1) {
411 ext4_warning(dir->i_sb, __func__, 409 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
412 "Unimplemented inode hash depth: %#06x",
413 root->info.indirect_levels); 410 root->info.indirect_levels);
414 brelse(bh); 411 brelse(bh);
415 *err = ERR_BAD_DX_DIR; 412 *err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
421 418
422 if (dx_get_limit(entries) != dx_root_limit(dir, 419 if (dx_get_limit(entries) != dx_root_limit(dir,
423 root->info.info_length)) { 420 root->info.info_length)) {
424 ext4_warning(dir->i_sb, __func__, 421 ext4_warning(dir->i_sb, "dx entry: limit != root limit");
425 "dx entry: limit != root limit");
426 brelse(bh); 422 brelse(bh);
427 *err = ERR_BAD_DX_DIR; 423 *err = ERR_BAD_DX_DIR;
428 goto fail; 424 goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
433 { 429 {
434 count = dx_get_count(entries); 430 count = dx_get_count(entries);
435 if (!count || count > dx_get_limit(entries)) { 431 if (!count || count > dx_get_limit(entries)) {
436 ext4_warning(dir->i_sb, __func__, 432 ext4_warning(dir->i_sb,
437 "dx entry: no count or count > limit"); 433 "dx entry: no count or count > limit");
438 brelse(bh); 434 brelse(bh);
439 *err = ERR_BAD_DX_DIR; 435 *err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
478 goto fail2; 474 goto fail2;
479 at = entries = ((struct dx_node *) bh->b_data)->entries; 475 at = entries = ((struct dx_node *) bh->b_data)->entries;
480 if (dx_get_limit(entries) != dx_node_limit (dir)) { 476 if (dx_get_limit(entries) != dx_node_limit (dir)) {
481 ext4_warning(dir->i_sb, __func__, 477 ext4_warning(dir->i_sb,
482 "dx entry: limit != node limit"); 478 "dx entry: limit != node limit");
483 brelse(bh); 479 brelse(bh);
484 *err = ERR_BAD_DX_DIR; 480 *err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
494 } 490 }
495fail: 491fail:
496 if (*err == ERR_BAD_DX_DIR) 492 if (*err == ERR_BAD_DX_DIR)
497 ext4_warning(dir->i_sb, __func__, 493 ext4_warning(dir->i_sb,
498 "Corrupt dir inode %ld, running e2fsck is " 494 "Corrupt dir inode %ld, running e2fsck is "
499 "recommended.", dir->i_ino); 495 "recommended.", dir->i_ino);
500 return NULL; 496 return NULL;
@@ -947,9 +943,8 @@ restart:
947 wait_on_buffer(bh); 943 wait_on_buffer(bh);
948 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
949 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
950 ext4_error(sb, __func__, "reading directory #%lu " 946 ext4_error(sb, "reading directory #%lu offset %lu",
951 "offset %lu", dir->i_ino, 947 dir->i_ino, (unsigned long)block);
952 (unsigned long)block);
953 brelse(bh); 948 brelse(bh);
954 goto next; 949 goto next;
955 } 950 }
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1041 retval = ext4_htree_next_block(dir, hash, frame, 1036 retval = ext4_htree_next_block(dir, hash, frame,
1042 frames, NULL); 1037 frames, NULL);
1043 if (retval < 0) { 1038 if (retval < 0) {
1044 ext4_warning(sb, __func__, 1039 ext4_warning(sb,
1045 "error reading index page in directory #%lu", 1040 "error reading index page in directory #%lu",
1046 dir->i_ino); 1041 dir->i_ino);
1047 *err = retval; 1042 *err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1071 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1072 brelse(bh); 1067 brelse(bh);
1073 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1074 ext4_error(dir->i_sb, "ext4_lookup", 1069 ext4_error(dir->i_sb, "bad inode number: %u", ino);
1075 "bad inode number: %u", ino);
1076 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1077 } 1071 }
1078 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1079 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1080 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1081 ext4_error(dir->i_sb, __func__, 1075 ext4_error(dir->i_sb,
1082 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1083 ino); 1077 ino);
1084 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1110 brelse(bh); 1104 brelse(bh);
1111 1105
1112 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1113 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1107 ext4_error(child->d_inode->i_sb,
1114 "bad inode number: %u", ino); 1108 "bad inode number: %u", ino);
1115 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1116 } 1110 }
@@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1410 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1411 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1412 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1413 ext4_error(dir->i_sb, __func__, 1407 ext4_error(dir->i_sb,
1414 "invalid rec_len for '..' in inode %lu", 1408 "invalid rec_len for '..' in inode %lu",
1415 dir->i_ino); 1409 dir->i_ino);
1416 brelse(bh); 1410 brelse(bh);
@@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1575 1569
1576 if (levels && (dx_get_count(frames->entries) == 1570 if (levels && (dx_get_count(frames->entries) ==
1577 dx_get_limit(frames->entries))) { 1571 dx_get_limit(frames->entries))) {
1578 ext4_warning(sb, __func__, 1572 ext4_warning(sb, "Directory index full!");
1579 "Directory index full!");
1580 err = -ENOSPC; 1573 err = -ENOSPC;
1581 goto cleanup; 1574 goto cleanup;
1582 } 1575 }
@@ -1916,11 +1909,11 @@ static int empty_dir(struct inode *inode)
1916 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1909 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1917 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1910 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1918 if (err) 1911 if (err)
1919 ext4_error(inode->i_sb, __func__, 1912 ext4_error(inode->i_sb,
1920 "error %d reading directory #%lu offset 0", 1913 "error %d reading directory #%lu offset 0",
1921 err, inode->i_ino); 1914 err, inode->i_ino);
1922 else 1915 else
1923 ext4_warning(inode->i_sb, __func__, 1916 ext4_warning(inode->i_sb,
1924 "bad directory (dir #%lu) - no data block", 1917 "bad directory (dir #%lu) - no data block",
1925 inode->i_ino); 1918 inode->i_ino);
1926 return 1; 1919 return 1;
@@ -1931,7 +1924,7 @@ static int empty_dir(struct inode *inode)
1931 !le32_to_cpu(de1->inode) || 1924 !le32_to_cpu(de1->inode) ||
1932 strcmp(".", de->name) || 1925 strcmp(".", de->name) ||
1933 strcmp("..", de1->name)) { 1926 strcmp("..", de1->name)) {
1934 ext4_warning(inode->i_sb, "empty_dir", 1927 ext4_warning(inode->i_sb,
1935 "bad directory (dir #%lu) - no `.' or `..'", 1928 "bad directory (dir #%lu) - no `.' or `..'",
1936 inode->i_ino); 1929 inode->i_ino);
1937 brelse(bh); 1930 brelse(bh);
@@ -1949,7 +1942,7 @@ static int empty_dir(struct inode *inode)
1949 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1942 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1950 if (!bh) { 1943 if (!bh) {
1951 if (err) 1944 if (err)
1952 ext4_error(sb, __func__, 1945 ext4_error(sb,
1953 "error %d reading directory" 1946 "error %d reading directory"
1954 " #%lu offset %u", 1947 " #%lu offset %u",
1955 err, inode->i_ino, offset); 1948 err, inode->i_ino, offset);
@@ -2020,11 +2013,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2020 err = ext4_reserve_inode_write(handle, inode, &iloc); 2013 err = ext4_reserve_inode_write(handle, inode, &iloc);
2021 if (err) 2014 if (err)
2022 goto out_unlock; 2015 goto out_unlock;
2016 /*
2017 * Due to previous errors inode may be already a part of on-disk
2018 * orphan list. If so skip on-disk list modification.
2019 */
2020 if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
2021 (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
2022 goto mem_insert;
2023 2023
2024 /* Insert this inode at the head of the on-disk orphan list... */ 2024 /* Insert this inode at the head of the on-disk orphan list... */
2025 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 2025 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2026 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 2026 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2027 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); 2027 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
2028 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 2028 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2029 if (!err) 2029 if (!err)
2030 err = rc; 2030 err = rc;
@@ -2037,6 +2037,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2037 * 2037 *
2038 * This is safe: on error we're going to ignore the orphan list 2038 * This is safe: on error we're going to ignore the orphan list
2039 * anyway on the next recovery. */ 2039 * anyway on the next recovery. */
2040mem_insert:
2040 if (!err) 2041 if (!err)
2041 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2042 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2042 2043
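
The ext4_orphan_add() change separates the orphan list's two representations: the on-disk chain threaded through the inodes and the in-core list_head. If an earlier failure left this inode already chained on disk, the function now skips straight to the in-core insertion instead of linking it a second time. For reference, NEXT_ORPHAN() is assumed to overlay the chain pointer on the otherwise-unused i_dtime field:

	/* assumed definition (fs/ext4/namei.c); a value in
	 * [1, s_inodes_count] is a valid on-disk chain entry,
	 * which is exactly what the new guard tests: */
	#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime

Also note that ext4_handle_dirty_metadata() now receives NULL instead of the inode for the superblock buffer: s_sbh is superblock metadata, not metadata belonging to that inode.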
@@ -2096,7 +2097,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2096 if (err) 2097 if (err)
2097 goto out_brelse; 2098 goto out_brelse;
2098 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2099 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2099 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); 2100 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
2100 } else { 2101 } else {
2101 struct ext4_iloc iloc2; 2102 struct ext4_iloc iloc2;
2102 struct inode *i_prev = 2103 struct inode *i_prev =
@@ -2163,7 +2164,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2163 if (retval) 2164 if (retval)
2164 goto end_rmdir; 2165 goto end_rmdir;
2165 if (!EXT4_DIR_LINK_EMPTY(inode)) 2166 if (!EXT4_DIR_LINK_EMPTY(inode))
2166 ext4_warning(inode->i_sb, "ext4_rmdir", 2167 ext4_warning(inode->i_sb,
2167 "empty directory has too many links (%d)", 2168 "empty directory has too many links (%d)",
2168 inode->i_nlink); 2169 inode->i_nlink);
2169 inode->i_version++; 2170 inode->i_version++;
@@ -2215,7 +2216,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2215 goto end_unlink; 2216 goto end_unlink;
2216 2217
2217 if (!inode->i_nlink) { 2218 if (!inode->i_nlink) {
2218 ext4_warning(inode->i_sb, "ext4_unlink", 2219 ext4_warning(inode->i_sb,
2219 "Deleting nonexistent file (%lu), %d", 2220 "Deleting nonexistent file (%lu), %d",
2220 inode->i_ino, inode->i_nlink); 2221 inode->i_ino, inode->i_nlink);
2221 inode->i_nlink = 1; 2222 inode->i_nlink = 1;
@@ -2462,7 +2463,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2462 } 2463 }
2463 } 2464 }
2464 if (retval) { 2465 if (retval) {
2465 ext4_warning(old_dir->i_sb, "ext4_rename", 2466 ext4_warning(old_dir->i_sb,
2466 "Deleting old file (%lu), %d, error=%d", 2467 "Deleting old file (%lu), %d, error=%d",
2467 old_dir->i_ino, old_dir->i_nlink, retval); 2468 old_dir->i_ino, old_dir->i_nlink, retval);
2468 } 2469 }
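
Everywhere in namei.c above, ext4_error() and ext4_warning() lose their explicit __func__/name argument. That only compiles if both have become macros that capture the caller's name themselves, presumably defined next to the renamed __ext4_error()/__ext4_warning() in ext4.h, roughly:

	/* sketch of the assumed wrappers, not copied from ext4.h: */
	#define ext4_error(sb, fmt, ...) \
		__ext4_error((sb), __func__, (fmt), ##__VA_ARGS__)
	#define ext4_warning(sb, fmt, ...) \
		__ext4_warning((sb), __func__, (fmt), ##__VA_ARGS__)

Each call site sheds an argument while the reported location stays accurate, because __func__ expands where the macro is used rather than inside the printing helper.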
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3b2c5541d8a6..5692c48754a0 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
48 48
49 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 49 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
50 if (group != sbi->s_groups_count) 50 if (group != sbi->s_groups_count)
51 ext4_warning(sb, __func__, 51 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
52 "Cannot add at group %u (only %u groups)",
53 input->group, sbi->s_groups_count); 52 input->group, sbi->s_groups_count);
54 else if (offset != 0) 53 else if (offset != 0)
55 ext4_warning(sb, __func__, "Last group not full"); 54 ext4_warning(sb, "Last group not full");
56 else if (input->reserved_blocks > input->blocks_count / 5) 55 else if (input->reserved_blocks > input->blocks_count / 5)
57 ext4_warning(sb, __func__, "Reserved blocks too high (%u)", 56 ext4_warning(sb, "Reserved blocks too high (%u)",
58 input->reserved_blocks); 57 input->reserved_blocks);
59 else if (free_blocks_count < 0) 58 else if (free_blocks_count < 0)
60 ext4_warning(sb, __func__, "Bad blocks count %u", 59 ext4_warning(sb, "Bad blocks count %u",
61 input->blocks_count); 60 input->blocks_count);
62 else if (!(bh = sb_bread(sb, end - 1))) 61 else if (!(bh = sb_bread(sb, end - 1)))
63 ext4_warning(sb, __func__, 62 ext4_warning(sb, "Cannot read last block (%llu)",
64 "Cannot read last block (%llu)",
65 end - 1); 63 end - 1);
66 else if (outside(input->block_bitmap, start, end)) 64 else if (outside(input->block_bitmap, start, end))
67 ext4_warning(sb, __func__, 65 ext4_warning(sb, "Block bitmap not in group (block %llu)",
68 "Block bitmap not in group (block %llu)",
69 (unsigned long long)input->block_bitmap); 66 (unsigned long long)input->block_bitmap);
70 else if (outside(input->inode_bitmap, start, end)) 67 else if (outside(input->inode_bitmap, start, end))
71 ext4_warning(sb, __func__, 68 ext4_warning(sb, "Inode bitmap not in group (block %llu)",
72 "Inode bitmap not in group (block %llu)",
73 (unsigned long long)input->inode_bitmap); 69 (unsigned long long)input->inode_bitmap);
74 else if (outside(input->inode_table, start, end) || 70 else if (outside(input->inode_table, start, end) ||
75 outside(itend - 1, start, end)) 71 outside(itend - 1, start, end))
76 ext4_warning(sb, __func__, 72 ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
77 "Inode table not in group (blocks %llu-%llu)",
78 (unsigned long long)input->inode_table, itend - 1); 73 (unsigned long long)input->inode_table, itend - 1);
79 else if (input->inode_bitmap == input->block_bitmap) 74 else if (input->inode_bitmap == input->block_bitmap)
80 ext4_warning(sb, __func__, 75 ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
81 "Block bitmap same as inode bitmap (%llu)",
82 (unsigned long long)input->block_bitmap); 76 (unsigned long long)input->block_bitmap);
83 else if (inside(input->block_bitmap, input->inode_table, itend)) 77 else if (inside(input->block_bitmap, input->inode_table, itend))
84 ext4_warning(sb, __func__, 78 ext4_warning(sb, "Block bitmap (%llu) in inode table "
85 "Block bitmap (%llu) in inode table (%llu-%llu)", 79 "(%llu-%llu)",
86 (unsigned long long)input->block_bitmap, 80 (unsigned long long)input->block_bitmap,
87 (unsigned long long)input->inode_table, itend - 1); 81 (unsigned long long)input->inode_table, itend - 1);
88 else if (inside(input->inode_bitmap, input->inode_table, itend)) 82 else if (inside(input->inode_bitmap, input->inode_table, itend))
89 ext4_warning(sb, __func__, 83 ext4_warning(sb, "Inode bitmap (%llu) in inode table "
90 "Inode bitmap (%llu) in inode table (%llu-%llu)", 84 "(%llu-%llu)",
91 (unsigned long long)input->inode_bitmap, 85 (unsigned long long)input->inode_bitmap,
92 (unsigned long long)input->inode_table, itend - 1); 86 (unsigned long long)input->inode_table, itend - 1);
93 else if (inside(input->block_bitmap, start, metaend)) 87 else if (inside(input->block_bitmap, start, metaend))
94 ext4_warning(sb, __func__, 88 ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
95 "Block bitmap (%llu) in GDT table"
96 " (%llu-%llu)",
97 (unsigned long long)input->block_bitmap, 89 (unsigned long long)input->block_bitmap,
98 start, metaend - 1); 90 start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend)) 91 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __func__, 92 ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 (unsigned long long)input->inode_bitmap, 93 (unsigned long long)input->inode_bitmap,
104 start, metaend - 1); 94 start, metaend - 1);
105 else if (inside(input->inode_table, start, metaend) || 95 else if (inside(input->inode_table, start, metaend) ||
106 inside(itend - 1, start, metaend)) 96 inside(itend - 1, start, metaend))
107 ext4_warning(sb, __func__, 97 ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
108 "Inode table (%llu-%llu) overlaps" 98 "(%llu-%llu)",
109 "GDT table (%llu-%llu)",
110 (unsigned long long)input->inode_table, 99 (unsigned long long)input->inode_table,
111 itend - 1, start, metaend - 1); 100 itend - 1, start, metaend - 1);
112 else 101 else
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
364 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { 353 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
365 if (le32_to_cpu(*p++) != 354 if (le32_to_cpu(*p++) !=
366 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ 355 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
367 ext4_warning(sb, __func__, 356 ext4_warning(sb, "reserved GDT %llu"
368 "reserved GDT %llu"
369 " missing grp %d (%llu)", 357 " missing grp %d (%llu)",
370 blk, grp, 358 blk, grp,
371 grp * 359 grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
420 */ 408 */
421 if (EXT4_SB(sb)->s_sbh->b_blocknr != 409 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
422 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { 410 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
423 ext4_warning(sb, __func__, 411 ext4_warning(sb, "won't resize using backup superblock at %llu",
424 "won't resize using backup superblock at %llu",
425 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); 412 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
426 return -EPERM; 413 return -EPERM;
427 } 414 }
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
444 431
445 data = (__le32 *)dind->b_data; 432 data = (__le32 *)dind->b_data;
446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 433 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
447 ext4_warning(sb, __func__, 434 ext4_warning(sb, "new group %u GDT block %llu not reserved",
448 "new group %u GDT block %llu not reserved",
449 input->group, gdblock); 435 input->group, gdblock);
450 err = -EINVAL; 436 err = -EINVAL;
451 goto exit_dind; 437 goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
468 GFP_NOFS); 454 GFP_NOFS);
469 if (!n_group_desc) { 455 if (!n_group_desc) {
470 err = -ENOMEM; 456 err = -ENOMEM;
471 ext4_warning(sb, __func__, 457 ext4_warning(sb,
472 "not enough memory for %lu groups", gdb_num + 1); 458 "not enough memory for %lu groups", gdb_num + 1);
473 goto exit_inode; 459 goto exit_inode;
474 } 460 }
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
567 /* Get each reserved primary GDT block and verify it holds backups */ 553 /* Get each reserved primary GDT block and verify it holds backups */
568 for (res = 0; res < reserved_gdb; res++, blk++) { 554 for (res = 0; res < reserved_gdb; res++, blk++) {
569 if (le32_to_cpu(*data) != blk) { 555 if (le32_to_cpu(*data) != blk) {
570 ext4_warning(sb, __func__, 556 ext4_warning(sb, "reserved block %llu"
571 "reserved block %llu"
572 " not at offset %ld", 557 " not at offset %ld",
573 blk, 558 blk,
574 (long)(data - (__le32 *)dind->b_data)); 559 (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
713 */ 698 */
714exit_err: 699exit_err:
715 if (err) { 700 if (err) {
716 ext4_warning(sb, __func__, 701 ext4_warning(sb, "can't update backup for group %u (err %d), "
717 "can't update backup for group %u (err %d), "
718 "forcing fsck on next reboot", group, err); 702 "forcing fsck on next reboot", group, err);
719 sbi->s_mount_state &= ~EXT4_VALID_FS; 703 sbi->s_mount_state &= ~EXT4_VALID_FS;
720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 704 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
753 737
754 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 738 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
755 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { 739 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
756 ext4_warning(sb, __func__, 740 ext4_warning(sb, "Can't resize non-sparse filesystem further");
757 "Can't resize non-sparse filesystem further");
758 return -EPERM; 741 return -EPERM;
759 } 742 }
760 743
761 if (ext4_blocks_count(es) + input->blocks_count < 744 if (ext4_blocks_count(es) + input->blocks_count <
762 ext4_blocks_count(es)) { 745 ext4_blocks_count(es)) {
763 ext4_warning(sb, __func__, "blocks_count overflow"); 746 ext4_warning(sb, "blocks_count overflow");
764 return -EINVAL; 747 return -EINVAL;
765 } 748 }
766 749
767 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 750 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
768 le32_to_cpu(es->s_inodes_count)) { 751 le32_to_cpu(es->s_inodes_count)) {
769 ext4_warning(sb, __func__, "inodes_count overflow"); 752 ext4_warning(sb, "inodes_count overflow");
770 return -EINVAL; 753 return -EINVAL;
771 } 754 }
772 755
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
774 if (!EXT4_HAS_COMPAT_FEATURE(sb, 757 if (!EXT4_HAS_COMPAT_FEATURE(sb,
775 EXT4_FEATURE_COMPAT_RESIZE_INODE) 758 EXT4_FEATURE_COMPAT_RESIZE_INODE)
776 || !le16_to_cpu(es->s_reserved_gdt_blocks)) { 759 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
777 ext4_warning(sb, __func__, 760 ext4_warning(sb,
778 "No reserved GDT blocks, can't resize"); 761 "No reserved GDT blocks, can't resize");
779 return -EPERM; 762 return -EPERM;
780 } 763 }
781 inode = ext4_iget(sb, EXT4_RESIZE_INO); 764 inode = ext4_iget(sb, EXT4_RESIZE_INO);
782 if (IS_ERR(inode)) { 765 if (IS_ERR(inode)) {
783 ext4_warning(sb, __func__, 766 ext4_warning(sb, "Error opening resize inode");
784 "Error opening resize inode");
785 return PTR_ERR(inode); 767 return PTR_ERR(inode);
786 } 768 }
787 } 769 }
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
810 792
811 mutex_lock(&sbi->s_resize_lock); 793 mutex_lock(&sbi->s_resize_lock);
812 if (input->group != sbi->s_groups_count) { 794 if (input->group != sbi->s_groups_count) {
813 ext4_warning(sb, __func__, 795 ext4_warning(sb, "multiple resizers run on filesystem!");
814 "multiple resizers run on filesystem!");
815 err = -EBUSY; 796 err = -EBUSY;
816 goto exit_journal; 797 goto exit_journal;
817 } 798 }
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 978 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 979 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 980 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); 981 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1001 return -EINVAL; 982 return -EINVAL;
1002 } 983 }
1003 984
1004 if (n_blocks_count < o_blocks_count) { 985 if (n_blocks_count < o_blocks_count) {
1005 ext4_warning(sb, __func__, 986 ext4_warning(sb, "can't shrink FS - resize aborted");
1006 "can't shrink FS - resize aborted");
1007 return -EBUSY; 987 return -EBUSY;
1008 } 988 }
1009 989
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1011 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); 991 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1012 992
1013 if (last == 0) { 993 if (last == 0) {
1014 ext4_warning(sb, __func__, 994 ext4_warning(sb, "need to use ext2online to resize further");
1015 "need to use ext2online to resize further");
1016 return -EPERM; 995 return -EPERM;
1017 } 996 }
1018 997
1019 add = EXT4_BLOCKS_PER_GROUP(sb) - last; 998 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
1020 999
1021 if (o_blocks_count + add < o_blocks_count) { 1000 if (o_blocks_count + add < o_blocks_count) {
1022 ext4_warning(sb, __func__, "blocks_count overflow"); 1001 ext4_warning(sb, "blocks_count overflow");
1023 return -EINVAL; 1002 return -EINVAL;
1024 } 1003 }
1025 1004
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1027 add = n_blocks_count - o_blocks_count; 1006 add = n_blocks_count - o_blocks_count;
1028 1007
1029 if (o_blocks_count + add < n_blocks_count) 1008 if (o_blocks_count + add < n_blocks_count)
1030 ext4_warning(sb, __func__, 1009 ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
1031 "will only finish group (%llu"
1032 " blocks, %u new)",
1033 o_blocks_count + add, add); 1010 o_blocks_count + add, add);
1034 1011
1035 /* See if the device is actually as big as what was requested */ 1012 /* See if the device is actually as big as what was requested */
1036 bh = sb_bread(sb, o_blocks_count + add - 1); 1013 bh = sb_bread(sb, o_blocks_count + add - 1);
1037 if (!bh) { 1014 if (!bh) {
1038 ext4_warning(sb, __func__, 1015 ext4_warning(sb, "can't read last block, resize aborted");
1039 "can't read last block, resize aborted");
1040 return -ENOSPC; 1016 return -ENOSPC;
1041 } 1017 }
1042 brelse(bh); 1018 brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1047 handle = ext4_journal_start_sb(sb, 3); 1023 handle = ext4_journal_start_sb(sb, 3);
1048 if (IS_ERR(handle)) { 1024 if (IS_ERR(handle)) {
1049 err = PTR_ERR(handle); 1025 err = PTR_ERR(handle);
1050 ext4_warning(sb, __func__, "error %d on journal start", err); 1026 ext4_warning(sb, "error %d on journal start", err);
1051 goto exit_put; 1027 goto exit_put;
1052 } 1028 }
1053 1029
1054 mutex_lock(&EXT4_SB(sb)->s_resize_lock); 1030 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1055 if (o_blocks_count != ext4_blocks_count(es)) { 1031 if (o_blocks_count != ext4_blocks_count(es)) {
1056 ext4_warning(sb, __func__, 1032 ext4_warning(sb, "multiple resizers run on filesystem!");
1057 "multiple resizers run on filesystem!");
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1033 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_journal_stop(handle); 1034 ext4_journal_stop(handle);
1060 err = -EBUSY; 1035 err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1063 1038
1064 if ((err = ext4_journal_get_write_access(handle, 1039 if ((err = ext4_journal_get_write_access(handle,
1065 EXT4_SB(sb)->s_sbh))) { 1040 EXT4_SB(sb)->s_sbh))) {
1066 ext4_warning(sb, __func__, 1041 ext4_warning(sb, "error %d on journal write access", err);
1067 "error %d on journal write access", err);
1068 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1042 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1069 ext4_journal_stop(handle); 1043 ext4_journal_stop(handle);
1070 goto exit_put; 1044 goto exit_put;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 735c20d5fd56..ad1ee5f21bab 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -333,7 +333,7 @@ static void ext4_handle_error(struct super_block *sb)
333 sb->s_id); 333 sb->s_id);
334} 334}
335 335
336void ext4_error(struct super_block *sb, const char *function, 336void __ext4_error(struct super_block *sb, const char *function,
337 const char *fmt, ...) 337 const char *fmt, ...)
338{ 338{
339 va_list args; 339 va_list args;
@@ -347,6 +347,42 @@ void ext4_error(struct super_block *sb, const char *function,
347 ext4_handle_error(sb); 347 ext4_handle_error(sb);
348} 348}
349 349
350void ext4_error_inode(const char *function, struct inode *inode,
351 const char *fmt, ...)
352{
353 va_list args;
354
355 va_start(args, fmt);
356 printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
357 inode->i_sb->s_id, function, inode->i_ino, current->comm);
358 vprintk(fmt, args);
359 printk("\n");
360 va_end(args);
361
362 ext4_handle_error(inode->i_sb);
363}
364
365void ext4_error_file(const char *function, struct file *file,
366 const char *fmt, ...)
367{
368 va_list args;
369 struct inode *inode = file->f_dentry->d_inode;
370 char pathname[80], *path;
371
372 va_start(args, fmt);
373 path = d_path(&(file->f_path), pathname, sizeof(pathname));
374 if (!path)
375 path = "(unknown)";
376 printk(KERN_CRIT
377 "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
378 inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
379 vprintk(fmt, args);
380 printk("\n");
381 va_end(args);
382
383 ext4_handle_error(inode->i_sb);
384}
385
350static const char *ext4_decode_error(struct super_block *sb, int errno, 386static const char *ext4_decode_error(struct super_block *sb, int errno,
351 char nbuf[16]) 387 char nbuf[16])
352{ 388{
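
One caution on the new ext4_error_file(): d_path() signals failure with an ERR_PTR()-encoded pointer, not NULL, so the `if (!path)` test above can let an error pointer reach printk(). A stricter version of that fragment would be:

	path = d_path(&file->f_path, pathname, sizeof(pathname));
	if (IS_ERR(path))	/* e.g. -ENAMETOOLONG; d_path() does not return NULL */
		path = "(unknown)";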
@@ -450,7 +486,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
450 va_end(args); 486 va_end(args);
451} 487}
452 488
453void ext4_warning(struct super_block *sb, const char *function, 489void __ext4_warning(struct super_block *sb, const char *function,
454 const char *fmt, ...) 490 const char *fmt, ...)
455{ 491{
456 va_list args; 492 va_list args;
@@ -507,7 +543,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
507 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) 543 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
508 return; 544 return;
509 545
510 ext4_warning(sb, __func__, 546 ext4_warning(sb,
511 "updating to rev %d because of new feature flag, " 547 "updating to rev %d because of new feature flag, "
512 "running e2fsck is recommended", 548 "running e2fsck is recommended",
513 EXT4_DYNAMIC_REV); 549 EXT4_DYNAMIC_REV);
@@ -708,7 +744,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
708#ifdef CONFIG_QUOTA 744#ifdef CONFIG_QUOTA
709 ei->i_reserved_quota = 0; 745 ei->i_reserved_quota = 0;
710#endif 746#endif
711 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 747 INIT_LIST_HEAD(&ei->i_completed_io_list);
748 spin_lock_init(&ei->i_completed_io_lock);
712 ei->cur_aio_dio = NULL; 749 ei->cur_aio_dio = NULL;
713 ei->i_sync_tid = 0; 750 ei->i_sync_tid = 0;
714 ei->i_datasync_tid = 0; 751 ei->i_datasync_tid = 0;
@@ -796,10 +833,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
796 if (sbi->s_qf_names[GRPQUOTA]) 833 if (sbi->s_qf_names[GRPQUOTA])
797 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 834 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
798 835
799 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) 836 if (test_opt(sb, USRQUOTA))
800 seq_puts(seq, ",usrquota"); 837 seq_puts(seq, ",usrquota");
801 838
802 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) 839 if (test_opt(sb, GRPQUOTA))
803 seq_puts(seq, ",grpquota"); 840 seq_puts(seq, ",grpquota");
804#endif 841#endif
805} 842}
@@ -926,6 +963,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
926 if (test_opt(sb, NOLOAD)) 963 if (test_opt(sb, NOLOAD))
927 seq_puts(seq, ",norecovery"); 964 seq_puts(seq, ",norecovery");
928 965
966 if (test_opt(sb, DIOREAD_NOLOCK))
967 seq_puts(seq, ",dioread_nolock");
968
929 ext4_show_quota_options(seq, sb); 969 ext4_show_quota_options(seq, sb);
930 970
931 return 0; 971 return 0;
@@ -1109,6 +1149,7 @@ enum {
1109 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1149 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1110 Opt_block_validity, Opt_noblock_validity, 1150 Opt_block_validity, Opt_noblock_validity,
1111 Opt_inode_readahead_blks, Opt_journal_ioprio, 1151 Opt_inode_readahead_blks, Opt_journal_ioprio,
1152 Opt_dioread_nolock, Opt_dioread_lock,
1112 Opt_discard, Opt_nodiscard, 1153 Opt_discard, Opt_nodiscard,
1113}; 1154};
1114 1155
@@ -1176,6 +1217,8 @@ static const match_table_t tokens = {
1176 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1217 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1177 {Opt_auto_da_alloc, "auto_da_alloc"}, 1218 {Opt_auto_da_alloc, "auto_da_alloc"},
1178 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1219 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1220 {Opt_dioread_nolock, "dioread_nolock"},
1221 {Opt_dioread_lock, "dioread_lock"},
1179 {Opt_discard, "discard"}, 1222 {Opt_discard, "discard"},
1180 {Opt_nodiscard, "nodiscard"}, 1223 {Opt_nodiscard, "nodiscard"},
1181 {Opt_err, NULL}, 1224 {Opt_err, NULL},
@@ -1205,6 +1248,66 @@ static ext4_fsblk_t get_sb_block(void **data)
1205} 1248}
1206 1249
1207#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) 1250#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1251static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
1252 "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1253
1254#ifdef CONFIG_QUOTA
1255static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1256{
1257 struct ext4_sb_info *sbi = EXT4_SB(sb);
1258 char *qname;
1259
1260 if (sb_any_quota_loaded(sb) &&
1261 !sbi->s_qf_names[qtype]) {
1262 ext4_msg(sb, KERN_ERR,
1263 "Cannot change journaled "
1264 "quota options when quota turned on");
1265 return 0;
1266 }
1267 qname = match_strdup(args);
1268 if (!qname) {
1269 ext4_msg(sb, KERN_ERR,
1270 "Not enough memory for storing quotafile name");
1271 return 0;
1272 }
1273 if (sbi->s_qf_names[qtype] &&
1274 strcmp(sbi->s_qf_names[qtype], qname)) {
1275 ext4_msg(sb, KERN_ERR,
1276 "%s quota file already specified", QTYPE2NAME(qtype));
1277 kfree(qname);
1278 return 0;
1279 }
1280 sbi->s_qf_names[qtype] = qname;
1281 if (strchr(sbi->s_qf_names[qtype], '/')) {
1282 ext4_msg(sb, KERN_ERR,
1283 "quotafile must be on filesystem root");
1284 kfree(sbi->s_qf_names[qtype]);
1285 sbi->s_qf_names[qtype] = NULL;
1286 return 0;
1287 }
1288 set_opt(sbi->s_mount_opt, QUOTA);
1289 return 1;
1290}
1291
1292static int clear_qf_name(struct super_block *sb, int qtype)
1293{
1294
1295 struct ext4_sb_info *sbi = EXT4_SB(sb);
1296
1297 if (sb_any_quota_loaded(sb) &&
1298 sbi->s_qf_names[qtype]) {
1299 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1300 " when quota turned on");
1301 return 0;
1302 }
1303 /*
1304 * The space will be released later when all options are confirmed
1305 * to be correct
1306 */
1307 sbi->s_qf_names[qtype] = NULL;
1308 return 1;
1309}
1310#endif
1208 1311
1209static int parse_options(char *options, struct super_block *sb, 1312static int parse_options(char *options, struct super_block *sb,
1210 unsigned long *journal_devnum, 1313 unsigned long *journal_devnum,
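
set_qf_name() and clear_qf_name() absorb the set_qf_name:/clear_qf_name: goto targets that previously lived inside the option-parsing switch. Both keep parse_options()' existing convention of returning 1 on success and 0 on failure, so every call site reduces to the same three lines, as the next hunk shows:

	case Opt_usrjquota:
		if (!set_qf_name(sb, USRQUOTA, &args[0]))
			return 0;	/* abort mount-option parsing */
		break;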
@@ -1217,8 +1320,7 @@ static int parse_options(char *options, struct super_block *sb,
1217 int data_opt = 0; 1320 int data_opt = 0;
1218 int option; 1321 int option;
1219#ifdef CONFIG_QUOTA 1322#ifdef CONFIG_QUOTA
1220 int qtype, qfmt; 1323 int qfmt;
1221 char *qname;
1222#endif 1324#endif
1223 1325
1224 if (!options) 1326 if (!options)
@@ -1229,19 +1331,31 @@ static int parse_options(char *options, struct super_block *sb,
1229 if (!*p) 1331 if (!*p)
1230 continue; 1332 continue;
1231 1333
1334 /*
1335 * Initialize args struct so we know whether arg was
1336 * found; some options take optional arguments.
1337 */
1338 args[0].to = args[0].from = 0;
1232 token = match_token(p, tokens, args); 1339 token = match_token(p, tokens, args);
1233 switch (token) { 1340 switch (token) {
1234 case Opt_bsd_df: 1341 case Opt_bsd_df:
1342 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1235 clear_opt(sbi->s_mount_opt, MINIX_DF); 1343 clear_opt(sbi->s_mount_opt, MINIX_DF);
1236 break; 1344 break;
1237 case Opt_minix_df: 1345 case Opt_minix_df:
1346 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1238 set_opt(sbi->s_mount_opt, MINIX_DF); 1347 set_opt(sbi->s_mount_opt, MINIX_DF);
1348
1239 break; 1349 break;
1240 case Opt_grpid: 1350 case Opt_grpid:
1351 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1241 set_opt(sbi->s_mount_opt, GRPID); 1352 set_opt(sbi->s_mount_opt, GRPID);
1353
1242 break; 1354 break;
1243 case Opt_nogrpid: 1355 case Opt_nogrpid:
1356 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1244 clear_opt(sbi->s_mount_opt, GRPID); 1357 clear_opt(sbi->s_mount_opt, GRPID);
1358
1245 break; 1359 break;
1246 case Opt_resuid: 1360 case Opt_resuid:
1247 if (match_int(&args[0], &option)) 1361 if (match_int(&args[0], &option))
@@ -1378,14 +1492,13 @@ static int parse_options(char *options, struct super_block *sb,
1378 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1492 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1379 datacheck: 1493 datacheck:
1380 if (is_remount) { 1494 if (is_remount) {
1381 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) 1495 if (test_opt(sb, DATA_FLAGS) != data_opt) {
1382 != data_opt) {
1383 ext4_msg(sb, KERN_ERR, 1496 ext4_msg(sb, KERN_ERR,
1384 "Cannot change data mode on remount"); 1497 "Cannot change data mode on remount");
1385 return 0; 1498 return 0;
1386 } 1499 }
1387 } else { 1500 } else {
1388 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; 1501 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1389 sbi->s_mount_opt |= data_opt; 1502 sbi->s_mount_opt |= data_opt;
1390 } 1503 }
1391 break; 1504 break;
@@ -1397,63 +1510,22 @@ static int parse_options(char *options, struct super_block *sb,
1397 break; 1510 break;
1398#ifdef CONFIG_QUOTA 1511#ifdef CONFIG_QUOTA
1399 case Opt_usrjquota: 1512 case Opt_usrjquota:
1400 qtype = USRQUOTA; 1513 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1401 goto set_qf_name;
1402 case Opt_grpjquota:
1403 qtype = GRPQUOTA;
1404set_qf_name:
1405 if (sb_any_quota_loaded(sb) &&
1406 !sbi->s_qf_names[qtype]) {
1407 ext4_msg(sb, KERN_ERR,
1408 "Cannot change journaled "
1409 "quota options when quota turned on");
1410 return 0; 1514 return 0;
1411 } 1515 break;
1412 qname = match_strdup(&args[0]); 1516 case Opt_grpjquota:
1413 if (!qname) { 1517 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1414 ext4_msg(sb, KERN_ERR,
1415 "Not enough memory for "
1416 "storing quotafile name");
1417 return 0;
1418 }
1419 if (sbi->s_qf_names[qtype] &&
1420 strcmp(sbi->s_qf_names[qtype], qname)) {
1421 ext4_msg(sb, KERN_ERR,
1422 "%s quota file already "
1423 "specified", QTYPE2NAME(qtype));
1424 kfree(qname);
1425 return 0;
1426 }
1427 sbi->s_qf_names[qtype] = qname;
1428 if (strchr(sbi->s_qf_names[qtype], '/')) {
1429 ext4_msg(sb, KERN_ERR,
1430 "quotafile must be on "
1431 "filesystem root");
1432 kfree(sbi->s_qf_names[qtype]);
1433 sbi->s_qf_names[qtype] = NULL;
1434 return 0; 1518 return 0;
1435 }
1436 set_opt(sbi->s_mount_opt, QUOTA);
1437 break; 1519 break;
1438 case Opt_offusrjquota: 1520 case Opt_offusrjquota:
1439 qtype = USRQUOTA; 1521 if (!clear_qf_name(sb, USRQUOTA))
1440 goto clear_qf_name; 1522 return 0;
1523 break;
1441 case Opt_offgrpjquota: 1524 case Opt_offgrpjquota:
1442 qtype = GRPQUOTA; 1525 if (!clear_qf_name(sb, GRPQUOTA))
1443clear_qf_name:
1444 if (sb_any_quota_loaded(sb) &&
1445 sbi->s_qf_names[qtype]) {
1446 ext4_msg(sb, KERN_ERR, "Cannot change "
1447 "journaled quota options when "
1448 "quota turned on");
1449 return 0; 1526 return 0;
1450 }
1451 /*
1452 * The space will be released later when all options
1453 * are confirmed to be correct
1454 */
1455 sbi->s_qf_names[qtype] = NULL;
1456 break; 1527 break;
1528
1457 case Opt_jqfmt_vfsold: 1529 case Opt_jqfmt_vfsold:
1458 qfmt = QFMT_VFS_OLD; 1530 qfmt = QFMT_VFS_OLD;
1459 goto set_qf_format; 1531 goto set_qf_format;
@@ -1518,10 +1590,11 @@ set_qf_format:
1518 clear_opt(sbi->s_mount_opt, BARRIER); 1590 clear_opt(sbi->s_mount_opt, BARRIER);
1519 break; 1591 break;
1520 case Opt_barrier: 1592 case Opt_barrier:
1521 if (match_int(&args[0], &option)) { 1593 if (args[0].from) {
1522 set_opt(sbi->s_mount_opt, BARRIER); 1594 if (match_int(&args[0], &option))
1523 break; 1595 return 0;
1524 } 1596 } else
1597 option = 1; /* No argument, default to 1 */
1525 if (option) 1598 if (option)
1526 set_opt(sbi->s_mount_opt, BARRIER); 1599 set_opt(sbi->s_mount_opt, BARRIER);
1527 else 1600 else
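
Zeroing args[0] before match_token() is what makes the optional argument detectable: args[0].from remains NULL when the user wrote a bare "barrier" with no "=N". A hedged sketch of the resulting pattern, using a hypothetical Opt_example token and EXAMPLE flag:

	case Opt_example:
		if (args[0].from) {		/* "example=N" was given */
			if (match_int(&args[0], &option))
				return 0;	/* malformed number: fail the mount */
		} else
			option = 1;		/* bare "example" defaults to on */
		if (option)
			set_opt(sbi->s_mount_opt, EXAMPLE);
		else
			clear_opt(sbi->s_mount_opt, EXAMPLE);
		break;

The old behavior silently treated a malformed integer as "enable"; the new code rejects it, which is the actual fix here.
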
@@ -1594,10 +1667,11 @@ set_qf_format:
1594 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1667 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
1595 break; 1668 break;
1596 case Opt_auto_da_alloc: 1669 case Opt_auto_da_alloc:
1597 if (match_int(&args[0], &option)) { 1670 if (args[0].from) {
1598 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1671 if (match_int(&args[0], &option))
1599 break; 1672 return 0;
1600 } 1673 } else
1674 option = 1; /* No argument, default to 1 */
1601 if (option) 1675 if (option)
1602 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1676 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1603 else 1677 else
@@ -1609,6 +1683,12 @@ set_qf_format:
1609 case Opt_nodiscard: 1683 case Opt_nodiscard:
1610 clear_opt(sbi->s_mount_opt, DISCARD); 1684 clear_opt(sbi->s_mount_opt, DISCARD);
1611 break; 1685 break;
1686 case Opt_dioread_nolock:
1687 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1688 break;
1689 case Opt_dioread_lock:
1690 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1691 break;
1612 default: 1692 default:
1613 ext4_msg(sb, KERN_ERR, 1693 ext4_msg(sb, KERN_ERR,
1614 "Unrecognized mount option \"%s\" " 1694 "Unrecognized mount option \"%s\" "
@@ -1618,18 +1698,13 @@ set_qf_format:
1618 } 1698 }
1619#ifdef CONFIG_QUOTA 1699#ifdef CONFIG_QUOTA
1620 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1700 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1621 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && 1701 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1622 sbi->s_qf_names[USRQUOTA])
1623 clear_opt(sbi->s_mount_opt, USRQUOTA); 1702 clear_opt(sbi->s_mount_opt, USRQUOTA);
1624 1703
1625 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && 1704 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1626 sbi->s_qf_names[GRPQUOTA])
1627 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1705 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1628 1706
1629 if ((sbi->s_qf_names[USRQUOTA] && 1707 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1630 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1631 (sbi->s_qf_names[GRPQUOTA] &&
1632 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1633 ext4_msg(sb, KERN_ERR, "old and new quota " 1708 ext4_msg(sb, KERN_ERR, "old and new quota "
1634 "format mixing"); 1709 "format mixing");
1635 return 0; 1710 return 0;
@@ -2432,8 +2507,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2432 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2507 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
2433 if (def_mount_opts & EXT4_DEFM_DEBUG) 2508 if (def_mount_opts & EXT4_DEFM_DEBUG)
2434 set_opt(sbi->s_mount_opt, DEBUG); 2509 set_opt(sbi->s_mount_opt, DEBUG);
2435 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) 2510 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2511 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2512 "2.6.38");
2436 set_opt(sbi->s_mount_opt, GRPID); 2513 set_opt(sbi->s_mount_opt, GRPID);
2514 }
2437 if (def_mount_opts & EXT4_DEFM_UID16) 2515 if (def_mount_opts & EXT4_DEFM_UID16)
2438 set_opt(sbi->s_mount_opt, NO_UID32); 2516 set_opt(sbi->s_mount_opt, NO_UID32);
2439#ifdef CONFIG_EXT4_FS_XATTR 2517#ifdef CONFIG_EXT4_FS_XATTR
@@ -2445,11 +2523,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2445 set_opt(sbi->s_mount_opt, POSIX_ACL); 2523 set_opt(sbi->s_mount_opt, POSIX_ACL);
2446#endif 2524#endif
2447 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 2525 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2448 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; 2526 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
2449 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 2527 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2450 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; 2528 set_opt(sbi->s_mount_opt, ORDERED_DATA);
2451 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 2529 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2452 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; 2530 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2453 2531
2454 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 2532 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2455 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 2533 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2477,7 +2555,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2477 goto failed_mount; 2555 goto failed_mount;
2478 2556
2479 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2557 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2480 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 2558 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2481 2559
2482 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && 2560 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
2483 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || 2561 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2766,7 +2844,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2766 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2844 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2767 ext4_msg(sb, KERN_ERR, "required journal recovery " 2845 ext4_msg(sb, KERN_ERR, "required journal recovery "
2768 "suppressed and not mounted read-only"); 2846 "suppressed and not mounted read-only");
2769 goto failed_mount4; 2847 goto failed_mount_wq;
2770 } else { 2848 } else {
2771 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2849 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2772 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 2850 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2779,7 +2857,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2779 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2857 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2780 JBD2_FEATURE_INCOMPAT_64BIT)) { 2858 JBD2_FEATURE_INCOMPAT_64BIT)) {
2781 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 2859 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2782 goto failed_mount4; 2860 goto failed_mount_wq;
2783 } 2861 }
2784 2862
2785 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2863 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2818,7 +2896,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2818 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2896 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2819 ext4_msg(sb, KERN_ERR, "Journal does not support " 2897 ext4_msg(sb, KERN_ERR, "Journal does not support "
2820 "requested data journaling mode"); 2898 "requested data journaling mode");
2821 goto failed_mount4; 2899 goto failed_mount_wq;
2822 } 2900 }
2823 default: 2901 default:
2824 break; 2902 break;
@@ -2826,13 +2904,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2826 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2904 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2827 2905
2828no_journal: 2906no_journal:
2829
2830 if (test_opt(sb, NOBH)) { 2907 if (test_opt(sb, NOBH)) {
2831 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2908 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2832 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2909 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2833 "its supported only with writeback mode"); 2910 "its supported only with writeback mode");
2834 clear_opt(sbi->s_mount_opt, NOBH); 2911 clear_opt(sbi->s_mount_opt, NOBH);
2835 } 2912 }
2913 if (test_opt(sb, DIOREAD_NOLOCK)) {
2914 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
2915 "not supported with nobh mode");
2916 goto failed_mount_wq;
2917 }
2836 } 2918 }
2837 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 2919 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2838 if (!EXT4_SB(sb)->dio_unwritten_wq) { 2920 if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2897,6 +2979,18 @@ no_journal:
2897 "requested data journaling mode"); 2979 "requested data journaling mode");
2898 clear_opt(sbi->s_mount_opt, DELALLOC); 2980 clear_opt(sbi->s_mount_opt, DELALLOC);
2899 } 2981 }
2982 if (test_opt(sb, DIOREAD_NOLOCK)) {
2983 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2984 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2985 "option - requested data journaling mode");
2986 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2987 }
2988 if (sb->s_blocksize < PAGE_SIZE) {
2989 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2990 "option - block size is too small");
2991 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2992 }
2993 }
2900 2994
2901 err = ext4_setup_system_zone(sb); 2995 err = ext4_setup_system_zone(sb);
2902 if (err) { 2996 if (err) {
@@ -3360,10 +3454,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
3360 char nbuf[16]; 3454 char nbuf[16];
3361 3455
3362 errstr = ext4_decode_error(sb, j_errno, nbuf); 3456 errstr = ext4_decode_error(sb, j_errno, nbuf);
3363 ext4_warning(sb, __func__, "Filesystem error recorded " 3457 ext4_warning(sb, "Filesystem error recorded "
3364 "from previous mount: %s", errstr); 3458 "from previous mount: %s", errstr);
3365 ext4_warning(sb, __func__, "Marking fs in need of " 3459 ext4_warning(sb, "Marking fs in need of filesystem check.");
3366 "filesystem check.");
3367 3460
3368 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 3461 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
3369 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 3462 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3514,7 +3607,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3514 ext4_abort(sb, __func__, "Abort forced by user"); 3607 ext4_abort(sb, __func__, "Abort forced by user");
3515 3608
3516 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3609 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3517 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 3610 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3518 3611
3519 es = sbi->s_es; 3612 es = sbi->s_es;
3520 3613
@@ -3917,9 +4010,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3917 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4010 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
3918 int err = 0; 4011 int err = 0;
3919 int offset = off & (sb->s_blocksize - 1); 4012 int offset = off & (sb->s_blocksize - 1);
3920 int tocopy;
3921 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; 4013 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
3922 size_t towrite = len;
3923 struct buffer_head *bh; 4014 struct buffer_head *bh;
3924 handle_t *handle = journal_current_handle(); 4015 handle_t *handle = journal_current_handle();
3925 4016
@@ -3929,52 +4020,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3929 (unsigned long long)off, (unsigned long long)len); 4020 (unsigned long long)off, (unsigned long long)len);
3930 return -EIO; 4021 return -EIO;
3931 } 4022 }
4023 /*
 4024	 * Since we account for only one data block in the transaction
 4025	 * credits, it is impossible to cross a block boundary.
4026 */
4027 if (sb->s_blocksize - offset < len) {
4028 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
4029 " cancelled because not block aligned",
4030 (unsigned long long)off, (unsigned long long)len);
4031 return -EIO;
4032 }
4033
3932 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 4034 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
3933 while (towrite > 0) { 4035 bh = ext4_bread(handle, inode, blk, 1, &err);
3934 tocopy = sb->s_blocksize - offset < towrite ? 4036 if (!bh)
3935 sb->s_blocksize - offset : towrite; 4037 goto out;
3936 bh = ext4_bread(handle, inode, blk, 1, &err); 4038 if (journal_quota) {
3937 if (!bh) 4039 err = ext4_journal_get_write_access(handle, bh);
4040 if (err) {
4041 brelse(bh);
3938 goto out; 4042 goto out;
3939 if (journal_quota) {
3940 err = ext4_journal_get_write_access(handle, bh);
3941 if (err) {
3942 brelse(bh);
3943 goto out;
3944 }
3945 } 4043 }
3946 lock_buffer(bh);
3947 memcpy(bh->b_data+offset, data, tocopy);
3948 flush_dcache_page(bh->b_page);
3949 unlock_buffer(bh);
3950 if (journal_quota)
3951 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3952 else {
3953 /* Always do at least ordered writes for quotas */
3954 err = ext4_jbd2_file_inode(handle, inode);
3955 mark_buffer_dirty(bh);
3956 }
3957 brelse(bh);
3958 if (err)
3959 goto out;
3960 offset = 0;
3961 towrite -= tocopy;
3962 data += tocopy;
3963 blk++;
3964 } 4044 }
4045 lock_buffer(bh);
4046 memcpy(bh->b_data+offset, data, len);
4047 flush_dcache_page(bh->b_page);
4048 unlock_buffer(bh);
4049 if (journal_quota)
4050 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4051 else {
4052 /* Always do at least ordered writes for quotas */
4053 err = ext4_jbd2_file_inode(handle, inode);
4054 mark_buffer_dirty(bh);
4055 }
4056 brelse(bh);
3965out: 4057out:
3966 if (len == towrite) { 4058 if (err) {
3967 mutex_unlock(&inode->i_mutex); 4059 mutex_unlock(&inode->i_mutex);
3968 return err; 4060 return err;
3969 } 4061 }
3970 if (inode->i_size < off+len-towrite) { 4062 if (inode->i_size < off + len) {
3971 i_size_write(inode, off+len-towrite); 4063 i_size_write(inode, off + len);
3972 EXT4_I(inode)->i_disksize = inode->i_size; 4064 EXT4_I(inode)->i_disksize = inode->i_size;
3973 } 4065 }
3974 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 4066 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3975 ext4_mark_inode_dirty(handle, inode); 4067 ext4_mark_inode_dirty(handle, inode);
3976 mutex_unlock(&inode->i_mutex); 4068 mutex_unlock(&inode->i_mutex);
3977 return len - towrite; 4069 return len;
3978} 4070}
3979 4071
3980#endif 4072#endif
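
The rewritten ext4_quota_write() reserves journal credits for exactly one data block, so instead of looping over blocks it now rejects any write that would cross a block boundary up front. A small illustration of the invariant, with a hypothetical helper (not part of the patch), assuming a power-of-two block size:

	/* Does a write of len bytes at offset off stay inside one block? */
	static int quota_write_fits(loff_t off, size_t len, unsigned blocksize)
	{
		unsigned offset = off & (blocksize - 1);	/* offset within the block */

		return blocksize - offset >= len;
	}

For a 4k block size, a 48-byte dquot record starting at offset 4080 fails the check (4096 - 4080 = 16 < 48), and the function returns -EIO rather than splitting the write.
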
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f3a2f7ed45aa..efc16a4b7ceb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
227 ea_bdebug(bh, "b_count=%d, refcount=%d", 227 ea_bdebug(bh, "b_count=%d, refcount=%d",
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: ext4_error(inode->i_sb, __func__, 230bad_block:
231 ext4_error(inode->i_sb,
231 "inode %lu: bad block %llu", inode->i_ino, 232 "inode %lu: bad block %llu", inode->i_ino,
232 EXT4_I(inode)->i_file_acl); 233 EXT4_I(inode)->i_file_acl);
233 error = -EIO; 234 error = -EIO;
@@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
267 void *end; 268 void *end;
268 int error; 269 int error;
269 270
270 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 271 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
271 return -ENODATA; 272 return -ENODATA;
272 error = ext4_get_inode_loc(inode, &iloc); 273 error = ext4_get_inode_loc(inode, &iloc);
273 if (error) 274 if (error)
@@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
371 ea_bdebug(bh, "b_count=%d, refcount=%d", 372 ea_bdebug(bh, "b_count=%d, refcount=%d",
372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
373 if (ext4_xattr_check_block(bh)) { 374 if (ext4_xattr_check_block(bh)) {
374 ext4_error(inode->i_sb, __func__, 375 ext4_error(inode->i_sb,
375 "inode %lu: bad block %llu", inode->i_ino, 376 "inode %lu: bad block %llu", inode->i_ino,
376 EXT4_I(inode)->i_file_acl); 377 EXT4_I(inode)->i_file_acl);
377 error = -EIO; 378 error = -EIO;
@@ -396,7 +397,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
396 void *end; 397 void *end;
397 int error; 398 int error;
398 399
399 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 400 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
400 return 0; 401 return 0;
401 error = ext4_get_inode_loc(inode, &iloc); 402 error = ext4_get_inode_loc(inode, &iloc);
402 if (error) 403 if (error)
@@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
665 atomic_read(&(bs->bh->b_count)), 666 atomic_read(&(bs->bh->b_count)),
666 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 667 le32_to_cpu(BHDR(bs->bh)->h_refcount));
667 if (ext4_xattr_check_block(bs->bh)) { 668 if (ext4_xattr_check_block(bs->bh)) {
668 ext4_error(sb, __func__, 669 ext4_error(sb, "inode %lu: bad block %llu",
669 "inode %lu: bad block %llu", inode->i_ino, 670 inode->i_ino, EXT4_I(inode)->i_file_acl);
670 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 671 error = -EIO;
672 goto cleanup; 672 goto cleanup;
673 } 673 }
@@ -880,9 +880,8 @@ cleanup_dquot:
880 goto cleanup; 880 goto cleanup;
881 881
882bad_block: 882bad_block:
883 ext4_error(inode->i_sb, __func__, 883 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
884 "inode %lu: bad block %llu", inode->i_ino, 884 inode->i_ino, EXT4_I(inode)->i_file_acl);
885 EXT4_I(inode)->i_file_acl);
886 goto cleanup; 885 goto cleanup;
887 886
888#undef header 887#undef header
@@ -908,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
908 is->s.base = is->s.first = IFIRST(header); 907 is->s.base = is->s.first = IFIRST(header);
909 is->s.here = is->s.first; 908 is->s.here = is->s.first;
910 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 909 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
911 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 910 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
912 error = ext4_xattr_check_names(IFIRST(header), is->s.end); 911 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
913 if (error) 912 if (error)
914 return error; 913 return error;
@@ -940,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
940 header = IHDR(inode, ext4_raw_inode(&is->iloc)); 939 header = IHDR(inode, ext4_raw_inode(&is->iloc));
941 if (!IS_LAST_ENTRY(s->first)) { 940 if (!IS_LAST_ENTRY(s->first)) {
942 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); 941 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
943 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; 942 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
944 } else { 943 } else {
945 header->h_magic = cpu_to_le32(0); 944 header->h_magic = cpu_to_le32(0);
946 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; 945 ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
947 } 946 }
948 return 0; 947 return 0;
949} 948}
@@ -986,8 +985,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
986 if (strlen(name) > 255) 985 if (strlen(name) > 255)
987 return -ERANGE; 986 return -ERANGE;
988 down_write(&EXT4_I(inode)->xattr_sem); 987 down_write(&EXT4_I(inode)->xattr_sem);
989 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; 988 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
990 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 989 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
991 990
992 error = ext4_get_inode_loc(inode, &is.iloc); 991 error = ext4_get_inode_loc(inode, &is.iloc);
993 if (error) 992 if (error)
@@ -997,10 +996,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
997 if (error) 996 if (error)
998 goto cleanup; 997 goto cleanup;
999 998
1000 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 999 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
1001 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 1000 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
1002 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 1001 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
1003 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; 1002 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
1004 } 1003 }
1005 1004
1006 error = ext4_xattr_ibody_find(inode, &i, &is); 1005 error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1052 ext4_xattr_update_super_block(handle, inode->i_sb); 1051 ext4_xattr_update_super_block(handle, inode->i_sb);
1053 inode->i_ctime = ext4_current_time(inode); 1052 inode->i_ctime = ext4_current_time(inode);
1054 if (!value) 1053 if (!value)
1055 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1054 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1056 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 1055 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1057 /* 1056 /*
1058 * The bh is consumed by ext4_mark_iloc_dirty, even with 1057 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1066,7 @@ cleanup:
1067 brelse(is.iloc.bh); 1066 brelse(is.iloc.bh);
1068 brelse(bs.bh); 1067 brelse(bs.bh);
1069 if (no_expand == 0) 1068 if (no_expand == 0)
1070 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1069 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1071 up_write(&EXT4_I(inode)->xattr_sem); 1070 up_write(&EXT4_I(inode)->xattr_sem);
1072 return error; 1071 return error;
1073} 1072}
@@ -1195,9 +1194,8 @@ retry:
1195 if (!bh) 1194 if (!bh)
1196 goto cleanup; 1195 goto cleanup;
1197 if (ext4_xattr_check_block(bh)) { 1196 if (ext4_xattr_check_block(bh)) {
1198 ext4_error(inode->i_sb, __func__, 1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1199 "inode %lu: bad block %llu", inode->i_ino, 1198 inode->i_ino, EXT4_I(inode)->i_file_acl);
1200 EXT4_I(inode)->i_file_acl);
1201 error = -EIO; 1199 error = -EIO;
1202 goto cleanup; 1200 goto cleanup;
1203 } 1201 }
@@ -1302,6 +1300,8 @@ retry:
1302 1300
1303 /* Remove the chosen entry from the inode */ 1301 /* Remove the chosen entry from the inode */
1304 error = ext4_xattr_ibody_set(handle, inode, &i, is); 1302 error = ext4_xattr_ibody_set(handle, inode, &i, is);
1303 if (error)
1304 goto cleanup;
1305 1305
1306 entry = IFIRST(header); 1306 entry = IFIRST(header);
1307 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) 1307 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1372,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1372 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1374 if (!bh) {
1375 ext4_error(inode->i_sb, __func__, 1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error",
1376 "inode %lu: block %llu read error", inode->i_ino, 1376 inode->i_ino, EXT4_I(inode)->i_file_acl);
1377 EXT4_I(inode)->i_file_acl);
1378 goto cleanup; 1377 goto cleanup;
1379 } 1378 }
1380 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1381 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1382 ext4_error(inode->i_sb, __func__, 1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1383 "inode %lu: bad block %llu", inode->i_ino, 1382 inode->i_ino, EXT4_I(inode)->i_file_acl);
1384 EXT4_I(inode)->i_file_acl);
1385 goto cleanup; 1383 goto cleanup;
1386 } 1384 }
1387 ext4_xattr_release_block(handle, inode, bh); 1385 ext4_xattr_release_block(handle, inode, bh);
@@ -1506,7 +1504,7 @@ again:
1506 } 1504 }
1507 bh = sb_bread(inode->i_sb, ce->e_block); 1505 bh = sb_bread(inode->i_sb, ce->e_block);
1508 if (!bh) { 1506 if (!bh) {
1509 ext4_error(inode->i_sb, __func__, 1507 ext4_error(inode->i_sb,
1510 "inode %lu: block %lu read error", 1508 "inode %lu: block %lu read error",
1511 inode->i_ino, (unsigned long) ce->e_block); 1509 inode->i_ino, (unsigned long) ce->e_block);
1512 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 886849370950..30beb11ef928 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
507 if (blocknr < journal->j_tail) 507 if (blocknr < journal->j_tail)
508 freed = freed + journal->j_last - journal->j_first; 508 freed = freed + journal->j_last - journal->j_first;
509 509
510 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
510 jbd_debug(1, 511 jbd_debug(1,
511 "Cleaning journal tail from %d to %d (offset %lu), " 512 "Cleaning journal tail from %d to %d (offset %lu), "
512 "freeing %lu\n", 513 "freeing %lu\n",
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1bc74b6f26d2..671da7fb7ffd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -883,8 +883,7 @@ restart_loop:
883 spin_unlock(&journal->j_list_lock); 883 spin_unlock(&journal->j_list_lock);
884 bh = jh2bh(jh); 884 bh = jh2bh(jh);
885 jbd_lock_bh_state(bh); 885 jbd_lock_bh_state(bh);
886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
887 jh->b_transaction == journal->j_running_transaction);
888 887
889 /* 888 /*
890 * If there is undo-protected committed data against 889 * If there is undo-protected committed data against
@@ -930,12 +929,12 @@ restart_loop:
930 /* A buffer which has been freed while still being 929 /* A buffer which has been freed while still being
931 * journaled by a previous transaction may end up still 930 * journaled by a previous transaction may end up still
932 * being dirty here, but we want to avoid writing back 931 * being dirty here, but we want to avoid writing back
933 * that buffer in the future now that the last use has 932 * that buffer in the future after the "add to orphan"
934 * been committed. That's not only a performance gain, 933 * operation has been committed. That's not only a performance
935 * it also stops aliasing problems if the buffer is left 934 * gain, it also stops aliasing problems if the buffer is
936 * behind for writeback and gets reallocated for another 935 * left behind for writeback and gets reallocated for another
937 * use in a different page. */ 936 * use in a different page. */
938 if (buffer_freed(bh)) { 937 if (buffer_freed(bh) && !jh->b_next_transaction) {
939 clear_buffer_freed(bh); 938 clear_buffer_freed(bh);
940 clear_buffer_jbddirty(bh); 939 clear_buffer_jbddirty(bh);
941 } 940 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ac0d027595d0..c03d4dce4d76 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h> 40#include <linux/math64.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h>
43#include <linux/vmalloc.h>
42 44
43#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
44#include <trace/events/jbd2.h> 46#include <trace/events/jbd2.h>
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
93 95
94static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 96static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
95static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
98static int jbd2_journal_create_slab(size_t slab_size);
96 99
97/* 100/*
98 * Helper function used to manage commit timeouts 101 * Helper function used to manage commit timeouts
@@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal)
1248 } 1251 }
1249 } 1252 }
1250 1253
1254 /*
1255 * Create a slab for this blocksize
1256 */
1257 err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1258 if (err)
1259 return err;
1260
1251 /* Let the recovery code check whether it needs to recover any 1261 /* Let the recovery code check whether it needs to recover any
1252 * data from the journal. */ 1262 * data from the journal. */
1253 if (jbd2_journal_recover(journal)) 1263 if (jbd2_journal_recover(journal))
@@ -1807,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal)
1807} 1817}
1808 1818
1809/* 1819/*
1820 * JBD memory management
1821 *
1822 * These functions are used to allocate block-sized chunks of memory
1823 * used for making copies of buffer_head data. Very often it will be
1824 * page-sized chunks of data, but sometimes it will be in
1825 * sub-page-size chunks. (For example, 16k pages on Power systems
1826 * with a 4k block file system.) For blocks smaller than a page, we
1827 * use a SLAB allocator. There are slab caches for each block size,
1828 * which are allocated at mount time, if necessary, and we only free
1829 * (all of) the slab caches when/if the jbd2 module is unloaded. For
 1830 * this reason we don't need a mutex to protect access to
 1831 * jbd2_slab[] when allocating or releasing memory; one is needed only in
1832 * jbd2_journal_create_slab().
1833 */
1834#define JBD2_MAX_SLABS 8
1835static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1836static DECLARE_MUTEX(jbd2_slab_create_sem);
1837
1838static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1839 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
1840 "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
1841};
1842
1843
1844static void jbd2_journal_destroy_slabs(void)
1845{
1846 int i;
1847
1848 for (i = 0; i < JBD2_MAX_SLABS; i++) {
1849 if (jbd2_slab[i])
1850 kmem_cache_destroy(jbd2_slab[i]);
1851 jbd2_slab[i] = NULL;
1852 }
1853}
1854
1855static int jbd2_journal_create_slab(size_t size)
1856{
1857 int i = order_base_2(size) - 10;
1858 size_t slab_size;
1859
1860 if (size == PAGE_SIZE)
1861 return 0;
1862
1863 if (i >= JBD2_MAX_SLABS)
1864 return -EINVAL;
1865
1866 if (unlikely(i < 0))
1867 i = 0;
1868 down(&jbd2_slab_create_sem);
1869 if (jbd2_slab[i]) {
1870 up(&jbd2_slab_create_sem);
1871 return 0; /* Already created */
1872 }
1873
1874 slab_size = 1 << (i+10);
1875 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1876 slab_size, 0, NULL);
1877 up(&jbd2_slab_create_sem);
1878 if (!jbd2_slab[i]) {
1879 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1880 return -ENOMEM;
1881 }
1882 return 0;
1883}
1884
1885static struct kmem_cache *get_slab(size_t size)
1886{
1887 int i = order_base_2(size) - 10;
1888
1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0))
1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0);
1893 return jbd2_slab[i];
1894}
1895
1896void *jbd2_alloc(size_t size, gfp_t flags)
1897{
1898 void *ptr;
1899
1900 BUG_ON(size & (size-1)); /* Must be a power of 2 */
1901
1902 flags |= __GFP_REPEAT;
1903 if (size == PAGE_SIZE)
1904 ptr = (void *)__get_free_pages(flags, 0);
1905 else if (size > PAGE_SIZE) {
1906 int order = get_order(size);
1907
1908 if (order < 3)
1909 ptr = (void *)__get_free_pages(flags, order);
1910 else
1911 ptr = vmalloc(size);
1912 } else
1913 ptr = kmem_cache_alloc(get_slab(size), flags);
1914
1915 /* Check alignment; SLUB has gotten this wrong in the past,
1916 * and this can lead to user data corruption! */
1917 BUG_ON(((unsigned long) ptr) & (size-1));
1918
1919 return ptr;
1920}
1921
1922void jbd2_free(void *ptr, size_t size)
1923{
1924 if (size == PAGE_SIZE) {
1925 free_pages((unsigned long)ptr, 0);
1926 return;
1927 }
1928 if (size > PAGE_SIZE) {
1929 int order = get_order(size);
1930
1931 if (order < 3)
1932 free_pages((unsigned long)ptr, order);
1933 else
1934 vfree(ptr);
1935 return;
1936 }
1937 kmem_cache_free(get_slab(size), ptr);
1938};
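
The slab index is derived from the block size as order_base_2(size) - 10, so 1k maps to slot 0 ("jbd2_1k"), 4k to slot 2, and 128k to slot 7, in line with jbd2_slab_names[]. A hedged sketch of the mapping (the helper name is illustrative; the real code open-codes this in get_slab() and jbd2_journal_create_slab()):

	#include <linux/log2.h>

	static int jbd2_slab_index(size_t size)	/* hypothetical helper */
	{
		return order_base_2(size) - 10;	/* 1k -> 0, 2k -> 1, ..., 128k -> 7 */
	}

Note that jbd2_alloc() never touches the slabs for page-sized or larger blocks: a page-sized request becomes a single page allocation, multi-page requests use __get_free_pages() up to order 2 and fall back to vmalloc() beyond that, so a 16k block on a 4k-page system is an order-2 page allocation rather than a slab hit.
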
1939
1940/*
1810 * Journal_head storage management 1941 * Journal_head storage management
1811 */ 1942 */
1812static struct kmem_cache *jbd2_journal_head_cache; 1943static struct kmem_cache *jbd2_journal_head_cache;
@@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
2204 jbd2_journal_destroy_revoke_caches(); 2335 jbd2_journal_destroy_revoke_caches();
2205 jbd2_journal_destroy_jbd2_journal_head_cache(); 2336 jbd2_journal_destroy_jbd2_journal_head_cache();
2206 jbd2_journal_destroy_handle_cache(); 2337 jbd2_journal_destroy_handle_cache();
2338 jbd2_journal_destroy_slabs();
2207} 2339}
2208 2340
2209static int __init journal_init(void) 2341static int __init journal_init(void)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542f..bfc70f57900f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1727 if (!jh) 1727 if (!jh)
1728 goto zap_buffer_no_jh; 1728 goto zap_buffer_no_jh;
1729 1729
1730 /*
1731 * We cannot remove the buffer from checkpoint lists until the
 1732 * transaction adding the inode to the orphan list (call it T)
1733 * is committed. Otherwise if the transaction changing the
1734 * buffer would be cleaned from the journal before T is
 1735 * committed, a crash could cause the correct contents of
 1736 * the buffer to be lost. On the other hand we have to
 1737 * clear the buffer dirty bit at the latest when the
1738 * transaction marking the buffer as freed in the filesystem
1739 * structures is committed because from that moment on the
1740 * buffer can be reallocated and used by a different page.
1741 * Since the block hasn't been freed yet but the inode has
 1742 * already been added to the orphan list, it is safe for us to add
1743 * the buffer to BJ_Forget list of the newest transaction.
1744 */
1730 transaction = jh->b_transaction; 1745 transaction = jh->b_transaction;
1731 if (transaction == NULL) { 1746 if (transaction == NULL) {
1732 /* First case: not on any transaction. If it 1747 /* First case: not on any transaction. If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1783 } else if (transaction == journal->j_committing_transaction) { 1798 } else if (transaction == journal->j_committing_transaction) {
1784 JBUFFER_TRACE(jh, "on committing transaction"); 1799 JBUFFER_TRACE(jh, "on committing transaction");
1785 /* 1800 /*
1786 * If it is committing, we simply cannot touch it. We 1801 * The buffer is committing; we simply cannot touch
1787 * can remove it's next_transaction pointer from the 1802 * it. So we just set b_next_transaction to the
1788 * running transaction if that is set, but nothing 1803 * running transaction (if there is one) and mark
1789 * else. */ 1804 * buffer as freed so that commit code knows it should
1805 * clear dirty bits when it is done with the buffer.
1806 */
1790 set_buffer_freed(bh); 1807 set_buffer_freed(bh);
1791 if (jh->b_next_transaction) { 1808 if (journal->j_running_transaction && buffer_jbddirty(bh))
1792 J_ASSERT(jh->b_next_transaction == 1809 jh->b_next_transaction = journal->j_running_transaction;
1793 journal->j_running_transaction);
1794 jh->b_next_transaction = NULL;
1795 }
1796 jbd2_journal_put_journal_head(jh); 1810 jbd2_journal_put_journal_head(jh);
1797 spin_unlock(&journal->j_list_lock); 1811 spin_unlock(&journal->j_list_lock);
1798 jbd_unlock_bh_state(bh); 1812 jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
1969 */ 1983 */
1970void __jbd2_journal_refile_buffer(struct journal_head *jh) 1984void __jbd2_journal_refile_buffer(struct journal_head *jh)
1971{ 1985{
1972 int was_dirty; 1986 int was_dirty, jlist;
1973 struct buffer_head *bh = jh2bh(jh); 1987 struct buffer_head *bh = jh2bh(jh);
1974 1988
1975 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 1989 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
1991 __jbd2_journal_temp_unlink_buffer(jh); 2005 __jbd2_journal_temp_unlink_buffer(jh);
1992 jh->b_transaction = jh->b_next_transaction; 2006 jh->b_transaction = jh->b_next_transaction;
1993 jh->b_next_transaction = NULL; 2007 jh->b_next_transaction = NULL;
1994 __jbd2_journal_file_buffer(jh, jh->b_transaction, 2008 if (buffer_freed(bh))
1995 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2009 jlist = BJ_Forget;
2010 else if (jh->b_modified)
2011 jlist = BJ_Metadata;
2012 else
2013 jlist = BJ_Reserved;
2014 __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
1996 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2015 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
1997 2016
1998 if (was_dirty) 2017 if (was_dirty)
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30f..df8a19ef870d 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a7960310349..1cb0d81b164b 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,15 +29,14 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/vfs.h> 30#include <linux/vfs.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/mutex.h>
33#include <linux/string.h> 32#include <linux/string.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35#include <linux/zlib.h>
36 34
37#include "squashfs_fs.h" 35#include "squashfs_fs.h"
38#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
39#include "squashfs_fs_i.h" 37#include "squashfs_fs_i.h"
40#include "squashfs.h" 38#include "squashfs.h"
39#include "decompressor.h"
41 40
42/* 41/*
43 * Read the metadata block length, this is stored in the first two 42 * Read the metadata block length, this is stored in the first two
@@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
153 } 152 }
154 153
155 if (compressed) { 154 if (compressed) {
156 int zlib_err = 0, zlib_init = 0; 155 length = squashfs_decompress(msblk, buffer, bh, b, offset,
157 156 length, srclength, pages);
158 /* 157 if (length < 0)
159 * Uncompress block. 158 goto read_failure;
160 */
161
162 mutex_lock(&msblk->read_data_mutex);
163
164 msblk->stream.avail_out = 0;
165 msblk->stream.avail_in = 0;
166
167 bytes = length;
168 do {
169 if (msblk->stream.avail_in == 0 && k < b) {
170 avail = min(bytes, msblk->devblksize - offset);
171 bytes -= avail;
172 wait_on_buffer(bh[k]);
173 if (!buffer_uptodate(bh[k]))
174 goto release_mutex;
175
176 if (avail == 0) {
177 offset = 0;
178 put_bh(bh[k++]);
179 continue;
180 }
181
182 msblk->stream.next_in = bh[k]->b_data + offset;
183 msblk->stream.avail_in = avail;
184 offset = 0;
185 }
186
187 if (msblk->stream.avail_out == 0 && page < pages) {
188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 }
191
192 if (!zlib_init) {
193 zlib_err = zlib_inflateInit(&msblk->stream);
194 if (zlib_err != Z_OK) {
195 ERROR("zlib_inflateInit returned"
196 " unexpected result 0x%x,"
197 " srclength %d\n", zlib_err,
198 srclength);
199 goto release_mutex;
200 }
201 zlib_init = 1;
202 }
203
204 zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
205
206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK);
209
210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate error, data probably corrupt\n");
212 goto release_mutex;
213 }
214
215 zlib_err = zlib_inflateEnd(&msblk->stream);
216 if (zlib_err != Z_OK) {
217 ERROR("zlib_inflate error, data probably corrupt\n");
218 goto release_mutex;
219 }
220 length = msblk->stream.total_out;
221 mutex_unlock(&msblk->read_data_mutex);
222 } else { 159 } else {
223 /* 160 /*
224 * Block is uncompressed. 161 * Block is uncompressed.
@@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
255 kfree(bh); 192 kfree(bh);
256 return length; 193 return length;
257 194
258release_mutex:
259 mutex_unlock(&msblk->read_data_mutex);
260
261block_release: 195block_release:
262 for (; k < b; k++) 196 for (; k < b; k++)
263 put_bh(bh[k]); 197 put_bh(bh[k]);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d6..57314bee9059 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
51#include <linux/sched.h> 51#include <linux/sched.h>
52#include <linux/spinlock.h> 52#include <linux/spinlock.h>
53#include <linux/wait.h> 53#include <linux/wait.h>
54#include <linux/zlib.h>
55#include <linux/pagemap.h> 54#include <linux/pagemap.h>
56 55
57#include "squashfs_fs.h" 56#include "squashfs_fs.h"
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 000000000000..157478da6ac9
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,68 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * decompressor.c
22 */
23
24#include <linux/types.h>
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27
28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h"
32#include "squashfs.h"
33
34/*
35 * This file (and decompressor.h) implements a decompressor framework for
36 * Squashfs, allowing multiple decompressors to be easily supported
37 */
38
39static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
41};
42
43static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
45};
46
47static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
48 NULL, NULL, NULL, 0, "unknown", 0
49};
50
51static const struct squashfs_decompressor *decompressor[] = {
52 &squashfs_zlib_comp_ops,
53 &squashfs_lzma_unsupported_comp_ops,
54 &squashfs_lzo_unsupported_comp_ops,
55 &squashfs_unknown_comp_ops
56};
57
58
59const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
60{
61 int i;
62
63 for (i = 0; decompressor[i]->id; i++)
64 if (id == decompressor[i]->id)
65 break;
66
67 return decompressor[i];
68}
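
The lookup relies on the terminating squashfs_unknown_comp_ops entry having id 0, so the loop always lands on a valid element even for an unrecognized id. For comparison, a supported entry is expected to look roughly like the zlib one that zlib_wrapper.c exports (the callback names here are assumptions inferred from the ops struct, since that file's hunk is not shown at this point):

	const struct squashfs_decompressor squashfs_zlib_comp_ops = {
		.init = zlib_init,		/* assumed callback names */
		.free = zlib_free,
		.decompress = zlib_uncompress,
		.id = ZLIB_COMPRESSION,
		.name = "zlib",
		.supported = 1
	};
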
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 000000000000..7425f80783f6
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
1#ifndef DECOMPRESSOR_H
2#define DECOMPRESSOR_H
3/*
4 * Squashfs - a compressed read only filesystem for Linux
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * decompressor.h
24 */
25
26struct squashfs_decompressor {
27 void *(*init)(struct squashfs_sb_info *);
28 void (*free)(void *);
29 int (*decompress)(struct squashfs_sb_info *, void **,
30 struct buffer_head **, int, int, int, int, int);
31 int id;
32 char *name;
33 int supported;
34};
35
36static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
37{
38 return msblk->decompressor->init(msblk);
39}
40
41static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
42 void *s)
43{
44 if (msblk->decompressor)
45 msblk->decompressor->free(s);
46}
47
48static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
49 void **buffer, struct buffer_head **bh, int b, int offset, int length,
50 int srclength, int pages)
51{
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages);
54}
55#endif
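
These wrappers keep callers compression-agnostic; note that squashfs_decompressor_free() checks msblk->decompressor first, so it is safe to call on an early error path before the decompressor has been looked up. A hedged usage sketch mirroring what super.c does later in this patch:

	/* at mount: allocate per-superblock decompressor state */
	msblk->stream = squashfs_decompressor_init(msblk);
	if (msblk->stream == NULL)
		goto failed_mount;

	/* on unmount or failed mount: release it via the same ops table */
	squashfs_decompressor_free(msblk, msblk->stream);
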
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed868..12b933ac6585 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/zlib.h>
34 33
35#include "squashfs_fs.h" 34#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 35#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e037..7f93d5a9ee05 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
39#include <linux/vfs.h> 39#include <linux/vfs.h>
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h>
43#include <linux/slab.h> 42#include <linux/slab.h>
44 43
45#include "squashfs_fs.h" 44#include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831df..a25c5060bdcb 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
47#include <linux/string.h> 47#include <linux/string.h>
48#include <linux/pagemap.h> 48#include <linux/pagemap.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/zlib.h>
51 50
52#include "squashfs_fs.h" 51#include "squashfs_fs.h"
53#include "squashfs_fs_sb.h" 52#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc7..7c90bbd6879d 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/zlib.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba28..b7f64bcd2b70 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
34#include <linux/fs.h> 34#include <linux/fs.h>
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/zlib.h>
38 37
39#include "squashfs_fs.h" 38#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39ec..49daaf669e41 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,7 +40,6 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/zlib.h>
44 43
45#include "squashfs_fs.h" 44#include "squashfs_fs.h"
46#include "squashfs_fs_sb.h" 45#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22b..5266bd8ad932 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,7 +57,6 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/zlib.h>
61 60
62#include "squashfs_fs.h" 61#include "squashfs_fs.h"
63#include "squashfs_fs_sb.h" 62#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7e..fe2587af5512 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
51 u64, int); 51 u64, int);
52extern int squashfs_read_table(struct super_block *, void *, u64, int); 52extern int squashfs_read_table(struct super_block *, void *, u64, int);
53 53
54/* decompressor.c */
55extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
56
54/* export.c */ 57/* export.c */
55extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, 58extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
56 unsigned int); 59 unsigned int);
@@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
71extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
72 75
73/* 76/*
74 * Inodes and files operations 77 * Inodes, files and decompressor operations
75 */ 78 */
76 79
77/* dir.c */ 80/* dir.c */
@@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops;
88 91
89/* symlink.c */ 92/* symlink.c */
90extern const struct address_space_operations squashfs_symlink_aops; 93extern const struct address_space_operations squashfs_symlink_aops;
94
95/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568e..79024245ea00 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,8 +183,6 @@
183#define SQUASHFS_MAX_FILE_SIZE (1LL << \ 183#define SQUASHFS_MAX_FILE_SIZE (1LL << \
184 (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) 184 (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
185 185
186#define SQUASHFS_MARKER_BYTE 0xff
187
188/* meta index cache */ 186/* meta index cache */
189#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) 187#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
190#define SQUASHFS_META_ENTRIES 127 188#define SQUASHFS_META_ENTRIES 127
@@ -211,7 +209,9 @@ struct meta_index {
211/* 209/*
212 * definitions for structures on disk 210 * definitions for structures on disk
213 */ 211 */
214#define ZLIB_COMPRESSION 1 212#define ZLIB_COMPRESSION 1
213#define LZMA_COMPRESSION 2
214#define LZO_COMPRESSION 3
215 215
216struct squashfs_super_block { 216struct squashfs_super_block {
217 __le32 s_magic; 217 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1c..2e77dc547e25 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,25 @@ struct squashfs_cache_entry {
52}; 52};
53 53
54struct squashfs_sb_info { 54struct squashfs_sb_info {
55 int devblksize; 55 const struct squashfs_decompressor *decompressor;
56 int devblksize_log2; 56 int devblksize;
57 struct squashfs_cache *block_cache; 57 int devblksize_log2;
58 struct squashfs_cache *fragment_cache; 58 struct squashfs_cache *block_cache;
59 struct squashfs_cache *read_page; 59 struct squashfs_cache *fragment_cache;
60 int next_meta_index; 60 struct squashfs_cache *read_page;
61 __le64 *id_table; 61 int next_meta_index;
62 __le64 *fragment_index; 62 __le64 *id_table;
63 unsigned int *fragment_index_2; 63 __le64 *fragment_index;
64 struct mutex read_data_mutex; 64 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 65 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 66 struct meta_index *meta_index;
67 z_stream stream; 67 void *stream;
68 __le64 *inode_lookup_table; 68 __le64 *inode_lookup_table;
69 u64 inode_table; 69 u64 inode_table;
70 u64 directory_table; 70 u64 directory_table;
71 unsigned int block_size; 71 unsigned int block_size;
72 unsigned short block_log; 72 unsigned short block_log;
73 long long bytes_used; 73 long long bytes_used;
74 unsigned int inodes; 74 unsigned int inodes;
75}; 75};
76#endif 76#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53add..3550aec2f655 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,34 +35,41 @@
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/zlib.h>
39#include <linux/magic.h> 38#include <linux/magic.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
44#include "squashfs.h" 43#include "squashfs.h"
44#include "decompressor.h"
45 45
46static struct file_system_type squashfs_fs_type; 46static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 47static const struct super_operations squashfs_super_ops;
48 48
49static int supported_squashfs_filesystem(short major, short minor, short comp) 49static const struct squashfs_decompressor *supported_squashfs_filesystem(short
50 major, short minor, short id)
50{ 51{
52 const struct squashfs_decompressor *decompressor;
53
51 if (major < SQUASHFS_MAJOR) { 54 if (major < SQUASHFS_MAJOR) {
52 ERROR("Major/Minor mismatch, older Squashfs %d.%d " 55 ERROR("Major/Minor mismatch, older Squashfs %d.%d "
53 "filesystems are unsupported\n", major, minor); 56 "filesystems are unsupported\n", major, minor);
54 return -EINVAL; 57 return NULL;
55 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { 58 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
56 ERROR("Major/Minor mismatch, trying to mount newer " 59 ERROR("Major/Minor mismatch, trying to mount newer "
57 "%d.%d filesystem\n", major, minor); 60 "%d.%d filesystem\n", major, minor);
58 ERROR("Please update your kernel\n"); 61 ERROR("Please update your kernel\n");
59 return -EINVAL; 62 return NULL;
60 } 63 }
61 64
62 if (comp != ZLIB_COMPRESSION) 65 decompressor = squashfs_lookup_decompressor(id);
63 return -EINVAL; 66 if (!decompressor->supported) {
67 ERROR("Filesystem uses \"%s\" compression. This is not "
68 "supported\n", decompressor->name);
69 return NULL;
70 }
64 71
65 return 0; 72 return decompressor;
66} 73}
67 74
68 75
@@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
87 } 94 }
88 msblk = sb->s_fs_info; 95 msblk = sb->s_fs_info;
89 96
90 msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
91 GFP_KERNEL);
92 if (msblk->stream.workspace == NULL) {
93 ERROR("Failed to allocate zlib workspace\n");
94 goto failure;
95 }
96
97 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); 97 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
98 if (sblk == NULL) { 98 if (sblk == NULL) {
99 ERROR("Failed to allocate squashfs_super_block\n"); 99 ERROR("Failed to allocate squashfs_super_block\n");
@@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
120 goto failed_mount; 120 goto failed_mount;
121 } 121 }
122 122
123 err = -EINVAL;
124
123 /* Check it is a SQUASHFS superblock */ 125 /* Check it is a SQUASHFS superblock */
124 sb->s_magic = le32_to_cpu(sblk->s_magic); 126 sb->s_magic = le32_to_cpu(sblk->s_magic);
125 if (sb->s_magic != SQUASHFS_MAGIC) { 127 if (sb->s_magic != SQUASHFS_MAGIC) {
126 if (!silent) 128 if (!silent)
127 ERROR("Can't find a SQUASHFS superblock on %s\n", 129 ERROR("Can't find a SQUASHFS superblock on %s\n",
128 bdevname(sb->s_bdev, b)); 130 bdevname(sb->s_bdev, b));
129 err = -EINVAL;
130 goto failed_mount; 131 goto failed_mount;
131 } 132 }
132 133
133 /* Check the MAJOR & MINOR versions and compression type */ 134 /* Check the MAJOR & MINOR versions and lookup compression type */
134 err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), 135 msblk->decompressor = supported_squashfs_filesystem(
136 le16_to_cpu(sblk->s_major),
135 le16_to_cpu(sblk->s_minor), 137 le16_to_cpu(sblk->s_minor),
136 le16_to_cpu(sblk->compression)); 138 le16_to_cpu(sblk->compression));
137 if (err < 0) 139 if (msblk->decompressor == NULL)
138 goto failed_mount; 140 goto failed_mount;
139 141
140 err = -EINVAL;
141
142 /* 142 /*
143 * Check if there's xattrs in the filesystem. These are not 143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored. 144 * supported in this version, so warn that they will be ignored.
@@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
205 205
206 err = -ENOMEM; 206 err = -ENOMEM;
207 207
208 msblk->stream = squashfs_decompressor_init(msblk);
209 if (msblk->stream == NULL)
210 goto failed_mount;
211
208 msblk->block_cache = squashfs_cache_init("metadata", 212 msblk->block_cache = squashfs_cache_init("metadata",
209 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); 213 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
210 if (msblk->block_cache == NULL) 214 if (msblk->block_cache == NULL)
@@ -292,17 +296,16 @@ failed_mount:
292 squashfs_cache_delete(msblk->block_cache); 296 squashfs_cache_delete(msblk->block_cache);
293 squashfs_cache_delete(msblk->fragment_cache); 297 squashfs_cache_delete(msblk->fragment_cache);
294 squashfs_cache_delete(msblk->read_page); 298 squashfs_cache_delete(msblk->read_page);
299 squashfs_decompressor_free(msblk, msblk->stream);
295 kfree(msblk->inode_lookup_table); 300 kfree(msblk->inode_lookup_table);
296 kfree(msblk->fragment_index); 301 kfree(msblk->fragment_index);
297 kfree(msblk->id_table); 302 kfree(msblk->id_table);
298 kfree(msblk->stream.workspace);
299 kfree(sb->s_fs_info); 303 kfree(sb->s_fs_info);
300 sb->s_fs_info = NULL; 304 sb->s_fs_info = NULL;
301 kfree(sblk); 305 kfree(sblk);
302 return err; 306 return err;
303 307
304failure: 308failure:
305 kfree(msblk->stream.workspace);
306 kfree(sb->s_fs_info); 309 kfree(sb->s_fs_info);
307 sb->s_fs_info = NULL; 310 sb->s_fs_info = NULL;
308 return -ENOMEM; 311 return -ENOMEM;
@@ -346,10 +349,10 @@ static void squashfs_put_super(struct super_block *sb)
346 squashfs_cache_delete(sbi->block_cache); 349 squashfs_cache_delete(sbi->block_cache);
347 squashfs_cache_delete(sbi->fragment_cache); 350 squashfs_cache_delete(sbi->fragment_cache);
348 squashfs_cache_delete(sbi->read_page); 351 squashfs_cache_delete(sbi->read_page);
352 squashfs_decompressor_free(sbi, sbi->stream);
349 kfree(sbi->id_table); 353 kfree(sbi->id_table);
350 kfree(sbi->fragment_index); 354 kfree(sbi->fragment_index);
351 kfree(sbi->meta_index); 355 kfree(sbi->meta_index);
352 kfree(sbi->stream.workspace);
353 kfree(sb->s_fs_info); 356 kfree(sb->s_fs_info);
354 sb->s_fs_info = NULL; 357 sb->s_fs_info = NULL;
355 } 358 }
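squashfs_fill_super() now funnels all compressor-specific setup and teardown through two helpers. Assuming they are thin dispatchers onto the ops table (their real definitions are in the new decompressor.h/decompressor.c, not shown in this hunk), they would look roughly like:

static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
{
	return msblk->decompressor->init(msblk);
}

static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
						void *s)
{
	if (msblk->decompressor)
		msblk->decompressor->free(s);
}

A NULL check in the free path matters here: the failed_mount label above can be reached before the decompressor has been looked up, at which point msblk->decompressor is still NULL.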
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac8..e80be2022a7f 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -36,7 +36,6 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/pagemap.h> 38#include <linux/pagemap.h>
39#include <linux/zlib.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000000..4dd70e04333b
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,150 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * zlib_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/zlib.h>
28
29#include "squashfs_fs.h"
30#include "squashfs_fs_sb.h"
31#include "squashfs_fs_i.h"
32#include "squashfs.h"
33#include "decompressor.h"
34
35static void *zlib_init(struct squashfs_sb_info *dummy)
36{
37 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
38 if (stream == NULL)
39 goto failed;
40 stream->workspace = kmalloc(zlib_inflate_workspacesize(),
41 GFP_KERNEL);
42 if (stream->workspace == NULL)
43 goto failed;
44
45 return stream;
46
47failed:
48 ERROR("Failed to allocate zlib workspace\n");
49 kfree(stream);
50 return NULL;
51}
52
53
54static void zlib_free(void *strm)
55{
56 z_stream *stream = strm;
57
58 if (stream)
59 kfree(stream->workspace);
60 kfree(stream);
61}
62
63
64static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
65 struct buffer_head **bh, int b, int offset, int length, int srclength,
66 int pages)
67{
68 int zlib_err = 0, zlib_init = 0;
69 int avail, bytes, k = 0, page = 0;
70 z_stream *stream = msblk->stream;
71
72 mutex_lock(&msblk->read_data_mutex);
73
74 stream->avail_out = 0;
75 stream->avail_in = 0;
76
77 bytes = length;
78 do {
79 if (stream->avail_in == 0 && k < b) {
80 avail = min(bytes, msblk->devblksize - offset);
81 bytes -= avail;
82 wait_on_buffer(bh[k]);
83 if (!buffer_uptodate(bh[k]))
84 goto release_mutex;
85
86 if (avail == 0) {
87 offset = 0;
88 put_bh(bh[k++]);
89 continue;
90 }
91
92 stream->next_in = bh[k]->b_data + offset;
93 stream->avail_in = avail;
94 offset = 0;
95 }
96
97 if (stream->avail_out == 0 && page < pages) {
98 stream->next_out = buffer[page++];
99 stream->avail_out = PAGE_CACHE_SIZE;
100 }
101
102 if (!zlib_init) {
103 zlib_err = zlib_inflateInit(stream);
104 if (zlib_err != Z_OK) {
105 ERROR("zlib_inflateInit returned unexpected "
106 "result 0x%x, srclength %d\n",
107 zlib_err, srclength);
108 goto release_mutex;
109 }
110 zlib_init = 1;
111 }
112
113 zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
114
115 if (stream->avail_in == 0 && k < b)
116 put_bh(bh[k++]);
117 } while (zlib_err == Z_OK);
118
119 if (zlib_err != Z_STREAM_END) {
120 ERROR("zlib_inflate error, data probably corrupt\n");
121 goto release_mutex;
122 }
123
124 zlib_err = zlib_inflateEnd(stream);
125 if (zlib_err != Z_OK) {
126 ERROR("zlib_inflate error, data probably corrupt\n");
127 goto release_mutex;
128 }
129
130 mutex_unlock(&msblk->read_data_mutex);
131 return stream->total_out;
132
133release_mutex:
134 mutex_unlock(&msblk->read_data_mutex);
135
136 for (; k < b; k++)
137 put_bh(bh[k]);
138
139 return -EIO;
140}
141
142const struct squashfs_decompressor squashfs_zlib_comp_ops = {
143 .init = zlib_init,
144 .free = zlib_free,
145 .decompress = zlib_uncompress,
146 .id = ZLIB_COMPRESSION,
147 .name = "zlib",
148 .supported = 1
149};
150
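The inflate loop above is a standard zlib state machine: initialize once, feed input chunks and output pages as they drain, and spin on Z_SYNC_FLUSH until Z_STREAM_END. For readers who want to poke at the same pattern outside the kernel, here is a hedged userspace analogue using the ordinary zlib library API, simplified to a single input buffer and a single output buffer:

/* build: cc demo.c -lz */
#include <string.h>
#include <zlib.h>

int inflate_buffer(const unsigned char *src, unsigned long srclen,
		   unsigned char *dst, unsigned long dstlen)
{
	z_stream stream;
	int err;

	memset(&stream, 0, sizeof(stream));	/* zalloc/zfree default to Z_NULL */
	if (inflateInit(&stream) != Z_OK)
		return -1;

	stream.next_in = (unsigned char *)src;
	stream.avail_in = srclen;
	stream.next_out = dst;
	stream.avail_out = dstlen;

	/* Same loop shape as zlib_uncompress(): run until end-of-stream. */
	do {
		err = inflate(&stream, Z_SYNC_FLUSH);
	} while (err == Z_OK && stream.avail_out > 0);

	inflateEnd(&stream);
	return err == Z_STREAM_END ? (int)stream.total_out : -1;
}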
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 638ce4554c76..8ada2a129d08 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
69#define jbd_debug(f, a...) /**/ 69#define jbd_debug(f, a...) /**/
70#endif 70#endif
71 71
72static inline void *jbd2_alloc(size_t size, gfp_t flags) 72extern void *jbd2_alloc(size_t size, gfp_t flags);
73{ 73extern void jbd2_free(void *ptr, size_t size);
74 return (void *)__get_free_pages(flags, get_order(size));
75}
76
77static inline void jbd2_free(void *ptr, size_t size)
78{
79 free_pages((unsigned long)ptr, get_order(size));
80};
81 74
82#define JBD2_MIN_JOURNAL_BLOCKS 1024 75#define JBD2_MIN_JOURNAL_BLOCKS 1024
83 76
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 1b672f74a32f..e7d1b2e0070d 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -122,6 +122,11 @@ struct kprobe {
122/* Kprobe status flags */ 122/* Kprobe status flags */
123#define KPROBE_FLAG_GONE 1 /* breakpoint has already gone */ 123#define KPROBE_FLAG_GONE 1 /* breakpoint has already gone */
124#define KPROBE_FLAG_DISABLED 2 /* probe is temporarily disabled */ 124#define KPROBE_FLAG_DISABLED 2 /* probe is temporarily disabled */
125#define KPROBE_FLAG_OPTIMIZED 4 /*
126 * probe is really optimized.
127 * NOTE:
128 * this flag is only for optimized_kprobe.
129 */
125 130
126/* Has this kprobe gone ? */ 131/* Has this kprobe gone ? */
127static inline int kprobe_gone(struct kprobe *p) 132static inline int kprobe_gone(struct kprobe *p)
@@ -134,6 +139,12 @@ static inline int kprobe_disabled(struct kprobe *p)
134{ 139{
135 return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE); 140 return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE);
136} 141}
142
143/* Is this kprobe really running optimized path ? */
144static inline int kprobe_optimized(struct kprobe *p)
145{
146 return p->flags & KPROBE_FLAG_OPTIMIZED;
147}
137/* 148/*
138 * Special probe type that uses setjmp-longjmp type tricks to resume 149 * Special probe type that uses setjmp-longjmp type tricks to resume
139 * execution at a specified entry with a matching prototype corresponding 150 * execution at a specified entry with a matching prototype corresponding
@@ -249,6 +260,39 @@ extern kprobe_opcode_t *get_insn_slot(void);
249extern void free_insn_slot(kprobe_opcode_t *slot, int dirty); 260extern void free_insn_slot(kprobe_opcode_t *slot, int dirty);
250extern void kprobes_inc_nmissed_count(struct kprobe *p); 261extern void kprobes_inc_nmissed_count(struct kprobe *p);
251 262
263#ifdef CONFIG_OPTPROBES
264/*
265 * Internal structure for direct jump optimized probe
266 */
267struct optimized_kprobe {
268 struct kprobe kp;
269 struct list_head list; /* list for optimizing queue */
270 struct arch_optimized_insn optinsn;
271};
272
273/* Architecture dependent functions for direct jump optimization */
274extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
275extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
276extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op);
277extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
278extern int arch_optimize_kprobe(struct optimized_kprobe *op);
279extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
280extern kprobe_opcode_t *get_optinsn_slot(void);
281extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty);
282extern int arch_within_optimized_kprobe(struct optimized_kprobe *op,
283 unsigned long addr);
284
285extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);
286
287#ifdef CONFIG_SYSCTL
288extern int sysctl_kprobes_optimization;
289extern int proc_kprobes_optimization_handler(struct ctl_table *table,
290 int write, void __user *buffer,
291 size_t *length, loff_t *ppos);
292#endif
293
294#endif /* CONFIG_OPTPROBES */
295
252/* Get the kprobe at this addr (if any) - called with preemption disabled */ 296/* Get the kprobe at this addr (if any) - called with preemption disabled */
253struct kprobe *get_kprobe(void *addr); 297struct kprobe *get_kprobe(void *addr);
254void kretprobe_hash_lock(struct task_struct *tsk, 298void kretprobe_hash_lock(struct task_struct *tsk,
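For probe authors nothing changes at the API level: register_kprobe() is still the entry point, and the core decides on its own whether a probe can be transparently converted to a jump. A minimal module sketch (the symbol name is only an example target; any kallsyms-visible function works):

#include <linux/module.h>
#include <linux/kprobes.h>

static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %p\n", p->addr);
	return 0;
}

static struct kprobe kp = {
	.symbol_name	= "do_fork",	/* example target */
	.pre_handler	= handler_pre,
	/*
	 * No post_handler or break_handler: that keeps the probe
	 * eligible for jump optimization (optimize_kprobe() in
	 * kernel/kprobes.c below refuses probes that set either).
	 */
};

static int __init opt_demo_init(void)
{
	return register_kprobe(&kp);
}

static void __exit opt_demo_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(opt_demo_init);
module_exit(opt_demo_exit);
MODULE_LICENSE("GPL");

Whether the probe actually got optimized shows up as [OPTIMIZED] in the debugfs listing (see the report_probe() change in kernel/kprobes.c below).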
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index d0b6cd3afb2f..2aa6aa3e8f61 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -874,6 +874,107 @@ TRACE_EVENT(ext4_forget,
874 __entry->mode, __entry->is_metadata, __entry->block) 874 __entry->mode, __entry->is_metadata, __entry->block)
875); 875);
876 876
877TRACE_EVENT(ext4_da_update_reserve_space,
878 TP_PROTO(struct inode *inode, int used_blocks),
879
880 TP_ARGS(inode, used_blocks),
881
882 TP_STRUCT__entry(
883 __field( dev_t, dev )
884 __field( ino_t, ino )
885 __field( umode_t, mode )
886 __field( __u64, i_blocks )
887 __field( int, used_blocks )
888 __field( int, reserved_data_blocks )
889 __field( int, reserved_meta_blocks )
890 __field( int, allocated_meta_blocks )
891 ),
892
893 TP_fast_assign(
894 __entry->dev = inode->i_sb->s_dev;
895 __entry->ino = inode->i_ino;
896 __entry->mode = inode->i_mode;
897 __entry->i_blocks = inode->i_blocks;
898 __entry->used_blocks = used_blocks;
899 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
900 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
901 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
902 ),
903
904 TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
905 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
906 __entry->mode, (unsigned long long) __entry->i_blocks,
907 __entry->used_blocks, __entry->reserved_data_blocks,
908 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
909);
910
911TRACE_EVENT(ext4_da_reserve_space,
912 TP_PROTO(struct inode *inode, int md_needed),
913
914 TP_ARGS(inode, md_needed),
915
916 TP_STRUCT__entry(
917 __field( dev_t, dev )
918 __field( ino_t, ino )
919 __field( umode_t, mode )
920 __field( __u64, i_blocks )
921 __field( int, md_needed )
922 __field( int, reserved_data_blocks )
923 __field( int, reserved_meta_blocks )
924 ),
925
926 TP_fast_assign(
927 __entry->dev = inode->i_sb->s_dev;
928 __entry->ino = inode->i_ino;
929 __entry->mode = inode->i_mode;
930 __entry->i_blocks = inode->i_blocks;
931 __entry->md_needed = md_needed;
932 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
933 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
934 ),
935
936 TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d",
937 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
938 __entry->mode, (unsigned long long) __entry->i_blocks,
939 __entry->md_needed, __entry->reserved_data_blocks,
940 __entry->reserved_meta_blocks)
941);
942
943TRACE_EVENT(ext4_da_release_space,
944 TP_PROTO(struct inode *inode, int freed_blocks),
945
946 TP_ARGS(inode, freed_blocks),
947
948 TP_STRUCT__entry(
949 __field( dev_t, dev )
950 __field( ino_t, ino )
951 __field( umode_t, mode )
952 __field( __u64, i_blocks )
953 __field( int, freed_blocks )
954 __field( int, reserved_data_blocks )
955 __field( int, reserved_meta_blocks )
956 __field( int, allocated_meta_blocks )
957 ),
958
959 TP_fast_assign(
960 __entry->dev = inode->i_sb->s_dev;
961 __entry->ino = inode->i_ino;
962 __entry->mode = inode->i_mode;
963 __entry->i_blocks = inode->i_blocks;
964 __entry->freed_blocks = freed_blocks;
965 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
966 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
967 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
968 ),
969
970 TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
971 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
972 __entry->mode, (unsigned long long) __entry->i_blocks,
973 __entry->freed_blocks, __entry->reserved_data_blocks,
974 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
975);
976
977
877#endif /* _TRACE_EXT4_H */ 978#endif /* _TRACE_EXT4_H */
878 979
879/* This part must be outside protection */ 980/* This part must be outside protection */
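Each TRACE_EVENT() above expands into an inline trace_<name>() function with the TP_PROTO signature, so the corresponding call sites in fs/ext4 reduce to one line each. A hedged sketch of what the hook in ext4_da_reserve_space() presumably looks like (the fs/ext4/inode.c hunk is elsewhere in this series):

#include <trace/events/ext4.h>

	/* inside ext4_da_reserve_space(), once md_needed is computed */
	trace_ext4_da_reserve_space(inode, md_needed);

The heavy lifting (reading i_reserved_data_blocks and friends) happens in TP_fast_assign only when the tracepoint is enabled, so a disabled event costs little more than a branch.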
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index 96b370a050de..bf16545cc977 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -199,6 +199,34 @@ TRACE_EVENT(jbd2_checkpoint_stats,
199 __entry->forced_to_close, __entry->written, __entry->dropped) 199 __entry->forced_to_close, __entry->written, __entry->dropped)
200); 200);
201 201
202TRACE_EVENT(jbd2_cleanup_journal_tail,
203
204 TP_PROTO(journal_t *journal, tid_t first_tid,
205 unsigned long block_nr, unsigned long freed),
206
207 TP_ARGS(journal, first_tid, block_nr, freed),
208
209 TP_STRUCT__entry(
210 __field( dev_t, dev )
211 __field( tid_t, tail_sequence )
212 __field( tid_t, first_tid )
213 __field(unsigned long, block_nr )
214 __field(unsigned long, freed )
215 ),
216
217 TP_fast_assign(
218 __entry->dev = journal->j_fs_dev->bd_dev;
219 __entry->tail_sequence = journal->j_tail_sequence;
220 __entry->first_tid = first_tid;
221 __entry->block_nr = block_nr;
222 __entry->freed = freed;
223 ),
224
225 TP_printk("dev %s from %u to %u offset %lu freed %lu",
226 jbd2_dev_to_name(__entry->dev), __entry->tail_sequence,
227 __entry->first_tid, __entry->block_nr, __entry->freed)
228);
229
202#endif /* _TRACE_JBD2_H */ 230#endif /* _TRACE_JBD2_H */
203 231
204/* This part must be outside protection */ 232/* This part must be outside protection */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ccec774c716d..fa034d29cf73 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,9 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
47#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h>
48 50
49#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
50#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
105 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
106 * is a recipe for disaster 108 * is a recipe for disaster
107 */ 109 */
108#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
109
110struct kprobe_insn_page { 110struct kprobe_insn_page {
111 struct list_head list; 111 struct list_head list;
112 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
113 char slot_used[INSNS_PER_PAGE];
114 int nused; 113 int nused;
115 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
116}; 126};
117 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
118enum kprobe_slot_state { 133enum kprobe_slot_state {
119 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
120 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
121 SLOT_USED = 2, 136 SLOT_USED = 2,
122}; 137};
123 138
124static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
125static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
126static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
127static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
143 .nr_garbage = 0,
144};
145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
128 146
129/** 147/**
130 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
131 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
132 */ 150 */
133static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
134{ 152{
135 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
136 154
137 retry: 155 retry:
138 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
139 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
140 int i; 158 int i;
141 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
142 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
143 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
144 kip->nused++; 162 kip->nused++;
145 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
146 } 164 }
147 } 165 }
148 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
149 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
150 } 169 }
151 } 170 }
152 171
153 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
154 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
155 goto retry; 174 goto retry;
156 } 175
157 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
158 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
159 if (!kip) 178 if (!kip)
160 return NULL; 179 return NULL;
161 180
@@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
170 return NULL; 189 return NULL;
171 } 190 }
172 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
173 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
174 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
175 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
176 kip->nused = 1; 194 kip->nused = 1;
177 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
178 return kip->insns; 197 return kip->insns;
179} 198}
180 199
200
181kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
182{ 202{
183 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
184 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
185 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
186 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
187 return ret; 209 return ret;
188} 210}
189 211
@@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
199 * so as not to have to set it up again the 221 * so as not to have to set it up again the
200 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
201 */ 223 */
202 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
203 list_del(&kip->list); 225 list_del(&kip->list);
204 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
205 kfree(kip); 227 kfree(kip);
@@ -209,51 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
209 return 0; 231 return 0;
210} 232}
211 233
212static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
213{ 235{
214 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
215 237
216 /* Ensure no-one is interrupted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
217 synchronize_sched(); 239 synchronize_sched();
218 240
219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
220 int i; 242 int i;
221 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
222 continue; 244 continue;
223 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
224 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
225 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
226 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
227 break; 249 break;
228 } 250 }
229 } 251 }
230 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
231 return 0; 253 return 0;
232} 254}
233 255
234void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
235{ 258{
236 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
237 260
238 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
239 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) / c->insn_size;
240 if (kip->insns <= slot && 263 if (idx >= 0 && idx < slots_per_page(c)) {
241 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 WARN_ON(kip->slot_used[idx] != SLOT_USED);
242 int i = (slot - kip->insns) / MAX_INSN_SIZE;
243 if (dirty) { 265 if (dirty) {
244 kip->slot_used[i] = SLOT_DIRTY; 266 kip->slot_used[idx] = SLOT_DIRTY;
245 kip->ngarbage++; 267 kip->ngarbage++;
268 if (++c->nr_garbage > slots_per_page(c))
269 collect_garbage_slots(c);
246 } else 270 } else
247 collect_one_slot(kip, i); 271 collect_one_slot(kip, idx);
248 break; 272 return;
249 } 273 }
250 } 274 }
275 /* Could not free this slot. */
276 WARN_ON(1);
277}
251 278
252 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 279void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
253 collect_garbage_slots(); 280{
254 281 mutex_lock(&kprobe_insn_mutex);
282 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
255 mutex_unlock(&kprobe_insn_mutex); 283 mutex_unlock(&kprobe_insn_mutex);
256} 284}
285#ifdef CONFIG_OPTPROBES
286/* For optimized_kprobe buffer */
287static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
288static struct kprobe_insn_cache kprobe_optinsn_slots = {
289 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
290 /* .insn_size is initialized later */
291 .nr_garbage = 0,
292};
293/* Get a slot for optimized_kprobe buffer */
294kprobe_opcode_t __kprobes *get_optinsn_slot(void)
295{
296 kprobe_opcode_t *ret = NULL;
297
298 mutex_lock(&kprobe_optinsn_mutex);
299 ret = __get_insn_slot(&kprobe_optinsn_slots);
300 mutex_unlock(&kprobe_optinsn_mutex);
301
302 return ret;
303}
304
305void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
306{
307 mutex_lock(&kprobe_optinsn_mutex);
308 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
309 mutex_unlock(&kprobe_optinsn_mutex);
310}
311#endif
257#endif 312#endif
258 313
259/* We have preemption disabled.. so it is safe to use __ versions */ 314/* We have preemption disabled.. so it is safe to use __ versions */
@@ -284,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
284 if (p->addr == addr) 339 if (p->addr == addr)
285 return p; 340 return p;
286 } 341 }
342
343 return NULL;
344}
345
346static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
347
348/* Return true if the kprobe is an aggregator */
349static inline int kprobe_aggrprobe(struct kprobe *p)
350{
351 return p->pre_handler == aggr_pre_handler;
352}
353
354/*
355 * Keep all fields in the kprobe consistent
356 */
357static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
358{
359 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
360 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
361}
362
363#ifdef CONFIG_OPTPROBES
364/* NOTE: change this value only with kprobe_mutex held */
365static bool kprobes_allow_optimization;
366
367/*
368 * Call all pre_handler on the list, but ignores its return value.
369 * This must be called from arch-dep optimized caller.
370 */
371void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
372{
373 struct kprobe *kp;
374
375 list_for_each_entry_rcu(kp, &p->list, list) {
376 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
377 set_kprobe_instance(kp);
378 kp->pre_handler(kp, regs);
379 }
380 reset_kprobe_instance();
381 }
382}
383
384/* Return true(!0) if the kprobe is ready for optimization. */
385static inline int kprobe_optready(struct kprobe *p)
386{
387 struct optimized_kprobe *op;
388
389 if (kprobe_aggrprobe(p)) {
390 op = container_of(p, struct optimized_kprobe, kp);
391 return arch_prepared_optinsn(&op->optinsn);
392 }
393
394 return 0;
395}
396
397/*
398 * Return an optimized kprobe whose optimizing code replaces
 399 * instructions including addr (excluding the breakpoint itself).
400 */
401struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
402{
403 int i;
404 struct kprobe *p = NULL;
405 struct optimized_kprobe *op;
406
407 /* Don't check i == 0, since that is a breakpoint case. */
408 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
409 p = get_kprobe((void *)(addr - i));
410
411 if (p && kprobe_optready(p)) {
412 op = container_of(p, struct optimized_kprobe, kp);
413 if (arch_within_optimized_kprobe(op, addr))
414 return p;
415 }
416
287 return NULL; 417 return NULL;
288} 418}
289 419
420/* Optimization staging list, protected by kprobe_mutex */
421static LIST_HEAD(optimizing_list);
422
423static void kprobe_optimizer(struct work_struct *work);
424static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
425#define OPTIMIZE_DELAY 5
426
427/* Kprobe jump optimizer */
428static __kprobes void kprobe_optimizer(struct work_struct *work)
429{
430 struct optimized_kprobe *op, *tmp;
431
432 /* Lock modules while optimizing kprobes */
433 mutex_lock(&module_mutex);
434 mutex_lock(&kprobe_mutex);
435 if (kprobes_all_disarmed || !kprobes_allow_optimization)
436 goto end;
437
438 /*
 439 * Wait for a quiescence period to ensure all running interrupts
 440 * are done. Because an optprobe may modify multiple instructions,
 441 * there is a chance that the Nth instruction is interrupted. In that
 442 * case, a running interrupt can return into the 2nd-Nth bytes of the
 443 * jump instruction. This wait avoids that.
444 */
445 synchronize_sched();
446
447 /*
 448 * The optimization/unoptimization references online_cpus via
 449 * stop_machine(), while cpu-hotplug modifies online_cpus. At the
 450 * same time, text_mutex is held both by cpu-hotplug and here.
 451 * This combination can cause a deadlock (cpu-hotplug tries to lock
 452 * text_mutex, but stop_machine cannot proceed because online_cpus
 453 * has changed).
 454 * To avoid this deadlock, we call get_online_cpus() to keep
 455 * cpu-hotplug out while text_mutex is held.
456 */
457 get_online_cpus();
458 mutex_lock(&text_mutex);
459 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
460 WARN_ON(kprobe_disabled(&op->kp));
461 if (arch_optimize_kprobe(op) < 0)
462 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
463 list_del_init(&op->list);
464 }
465 mutex_unlock(&text_mutex);
466 put_online_cpus();
467end:
468 mutex_unlock(&kprobe_mutex);
469 mutex_unlock(&module_mutex);
470}
471
472/* Optimize kprobe if p is ready to be optimized */
473static __kprobes void optimize_kprobe(struct kprobe *p)
474{
475 struct optimized_kprobe *op;
476
477 /* Check if the kprobe is disabled or not ready for optimization. */
478 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
479 (kprobe_disabled(p) || kprobes_all_disarmed))
480 return;
481
 482 /* Neither break_handler nor post_handler is supported. */
483 if (p->break_handler || p->post_handler)
484 return;
485
486 op = container_of(p, struct optimized_kprobe, kp);
487
 488 /* Check that no other kprobes overlap the optimized instructions */
489 if (arch_check_optimized_kprobe(op) < 0)
490 return;
491
492 /* Check if it is already optimized. */
493 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
494 return;
495
496 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
497 list_add(&op->list, &optimizing_list);
498 if (!delayed_work_pending(&optimizing_work))
499 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
500}
501
502/* Unoptimize a kprobe if p is optimized */
503static __kprobes void unoptimize_kprobe(struct kprobe *p)
504{
505 struct optimized_kprobe *op;
506
507 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
508 op = container_of(p, struct optimized_kprobe, kp);
509 if (!list_empty(&op->list))
510 /* Dequeue from the optimization queue */
511 list_del_init(&op->list);
512 else
513 /* Replace jump with break */
514 arch_unoptimize_kprobe(op);
515 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
516 }
517}
518
519/* Remove optimized instructions */
520static void __kprobes kill_optimized_kprobe(struct kprobe *p)
521{
522 struct optimized_kprobe *op;
523
524 op = container_of(p, struct optimized_kprobe, kp);
525 if (!list_empty(&op->list)) {
526 /* Dequeue from the optimization queue */
527 list_del_init(&op->list);
528 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
529 }
530 /* Don't unoptimize, because the target code will be freed. */
531 arch_remove_optimized_kprobe(op);
532}
533
534/* Try to prepare optimized instructions */
535static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
536{
537 struct optimized_kprobe *op;
538
539 op = container_of(p, struct optimized_kprobe, kp);
540 arch_prepare_optimized_kprobe(op);
541}
542
543/* Free optimized instructions and optimized_kprobe */
544static __kprobes void free_aggr_kprobe(struct kprobe *p)
545{
546 struct optimized_kprobe *op;
547
548 op = container_of(p, struct optimized_kprobe, kp);
549 arch_remove_optimized_kprobe(op);
550 kfree(op);
551}
552
553/* Allocate new optimized_kprobe and try to prepare optimized instructions */
554static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
555{
556 struct optimized_kprobe *op;
557
558 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
559 if (!op)
560 return NULL;
561
562 INIT_LIST_HEAD(&op->list);
563 op->kp.addr = p->addr;
564 arch_prepare_optimized_kprobe(op);
565
566 return &op->kp;
567}
568
569static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
570
571/*
572 * Prepare an optimized_kprobe and optimize it
573 * NOTE: p must be a normal registered kprobe
574 */
575static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
576{
577 struct kprobe *ap;
578 struct optimized_kprobe *op;
579
580 ap = alloc_aggr_kprobe(p);
581 if (!ap)
582 return;
583
584 op = container_of(ap, struct optimized_kprobe, kp);
585 if (!arch_prepared_optinsn(&op->optinsn)) {
586 /* If failed to setup optimizing, fallback to kprobe */
587 free_aggr_kprobe(ap);
588 return;
589 }
590
591 init_aggr_kprobe(ap, p);
592 optimize_kprobe(ap);
593}
594
595#ifdef CONFIG_SYSCTL
596static void __kprobes optimize_all_kprobes(void)
597{
598 struct hlist_head *head;
599 struct hlist_node *node;
600 struct kprobe *p;
601 unsigned int i;
602
603 /* If optimization is already allowed, just return */
604 if (kprobes_allow_optimization)
605 return;
606
607 kprobes_allow_optimization = true;
608 mutex_lock(&text_mutex);
609 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
610 head = &kprobe_table[i];
611 hlist_for_each_entry_rcu(p, node, head, hlist)
612 if (!kprobe_disabled(p))
613 optimize_kprobe(p);
614 }
615 mutex_unlock(&text_mutex);
616 printk(KERN_INFO "Kprobes globally optimized\n");
617}
618
619static void __kprobes unoptimize_all_kprobes(void)
620{
621 struct hlist_head *head;
622 struct hlist_node *node;
623 struct kprobe *p;
624 unsigned int i;
625
626 /* If optimization is already prohibited, just return */
627 if (!kprobes_allow_optimization)
628 return;
629
630 kprobes_allow_optimization = false;
631 printk(KERN_INFO "Kprobes globally unoptimized\n");
632 get_online_cpus(); /* For avoiding text_mutex deadlock */
633 mutex_lock(&text_mutex);
634 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
635 head = &kprobe_table[i];
636 hlist_for_each_entry_rcu(p, node, head, hlist) {
637 if (!kprobe_disabled(p))
638 unoptimize_kprobe(p);
639 }
640 }
641
642 mutex_unlock(&text_mutex);
643 put_online_cpus();
644 /* Allow all currently running kprobes to complete */
645 synchronize_sched();
646}
647
648int sysctl_kprobes_optimization;
649int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
650 void __user *buffer, size_t *length,
651 loff_t *ppos)
652{
653 int ret;
654
655 mutex_lock(&kprobe_mutex);
656 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
657 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
658
659 if (sysctl_kprobes_optimization)
660 optimize_all_kprobes();
661 else
662 unoptimize_all_kprobes();
663 mutex_unlock(&kprobe_mutex);
664
665 return ret;
666}
667#endif /* CONFIG_SYSCTL */
668
669static void __kprobes __arm_kprobe(struct kprobe *p)
670{
671 struct kprobe *old_p;
672
673 /* Check collision with other optimized kprobes */
674 old_p = get_optimized_kprobe((unsigned long)p->addr);
675 if (unlikely(old_p))
676 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
677
678 arch_arm_kprobe(p);
679 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
680}
681
682static void __kprobes __disarm_kprobe(struct kprobe *p)
683{
684 struct kprobe *old_p;
685
686 unoptimize_kprobe(p); /* Try to unoptimize */
687 arch_disarm_kprobe(p);
688
689 /* If another kprobe was blocked, optimize it. */
690 old_p = get_optimized_kprobe((unsigned long)p->addr);
691 if (unlikely(old_p))
692 optimize_kprobe(old_p);
693}
694
695#else /* !CONFIG_OPTPROBES */
696
697#define optimize_kprobe(p) do {} while (0)
698#define unoptimize_kprobe(p) do {} while (0)
699#define kill_optimized_kprobe(p) do {} while (0)
700#define prepare_optimized_kprobe(p) do {} while (0)
701#define try_to_optimize_kprobe(p) do {} while (0)
702#define __arm_kprobe(p) arch_arm_kprobe(p)
703#define __disarm_kprobe(p) arch_disarm_kprobe(p)
704
705static __kprobes void free_aggr_kprobe(struct kprobe *p)
706{
707 kfree(p);
708}
709
710static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
711{
712 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
713}
714#endif /* CONFIG_OPTPROBES */
715
290/* Arm a kprobe with text_mutex */ 716/* Arm a kprobe with text_mutex */
291static void __kprobes arm_kprobe(struct kprobe *kp) 717static void __kprobes arm_kprobe(struct kprobe *kp)
292{ 718{
719 /*
720 * Here, since __arm_kprobe() doesn't use stop_machine(),
721 * this doesn't cause deadlock on text_mutex. So, we don't
722 * need get_online_cpus().
723 */
293 mutex_lock(&text_mutex); 724 mutex_lock(&text_mutex);
294 arch_arm_kprobe(kp); 725 __arm_kprobe(kp);
295 mutex_unlock(&text_mutex); 726 mutex_unlock(&text_mutex);
296} 727}
297 728
298/* Disarm a kprobe with text_mutex */ 729/* Disarm a kprobe with text_mutex */
299static void __kprobes disarm_kprobe(struct kprobe *kp) 730static void __kprobes disarm_kprobe(struct kprobe *kp)
300{ 731{
732 get_online_cpus(); /* For avoiding text_mutex deadlock */
301 mutex_lock(&text_mutex); 733 mutex_lock(&text_mutex);
302 arch_disarm_kprobe(kp); 734 __disarm_kprobe(kp);
303 mutex_unlock(&text_mutex); 735 mutex_unlock(&text_mutex);
736 put_online_cpus();
304} 737}
305 738
306/* 739/*
@@ -369,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
369void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 802void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
370{ 803{
371 struct kprobe *kp; 804 struct kprobe *kp;
372 if (p->pre_handler != aggr_pre_handler) { 805 if (!kprobe_aggrprobe(p)) {
373 p->nmissed++; 806 p->nmissed++;
374 } else { 807 } else {
375 list_for_each_entry_rcu(kp, &p->list, list) 808 list_for_each_entry_rcu(kp, &p->list, list)
@@ -493,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
493} 926}
494 927
495/* 928/*
496 * Keep all fields in the kprobe consistent
497 */
498static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
499{
500 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
501 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
502}
503
504/*
505* Add the new probe to ap->list. Fail if this is the 929* Add the new probe to ap->list. Fail if this is the
506* second jprobe at the address - two jprobes can't coexist 930* second jprobe at the address - two jprobes can't coexist
507*/ 931*/
508static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 932static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
509{ 933{
510 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 934 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
935
936 if (p->break_handler || p->post_handler)
937 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
938
511 if (p->break_handler) { 939 if (p->break_handler) {
512 if (ap->break_handler) 940 if (ap->break_handler)
513 return -EEXIST; 941 return -EEXIST;
@@ -522,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
522 ap->flags &= ~KPROBE_FLAG_DISABLED; 950 ap->flags &= ~KPROBE_FLAG_DISABLED;
523 if (!kprobes_all_disarmed) 951 if (!kprobes_all_disarmed)
524 /* Arm the breakpoint again. */ 952 /* Arm the breakpoint again. */
525 arm_kprobe(ap); 953 __arm_kprobe(ap);
526 } 954 }
527 return 0; 955 return 0;
528} 956}
@@ -531,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
531 * Fill in the required fields of the "manager kprobe". Replace the 959 * Fill in the required fields of the "manager kprobe". Replace the
532 * earlier kprobe in the hlist with the manager kprobe 960 * earlier kprobe in the hlist with the manager kprobe
533 */ 961 */
534static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 962static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
535{ 963{
964 /* Copy p's insn slot to ap */
536 copy_kprobe(p, ap); 965 copy_kprobe(p, ap);
537 flush_insn_slot(ap); 966 flush_insn_slot(ap);
538 ap->addr = p->addr; 967 ap->addr = p->addr;
539 ap->flags = p->flags; 968 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
540 ap->pre_handler = aggr_pre_handler; 969 ap->pre_handler = aggr_pre_handler;
541 ap->fault_handler = aggr_fault_handler; 970 ap->fault_handler = aggr_fault_handler;
542 /* We don't care the kprobe which has gone. */ 971 /* We don't care the kprobe which has gone. */
@@ -546,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
546 ap->break_handler = aggr_break_handler; 975 ap->break_handler = aggr_break_handler;
547 976
548 INIT_LIST_HEAD(&ap->list); 977 INIT_LIST_HEAD(&ap->list);
549 list_add_rcu(&p->list, &ap->list); 978 INIT_HLIST_NODE(&ap->hlist);
550 979
980 list_add_rcu(&p->list, &ap->list);
551 hlist_replace_rcu(&p->hlist, &ap->hlist); 981 hlist_replace_rcu(&p->hlist, &ap->hlist);
552} 982}
553 983
@@ -561,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
561 int ret = 0; 991 int ret = 0;
562 struct kprobe *ap = old_p; 992 struct kprobe *ap = old_p;
563 993
564 if (old_p->pre_handler != aggr_pre_handler) { 994 if (!kprobe_aggrprobe(old_p)) {
565 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 995 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
566 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 996 ap = alloc_aggr_kprobe(old_p);
567 if (!ap) 997 if (!ap)
568 return -ENOMEM; 998 return -ENOMEM;
569 add_aggr_kprobe(ap, old_p); 999 init_aggr_kprobe(ap, old_p);
570 } 1000 }
571 1001
572 if (kprobe_gone(ap)) { 1002 if (kprobe_gone(ap)) {
@@ -585,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
585 */ 1015 */
586 return ret; 1016 return ret;
587 1017
1018 /* Prepare optimized instructions if possible. */
1019 prepare_optimized_kprobe(ap);
1020
588 /* 1021 /*
589 * Clear gone flag to prevent allocating new slot again, and 1022 * Clear gone flag to prevent allocating new slot again, and
590 * set disabled flag because it is not armed yet. 1023 * set disabled flag because it is not armed yet.
@@ -593,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
593 | KPROBE_FLAG_DISABLED; 1026 | KPROBE_FLAG_DISABLED;
594 } 1027 }
595 1028
1029 /* Copy ap's insn slot to p */
596 copy_kprobe(ap, p); 1030 copy_kprobe(ap, p);
597 return add_new_kprobe(ap, p); 1031 return add_new_kprobe(ap, p);
598} 1032}
@@ -743,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p)
743 p->nmissed = 0; 1177 p->nmissed = 0;
744 INIT_LIST_HEAD(&p->list); 1178 INIT_LIST_HEAD(&p->list);
745 mutex_lock(&kprobe_mutex); 1179 mutex_lock(&kprobe_mutex);
1180
1181 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1182 mutex_lock(&text_mutex);
1183
746 old_p = get_kprobe(p->addr); 1184 old_p = get_kprobe(p->addr);
747 if (old_p) { 1185 if (old_p) {
1186 /* Since this may unoptimize old_p, locking text_mutex. */
748 ret = register_aggr_kprobe(old_p, p); 1187 ret = register_aggr_kprobe(old_p, p);
749 goto out; 1188 goto out;
750 } 1189 }
751 1190
752 mutex_lock(&text_mutex);
753 ret = arch_prepare_kprobe(p); 1191 ret = arch_prepare_kprobe(p);
754 if (ret) 1192 if (ret)
755 goto out_unlock_text; 1193 goto out;
756 1194
757 INIT_HLIST_NODE(&p->hlist); 1195 INIT_HLIST_NODE(&p->hlist);
758 hlist_add_head_rcu(&p->hlist, 1196 hlist_add_head_rcu(&p->hlist,
759 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1197 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
760 1198
761 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1199 if (!kprobes_all_disarmed && !kprobe_disabled(p))
762 arch_arm_kprobe(p); 1200 __arm_kprobe(p);
1201
1202 /* Try to optimize kprobe */
1203 try_to_optimize_kprobe(p);
763 1204
764out_unlock_text:
765 mutex_unlock(&text_mutex);
766out: 1205out:
1206 mutex_unlock(&text_mutex);
1207 put_online_cpus();
767 mutex_unlock(&kprobe_mutex); 1208 mutex_unlock(&kprobe_mutex);
768 1209
769 if (probed_mod) 1210 if (probed_mod)
@@ -785,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
785 return -EINVAL; 1226 return -EINVAL;
786 1227
787 if (old_p == p || 1228 if (old_p == p ||
788 (old_p->pre_handler == aggr_pre_handler && 1229 (kprobe_aggrprobe(old_p) &&
789 list_is_singular(&old_p->list))) { 1230 list_is_singular(&old_p->list))) {
790 /* 1231 /*
791 * Only probe on the hash list. Disarm only if kprobes are 1232 * Only probe on the hash list. Disarm only if kprobes are
@@ -793,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
793 * already have been removed. We save on flushing icache. 1234 * already have been removed. We save on flushing icache.
794 */ 1235 */
795 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1236 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
796 disarm_kprobe(p); 1237 disarm_kprobe(old_p);
797 hlist_del_rcu(&old_p->hlist); 1238 hlist_del_rcu(&old_p->hlist);
798 } else { 1239 } else {
799 if (p->break_handler && !kprobe_gone(p)) 1240 if (p->break_handler && !kprobe_gone(p))
@@ -809,8 +1250,13 @@ noclean:
809 list_del_rcu(&p->list); 1250 list_del_rcu(&p->list);
810 if (!kprobe_disabled(old_p)) { 1251 if (!kprobe_disabled(old_p)) {
811 try_to_disable_aggr_kprobe(old_p); 1252 try_to_disable_aggr_kprobe(old_p);
812 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1253 if (!kprobes_all_disarmed) {
813 disarm_kprobe(old_p); 1254 if (kprobe_disabled(old_p))
1255 disarm_kprobe(old_p);
1256 else
1257 /* Try to optimize this probe again */
1258 optimize_kprobe(old_p);
1259 }
814 } 1260 }
815 } 1261 }
816 return 0; 1262 return 0;
@@ -827,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
827 old_p = list_entry(p->list.next, struct kprobe, list); 1273 old_p = list_entry(p->list.next, struct kprobe, list);
828 list_del(&p->list); 1274 list_del(&p->list);
829 arch_remove_kprobe(old_p); 1275 arch_remove_kprobe(old_p);
830 kfree(old_p); 1276 free_aggr_kprobe(old_p);
831 } 1277 }
832} 1278}
833 1279
@@ -1123,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1123 struct kprobe *kp; 1569 struct kprobe *kp;
1124 1570
1125 p->flags |= KPROBE_FLAG_GONE; 1571 p->flags |= KPROBE_FLAG_GONE;
1126 if (p->pre_handler == aggr_pre_handler) { 1572 if (kprobe_aggrprobe(p)) {
1127 /* 1573 /*
1128 * If this is an aggr_kprobe, we have to list all the 1574 * If this is an aggr_kprobe, we have to list all the
1129 * chained probes and mark them GONE. 1575 * chained probes and mark them GONE.
@@ -1132,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1132 kp->flags |= KPROBE_FLAG_GONE; 1578 kp->flags |= KPROBE_FLAG_GONE;
1133 p->post_handler = NULL; 1579 p->post_handler = NULL;
1134 p->break_handler = NULL; 1580 p->break_handler = NULL;
1581 kill_optimized_kprobe(p);
1135 } 1582 }
1136 /* 1583 /*
1137 * Here, we can remove insn_slot safely, because no thread calls 1584 * Here, we can remove insn_slot safely, because no thread calls
@@ -1241,6 +1688,15 @@ static int __init init_kprobes(void)
1241 } 1688 }
1242 } 1689 }
1243 1690
1691#if defined(CONFIG_OPTPROBES)
1692#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1693 /* Init kprobe_optinsn_slots */
1694 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1695#endif
1696 /* By default, kprobes can be optimized */
1697 kprobes_allow_optimization = true;
1698#endif
1699
1244 /* By default, kprobes are armed */ 1700 /* By default, kprobes are armed */
1245 kprobes_all_disarmed = false; 1701 kprobes_all_disarmed = false;
1246 1702
@@ -1259,7 +1715,7 @@ static int __init init_kprobes(void)
1259 1715
1260#ifdef CONFIG_DEBUG_FS 1716#ifdef CONFIG_DEBUG_FS
1261static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1717static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1262 const char *sym, int offset,char *modname) 1718 const char *sym, int offset, char *modname, struct kprobe *pp)
1263{ 1719{
1264 char *kprobe_type; 1720 char *kprobe_type;
1265 1721
@@ -1269,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1269 kprobe_type = "j"; 1725 kprobe_type = "j";
1270 else 1726 else
1271 kprobe_type = "k"; 1727 kprobe_type = "k";
1728
1272 if (sym) 1729 if (sym)
1273 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1730 seq_printf(pi, "%p %s %s+0x%x %s ",
1274 p->addr, kprobe_type, sym, offset, 1731 p->addr, kprobe_type, sym, offset,
1275 (modname ? modname : " "), 1732 (modname ? modname : " "));
1276 (kprobe_gone(p) ? "[GONE]" : ""),
1277 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1278 "[DISABLED]" : ""));
1279 else 1733 else
1280 seq_printf(pi, "%p %s %p %s%s\n", 1734 seq_printf(pi, "%p %s %p ",
1281 p->addr, kprobe_type, p->addr, 1735 p->addr, kprobe_type, p->addr);
1282 (kprobe_gone(p) ? "[GONE]" : ""), 1736
1283 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1737 if (!pp)
1284 "[DISABLED]" : "")); 1738 pp = p;
1739 seq_printf(pi, "%s%s%s\n",
1740 (kprobe_gone(p) ? "[GONE]" : ""),
1741 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1742 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1285} 1743}
1286 1744
1287static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1745static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1317,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1317 hlist_for_each_entry_rcu(p, node, head, hlist) { 1775 hlist_for_each_entry_rcu(p, node, head, hlist) {
1318 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1776 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1319 &offset, &modname, namebuf); 1777 &offset, &modname, namebuf);
1320 if (p->pre_handler == aggr_pre_handler) { 1778 if (kprobe_aggrprobe(p)) {
1321 list_for_each_entry_rcu(kp, &p->list, list) 1779 list_for_each_entry_rcu(kp, &p->list, list)
1322 report_probe(pi, kp, sym, offset, modname); 1780 report_probe(pi, kp, sym, offset, modname, p);
1323 } else 1781 } else
1324 report_probe(pi, p, sym, offset, modname); 1782 report_probe(pi, p, sym, offset, modname, NULL);
1325 } 1783 }
1326 preempt_enable(); 1784 preempt_enable();
1327 return 0; 1785 return 0;
@@ -1399,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1399 goto out; 1857 goto out;
1400 } 1858 }
1401 1859
1402 if (!kprobes_all_disarmed && kprobe_disabled(p))
1403 arm_kprobe(p);
1404
1405 p->flags &= ~KPROBE_FLAG_DISABLED;
1406 if (p != kp) 1860 if (p != kp)
1407 kp->flags &= ~KPROBE_FLAG_DISABLED; 1861 kp->flags &= ~KPROBE_FLAG_DISABLED;
1862
1863 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1864 p->flags &= ~KPROBE_FLAG_DISABLED;
1865 arm_kprobe(p);
1866 }
1408out: 1867out:
1409 mutex_unlock(&kprobe_mutex); 1868 mutex_unlock(&kprobe_mutex);
1410 return ret; 1869 return ret;
@@ -1424,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void)
1424 if (!kprobes_all_disarmed) 1883 if (!kprobes_all_disarmed)
1425 goto already_enabled; 1884 goto already_enabled;
1426 1885
1886 /* Arming kprobes doesn't optimize kprobe itself */
1427 mutex_lock(&text_mutex); 1887 mutex_lock(&text_mutex);
1428 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1888 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1429 head = &kprobe_table[i]; 1889 head = &kprobe_table[i];
1430 hlist_for_each_entry_rcu(p, node, head, hlist) 1890 hlist_for_each_entry_rcu(p, node, head, hlist)
1431 if (!kprobe_disabled(p)) 1891 if (!kprobe_disabled(p))
1432 arch_arm_kprobe(p); 1892 __arm_kprobe(p);
1433 } 1893 }
1434 mutex_unlock(&text_mutex); 1894 mutex_unlock(&text_mutex);
1435 1895
@@ -1456,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void)
1456 1916
1457 kprobes_all_disarmed = true; 1917 kprobes_all_disarmed = true;
1458 printk(KERN_INFO "Kprobes globally disabled\n"); 1918 printk(KERN_INFO "Kprobes globally disabled\n");
1919
1920 /*
 1921 * Here we call get_online_cpus() to avoid a text_mutex deadlock,
1922 * because disarming may also unoptimize kprobes.
1923 */
1924 get_online_cpus();
1459 mutex_lock(&text_mutex); 1925 mutex_lock(&text_mutex);
1460 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1926 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1461 head = &kprobe_table[i]; 1927 head = &kprobe_table[i];
1462 hlist_for_each_entry_rcu(p, node, head, hlist) { 1928 hlist_for_each_entry_rcu(p, node, head, hlist) {
1463 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1929 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1464 arch_disarm_kprobe(p); 1930 __disarm_kprobe(p);
1465 } 1931 }
1466 } 1932 }
1467 1933
1468 mutex_unlock(&text_mutex); 1934 mutex_unlock(&text_mutex);
1935 put_online_cpus();
1469 mutex_unlock(&kprobe_mutex); 1936 mutex_unlock(&kprobe_mutex);
1470 /* Allow all currently running kprobes to complete */ 1937 /* Allow all currently running kprobes to complete */
1471 synchronize_sched(); 1938 synchronize_sched();
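One detail worth calling out from the insn-cache rework at the top of this file: slot_used[] became a C99 flexible array member, so a single kprobe_insn_page layout can serve caches with different slot counts (regular slots vs. the larger optinsn slots). The offsetof-based sizing idiom of KPROBE_INSN_PAGE_SIZE(), reduced to a standalone, compilable illustration:

#include <stdlib.h>
#include <stddef.h>
#include <string.h>

struct insn_page {
	int  nused;
	char slot_used[];			/* flexible array member */
};

#define INSN_PAGE_SIZE(slots) \
	(offsetof(struct insn_page, slot_used) + sizeof(char) * (slots))

int main(void)
{
	int slots = 64;				/* analogous to slots_per_page() */
	struct insn_page *p = malloc(INSN_PAGE_SIZE(slots));

	if (!p)
		return 1;
	p->nused = 0;
	memset(p->slot_used, 0, slots);		/* SLOT_CLEAN everywhere */
	free(p);
	return 0;
}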
diff --git a/kernel/padata.c b/kernel/padata.c
index 6f9bcb8313d6..93caf65ff57c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -642,6 +642,9 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
642 if (!pd) 642 if (!pd)
643 goto err_free_inst; 643 goto err_free_inst;
644 644
645 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
646 goto err_free_pd;
647
645 rcu_assign_pointer(pinst->pd, pd); 648 rcu_assign_pointer(pinst->pd, pd);
646 649
647 pinst->wq = wq; 650 pinst->wq = wq;
@@ -654,12 +657,14 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
654 pinst->cpu_notifier.priority = 0; 657 pinst->cpu_notifier.priority = 0;
655 err = register_hotcpu_notifier(&pinst->cpu_notifier); 658 err = register_hotcpu_notifier(&pinst->cpu_notifier);
656 if (err) 659 if (err)
657 goto err_free_pd; 660 goto err_free_cpumask;
658 661
659 mutex_init(&pinst->lock); 662 mutex_init(&pinst->lock);
660 663
661 return pinst; 664 return pinst;
662 665
666err_free_cpumask:
667 free_cpumask_var(pinst->cpumask);
663err_free_pd: 668err_free_pd:
664 padata_free_pd(pd); 669 padata_free_pd(pd);
665err_free_inst: 670err_free_inst:
@@ -685,6 +690,7 @@ void padata_free(struct padata_instance *pinst)
685 690
686 unregister_hotcpu_notifier(&pinst->cpu_notifier); 691 unregister_hotcpu_notifier(&pinst->cpu_notifier);
687 padata_free_pd(pinst->pd); 692 padata_free_pd(pinst->pd);
693 free_cpumask_var(pinst->cpumask);
688 kfree(pinst); 694 kfree(pinst);
689} 695}
690EXPORT_SYMBOL(padata_free); 696EXPORT_SYMBOL(padata_free);
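
The padata change is a straightforward allocate-then-unwind fix: the newly allocated per-instance cpumask gets its own error label, so a failure at any later step frees exactly what was taken before it, and padata_free() releases the mask on normal teardown. The same shape as a stand-alone sketch, with malloc()/free() standing in for alloc_cpumask_var()/free_cpumask_var():

    #include <stdlib.h>

    struct inst { void *pd; void *cpumask; };

    static struct inst *inst_alloc(void)
    {
        struct inst *pinst = calloc(1, sizeof(*pinst));

        if (!pinst)
            return NULL;
        pinst->pd = malloc(64);
        if (!pinst->pd)
            goto err_free_inst;
        pinst->cpumask = malloc(64);
        if (!pinst->cpumask)
            goto err_free_pd;
        return pinst;

    err_free_pd:
        free(pinst->pd);
    err_free_inst:
        free(pinst);
        return NULL;
    }

    int main(void)
    {
        struct inst *i = inst_alloc();

        if (i) {
            free(i->cpumask);
            free(i->pd);
            free(i);
        }
        return 0;
    }
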
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 33e7a38b6eb9..0ef19c614f6d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,7 @@
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h> 51#include <linux/slow-work.h>
52#include <linux/perf_event.h> 52#include <linux/perf_event.h>
53#include <linux/kprobes.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/processor.h> 56#include <asm/processor.h>
@@ -1450,6 +1451,17 @@ static struct ctl_table debug_table[] = {
1450 .proc_handler = proc_dointvec 1451 .proc_handler = proc_dointvec
1451 }, 1452 },
1452#endif 1453#endif
1454#if defined(CONFIG_OPTPROBES)
1455 {
1456 .procname = "kprobes-optimization",
1457 .data = &sysctl_kprobes_optimization,
1458 .maxlen = sizeof(int),
1459 .mode = 0644,
1460 .proc_handler = proc_kprobes_optimization_handler,
1461 .extra1 = &zero,
1462 .extra2 = &one,
1463 },
1464#endif
1453 { } 1465 { }
1454}; 1466};
1455 1467
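
The new ctl_table entry exposes sysctl_kprobes_optimization as /proc/sys/debug/kprobes-optimization, with extra1/extra2 restricting writes to the range [0, 1] before proc_kprobes_optimization_handler applies them. A tiny user-space stand-in for that accept-or-reject contract (illustration only, not the kernel handler):

    #include <stdio.h>

    static int sysctl_kprobes_optimization = 1;

    /* Accept a new value only inside [extra1, extra2] = [0, 1]. */
    static int set_optimization(int val)
    {
        if (val < 0 || val > 1)
            return -1; /* the kernel returns -EINVAL here */
        sysctl_kprobes_optimization = val;
        return 0;
    }

    int main(void)
    {
        printf("%d\n", set_optimization(0)); /* 0: accepted */
        printf("%d\n", set_optimization(2)); /* -1: rejected */
        printf("optimization=%d\n", sysctl_kprobes_optimization);
        return 0;
    }
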
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 2de34075f6a4..34202b1be0bb 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -41,7 +41,8 @@ OPTIONS
41 41
42-d:: 42-d::
43--del=:: 43--del=::
 44 Delete a probe event. 44 Delete probe events. This accepts glob wildcards ('*', '?') and character
 45 classes (e.g. [a-z], [!A-Z]).
45 46
46-l:: 47-l::
47--list:: 48--list::
@@ -50,17 +51,29 @@ OPTIONS
50-L:: 51-L::
51--line=:: 52--line=::
52 Show source code lines which can be probed. This needs an argument 53 Show source code lines which can be probed. This needs an argument
 53 which specifies a range of the source code. 54 which specifies a range of the source code. (see LINE SYNTAX for details)
55
56-f::
57--force::
58 Forcibly add events with existing name.
54 59
55PROBE SYNTAX 60PROBE SYNTAX
56------------ 61------------
 57Probe points are defined by the following syntax. 62Probe points are defined by the following syntax.
58 63
59 "[EVENT=]FUNC[+OFFS|:RLN|%return][@SRC]|SRC:ALN [ARG ...]" 64 1) Define event based on function name
65 [EVENT=]FUNC[@SRC][:RLN|+OFFS|%return|;PTN] [ARG ...]
66
67 2) Define event based on source file with line number
68 [EVENT=]SRC:ALN [ARG ...]
69
70 3) Define event based on source file with lazy pattern
71 [EVENT=]SRC;PTN [ARG ...]
72
60 73
 61'EVENT' specifies the name of new event, if omitted, it will be set to the name of the probed function. Currently, event group name is set as 'probe'. 74'EVENT' specifies the name of new event, if omitted, it will be set to the name of the probed function. Currently, event group name is set as 'probe'.
62'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, 'RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. In addition, 'SRC' specifies a source file which has that function. 75'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, ':RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. And ';PTN' means lazy matching pattern (see LAZY MATCHING). Note that ';PTN' must be the end of the probe point definition. In addition, '@SRC' specifies a source file which has that function.
63It is also possible to specify a probe point by the source line number by using 'SRC:ALN' syntax, where 'SRC' is the source file path and 'ALN' is the line number. 76It is also possible to specify a probe point by the source line number or lazy matching by using 'SRC:ALN' or 'SRC;PTN' syntax, where 'SRC' is the source file path, ':ALN' is the line number and ';PTN' is the lazy matching pattern.
64'ARG' specifies the arguments of this probe point. You can use the name of local variable, or kprobe-tracer argument format (e.g. $retval, %ax, etc). 77'ARG' specifies the arguments of this probe point. You can use the name of local variable, or kprobe-tracer argument format (e.g. $retval, %ax, etc).
65 78
66LINE SYNTAX 79LINE SYNTAX
@@ -76,6 +89,41 @@ and 'ALN2' is end line number in the file. It is also possible to specify how
76many lines to show by using 'NUM'. 89many lines to show by using 'NUM'.
 77So, "source.c:100-120" shows lines between 100th and 120th in source.c file. And "func:10+20" shows 20 lines from 10th line of func function. 90So, "source.c:100-120" shows lines between 100th and 120th in source.c file. And "func:10+20" shows 20 lines from 10th line of func function.
78 91
92LAZY MATCHING
93-------------
 94 The lazy line matching is similar to glob matching, but it ignores spaces in both the pattern and the target. So this accepts wildcards ('*', '?') and character classes (e.g. [a-z], [!A-Z]).
95
96e.g.
 97 'a=*' can match 'a=b', 'a = b', 'a == b' and so on.
98
 99This gives probe point definitions some flexibility and robustness against minor code changes. For example, the actual 10th line of schedule() can easily move when schedule() is modified, but a line matching 'rq=cpu_rq*' will likely still exist in the function.
100
101
102EXAMPLES
103--------
104Display which lines in schedule() can be probed:
105
106 ./perf probe --line schedule
107
 108Add a probe on the 12th line of schedule(), recording the 'cpu' local variable:
109
110 ./perf probe schedule:12 cpu
111 or
112 ./perf probe --add='schedule:12 cpu'
113
 114 this will add one or more probes whose names start with "schedule".
115
 116 Add probes on lines in the schedule() function which call update_rq_clock().
117
118 ./perf probe 'schedule;update_rq_clock*'
119 or
120 ./perf probe --add='schedule;update_rq_clock*'
121
 124Delete all probes on schedule():
123
124 ./perf probe --del='schedule*'
125
126
79SEE ALSO 127SEE ALSO
80-------- 128--------
81linkperf:perf-trace[1], linkperf:perf-record[1] 129linkperf:perf-trace[1], linkperf:perf-record[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 54a5b50ff312..2d537382c686 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -500,12 +500,12 @@ else
500 msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]); 500 msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]);
501endif 501endif
502 502
503ifneq ($(shell sh -c "(echo '\#ifndef _MIPS_SZLONG'; echo '\#define _MIPS_SZLONG 0'; echo '\#endif'; echo '\#include <dwarf.h>'; echo '\#include <libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -I/usr/include/libdwarf -ldwarf -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y) 503ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
504 msg := $(warning No libdwarf.h found or old libdwarf.h found, disables dwarf support. Please install libdwarf-dev/libdwarf-devel >= 20081231); 504 msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev);
505 BASIC_CFLAGS += -DNO_LIBDWARF 505 BASIC_CFLAGS += -DNO_DWARF_SUPPORT
506else 506else
507 BASIC_CFLAGS += -I/usr/include/libdwarf 507 BASIC_CFLAGS += -I/usr/include/elfutils
508 EXTLIBS += -lelf -ldwarf 508 EXTLIBS += -lelf -ldw
509 LIB_OBJS += util/probe-finder.o 509 LIB_OBJS += util/probe-finder.o
510endif 510endif
511 511
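
The replaced feature test pipes a one-line C program into $(CC) and enables DWARF support only if it compiles and links against elfutils. Expanded for readability, the probe program embedded in that shell fragment is just:

    #include <dwarf.h>
    #include <libdw.h>

    int main(void)
    {
        Dwarf *dbg;

        dbg = dwarf_begin(0, DWARF_C_READ);
        return (long)dbg;
    }

It is built with -I/usr/include/elfutils and linked with -ldw -lelf; only the compile/link result matters, and the program is never actually run.
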
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index ad47bd4c50ef..c30a33592340 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -128,7 +128,7 @@ static void evaluate_probe_point(struct probe_point *pp)
128 pp->function); 128 pp->function);
129} 129}
130 130
131#ifndef NO_LIBDWARF 131#ifndef NO_DWARF_SUPPORT
132static int open_vmlinux(void) 132static int open_vmlinux(void)
133{ 133{
134 if (map__load(session.kmaps[MAP__FUNCTION], NULL) < 0) { 134 if (map__load(session.kmaps[MAP__FUNCTION], NULL) < 0) {
@@ -156,14 +156,16 @@ static const char * const probe_usage[] = {
156 "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]", 156 "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
157 "perf probe [<options>] --del '[GROUP:]EVENT' ...", 157 "perf probe [<options>] --del '[GROUP:]EVENT' ...",
158 "perf probe --list", 158 "perf probe --list",
159#ifndef NO_DWARF_SUPPORT
159 "perf probe --line 'LINEDESC'", 160 "perf probe --line 'LINEDESC'",
161#endif
160 NULL 162 NULL
161}; 163};
162 164
163static const struct option options[] = { 165static const struct option options[] = {
164 OPT_BOOLEAN('v', "verbose", &verbose, 166 OPT_BOOLEAN('v', "verbose", &verbose,
165 "be more verbose (show parsed arguments, etc)"), 167 "be more verbose (show parsed arguments, etc)"),
166#ifndef NO_LIBDWARF 168#ifndef NO_DWARF_SUPPORT
167 OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, 169 OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
168 "file", "vmlinux pathname"), 170 "file", "vmlinux pathname"),
169#endif 171#endif
@@ -172,30 +174,32 @@ static const struct option options[] = {
172 OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.", 174 OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
173 opt_del_probe_event), 175 opt_del_probe_event),
174 OPT_CALLBACK('a', "add", NULL, 176 OPT_CALLBACK('a', "add", NULL,
175#ifdef NO_LIBDWARF 177#ifdef NO_DWARF_SUPPORT
176 "[EVENT=]FUNC[+OFFS|%return] [ARG ...]", 178 "[EVENT=]FUNC[+OFF|%return] [ARG ...]",
177#else 179#else
178 "[EVENT=]FUNC[+OFFS|%return|:RLN][@SRC]|SRC:ALN [ARG ...]", 180 "[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
181 " [ARG ...]",
179#endif 182#endif
180 "probe point definition, where\n" 183 "probe point definition, where\n"
181 "\t\tGROUP:\tGroup name (optional)\n" 184 "\t\tGROUP:\tGroup name (optional)\n"
182 "\t\tEVENT:\tEvent name\n" 185 "\t\tEVENT:\tEvent name\n"
183 "\t\tFUNC:\tFunction name\n" 186 "\t\tFUNC:\tFunction name\n"
184 "\t\tOFFS:\tOffset from function entry (in byte)\n" 187 "\t\tOFF:\tOffset from function entry (in byte)\n"
185 "\t\t%return:\tPut the probe at function return\n" 188 "\t\t%return:\tPut the probe at function return\n"
186#ifdef NO_LIBDWARF 189#ifdef NO_DWARF_SUPPORT
187 "\t\tARG:\tProbe argument (only \n" 190 "\t\tARG:\tProbe argument (only \n"
188#else 191#else
189 "\t\tSRC:\tSource code path\n" 192 "\t\tSRC:\tSource code path\n"
190 "\t\tRLN:\tRelative line number from function entry.\n" 193 "\t\tRL:\tRelative line number from function entry.\n"
191 "\t\tALN:\tAbsolute line number in file.\n" 194 "\t\tAL:\tAbsolute line number in file.\n"
195 "\t\tPT:\tLazy expression of line code.\n"
192 "\t\tARG:\tProbe argument (local variable name or\n" 196 "\t\tARG:\tProbe argument (local variable name or\n"
193#endif 197#endif
194 "\t\t\tkprobe-tracer argument format.)\n", 198 "\t\t\tkprobe-tracer argument format.)\n",
195 opt_add_probe_event), 199 opt_add_probe_event),
196 OPT_BOOLEAN('f', "force", &session.force_add, "forcibly add events" 200 OPT_BOOLEAN('f', "force", &session.force_add, "forcibly add events"
197 " with existing name"), 201 " with existing name"),
198#ifndef NO_LIBDWARF 202#ifndef NO_DWARF_SUPPORT
199 OPT_CALLBACK('L', "line", NULL, 203 OPT_CALLBACK('L', "line", NULL,
200 "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]", 204 "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]",
201 "Show source code lines.", opt_show_lines), 205 "Show source code lines.", opt_show_lines),
@@ -223,7 +227,7 @@ static void init_vmlinux(void)
223int cmd_probe(int argc, const char **argv, const char *prefix __used) 227int cmd_probe(int argc, const char **argv, const char *prefix __used)
224{ 228{
225 int i, ret; 229 int i, ret;
226#ifndef NO_LIBDWARF 230#ifndef NO_DWARF_SUPPORT
227 int fd; 231 int fd;
228#endif 232#endif
229 struct probe_point *pp; 233 struct probe_point *pp;
@@ -259,7 +263,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
259 return 0; 263 return 0;
260 } 264 }
261 265
262#ifndef NO_LIBDWARF 266#ifndef NO_DWARF_SUPPORT
263 if (session.show_lines) { 267 if (session.show_lines) {
264 if (session.nr_probe != 0 || session.dellist) { 268 if (session.nr_probe != 0 || session.dellist) {
265 pr_warning(" Error: Don't use --line with" 269 pr_warning(" Error: Don't use --line with"
@@ -290,9 +294,9 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
290 init_vmlinux(); 294 init_vmlinux();
291 295
292 if (session.need_dwarf) 296 if (session.need_dwarf)
293#ifdef NO_LIBDWARF 297#ifdef NO_DWARF_SUPPORT
294 die("Debuginfo-analysis is not supported"); 298 die("Debuginfo-analysis is not supported");
295#else /* !NO_LIBDWARF */ 299#else /* !NO_DWARF_SUPPORT */
296 pr_debug("Some probes require debuginfo.\n"); 300 pr_debug("Some probes require debuginfo.\n");
297 301
298 fd = open_vmlinux(); 302 fd = open_vmlinux();
@@ -312,7 +316,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
312 continue; 316 continue;
313 317
314 lseek(fd, SEEK_SET, 0); 318 lseek(fd, SEEK_SET, 0);
315 ret = find_probepoint(fd, pp); 319 ret = find_probe_point(fd, pp);
316 if (ret > 0) 320 if (ret > 0)
317 continue; 321 continue;
318 if (ret == 0) { /* No error but failed to find probe point. */ 322 if (ret == 0) { /* No error but failed to find probe point. */
@@ -333,7 +337,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
333 close(fd); 337 close(fd);
334 338
335end_dwarf: 339end_dwarf:
336#endif /* !NO_LIBDWARF */ 340#endif /* !NO_DWARF_SUPPORT */
337 341
338 /* Synthesize probes without dwarf */ 342 /* Synthesize probes without dwarf */
339 for (i = 0; i < session.nr_probe; i++) { 343 for (i = 0; i < session.nr_probe; i++) {
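
Everything in this file is one mechanical rename: the NO_LIBDWARF gate becomes NO_DWARF_SUPPORT, so a single macro set by the Makefile probe consistently switches the usage strings, option table, and code paths. Reduced to a sketch (open_vmlinux() here is a trivial stand-in for the real function of the same name):

    #include <stdio.h>

    #ifdef NO_DWARF_SUPPORT
    static int open_vmlinux(void) { return -1; } /* no debuginfo path */
    #else
    static int open_vmlinux(void) { return 0; }  /* would open vmlinux */
    #endif

    int main(void)
    {
        if (open_vmlinux() < 0)
            puts("Debuginfo-analysis is not supported");
        else
            puts("dwarf support available");
        return 0;
    }
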
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 8f0568849691..c971e81e9cbf 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -119,14 +119,14 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
119 char c, nc = 0; 119 char c, nc = 0;
120 /* 120 /*
121 * <Syntax> 121 * <Syntax>
122 * perf probe [EVENT=]SRC:LN 122 * perf probe [EVENT=]SRC[:LN|;PTN]
123 * perf probe [EVENT=]FUNC[+OFFS|%return][@SRC] 123 * perf probe [EVENT=]FUNC[@SRC][+OFFS|%return|:LN|;PAT]
124 * 124 *
125 * TODO:Group name support 125 * TODO:Group name support
126 */ 126 */
127 127
128 ptr = strchr(arg, '='); 128 ptr = strpbrk(arg, ";=@+%");
129 if (ptr) { /* Event name */ 129 if (ptr && *ptr == '=') { /* Event name */
130 *ptr = '\0'; 130 *ptr = '\0';
131 tmp = ptr + 1; 131 tmp = ptr + 1;
132 ptr = strchr(arg, ':'); 132 ptr = strchr(arg, ':');
@@ -139,7 +139,7 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
139 arg = tmp; 139 arg = tmp;
140 } 140 }
141 141
142 ptr = strpbrk(arg, ":+@%"); 142 ptr = strpbrk(arg, ";:+@%");
143 if (ptr) { 143 if (ptr) {
144 nc = *ptr; 144 nc = *ptr;
145 *ptr++ = '\0'; 145 *ptr++ = '\0';
@@ -156,7 +156,11 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
156 while (ptr) { 156 while (ptr) {
157 arg = ptr; 157 arg = ptr;
158 c = nc; 158 c = nc;
159 ptr = strpbrk(arg, ":+@%"); 159 if (c == ';') { /* Lazy pattern must be the last part */
160 pp->lazy_line = strdup(arg);
161 break;
162 }
163 ptr = strpbrk(arg, ";:+@%");
160 if (ptr) { 164 if (ptr) {
161 nc = *ptr; 165 nc = *ptr;
162 *ptr++ = '\0'; 166 *ptr++ = '\0';
@@ -165,13 +169,13 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
165 case ':': /* Line number */ 169 case ':': /* Line number */
166 pp->line = strtoul(arg, &tmp, 0); 170 pp->line = strtoul(arg, &tmp, 0);
167 if (*tmp != '\0') 171 if (*tmp != '\0')
168 semantic_error("There is non-digit charactor" 172 semantic_error("There is non-digit char"
169 " in line number."); 173 " in line number.");
170 break; 174 break;
171 case '+': /* Byte offset from a symbol */ 175 case '+': /* Byte offset from a symbol */
172 pp->offset = strtoul(arg, &tmp, 0); 176 pp->offset = strtoul(arg, &tmp, 0);
173 if (*tmp != '\0') 177 if (*tmp != '\0')
174 semantic_error("There is non-digit charactor" 178 semantic_error("There is non-digit character"
175 " in offset."); 179 " in offset.");
176 break; 180 break;
177 case '@': /* File name */ 181 case '@': /* File name */
@@ -179,9 +183,6 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
179 semantic_error("SRC@SRC is not allowed."); 183 semantic_error("SRC@SRC is not allowed.");
180 pp->file = strdup(arg); 184 pp->file = strdup(arg);
181 DIE_IF(pp->file == NULL); 185 DIE_IF(pp->file == NULL);
182 if (ptr)
183 semantic_error("@SRC must be the last "
184 "option.");
185 break; 186 break;
186 case '%': /* Probe places */ 187 case '%': /* Probe places */
187 if (strcmp(arg, "return") == 0) { 188 if (strcmp(arg, "return") == 0) {
@@ -196,11 +197,18 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
196 } 197 }
197 198
198 /* Exclusion check */ 199 /* Exclusion check */
200 if (pp->lazy_line && pp->line)
201 semantic_error("Lazy pattern can't be used with line number.");
202
203 if (pp->lazy_line && pp->offset)
204 semantic_error("Lazy pattern can't be used with offset.");
205
199 if (pp->line && pp->offset) 206 if (pp->line && pp->offset)
200 semantic_error("Offset can't be used with line number."); 207 semantic_error("Offset can't be used with line number.");
201 208
202 if (!pp->line && pp->file && !pp->function) 209 if (!pp->line && !pp->lazy_line && pp->file && !pp->function)
203 semantic_error("File always requires line number."); 210 semantic_error("File always requires line number or "
211 "lazy pattern.");
204 212
205 if (pp->offset && !pp->function) 213 if (pp->offset && !pp->function)
206 semantic_error("Offset requires an entry function."); 214 semantic_error("Offset requires an entry function.");
@@ -208,11 +216,13 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
208 if (pp->retprobe && !pp->function) 216 if (pp->retprobe && !pp->function)
209 semantic_error("Return probe requires an entry function."); 217 semantic_error("Return probe requires an entry function.");
210 218
211 if ((pp->offset || pp->line) && pp->retprobe) 219 if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe)
212 semantic_error("Offset/Line can't be used with return probe."); 220 semantic_error("Offset/Line/Lazy pattern can't be used with "
221 "return probe.");
213 222
214 pr_debug("symbol:%s file:%s line:%d offset:%d, return:%d\n", 223 pr_debug("symbol:%s file:%s line:%d offset:%d return:%d lazy:%s\n",
215 pp->function, pp->file, pp->line, pp->offset, pp->retprobe); 224 pp->function, pp->file, pp->line, pp->offset, pp->retprobe,
225 pp->lazy_line);
216} 226}
217 227
218/* Parse perf-probe event definition */ 228/* Parse perf-probe event definition */
@@ -458,6 +468,8 @@ static void clear_probe_point(struct probe_point *pp)
458 free(pp->function); 468 free(pp->function);
459 if (pp->file) 469 if (pp->file)
460 free(pp->file); 470 free(pp->file);
471 if (pp->lazy_line)
472 free(pp->lazy_line);
461 for (i = 0; i < pp->nr_args; i++) 473 for (i = 0; i < pp->nr_args; i++)
462 free(pp->args[i]); 474 free(pp->args[i]);
463 if (pp->args) 475 if (pp->args)
@@ -719,6 +731,7 @@ void del_trace_kprobe_events(struct strlist *dellist)
719} 731}
720 732
721#define LINEBUF_SIZE 256 733#define LINEBUF_SIZE 256
734#define NR_ADDITIONAL_LINES 2
722 735
723static void show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num) 736static void show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num)
724{ 737{
@@ -779,5 +792,11 @@ void show_line_range(struct line_range *lr)
779 show_one_line(fp, (l++) - lr->offset, false, false); 792 show_one_line(fp, (l++) - lr->offset, false, false);
780 show_one_line(fp, (l++) - lr->offset, false, true); 793 show_one_line(fp, (l++) - lr->offset, false, true);
781 } 794 }
795
796 if (lr->end == INT_MAX)
797 lr->end = l + NR_ADDITIONAL_LINES;
798 while (l < lr->end && !feof(fp))
799 show_one_line(fp, (l++) - lr->offset, false, false);
800
782 fclose(fp); 801 fclose(fp);
783} 802}
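
The parser change is small but defines the syntax: strpbrk() now also stops at ';', and once a ';' separator is seen the remainder of the string becomes the lazy pattern, which is why ';PTN' must be the last part of a probe definition. A compact user-space sketch of the same tokenizing loop, with printf() in place of the real field assignments:

    #include <stdio.h>
    #include <string.h>

    static void parse(char *arg)
    {
        char *ptr, c, nc = 0;

        ptr = strpbrk(arg, ";:+@%");
        if (ptr) {
            nc = *ptr;
            *ptr++ = '\0';
        }
        printf("function: %s\n", arg);

        while (ptr) {
            arg = ptr;
            c = nc;
            if (c == ';') { /* lazy pattern must be the last part */
                printf("lazy pattern: %s\n", arg);
                break;
            }
            ptr = strpbrk(arg, ";:+@%");
            if (ptr) {
                nc = *ptr;
                *ptr++ = '\0';
            }
            printf("option '%c': %s\n", c, arg);
        }
    }

    int main(void)
    {
        char buf[] = "schedule@kernel/sched.c;rq=cpu_rq*";

        parse(buf);
        return 0;
    }
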
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 1b2124d12f68..e77dc886760e 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -32,21 +32,13 @@
32#include <stdarg.h> 32#include <stdarg.h>
33#include <ctype.h> 33#include <ctype.h>
34 34
35#include "string.h"
35#include "event.h" 36#include "event.h"
36#include "debug.h" 37#include "debug.h"
37#include "util.h" 38#include "util.h"
38#include "probe-finder.h" 39#include "probe-finder.h"
39 40
40 41
41/* Dwarf_Die Linkage to parent Die */
42struct die_link {
43 struct die_link *parent; /* Parent die */
44 Dwarf_Die die; /* Current die */
45};
46
47static Dwarf_Debug __dw_debug;
48static Dwarf_Error __dw_error;
49
50/* 42/*
51 * Generic dwarf analysis helpers 43 * Generic dwarf analysis helpers
52 */ 44 */
@@ -113,281 +105,190 @@ static int strtailcmp(const char *s1, const char *s2)
113 return 0; 105 return 0;
114} 106}
115 107
116/* Find the fileno of the target file. */ 108/* Line number list operations */
117static Dwarf_Unsigned cu_find_fileno(Dwarf_Die cu_die, const char *fname)
118{
119 Dwarf_Signed cnt, i;
120 Dwarf_Unsigned found = 0;
121 char **srcs;
122 int ret;
123 109
124 if (!fname) 110/* Add a line to line number list */
125 return 0; 111static void line_list__add_line(struct list_head *head, unsigned int line)
112{
113 struct line_node *ln;
114 struct list_head *p;
126 115
127 ret = dwarf_srcfiles(cu_die, &srcs, &cnt, &__dw_error); 116 /* Reverse search, because new line will be the last one */
128 if (ret == DW_DLV_OK) { 117 list_for_each_entry_reverse(ln, head, list) {
129 for (i = 0; i < cnt && !found; i++) { 118 if (ln->line < line) {
130 if (strtailcmp(srcs[i], fname) == 0) 119 p = &ln->list;
131 found = i + 1; 120 goto found;
132 dwarf_dealloc(__dw_debug, srcs[i], DW_DLA_STRING); 121 } else if (ln->line == line) /* Already exist */
133 } 122 return ;
134 for (; i < cnt; i++)
135 dwarf_dealloc(__dw_debug, srcs[i], DW_DLA_STRING);
136 dwarf_dealloc(__dw_debug, srcs, DW_DLA_LIST);
137 } 123 }
138 if (found) 124 /* List is empty, or the smallest entry */
139 pr_debug("found fno: %d\n", (int)found); 125 p = head;
140 return found; 126found:
127 pr_debug("line list: add a line %u\n", line);
128 ln = zalloc(sizeof(struct line_node));
129 DIE_IF(ln == NULL);
130 ln->line = line;
131 INIT_LIST_HEAD(&ln->list);
132 list_add(&ln->list, p);
141} 133}
142 134
143static int cu_get_filename(Dwarf_Die cu_die, Dwarf_Unsigned fno, char **buf) 135/* Check if the line in line number list */
136static int line_list__has_line(struct list_head *head, unsigned int line)
144{ 137{
145 Dwarf_Signed cnt, i; 138 struct line_node *ln;
146 char **srcs; 139
147 int ret = 0; 140 /* Reverse search, because new line will be the last one */
148 141 list_for_each_entry(ln, head, list)
149 if (!buf || !fno) 142 if (ln->line == line)
150 return -EINVAL; 143 return 1;
151 144
152 ret = dwarf_srcfiles(cu_die, &srcs, &cnt, &__dw_error); 145 return 0;
153 if (ret == DW_DLV_OK) {
154 if ((Dwarf_Unsigned)cnt > fno - 1) {
155 *buf = strdup(srcs[fno - 1]);
156 ret = 0;
157 pr_debug("found filename: %s\n", *buf);
158 } else
159 ret = -ENOENT;
160 for (i = 0; i < cnt; i++)
161 dwarf_dealloc(__dw_debug, srcs[i], DW_DLA_STRING);
162 dwarf_dealloc(__dw_debug, srcs, DW_DLA_LIST);
163 } else
164 ret = -EINVAL;
165 return ret;
166} 146}
167 147
168/* Compare diename and tname */ 148/* Init line number list */
169static int die_compare_name(Dwarf_Die dw_die, const char *tname) 149static void line_list__init(struct list_head *head)
170{ 150{
171 char *name; 151 INIT_LIST_HEAD(head);
172 int ret;
173 ret = dwarf_diename(dw_die, &name, &__dw_error);
174 DIE_IF(ret == DW_DLV_ERROR);
175 if (ret == DW_DLV_OK) {
176 ret = strcmp(tname, name);
177 dwarf_dealloc(__dw_debug, name, DW_DLA_STRING);
178 } else
179 ret = -1;
180 return ret;
181} 152}
182 153
183/* Check the address is in the subprogram(function). */ 154/* Free line number list */
184static int die_within_subprogram(Dwarf_Die sp_die, Dwarf_Addr addr, 155static void line_list__free(struct list_head *head)
185 Dwarf_Signed *offs)
186{ 156{
187 Dwarf_Addr lopc, hipc; 157 struct line_node *ln;
188 int ret; 158 while (!list_empty(head)) {
189 159 ln = list_first_entry(head, struct line_node, list);
190 /* TODO: check ranges */ 160 list_del(&ln->list);
191 ret = dwarf_lowpc(sp_die, &lopc, &__dw_error); 161 free(ln);
192 DIE_IF(ret == DW_DLV_ERROR); 162 }
193 if (ret == DW_DLV_NO_ENTRY)
194 return 0;
195 ret = dwarf_highpc(sp_die, &hipc, &__dw_error);
196 DIE_IF(ret != DW_DLV_OK);
197 if (lopc <= addr && addr < hipc) {
198 *offs = addr - lopc;
199 return 1;
200 } else
201 return 0;
202} 163}
203 164
204/* Check the die is inlined function */ 165/* Dwarf wrappers */
205static Dwarf_Bool die_inlined_subprogram(Dwarf_Die dw_die) 166
167/* Find the realpath of the target file. */
168static const char *cu_find_realpath(Dwarf_Die *cu_die, const char *fname)
206{ 169{
207 /* TODO: check strictly */ 170 Dwarf_Files *files;
208 Dwarf_Bool inl; 171 size_t nfiles, i;
172 const char *src;
209 int ret; 173 int ret;
210 174
211 ret = dwarf_hasattr(dw_die, DW_AT_inline, &inl, &__dw_error); 175 if (!fname)
212 DIE_IF(ret == DW_DLV_ERROR); 176 return NULL;
213 return inl;
214}
215 177
216/* Get the offset of abstruct_origin */ 178 ret = dwarf_getsrcfiles(cu_die, &files, &nfiles);
217static Dwarf_Off die_get_abstract_origin(Dwarf_Die dw_die) 179 if (ret != 0)
218{ 180 return NULL;
219 Dwarf_Attribute attr;
220 Dwarf_Off cu_offs;
221 int ret;
222 181
223 ret = dwarf_attr(dw_die, DW_AT_abstract_origin, &attr, &__dw_error); 182 for (i = 0; i < nfiles; i++) {
224 DIE_IF(ret != DW_DLV_OK); 183 src = dwarf_filesrc(files, i, NULL, NULL);
225 ret = dwarf_formref(attr, &cu_offs, &__dw_error); 184 if (strtailcmp(src, fname) == 0)
226 DIE_IF(ret != DW_DLV_OK); 185 break;
227 dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR); 186 }
228 return cu_offs; 187 return src;
229} 188}
230 189
231/* Get entry pc(or low pc, 1st entry of ranges) of the die */ 190struct __addr_die_search_param {
232static Dwarf_Addr die_get_entrypc(Dwarf_Die dw_die) 191 Dwarf_Addr addr;
192 Dwarf_Die *die_mem;
193};
194
195static int __die_search_func_cb(Dwarf_Die *fn_die, void *data)
233{ 196{
234 Dwarf_Attribute attr; 197 struct __addr_die_search_param *ad = data;
235 Dwarf_Addr addr;
236 Dwarf_Off offs;
237 Dwarf_Ranges *ranges;
238 Dwarf_Signed cnt;
239 int ret;
240 198
241 /* Try to get entry pc */ 199 if (dwarf_tag(fn_die) == DW_TAG_subprogram &&
242 ret = dwarf_attr(dw_die, DW_AT_entry_pc, &attr, &__dw_error); 200 dwarf_haspc(fn_die, ad->addr)) {
243 DIE_IF(ret == DW_DLV_ERROR); 201 memcpy(ad->die_mem, fn_die, sizeof(Dwarf_Die));
244 if (ret == DW_DLV_OK) { 202 return DWARF_CB_ABORT;
245 ret = dwarf_formaddr(attr, &addr, &__dw_error);
246 DIE_IF(ret != DW_DLV_OK);
247 dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
248 return addr;
249 } 203 }
204 return DWARF_CB_OK;
205}
250 206
251 /* Try to get low pc */ 207/* Search a real subprogram including this line, */
252 ret = dwarf_lowpc(dw_die, &addr, &__dw_error); 208static Dwarf_Die *die_get_real_subprogram(Dwarf_Die *cu_die, Dwarf_Addr addr,
253 DIE_IF(ret == DW_DLV_ERROR); 209 Dwarf_Die *die_mem)
254 if (ret == DW_DLV_OK) 210{
255 return addr; 211 struct __addr_die_search_param ad;
256 212 ad.addr = addr;
257 /* Try to get ranges */ 213 ad.die_mem = die_mem;
258 ret = dwarf_attr(dw_die, DW_AT_ranges, &attr, &__dw_error); 214 /* dwarf_getscopes can't find subprogram. */
259 DIE_IF(ret != DW_DLV_OK); 215 if (!dwarf_getfuncs(cu_die, __die_search_func_cb, &ad, 0))
260 ret = dwarf_formref(attr, &offs, &__dw_error); 216 return NULL;
261 DIE_IF(ret != DW_DLV_OK); 217 else
262 ret = dwarf_get_ranges(__dw_debug, offs, &ranges, &cnt, NULL, 218 return die_mem;
263 &__dw_error);
264 DIE_IF(ret != DW_DLV_OK);
265 addr = ranges[0].dwr_addr1;
266 dwarf_ranges_dealloc(__dw_debug, ranges, cnt);
267 return addr;
268} 219}
269 220
270/* 221/* Similar to dwarf_getfuncs, but returns inlined_subroutine if exists. */
271 * Search a Die from Die tree. 222static Dwarf_Die *die_get_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
272 * Note: cur_link->die should be deallocated in this function. 223 Dwarf_Die *die_mem)
273 */
274static int __search_die_tree(struct die_link *cur_link,
275 int (*die_cb)(struct die_link *, void *),
276 void *data)
277{ 224{
278 Dwarf_Die new_die; 225 Dwarf_Die child_die;
279 struct die_link new_link;
280 int ret; 226 int ret;
281 227
282 if (!die_cb) 228 ret = dwarf_child(sp_die, die_mem);
283 return 0; 229 if (ret != 0)
284 230 return NULL;
285 /* Check current die */
286 while (!(ret = die_cb(cur_link, data))) {
287 /* Check child die */
288 ret = dwarf_child(cur_link->die, &new_die, &__dw_error);
289 DIE_IF(ret == DW_DLV_ERROR);
290 if (ret == DW_DLV_OK) {
291 new_link.parent = cur_link;
292 new_link.die = new_die;
293 ret = __search_die_tree(&new_link, die_cb, data);
294 if (ret)
295 break;
296 }
297 231
298 /* Move to next sibling */ 232 do {
299 ret = dwarf_siblingof(__dw_debug, cur_link->die, &new_die, 233 if (dwarf_tag(die_mem) == DW_TAG_inlined_subroutine &&
300 &__dw_error); 234 dwarf_haspc(die_mem, addr))
301 DIE_IF(ret == DW_DLV_ERROR); 235 return die_mem;
302 dwarf_dealloc(__dw_debug, cur_link->die, DW_DLA_DIE);
303 cur_link->die = new_die;
304 if (ret == DW_DLV_NO_ENTRY)
305 return 0;
306 }
307 dwarf_dealloc(__dw_debug, cur_link->die, DW_DLA_DIE);
308 return ret;
309}
310 236
311/* Search a die in its children's die tree */ 237 if (die_get_inlinefunc(die_mem, addr, &child_die)) {
312static int search_die_from_children(Dwarf_Die parent_die, 238 memcpy(die_mem, &child_die, sizeof(Dwarf_Die));
313 int (*die_cb)(struct die_link *, void *), 239 return die_mem;
314 void *data) 240 }
315{ 241 } while (dwarf_siblingof(die_mem, die_mem) == 0);
316 struct die_link new_link;
317 int ret;
318 242
319 new_link.parent = NULL; 243 return NULL;
320 ret = dwarf_child(parent_die, &new_link.die, &__dw_error);
321 DIE_IF(ret == DW_DLV_ERROR);
322 if (ret == DW_DLV_OK)
323 return __search_die_tree(&new_link, die_cb, data);
324 else
325 return 0;
326} 244}
327 245
328/* Find a locdesc corresponding to the address */ 246/* Compare diename and tname */
329static int attr_get_locdesc(Dwarf_Attribute attr, Dwarf_Locdesc *desc, 247static bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
330 Dwarf_Addr addr)
331{ 248{
332 Dwarf_Signed lcnt; 249 const char *name;
333 Dwarf_Locdesc **llbuf; 250 name = dwarf_diename(dw_die);
334 int ret, i; 251 DIE_IF(name == NULL);
335 252 return strcmp(tname, name);
336 ret = dwarf_loclist_n(attr, &llbuf, &lcnt, &__dw_error);
337 DIE_IF(ret != DW_DLV_OK);
338 ret = DW_DLV_NO_ENTRY;
339 for (i = 0; i < lcnt; ++i) {
340 if (llbuf[i]->ld_lopc <= addr &&
341 llbuf[i]->ld_hipc > addr) {
342 memcpy(desc, llbuf[i], sizeof(Dwarf_Locdesc));
343 desc->ld_s =
344 malloc(sizeof(Dwarf_Loc) * llbuf[i]->ld_cents);
345 DIE_IF(desc->ld_s == NULL);
346 memcpy(desc->ld_s, llbuf[i]->ld_s,
347 sizeof(Dwarf_Loc) * llbuf[i]->ld_cents);
348 ret = DW_DLV_OK;
349 break;
350 }
351 dwarf_dealloc(__dw_debug, llbuf[i]->ld_s, DW_DLA_LOC_BLOCK);
352 dwarf_dealloc(__dw_debug, llbuf[i], DW_DLA_LOCDESC);
353 }
354 /* Releasing loop */
355 for (; i < lcnt; ++i) {
356 dwarf_dealloc(__dw_debug, llbuf[i]->ld_s, DW_DLA_LOC_BLOCK);
357 dwarf_dealloc(__dw_debug, llbuf[i], DW_DLA_LOCDESC);
358 }
359 dwarf_dealloc(__dw_debug, llbuf, DW_DLA_LIST);
360 return ret;
361} 253}
362 254
363/* Get decl_file attribute value (file number) */ 255/* Get entry pc(or low pc, 1st entry of ranges) of the die */
364static Dwarf_Unsigned die_get_decl_file(Dwarf_Die sp_die) 256static Dwarf_Addr die_get_entrypc(Dwarf_Die *dw_die)
365{ 257{
366 Dwarf_Attribute attr; 258 Dwarf_Addr epc;
367 Dwarf_Unsigned fno;
368 int ret; 259 int ret;
369 260
370 ret = dwarf_attr(sp_die, DW_AT_decl_file, &attr, &__dw_error); 261 ret = dwarf_entrypc(dw_die, &epc);
371 DIE_IF(ret != DW_DLV_OK); 262 DIE_IF(ret == -1);
372 dwarf_formudata(attr, &fno, &__dw_error); 263 return epc;
373 DIE_IF(ret != DW_DLV_OK);
374 dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
375 return fno;
376} 264}
377 265
378/* Get decl_line attribute value (line number) */ 266/* Get a variable die */
379static Dwarf_Unsigned die_get_decl_line(Dwarf_Die sp_die) 267static Dwarf_Die *die_find_variable(Dwarf_Die *sp_die, const char *name,
268 Dwarf_Die *die_mem)
380{ 269{
381 Dwarf_Attribute attr; 270 Dwarf_Die child_die;
382 Dwarf_Unsigned lno; 271 int tag;
383 int ret; 272 int ret;
384 273
385 ret = dwarf_attr(sp_die, DW_AT_decl_line, &attr, &__dw_error); 274 ret = dwarf_child(sp_die, die_mem);
386 DIE_IF(ret != DW_DLV_OK); 275 if (ret != 0)
387 dwarf_formudata(attr, &lno, &__dw_error); 276 return NULL;
388 DIE_IF(ret != DW_DLV_OK); 277
389 dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR); 278 do {
390 return lno; 279 tag = dwarf_tag(die_mem);
280 if ((tag == DW_TAG_formal_parameter ||
281 tag == DW_TAG_variable) &&
282 (die_compare_name(die_mem, name) == 0))
283 return die_mem;
284
285 if (die_find_variable(die_mem, name, &child_die)) {
286 memcpy(die_mem, &child_die, sizeof(Dwarf_Die));
287 return die_mem;
288 }
289 } while (dwarf_siblingof(die_mem, die_mem) == 0);
290
291 return NULL;
391} 292}
392 293
393/* 294/*
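
The rewritten helpers keep matched line numbers in a sorted, duplicate-free list; line_list__add_line() searches in reverse because new lines usually arrive in ascending order. The same invariant in a stand-alone sketch, using a plain singly linked list in place of the kernel-style list_head:

    #include <stdio.h>
    #include <stdlib.h>

    struct line_node { unsigned int line; struct line_node *next; };

    static void line_add(struct line_node **head, unsigned int line)
    {
        struct line_node **p = head, *ln;

        while (*p && (*p)->line < line)
            p = &(*p)->next;
        if (*p && (*p)->line == line) /* already present */
            return;
        ln = malloc(sizeof(*ln));
        if (!ln)
            abort();
        ln->line = line;
        ln->next = *p;
        *p = ln;
    }

    static int line_has(struct line_node *head, unsigned int line)
    {
        for (; head; head = head->next)
            if (head->line == line)
                return 1;
        return 0;
    }

    int main(void)
    {
        struct line_node *head = NULL;

        line_add(&head, 12);
        line_add(&head, 10);
        line_add(&head, 12); /* duplicate, silently ignored */
        printf("%d %d\n", line_has(head, 10), line_has(head, 11));
        return 0;
    }
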
@@ -395,47 +296,45 @@ static Dwarf_Unsigned die_get_decl_line(Dwarf_Die sp_die)
395 */ 296 */
396 297
397/* Show a location */ 298/* Show a location */
398static void show_location(Dwarf_Loc *loc, struct probe_finder *pf) 299static void show_location(Dwarf_Op *op, struct probe_finder *pf)
399{ 300{
400 Dwarf_Small op; 301 unsigned int regn;
401 Dwarf_Unsigned regn; 302 Dwarf_Word offs = 0;
402 Dwarf_Signed offs;
403 int deref = 0, ret; 303 int deref = 0, ret;
404 const char *regs; 304 const char *regs;
405 305
406 op = loc->lr_atom; 306 /* TODO: support CFA */
407
408 /* If this is based on frame buffer, set the offset */ 307 /* If this is based on frame buffer, set the offset */
409 if (op == DW_OP_fbreg) { 308 if (op->atom == DW_OP_fbreg) {
309 if (pf->fb_ops == NULL)
310 die("The attribute of frame base is not supported.\n");
410 deref = 1; 311 deref = 1;
411 offs = (Dwarf_Signed)loc->lr_number; 312 offs = op->number;
412 op = pf->fbloc.ld_s[0].lr_atom; 313 op = &pf->fb_ops[0];
413 loc = &pf->fbloc.ld_s[0]; 314 }
414 } else
415 offs = 0;
416 315
417 if (op >= DW_OP_breg0 && op <= DW_OP_breg31) { 316 if (op->atom >= DW_OP_breg0 && op->atom <= DW_OP_breg31) {
418 regn = op - DW_OP_breg0; 317 regn = op->atom - DW_OP_breg0;
419 offs += (Dwarf_Signed)loc->lr_number; 318 offs += op->number;
420 deref = 1; 319 deref = 1;
421 } else if (op >= DW_OP_reg0 && op <= DW_OP_reg31) { 320 } else if (op->atom >= DW_OP_reg0 && op->atom <= DW_OP_reg31) {
422 regn = op - DW_OP_reg0; 321 regn = op->atom - DW_OP_reg0;
423 } else if (op == DW_OP_bregx) { 322 } else if (op->atom == DW_OP_bregx) {
424 regn = loc->lr_number; 323 regn = op->number;
425 offs += (Dwarf_Signed)loc->lr_number2; 324 offs += op->number2;
426 deref = 1; 325 deref = 1;
427 } else if (op == DW_OP_regx) { 326 } else if (op->atom == DW_OP_regx) {
428 regn = loc->lr_number; 327 regn = op->number;
429 } else 328 } else
430 die("Dwarf_OP %d is not supported.", op); 329 die("DW_OP %d is not supported.", op->atom);
431 330
432 regs = get_arch_regstr(regn); 331 regs = get_arch_regstr(regn);
433 if (!regs) 332 if (!regs)
434 die("%lld exceeds max register number.", regn); 333 die("%u exceeds max register number.", regn);
435 334
436 if (deref) 335 if (deref)
437 ret = snprintf(pf->buf, pf->len, 336 ret = snprintf(pf->buf, pf->len, " %s=+%ju(%s)",
438 " %s=%+lld(%s)", pf->var, offs, regs); 337 pf->var, (uintmax_t)offs, regs);
439 else 338 else
440 ret = snprintf(pf->buf, pf->len, " %s=%s", pf->var, regs); 339 ret = snprintf(pf->buf, pf->len, " %s=%s", pf->var, regs);
441 DIE_IF(ret < 0); 340 DIE_IF(ret < 0);
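
show_location() now consumes a libdw Dwarf_Op and reduces the common cases to a register number plus an optional dereference offset. That decoding step in isolation, with opcode values taken from the DWARF specification and a local struct standing in for libdw's Dwarf_Op:

    #include <stdio.h>

    #define DW_OP_reg0   0x50
    #define DW_OP_reg31  0x6f
    #define DW_OP_breg0  0x70
    #define DW_OP_breg31 0x8f
    #define DW_OP_regx   0x90
    #define DW_OP_bregx  0x92

    struct dwarf_op { unsigned int atom; long number; long number2; };

    static void decode(const struct dwarf_op *op)
    {
        unsigned int regn;
        long offs = 0;
        int deref = 0;

        if (op->atom >= DW_OP_breg0 && op->atom <= DW_OP_breg31) {
            regn = op->atom - DW_OP_breg0;
            offs = op->number;
            deref = 1;
        } else if (op->atom >= DW_OP_reg0 && op->atom <= DW_OP_reg31) {
            regn = op->atom - DW_OP_reg0;
        } else if (op->atom == DW_OP_bregx) {
            regn = op->number;
            offs = op->number2;
            deref = 1;
        } else if (op->atom == DW_OP_regx) {
            regn = op->number;
        } else {
            puts("unsupported location op");
            return;
        }
        if (deref)
            printf("register %u, dereference at %+ld\n", regn, offs);
        else
            printf("register %u\n", regn);
    }

    int main(void)
    {
        /* e.g. DWARF register 6 is %rbp on x86-64: a local at -8(%rbp) */
        struct dwarf_op op = { .atom = DW_OP_breg0 + 6, .number = -8 };

        decode(&op);
        return 0;
    }
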
@@ -443,52 +342,37 @@ static void show_location(Dwarf_Loc *loc, struct probe_finder *pf)
443} 342}
444 343
445/* Show a variables in kprobe event format */ 344/* Show a variables in kprobe event format */
446static void show_variable(Dwarf_Die vr_die, struct probe_finder *pf) 345static void show_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
447{ 346{
448 Dwarf_Attribute attr; 347 Dwarf_Attribute attr;
449 Dwarf_Locdesc ld; 348 Dwarf_Op *expr;
349 size_t nexpr;
450 int ret; 350 int ret;
451 351
452 ret = dwarf_attr(vr_die, DW_AT_location, &attr, &__dw_error); 352 if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL)
453 if (ret != DW_DLV_OK)
454 goto error; 353 goto error;
455 ret = attr_get_locdesc(attr, &ld, (pf->addr - pf->cu_base)); 354 /* TODO: handle more than 1 exprs */
456 if (ret != DW_DLV_OK) 355 ret = dwarf_getlocation_addr(&attr, (pf->addr - pf->cu_base),
356 &expr, &nexpr, 1);
357 if (ret <= 0 || nexpr == 0)
457 goto error; 358 goto error;
458 /* TODO? */ 359
459 DIE_IF(ld.ld_cents != 1); 360 show_location(expr, pf);
460 show_location(&ld.ld_s[0], pf); 361 /* *expr will be cached in libdw. Don't free it. */
461 free(ld.ld_s);
462 dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
463 return ; 362 return ;
464error: 363error:
364 /* TODO: Support const_value */
465 die("Failed to find the location of %s at this address.\n" 365 die("Failed to find the location of %s at this address.\n"
466 " Perhaps, it has been optimized out.", pf->var); 366 " Perhaps, it has been optimized out.", pf->var);
467} 367}
468 368
469static int variable_callback(struct die_link *dlink, void *data)
470{
471 struct probe_finder *pf = (struct probe_finder *)data;
472 Dwarf_Half tag;
473 int ret;
474
475 ret = dwarf_tag(dlink->die, &tag, &__dw_error);
476 DIE_IF(ret == DW_DLV_ERROR);
477 if ((tag == DW_TAG_formal_parameter ||
478 tag == DW_TAG_variable) &&
479 (die_compare_name(dlink->die, pf->var) == 0)) {
480 show_variable(dlink->die, pf);
481 return 1;
482 }
483 /* TODO: Support struct members and arrays */
484 return 0;
485}
486
487/* Find a variable in a subprogram die */ 369/* Find a variable in a subprogram die */
488static void find_variable(Dwarf_Die sp_die, struct probe_finder *pf) 370static void find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
489{ 371{
490 int ret; 372 int ret;
373 Dwarf_Die vr_die;
491 374
375 /* TODO: Support struct members and arrays */
492 if (!is_c_varname(pf->var)) { 376 if (!is_c_varname(pf->var)) {
493 /* Output raw parameters */ 377 /* Output raw parameters */
494 ret = snprintf(pf->buf, pf->len, " %s", pf->var); 378 ret = snprintf(pf->buf, pf->len, " %s", pf->var);
@@ -499,58 +383,51 @@ static void find_variable(Dwarf_Die sp_die, struct probe_finder *pf)
499 383
500 pr_debug("Searching '%s' variable in context.\n", pf->var); 384 pr_debug("Searching '%s' variable in context.\n", pf->var);
501 /* Search child die for local variables and parameters. */ 385 /* Search child die for local variables and parameters. */
502 ret = search_die_from_children(sp_die, variable_callback, pf); 386 if (!die_find_variable(sp_die, pf->var, &vr_die))
503 if (!ret)
504 die("Failed to find '%s' in this function.", pf->var); 387 die("Failed to find '%s' in this function.", pf->var);
505}
506
507/* Get a frame base on the address */
508static void get_current_frame_base(Dwarf_Die sp_die, struct probe_finder *pf)
509{
510 Dwarf_Attribute attr;
511 int ret;
512 388
513 ret = dwarf_attr(sp_die, DW_AT_frame_base, &attr, &__dw_error); 389 show_variable(&vr_die, pf);
514 DIE_IF(ret != DW_DLV_OK);
515 ret = attr_get_locdesc(attr, &pf->fbloc, (pf->addr - pf->cu_base));
516 DIE_IF(ret != DW_DLV_OK);
517 dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
518}
519
520static void free_current_frame_base(struct probe_finder *pf)
521{
522 free(pf->fbloc.ld_s);
523 memset(&pf->fbloc, 0, sizeof(Dwarf_Locdesc));
524} 390}
525 391
526/* Show a probe point to output buffer */ 392/* Show a probe point to output buffer */
527static void show_probepoint(Dwarf_Die sp_die, Dwarf_Signed offs, 393static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
528 struct probe_finder *pf)
529{ 394{
530 struct probe_point *pp = pf->pp; 395 struct probe_point *pp = pf->pp;
531 char *name; 396 Dwarf_Addr eaddr;
397 Dwarf_Die die_mem;
398 const char *name;
532 char tmp[MAX_PROBE_BUFFER]; 399 char tmp[MAX_PROBE_BUFFER];
533 int ret, i, len; 400 int ret, i, len;
401 Dwarf_Attribute fb_attr;
402 size_t nops;
403
404 /* If no real subprogram, find a real one */
405 if (!sp_die || dwarf_tag(sp_die) != DW_TAG_subprogram) {
406 sp_die = die_get_real_subprogram(&pf->cu_die,
407 pf->addr, &die_mem);
408 if (!sp_die)
409 die("Probe point is not found in subprograms.");
410 }
534 411
535 /* Output name of probe point */ 412 /* Output name of probe point */
536 ret = dwarf_diename(sp_die, &name, &__dw_error); 413 name = dwarf_diename(sp_die);
537 DIE_IF(ret == DW_DLV_ERROR); 414 if (name) {
538 if (ret == DW_DLV_OK) { 415 dwarf_entrypc(sp_die, &eaddr);
539 ret = snprintf(tmp, MAX_PROBE_BUFFER, "%s+%u", name, 416 ret = snprintf(tmp, MAX_PROBE_BUFFER, "%s+%lu", name,
540 (unsigned int)offs); 417 (unsigned long)(pf->addr - eaddr));
541 /* Copy the function name if possible */ 418 /* Copy the function name if possible */
542 if (!pp->function) { 419 if (!pp->function) {
543 pp->function = strdup(name); 420 pp->function = strdup(name);
544 pp->offset = offs; 421 pp->offset = (size_t)(pf->addr - eaddr);
545 } 422 }
546 dwarf_dealloc(__dw_debug, name, DW_DLA_STRING);
547 } else { 423 } else {
548 /* This function has no name. */ 424 /* This function has no name. */
549 ret = snprintf(tmp, MAX_PROBE_BUFFER, "0x%llx", pf->addr); 425 ret = snprintf(tmp, MAX_PROBE_BUFFER, "0x%jx",
426 (uintmax_t)pf->addr);
550 if (!pp->function) { 427 if (!pp->function) {
551 /* TODO: Use _stext */ 428 /* TODO: Use _stext */
552 pp->function = strdup(""); 429 pp->function = strdup("");
553 pp->offset = (int)pf->addr; 430 pp->offset = (size_t)pf->addr;
554 } 431 }
555 } 432 }
556 DIE_IF(ret < 0); 433 DIE_IF(ret < 0);
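
In the hunk above, find_variable() delegates the walk to die_find_variable(), defined earlier, which replaces the old die_link machinery with the standard libdw traversal: dwarf_child() to descend into a DIE, dwarf_siblingof() to advance. The same depth-first shape over a generic tree:

    #include <stdio.h>
    #include <string.h>

    struct node {
        const char *name;
        struct node *child, *sibling;
    };

    static struct node *find(struct node *n, const char *name)
    {
        for (; n; n = n->sibling) { /* dwarf_siblingof() loop */
            if (strcmp(n->name, name) == 0)
                return n;
            if (n->child) { /* dwarf_child() recursion */
                struct node *hit = find(n->child, name);

                if (hit)
                    return hit;
            }
        }
        return NULL;
    }

    int main(void)
    {
        struct node leaf = { "cpu", NULL, NULL };
        struct node block = { "block", &leaf, NULL };
        struct node fn = { "schedule", &block, NULL };

        printf("%s\n", find(&fn, "cpu") ? "found" : "missing");
        return 0;
    }
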
@@ -558,8 +435,15 @@ static void show_probepoint(Dwarf_Die sp_die, Dwarf_Signed offs,
558 len = ret; 435 len = ret;
559 pr_debug("Probe point found: %s\n", tmp); 436 pr_debug("Probe point found: %s\n", tmp);
560 437
438 /* Get the frame base attribute/ops */
439 dwarf_attr(sp_die, DW_AT_frame_base, &fb_attr);
440 ret = dwarf_getlocation_addr(&fb_attr, (pf->addr - pf->cu_base),
441 &pf->fb_ops, &nops, 1);
442 if (ret <= 0 || nops == 0)
443 pf->fb_ops = NULL;
444
561 /* Find each argument */ 445 /* Find each argument */
562 get_current_frame_base(sp_die, pf); 446 /* TODO: use dwarf_cfi_addrframe */
563 for (i = 0; i < pp->nr_args; i++) { 447 for (i = 0; i < pp->nr_args; i++) {
564 pf->var = pp->args[i]; 448 pf->var = pp->args[i];
565 pf->buf = &tmp[len]; 449 pf->buf = &tmp[len];
@@ -567,289 +451,327 @@ static void show_probepoint(Dwarf_Die sp_die, Dwarf_Signed offs,
567 find_variable(sp_die, pf); 451 find_variable(sp_die, pf);
568 len += strlen(pf->buf); 452 len += strlen(pf->buf);
569 } 453 }
570 free_current_frame_base(pf); 454
455 /* *pf->fb_ops will be cached in libdw. Don't free it. */
456 pf->fb_ops = NULL;
571 457
572 pp->probes[pp->found] = strdup(tmp); 458 pp->probes[pp->found] = strdup(tmp);
573 pp->found++; 459 pp->found++;
574} 460}
575 461
576static int probeaddr_callback(struct die_link *dlink, void *data) 462/* Find probe point from its line number */
463static void find_probe_point_by_line(struct probe_finder *pf)
577{ 464{
578 struct probe_finder *pf = (struct probe_finder *)data; 465 Dwarf_Lines *lines;
579 Dwarf_Half tag; 466 Dwarf_Line *line;
580 Dwarf_Signed offs; 467 size_t nlines, i;
468 Dwarf_Addr addr;
469 int lineno;
581 int ret; 470 int ret;
582 471
583 ret = dwarf_tag(dlink->die, &tag, &__dw_error); 472 ret = dwarf_getsrclines(&pf->cu_die, &lines, &nlines);
584 DIE_IF(ret == DW_DLV_ERROR); 473 DIE_IF(ret != 0);
585 /* Check the address is in this subprogram */ 474
586 if (tag == DW_TAG_subprogram && 475 for (i = 0; i < nlines; i++) {
587 die_within_subprogram(dlink->die, pf->addr, &offs)) { 476 line = dwarf_onesrcline(lines, i);
588 show_probepoint(dlink->die, offs, pf); 477 dwarf_lineno(line, &lineno);
589 return 1; 478 if (lineno != pf->lno)
479 continue;
480
481 /* TODO: Get fileno from line, but how? */
482 if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
483 continue;
484
485 ret = dwarf_lineaddr(line, &addr);
486 DIE_IF(ret != 0);
487 pr_debug("Probe line found: line[%d]:%d addr:0x%jx\n",
488 (int)i, lineno, (uintmax_t)addr);
489 pf->addr = addr;
490
491 show_probe_point(NULL, pf);
492 /* Continuing, because target line might be inlined. */
590 } 493 }
591 return 0;
592} 494}
593 495
594/* Find probe point from its line number */ 496/* Find lines which match lazy pattern */
595static void find_probe_point_by_line(struct probe_finder *pf) 497static int find_lazy_match_lines(struct list_head *head,
498 const char *fname, const char *pat)
596{ 499{
597 Dwarf_Signed cnt, i, clm; 500 char *fbuf, *p1, *p2;
598 Dwarf_Line *lines; 501 int fd, line, nlines = 0;
599 Dwarf_Unsigned lineno = 0; 502 struct stat st;
503
504 fd = open(fname, O_RDONLY);
505 if (fd < 0)
506 die("failed to open %s", fname);
507 DIE_IF(fstat(fd, &st) < 0);
508 fbuf = malloc(st.st_size + 2);
509 DIE_IF(fbuf == NULL);
510 DIE_IF(read(fd, fbuf, st.st_size) < 0);
511 close(fd);
512 fbuf[st.st_size] = '\n'; /* Dummy line */
513 fbuf[st.st_size + 1] = '\0';
514 p1 = fbuf;
515 line = 1;
516 while ((p2 = strchr(p1, '\n')) != NULL) {
517 *p2 = '\0';
518 if (strlazymatch(p1, pat)) {
519 line_list__add_line(head, line);
520 nlines++;
521 }
522 line++;
523 p1 = p2 + 1;
524 }
525 free(fbuf);
526 return nlines;
527}
528
529/* Find probe points from lazy pattern */
530static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
531{
532 Dwarf_Lines *lines;
533 Dwarf_Line *line;
534 size_t nlines, i;
600 Dwarf_Addr addr; 535 Dwarf_Addr addr;
601 Dwarf_Unsigned fno; 536 Dwarf_Die die_mem;
537 int lineno;
602 int ret; 538 int ret;
603 539
604 ret = dwarf_srclines(pf->cu_die, &lines, &cnt, &__dw_error); 540 if (list_empty(&pf->lcache)) {
605 DIE_IF(ret != DW_DLV_OK); 541 /* Matching lazy line pattern */
542 ret = find_lazy_match_lines(&pf->lcache, pf->fname,
543 pf->pp->lazy_line);
544 if (ret <= 0)
545 die("No matched lines found in %s.", pf->fname);
546 }
547
548 ret = dwarf_getsrclines(&pf->cu_die, &lines, &nlines);
549 DIE_IF(ret != 0);
550 for (i = 0; i < nlines; i++) {
551 line = dwarf_onesrcline(lines, i);
606 552
607 for (i = 0; i < cnt; i++) { 553 dwarf_lineno(line, &lineno);
608 ret = dwarf_line_srcfileno(lines[i], &fno, &__dw_error); 554 if (!line_list__has_line(&pf->lcache, lineno))
609 DIE_IF(ret != DW_DLV_OK);
610 if (fno != pf->fno)
611 continue; 555 continue;
612 556
613 ret = dwarf_lineno(lines[i], &lineno, &__dw_error); 557 /* TODO: Get fileno from line, but how? */
614 DIE_IF(ret != DW_DLV_OK); 558 if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
615 if (lineno != pf->lno)
616 continue; 559 continue;
617 560
618 ret = dwarf_lineoff(lines[i], &clm, &__dw_error); 561 ret = dwarf_lineaddr(line, &addr);
619 DIE_IF(ret != DW_DLV_OK); 562 DIE_IF(ret != 0);
563 if (sp_die) {
564 /* Address filtering 1: does sp_die include addr? */
565 if (!dwarf_haspc(sp_die, addr))
566 continue;
567 /* Address filtering 2: No child include addr? */
568 if (die_get_inlinefunc(sp_die, addr, &die_mem))
569 continue;
570 }
620 571
621 ret = dwarf_lineaddr(lines[i], &addr, &__dw_error); 572 pr_debug("Probe line found: line[%d]:%d addr:0x%llx\n",
622 DIE_IF(ret != DW_DLV_OK); 573 (int)i, lineno, (unsigned long long)addr);
623 pr_debug("Probe line found: line[%d]:%u,%d addr:0x%llx\n",
624 (int)i, (unsigned)lineno, (int)clm, addr);
625 pf->addr = addr; 574 pf->addr = addr;
626 /* Search a real subprogram including this line, */ 575
627 ret = search_die_from_children(pf->cu_die, 576 show_probe_point(sp_die, pf);
628 probeaddr_callback, pf);
629 if (ret == 0)
630 die("Probe point is not found in subprograms.");
631 /* Continuing, because target line might be inlined. */ 577 /* Continuing, because target line might be inlined. */
632 } 578 }
633 dwarf_srclines_dealloc(__dw_debug, lines, cnt); 579 /* TODO: deallocate lines, but how? */
580}
581
582static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
583{
584 struct probe_finder *pf = (struct probe_finder *)data;
585 struct probe_point *pp = pf->pp;
586
587 if (pp->lazy_line)
588 find_probe_point_lazy(in_die, pf);
589 else {
590 /* Get probe address */
591 pf->addr = die_get_entrypc(in_die);
592 pf->addr += pp->offset;
593 pr_debug("found inline addr: 0x%jx\n",
594 (uintmax_t)pf->addr);
595
596 show_probe_point(in_die, pf);
597 }
598
599 return DWARF_CB_OK;
634} 600}
635 601
636/* Search function from function name */ 602/* Search function from function name */
637static int probefunc_callback(struct die_link *dlink, void *data) 603static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
638{ 604{
639 struct probe_finder *pf = (struct probe_finder *)data; 605 struct probe_finder *pf = (struct probe_finder *)data;
640 struct probe_point *pp = pf->pp; 606 struct probe_point *pp = pf->pp;
641 struct die_link *lk;
642 Dwarf_Signed offs;
643 Dwarf_Half tag;
644 int ret;
645 607
646 ret = dwarf_tag(dlink->die, &tag, &__dw_error); 608 /* Check tag and diename */
647 DIE_IF(ret == DW_DLV_ERROR); 609 if (dwarf_tag(sp_die) != DW_TAG_subprogram ||
648 if (tag == DW_TAG_subprogram) { 610 die_compare_name(sp_die, pp->function) != 0)
649 if (die_compare_name(dlink->die, pp->function) == 0) { 611 return 0;
650 if (pp->line) { /* Function relative line */ 612
651 pf->fno = die_get_decl_file(dlink->die); 613 pf->fname = dwarf_decl_file(sp_die);
652 pf->lno = die_get_decl_line(dlink->die) 614 if (pp->line) { /* Function relative line */
653 + pp->line; 615 dwarf_decl_line(sp_die, &pf->lno);
654 find_probe_point_by_line(pf); 616 pf->lno += pp->line;
655 return 1; 617 find_probe_point_by_line(pf);
656 } 618 } else if (!dwarf_func_inline(sp_die)) {
657 if (die_inlined_subprogram(dlink->die)) { 619 /* Real function */
658 /* Inlined function, save it. */ 620 if (pp->lazy_line)
659 ret = dwarf_die_CU_offset(dlink->die, 621 find_probe_point_lazy(sp_die, pf);
660 &pf->inl_offs, 622 else {
661 &__dw_error); 623 pf->addr = die_get_entrypc(sp_die);
662 DIE_IF(ret != DW_DLV_OK);
663 pr_debug("inline definition offset %lld\n",
664 pf->inl_offs);
665 return 0; /* Continue to search */
666 }
667 /* Get probe address */
668 pf->addr = die_get_entrypc(dlink->die);
669 pf->addr += pp->offset; 624 pf->addr += pp->offset;
670 /* TODO: Check the address in this function */ 625 /* TODO: Check the address in this function */
671 show_probepoint(dlink->die, pp->offset, pf); 626 show_probe_point(sp_die, pf);
672 return 1; /* Exit; no same symbol in this CU. */
673 }
674 } else if (tag == DW_TAG_inlined_subroutine && pf->inl_offs) {
675 if (die_get_abstract_origin(dlink->die) == pf->inl_offs) {
676 /* Get probe address */
677 pf->addr = die_get_entrypc(dlink->die);
678 pf->addr += pp->offset;
679 pr_debug("found inline addr: 0x%llx\n", pf->addr);
680 /* Inlined function. Get a real subprogram */
681 for (lk = dlink->parent; lk != NULL; lk = lk->parent) {
682 tag = 0;
683 dwarf_tag(lk->die, &tag, &__dw_error);
684 DIE_IF(ret == DW_DLV_ERROR);
685 if (tag == DW_TAG_subprogram &&
686 !die_inlined_subprogram(lk->die))
687 goto found;
688 }
689 die("Failed to find real subprogram.");
690found:
691 /* Get offset from subprogram */
692 ret = die_within_subprogram(lk->die, pf->addr, &offs);
693 DIE_IF(!ret);
694 show_probepoint(lk->die, offs, pf);
695 /* Continue to search */
696 } 627 }
697 } 628 } else
698 return 0; 629 /* Inlined function: search instances */
630 dwarf_func_inline_instances(sp_die, probe_point_inline_cb, pf);
631
632 return 1; /* Exit; no same symbol in this CU. */
699} 633}
700 634
701static void find_probe_point_by_func(struct probe_finder *pf) 635static void find_probe_point_by_func(struct probe_finder *pf)
702{ 636{
703 search_die_from_children(pf->cu_die, probefunc_callback, pf); 637 dwarf_getfuncs(&pf->cu_die, probe_point_search_cb, pf, 0);
704} 638}
705 639
706/* Find a probe point */ 640/* Find a probe point */
707int find_probepoint(int fd, struct probe_point *pp) 641int find_probe_point(int fd, struct probe_point *pp)
708{ 642{
709 Dwarf_Half addr_size = 0;
710 Dwarf_Unsigned next_cuh = 0;
711 int cu_number = 0, ret;
712 struct probe_finder pf = {.pp = pp}; 643 struct probe_finder pf = {.pp = pp};
644 int ret;
645 Dwarf_Off off, noff;
646 size_t cuhl;
647 Dwarf_Die *diep;
648 Dwarf *dbg;
713 649
714 ret = dwarf_init(fd, DW_DLC_READ, 0, 0, &__dw_debug, &__dw_error); 650 dbg = dwarf_begin(fd, DWARF_C_READ);
715 if (ret != DW_DLV_OK) 651 if (!dbg)
716 return -ENOENT; 652 return -ENOENT;
717 653
718 pp->found = 0; 654 pp->found = 0;
719 while (++cu_number) { 655 off = 0;
720 /* Search CU (Compilation Unit) */ 656 line_list__init(&pf.lcache);
721 ret = dwarf_next_cu_header(__dw_debug, NULL, NULL, NULL, 657 /* Loop on CUs (Compilation Unit) */
722 &addr_size, &next_cuh, &__dw_error); 658 while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) {
723 DIE_IF(ret == DW_DLV_ERROR);
724 if (ret == DW_DLV_NO_ENTRY)
725 break;
726
727 /* Get the DIE(Debugging Information Entry) of this CU */ 659 /* Get the DIE(Debugging Information Entry) of this CU */
728 ret = dwarf_siblingof(__dw_debug, 0, &pf.cu_die, &__dw_error); 660 diep = dwarf_offdie(dbg, off + cuhl, &pf.cu_die);
729 DIE_IF(ret != DW_DLV_OK); 661 if (!diep)
662 continue;
730 663
731 /* Check if target file is included. */ 664 /* Check if target file is included. */
732 if (pp->file) 665 if (pp->file)
733 pf.fno = cu_find_fileno(pf.cu_die, pp->file); 666 pf.fname = cu_find_realpath(&pf.cu_die, pp->file);
667 else
668 pf.fname = NULL;
734 669
735 if (!pp->file || pf.fno) { 670 if (!pp->file || pf.fname) {
736 /* Save CU base address (for frame_base) */ 671 /* Save CU base address (for frame_base) */
737 ret = dwarf_lowpc(pf.cu_die, &pf.cu_base, &__dw_error); 672 ret = dwarf_lowpc(&pf.cu_die, &pf.cu_base);
738 DIE_IF(ret == DW_DLV_ERROR); 673 if (ret != 0)
739 if (ret == DW_DLV_NO_ENTRY)
740 pf.cu_base = 0; 674 pf.cu_base = 0;
741 if (pp->function) 675 if (pp->function)
742 find_probe_point_by_func(&pf); 676 find_probe_point_by_func(&pf);
677 else if (pp->lazy_line)
678 find_probe_point_lazy(NULL, &pf);
743 else { 679 else {
744 pf.lno = pp->line; 680 pf.lno = pp->line;
745 find_probe_point_by_line(&pf); 681 find_probe_point_by_line(&pf);
746 } 682 }
747 } 683 }
748 dwarf_dealloc(__dw_debug, pf.cu_die, DW_DLA_DIE); 684 off = noff;
749 } 685 }
750 ret = dwarf_finish(__dw_debug, &__dw_error); 686 line_list__free(&pf.lcache);
751 DIE_IF(ret != DW_DLV_OK); 687 dwarf_end(dbg);
752 688
753 return pp->found; 689 return pp->found;
754} 690}
755 691
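The CU loop above is the standard libdw iteration pattern: dwarf_nextcu() steps a cursor over compilation-unit headers, and dwarf_offdie() fetches the CU DIE sitting just past each header. A self-contained sketch of that pattern, assuming only an ELF file with DWARF info; the path handling and printout are illustrative:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <elfutils/libdw.h>

static int walk_cus(const char *path)	/* path: any DWARF-annotated ELF */
{
	Dwarf_Off off = 0, noff;
	Dwarf_Die cu_die;
	const char *name;
	size_t cuhl;
	Dwarf *dbg;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;
	dbg = dwarf_begin(fd, DWARF_C_READ);
	if (!dbg) {
		close(fd);
		return -ENOENT;
	}
	/* dwarf_nextcu() returns 0 while another CU header exists;
	 * the CU DIE itself sits at off + header length (cuhl). */
	while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) {
		if (dwarf_offdie(dbg, off + cuhl, &cu_die)) {
			name = dwarf_diename(&cu_die);
			printf("CU: %s\n", name ? name : "<unnamed>");
		}
		off = noff;
	}
	dwarf_end(dbg);
	close(fd);
	return 0;
}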
756
757static void line_range_add_line(struct line_range *lr, unsigned int line)
758{
759 struct line_node *ln;
760 struct list_head *p;
761
762 /* Reverse search, because new line will be the last one */
763 list_for_each_entry_reverse(ln, &lr->line_list, list) {
764 if (ln->line < line) {
765 p = &ln->list;
766 goto found;
767 } else if (ln->line == line) /* Already exists */
768 return;
769 }
770 /* List is empty, or the smallest entry */
771 p = &lr->line_list;
772found:
773 pr_debug("Debug: add a line %u\n", line);
774 ln = zalloc(sizeof(struct line_node));
775 DIE_IF(ln == NULL);
776 ln->line = line;
777 INIT_LIST_HEAD(&ln->list);
778 list_add(&ln->list, p);
779}
780
781/* Find line range from its line number */ 692/* Find line range from its line number */
782static void find_line_range_by_line(struct line_finder *lf) 693static void find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
783{ 694{
784 Dwarf_Signed cnt, i; 695 Dwarf_Lines *lines;
785 Dwarf_Line *lines; 696 Dwarf_Line *line;
786 Dwarf_Unsigned lineno = 0; 697 size_t nlines, i;
787 Dwarf_Unsigned fno;
788 Dwarf_Addr addr; 698 Dwarf_Addr addr;
699 int lineno;
789 int ret; 700 int ret;
701 const char *src;
702 Dwarf_Die die_mem;
790 703
791 ret = dwarf_srclines(lf->cu_die, &lines, &cnt, &__dw_error); 704 line_list__init(&lf->lr->line_list);
792 DIE_IF(ret != DW_DLV_OK); 705 ret = dwarf_getsrclines(&lf->cu_die, &lines, &nlines);
706 DIE_IF(ret != 0);
793 707
794 for (i = 0; i < cnt; i++) { 708 for (i = 0; i < nlines; i++) {
795 ret = dwarf_line_srcfileno(lines[i], &fno, &__dw_error); 709 line = dwarf_onesrcline(lines, i);
796 DIE_IF(ret != DW_DLV_OK); 710 ret = dwarf_lineno(line, &lineno);
797 if (fno != lf->fno) 711 DIE_IF(ret != 0);
798 continue;
799
800 ret = dwarf_lineno(lines[i], &lineno, &__dw_error);
801 DIE_IF(ret != DW_DLV_OK);
802 if (lf->lno_s > lineno || lf->lno_e < lineno) 712 if (lf->lno_s > lineno || lf->lno_e < lineno)
803 continue; 713 continue;
804 714
805 /* Filter line in the function address range */ 715 if (sp_die) {
806 if (lf->addr_s && lf->addr_e) { 716 /* Address filtering 1: does sp_die include addr? */
807 ret = dwarf_lineaddr(lines[i], &addr, &__dw_error); 717 ret = dwarf_lineaddr(line, &addr);
808 DIE_IF(ret != DW_DLV_OK); 718 DIE_IF(ret != 0);
809 if (lf->addr_s > addr || lf->addr_e <= addr) 719 if (!dwarf_haspc(sp_die, addr))
720 continue;
721
722 /* Address filtering 2: skip if an inlined child covers addr */
723 if (die_get_inlinefunc(sp_die, addr, &die_mem))
810 continue; 724 continue;
811 } 725 }
812 line_range_add_line(lf->lr, (unsigned int)lineno); 726
727 /* TODO: Get fileno from line, but how? */
728 src = dwarf_linesrc(line, NULL, NULL);
729 if (strtailcmp(src, lf->fname) != 0)
730 continue;
731
732 /* Copy real path */
733 if (!lf->lr->path)
734 lf->lr->path = strdup(src);
735 line_list__add_line(&lf->lr->line_list, (unsigned int)lineno);
813 } 736 }
814 dwarf_srclines_dealloc(__dw_debug, lines, cnt); 737 /* Update status */
815 if (!list_empty(&lf->lr->line_list)) 738 if (!list_empty(&lf->lr->line_list))
816 lf->found = 1; 739 lf->found = 1;
740 else {
741 free(lf->lr->path);
742 lf->lr->path = NULL;
743 }
744}
745
746static int line_range_inline_cb(Dwarf_Die *in_die, void *data)
747{
748 find_line_range_by_line(in_die, (struct line_finder *)data);
749 return DWARF_CB_ABORT; /* No need to find other instances */
817} 750}
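find_line_range_by_line() is built on the libdw line-table API: dwarf_getsrclines() loads the whole table for a CU, dwarf_onesrcline() indexes one row, and dwarf_lineno()/dwarf_lineaddr()/dwarf_linesrc() decode it. A stripped-down sketch of that walk, with the function and file filtering from above omitted:

#include <stdio.h>
#include <elfutils/libdw.h>

/* Dump every row of a CU's line table: source file, line, address. */
static void list_cu_lines(Dwarf_Die *cu_die)
{
	Dwarf_Lines *lines;
	Dwarf_Line *line;
	Dwarf_Addr addr;
	size_t nlines, i;
	const char *src;
	int lineno;

	if (dwarf_getsrclines(cu_die, &lines, &nlines) != 0)
		return;
	for (i = 0; i < nlines; i++) {
		line = dwarf_onesrcline(lines, i);
		if (dwarf_lineno(line, &lineno) != 0 ||
		    dwarf_lineaddr(line, &addr) != 0)
			continue;
		src = dwarf_linesrc(line, NULL, NULL);
		printf("%s:%d at %#llx\n", src ? src : "?", lineno,
		       (unsigned long long)addr);
	}
}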
818 751
819/* Search function from function name */ 752/* Search function from function name */
820static int linefunc_callback(struct die_link *dlink, void *data) 753static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
821{ 754{
822 struct line_finder *lf = (struct line_finder *)data; 755 struct line_finder *lf = (struct line_finder *)data;
823 struct line_range *lr = lf->lr; 756 struct line_range *lr = lf->lr;
824 Dwarf_Half tag;
825 int ret;
826 757
827 ret = dwarf_tag(dlink->die, &tag, &__dw_error); 758 if (dwarf_tag(sp_die) == DW_TAG_subprogram &&
828 DIE_IF(ret == DW_DLV_ERROR); 759 die_compare_name(sp_die, lr->function) == 0) {
829 if (tag == DW_TAG_subprogram && 760 lf->fname = dwarf_decl_file(sp_die);
830 die_compare_name(dlink->die, lr->function) == 0) { 761 dwarf_decl_line(sp_die, &lr->offset);
831 /* Get the address range of this function */ 762 pr_debug("fname: %s, lineno:%d\n", lf->fname, lr->offset);
832 ret = dwarf_highpc(dlink->die, &lf->addr_e, &__dw_error);
833 if (ret == DW_DLV_OK)
834 ret = dwarf_lowpc(dlink->die, &lf->addr_s, &__dw_error);
835 DIE_IF(ret == DW_DLV_ERROR);
836 if (ret == DW_DLV_NO_ENTRY) {
837 lf->addr_s = 0;
838 lf->addr_e = 0;
839 }
840
841 lf->fno = die_get_decl_file(dlink->die);
842 lr->offset = die_get_decl_line(dlink->die);
843 lf->lno_s = lr->offset + lr->start; 763 lf->lno_s = lr->offset + lr->start;
844 if (!lr->end) 764 if (!lr->end)
845 lf->lno_e = (Dwarf_Unsigned)-1; 765 lf->lno_e = INT_MAX;
846 else 766 else
847 lf->lno_e = lr->offset + lr->end; 767 lf->lno_e = lr->offset + lr->end;
848 lr->start = lf->lno_s; 768 lr->start = lf->lno_s;
849 lr->end = lf->lno_e; 769 lr->end = lf->lno_e;
850 find_line_range_by_line(lf); 770 if (dwarf_func_inline(sp_die))
851 /* If we find a target function, this should be the end. */ 771 dwarf_func_inline_instances(sp_die,
852 lf->found = 1; 772 line_range_inline_cb, lf);
773 else
774 find_line_range_by_line(sp_die, lf);
853 return 1; 775 return 1;
854 } 776 }
855 return 0; 777 return 0;
@@ -857,55 +779,55 @@ static int linefunc_callback(struct die_link *dlink, void *data)
857 779
858static void find_line_range_by_func(struct line_finder *lf) 780static void find_line_range_by_func(struct line_finder *lf)
859{ 781{
860 search_die_from_children(lf->cu_die, linefunc_callback, lf); 782 dwarf_getfuncs(&lf->cu_die, line_range_search_cb, lf, 0);
861} 783}
862 784
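Both *_by_func() helpers now delegate traversal to dwarf_getfuncs(), which invokes a callback for each function DIE in the CU and stops as soon as the callback returns DWARF_CB_ABORT. A minimal sketch of that contract; the target name is a placeholder:

#include <stdio.h>
#include <string.h>
#include <dwarf.h>
#include <elfutils/libdw.h>

/* Stop at the first subprogram whose name equals the argument. */
static int match_func_cb(Dwarf_Die *sp_die, void *data)
{
	const char *name = dwarf_diename(sp_die);

	if (dwarf_tag(sp_die) == DW_TAG_subprogram &&
	    name && strcmp(name, (const char *)data) == 0) {
		printf("found %s\n", name);
		return DWARF_CB_ABORT;	/* nonzero aborts the walk */
	}
	return DWARF_CB_OK;		/* zero continues */
}

/* Usage: dwarf_getfuncs(&cu_die, match_func_cb, "vfs_read", 0); */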
863int find_line_range(int fd, struct line_range *lr) 785int find_line_range(int fd, struct line_range *lr)
864{ 786{
865 Dwarf_Half addr_size = 0; 787 struct line_finder lf = {.lr = lr, .found = 0};
866 Dwarf_Unsigned next_cuh = 0;
867 int ret; 788 int ret;
868 struct line_finder lf = {.lr = lr}; 789 Dwarf_Off off = 0, noff;
790 size_t cuhl;
791 Dwarf_Die *diep;
792 Dwarf *dbg;
869 793
870 ret = dwarf_init(fd, DW_DLC_READ, 0, 0, &__dw_debug, &__dw_error); 794 dbg = dwarf_begin(fd, DWARF_C_READ);
871 if (ret != DW_DLV_OK) 795 if (!dbg)
872 return -ENOENT; 796 return -ENOENT;
873 797
798 /* Loop on CUs (Compilation Units) */
874 while (!lf.found) { 799 while (!lf.found) {
875 /* Search CU (Compilation Unit) */ 800 ret = dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL);
876 ret = dwarf_next_cu_header(__dw_debug, NULL, NULL, NULL, 801 if (ret != 0)
877 &addr_size, &next_cuh, &__dw_error);
878 DIE_IF(ret == DW_DLV_ERROR);
879 if (ret == DW_DLV_NO_ENTRY)
880 break; 802 break;
881 803
882 /* Get the DIE(Debugging Information Entry) of this CU */ 804 /* Get the DIE(Debugging Information Entry) of this CU */
883 ret = dwarf_siblingof(__dw_debug, 0, &lf.cu_die, &__dw_error); 805 diep = dwarf_offdie(dbg, off + cuhl, &lf.cu_die);
884 DIE_IF(ret != DW_DLV_OK); 806 if (!diep)
807 continue;
885 808
886 /* Check if target file is included. */ 809 /* Check if target file is included. */
887 if (lr->file) 810 if (lr->file)
888 lf.fno = cu_find_fileno(lf.cu_die, lr->file); 811 lf.fname = cu_find_realpath(&lf.cu_die, lr->file);
812 else
813 lf.fname = 0;
889 814
890 if (!lr->file || lf.fno) { 815 if (!lr->file || lf.fname) {
891 if (lr->function) 816 if (lr->function)
892 find_line_range_by_func(&lf); 817 find_line_range_by_func(&lf);
893 else { 818 else {
894 lf.lno_s = lr->start; 819 lf.lno_s = lr->start;
895 if (!lr->end) 820 if (!lr->end)
896 lf.lno_e = (Dwarf_Unsigned)-1; 821 lf.lno_e = INT_MAX;
897 else 822 else
898 lf.lno_e = lr->end; 823 lf.lno_e = lr->end;
899 find_line_range_by_line(&lf); 824 find_line_range_by_line(NULL, &lf);
900 } 825 }
901 /* Get the real file path */
902 if (lf.found)
903 cu_get_filename(lf.cu_die, lf.fno, &lr->path);
904 } 826 }
905 dwarf_dealloc(__dw_debug, lf.cu_die, DW_DLA_DIE); 827 off = noff;
906 } 828 }
907 ret = dwarf_finish(__dw_debug, &__dw_error); 829 pr_debug("path: %lx\n", (unsigned long)lr->path);
908 DIE_IF(ret != DW_DLV_OK); 830 dwarf_end(dbg);
909 return lf.found; 831 return lf.found;
910} 832}
911 833
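From the caller's side, find_line_range() is driven by a filled-in struct line_range; on success, lr.path holds the resolved source path and lr.line_list the visible lines. A rough usage sketch, assuming the probe-finder.h interface (shown next) and made-up object and function names:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include "probe-finder.h"	/* struct line_range, find_line_range() */

static void show_lines(const char *obj)	/* obj: path to a debuginfo file */
{
	struct line_range lr = {
		.function = "vfs_read",	/* hypothetical target function */
	};
	int fd = open(obj, O_RDONLY);

	if (fd < 0)
		return;
	if (find_line_range(fd, &lr) > 0)
		printf("found: %s\n", lr.path);
	close(fd);
}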
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index 972b386116f1..d1a651793ba6 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -1,6 +1,7 @@
1#ifndef _PROBE_FINDER_H 1#ifndef _PROBE_FINDER_H
2#define _PROBE_FINDER_H 2#define _PROBE_FINDER_H
3 3
4#include <stdbool.h>
4#include "util.h" 5#include "util.h"
5 6
6#define MAX_PATH_LEN 256 7#define MAX_PATH_LEN 256
@@ -20,6 +21,7 @@ struct probe_point {
20 /* Inputs */ 21 /* Inputs */
21 char *file; /* File name */ 22 char *file; /* File name */
22 int line; /* Line number */ 23 int line; /* Line number */
24 char *lazy_line; /* Lazy line pattern */
23 25
24 char *function; /* Function name */ 26 char *function; /* Function name */
25 int offset; /* Offset bytes */ 27 int offset; /* Offset bytes */
@@ -46,53 +48,46 @@ struct line_range {
46 char *function; /* Function name */ 48 char *function; /* Function name */
47 unsigned int start; /* Start line number */ 49 unsigned int start; /* Start line number */
48 unsigned int end; /* End line number */ 50 unsigned int end; /* End line number */
49 unsigned int offset; /* Start line offset */ 51 int offset; /* Start line offset */
50 char *path; /* Real path name */ 52 char *path; /* Real path name */
51 struct list_head line_list; /* Visible lines */ 53 struct list_head line_list; /* Visible lines */
52}; 54};
53 55
54#ifndef NO_LIBDWARF 56#ifndef NO_DWARF_SUPPORT
55extern int find_probepoint(int fd, struct probe_point *pp); 57extern int find_probe_point(int fd, struct probe_point *pp);
56extern int find_line_range(int fd, struct line_range *lr); 58extern int find_line_range(int fd, struct line_range *lr);
57 59
58/* Workaround for undefined _MIPS_SZLONG bug in libdwarf.h: */
59#ifndef _MIPS_SZLONG
60# define _MIPS_SZLONG 0
61#endif
62
63#include <dwarf.h> 60#include <dwarf.h>
64#include <libdwarf.h> 61#include <libdw.h>
65 62
66struct probe_finder { 63struct probe_finder {
67 struct probe_point *pp; /* Target probe point */ 64 struct probe_point *pp; /* Target probe point */
68 65
69 /* For function searching */ 66 /* For function searching */
70 Dwarf_Addr addr; /* Address */ 67 Dwarf_Addr addr; /* Address */
71 Dwarf_Unsigned fno; /* File number */ 68 const char *fname; /* File name */
72 Dwarf_Unsigned lno; /* Line number */ 69 int lno; /* Line number */
73 Dwarf_Off inl_offs; /* Inline offset */ 70 Dwarf_Die cu_die; /* Current CU */
74 Dwarf_Die cu_die; /* Current CU */
75 71
76 /* For variable searching */ 72 /* For variable searching */
77 Dwarf_Addr cu_base; /* Current CU base address */ 73 Dwarf_Op *fb_ops; /* Frame base attribute */
78 Dwarf_Locdesc fbloc; /* Location of Current Frame Base */ 74 Dwarf_Addr cu_base; /* Current CU base address */
79 const char *var; /* Current variable name */ 75 const char *var; /* Current variable name */
80 char *buf; /* Current output buffer */ 76 char *buf; /* Current output buffer */
81 int len; /* Length of output buffer */ 77 int len; /* Length of output buffer */
78 struct list_head lcache; /* Line cache for lazy match */
82}; 79};
83 80
84struct line_finder { 81struct line_finder {
85 struct line_range *lr; /* Target line range */ 82 struct line_range *lr; /* Target line range */
86 83
87 Dwarf_Unsigned fno; /* File number */ 84 const char *fname; /* File name */
88 Dwarf_Unsigned lno_s; /* Start line number */ 85 int lno_s; /* Start line number */
89 Dwarf_Unsigned lno_e; /* End line number */ 86 int lno_e; /* End line number */
90 Dwarf_Addr addr_s; /* Start address */ 87 Dwarf_Die cu_die; /* Current CU */
91 Dwarf_Addr addr_e; /* End address */
92 Dwarf_Die cu_die; /* Current CU */
93 int found; 88 int found;
94}; 89};
95 90
96#endif /* NO_LIBDWARF */ 91#endif /* NO_DWARF_SUPPORT */
97 92
98#endif /*_PROBE_FINDER_H */ 93#endif /*_PROBE_FINDER_H */
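Taken together, the header's public surface is small: fill in a struct probe_point with a function, a file/line pair, or a lazy pattern, and hand find_probe_point() a descriptor for the debuginfo object. A hedged caller sketch; the object path and function name are placeholders:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include "probe-finder.h"	/* struct probe_point, find_probe_point() */

static void show_probes(const char *obj)
{
	struct probe_point pp = {
		.function = "schedule",	/* placeholder function name */
	};
	int fd = open(obj, O_RDONLY);

	if (fd < 0)
		return;
	if (find_probe_point(fd, &pp) > 0)
		printf("%d probe point(s) found\n", pp.found);
	close(fd);
}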
diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c
index c397d4f6f748..a175949ed216 100644
--- a/tools/perf/util/string.c
+++ b/tools/perf/util/string.c
@@ -265,21 +265,21 @@ error:
265 return false; 265 return false;
266} 266}
267 267
268/** 268/* Glob/lazy pattern matching */
269 * strglobmatch - glob expression pattern matching 269static bool __match_glob(const char *str, const char *pat, bool ignore_space)
270 * @str: the target string to match
271 * @pat: the pattern string to match
272 *
273 * This returns true if @str matches @pat. @pat can include wildcards
274 * ('*','?') and character classes ([CHARS]; complementation and ranges
275 * are also supported). The escape character ('\') lets special
276 * characters be matched as normal characters.
277 *
278 * Note: if @pat syntax is broken, this always returns false.
279 */
280bool strglobmatch(const char *str, const char *pat)
281{ 270{
282 while (*str && *pat && *pat != '*') { 271 while (*str && *pat && *pat != '*') {
272 if (ignore_space) {
273 /* Ignore spaces for lazy matching */
274 if (isspace(*str)) {
275 str++;
276 continue;
277 }
278 if (isspace(*pat)) {
279 pat++;
280 continue;
281 }
282 }
283 if (*pat == '?') { /* Matches any single character */ 283 if (*pat == '?') { /* Matches any single character */
284 str++; 284 str++;
285 pat++; 285 pat++;
@@ -308,3 +308,32 @@ bool strglobmatch(const char *str, const char *pat)
308 return !*str && !*pat; 308 return !*str && !*pat;
309} 309}
310 310
311/**
312 * strglobmatch - glob expression pattern matching
313 * @str: the target string to match
314 * @pat: the pattern string to match
315 *
316 * This returns true if @str matches @pat. @pat can include wildcards
317 * ('*','?') and character classes ([CHARS]; complementation and ranges
318 * are also supported). The escape character ('\') lets special
319 * characters be matched as normal characters.
320 *
321 * Note: if @pat syntax is broken, this always returns false.
322 */
323bool strglobmatch(const char *str, const char *pat)
324{
325 return __match_glob(str, pat, false);
326}
327
328/**
329 * strlazymatch - matching pattern strings lazily with glob pattern
330 * @str: the target string to match
331 * @pat: the pattern string to match
332 *
333 * This is similar to strglobmatch, except that it ignores spaces in
334 * the target string.
335 */
336bool strlazymatch(const char *str, const char *pat)
337{
338 return __match_glob(str, pat, true);
339}
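Factoring the matcher into __match_glob() leaves one engine behind two entry points that differ only in whitespace handling. A few illustrative calls, with results following the semantics documented above:

#include <stdbool.h>
#include "string.h"	/* perf's util/string.h */

static void match_demo(void)
{
	bool r;

	r = strglobmatch("vfs_read", "vfs_*");		/* true: '*' matches "read" */
	r = strglobmatch("vfs_read", "vfs_r??d");	/* true: '?' is one char */
	r = strglobmatch("sys_open", "vfs_*");		/* false: prefix differs */

	/* Lazy matching skips whitespace in the target, so a pattern
	 * written without spaces still matches the spaced source line: */
	r = strlazymatch("if (ret < 0)", "if(ret<0)");	/* true */
	(void)r;
}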
diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h
index 02ede58c54b4..542e44de3719 100644
--- a/tools/perf/util/string.h
+++ b/tools/perf/util/string.h
@@ -10,6 +10,7 @@ s64 perf_atoll(const char *str);
10char **argv_split(const char *str, int *argcp); 10char **argv_split(const char *str, int *argcp);
11void argv_free(char **argv); 11void argv_free(char **argv);
12bool strglobmatch(const char *str, const char *pat); 12bool strglobmatch(const char *str, const char *pat);
13bool strlazymatch(const char *str, const char *pat);
13 14
14#define _STR(x) #x 15#define _STR(x) #x
15#define STR(x) _STR(x) 16#define STR(x) _STR(x)